From be39e5b58365940cab37b16f3a6b1776f3ce5401 Mon Sep 17 00:00:00 2001 From: jannchie Date: Thu, 27 Sep 2018 15:32:21 +0800 Subject: [PATCH 001/469] =?UTF-8?q?=E5=AE=8C=E6=88=90B=E7=AB=99=E5=85=A8?= =?UTF-8?q?=E9=83=A8=E8=A7=86=E9=A2=91=E7=88=AC=E8=99=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 + biliob_spider/__init__.py | 0 biliob_spider/items.py | 24 ++++ biliob_spider/middlewares.py | 103 +++++++++++++++++ biliob_spider/pipelines.py | 43 +++++++ biliob_spider/settings.py | 103 +++++++++++++++++ biliob_spider/spiders/__init__.py | 4 + biliob_spider/spiders/video.py | 179 ++++++++++++++++++++++++++++++ scrapy.cfg | 11 ++ 9 files changed, 470 insertions(+) create mode 100644 .gitignore create mode 100644 biliob_spider/__init__.py create mode 100644 biliob_spider/items.py create mode 100644 biliob_spider/middlewares.py create mode 100644 biliob_spider/pipelines.py create mode 100644 biliob_spider/settings.py create mode 100644 biliob_spider/spiders/__init__.py create mode 100644 biliob_spider/spiders/video.py create mode 100644 scrapy.cfg diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6813027 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.pyc +db.py + diff --git a/biliob_spider/__init__.py b/biliob_spider/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/biliob_spider/items.py b/biliob_spider/items.py new file mode 100644 index 0000000..12129b9 --- /dev/null +++ b/biliob_spider/items.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class videoItem(scrapy.Item): + channel = scrapy.Field() + aid = scrapy.Field() + datetime = scrapy.Field() + author = scrapy.Field() + view = scrapy.Field() + favorite = scrapy.Field() + coin = scrapy.Field() + share = scrapy.Field() + like = scrapy.Field() + danmaku = scrapy.Field() + dislike = scrapy.Field() + subChannel = scrapy.Field() + title = scrapy.Field() \ No newline at end of file diff --git a/biliob_spider/middlewares.py b/biliob_spider/middlewares.py new file mode 100644 index 0000000..019d6b4 --- /dev/null +++ b/biliob_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class ObilibiliSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. 
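A note on the db.py entry in the .gitignore above: that module is deliberately kept out of version control because it carries the MongoDB credentials, yet the pipelines in this series import it with "from db import settings". Judging only from the keys the patches actually read, a local db.py needs to expose a settings dict roughly like the sketch below; the key spellings are taken verbatim from the patches and the values are placeholders, not real credentials.

# db.py -- intentionally untracked (see the .gitignore hunk above).
# The pipelines index these keys; values here are placeholders only.
settings = {
    'MINGO_HOST': 'localhost',           # MongoDB host, spelled as in the patches
    'MINGO_USER': 'your_mongo_user',     # account with access to the project database
    'MONGO_PSW': 'your_mongo_password',
}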
+ for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class ObilibiliSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
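One thing worth knowing about the two template classes in this middlewares.py: Scrapy only invokes middlewares that are registered in the settings, and the settings.py added by this same patch leaves the SPIDER_MIDDLEWARES and DOWNLOADER_MIDDLEWARES blocks commented out, so this generated boilerplate stays inactive. If it were meant to run, the registration would look roughly like the sketch below; the class paths are the ones defined in this file, and 543 is simply the priority the stock template suggests.

# settings.py -- sketch of enabling the generated middlewares; as committed,
# the project never registers them, so they have no effect on crawls.
SPIDER_MIDDLEWARES = {
    'biliob_spider.middlewares.ObilibiliSpiderSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'biliob_spider.middlewares.ObilibiliSpiderDownloaderMiddleware': 543,
}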
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py new file mode 100644 index 0000000..c8093b4 --- /dev/null +++ b/biliob_spider/pipelines.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html +from pymongo import MongoClient +from db import settings +import datetime +import logging + +class VideoPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient('localhost', 27017) + + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) + self.db = self.client['bili_data'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + "aid": int(item["aid"]) + }, {"$set": { + "author": item['author'], + "subChannel": item['subChannel'], + "channel": item['channel'], + "view": int(item['view']), + "favorite": int(item['favorite']), + "coin": int(item['coin']), + "share": int(item['share']), + "like": int(item['like']), + "dislike": int(item['dislike']), + "danmaku": int(item['danmaku']), + "title": item['title'], + "datetime": datetime.datetime.fromtimestamp(item['datetime']) + }},True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) \ No newline at end of file diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py new file mode 100644 index 0000000..2518105 --- /dev/null +++ b/biliob_spider/settings.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for biliob_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +import random + +BOT_NAME = 'biliob_spider' + +SPIDER_MODULES = ['biliob_spider.spiders'] +NEWSPIDER_MODULE = 'biliob_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'biliob_spider (+http://www.yourdomain.com)' + +# user agent 列表 +USER_AGENT_LIST = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36' +] +# 随机生成user agent +USER_AGENT = random.choice(USER_AGENT_LIST) + + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 64 + +# Disable cookies (enabled by default) +COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'BilibiliRankListSpider.middlewares.BilibiliranklistspiderSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'BilibiliRankListSpider.middlewares.BilibiliranklistspiderDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'BilibiliRankListSpider.pipelines.BilibiliranklistspiderPipeline': 300, + 'BilibiliRankListSpider.pipelines.DailyRankListPipeLine': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' +DOWNLOAD_FAIL_ON_DATALOSS = True +RETRY_ENABLED = True \ No newline at end of file diff --git a/biliob_spider/spiders/__init__.py 
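A small caveat about the user-agent handling in the settings above: USER_AGENT = random.choice(USER_AGENT_LIST) is evaluated once, at import time, so a whole crawl run still sends a single User-Agent (and the list currently holds only one entry anyway). If genuine per-request rotation were wanted later, the usual Scrapy route is a small downloader middleware. The sketch below shows that idea; it is not part of this patch, and the class name RandomUserAgentMiddleware is made up here.

import random

class RandomUserAgentMiddleware(object):
    # Picks a fresh User-Agent for every outgoing request instead of once per process.
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # Reuse the USER_AGENT_LIST already defined in settings.py.
        return cls(crawler.settings.getlist('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agents)

It would still need an entry in DOWNLOADER_MIDDLEWARES to take effect.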
b/biliob_spider/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/biliob_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/biliob_spider/spiders/video.py b/biliob_spider/spiders/video.py new file mode 100644 index 0000000..7fc3cf8 --- /dev/null +++ b/biliob_spider/spiders/video.py @@ -0,0 +1,179 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import videoItem +import time +import json +import logging +from dateutil import parser +from pymongo import MongoClient +sub_channel_2_channel = { + 'ASMR': '生活', + 'GMV': '游戏', + 'Korea相关': '娱乐', + 'MAD·AMV': '动画', + 'MMD·3D': '动画', + 'Mugen': '游戏', + 'OP/ED/OST': '音乐', + 'VOCALOID·UTAU': '音乐', + '三次元舞蹈': '舞蹈', + '三次元音乐': '音乐', + '人力VOCALOID': '鬼畜', + '人文·历史': '纪录片', + '健身': '时尚', + '其他': '生活', + '其他国家': '电影', + '军事': '纪录片', + '动物圈': '生活', + '华语电影': '电影', + '单机游戏': '游戏', + '原创音乐': '音乐', + '国产剧': '电视剧', + '国产动画': '国创', + '国产原创相关': '国创', + '宅舞': '舞蹈', + '完结动画': '番剧', + '官方延伸': '番剧', + '布袋戏': '国创', + '广告': '广告', + '影视剪辑': '影视', + '影视杂谈': '影视', + '手工': '生活', + '手机游戏': '游戏', + '搞笑': '生活', + '教程演示': '鬼畜', + '数码': '科技', + '日常': '生活', + '明星': '娱乐', + '星海': '科技', + '服饰': '时尚', + '机械': '科技', + '桌游棋牌': '游戏', + '欧美电影': '电影', + '汽车': '科技', + '海外剧': '电视剧', + '演奏': '音乐', + '演讲·公开课': '科技', + '特摄': '影视', + '电子竞技': '游戏', + '短片': '影视', + '短片·手书·配音': '动画', + '社会·美食·旅行': '纪录片', + '科学·探索·自然': '纪录片', + '绘画': '生活', + '综合': '动画', + '综艺': '娱乐', + '网络游戏': '游戏', + '美妆': '时尚', + '美食圈': '生活', + '翻唱': '音乐', + '舞蹈教程': '舞蹈', + '资讯': '国创', + '趣味科普人文': '科技', + '运动': '生活', + '连载动画': '番剧', + '野生技术协会': '科技', + '音MAD': '鬼畜', + '音乐选集': '音乐', + '音游': '游戏', + '预告 资讯': '影视', + '预告·资讯': '影视', + '单机联机': '游戏', + '鬼畜调教': '鬼畜', + '演讲• 公开课': '科技', + '国产电影': '电影', + '日本电影': '电影', + '番剧': '番剧', + '国创': '国创', + '鬼畜': '鬼畜', + '电视剧': '电视剧', + '动画': '动画', + '时尚': '时尚', + '娱乐': '娱乐', + '电影': '电影', + '舞蹈': '舞蹈', + '科技': '科技', + '生活': '生活', + '音乐': '音乐', + '纪录片': '纪录片', + '生活': '生活', + '游戏': '游戏' +} +class VideoSpider(scrapy.spiders.Spider): + name = "videoSpider" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'obilibili_spider.pipelines.VideoPipeline': 300 + } + } + def __init__(self,start_aid=1,length=99999999,limit_view=50000, *args, **kwargs): + # super(HighSpeedVideoSpider2, self).__init__(*args, **kwargs) + print("开始的av号为:" + str(start_aid) + ",计划抓取的视频个数为:" + str(length)) + self.start_aid = int(start_aid) + self.length = int(length) + self.limit_view = limit_view + def start_requests(self): + i = (x for x in range(self.start_aid, self.start_aid + self.length)) + while True: + aid_str = '' + for j in range(100): + aid_str += str(next(i))+',' + yield Request("https://api.bilibili.com/x/article/archives?ids=" + aid_str.rstrip(',')) + def parse(self, response): + try: + r = json.loads(response.body) + d = r["data"] + keys = list(d.keys()) + for each_key in keys: + aid = d[each_key]['stat']['aid'] + author = d[each_key]['owner']['name'] + view = d[each_key]['stat']['view'] + favorite = d[each_key]['stat']['favorite'] + danmaku = favorite = d[each_key]['stat']['danmaku'] + coin = d[each_key]['stat']['coin'] + share = d[each_key]['stat']['share'] + like = d[each_key]['stat']['like'] + dislike = d[each_key]['stat']['dislike'] + subChannel = d[each_key]['tname'] + title = d[each_key]['title'] + datetime = 
d[each_key]['pubdate'] + tid = d[each_key]['tid'] + item = videoItem() + item['aid'] = aid + item['author'] = author + item['view'] = view + item['favorite'] = favorite + item['coin'] = coin + item['share'] = share + item['like'] = like + item['dislike'] = dislike + item['danmaku'] = danmaku + item['title'] = title + item['subChannel'] = subChannel + item['datetime'] = datetime + if subChannel != '': + item['channel'] = sub_channel_2_channel[subChannel] + elif subChannel == '资讯': + if tid == 51: + item['channel'] == '番剧' + if tid == 170: + item['channel'] == '国创' + if tid == 159: + item['channel'] == '娱乐' + else: + item['channel'] = None + + # 只收录大于limit_view的视频 + if view > self.limit_view: + yield item + except Exception as error: + # 出现错误时打印错误日志 + if r['code'] == -404: + return + logging.error("视频爬虫在解析时发生错误") + logging.error(item) + logging.error(response.url) + logging.error(error) +# scrapy crawl VideoTagSpider -a start_aid=26053983 -a length=2000000 -s JOBDIR=tag-07-21 -L INFO diff --git a/scrapy.cfg b/scrapy.cfg new file mode 100644 index 0000000..b7d9327 --- /dev/null +++ b/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = obilibili_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = obilibili_spider From 2f7d0468b5663bdd3bf07aa1c7136e181701c718 Mon Sep 17 00:00:00 2001 From: jannchie Date: Thu, 27 Sep 2018 20:17:22 +0800 Subject: [PATCH 002/469] =?UTF-8?q?=E5=AE=8C=E6=88=90=E7=94=A8=E6=88=B7?= =?UTF-8?q?=E8=BF=BD=E8=B8=AA=E4=B8=8E=E7=83=AD=E7=82=B9=E8=87=AA=E5=8A=A8?= =?UTF-8?q?=E6=9B=B4=E6=96=B0=E7=88=AC=E8=99=AB=EF=BC=8C=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?=E8=AE=A1=E5=88=92=E4=BB=BB=E5=8A=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + biliob_spider/items.py | 15 ++++- biliob_spider/middlewares.py | 4 +- biliob_spider/pipelines.py | 72 +++++++++++++++++------ biliob_spider/settings.py | 14 +++-- biliob_spider/spiders/author_auto_add.py | 71 ++++++++++++++++++++++ biliob_spider/spiders/author_update.py | 75 ++++++++++++++++++++++++ biliob_spider/spiders/video.py | 9 +-- run.py | 36 ++++++++++++ scrapy.cfg | 4 +- 10 files changed, 267 insertions(+), 34 deletions(-) create mode 100644 biliob_spider/spiders/author_auto_add.py create mode 100644 biliob_spider/spiders/author_update.py create mode 100644 run.py diff --git a/.gitignore b/.gitignore index 6813027..16f8c54 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.pyc db.py +biliob_spider.log diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 12129b9..5eb1622 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -8,7 +8,7 @@ import scrapy -class videoItem(scrapy.Item): +class VideoItem(scrapy.Item): channel = scrapy.Field() aid = scrapy.Field() datetime = scrapy.Field() @@ -21,4 +21,15 @@ class videoItem(scrapy.Item): danmaku = scrapy.Field() dislike = scrapy.Field() subChannel = scrapy.Field() - title = scrapy.Field() \ No newline at end of file + title = scrapy.Field() + + +class AuthorItem(scrapy.Item): + mid = scrapy.Field() + name = scrapy.Field() + face = scrapy.Field() + official = scrapy.Field() + sex = scrapy.Field() + data = scrapy.Field() + level = scrapy.Field() + diff --git a/biliob_spider/middlewares.py b/biliob_spider/middlewares.py index 019d6b4..d877208 100644 --- a/biliob_spider/middlewares.py +++ b/biliob_spider/middlewares.py @@ 
-8,7 +8,7 @@ from scrapy import signals -class ObilibiliSpiderSpiderMiddleware(object): +class BiliobSpiderMiddleware(object): # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @@ -56,7 +56,7 @@ def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) -class ObilibiliSpiderDownloaderMiddleware(object): +class BiliobDownloaderMiddleware(object): # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index c8093b4..92220d5 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -9,34 +9,70 @@ import datetime import logging + class VideoPipeline(object): def __init__(self): # 链接mongoDB - self.client = MongoClient('localhost', 27017) - + self.client = MongoClient(settings['MINGO_HOST'], 27017) # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) - self.db = self.client['bili_data'] # 获得数据库的句柄 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['video'] # 获得collection的句柄 def process_item(self, item, spider): try: self.coll.update_one({ "aid": int(item["aid"]) - }, {"$set": { - "author": item['author'], - "subChannel": item['subChannel'], - "channel": item['channel'], - "view": int(item['view']), - "favorite": int(item['favorite']), - "coin": int(item['coin']), - "share": int(item['share']), - "like": int(item['like']), - "dislike": int(item['dislike']), - "danmaku": int(item['danmaku']), - "title": item['title'], - "datetime": datetime.datetime.fromtimestamp(item['datetime']) - }},True) + }, { + "$set": { + "author": item['author'], + "subChannel": item['subChannel'], + "channel": item['channel'], + "view": int(item['view']), + "favorite": int(item['favorite']), + "coin": int(item['coin']), + "share": int(item['share']), + "like": int(item['like']), + "dislike": int(item['dislike']), + "danmaku": int(item['danmaku']), + "title": item['title'], + "datetime": datetime.datetime.fromtimestamp( + item['datetime']) + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) + + +class AuthorPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + "mid": item["mid"] + }, { + "$set": { + "name": item['name'], + "face": item['face'], + "official": item['official'], + "level": item['level'], + "sex": item['sex'], + }, + "$addToSet": { + 'data': item['data'] + } + }, True) return item except Exception as error: # 出现错误时打印错误日志 diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 2518105..477ab67 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -11,6 +11,9 @@ import random +LOG_FILE = "biliob_spider.log" +LOG_LEVEL = "INFO" + BOT_NAME = 'biliob_spider' SPIDER_MODULES = ['biliob_spider.spiders'] @@ -32,12 +35,12 @@ ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) -CONCURRENT_REQUESTS = 32 +# CONCURRENT_REQUESTS = 32 # 
Configure a delay for requests for the same website (default: 0) # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 +# DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: # CONCURRENT_REQUESTS_PER_DOMAIN = 16 # CONCURRENT_REQUESTS_PER_IP = 64 @@ -74,10 +77,9 @@ # Configure item pipelines # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html -ITEM_PIPELINES = { - 'BilibiliRankListSpider.pipelines.BilibiliranklistspiderPipeline': 300, - 'BilibiliRankListSpider.pipelines.DailyRankListPipeLine': 300, -} +# ITEM_PIPELINES = { + +# } # Enable and configure the AutoThrottle extension (disabled by default) # See https://doc.scrapy.org/en/latest/topics/autothrottle.html diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py new file mode 100644 index 0000000..5852469 --- /dev/null +++ b/biliob_spider/spiders/author_auto_add.py @@ -0,0 +1,71 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import AuthorItem +import time +import json +import logging +from dateutil import parser +from pymongo import MongoClient +import datetime + +class AuthorAutoAddSpider(scrapy.spiders.Spider): + name = "authorAutoAdd" + allowed_domains = ["bilibili.com"] + start_urls = ['https://www.bilibili.com/ranking'] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.AuthorPipeline': 300 + } + } + + def parse(self, response): + try: + url_list = response.xpath( + "//*[@id='app']/div[2]/div/div[1]/div[2]/div[3]/ul/li/div[2]/div[2]/div/a/@href" + ).extract() + + # 为了爬取分区、粉丝数等数据,需要进入每一个视频的详情页面进行抓取 + for each_url in url_list: + yield Request( + "https://api.bilibili.com/x/web-interface/card?mid=" + + each_url[21:], + method='GET', + callback=self.detailParse) + except Exception as error: + # 出现错误时打印错误日志 + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) + + def detailParse(self, response): + j = json.loads(response.body) + name = j['data']['card']['name'] + mid = j['data']['card']['mid'] + sex = j['data']['card']['sex'] + face = j['data']['card']['face'] + fans = j['data']['card']['fans'] + attention = j['data']['card']['attention'] + level = j['data']['card']['level_info']['current_level'] + official = j['data']['card']['Official']['title'] + archive_count = j['data']['archive_count'] + article_count = j['data']['article_count'] + face = j['data']['card']['face'] + item = AuthorItem() + item['mid'] = int(mid) + item['name'] = name + item['face'] = face + item['official'] = official + item['sex'] = sex + item['level'] = int(level) + item['data'] = [{ + 'fans': int(fans), + 'attention': int(attention), + 'archive_count': int(archive_count), + 'article_count': int(article_count), + 'datetime': datetime.datetime.now() + }] + yield item + + +# scrapy crawl VideoTagSpider -a start_aid=26053983 -a length=2000000 -s JOBDIR=tag-07-21 -L INFO diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py new file mode 100644 index 0000000..ba80529 --- /dev/null +++ b/biliob_spider/spiders/author_update.py @@ -0,0 +1,75 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import AuthorItem +import time +import json +import logging +from dateutil import parser +from pymongo import MongoClient +import datetime +from db import settings + + +class AuthorUpdate(scrapy.spiders.Spider): + name = "authorUpdate" + allowed_domains 
= ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.AuthorPipeline': 300 + }, + 'DOWNLOAD_DELAY' : 0.5 + } + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + + def start_requests(self): + c = self.coll.find() + for each_doc in c: + yield Request( + "https://api.bilibili.com/x/web-interface/card?mid=" + + str(each_doc['mid']), + method='GET') + + def parse(self, response): + try: + j = json.loads(response.body) + name = j['data']['card']['name'] + mid = j['data']['card']['mid'] + sex = j['data']['card']['sex'] + face = j['data']['card']['face'] + fans = j['data']['card']['fans'] + attention = j['data']['card']['attention'] + level = j['data']['card']['level_info']['current_level'] + official = j['data']['card']['Official']['title'] + archive_count = j['data']['archive_count'] + article_count = j['data']['article_count'] + face = j['data']['card']['face'] + item = AuthorItem() + item['mid'] = int(mid) + item['name'] = name + item['face'] = face + item['official'] = official + item['sex'] = sex + item['level'] = int(level) + item['data'] = [{ + 'fans': int(fans), + 'attention': int(attention), + 'archive_count': int(archive_count), + 'article_count': int(article_count), + 'datetime': datetime.datetime.now() + }] + yield item + except Exception as error: + # 出现错误时打印错误日志 + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) diff --git a/biliob_spider/spiders/video.py b/biliob_spider/spiders/video.py index 7fc3cf8..3ff276e 100644 --- a/biliob_spider/spiders/video.py +++ b/biliob_spider/spiders/video.py @@ -1,7 +1,7 @@ #coding=utf-8 import scrapy from scrapy.http import Request -from biliob_spider.items import videoItem +from biliob_spider.items import VideoItem import time import json import logging @@ -105,8 +105,9 @@ class VideoSpider(scrapy.spiders.Spider): start_urls = [] custom_settings = { 'ITEM_PIPELINES': { - 'obilibili_spider.pipelines.VideoPipeline': 300 - } + 'biliob_spider.pipelines.VideoPipeline': 300, + }, + 'DOWNLOAD_DELAY' : 1 } def __init__(self,start_aid=1,length=99999999,limit_view=50000, *args, **kwargs): # super(HighSpeedVideoSpider2, self).__init__(*args, **kwargs) @@ -140,7 +141,7 @@ def parse(self, response): title = d[each_key]['title'] datetime = d[each_key]['pubdate'] tid = d[each_key]['tid'] - item = videoItem() + item = VideoItem() item['aid'] = aid item['author'] = author item['view'] = view diff --git a/run.py b/run.py new file mode 100644 index 0000000..05cd269 --- /dev/null +++ b/run.py @@ -0,0 +1,36 @@ +import schedule +import time +from subprocess import Popen +import logging + +# 第一步,创建一个logger +logger = logging.getLogger() +logger.setLevel(logging.INFO) # Log等级总开关 +# 第二步,创建一个handler,用于写入日志文件 +rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) +log_path = './' +log_name = log_path + rq + '.log' +logfile = log_name +fh = logging.FileHandler(logfile, mode='w') +fh.setLevel(logging.DEBUG) # 输出到file的log等级的开关 +# 第三步,定义handler的输出格式 +formatter = logging.Formatter( + "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s") +fh.setFormatter(formatter) +# 第四步,将logger添加到handler里面 +logger.addHandler(fh) + +def update_author(): + logging.info("开始定期更新author数据...") + Popen("scrapy crawl authorUpdate") + +def 
auto_add_author(): + logging.info("开始定期更新author数据...") + Popen("scrapy crawl authorAutoAdd") + +schedule.every().hour.do(update_author) +schedule.every().day.at('13:00').do(auto_add_author) + +while True: + schedule.run_pending() + time.sleep(60) diff --git a/scrapy.cfg b/scrapy.cfg index b7d9327..631c2d7 100644 --- a/scrapy.cfg +++ b/scrapy.cfg @@ -4,8 +4,8 @@ # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] -default = obilibili_spider.settings +default = biliob_spider.settings [deploy] #url = http://localhost:6800/ -project = obilibili_spider +project = biliob_spider From ab53e06128c96380e3099d70b9fea5ecdf1ceab7 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 27 Sep 2018 20:23:19 +0800 Subject: [PATCH 003/469] Initial commit --- LICENSE | 21 +++++++++++++++++++++ README.md | 2 ++ 2 files changed, 23 insertions(+) create mode 100644 LICENSE create mode 100644 README.md diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..caff7be --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Jannchie + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..a61d1c1 --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# biliob +bilibili data acquisition and analysis. From 367e4fe9622fd07bf25b90b747c7bc3ed684ec0f Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 27 Sep 2018 20:23:19 +0800 Subject: [PATCH 004/469] Initial commit --- LICENSE | 21 +++++++++++++++++++++ README.md | 2 ++ 2 files changed, 23 insertions(+) create mode 100644 LICENSE create mode 100644 README.md diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..caff7be --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Jannchie + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..a61d1c1 --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# biliob +bilibili data acquisition and analysis. From caea0880117af13015a2b3ad27f9696b58b90c05 Mon Sep 17 00:00:00 2001 From: jannchie Date: Thu, 27 Sep 2018 22:47:24 +0800 Subject: [PATCH 005/469] =?UTF-8?q?=E5=8E=BB=E9=99=A4=E4=BA=86=E6=B2=A1?= =?UTF-8?q?=E6=9C=89=E4=BD=BF=E7=94=A8=E7=9A=84=E6=A8=A1=E5=9D=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + biliob_spider/spiders/author_auto_add.py | 1 - biliob_spider/spiders/author_update.py | 1 - biliob_spider/spiders/video.py | 1 - run.py | 6 ++++-- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 16f8c54..e84d65e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.pyc +*.log db.py biliob_spider.log diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index 5852469..dfb8306 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -5,7 +5,6 @@ import time import json import logging -from dateutil import parser from pymongo import MongoClient import datetime diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index ba80529..3b99096 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -5,7 +5,6 @@ import time import json import logging -from dateutil import parser from pymongo import MongoClient import datetime from db import settings diff --git a/biliob_spider/spiders/video.py b/biliob_spider/spiders/video.py index 3ff276e..bfdc536 100644 --- a/biliob_spider/spiders/video.py +++ b/biliob_spider/spiders/video.py @@ -5,7 +5,6 @@ import time import json import logging -from dateutil import parser from pymongo import MongoClient sub_channel_2_channel = { 'ASMR': '生活', diff --git a/run.py b/run.py index 05cd269..338fdd4 100644 --- a/run.py +++ b/run.py @@ -1,3 +1,6 @@ +#!/usr/bin/python3.6 +# -*- coding:utf-8 -*- + import schedule import time from subprocess import Popen @@ -21,16 +24,15 @@ logger.addHandler(fh) def update_author(): - logging.info("开始定期更新author数据...") Popen("scrapy crawl authorUpdate") def auto_add_author(): - logging.info("开始定期更新author数据...") Popen("scrapy crawl authorAutoAdd") schedule.every().hour.do(update_author) schedule.every().day.at('13:00').do(auto_add_author) +logging.info('开始运行计划任务..') while True: schedule.run_pending() time.sleep(60) From 8781025cbe02318991fe4ef52e88031c7552c5a0 Mon Sep 17 00:00:00 2001 From: jannchie Date: Thu, 27 Sep 2018 22:47:24 +0800 Subject: [PATCH 006/469] =?UTF-8?q?=E5=8E=BB=E9=99=A4=E4=BA=86=E6=B2=A1?= =?UTF-8?q?=E6=9C=89=E4=BD=BF=E7=94=A8=E7=9A=84=E6=A8=A1=E5=9D=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + biliob_spider/spiders/author_auto_add.py | 1 - biliob_spider/spiders/author_update.py | 1 - biliob_spider/spiders/video.py | 1 - run.py | 6 ++++-- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 16f8c54..e84d65e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.pyc +*.log db.py 
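Stepping back to the scheduling side of the series: run.py, added in the second patch and adjusted in the next few, drives the spiders from a plain schedule loop and launches each crawl as a subprocess. A later patch switches Popen from a single command string to an argument list, which is the form Popen can execute directly without shell=True. Boiled down, the pattern is the sketch below; the crawl() helper is an illustration of the same idea, not a verbatim copy of run.py, and the file logging is left out.

import time
from subprocess import Popen

import schedule


def crawl(spider_name):
    # Argument-list form: Popen execs scrapy directly, no shell needed.
    Popen(['scrapy', 'crawl', spider_name])


schedule.every().hour.do(crawl, 'authorUpdate')
schedule.every().day.at('13:00').do(crawl, 'authorAutoAdd')

while True:
    schedule.run_pending()
    time.sleep(60)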
biliob_spider.log diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index 5852469..dfb8306 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -5,7 +5,6 @@ import time import json import logging -from dateutil import parser from pymongo import MongoClient import datetime diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index ba80529..3b99096 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -5,7 +5,6 @@ import time import json import logging -from dateutil import parser from pymongo import MongoClient import datetime from db import settings diff --git a/biliob_spider/spiders/video.py b/biliob_spider/spiders/video.py index 3ff276e..bfdc536 100644 --- a/biliob_spider/spiders/video.py +++ b/biliob_spider/spiders/video.py @@ -5,7 +5,6 @@ import time import json import logging -from dateutil import parser from pymongo import MongoClient sub_channel_2_channel = { 'ASMR': '生活', diff --git a/run.py b/run.py index 05cd269..338fdd4 100644 --- a/run.py +++ b/run.py @@ -1,3 +1,6 @@ +#!/usr/bin/python3.6 +# -*- coding:utf-8 -*- + import schedule import time from subprocess import Popen @@ -21,16 +24,15 @@ logger.addHandler(fh) def update_author(): - logging.info("开始定期更新author数据...") Popen("scrapy crawl authorUpdate") def auto_add_author(): - logging.info("开始定期更新author数据...") Popen("scrapy crawl authorAutoAdd") schedule.every().hour.do(update_author) schedule.every().day.at('13:00').do(auto_add_author) +logging.info('开始运行计划任务..') while True: schedule.run_pending() time.sleep(60) From 9a04ef272e3d292af06d8dac223069a82db3d297 Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 28 Sep 2018 10:26:42 +0800 Subject: [PATCH 007/469] =?UTF-8?q?DEBUG:=E4=BF=AE=E6=94=B9=E8=AE=A1?= =?UTF-8?q?=E5=88=92=E4=BB=BB=E5=8A=A1=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 +- run.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index e84d65e..89e235f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ *.pyc *.log db.py - +nohup.out biliob_spider.log diff --git a/run.py b/run.py index 338fdd4..f5829e1 100644 --- a/run.py +++ b/run.py @@ -24,10 +24,10 @@ logger.addHandler(fh) def update_author(): - Popen("scrapy crawl authorUpdate") + Popen(["scrapy","crawl","authorUpdate"]) def auto_add_author(): - Popen("scrapy crawl authorAutoAdd") + Popen(["scrapy","crawl","authorAutoAdd"]) schedule.every().hour.do(update_author) schedule.every().day.at('13:00').do(auto_add_author) From defb6e91d8e2685d3083928c050ad3977cc54a2a Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 28 Sep 2018 10:26:42 +0800 Subject: [PATCH 008/469] =?UTF-8?q?DEBUG:=E4=BF=AE=E6=94=B9=E8=AE=A1?= =?UTF-8?q?=E5=88=92=E4=BB=BB=E5=8A=A1=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 +- run.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index e84d65e..89e235f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ *.pyc *.log db.py - +nohup.out biliob_spider.log diff --git a/run.py b/run.py index 338fdd4..f5829e1 100644 --- a/run.py +++ b/run.py @@ -24,10 +24,10 @@ logger.addHandler(fh) def update_author(): - Popen("scrapy crawl authorUpdate") + Popen(["scrapy","crawl","authorUpdate"]) def auto_add_author(): - 
Popen("scrapy crawl authorAutoAdd") + Popen(["scrapy","crawl","authorAutoAdd"]) schedule.every().hour.do(update_author) schedule.every().day.at('13:00').do(auto_add_author) From 19b951ccb50573f4b68ce40ca5e63595574216ba Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 28 Sep 2018 13:34:29 +0800 Subject: [PATCH 009/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=90=8C=E6=97=B6?= =?UTF-8?q?=E8=A7=82=E7=9C=8B=E7=88=AC=E8=99=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/items.py | 5 +++ biliob_spider/pipelines.py | 28 +++++++++++++++++ biliob_spider/spiders/author_auto_add.py | 6 +--- biliob_spider/spiders/author_update.py | 3 +- biliob_spider/spiders/online.py | 40 ++++++++++++++++++++++++ biliob_spider/spiders/video.py | 2 +- 6 files changed, 76 insertions(+), 8 deletions(-) create mode 100644 biliob_spider/spiders/online.py diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 5eb1622..71b396b 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -33,3 +33,8 @@ class AuthorItem(scrapy.Item): data = scrapy.Field() level = scrapy.Field() +class VideoOnline(scrapy.Item): + title = scrapy.Field() + author = scrapy.Field() + data = scrapy.Field() + diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 92220d5..0f35af0 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -74,6 +74,34 @@ def process_item(self, item, spider): } }, True) return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) + +class OnlinePipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video_online'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + "title": item["title"] + }, { + "$set": { + "title": item['title'], + "author": item['author'], + }, + "$addToSet": { + 'data': item['data'] + } + }, True) + return item except Exception as error: # 出现错误时打印错误日志 logging.error(error) \ No newline at end of file diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index 5852469..601af0c 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -5,7 +5,6 @@ import time import json import logging -from dateutil import parser from pymongo import MongoClient import datetime @@ -65,7 +64,4 @@ def detailParse(self, response): 'article_count': int(article_count), 'datetime': datetime.datetime.now() }] - yield item - - -# scrapy crawl VideoTagSpider -a start_aid=26053983 -a length=2000000 -s JOBDIR=tag-07-21 -L INFO + yield item \ No newline at end of file diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index ba80529..b1dadb4 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -5,7 +5,6 @@ import time import json import logging -from dateutil import parser from pymongo import MongoClient import datetime from db import settings @@ -72,4 +71,4 @@ def parse(self, response): # 出现错误时打印错误日志 logging.error("视频爬虫在解析时发生错误") logging.error(response.url) - logging.error(error) + logging.error(error) \ No newline at end of file diff --git a/biliob_spider/spiders/online.py b/biliob_spider/spiders/online.py new file mode 100644 index 0000000..02a7bf0 --- /dev/null 
+++ b/biliob_spider/spiders/online.py @@ -0,0 +1,40 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import VideoOnline +import time +import json +import logging +from pymongo import MongoClient +import datetime + +class OnlineSpider(scrapy.spiders.Spider): + name = "online" + allowed_domains = ["bilibili.com"] + start_urls = ['https://www.bilibili.com/video/online.html'] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.OnlinePipeline': 300 + } + } + + def parse(self, response): + try: + video_list = response.xpath('//*[@id="app"]/div[2]/div[2]/div') + + # 为了爬取分区、粉丝数等数据,需要进入每一个视频的详情页面进行抓取 + title_list = video_list.xpath('./a/p/text()').extract() + watch_list = video_list.xpath('./p/b/text()').extract() + author_list = video_list.xpath('./div[1]/a/text()').extract() + for i in range(len(title_list)): + item = VideoOnline() + item['title'] = title_list[i] + item['author'] = author_list[i] + item['data'] = [{'datetime':datetime.datetime.now(),'number':watch_list[i]}] + yield item + + except Exception as error: + # 出现错误时打印错误日志 + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) \ No newline at end of file diff --git a/biliob_spider/spiders/video.py b/biliob_spider/spiders/video.py index 3ff276e..4c5032e 100644 --- a/biliob_spider/spiders/video.py +++ b/biliob_spider/spiders/video.py @@ -10,7 +10,7 @@ sub_channel_2_channel = { 'ASMR': '生活', 'GMV': '游戏', - 'Korea相关': '娱乐', + 'Korea相关': '娱乐', 'MAD·AMV': '动画', 'MMD·3D': '动画', 'Mugen': '游戏', From bb6edefc2460dba87ec5e139dc5baa17e3460ceb Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 28 Sep 2018 13:34:29 +0800 Subject: [PATCH 010/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=90=8C=E6=97=B6?= =?UTF-8?q?=E8=A7=82=E7=9C=8B=E7=88=AC=E8=99=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/items.py | 5 +++ biliob_spider/pipelines.py | 28 +++++++++++++++++ biliob_spider/spiders/author_auto_add.py | 6 +--- biliob_spider/spiders/author_update.py | 3 +- biliob_spider/spiders/online.py | 40 ++++++++++++++++++++++++ biliob_spider/spiders/video.py | 2 +- 6 files changed, 76 insertions(+), 8 deletions(-) create mode 100644 biliob_spider/spiders/online.py diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 5eb1622..71b396b 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -33,3 +33,8 @@ class AuthorItem(scrapy.Item): data = scrapy.Field() level = scrapy.Field() +class VideoOnline(scrapy.Item): + title = scrapy.Field() + author = scrapy.Field() + data = scrapy.Field() + diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 92220d5..0f35af0 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -74,6 +74,34 @@ def process_item(self, item, spider): } }, True) return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) + +class OnlinePipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video_online'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + "title": item["title"] + }, { + "$set": { + "title": item['title'], + "author": item['author'], + }, + "$addToSet": { + 'data': item['data'] + } + }, True) + return item except Exception as error: # 出现错误时打印错误日志 
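The online spider above pulls three parallel lists (titles, concurrent-viewer counts, uploader names) out of the online-viewers page and pairs them by index with range(len(...)). Pairing them with zip is equivalent under the same XPath assumptions and stops cleanly if one list comes back shorter. The sketch below shows that variant of the parse step as a standalone function (parse_online is a name used only here), and stores the snapshot as the flat dict that a later patch in this series settles on.

import datetime

from biliob_spider.items import VideoOnline


def parse_online(response):
    # Same selectors as online.py; zip() pairs the three lists positionally.
    rows = response.xpath('//*[@id="app"]/div[2]/div[2]/div')
    titles = rows.xpath('./a/p/text()').extract()
    watching = rows.xpath('./p/b/text()').extract()
    authors = rows.xpath('./div[1]/a/text()').extract()
    for title, number, author in zip(titles, watching, authors):
        item = VideoOnline()
        item['title'] = title
        item['author'] = author
        item['data'] = {'datetime': datetime.datetime.now(), 'number': number}
        yield item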
logging.error(error) \ No newline at end of file diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index 5852469..601af0c 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -5,7 +5,6 @@ import time import json import logging -from dateutil import parser from pymongo import MongoClient import datetime @@ -65,7 +64,4 @@ def detailParse(self, response): 'article_count': int(article_count), 'datetime': datetime.datetime.now() }] - yield item - - -# scrapy crawl VideoTagSpider -a start_aid=26053983 -a length=2000000 -s JOBDIR=tag-07-21 -L INFO + yield item \ No newline at end of file diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index ba80529..b1dadb4 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -5,7 +5,6 @@ import time import json import logging -from dateutil import parser from pymongo import MongoClient import datetime from db import settings @@ -72,4 +71,4 @@ def parse(self, response): # 出现错误时打印错误日志 logging.error("视频爬虫在解析时发生错误") logging.error(response.url) - logging.error(error) + logging.error(error) \ No newline at end of file diff --git a/biliob_spider/spiders/online.py b/biliob_spider/spiders/online.py new file mode 100644 index 0000000..02a7bf0 --- /dev/null +++ b/biliob_spider/spiders/online.py @@ -0,0 +1,40 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import VideoOnline +import time +import json +import logging +from pymongo import MongoClient +import datetime + +class OnlineSpider(scrapy.spiders.Spider): + name = "online" + allowed_domains = ["bilibili.com"] + start_urls = ['https://www.bilibili.com/video/online.html'] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.OnlinePipeline': 300 + } + } + + def parse(self, response): + try: + video_list = response.xpath('//*[@id="app"]/div[2]/div[2]/div') + + # 为了爬取分区、粉丝数等数据,需要进入每一个视频的详情页面进行抓取 + title_list = video_list.xpath('./a/p/text()').extract() + watch_list = video_list.xpath('./p/b/text()').extract() + author_list = video_list.xpath('./div[1]/a/text()').extract() + for i in range(len(title_list)): + item = VideoOnline() + item['title'] = title_list[i] + item['author'] = author_list[i] + item['data'] = [{'datetime':datetime.datetime.now(),'number':watch_list[i]}] + yield item + + except Exception as error: + # 出现错误时打印错误日志 + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) \ No newline at end of file diff --git a/biliob_spider/spiders/video.py b/biliob_spider/spiders/video.py index 3ff276e..4c5032e 100644 --- a/biliob_spider/spiders/video.py +++ b/biliob_spider/spiders/video.py @@ -10,7 +10,7 @@ sub_channel_2_channel = { 'ASMR': '生活', 'GMV': '游戏', - 'Korea相关': '娱乐', + 'Korea相关': '娱乐', 'MAD·AMV': '动画', 'MMD·3D': '动画', 'Mugen': '游戏', From e16b3751236d8fb71857ab4409c887bbe71eb010 Mon Sep 17 00:00:00 2001 From: jannchie Date: Sat, 29 Sep 2018 19:37:17 +0800 Subject: [PATCH 011/469] =?UTF-8?q?DEBUG=EF=BC=9A=E4=BF=AE=E6=AD=A3?= =?UTF-8?q?=E4=BA=86=E6=95=B0=E6=8D=AE=E7=BB=93=E6=9E=84=E9=94=99=E8=AF=AF?= =?UTF-8?q?=20=E5=A2=9E=E5=8A=A0=E6=97=B6=E5=BB=B6=EF=BC=8C=E5=87=8F?= =?UTF-8?q?=E5=B0=91=E8=A2=ABBan=E5=87=A0=E7=8E=87=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/pipelines.py | 2 +- biliob_spider/settings.py | 2 +- biliob_spider/spiders/author_auto_add.py | 9 +++++---- 
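The "data structure error" this patch corrects deserves one explicit sentence: every pipeline stores crawl snapshots by pushing item['data'] into a MongoDB array with $addToSet, but the spiders had been wrapping each snapshot in a one-element list, so what got appended was a nested list rather than a snapshot dict. After the fix, one upsert per crawl grows the document as in the sketch below; the connection, author id, and numbers are placeholders, while the update shape mirrors the AuthorPipeline.

import datetime

from pymongo import MongoClient

# Placeholder connection; the real pipelines read host and credentials from db.settings.
coll = MongoClient('localhost', 27017)['biliob']['author']

snapshot = {
    'fans': 1000, 'attention': 50, 'archive_count': 10,
    'article_count': 0, 'datetime': datetime.datetime.now(),
}
coll.update_one(
    {'mid': 12345},                        # one document per author id (placeholder)
    {'$set': {'name': 'example author'},   # static profile fields
     '$addToSet': {'data': snapshot}},     # append one snapshot dict per crawl
    upsert=True,
)
# Document shape over time:
# {'mid': 12345, 'name': 'example author', 'data': [snapshot, snapshot, ...]}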
biliob_spider/spiders/author_update.py | 8 ++++---- biliob_spider/spiders/online.py | 4 ++-- run.py | 6 +++++- 6 files changed, 18 insertions(+), 13 deletions(-) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 0f35af0..e00cb27 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -104,4 +104,4 @@ def process_item(self, item, spider): return item except Exception as error: # 出现错误时打印错误日志 - logging.error(error) \ No newline at end of file + logging.error(error) diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 477ab67..79aeeaf 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -102,4 +102,4 @@ # HTTPCACHE_IGNORE_HTTP_CODES = [] # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' DOWNLOAD_FAIL_ON_DATALOSS = True -RETRY_ENABLED = True \ No newline at end of file +RETRY_ENABLED = True diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index 601af0c..1fa5d31 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -15,7 +15,8 @@ class AuthorAutoAddSpider(scrapy.spiders.Spider): custom_settings = { 'ITEM_PIPELINES': { 'biliob_spider.pipelines.AuthorPipeline': 300 - } + }, + 'DOWNLOAD_DELAY' : 10 } def parse(self, response): @@ -57,11 +58,11 @@ def detailParse(self, response): item['official'] = official item['sex'] = sex item['level'] = int(level) - item['data'] = [{ + item['data'] = { 'fans': int(fans), 'attention': int(attention), 'archive_count': int(archive_count), 'article_count': int(article_count), 'datetime': datetime.datetime.now() - }] - yield item \ No newline at end of file + } + yield item diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index b1dadb4..69a62f8 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -18,7 +18,7 @@ class AuthorUpdate(scrapy.spiders.Spider): 'ITEM_PIPELINES': { 'biliob_spider.pipelines.AuthorPipeline': 300 }, - 'DOWNLOAD_DELAY' : 0.5 + 'DOWNLOAD_DELAY' : 1 } def __init__(self): @@ -59,16 +59,16 @@ def parse(self, response): item['official'] = official item['sex'] = sex item['level'] = int(level) - item['data'] = [{ + item['data'] = { 'fans': int(fans), 'attention': int(attention), 'archive_count': int(archive_count), 'article_count': int(article_count), 'datetime': datetime.datetime.now() - }] + } yield item except Exception as error: # 出现错误时打印错误日志 logging.error("视频爬虫在解析时发生错误") logging.error(response.url) - logging.error(error) \ No newline at end of file + logging.error(error) diff --git a/biliob_spider/spiders/online.py b/biliob_spider/spiders/online.py index 02a7bf0..b2bd139 100644 --- a/biliob_spider/spiders/online.py +++ b/biliob_spider/spiders/online.py @@ -30,11 +30,11 @@ def parse(self, response): item = VideoOnline() item['title'] = title_list[i] item['author'] = author_list[i] - item['data'] = [{'datetime':datetime.datetime.now(),'number':watch_list[i]}] + item['data'] = {'datetime':datetime.datetime.now(),'number':watch_list[i]} yield item except Exception as error: # 出现错误时打印错误日志 logging.error("视频爬虫在解析时发生错误") logging.error(response.url) - logging.error(error) \ No newline at end of file + logging.error(error) diff --git a/run.py b/run.py index f5829e1..0ea64b0 100644 --- a/run.py +++ b/run.py @@ -29,8 +29,12 @@ def update_author(): def auto_add_author(): Popen(["scrapy","crawl","authorAutoAdd"]) +def online(): + Popen(['scrapy','crawl','online']) + 
schedule.every().hour.do(update_author) -schedule.every().day.at('13:00').do(auto_add_author) +schedule.every().day.at('14:00').do(auto_add_author) +schedule.every().minute.do(online) logging.info('开始运行计划任务..') while True: From 5e7cd4ec51da56378f96abace7b76f067339da63 Mon Sep 17 00:00:00 2001 From: jannchie Date: Sat, 29 Sep 2018 19:37:17 +0800 Subject: [PATCH 012/469] =?UTF-8?q?DEBUG=EF=BC=9A=E4=BF=AE=E6=AD=A3?= =?UTF-8?q?=E4=BA=86=E6=95=B0=E6=8D=AE=E7=BB=93=E6=9E=84=E9=94=99=E8=AF=AF?= =?UTF-8?q?=20=E5=A2=9E=E5=8A=A0=E6=97=B6=E5=BB=B6=EF=BC=8C=E5=87=8F?= =?UTF-8?q?=E5=B0=91=E8=A2=ABBan=E5=87=A0=E7=8E=87=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/pipelines.py | 2 +- biliob_spider/settings.py | 2 +- biliob_spider/spiders/author_auto_add.py | 9 +++++---- biliob_spider/spiders/author_update.py | 8 ++++---- biliob_spider/spiders/online.py | 4 ++-- run.py | 6 +++++- 6 files changed, 18 insertions(+), 13 deletions(-) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 0f35af0..e00cb27 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -104,4 +104,4 @@ def process_item(self, item, spider): return item except Exception as error: # 出现错误时打印错误日志 - logging.error(error) \ No newline at end of file + logging.error(error) diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 477ab67..79aeeaf 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -102,4 +102,4 @@ # HTTPCACHE_IGNORE_HTTP_CODES = [] # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' DOWNLOAD_FAIL_ON_DATALOSS = True -RETRY_ENABLED = True \ No newline at end of file +RETRY_ENABLED = True diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index 601af0c..1fa5d31 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -15,7 +15,8 @@ class AuthorAutoAddSpider(scrapy.spiders.Spider): custom_settings = { 'ITEM_PIPELINES': { 'biliob_spider.pipelines.AuthorPipeline': 300 - } + }, + 'DOWNLOAD_DELAY' : 10 } def parse(self, response): @@ -57,11 +58,11 @@ def detailParse(self, response): item['official'] = official item['sex'] = sex item['level'] = int(level) - item['data'] = [{ + item['data'] = { 'fans': int(fans), 'attention': int(attention), 'archive_count': int(archive_count), 'article_count': int(article_count), 'datetime': datetime.datetime.now() - }] - yield item \ No newline at end of file + } + yield item diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index b1dadb4..69a62f8 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -18,7 +18,7 @@ class AuthorUpdate(scrapy.spiders.Spider): 'ITEM_PIPELINES': { 'biliob_spider.pipelines.AuthorPipeline': 300 }, - 'DOWNLOAD_DELAY' : 0.5 + 'DOWNLOAD_DELAY' : 1 } def __init__(self): @@ -59,16 +59,16 @@ def parse(self, response): item['official'] = official item['sex'] = sex item['level'] = int(level) - item['data'] = [{ + item['data'] = { 'fans': int(fans), 'attention': int(attention), 'archive_count': int(archive_count), 'article_count': int(article_count), 'datetime': datetime.datetime.now() - }] + } yield item except Exception as error: # 出现错误时打印错误日志 logging.error("视频爬虫在解析时发生错误") logging.error(response.url) - logging.error(error) \ No newline at end of file + logging.error(error) diff --git a/biliob_spider/spiders/online.py 
b/biliob_spider/spiders/online.py index 02a7bf0..b2bd139 100644 --- a/biliob_spider/spiders/online.py +++ b/biliob_spider/spiders/online.py @@ -30,11 +30,11 @@ def parse(self, response): item = VideoOnline() item['title'] = title_list[i] item['author'] = author_list[i] - item['data'] = [{'datetime':datetime.datetime.now(),'number':watch_list[i]}] + item['data'] = {'datetime':datetime.datetime.now(),'number':watch_list[i]} yield item except Exception as error: # 出现错误时打印错误日志 logging.error("视频爬虫在解析时发生错误") logging.error(response.url) - logging.error(error) \ No newline at end of file + logging.error(error) diff --git a/run.py b/run.py index f5829e1..0ea64b0 100644 --- a/run.py +++ b/run.py @@ -29,8 +29,12 @@ def update_author(): def auto_add_author(): Popen(["scrapy","crawl","authorAutoAdd"]) +def online(): + Popen(['scrapy','crawl','online']) + schedule.every().hour.do(update_author) -schedule.every().day.at('13:00').do(auto_add_author) +schedule.every().day.at('14:00').do(auto_add_author) +schedule.every().minute.do(online) logging.info('开始运行计划任务..') while True: From f39b061dd6428309fdcdf061c01f22551663cc75 Mon Sep 17 00:00:00 2001 From: jannchie Date: Sat, 29 Sep 2018 21:15:39 +0800 Subject: [PATCH 013/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E8=A7=86=E9=A2=91?= =?UTF-8?q?=E8=87=AA=E5=8A=A8=E8=BF=BD=E8=B8=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/items.py | 13 ++--- biliob_spider/pipelines.py | 58 ++++++++++++++++++--- biliob_spider/settings.py | 2 +- biliob_spider/spiders/author_auto_add.py | 8 +-- biliob_spider/spiders/author_update.py | 8 +-- biliob_spider/spiders/video.py | 66 +++++++++++++++--------- biliob_spider/spiders/video_watcher.py | 60 +++++++++++++++++++++ 7 files changed, 167 insertions(+), 48 deletions(-) create mode 100644 biliob_spider/spiders/video_watcher.py diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 71b396b..fdf09a0 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -13,13 +13,7 @@ class VideoItem(scrapy.Item): aid = scrapy.Field() datetime = scrapy.Field() author = scrapy.Field() - view = scrapy.Field() - favorite = scrapy.Field() - coin = scrapy.Field() - share = scrapy.Field() - like = scrapy.Field() - danmaku = scrapy.Field() - dislike = scrapy.Field() + data = scrapy.Field() subChannel = scrapy.Field() title = scrapy.Field() @@ -37,4 +31,7 @@ class VideoOnline(scrapy.Item): title = scrapy.Field() author = scrapy.Field() data = scrapy.Field() - +class VideoWatcherItem(scrapy.Item): + mid = scrapy.Field() + aid = scrapy.Field() + channels = scrapy.Field() \ No newline at end of file diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index e00cb27..f561352 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -29,16 +29,12 @@ def process_item(self, item, spider): "author": item['author'], "subChannel": item['subChannel'], "channel": item['channel'], - "view": int(item['view']), - "favorite": int(item['favorite']), - "coin": int(item['coin']), - "share": int(item['share']), - "like": int(item['like']), - "dislike": int(item['dislike']), - "danmaku": int(item['danmaku']), "title": item['title'], "datetime": datetime.datetime.fromtimestamp( item['datetime']) + }, + "$addToSet": { + 'data': item['data'] } }, True) return item @@ -105,3 +101,51 @@ def process_item(self, item, spider): except Exception as error: # 出现错误时打印错误日志 logging.error(error) + +class VideoAddPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = 
MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + "aid": item["aid"] + }, { + "$set": { + "aid": item['aid'] + }, + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) + +class AuthorChannelPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + "mid": item["mid"] + }, { + "$set": { + "channels": item['channels'] + }, + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 79aeeaf..5290297 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -12,7 +12,7 @@ import random LOG_FILE = "biliob_spider.log" -LOG_LEVEL = "INFO" +LOG_LEVEL = "DEBUG" BOT_NAME = 'biliob_spider' diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index 1fa5d31..b190e7e 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -48,8 +48,8 @@ def detailParse(self, response): attention = j['data']['card']['attention'] level = j['data']['card']['level_info']['current_level'] official = j['data']['card']['Official']['title'] - archive_count = j['data']['archive_count'] - article_count = j['data']['article_count'] + archive = j['data']['archive'] + article = j['data']['article'] face = j['data']['card']['face'] item = AuthorItem() item['mid'] = int(mid) @@ -61,8 +61,8 @@ def detailParse(self, response): item['data'] = { 'fans': int(fans), 'attention': int(attention), - 'archive_count': int(archive_count), - 'article_count': int(article_count), + 'archive': int(archive), + 'article': int(article), 'datetime': datetime.datetime.now() } yield item diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 69a62f8..10b0910 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -49,8 +49,8 @@ def parse(self, response): attention = j['data']['card']['attention'] level = j['data']['card']['level_info']['current_level'] official = j['data']['card']['Official']['title'] - archive_count = j['data']['archive_count'] - article_count = j['data']['article_count'] + archive = j['data']['archive'] + article = j['data']['article'] face = j['data']['card']['face'] item = AuthorItem() item['mid'] = int(mid) @@ -62,8 +62,8 @@ def parse(self, response): item['data'] = { 'fans': int(fans), 'attention': int(attention), - 'archive_count': int(archive_count), - 'article_count': int(article_count), + 'archive': int(archive), + 'article': int(article), 'datetime': datetime.datetime.now() } yield item diff --git a/biliob_spider/spiders/video.py b/biliob_spider/spiders/video.py index adcd7be..6fdf02f 100644 --- a/biliob_spider/spiders/video.py +++ b/biliob_spider/spiders/video.py @@ -2,10 +2,13 @@ import scrapy from scrapy.http import Request from biliob_spider.items import VideoItem +from datetime import datetime import time import json 
import logging from pymongo import MongoClient +from db import settings + sub_channel_2_channel = { 'ASMR': '生活', 'GMV': '游戏', @@ -108,19 +111,30 @@ class VideoSpider(scrapy.spiders.Spider): }, 'DOWNLOAD_DELAY' : 1 } - def __init__(self,start_aid=1,length=99999999,limit_view=50000, *args, **kwargs): - # super(HighSpeedVideoSpider2, self).__init__(*args, **kwargs) - print("开始的av号为:" + str(start_aid) + ",计划抓取的视频个数为:" + str(length)) - self.start_aid = int(start_aid) - self.length = int(length) - self.limit_view = limit_view + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + def start_requests(self): - i = (x for x in range(self.start_aid, self.start_aid + self.length)) - while True: - aid_str = '' - for j in range(100): - aid_str += str(next(i))+',' - yield Request("https://api.bilibili.com/x/article/archives?ids=" + aid_str.rstrip(',')) + c = self.coll.find() + aid_list = [] + for each_doc in c: + aid_list.append(each_doc['aid']) + i = 0 + while aid_list != []: + if i == 0: + aid_str = '' + aid_str += str(aid_list.pop())+',' + i = i+1 + if i == 100 or aid_list == []: + i = 0 + yield Request("https://api.bilibili.com/x/article/archives?ids=" + aid_str.rstrip(',')) + def parse(self, response): try: r = json.loads(response.body) @@ -136,23 +150,29 @@ def parse(self, response): share = d[each_key]['stat']['share'] like = d[each_key]['stat']['like'] dislike = d[each_key]['stat']['dislike'] + + data = { + 'view':int(view), + 'favorite':int(favorite), + 'danmaku':int(danmaku), + 'coin':int(coin), + 'share':int(share), + 'like':int(like), + 'dislike':int(dislike), + 'datetime': datetime.now() + } + subChannel = d[each_key]['tname'] title = d[each_key]['title'] - datetime = d[each_key]['pubdate'] + date = d[each_key]['pubdate'] tid = d[each_key]['tid'] item = VideoItem() item['aid'] = aid item['author'] = author - item['view'] = view - item['favorite'] = favorite - item['coin'] = coin - item['share'] = share - item['like'] = like - item['dislike'] = dislike - item['danmaku'] = danmaku + item['data'] = data item['title'] = title item['subChannel'] = subChannel - item['datetime'] = datetime + item['datetime'] = date if subChannel != '': item['channel'] = sub_channel_2_channel[subChannel] elif subChannel == '资讯': @@ -164,10 +184,8 @@ def parse(self, response): item['channel'] == '娱乐' else: item['channel'] = None + yield item - # 只收录大于limit_view的视频 - if view > self.limit_view: - yield item except Exception as error: # 出现错误时打印错误日志 if r['code'] == -404: diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py new file mode 100644 index 0000000..d818ee3 --- /dev/null +++ b/biliob_spider/spiders/video_watcher.py @@ -0,0 +1,60 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import VideoWatcherItem +import time +import json +import logging +from pymongo import MongoClient +import datetime +from db import settings + + +class VideoWatch(scrapy.spiders.Spider): + name = "videoWatcher" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.VideoAddPipeline': 300, + 'biliob_spider.pipelines.AuthorChannelPipeline': 301 + }, + 'DOWNLOAD_DELAY' : 1 + } + + def __init__(self): + # 链接mongoDB + self.client = 
MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + + def start_requests(self): + c = self.coll.find() + for each_doc in c: + yield Request( + 'http://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + + str(each_doc['mid'])+'&pagesize=1&page=1&order=pubdate', + method='GET') + + def parse(self, response): + try: + j = json.loads(response.body) + channels = j['data']['tlist'] + list_channel = [] + for each_channel in channels: + list_channel.append(channels[each_channel]) + aid = j['data']['vlist'][0]['aid'] + mid = j['data']['vlist'][0]['mid'] + item = VideoWatcherItem() + item['aid'] = int(aid) + item['channels'] = list_channel + item['mid'] = mid + yield item + except Exception as error: + # 出现错误时打印错误日志 + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) From 6e9305fa555e80db0b41fbc5c94577a35f874705 Mon Sep 17 00:00:00 2001 From: jannchie Date: Sat, 29 Sep 2018 21:15:39 +0800 Subject: [PATCH 014/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E8=A7=86=E9=A2=91?= =?UTF-8?q?=E8=87=AA=E5=8A=A8=E8=BF=BD=E8=B8=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/items.py | 13 ++--- biliob_spider/pipelines.py | 58 ++++++++++++++++++--- biliob_spider/settings.py | 2 +- biliob_spider/spiders/author_auto_add.py | 8 +-- biliob_spider/spiders/author_update.py | 8 +-- biliob_spider/spiders/video.py | 66 +++++++++++++++--------- biliob_spider/spiders/video_watcher.py | 60 +++++++++++++++++++++ 7 files changed, 167 insertions(+), 48 deletions(-) create mode 100644 biliob_spider/spiders/video_watcher.py diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 71b396b..fdf09a0 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -13,13 +13,7 @@ class VideoItem(scrapy.Item): aid = scrapy.Field() datetime = scrapy.Field() author = scrapy.Field() - view = scrapy.Field() - favorite = scrapy.Field() - coin = scrapy.Field() - share = scrapy.Field() - like = scrapy.Field() - danmaku = scrapy.Field() - dislike = scrapy.Field() + data = scrapy.Field() subChannel = scrapy.Field() title = scrapy.Field() @@ -37,4 +31,7 @@ class VideoOnline(scrapy.Item): title = scrapy.Field() author = scrapy.Field() data = scrapy.Field() - +class VideoWatcherItem(scrapy.Item): + mid = scrapy.Field() + aid = scrapy.Field() + channels = scrapy.Field() \ No newline at end of file diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index e00cb27..f561352 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -29,16 +29,12 @@ def process_item(self, item, spider): "author": item['author'], "subChannel": item['subChannel'], "channel": item['channel'], - "view": int(item['view']), - "favorite": int(item['favorite']), - "coin": int(item['coin']), - "share": int(item['share']), - "like": int(item['like']), - "dislike": int(item['dislike']), - "danmaku": int(item['danmaku']), "title": item['title'], "datetime": datetime.datetime.fromtimestamp( item['datetime']) + }, + "$addToSet": { + 'data': item['data'] } }, True) return item @@ -105,3 +101,51 @@ def process_item(self, item, spider): except Exception as error: # 出现错误时打印错误日志 logging.error(error) + +class VideoAddPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + 
self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + "aid": item["aid"] + }, { + "$set": { + "aid": item['aid'] + }, + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) + +class AuthorChannelPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + "mid": item["mid"] + }, { + "$set": { + "channels": item['channels'] + }, + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 79aeeaf..5290297 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -12,7 +12,7 @@ import random LOG_FILE = "biliob_spider.log" -LOG_LEVEL = "INFO" +LOG_LEVEL = "DEBUG" BOT_NAME = 'biliob_spider' diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index 1fa5d31..b190e7e 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -48,8 +48,8 @@ def detailParse(self, response): attention = j['data']['card']['attention'] level = j['data']['card']['level_info']['current_level'] official = j['data']['card']['Official']['title'] - archive_count = j['data']['archive_count'] - article_count = j['data']['article_count'] + archive = j['data']['archive'] + article = j['data']['article'] face = j['data']['card']['face'] item = AuthorItem() item['mid'] = int(mid) @@ -61,8 +61,8 @@ def detailParse(self, response): item['data'] = { 'fans': int(fans), 'attention': int(attention), - 'archive_count': int(archive_count), - 'article_count': int(article_count), + 'archive': int(archive), + 'article': int(article), 'datetime': datetime.datetime.now() } yield item diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 69a62f8..10b0910 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -49,8 +49,8 @@ def parse(self, response): attention = j['data']['card']['attention'] level = j['data']['card']['level_info']['current_level'] official = j['data']['card']['Official']['title'] - archive_count = j['data']['archive_count'] - article_count = j['data']['article_count'] + archive = j['data']['archive'] + article = j['data']['article'] face = j['data']['card']['face'] item = AuthorItem() item['mid'] = int(mid) @@ -62,8 +62,8 @@ def parse(self, response): item['data'] = { 'fans': int(fans), 'attention': int(attention), - 'archive_count': int(archive_count), - 'article_count': int(article_count), + 'archive': int(archive), + 'article': int(article), 'datetime': datetime.datetime.now() } yield item diff --git a/biliob_spider/spiders/video.py b/biliob_spider/spiders/video.py index adcd7be..6fdf02f 100644 --- a/biliob_spider/spiders/video.py +++ b/biliob_spider/spiders/video.py @@ -2,10 +2,13 @@ import scrapy from scrapy.http import Request from biliob_spider.items import VideoItem +from datetime import datetime import time import json import logging from pymongo import MongoClient +from db 
import settings + sub_channel_2_channel = { 'ASMR': '生活', 'GMV': '游戏', @@ -108,19 +111,30 @@ class VideoSpider(scrapy.spiders.Spider): }, 'DOWNLOAD_DELAY' : 1 } - def __init__(self,start_aid=1,length=99999999,limit_view=50000, *args, **kwargs): - # super(HighSpeedVideoSpider2, self).__init__(*args, **kwargs) - print("开始的av号为:" + str(start_aid) + ",计划抓取的视频个数为:" + str(length)) - self.start_aid = int(start_aid) - self.length = int(length) - self.limit_view = limit_view + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + def start_requests(self): - i = (x for x in range(self.start_aid, self.start_aid + self.length)) - while True: - aid_str = '' - for j in range(100): - aid_str += str(next(i))+',' - yield Request("https://api.bilibili.com/x/article/archives?ids=" + aid_str.rstrip(',')) + c = self.coll.find() + aid_list = [] + for each_doc in c: + aid_list.append(each_doc['aid']) + i = 0 + while aid_list != []: + if i == 0: + aid_str = '' + aid_str += str(aid_list.pop())+',' + i = i+1 + if i == 100 or aid_list == []: + i = 0 + yield Request("https://api.bilibili.com/x/article/archives?ids=" + aid_str.rstrip(',')) + def parse(self, response): try: r = json.loads(response.body) @@ -136,23 +150,29 @@ def parse(self, response): share = d[each_key]['stat']['share'] like = d[each_key]['stat']['like'] dislike = d[each_key]['stat']['dislike'] + + data = { + 'view':int(view), + 'favorite':int(favorite), + 'danmaku':int(danmaku), + 'coin':int(coin), + 'share':int(share), + 'like':int(like), + 'dislike':int(dislike), + 'datetime': datetime.now() + } + subChannel = d[each_key]['tname'] title = d[each_key]['title'] - datetime = d[each_key]['pubdate'] + date = d[each_key]['pubdate'] tid = d[each_key]['tid'] item = VideoItem() item['aid'] = aid item['author'] = author - item['view'] = view - item['favorite'] = favorite - item['coin'] = coin - item['share'] = share - item['like'] = like - item['dislike'] = dislike - item['danmaku'] = danmaku + item['data'] = data item['title'] = title item['subChannel'] = subChannel - item['datetime'] = datetime + item['datetime'] = date if subChannel != '': item['channel'] = sub_channel_2_channel[subChannel] elif subChannel == '资讯': @@ -164,10 +184,8 @@ def parse(self, response): item['channel'] == '娱乐' else: item['channel'] = None + yield item - # 只收录大于limit_view的视频 - if view > self.limit_view: - yield item except Exception as error: # 出现错误时打印错误日志 if r['code'] == -404: diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py new file mode 100644 index 0000000..d818ee3 --- /dev/null +++ b/biliob_spider/spiders/video_watcher.py @@ -0,0 +1,60 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import VideoWatcherItem +import time +import json +import logging +from pymongo import MongoClient +import datetime +from db import settings + + +class VideoWatch(scrapy.spiders.Spider): + name = "videoWatcher" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.VideoAddPipeline': 300, + 'biliob_spider.pipelines.AuthorChannelPipeline': 301 + }, + 'DOWNLOAD_DELAY' : 1 + } + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + 
self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + + def start_requests(self): + c = self.coll.find() + for each_doc in c: + yield Request( + 'http://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + + str(each_doc['mid'])+'&pagesize=1&page=1&order=pubdate', + method='GET') + + def parse(self, response): + try: + j = json.loads(response.body) + channels = j['data']['tlist'] + list_channel = [] + for each_channel in channels: + list_channel.append(channels[each_channel]) + aid = j['data']['vlist'][0]['aid'] + mid = j['data']['vlist'][0]['mid'] + item = VideoWatcherItem() + item['aid'] = int(aid) + item['channels'] = list_channel + item['mid'] = mid + yield item + except Exception as error: + # 出现错误时打印错误日志 + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) From c1951ac833c44c0b99a080bc75244deeabd35107 Mon Sep 17 00:00:00 2001 From: jannchie Date: Sat, 29 Sep 2018 21:46:55 +0800 Subject: [PATCH 015/469] =?UTF-8?q?=E6=94=B9=E4=B8=AA=E5=90=8D=E5=AD=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/spiders/{video.py => video_spider.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename biliob_spider/spiders/{video.py => video_spider.py} (100%) diff --git a/biliob_spider/spiders/video.py b/biliob_spider/spiders/video_spider.py similarity index 100% rename from biliob_spider/spiders/video.py rename to biliob_spider/spiders/video_spider.py From 9224b0e881bf6bf02847855c12a7beda80958f42 Mon Sep 17 00:00:00 2001 From: jannchie Date: Sat, 29 Sep 2018 21:46:55 +0800 Subject: [PATCH 016/469] =?UTF-8?q?=E6=94=B9=E4=B8=AA=E5=90=8D=E5=AD=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/spiders/{video.py => video_spider.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename biliob_spider/spiders/{video.py => video_spider.py} (100%) diff --git a/biliob_spider/spiders/video.py b/biliob_spider/spiders/video_spider.py similarity index 100% rename from biliob_spider/spiders/video.py rename to biliob_spider/spiders/video_spider.py From 221cfa46112665ebcc6d6c2dc030a2416b9217d7 Mon Sep 17 00:00:00 2001 From: jannchie Date: Sat, 29 Sep 2018 21:47:12 +0800 Subject: [PATCH 017/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BA=86=E8=A7=86?= =?UTF-8?q?=E9=A2=91=E7=9B=B8=E5=85=B3=E7=9A=84=E8=AE=A1=E5=88=92=E4=BB=BB?= =?UTF-8?q?=E5=8A=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- run.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/run.py b/run.py index 0ea64b0..e7b804c 100644 --- a/run.py +++ b/run.py @@ -5,6 +5,8 @@ import time from subprocess import Popen import logging +import threading + # 第一步,创建一个logger logger = logging.getLogger() @@ -29,12 +31,24 @@ def update_author(): def auto_add_author(): Popen(["scrapy","crawl","authorAutoAdd"]) +def video_watcher(): + Popen(["scrapy","crawl","videoWatcher"]) + +def video_spider(): + Popen(["scrapy","crawl","videoSpider"]) + def online(): Popen(['scrapy','crawl','online']) -schedule.every().hour.do(update_author) -schedule.every().day.at('14:00').do(auto_add_author) -schedule.every().minute.do(online) +def run_threaded(job_func): + job_thread = threading.Thread(target=job_func) + job_thread.start() + +schedule.every().hour.do(run_threaded,update_author) 
+schedule.every().hour.do(run_threaded,video_watcher) +schedule.every(2).hours.do(run_threaded,video_spider) +schedule.every().day.at('14:00').do(run_threaded,auto_add_author) +schedule.every().minute.do(run_threaded,online) logging.info('开始运行计划任务..') while True: From f9268fc58a76fdb3ca453ac2873f924ca4f20c0c Mon Sep 17 00:00:00 2001 From: jannchie Date: Sat, 29 Sep 2018 21:47:12 +0800 Subject: [PATCH 018/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BA=86=E8=A7=86?= =?UTF-8?q?=E9=A2=91=E7=9B=B8=E5=85=B3=E7=9A=84=E8=AE=A1=E5=88=92=E4=BB=BB?= =?UTF-8?q?=E5=8A=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- run.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/run.py b/run.py index 0ea64b0..e7b804c 100644 --- a/run.py +++ b/run.py @@ -5,6 +5,8 @@ import time from subprocess import Popen import logging +import threading + # 第一步,创建一个logger logger = logging.getLogger() @@ -29,12 +31,24 @@ def update_author(): def auto_add_author(): Popen(["scrapy","crawl","authorAutoAdd"]) +def video_watcher(): + Popen(["scrapy","crawl","videoWatcher"]) + +def video_spider(): + Popen(["scrapy","crawl","videoSpider"]) + def online(): Popen(['scrapy','crawl','online']) -schedule.every().hour.do(update_author) -schedule.every().day.at('14:00').do(auto_add_author) -schedule.every().minute.do(online) +def run_threaded(job_func): + job_thread = threading.Thread(target=job_func) + job_thread.start() + +schedule.every().hour.do(run_threaded,update_author) +schedule.every().hour.do(run_threaded,video_watcher) +schedule.every(2).hours.do(run_threaded,video_spider) +schedule.every().day.at('14:00').do(run_threaded,auto_add_author) +schedule.every().minute.do(run_threaded,online) logging.info('开始运行计划任务..') while True: From e9d96e11223479e75fc8304947aaf183a7acf495 Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 1 Oct 2018 20:55:20 +0800 Subject: [PATCH 019/469] =?UTF-8?q?=E4=BF=AE=E6=94=B9video=E7=88=AC?= =?UTF-8?q?=E8=99=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/settings.py | 2 +- biliob_spider/spiders/author_auto_add.py | 4 ++-- biliob_spider/spiders/author_update.py | 4 ++-- biliob_spider/spiders/video.py | 0 biliob_spider/spiders/video_spider.py | 21 +++++++++++---------- 5 files changed, 16 insertions(+), 15 deletions(-) create mode 100644 biliob_spider/spiders/video.py diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 5290297..80ba4ad 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -12,7 +12,7 @@ import random LOG_FILE = "biliob_spider.log" -LOG_LEVEL = "DEBUG" +LOG_LEVEL = "ERROR" BOT_NAME = 'biliob_spider' diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index b190e7e..350f5b4 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -48,8 +48,8 @@ def detailParse(self, response): attention = j['data']['card']['attention'] level = j['data']['card']['level_info']['current_level'] official = j['data']['card']['Official']['title'] - archive = j['data']['archive'] - article = j['data']['article'] + archive = j['data']['archive_count'] + article = j['data']['article_count'] face = j['data']['card']['face'] item = AuthorItem() item['mid'] = int(mid) diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 10b0910..557cb6f 100644 --- a/biliob_spider/spiders/author_update.py +++ 
b/biliob_spider/spiders/author_update.py @@ -49,8 +49,8 @@ def parse(self, response): attention = j['data']['card']['attention'] level = j['data']['card']['level_info']['current_level'] official = j['data']['card']['Official']['title'] - archive = j['data']['archive'] - article = j['data']['article'] + archive = j['data']['archive_count'] + article = j['data']['article_count'] face = j['data']['card']['face'] item = AuthorItem() item['mid'] = int(mid) diff --git a/biliob_spider/spiders/video.py b/biliob_spider/spiders/video.py new file mode 100644 index 0000000..e69de29 diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index 6fdf02f..34ab4a1 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -1,4 +1,4 @@ -#coding=utf-8 +# coding=utf-8 import scrapy from scrapy.http import Request from biliob_spider.items import VideoItem @@ -141,6 +141,7 @@ def parse(self, response): d = r["data"] keys = list(d.keys()) for each_key in keys: + aid = d[each_key]['stat']['aid'] author = d[each_key]['owner']['name'] view = d[each_key]['stat']['view'] @@ -161,7 +162,7 @@ def parse(self, response): 'dislike':int(dislike), 'datetime': datetime.now() } - + subChannel = d[each_key]['tname'] title = d[each_key]['title'] date = d[each_key]['pubdate'] @@ -173,15 +174,16 @@ def parse(self, response): item['title'] = title item['subChannel'] = subChannel item['datetime'] = date - if subChannel != '': - item['channel'] = sub_channel_2_channel[subChannel] - elif subChannel == '资讯': + if subChannel.encode('utf-8') != '': + item['channel'] = sub_channel_2_channel[subChannel.encode('utf-8')] + + elif subChannel.encode('utf-8') == '资讯': if tid == 51: - item['channel'] == '番剧' + item['channel'] == u'番剧' if tid == 170: - item['channel'] == '国创' + item['channel'] == u'国创' if tid == 159: - item['channel'] == '娱乐' + item['channel'] == u'娱乐' else: item['channel'] = None yield item @@ -190,8 +192,7 @@ def parse(self, response): # 出现错误时打印错误日志 if r['code'] == -404: return - logging.error("视频爬虫在解析时发生错误") + logging.error(u"视频爬虫在解析时发生错误") logging.error(item) logging.error(response.url) logging.error(error) -# scrapy crawl VideoTagSpider -a start_aid=26053983 -a length=2000000 -s JOBDIR=tag-07-21 -L INFO From 53d9659b3922f5d07b35f7352d727f9b7df6829f Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 1 Oct 2018 20:55:20 +0800 Subject: [PATCH 020/469] =?UTF-8?q?=E4=BF=AE=E6=94=B9video=E7=88=AC?= =?UTF-8?q?=E8=99=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/settings.py | 2 +- biliob_spider/spiders/author_auto_add.py | 4 ++-- biliob_spider/spiders/author_update.py | 4 ++-- biliob_spider/spiders/video.py | 0 biliob_spider/spiders/video_spider.py | 21 +++++++++++---------- 5 files changed, 16 insertions(+), 15 deletions(-) create mode 100644 biliob_spider/spiders/video.py diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 5290297..80ba4ad 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -12,7 +12,7 @@ import random LOG_FILE = "biliob_spider.log" -LOG_LEVEL = "DEBUG" +LOG_LEVEL = "ERROR" BOT_NAME = 'biliob_spider' diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index b190e7e..350f5b4 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -48,8 +48,8 @@ def detailParse(self, response): attention = j['data']['card']['attention'] level = 
j['data']['card']['level_info']['current_level'] official = j['data']['card']['Official']['title'] - archive = j['data']['archive'] - article = j['data']['article'] + archive = j['data']['archive_count'] + article = j['data']['article_count'] face = j['data']['card']['face'] item = AuthorItem() item['mid'] = int(mid) diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 10b0910..557cb6f 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -49,8 +49,8 @@ def parse(self, response): attention = j['data']['card']['attention'] level = j['data']['card']['level_info']['current_level'] official = j['data']['card']['Official']['title'] - archive = j['data']['archive'] - article = j['data']['article'] + archive = j['data']['archive_count'] + article = j['data']['article_count'] face = j['data']['card']['face'] item = AuthorItem() item['mid'] = int(mid) diff --git a/biliob_spider/spiders/video.py b/biliob_spider/spiders/video.py new file mode 100644 index 0000000..e69de29 diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index 6fdf02f..34ab4a1 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -1,4 +1,4 @@ -#coding=utf-8 +# coding=utf-8 import scrapy from scrapy.http import Request from biliob_spider.items import VideoItem @@ -141,6 +141,7 @@ def parse(self, response): d = r["data"] keys = list(d.keys()) for each_key in keys: + aid = d[each_key]['stat']['aid'] author = d[each_key]['owner']['name'] view = d[each_key]['stat']['view'] @@ -161,7 +162,7 @@ def parse(self, response): 'dislike':int(dislike), 'datetime': datetime.now() } - + subChannel = d[each_key]['tname'] title = d[each_key]['title'] date = d[each_key]['pubdate'] @@ -173,15 +174,16 @@ def parse(self, response): item['title'] = title item['subChannel'] = subChannel item['datetime'] = date - if subChannel != '': - item['channel'] = sub_channel_2_channel[subChannel] - elif subChannel == '资讯': + if subChannel.encode('utf-8') != '': + item['channel'] = sub_channel_2_channel[subChannel.encode('utf-8')] + + elif subChannel.encode('utf-8') == '资讯': if tid == 51: - item['channel'] == '番剧' + item['channel'] == u'番剧' if tid == 170: - item['channel'] == '国创' + item['channel'] == u'国创' if tid == 159: - item['channel'] == '娱乐' + item['channel'] == u'娱乐' else: item['channel'] = None yield item @@ -190,8 +192,7 @@ def parse(self, response): # 出现错误时打印错误日志 if r['code'] == -404: return - logging.error("视频爬虫在解析时发生错误") + logging.error(u"视频爬虫在解析时发生错误") logging.error(item) logging.error(response.url) logging.error(error) -# scrapy crawl VideoTagSpider -a start_aid=26053983 -a length=2000000 -s JOBDIR=tag-07-21 -L INFO From 5c4aaf3e3f3318efdafb8e23d9d9e507d68f693a Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 1 Oct 2018 20:59:55 +0800 Subject: [PATCH 021/469] =?UTF-8?q?=E5=88=A0=E9=99=A4=E5=AD=97=E5=85=B8?= =?UTF-8?q?=E5=86=97=E4=BD=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/spiders/video_spider.py | 1 - 1 file changed, 1 deletion(-) diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index 6fdf02f..00beedf 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -98,7 +98,6 @@ '生活': '生活', '音乐': '音乐', '纪录片': '纪录片', - '生活': '生活', '游戏': '游戏' } class VideoSpider(scrapy.spiders.Spider): From 32b3368808b24e50a2e363c626f003b694c3daa8 Mon 
Sep 17 00:00:00 2001 From: jannchie Date: Mon, 1 Oct 2018 20:59:55 +0800 Subject: [PATCH 022/469] =?UTF-8?q?=E5=88=A0=E9=99=A4=E5=AD=97=E5=85=B8?= =?UTF-8?q?=E5=86=97=E4=BD=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/spiders/video_spider.py | 1 - 1 file changed, 1 deletion(-) diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index 6fdf02f..00beedf 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -98,7 +98,6 @@ '生活': '生活', '音乐': '音乐', '纪录片': '纪录片', - '生活': '生活', '游戏': '游戏' } class VideoSpider(scrapy.spiders.Spider): From 50f68f88921df947379c9a26e1e51a43698b165a Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 1 Oct 2018 21:10:53 +0800 Subject: [PATCH 023/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0pic=E5=92=8Cmid?= =?UTF-8?q?=E7=9A=84=E7=88=AC=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/items.py | 2 ++ biliob_spider/pipelines.py | 2 ++ biliob_spider/spiders/video_spider.py | 5 +++++ 3 files changed, 9 insertions(+) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index fdf09a0..23bff2d 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -16,6 +16,8 @@ class VideoItem(scrapy.Item): data = scrapy.Field() subChannel = scrapy.Field() title = scrapy.Field() + mid = scrapy.Field() + pic = scrapy.Field() class AuthorItem(scrapy.Item): diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index f561352..022c68a 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -29,6 +29,8 @@ def process_item(self, item, spider): "author": item['author'], "subChannel": item['subChannel'], "channel": item['channel'], + "mid": item['mid'], + "pic": item['pic'], "title": item['title'], "datetime": datetime.datetime.fromtimestamp( item['datetime']) diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index 01c8a60..be7c3a2 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -143,6 +143,7 @@ def parse(self, response): aid = d[each_key]['stat']['aid'] author = d[each_key]['owner']['name'] + mid = d[each_key]['owner']['mid'] view = d[each_key]['stat']['view'] favorite = d[each_key]['stat']['favorite'] danmaku = favorite = d[each_key]['stat']['danmaku'] @@ -166,13 +167,17 @@ def parse(self, response): title = d[each_key]['title'] date = d[each_key]['pubdate'] tid = d[each_key]['tid'] + pic = d[each_key]['pic'] item = VideoItem() item['aid'] = aid + item['mid'] = mid + item['pic'] = pic item['author'] = author item['data'] = data item['title'] = title item['subChannel'] = subChannel item['datetime'] = date + if subChannel.encode('utf-8') != '': item['channel'] = sub_channel_2_channel[subChannel.encode('utf-8')] From 210cda8996aff76c0dea5c5a09553db19e2dfc93 Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 1 Oct 2018 21:10:53 +0800 Subject: [PATCH 024/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0pic=E5=92=8Cmid?= =?UTF-8?q?=E7=9A=84=E7=88=AC=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/items.py | 2 ++ biliob_spider/pipelines.py | 2 ++ biliob_spider/spiders/video_spider.py | 5 +++++ 3 files changed, 9 insertions(+) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index fdf09a0..23bff2d 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -16,6 +16,8 @@ class 
VideoItem(scrapy.Item): data = scrapy.Field() subChannel = scrapy.Field() title = scrapy.Field() + mid = scrapy.Field() + pic = scrapy.Field() class AuthorItem(scrapy.Item): diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index f561352..022c68a 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -29,6 +29,8 @@ def process_item(self, item, spider): "author": item['author'], "subChannel": item['subChannel'], "channel": item['channel'], + "mid": item['mid'], + "pic": item['pic'], "title": item['title'], "datetime": datetime.datetime.fromtimestamp( item['datetime']) diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index 01c8a60..be7c3a2 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -143,6 +143,7 @@ def parse(self, response): aid = d[each_key]['stat']['aid'] author = d[each_key]['owner']['name'] + mid = d[each_key]['owner']['mid'] view = d[each_key]['stat']['view'] favorite = d[each_key]['stat']['favorite'] danmaku = favorite = d[each_key]['stat']['danmaku'] @@ -166,13 +167,17 @@ def parse(self, response): title = d[each_key]['title'] date = d[each_key]['pubdate'] tid = d[each_key]['tid'] + pic = d[each_key]['pic'] item = VideoItem() item['aid'] = aid + item['mid'] = mid + item['pic'] = pic item['author'] = author item['data'] = data item['title'] = title item['subChannel'] = subChannel item['datetime'] = date + if subChannel.encode('utf-8') != '': item['channel'] = sub_channel_2_channel[subChannel.encode('utf-8')] From ae61caec4c3d7c711f9eef164db74ae76da7f31e Mon Sep 17 00:00:00 2001 From: jannchie Date: Sat, 13 Oct 2018 13:50:10 +0800 Subject: [PATCH 025/469] =?UTF-8?q?online=E8=BF=BD=E5=8A=A0=E8=BF=BD?= =?UTF-8?q?=E8=B8=AA=E5=88=86=E5=8C=BA=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/items.py | 3 +++ biliob_spider/pipelines.py | 3 +++ biliob_spider/spiders/online.py | 31 +++++++++++++++++++++++++++++-- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 23bff2d..05d267c 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -33,6 +33,9 @@ class VideoOnline(scrapy.Item): title = scrapy.Field() author = scrapy.Field() data = scrapy.Field() + aid = scrapy.Field() + subChannel = scrapy.Field() + channel = scrapy.Field() class VideoWatcherItem(scrapy.Item): mid = scrapy.Field() aid = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 022c68a..66a68f1 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -88,12 +88,15 @@ def __init__(self): def process_item(self, item, spider): try: + self.coll.update_one({ "title": item["title"] }, { "$set": { "title": item['title'], "author": item['author'], + "channel": item['channel'], + "subChannel": item['subChannel'], }, "$addToSet": { 'data': item['data'] diff --git a/biliob_spider/spiders/online.py b/biliob_spider/spiders/online.py index b2bd139..a18e125 100644 --- a/biliob_spider/spiders/online.py +++ b/biliob_spider/spiders/online.py @@ -26,15 +26,42 @@ def parse(self, response): title_list = video_list.xpath('./a/p/text()').extract() watch_list = video_list.xpath('./p/b/text()').extract() author_list = video_list.xpath('./div[1]/a/text()').extract() + href_list = video_list.xpath('./a/@href').extract() for i in range(len(title_list)): item = VideoOnline() item['title'] = title_list[i] 
item['author'] = author_list[i] item['data'] = {'datetime':datetime.datetime.now(),'number':watch_list[i]} - yield item - + item['aid'] = href_list[i][9:-1] + # 为了爬取分区等数据,需要进入每一个视频的详情页面进行抓取 + yield Request( + "https://www.bilibili.com" + href_list[i], + meta={'item': item}, + callback=self.detailParse) except Exception as error: # 出现错误时打印错误日志 logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) + + def detailParse(self,response): + try: + item = response.meta['item'] + c = response.xpath("//span[@class='crumb'][2]/a/text()").extract() + if c != []: + item['channel'] = response.xpath("//span[@class='crumb'][2]/a/text()").extract()[0] + else: + item['channel'] = '番剧' + + c = response.xpath("//span[@class='crumb'][3]/a/text()").extract() + if c != []: + item['subChannel'] = response.xpath("//span[@class='crumb'][3]/a/text()").extract()[0] + else: + item['subChannel'] = '番剧' + + yield item + except Exception as error: + # 出现错误时打印错误日志 + logging.error("视频爬虫在解析细节时发生错误") + logging.error(response.url) + logging.error(error) From 5174edb79e3ab736067898f50a88ad63489fa93c Mon Sep 17 00:00:00 2001 From: jannchie Date: Sat, 13 Oct 2018 13:50:10 +0800 Subject: [PATCH 026/469] =?UTF-8?q?online=E8=BF=BD=E5=8A=A0=E8=BF=BD?= =?UTF-8?q?=E8=B8=AA=E5=88=86=E5=8C=BA=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/items.py | 3 +++ biliob_spider/pipelines.py | 3 +++ biliob_spider/spiders/online.py | 31 +++++++++++++++++++++++++++++-- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 23bff2d..05d267c 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -33,6 +33,9 @@ class VideoOnline(scrapy.Item): title = scrapy.Field() author = scrapy.Field() data = scrapy.Field() + aid = scrapy.Field() + subChannel = scrapy.Field() + channel = scrapy.Field() class VideoWatcherItem(scrapy.Item): mid = scrapy.Field() aid = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 022c68a..66a68f1 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -88,12 +88,15 @@ def __init__(self): def process_item(self, item, spider): try: + self.coll.update_one({ "title": item["title"] }, { "$set": { "title": item['title'], "author": item['author'], + "channel": item['channel'], + "subChannel": item['subChannel'], }, "$addToSet": { 'data': item['data'] diff --git a/biliob_spider/spiders/online.py b/biliob_spider/spiders/online.py index b2bd139..a18e125 100644 --- a/biliob_spider/spiders/online.py +++ b/biliob_spider/spiders/online.py @@ -26,15 +26,42 @@ def parse(self, response): title_list = video_list.xpath('./a/p/text()').extract() watch_list = video_list.xpath('./p/b/text()').extract() author_list = video_list.xpath('./div[1]/a/text()').extract() + href_list = video_list.xpath('./a/@href').extract() for i in range(len(title_list)): item = VideoOnline() item['title'] = title_list[i] item['author'] = author_list[i] item['data'] = {'datetime':datetime.datetime.now(),'number':watch_list[i]} - yield item - + item['aid'] = href_list[i][9:-1] + # 为了爬取分区等数据,需要进入每一个视频的详情页面进行抓取 + yield Request( + "https://www.bilibili.com" + href_list[i], + meta={'item': item}, + callback=self.detailParse) except Exception as error: # 出现错误时打印错误日志 logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) + + def detailParse(self,response): + try: + item = response.meta['item'] + c = 
response.xpath("//span[@class='crumb'][2]/a/text()").extract() + if c != []: + item['channel'] = response.xpath("//span[@class='crumb'][2]/a/text()").extract()[0] + else: + item['channel'] = '番剧' + + c = response.xpath("//span[@class='crumb'][3]/a/text()").extract() + if c != []: + item['subChannel'] = response.xpath("//span[@class='crumb'][3]/a/text()").extract()[0] + else: + item['subChannel'] = '番剧' + + yield item + except Exception as error: + # 出现错误时打印错误日志 + logging.error("视频爬虫在解析细节时发生错误") + logging.error(response.url) + logging.error(error) From 107938d0ba439acd132c3d9d0ea23ff4701b5a96 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 18 Oct 2018 19:03:05 +0800 Subject: [PATCH 027/469] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a61d1c1..68ee905 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,2 @@ -# biliob +# biliob-spider bilibili data acquisition and analysis. From 24a9cd7ba834031a84bb565c6e656d0b0bf4a9fc Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 18 Oct 2018 19:03:05 +0800 Subject: [PATCH 028/469] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a61d1c1..68ee905 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,2 @@ -# biliob +# biliob-spider bilibili data acquisition and analysis. From 6f3f4bf7ee87b2a59e72bebb42bd8b758fb0da03 Mon Sep 17 00:00:00 2001 From: jannchie Date: Thu, 18 Oct 2018 19:13:32 +0800 Subject: [PATCH 029/469] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dbug=EF=BC=8C=E5=85=A8?= =?UTF-8?q?=E9=9D=A2=E6=94=AF=E6=8C=81python3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/settings.py | 4 +- biliob_spider/spiders/author_auto_add.py | 3 +- biliob_spider/spiders/author_update.py | 8 +-- biliob_spider/spiders/online.py | 16 ++++-- biliob_spider/spiders/video_spider.py | 64 +++++++++++++++--------- biliob_spider/spiders/video_watcher.py | 8 +-- 6 files changed, 63 insertions(+), 40 deletions(-) diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 80ba4ad..4981fb1 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -11,8 +11,8 @@ import random -LOG_FILE = "biliob_spider.log" -LOG_LEVEL = "ERROR" +# LOG_FILE = "biliob_spider.log" +# LOG_LEVEL = "ERROR" BOT_NAME = 'biliob_spider' diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index 350f5b4..258f4ea 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -8,6 +8,7 @@ from pymongo import MongoClient import datetime + class AuthorAutoAddSpider(scrapy.spiders.Spider): name = "authorAutoAdd" allowed_domains = ["bilibili.com"] @@ -16,7 +17,7 @@ class AuthorAutoAddSpider(scrapy.spiders.Spider): 'ITEM_PIPELINES': { 'biliob_spider.pipelines.AuthorPipeline': 300 }, - 'DOWNLOAD_DELAY' : 10 + 'DOWNLOAD_DELAY': 10 } def parse(self, response): diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 557cb6f..3c044f8 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -18,7 +18,7 @@ class AuthorUpdate(scrapy.spiders.Spider): 'ITEM_PIPELINES': { 'biliob_spider.pipelines.AuthorPipeline': 300 }, - 'DOWNLOAD_DELAY' : 1 + 'DOWNLOAD_DELAY': 1 } def __init__(self): @@ -31,11 +31,11 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find() + c = 
self.coll.find({}, {"mid": 1}) for each_doc in c: yield Request( - "https://api.bilibili.com/x/web-interface/card?mid=" + - str(each_doc['mid']), + "https://api.bilibili.com/x/web-interface/card?mid=" + str( + each_doc['mid']), method='GET') def parse(self, response): diff --git a/biliob_spider/spiders/online.py b/biliob_spider/spiders/online.py index a18e125..ed6e35d 100644 --- a/biliob_spider/spiders/online.py +++ b/biliob_spider/spiders/online.py @@ -8,6 +8,7 @@ from pymongo import MongoClient import datetime + class OnlineSpider(scrapy.spiders.Spider): name = "online" allowed_domains = ["bilibili.com"] @@ -31,7 +32,10 @@ def parse(self, response): item = VideoOnline() item['title'] = title_list[i] item['author'] = author_list[i] - item['data'] = {'datetime':datetime.datetime.now(),'number':watch_list[i]} + item['data'] = { + 'datetime': datetime.datetime.now(), + 'number': watch_list[i] + } item['aid'] = href_list[i][9:-1] # 为了爬取分区等数据,需要进入每一个视频的详情页面进行抓取 yield Request( @@ -44,20 +48,22 @@ def parse(self, response): logging.error(response.url) logging.error(error) - def detailParse(self,response): + def detailParse(self, response): try: item = response.meta['item'] c = response.xpath("//span[@class='crumb'][2]/a/text()").extract() if c != []: - item['channel'] = response.xpath("//span[@class='crumb'][2]/a/text()").extract()[0] + item['channel'] = response.xpath( + "//span[@class='crumb'][2]/a/text()").extract()[0] else: item['channel'] = '番剧' c = response.xpath("//span[@class='crumb'][3]/a/text()").extract() if c != []: - item['subChannel'] = response.xpath("//span[@class='crumb'][3]/a/text()").extract()[0] + item['subChannel'] = response.xpath( + "//span[@class='crumb'][3]/a/text()").extract()[0] else: - item['subChannel'] = '番剧' + item['subChannel'] = '番剧' yield item except Exception as error: diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index be7c3a2..07e865b 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -12,7 +12,7 @@ sub_channel_2_channel = { 'ASMR': '生活', 'GMV': '游戏', - 'Korea相关': '娱乐', + 'Korea相关': '娱乐', 'MAD·AMV': '动画', 'MMD·3D': '动画', 'Mugen': '游戏', @@ -100,6 +100,8 @@ '纪录片': '纪录片', '游戏': '游戏' } + + class VideoSpider(scrapy.spiders.Spider): name = "videoSpider" allowed_domains = ["bilibili.com"] @@ -108,8 +110,9 @@ class VideoSpider(scrapy.spiders.Spider): 'ITEM_PIPELINES': { 'biliob_spider.pipelines.VideoPipeline': 300, }, - 'DOWNLOAD_DELAY' : 1 + 'DOWNLOAD_DELAY': 1 } + def __init__(self): # 链接mongoDB self.client = MongoClient(settings['MINGO_HOST'], 27017) @@ -120,27 +123,39 @@ def __init__(self): self.coll = self.db['video'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find() + # 只需要aid + c = self.coll.find({}, {'aid': 1}) + + x = 0 + aid_list = [] for each_doc in c: + + print(x) + x = x + 1 + aid_list.append(each_doc['aid']) i = 0 while aid_list != []: if i == 0: aid_str = '' - aid_str += str(aid_list.pop())+',' - i = i+1 + aid_str += str(aid_list.pop()) + ',' + i = i + 1 if i == 100 or aid_list == []: i = 0 - yield Request("https://api.bilibili.com/x/article/archives?ids=" + aid_str.rstrip(',')) - + print('yield') + yield Request( + "https://api.bilibili.com/x/article/archives?ids=" + + aid_str.rstrip(',')) + def parse(self, response): try: + print('parse') r = json.loads(response.body) d = r["data"] keys = list(d.keys()) for each_key in keys: - + aid = d[each_key]['stat']['aid'] author = d[each_key]['owner']['name'] mid = d[each_key]['owner']['mid'] @@ -153,16 
+168,16 @@ def parse(self, response): dislike = d[each_key]['stat']['dislike'] data = { - 'view':int(view), - 'favorite':int(favorite), - 'danmaku':int(danmaku), - 'coin':int(coin), - 'share':int(share), - 'like':int(like), - 'dislike':int(dislike), + 'view': int(view), + 'favorite': int(favorite), + 'danmaku': int(danmaku), + 'coin': int(coin), + 'share': int(share), + 'like': int(like), + 'dislike': int(dislike), 'datetime': datetime.now() } - + subChannel = d[each_key]['tname'] title = d[each_key]['title'] date = d[each_key]['pubdate'] @@ -177,26 +192,25 @@ def parse(self, response): item['title'] = title item['subChannel'] = subChannel item['datetime'] = date - - if subChannel.encode('utf-8') != '': - item['channel'] = sub_channel_2_channel[subChannel.encode('utf-8')] - elif subChannel.encode('utf-8') == '资讯': + if subChannel != '': + item['channel'] = sub_channel_2_channel[subChannel] + elif subChannel == '资讯': if tid == 51: - item['channel'] == u'番剧' + item['channel'] == '番剧' if tid == 170: - item['channel'] == u'国创' + item['channel'] == '国创' if tid == 159: - item['channel'] == u'娱乐' + item['channel'] == '娱乐' else: item['channel'] = None yield item - + except Exception as error: # 出现错误时打印错误日志 if r['code'] == -404: return - logging.error(u"视频爬虫在解析时发生错误") + logging.error("视频爬虫在解析时发生错误") logging.error(item) logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index d818ee3..059977a 100644 --- a/biliob_spider/spiders/video_watcher.py +++ b/biliob_spider/spiders/video_watcher.py @@ -19,7 +19,7 @@ class VideoWatch(scrapy.spiders.Spider): 'biliob_spider.pipelines.VideoAddPipeline': 300, 'biliob_spider.pipelines.AuthorChannelPipeline': 301 }, - 'DOWNLOAD_DELAY' : 1 + 'DOWNLOAD_DELAY': 1 } def __init__(self): @@ -32,16 +32,18 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find() + c = self.coll.find({}, {'mid': 1}) for each_doc in c: yield Request( 'http://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + - str(each_doc['mid'])+'&pagesize=1&page=1&order=pubdate', + str(each_doc['mid']) + '&pagesize=1&page=1&order=pubdate', method='GET') def parse(self, response): try: j = json.loads(response.body) + if len(j['data']['vlist']) == 0: + return channels = j['data']['tlist'] list_channel = [] for each_channel in channels: From 0c65ecf869004d8bae949ddacd2130be302d3042 Mon Sep 17 00:00:00 2001 From: jannchie Date: Thu, 18 Oct 2018 19:13:32 +0800 Subject: [PATCH 030/469] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dbug=EF=BC=8C=E5=85=A8?= =?UTF-8?q?=E9=9D=A2=E6=94=AF=E6=8C=81python3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/settings.py | 4 +- biliob_spider/spiders/author_auto_add.py | 3 +- biliob_spider/spiders/author_update.py | 8 +-- biliob_spider/spiders/online.py | 16 ++++-- biliob_spider/spiders/video_spider.py | 64 +++++++++++++++--------- biliob_spider/spiders/video_watcher.py | 8 +-- 6 files changed, 63 insertions(+), 40 deletions(-) diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 80ba4ad..4981fb1 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -11,8 +11,8 @@ import random -LOG_FILE = "biliob_spider.log" -LOG_LEVEL = "ERROR" +# LOG_FILE = "biliob_spider.log" +# LOG_LEVEL = "ERROR" BOT_NAME = 'biliob_spider' diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index 350f5b4..258f4ea 100644 --- 
a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -8,6 +8,7 @@ from pymongo import MongoClient import datetime + class AuthorAutoAddSpider(scrapy.spiders.Spider): name = "authorAutoAdd" allowed_domains = ["bilibili.com"] @@ -16,7 +17,7 @@ class AuthorAutoAddSpider(scrapy.spiders.Spider): 'ITEM_PIPELINES': { 'biliob_spider.pipelines.AuthorPipeline': 300 }, - 'DOWNLOAD_DELAY' : 10 + 'DOWNLOAD_DELAY': 10 } def parse(self, response): diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 557cb6f..3c044f8 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -18,7 +18,7 @@ class AuthorUpdate(scrapy.spiders.Spider): 'ITEM_PIPELINES': { 'biliob_spider.pipelines.AuthorPipeline': 300 }, - 'DOWNLOAD_DELAY' : 1 + 'DOWNLOAD_DELAY': 1 } def __init__(self): @@ -31,11 +31,11 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find() + c = self.coll.find({}, {"mid": 1}) for each_doc in c: yield Request( - "https://api.bilibili.com/x/web-interface/card?mid=" + - str(each_doc['mid']), + "https://api.bilibili.com/x/web-interface/card?mid=" + str( + each_doc['mid']), method='GET') def parse(self, response): diff --git a/biliob_spider/spiders/online.py b/biliob_spider/spiders/online.py index a18e125..ed6e35d 100644 --- a/biliob_spider/spiders/online.py +++ b/biliob_spider/spiders/online.py @@ -8,6 +8,7 @@ from pymongo import MongoClient import datetime + class OnlineSpider(scrapy.spiders.Spider): name = "online" allowed_domains = ["bilibili.com"] @@ -31,7 +32,10 @@ def parse(self, response): item = VideoOnline() item['title'] = title_list[i] item['author'] = author_list[i] - item['data'] = {'datetime':datetime.datetime.now(),'number':watch_list[i]} + item['data'] = { + 'datetime': datetime.datetime.now(), + 'number': watch_list[i] + } item['aid'] = href_list[i][9:-1] # 为了爬取分区等数据,需要进入每一个视频的详情页面进行抓取 yield Request( @@ -44,20 +48,22 @@ def parse(self, response): logging.error(response.url) logging.error(error) - def detailParse(self,response): + def detailParse(self, response): try: item = response.meta['item'] c = response.xpath("//span[@class='crumb'][2]/a/text()").extract() if c != []: - item['channel'] = response.xpath("//span[@class='crumb'][2]/a/text()").extract()[0] + item['channel'] = response.xpath( + "//span[@class='crumb'][2]/a/text()").extract()[0] else: item['channel'] = '番剧' c = response.xpath("//span[@class='crumb'][3]/a/text()").extract() if c != []: - item['subChannel'] = response.xpath("//span[@class='crumb'][3]/a/text()").extract()[0] + item['subChannel'] = response.xpath( + "//span[@class='crumb'][3]/a/text()").extract()[0] else: - item['subChannel'] = '番剧' + item['subChannel'] = '番剧' yield item except Exception as error: diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index be7c3a2..07e865b 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -12,7 +12,7 @@ sub_channel_2_channel = { 'ASMR': '生活', 'GMV': '游戏', - 'Korea相关': '娱乐', + 'Korea相关': '娱乐', 'MAD·AMV': '动画', 'MMD·3D': '动画', 'Mugen': '游戏', @@ -100,6 +100,8 @@ '纪录片': '纪录片', '游戏': '游戏' } + + class VideoSpider(scrapy.spiders.Spider): name = "videoSpider" allowed_domains = ["bilibili.com"] @@ -108,8 +110,9 @@ class VideoSpider(scrapy.spiders.Spider): 'ITEM_PIPELINES': { 'biliob_spider.pipelines.VideoPipeline': 300, }, - 'DOWNLOAD_DELAY' : 1 + 'DOWNLOAD_DELAY': 1 } + 
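The start_requests logic that follows reads every aid from the video collection and queries the archives endpoint in comma-joined batches of 100 ids. The batching idea, separated from the spider plumbing, can be sketched as below (the endpoint and batch size come from the diff; the aid source and example values are illustrative):

# Sketch of the aid batching used in start_requests: ids are chunked and
# joined with commas, one archives request per chunk of at most 100 ids.
def batched_archive_urls(aid_list, batch_size=100):
    base = "https://api.bilibili.com/x/article/archives?ids="
    for start in range(0, len(aid_list), batch_size):
        chunk = aid_list[start:start + batch_size]
        yield base + ",".join(str(aid) for aid in chunk)

# aid_list would come from self.coll.find({}, {'aid': 1}) as in the spider
urls = list(batched_archive_urls([10001, 10002, 10003]))
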
def __init__(self): # 链接mongoDB self.client = MongoClient(settings['MINGO_HOST'], 27017) @@ -120,27 +123,39 @@ def __init__(self): self.coll = self.db['video'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find() + # 只需要aid + c = self.coll.find({}, {'aid': 1}) + + x = 0 + aid_list = [] for each_doc in c: + + print(x) + x = x + 1 + aid_list.append(each_doc['aid']) i = 0 while aid_list != []: if i == 0: aid_str = '' - aid_str += str(aid_list.pop())+',' - i = i+1 + aid_str += str(aid_list.pop()) + ',' + i = i + 1 if i == 100 or aid_list == []: i = 0 - yield Request("https://api.bilibili.com/x/article/archives?ids=" + aid_str.rstrip(',')) - + print('yield') + yield Request( + "https://api.bilibili.com/x/article/archives?ids=" + + aid_str.rstrip(',')) + def parse(self, response): try: + print('parse') r = json.loads(response.body) d = r["data"] keys = list(d.keys()) for each_key in keys: - + aid = d[each_key]['stat']['aid'] author = d[each_key]['owner']['name'] mid = d[each_key]['owner']['mid'] @@ -153,16 +168,16 @@ def parse(self, response): dislike = d[each_key]['stat']['dislike'] data = { - 'view':int(view), - 'favorite':int(favorite), - 'danmaku':int(danmaku), - 'coin':int(coin), - 'share':int(share), - 'like':int(like), - 'dislike':int(dislike), + 'view': int(view), + 'favorite': int(favorite), + 'danmaku': int(danmaku), + 'coin': int(coin), + 'share': int(share), + 'like': int(like), + 'dislike': int(dislike), 'datetime': datetime.now() } - + subChannel = d[each_key]['tname'] title = d[each_key]['title'] date = d[each_key]['pubdate'] @@ -177,26 +192,25 @@ def parse(self, response): item['title'] = title item['subChannel'] = subChannel item['datetime'] = date - - if subChannel.encode('utf-8') != '': - item['channel'] = sub_channel_2_channel[subChannel.encode('utf-8')] - elif subChannel.encode('utf-8') == '资讯': + if subChannel != '': + item['channel'] = sub_channel_2_channel[subChannel] + elif subChannel == '资讯': if tid == 51: - item['channel'] == u'番剧' + item['channel'] == '番剧' if tid == 170: - item['channel'] == u'国创' + item['channel'] == '国创' if tid == 159: - item['channel'] == u'娱乐' + item['channel'] == '娱乐' else: item['channel'] = None yield item - + except Exception as error: # 出现错误时打印错误日志 if r['code'] == -404: return - logging.error(u"视频爬虫在解析时发生错误") + logging.error("视频爬虫在解析时发生错误") logging.error(item) logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index d818ee3..059977a 100644 --- a/biliob_spider/spiders/video_watcher.py +++ b/biliob_spider/spiders/video_watcher.py @@ -19,7 +19,7 @@ class VideoWatch(scrapy.spiders.Spider): 'biliob_spider.pipelines.VideoAddPipeline': 300, 'biliob_spider.pipelines.AuthorChannelPipeline': 301 }, - 'DOWNLOAD_DELAY' : 1 + 'DOWNLOAD_DELAY': 1 } def __init__(self): @@ -32,16 +32,18 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find() + c = self.coll.find({}, {'mid': 1}) for each_doc in c: yield Request( 'http://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + - str(each_doc['mid'])+'&pagesize=1&page=1&order=pubdate', + str(each_doc['mid']) + '&pagesize=1&page=1&order=pubdate', method='GET') def parse(self, response): try: j = json.loads(response.body) + if len(j['data']['vlist']) == 0: + return channels = j['data']['tlist'] list_channel = [] for each_channel in channels: From 6d14a4e2976de434e23eb90d239ced42efc19dbf Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 19 Oct 2018 
15:59:42 +0800 Subject: [PATCH 031/469] =?UTF-8?q?=E4=BF=AE=E6=94=B9run=E6=96=B9=E6=B3=95?= =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E6=94=B9settings=E7=9A=84=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E8=BE=93=E5=87=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/settings.py | 4 ++-- run.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 4981fb1..80ba4ad 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -11,8 +11,8 @@ import random -# LOG_FILE = "biliob_spider.log" -# LOG_LEVEL = "ERROR" +LOG_FILE = "biliob_spider.log" +LOG_LEVEL = "ERROR" BOT_NAME = 'biliob_spider' diff --git a/run.py b/run.py index e7b804c..e9ea027 100644 --- a/run.py +++ b/run.py @@ -44,9 +44,9 @@ def run_threaded(job_func): job_thread = threading.Thread(target=job_func) job_thread.start() -schedule.every().hour.do(run_threaded,update_author) -schedule.every().hour.do(run_threaded,video_watcher) -schedule.every(2).hours.do(run_threaded,video_spider) +schedule.every().day.at('01:00').do(run_threaded,update_author) +schedule.every(120).minutes.do(run_threaded,video_watcher) +schedule.every().day.at('07:00').do(run_threaded,video_spider) schedule.every().day.at('14:00').do(run_threaded,auto_add_author) schedule.every().minute.do(run_threaded,online) @@ -54,3 +54,4 @@ def run_threaded(job_func): while True: schedule.run_pending() time.sleep(60) + From 1c5a5866ce9713f32caf347ae5d94ec6bec5d2a7 Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 19 Oct 2018 15:59:42 +0800 Subject: [PATCH 032/469] =?UTF-8?q?=E4=BF=AE=E6=94=B9run=E6=96=B9=E6=B3=95?= =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E6=94=B9settings=E7=9A=84=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E8=BE=93=E5=87=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/settings.py | 4 ++-- run.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 4981fb1..80ba4ad 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -11,8 +11,8 @@ import random -# LOG_FILE = "biliob_spider.log" -# LOG_LEVEL = "ERROR" +LOG_FILE = "biliob_spider.log" +LOG_LEVEL = "ERROR" BOT_NAME = 'biliob_spider' diff --git a/run.py b/run.py index e7b804c..e9ea027 100644 --- a/run.py +++ b/run.py @@ -44,9 +44,9 @@ def run_threaded(job_func): job_thread = threading.Thread(target=job_func) job_thread.start() -schedule.every().hour.do(run_threaded,update_author) -schedule.every().hour.do(run_threaded,video_watcher) -schedule.every(2).hours.do(run_threaded,video_spider) +schedule.every().day.at('01:00').do(run_threaded,update_author) +schedule.every(120).minutes.do(run_threaded,video_watcher) +schedule.every().day.at('07:00').do(run_threaded,video_spider) schedule.every().day.at('14:00').do(run_threaded,auto_add_author) schedule.every().minute.do(run_threaded,online) @@ -54,3 +54,4 @@ def run_threaded(job_func): while True: schedule.run_pending() time.sleep(60) + From 497253d0749754ac475ad7838174eb15b22c8443 Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 19 Oct 2018 16:10:36 +0800 Subject: [PATCH 033/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=AF=B9=E7=95=AA?= =?UTF-8?q?=E5=89=A7=E7=9A=84=E8=BF=BD=E8=B8=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + biliob_spider/items.py | 4 +++ biliob_spider/pipelines.py | 27 +++++++++++++++- biliob_spider/spiders/bangumi.py | 
53 ++++++++++++++++++++++++++++++++ run.py | 4 +++ 5 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 biliob_spider/spiders/bangumi.py diff --git a/.gitignore b/.gitignore index 89e235f..f6806e5 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ db.py nohup.out biliob_spider.log +debug.py \ No newline at end of file diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 05d267c..0881f68 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -7,6 +7,10 @@ import scrapy +class BangumiItem(scrapy.Item): + title = scrapy.Field() + data = scrapy.Field() + class VideoItem(scrapy.Item): channel = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 66a68f1..686e2c7 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -9,7 +9,6 @@ import datetime import logging - class VideoPipeline(object): def __init__(self): # 链接mongoDB @@ -44,6 +43,32 @@ def process_item(self, item, spider): # 出现错误时打印错误日志 logging.error(error) +class BangumiPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['bangumi'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + "title": item['title'] + }, { + "$set": { + "title": item['title'], + }, + "$addToSet": { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) class AuthorPipeline(object): def __init__(self): diff --git a/biliob_spider/spiders/bangumi.py b/biliob_spider/spiders/bangumi.py new file mode 100644 index 0000000..4856344 --- /dev/null +++ b/biliob_spider/spiders/bangumi.py @@ -0,0 +1,53 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import BangumiItem +import time +import datetime + + +class BangumiSpider(scrapy.spiders.Spider): + name = "bangumi" + allowed_domains = ["bilibili.com"] + start_urls = ["https://www.bilibili.com/ranking/bangumi/13/0/7"] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.BangumiPipeLine': 200 + } + } + + def parse(self, response): + detail_href = response.xpath("//div[@class='img']/a/@href").extract() + + pts = response.xpath("//div[@class='pts']/div/text()").extract() + for (each_href, each_pts) in zip(detail_href, pts): + yield Request( + "https:" + each_href, + meta={'pts': each_pts}, + callback=self.detail_parse) + + def detail_parse(self, response): + pts = response.meta['pts'] + play = response.xpath( + '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[1]/em/text()' + ).extract()[0] + watch = response.xpath( + '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[2]/em/text()' + ).extract()[0] + danmaku = response.xpath( + '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[3]/em/text()' + ).extract()[0] + title = response.xpath( + '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()' + ).extract()[0] + data = { + 'danmaku': danmaku, + 'watch': watch, + 'play': play, + 'pts': int(pts), + 'datetime': datetime.datetime.now() + } + item = BangumiItem() + item['title'] = title + item['data'] = data + yield item \ No newline at end of file diff --git a/run.py b/run.py index e9ea027..7cbdbdd 100644 --- a/run.py +++ b/run.py @@ -25,6 +25,9 @@ # 第四步,将logger添加到handler里面 logger.addHandler(fh) +def bangumi(): + 
Popen(["scrapy","crawl","bangumi"]) + def update_author(): Popen(["scrapy","crawl","authorUpdate"]) @@ -48,6 +51,7 @@ def run_threaded(job_func): schedule.every(120).minutes.do(run_threaded,video_watcher) schedule.every().day.at('07:00').do(run_threaded,video_spider) schedule.every().day.at('14:00').do(run_threaded,auto_add_author) +schedule.every().day.at('16:30').do(run_threaded,bangumi) schedule.every().minute.do(run_threaded,online) logging.info('开始运行计划任务..') From c552d08ce5ba6f61fee23028a4ee8391d1b86ec7 Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 19 Oct 2018 16:10:36 +0800 Subject: [PATCH 034/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=AF=B9=E7=95=AA?= =?UTF-8?q?=E5=89=A7=E7=9A=84=E8=BF=BD=E8=B8=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + biliob_spider/items.py | 4 +++ biliob_spider/pipelines.py | 27 +++++++++++++++- biliob_spider/spiders/bangumi.py | 53 ++++++++++++++++++++++++++++++++ run.py | 4 +++ 5 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 biliob_spider/spiders/bangumi.py diff --git a/.gitignore b/.gitignore index 89e235f..f6806e5 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ db.py nohup.out biliob_spider.log +debug.py \ No newline at end of file diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 05d267c..0881f68 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -7,6 +7,10 @@ import scrapy +class BangumiItem(scrapy.Item): + title = scrapy.Field() + data = scrapy.Field() + class VideoItem(scrapy.Item): channel = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 66a68f1..686e2c7 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -9,7 +9,6 @@ import datetime import logging - class VideoPipeline(object): def __init__(self): # 链接mongoDB @@ -44,6 +43,32 @@ def process_item(self, item, spider): # 出现错误时打印错误日志 logging.error(error) +class BangumiPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['bangumi'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + "title": item['title'] + }, { + "$set": { + "title": item['title'], + }, + "$addToSet": { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) class AuthorPipeline(object): def __init__(self): diff --git a/biliob_spider/spiders/bangumi.py b/biliob_spider/spiders/bangumi.py new file mode 100644 index 0000000..4856344 --- /dev/null +++ b/biliob_spider/spiders/bangumi.py @@ -0,0 +1,53 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import BangumiItem +import time +import datetime + + +class BangumiSpider(scrapy.spiders.Spider): + name = "bangumi" + allowed_domains = ["bilibili.com"] + start_urls = ["https://www.bilibili.com/ranking/bangumi/13/0/7"] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.BangumiPipeLine': 200 + } + } + + def parse(self, response): + detail_href = response.xpath("//div[@class='img']/a/@href").extract() + + pts = response.xpath("//div[@class='pts']/div/text()").extract() + for (each_href, each_pts) in zip(detail_href, pts): + yield Request( + "https:" + each_href, + meta={'pts': each_pts}, + callback=self.detail_parse) 
+ + def detail_parse(self, response): + pts = response.meta['pts'] + play = response.xpath( + '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[1]/em/text()' + ).extract()[0] + watch = response.xpath( + '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[2]/em/text()' + ).extract()[0] + danmaku = response.xpath( + '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[3]/em/text()' + ).extract()[0] + title = response.xpath( + '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()' + ).extract()[0] + data = { + 'danmaku': danmaku, + 'watch': watch, + 'play': play, + 'pts': int(pts), + 'datetime': datetime.datetime.now() + } + item = BangumiItem() + item['title'] = title + item['data'] = data + yield item \ No newline at end of file diff --git a/run.py b/run.py index e9ea027..7cbdbdd 100644 --- a/run.py +++ b/run.py @@ -25,6 +25,9 @@ # 第四步,将logger添加到handler里面 logger.addHandler(fh) +def bangumi(): + Popen(["scrapy","crawl","bangumi"]) + def update_author(): Popen(["scrapy","crawl","authorUpdate"]) @@ -48,6 +51,7 @@ def run_threaded(job_func): schedule.every(120).minutes.do(run_threaded,video_watcher) schedule.every().day.at('07:00').do(run_threaded,video_spider) schedule.every().day.at('14:00').do(run_threaded,auto_add_author) +schedule.every().day.at('16:30').do(run_threaded,bangumi) schedule.every().minute.do(run_threaded,online) logging.info('开始运行计划任务..') From eb7feb0b6cf3738be7af21cebeb274017a8ec3db Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 19 Oct 2018 16:17:42 +0800 Subject: [PATCH 035/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=8A=A8=E7=94=BB?= =?UTF-8?q?=E7=9A=84=E8=BF=BD=E8=B8=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/pipelines.py | 27 ++++++++++++++++ biliob_spider/spiders/donghua.py | 53 ++++++++++++++++++++++++++++++++ run.py | 4 +++ 3 files changed, 84 insertions(+) create mode 100644 biliob_spider/spiders/donghua.py diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 686e2c7..7c14cb5 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -70,6 +70,33 @@ def process_item(self, item, spider): # 出现错误时打印错误日志 logging.error(error) +class DonghuaPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['donghua'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + "title": item['title'] + }, { + "$set": { + "title": item['title'], + }, + "$addToSet": { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) + class AuthorPipeline(object): def __init__(self): # 链接mongoDB diff --git a/biliob_spider/spiders/donghua.py b/biliob_spider/spiders/donghua.py new file mode 100644 index 0000000..be6e450 --- /dev/null +++ b/biliob_spider/spiders/donghua.py @@ -0,0 +1,53 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import BangumiItem +import time +import datetime + + +class DonghuaSpider(scrapy.spiders.Spider): + name = "donghua" + allowed_domains = ["bilibili.com"] + start_urls = ["https://www.bilibili.com/ranking/bangumi/167/0/7"] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.DonghuaPipeLine': 200 + } + } + + def parse(self, response): + detail_href = 
response.xpath("//div[@class='img']/a/@href").extract() + + pts = response.xpath("//div[@class='pts']/div/text()").extract() + for (each_href, each_pts) in zip(detail_href, pts): + yield Request( + "https:" + each_href, + meta={'pts': each_pts}, + callback=self.detail_parse) + + def detail_parse(self, response): + pts = response.meta['pts'] + play = response.xpath( + '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[1]/em/text()' + ).extract()[0] + watch = response.xpath( + '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[2]/em/text()' + ).extract()[0] + danmaku = response.xpath( + '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[3]/em/text()' + ).extract()[0] + title = response.xpath( + '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()' + ).extract()[0] + data = { + 'danmaku': danmaku, + 'watch': watch, + 'play': play, + 'pts': int(pts), + 'datetime': datetime.datetime.now() + } + item = BangumiItem() + item['title'] = title + item['data'] = data + yield item \ No newline at end of file diff --git a/run.py b/run.py index 7cbdbdd..3b9ffcf 100644 --- a/run.py +++ b/run.py @@ -28,6 +28,9 @@ def bangumi(): Popen(["scrapy","crawl","bangumi"]) +def donghua(): + Popen(["scrapy","crawl","donghua"]) + def update_author(): Popen(["scrapy","crawl","authorUpdate"]) @@ -52,6 +55,7 @@ def run_threaded(job_func): schedule.every().day.at('07:00').do(run_threaded,video_spider) schedule.every().day.at('14:00').do(run_threaded,auto_add_author) schedule.every().day.at('16:30').do(run_threaded,bangumi) +schedule.every().day.at('16:30').do(run_threaded,donghua) schedule.every().minute.do(run_threaded,online) logging.info('开始运行计划任务..') From 9a189e567207d361c0842dfcdac379570c306d82 Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 19 Oct 2018 16:17:42 +0800 Subject: [PATCH 036/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=8A=A8=E7=94=BB?= =?UTF-8?q?=E7=9A=84=E8=BF=BD=E8=B8=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/pipelines.py | 27 ++++++++++++++++ biliob_spider/spiders/donghua.py | 53 ++++++++++++++++++++++++++++++++ run.py | 4 +++ 3 files changed, 84 insertions(+) create mode 100644 biliob_spider/spiders/donghua.py diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 686e2c7..7c14cb5 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -70,6 +70,33 @@ def process_item(self, item, spider): # 出现错误时打印错误日志 logging.error(error) +class DonghuaPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['donghua'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + "title": item['title'] + }, { + "$set": { + "title": item['title'], + }, + "$addToSet": { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) + class AuthorPipeline(object): def __init__(self): # 链接mongoDB diff --git a/biliob_spider/spiders/donghua.py b/biliob_spider/spiders/donghua.py new file mode 100644 index 0000000..be6e450 --- /dev/null +++ b/biliob_spider/spiders/donghua.py @@ -0,0 +1,53 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import BangumiItem +import time +import datetime + + +class DonghuaSpider(scrapy.spiders.Spider): + 
name = "donghua" + allowed_domains = ["bilibili.com"] + start_urls = ["https://www.bilibili.com/ranking/bangumi/167/0/7"] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.DonghuaPipeLine': 200 + } + } + + def parse(self, response): + detail_href = response.xpath("//div[@class='img']/a/@href").extract() + + pts = response.xpath("//div[@class='pts']/div/text()").extract() + for (each_href, each_pts) in zip(detail_href, pts): + yield Request( + "https:" + each_href, + meta={'pts': each_pts}, + callback=self.detail_parse) + + def detail_parse(self, response): + pts = response.meta['pts'] + play = response.xpath( + '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[1]/em/text()' + ).extract()[0] + watch = response.xpath( + '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[2]/em/text()' + ).extract()[0] + danmaku = response.xpath( + '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[3]/em/text()' + ).extract()[0] + title = response.xpath( + '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()' + ).extract()[0] + data = { + 'danmaku': danmaku, + 'watch': watch, + 'play': play, + 'pts': int(pts), + 'datetime': datetime.datetime.now() + } + item = BangumiItem() + item['title'] = title + item['data'] = data + yield item \ No newline at end of file diff --git a/run.py b/run.py index 7cbdbdd..3b9ffcf 100644 --- a/run.py +++ b/run.py @@ -28,6 +28,9 @@ def bangumi(): Popen(["scrapy","crawl","bangumi"]) +def donghua(): + Popen(["scrapy","crawl","donghua"]) + def update_author(): Popen(["scrapy","crawl","authorUpdate"]) @@ -52,6 +55,7 @@ def run_threaded(job_func): schedule.every().day.at('07:00').do(run_threaded,video_spider) schedule.every().day.at('14:00').do(run_threaded,auto_add_author) schedule.every().day.at('16:30').do(run_threaded,bangumi) +schedule.every().day.at('16:30').do(run_threaded,donghua) schedule.every().minute.do(run_threaded,online) logging.info('开始运行计划任务..') From 897845e770aaf41fa260f39eaf897ea3755b136f Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 19 Oct 2018 16:26:00 +0800 Subject: [PATCH 037/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E7=95=AA=E5=89=A7?= =?UTF-8?q?=E3=80=81=E5=8A=A8=E7=94=BB=E7=9A=84tag=E8=BF=BD=E8=B8=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/items.py | 1 + biliob_spider/pipelines.py | 2 ++ biliob_spider/spiders/bangumi.py | 2 ++ biliob_spider/spiders/donghua.py | 2 ++ 4 files changed, 7 insertions(+) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 0881f68..909a529 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -9,6 +9,7 @@ class BangumiItem(scrapy.Item): title = scrapy.Field() + tag = scrapy.Field() data = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 7c14cb5..f6f416d 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -59,6 +59,7 @@ def process_item(self, item, spider): "title": item['title'] }, { "$set": { + 'tag':item['tag'], "title": item['title'], }, "$addToSet": { @@ -86,6 +87,7 @@ def process_item(self, item, spider): "title": item['title'] }, { "$set": { + 'tag':item['tag'], "title": item['title'], }, "$addToSet": { diff --git a/biliob_spider/spiders/bangumi.py b/biliob_spider/spiders/bangumi.py index 4856344..5cd4da7 100644 --- a/biliob_spider/spiders/bangumi.py +++ b/biliob_spider/spiders/bangumi.py @@ -40,6 +40,7 @@ def detail_parse(self, response): title = response.xpath( 
'//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()' ).extract()[0] + tag = response.xpath('//span[@class="media-tag"]/text()').extract() data = { 'danmaku': danmaku, 'watch': watch, @@ -48,6 +49,7 @@ def detail_parse(self, response): 'datetime': datetime.datetime.now() } item = BangumiItem() + item['tag'] = tag item['title'] = title item['data'] = data yield item \ No newline at end of file diff --git a/biliob_spider/spiders/donghua.py b/biliob_spider/spiders/donghua.py index be6e450..0f856f1 100644 --- a/biliob_spider/spiders/donghua.py +++ b/biliob_spider/spiders/donghua.py @@ -40,6 +40,7 @@ def detail_parse(self, response): title = response.xpath( '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()' ).extract()[0] + tag = response.xpath('//span[@class="media-tag"]/text()').extract() data = { 'danmaku': danmaku, 'watch': watch, @@ -48,6 +49,7 @@ def detail_parse(self, response): 'datetime': datetime.datetime.now() } item = BangumiItem() + item['tag'] = tag item['title'] = title item['data'] = data yield item \ No newline at end of file From 7ea4fdf7a48a1181087bb93486d76758e2d84dbc Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 19 Oct 2018 16:26:00 +0800 Subject: [PATCH 038/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E7=95=AA=E5=89=A7?= =?UTF-8?q?=E3=80=81=E5=8A=A8=E7=94=BB=E7=9A=84tag=E8=BF=BD=E8=B8=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/items.py | 1 + biliob_spider/pipelines.py | 2 ++ biliob_spider/spiders/bangumi.py | 2 ++ biliob_spider/spiders/donghua.py | 2 ++ 4 files changed, 7 insertions(+) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 0881f68..909a529 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -9,6 +9,7 @@ class BangumiItem(scrapy.Item): title = scrapy.Field() + tag = scrapy.Field() data = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 7c14cb5..f6f416d 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -59,6 +59,7 @@ def process_item(self, item, spider): "title": item['title'] }, { "$set": { + 'tag':item['tag'], "title": item['title'], }, "$addToSet": { @@ -86,6 +87,7 @@ def process_item(self, item, spider): "title": item['title'] }, { "$set": { + 'tag':item['tag'], "title": item['title'], }, "$addToSet": { diff --git a/biliob_spider/spiders/bangumi.py b/biliob_spider/spiders/bangumi.py index 4856344..5cd4da7 100644 --- a/biliob_spider/spiders/bangumi.py +++ b/biliob_spider/spiders/bangumi.py @@ -40,6 +40,7 @@ def detail_parse(self, response): title = response.xpath( '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()' ).extract()[0] + tag = response.xpath('//span[@class="media-tag"]/text()').extract() data = { 'danmaku': danmaku, 'watch': watch, @@ -48,6 +49,7 @@ def detail_parse(self, response): 'datetime': datetime.datetime.now() } item = BangumiItem() + item['tag'] = tag item['title'] = title item['data'] = data yield item \ No newline at end of file diff --git a/biliob_spider/spiders/donghua.py b/biliob_spider/spiders/donghua.py index be6e450..0f856f1 100644 --- a/biliob_spider/spiders/donghua.py +++ b/biliob_spider/spiders/donghua.py @@ -40,6 +40,7 @@ def detail_parse(self, response): title = response.xpath( '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()' ).extract()[0] + tag = response.xpath('//span[@class="media-tag"]/text()').extract() data = { 'danmaku': danmaku, 'watch': watch, @@ -48,6 +49,7 @@ def detail_parse(self, response): 'datetime': 
datetime.datetime.now() } item = BangumiItem() + item['tag'] = tag item['title'] = title item['data'] = data yield item \ No newline at end of file From 0964691ec97d53986a3a48af35fa6dd1f34f51de Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 19 Oct 2018 21:12:24 +0800 Subject: [PATCH 039/469] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=BC=B9=E5=B9=95?= =?UTF-8?q?=E6=95=B0=E8=AF=BB=E5=8F=96=E9=94=99=E8=AF=AF=E7=9A=84=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/spiders/video_spider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index 07e865b..da2b1e8 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -161,7 +161,7 @@ def parse(self, response): mid = d[each_key]['owner']['mid'] view = d[each_key]['stat']['view'] favorite = d[each_key]['stat']['favorite'] - danmaku = favorite = d[each_key]['stat']['danmaku'] + danmaku = d[each_key]['stat']['danmaku'] coin = d[each_key]['stat']['coin'] share = d[each_key]['stat']['share'] like = d[each_key]['stat']['like'] From 5744c0c85dc33b035b671b33059c68ee621d40e0 Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 19 Oct 2018 21:12:24 +0800 Subject: [PATCH 040/469] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=BC=B9=E5=B9=95?= =?UTF-8?q?=E6=95=B0=E8=AF=BB=E5=8F=96=E9=94=99=E8=AF=AF=E7=9A=84=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/spiders/video_spider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index 07e865b..da2b1e8 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -161,7 +161,7 @@ def parse(self, response): mid = d[each_key]['owner']['mid'] view = d[each_key]['stat']['view'] favorite = d[each_key]['stat']['favorite'] - danmaku = favorite = d[each_key]['stat']['danmaku'] + danmaku = d[each_key]['stat']['danmaku'] coin = d[each_key]['stat']['coin'] share = d[each_key]['stat']['share'] like = d[each_key]['stat']['like'] From c767b11a4c17e335926d71449f8efde86f1f6ebc Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 22 Oct 2018 20:55:11 +0800 Subject: [PATCH 041/469] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E7=88=AC=E8=99=AB?= =?UTF-8?q?=EF=BC=8C=E6=B7=BB=E5=8A=A0focus=E5=B1=9E=E6=80=A7=E3=80=82focu?= =?UTF-8?q?s=E5=B1=9E=E6=80=A7=E5=86=B3=E5=AE=9A=E6=98=AF=E5=90=A6?= =?UTF-8?q?=E7=BB=A7=E7=BB=AD=E8=BF=BD=E8=B8=AA=E4=B8=80=E4=B8=AA=E8=A7=86?= =?UTF-8?q?=E9=A2=91=E6=88=96=E6=98=AFup=E4=B8=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/items.py | 1 + biliob_spider/pipelines.py | 17 ++++++++++++----- biliob_spider/spiders/author_auto_add.py | 1 + biliob_spider/spiders/author_update.py | 2 +- biliob_spider/spiders/video_spider.py | 6 +----- biliob_spider/spiders/video_watcher.py | 2 +- 6 files changed, 17 insertions(+), 12 deletions(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 909a529..a6dcd2e 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -33,6 +33,7 @@ class AuthorItem(scrapy.Item): sex = scrapy.Field() data = scrapy.Field() level = scrapy.Field() + focus = scrapy.Field() class VideoOnline(scrapy.Item): title = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 
f6f416d..5b34351 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -34,8 +34,11 @@ def process_item(self, item, spider): "datetime": datetime.datetime.fromtimestamp( item['datetime']) }, - "$addToSet": { - 'data': item['data'] + "$push": { + 'data': { + '$each':[item['data']], + '$position':0 + } } }, True) return item @@ -121,8 +124,11 @@ def process_item(self, item, spider): "level": item['level'], "sex": item['sex'], }, - "$addToSet": { - 'data': item['data'] + "$push": { + 'data': { + '$each':[item['data']], + '$position':0 + } } }, True) return item @@ -177,7 +183,8 @@ def process_item(self, item, spider): "aid": item["aid"] }, { "$set": { - "aid": item['aid'] + 'aid': item['aid'], + 'focus': True }, }, True) return item diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index 258f4ea..c393225 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -58,6 +58,7 @@ def detailParse(self, response): item['face'] = face item['official'] = official item['sex'] = sex + item['focus'] = True item['level'] = int(level) item['data'] = { 'fans': int(fans), diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 3c044f8..d96710c 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -31,7 +31,7 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find({}, {"mid": 1}) + c = self.coll.find({'focus':True}, {"mid": 1}) for each_doc in c: yield Request( "https://api.bilibili.com/x/web-interface/card?mid=" + str( diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index da2b1e8..c0a9062 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -124,16 +124,14 @@ def __init__(self): def start_requests(self): # 只需要aid - c = self.coll.find({}, {'aid': 1}) + c = self.coll.find({'focus':True}, {'aid': 1}) x = 0 aid_list = [] for each_doc in c: - print(x) x = x + 1 - aid_list.append(each_doc['aid']) i = 0 while aid_list != []: @@ -143,14 +141,12 @@ def start_requests(self): i = i + 1 if i == 100 or aid_list == []: i = 0 - print('yield') yield Request( "https://api.bilibili.com/x/article/archives?ids=" + aid_str.rstrip(',')) def parse(self, response): try: - print('parse') r = json.loads(response.body) d = r["data"] keys = list(d.keys()) diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index 059977a..3febfbc 100644 --- a/biliob_spider/spiders/video_watcher.py +++ b/biliob_spider/spiders/video_watcher.py @@ -32,7 +32,7 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find({}, {'mid': 1}) + c = self.coll.find({'focus':True}, {'mid': 1}) for each_doc in c: yield Request( 'http://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + From 5b85cfb17455e0a765852e84958b7f271ef3f2d4 Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 22 Oct 2018 20:55:11 +0800 Subject: [PATCH 042/469] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E7=88=AC=E8=99=AB?= =?UTF-8?q?=EF=BC=8C=E6=B7=BB=E5=8A=A0focus=E5=B1=9E=E6=80=A7=E3=80=82focu?= =?UTF-8?q?s=E5=B1=9E=E6=80=A7=E5=86=B3=E5=AE=9A=E6=98=AF=E5=90=A6?= =?UTF-8?q?=E7=BB=A7=E7=BB=AD=E8=BF=BD=E8=B8=AA=E4=B8=80=E4=B8=AA=E8=A7=86?= =?UTF-8?q?=E9=A2=91=E6=88=96=E6=98=AFup=E4=B8=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- biliob_spider/items.py | 1 + biliob_spider/pipelines.py | 17 ++++++++++++----- biliob_spider/spiders/author_auto_add.py | 1 + biliob_spider/spiders/author_update.py | 2 +- biliob_spider/spiders/video_spider.py | 6 +----- biliob_spider/spiders/video_watcher.py | 2 +- 6 files changed, 17 insertions(+), 12 deletions(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 909a529..a6dcd2e 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -33,6 +33,7 @@ class AuthorItem(scrapy.Item): sex = scrapy.Field() data = scrapy.Field() level = scrapy.Field() + focus = scrapy.Field() class VideoOnline(scrapy.Item): title = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index f6f416d..5b34351 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -34,8 +34,11 @@ def process_item(self, item, spider): "datetime": datetime.datetime.fromtimestamp( item['datetime']) }, - "$addToSet": { - 'data': item['data'] + "$push": { + 'data': { + '$each':[item['data']], + '$position':0 + } } }, True) return item @@ -121,8 +124,11 @@ def process_item(self, item, spider): "level": item['level'], "sex": item['sex'], }, - "$addToSet": { - 'data': item['data'] + "$push": { + 'data': { + '$each':[item['data']], + '$position':0 + } } }, True) return item @@ -177,7 +183,8 @@ def process_item(self, item, spider): "aid": item["aid"] }, { "$set": { - "aid": item['aid'] + 'aid': item['aid'], + 'focus': True }, }, True) return item diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index 258f4ea..c393225 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -58,6 +58,7 @@ def detailParse(self, response): item['face'] = face item['official'] = official item['sex'] = sex + item['focus'] = True item['level'] = int(level) item['data'] = { 'fans': int(fans), diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 3c044f8..d96710c 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -31,7 +31,7 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find({}, {"mid": 1}) + c = self.coll.find({'focus':True}, {"mid": 1}) for each_doc in c: yield Request( "https://api.bilibili.com/x/web-interface/card?mid=" + str( diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index da2b1e8..c0a9062 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -124,16 +124,14 @@ def __init__(self): def start_requests(self): # 只需要aid - c = self.coll.find({}, {'aid': 1}) + c = self.coll.find({'focus':True}, {'aid': 1}) x = 0 aid_list = [] for each_doc in c: - print(x) x = x + 1 - aid_list.append(each_doc['aid']) i = 0 while aid_list != []: @@ -143,14 +141,12 @@ def start_requests(self): i = i + 1 if i == 100 or aid_list == []: i = 0 - print('yield') yield Request( "https://api.bilibili.com/x/article/archives?ids=" + aid_str.rstrip(',')) def parse(self, response): try: - print('parse') r = json.loads(response.body) d = r["data"] keys = list(d.keys()) diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index 059977a..3febfbc 100644 --- a/biliob_spider/spiders/video_watcher.py +++ b/biliob_spider/spiders/video_watcher.py @@ -32,7 +32,7 @@ def __init__(self): self.coll = 
self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find({}, {'mid': 1}) + c = self.coll.find({'focus':True}, {'mid': 1}) for each_doc in c: yield Request( 'http://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + From 7ca72d5e87581c9bc8fbda15fba4e9672e162863 Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 22 Oct 2018 21:59:17 +0800 Subject: [PATCH 043/469] =?UTF-8?q?=E5=88=9B=E5=BB=BA=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E5=88=86=E6=9E=90=E5=99=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_analyzer/author_analyzer.py | 71 ++++++++++++++++++++++ biliob_analyzer/author_fans_variation.py | 3 + biliob_analyzer/delete_wrong_favorite.py | 18 ++++++ biliob_analyzer/reverse_data.py | 14 +++++ biliob_analyzer/video_analyzer.py | 75 ++++++++++++++++++++++++ biliob_spider/spiders/video_spider.py | 1 - 6 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 biliob_analyzer/author_analyzer.py create mode 100644 biliob_analyzer/author_fans_variation.py create mode 100644 biliob_analyzer/delete_wrong_favorite.py create mode 100644 biliob_analyzer/reverse_data.py create mode 100644 biliob_analyzer/video_analyzer.py diff --git a/biliob_analyzer/author_analyzer.py b/biliob_analyzer/author_analyzer.py new file mode 100644 index 0000000..90cec3d --- /dev/null +++ b/biliob_analyzer/author_analyzer.py @@ -0,0 +1,71 @@ +from db import settings +from pymongo import MongoClient +from datetime import datetime +from datetime import timedelta +class AuthorAnalyzer(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + def focus_filter(self): + pre_fans = -1 + c_fans = -1 + delta = timedelta(1) + pre_date = datetime + c_date = datetime + count_unfocus = 0 + count_focus = 0 + for each_doc in self.coll.find(): + flag_cool = 0 + for each_data in each_doc['data']: + if pre_fans == -1: + pre_fans = each_data['fans'] + pre_date = each_data['datetime'] + continue + c_fans = each_data['fans'] + c_date = each_data['datetime'] + if pre_date + delta > c_date: + continue + rate = (c_fans-pre_fans)/((c_date-pre_date).seconds*60*60*24+1) + pre_fans = c_fans + pre_date = c_date + if abs(rate) < 100: + flag_cool += 1 + else: + flag_cool = 0 + + # 连续30日日均涨粉小于100且粉丝数小于100000则不追踪 + if flag_cool > 30 and each_data['fans'] < 100000: + focus = False + break + elif flag_cool > 15 and each_data['fans'] < 5000: + focus = False + break + elif flag_cool > 7 and each_data['fans'] < 1000: + focus = False + break + else: + focus = True + + if focus: + count_focus += 1 + print("√ 持续追踪:"+each_doc['name']) + self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':True}}) + else: + count_unfocus += 1 + print("× 不再追踪:"+each_doc['name']) + self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':False}}) + pre_fans = -1 + c_fans = -1 + print("· 本轮筛选结果:") + print("× 不再追踪总数:"+str(count_unfocus)) + print("√ 持续追踪总数:"+str(count_focus)) + + def fans_variation(self): + pass +author_analyzer = AuthorAnalyzer() +author_analyzer.focus_filter() \ No newline at end of file diff --git a/biliob_analyzer/author_fans_variation.py b/biliob_analyzer/author_fans_variation.py new file mode 100644 index 0000000..eb163f8 --- /dev/null +++ b/biliob_analyzer/author_fans_variation.py @@ -0,0 +1,3 @@ +from db import settings 
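# A minimal sketch of the gitignored db.py that these analyzers (and the spider
# pipelines) import: as used above and below, `settings` only needs to be a
# mapping holding the MongoDB credentials. Key spellings follow the source;
# the values here are placeholders, not real configuration.
settings = {
    'MINGO_HOST': 'localhost',    # MongoDB host
    'MINGO_USER': 'biliob_user',  # account authenticated against the admin database
    'MONGO_PSW': 'change-me',     # password placeholder
}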
+from pymongo import MongoClient +from datetime import datetime \ No newline at end of file diff --git a/biliob_analyzer/delete_wrong_favorite.py b/biliob_analyzer/delete_wrong_favorite.py new file mode 100644 index 0000000..c9c4bc9 --- /dev/null +++ b/biliob_analyzer/delete_wrong_favorite.py @@ -0,0 +1,18 @@ +from db import settings +from pymongo import MongoClient +# 链接mongoDB +client = MongoClient(settings['MINGO_HOST'], 27017) +# 数据库登录需要帐号密码 +client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) +db = client['biliob'] # 获得数据库的句柄 +coll = db['video'] # 获得collection的句柄 +docs = coll.find().batch_size(60) +for each_doc in docs: + if 'data' in each_doc: + for each_data in each_doc['data']: + if 'favorite' not in each_data: + break + if each_data['favorite'] == each_data['danmaku']: + each_data.pop(' favorite') + coll.update_one({'aid': each_doc['aid']},{'$set':each_doc}) + print('已修复av'+str(each_doc['aid'])) \ No newline at end of file diff --git a/biliob_analyzer/reverse_data.py b/biliob_analyzer/reverse_data.py new file mode 100644 index 0000000..e0aa2e5 --- /dev/null +++ b/biliob_analyzer/reverse_data.py @@ -0,0 +1,14 @@ +from db import settings +from pymongo import MongoClient +# 链接mongoDB +client = MongoClient(settings['MINGO_HOST'], 27017) +# 数据库登录需要帐号密码 +client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) +db = client['biliob'] # 获得数据库的句柄 +coll = db['video'] # 获得collection的句柄 +docs = coll.find().batch_size(300) +for each_doc in docs: + if 'data' in each_doc: + each_doc['data'].sort(key=lambda d:d['datetime'],reverse=True) + coll.update_one({'aid': each_doc['aid']},{'$set':each_doc}) + print('已修复av'+str(each_doc['aid'])) \ No newline at end of file diff --git a/biliob_analyzer/video_analyzer.py b/biliob_analyzer/video_analyzer.py new file mode 100644 index 0000000..a837885 --- /dev/null +++ b/biliob_analyzer/video_analyzer.py @@ -0,0 +1,75 @@ +from db import settings +from pymongo import MongoClient +from datetime import datetime +from datetime import timedelta +class AuthorAnalyzer(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + def video_cleaner(self): + pre_view = -1 + c_view = -1 + delta = timedelta(1) + pre_date = datetime + c_date = datetime + count_delete = 0 + count_unfocus = 0 + count_focus = 0 + for each_doc in self.coll.find(): + live_time = 0 + delete = False + focus = True + if 'data' in each_doc: + each_doc['data'].reverse() + for each_data in each_doc['data']: + + if pre_view == -1: + pre_view = each_data['view'] + pre_date = each_data['datetime'] + continue + c_view = each_data['view'] + c_date = each_data['datetime'] + + if pre_date + delta > c_date: + continue + live_time +=1 + rate = (c_view-pre_view) + pre_view = c_view + pre_date = c_date + + if live_time == 3 and c_view < 3000: + delete = True + focus = False + break + elif live_time > 3 and rate < 100: + focus = False + delete = False + break + else: + focus = True + delete = False + if delete: + count_delete += 1 + print("! 
删除追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data']-1)]['view'])) + self.coll.delete_one({'aid':each_doc['aid']}) + elif focus: + count_focus += 1 + print("√ 持续追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data']-1)]['view'])) + else: + count_unfocus += 1 + print("× 不再追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data']-1)]['view'])) + self.coll.update_one({'aid':each_doc['aid']},{'$set':{'focus':False}}) + pre_view = -1 + c_view = -1 + print("· 本轮筛选结果:") + print("! 删除辣鸡总数:"+str(count_delete)) + print("× 不再追踪总数:"+str(count_unfocus)) + print("√ 持续追踪总数:"+str(count_focus)) + +author_analyzer = AuthorAnalyzer() +author_analyzer.video_cleaner() \ No newline at end of file diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index c0a9062..b1fd5b9 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -130,7 +130,6 @@ def start_requests(self): aid_list = [] for each_doc in c: - print(x) x = x + 1 aid_list.append(each_doc['aid']) i = 0 From 8425c79d76497f84ca382741c0aac13fa57d7453 Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 22 Oct 2018 21:59:17 +0800 Subject: [PATCH 044/469] =?UTF-8?q?=E5=88=9B=E5=BB=BA=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E5=88=86=E6=9E=90=E5=99=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_analyzer/author_analyzer.py | 71 ++++++++++++++++++++++ biliob_analyzer/author_fans_variation.py | 3 + biliob_analyzer/delete_wrong_favorite.py | 18 ++++++ biliob_analyzer/reverse_data.py | 14 +++++ biliob_analyzer/video_analyzer.py | 75 ++++++++++++++++++++++++ biliob_spider/spiders/video_spider.py | 1 - 6 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 biliob_analyzer/author_analyzer.py create mode 100644 biliob_analyzer/author_fans_variation.py create mode 100644 biliob_analyzer/delete_wrong_favorite.py create mode 100644 biliob_analyzer/reverse_data.py create mode 100644 biliob_analyzer/video_analyzer.py diff --git a/biliob_analyzer/author_analyzer.py b/biliob_analyzer/author_analyzer.py new file mode 100644 index 0000000..90cec3d --- /dev/null +++ b/biliob_analyzer/author_analyzer.py @@ -0,0 +1,71 @@ +from db import settings +from pymongo import MongoClient +from datetime import datetime +from datetime import timedelta +class AuthorAnalyzer(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + def focus_filter(self): + pre_fans = -1 + c_fans = -1 + delta = timedelta(1) + pre_date = datetime + c_date = datetime + count_unfocus = 0 + count_focus = 0 + for each_doc in self.coll.find(): + flag_cool = 0 + for each_data in each_doc['data']: + if pre_fans == -1: + pre_fans = each_data['fans'] + pre_date = each_data['datetime'] + continue + c_fans = each_data['fans'] + c_date = each_data['datetime'] + if pre_date + delta > c_date: + continue + rate = (c_fans-pre_fans)/((c_date-pre_date).seconds*60*60*24+1) + pre_fans = c_fans + pre_date = c_date + if abs(rate) < 100: + flag_cool += 1 + else: + flag_cool = 0 + + # 连续30日日均涨粉小于100且粉丝数小于100000则不追踪 + if flag_cool > 30 and each_data['fans'] < 100000: + focus = False + break + elif flag_cool > 15 and each_data['fans'] < 5000: + focus = False + break + elif flag_cool > 7 and 
each_data['fans'] < 1000: + focus = False + break + else: + focus = True + + if focus: + count_focus += 1 + print("√ 持续追踪:"+each_doc['name']) + self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':True}}) + else: + count_unfocus += 1 + print("× 不再追踪:"+each_doc['name']) + self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':False}}) + pre_fans = -1 + c_fans = -1 + print("· 本轮筛选结果:") + print("× 不再追踪总数:"+str(count_unfocus)) + print("√ 持续追踪总数:"+str(count_focus)) + + def fans_variation(self): + pass +author_analyzer = AuthorAnalyzer() +author_analyzer.focus_filter() \ No newline at end of file diff --git a/biliob_analyzer/author_fans_variation.py b/biliob_analyzer/author_fans_variation.py new file mode 100644 index 0000000..eb163f8 --- /dev/null +++ b/biliob_analyzer/author_fans_variation.py @@ -0,0 +1,3 @@ +from db import settings +from pymongo import MongoClient +from datetime import datetime \ No newline at end of file diff --git a/biliob_analyzer/delete_wrong_favorite.py b/biliob_analyzer/delete_wrong_favorite.py new file mode 100644 index 0000000..c9c4bc9 --- /dev/null +++ b/biliob_analyzer/delete_wrong_favorite.py @@ -0,0 +1,18 @@ +from db import settings +from pymongo import MongoClient +# 链接mongoDB +client = MongoClient(settings['MINGO_HOST'], 27017) +# 数据库登录需要帐号密码 +client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) +db = client['biliob'] # 获得数据库的句柄 +coll = db['video'] # 获得collection的句柄 +docs = coll.find().batch_size(60) +for each_doc in docs: + if 'data' in each_doc: + for each_data in each_doc['data']: + if 'favorite' not in each_data: + break + if each_data['favorite'] == each_data['danmaku']: + each_data.pop(' favorite') + coll.update_one({'aid': each_doc['aid']},{'$set':each_doc}) + print('已修复av'+str(each_doc['aid'])) \ No newline at end of file diff --git a/biliob_analyzer/reverse_data.py b/biliob_analyzer/reverse_data.py new file mode 100644 index 0000000..e0aa2e5 --- /dev/null +++ b/biliob_analyzer/reverse_data.py @@ -0,0 +1,14 @@ +from db import settings +from pymongo import MongoClient +# 链接mongoDB +client = MongoClient(settings['MINGO_HOST'], 27017) +# 数据库登录需要帐号密码 +client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) +db = client['biliob'] # 获得数据库的句柄 +coll = db['video'] # 获得collection的句柄 +docs = coll.find().batch_size(300) +for each_doc in docs: + if 'data' in each_doc: + each_doc['data'].sort(key=lambda d:d['datetime'],reverse=True) + coll.update_one({'aid': each_doc['aid']},{'$set':each_doc}) + print('已修复av'+str(each_doc['aid'])) \ No newline at end of file diff --git a/biliob_analyzer/video_analyzer.py b/biliob_analyzer/video_analyzer.py new file mode 100644 index 0000000..a837885 --- /dev/null +++ b/biliob_analyzer/video_analyzer.py @@ -0,0 +1,75 @@ +from db import settings +from pymongo import MongoClient +from datetime import datetime +from datetime import timedelta +class AuthorAnalyzer(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + def video_cleaner(self): + pre_view = -1 + c_view = -1 + delta = timedelta(1) + pre_date = datetime + c_date = datetime + count_delete = 0 + count_unfocus = 0 + count_focus = 0 + for each_doc in self.coll.find(): + live_time = 0 + delete = False + focus = True + if 'data' in each_doc: + each_doc['data'].reverse() + for each_data in 
each_doc['data']: + + if pre_view == -1: + pre_view = each_data['view'] + pre_date = each_data['datetime'] + continue + c_view = each_data['view'] + c_date = each_data['datetime'] + + if pre_date + delta > c_date: + continue + live_time +=1 + rate = (c_view-pre_view) + pre_view = c_view + pre_date = c_date + + if live_time == 3 and c_view < 3000: + delete = True + focus = False + break + elif live_time > 3 and rate < 100: + focus = False + delete = False + break + else: + focus = True + delete = False + if delete: + count_delete += 1 + print("! 删除追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data']-1)]['view'])) + self.coll.delete_one({'aid':each_doc['aid']}) + elif focus: + count_focus += 1 + print("√ 持续追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data']-1)]['view'])) + else: + count_unfocus += 1 + print("× 不再追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data']-1)]['view'])) + self.coll.update_one({'aid':each_doc['aid']},{'$set':{'focus':False}}) + pre_view = -1 + c_view = -1 + print("· 本轮筛选结果:") + print("! 删除辣鸡总数:"+str(count_delete)) + print("× 不再追踪总数:"+str(count_unfocus)) + print("√ 持续追踪总数:"+str(count_focus)) + +author_analyzer = AuthorAnalyzer() +author_analyzer.video_cleaner() \ No newline at end of file diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index c0a9062..b1fd5b9 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -130,7 +130,6 @@ def start_requests(self): aid_list = [] for each_doc in c: - print(x) x = x + 1 aid_list.append(each_doc['aid']) i = 0 From 4c85cca5a1276b87c85af6984c490035614edebd Mon Sep 17 00:00:00 2001 From: jannchie Date: Wed, 24 Oct 2018 22:43:07 +0800 Subject: [PATCH 045/469] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dbug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_analyzer/add_focus.py | 13 ++++++++++ biliob_analyzer/author_analyzer.py | 4 ++- biliob_analyzer/reverse_data.py | 6 ++--- biliob_analyzer/video_analyzer.py | 8 +++--- biliob_spider/pipelines.py | 1 + biliob_spider/spiders/author_auto_add.py | 32 +++++++++++++----------- 6 files changed, 41 insertions(+), 23 deletions(-) create mode 100644 biliob_analyzer/add_focus.py diff --git a/biliob_analyzer/add_focus.py b/biliob_analyzer/add_focus.py new file mode 100644 index 0000000..5d4d529 --- /dev/null +++ b/biliob_analyzer/add_focus.py @@ -0,0 +1,13 @@ +from db import settings +from pymongo import MongoClient +# 链接mongoDB +client = MongoClient(settings['MINGO_HOST'], 27017) +# 数据库登录需要帐号密码 +client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) +db = client['biliob'] # 获得数据库的句柄 +coll = db['author'] # 获得collection的句柄 +docs = coll.find({'focus': {'$exists': False}}).batch_size(60) +for each_doc in docs: + each_doc['focus'] = True + coll.update_one({'mid': each_doc['mid']}, {'$set': each_doc}) + print('已修复mid' + str(each_doc['mid'])) diff --git a/biliob_analyzer/author_analyzer.py b/biliob_analyzer/author_analyzer.py index 90cec3d..90dd078 100644 --- a/biliob_analyzer/author_analyzer.py +++ b/biliob_analyzer/author_analyzer.py @@ -19,8 +19,9 @@ def focus_filter(self): c_date = datetime count_unfocus = 0 count_focus = 0 - for each_doc in self.coll.find(): + for each_doc in self.coll.find({'focus':True}): flag_cool = 0 + each_doc['data'].reverse() for each_data in each_doc['data']: if pre_fans == -1: pre_fans = each_data['fans'] @@ -67,5 +68,6 @@ def focus_filter(self): def fans_variation(self): 
pass + author_analyzer = AuthorAnalyzer() author_analyzer.focus_filter() \ No newline at end of file diff --git a/biliob_analyzer/reverse_data.py b/biliob_analyzer/reverse_data.py index e0aa2e5..f3ecf7d 100644 --- a/biliob_analyzer/reverse_data.py +++ b/biliob_analyzer/reverse_data.py @@ -5,10 +5,10 @@ # 数据库登录需要帐号密码 client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) db = client['biliob'] # 获得数据库的句柄 -coll = db['video'] # 获得collection的句柄 +coll = db['author'] # 获得collection的句柄 docs = coll.find().batch_size(300) for each_doc in docs: if 'data' in each_doc: each_doc['data'].sort(key=lambda d:d['datetime'],reverse=True) - coll.update_one({'aid': each_doc['aid']},{'$set':each_doc}) - print('已修复av'+str(each_doc['aid'])) \ No newline at end of file + coll.update_one({'mid': each_doc['mid']},{'$set':each_doc}) + print('已修复av'+str(each_doc['mid'])) \ No newline at end of file diff --git a/biliob_analyzer/video_analyzer.py b/biliob_analyzer/video_analyzer.py index a837885..d394a4b 100644 --- a/biliob_analyzer/video_analyzer.py +++ b/biliob_analyzer/video_analyzer.py @@ -20,7 +20,7 @@ def video_cleaner(self): count_delete = 0 count_unfocus = 0 count_focus = 0 - for each_doc in self.coll.find(): + for each_doc in self.coll.find({'focus':True}): live_time = 0 delete = False focus = True @@ -55,14 +55,14 @@ def video_cleaner(self): delete = False if delete: count_delete += 1 - print("! 删除追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data']-1)]['view'])) + print("! 删除追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data'])-1]['view'])) self.coll.delete_one({'aid':each_doc['aid']}) elif focus: count_focus += 1 - print("√ 持续追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data']-1)]['view'])) + print("√ 持续追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data'])-1]['view'])) else: count_unfocus += 1 - print("× 不再追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data']-1)]['view'])) + print("× 不再追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data'])-1]['view'])) self.coll.update_one({'aid':each_doc['aid']},{'$set':{'focus':False}}) pre_view = -1 c_view = -1 diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 5b34351..959d5c9 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -123,6 +123,7 @@ def process_item(self, item, spider): "official": item['official'], "level": item['level'], "sex": item['sex'], + "focus":True }, "$push": { 'data': { diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index c393225..bb6dc2c 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -53,18 +53,20 @@ def detailParse(self, response): article = j['data']['article_count'] face = j['data']['card']['face'] item = AuthorItem() - item['mid'] = int(mid) - item['name'] = name - item['face'] = face - item['official'] = official - item['sex'] = sex - item['focus'] = True - item['level'] = int(level) - item['data'] = { - 'fans': int(fans), - 'attention': int(attention), - 'archive': int(archive), - 'article': int(article), - 'datetime': datetime.datetime.now() - } - yield item + # 粉丝数大于1000才加入 + if int(fans) > 1000: + item['mid'] = int(mid) + item['name'] = name + item['face'] = face + item['official'] = official + item['sex'] = sex + item['focus'] = True + item['level'] = int(level) + item['data'] = { + 'fans': int(fans), + 'attention': int(attention), + 'archive': 
int(archive), + 'article': int(article), + 'datetime': datetime.datetime.now() + } + yield item From 5d12e48bd2aa1274bdf6647a9ece5b225985e37a Mon Sep 17 00:00:00 2001 From: jannchie Date: Wed, 24 Oct 2018 22:43:07 +0800 Subject: [PATCH 046/469] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dbug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_analyzer/add_focus.py | 13 ++++++++++ biliob_analyzer/author_analyzer.py | 4 ++- biliob_analyzer/reverse_data.py | 6 ++--- biliob_analyzer/video_analyzer.py | 8 +++--- biliob_spider/pipelines.py | 1 + biliob_spider/spiders/author_auto_add.py | 32 +++++++++++++----------- 6 files changed, 41 insertions(+), 23 deletions(-) create mode 100644 biliob_analyzer/add_focus.py diff --git a/biliob_analyzer/add_focus.py b/biliob_analyzer/add_focus.py new file mode 100644 index 0000000..5d4d529 --- /dev/null +++ b/biliob_analyzer/add_focus.py @@ -0,0 +1,13 @@ +from db import settings +from pymongo import MongoClient +# 链接mongoDB +client = MongoClient(settings['MINGO_HOST'], 27017) +# 数据库登录需要帐号密码 +client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) +db = client['biliob'] # 获得数据库的句柄 +coll = db['author'] # 获得collection的句柄 +docs = coll.find({'focus': {'$exists': False}}).batch_size(60) +for each_doc in docs: + each_doc['focus'] = True + coll.update_one({'mid': each_doc['mid']}, {'$set': each_doc}) + print('已修复mid' + str(each_doc['mid'])) diff --git a/biliob_analyzer/author_analyzer.py b/biliob_analyzer/author_analyzer.py index 90cec3d..90dd078 100644 --- a/biliob_analyzer/author_analyzer.py +++ b/biliob_analyzer/author_analyzer.py @@ -19,8 +19,9 @@ def focus_filter(self): c_date = datetime count_unfocus = 0 count_focus = 0 - for each_doc in self.coll.find(): + for each_doc in self.coll.find({'focus':True}): flag_cool = 0 + each_doc['data'].reverse() for each_data in each_doc['data']: if pre_fans == -1: pre_fans = each_data['fans'] @@ -67,5 +68,6 @@ def focus_filter(self): def fans_variation(self): pass + author_analyzer = AuthorAnalyzer() author_analyzer.focus_filter() \ No newline at end of file diff --git a/biliob_analyzer/reverse_data.py b/biliob_analyzer/reverse_data.py index e0aa2e5..f3ecf7d 100644 --- a/biliob_analyzer/reverse_data.py +++ b/biliob_analyzer/reverse_data.py @@ -5,10 +5,10 @@ # 数据库登录需要帐号密码 client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) db = client['biliob'] # 获得数据库的句柄 -coll = db['video'] # 获得collection的句柄 +coll = db['author'] # 获得collection的句柄 docs = coll.find().batch_size(300) for each_doc in docs: if 'data' in each_doc: each_doc['data'].sort(key=lambda d:d['datetime'],reverse=True) - coll.update_one({'aid': each_doc['aid']},{'$set':each_doc}) - print('已修复av'+str(each_doc['aid'])) \ No newline at end of file + coll.update_one({'mid': each_doc['mid']},{'$set':each_doc}) + print('已修复av'+str(each_doc['mid'])) \ No newline at end of file diff --git a/biliob_analyzer/video_analyzer.py b/biliob_analyzer/video_analyzer.py index a837885..d394a4b 100644 --- a/biliob_analyzer/video_analyzer.py +++ b/biliob_analyzer/video_analyzer.py @@ -20,7 +20,7 @@ def video_cleaner(self): count_delete = 0 count_unfocus = 0 count_focus = 0 - for each_doc in self.coll.find(): + for each_doc in self.coll.find({'focus':True}): live_time = 0 delete = False focus = True @@ -55,14 +55,14 @@ def video_cleaner(self): delete = False if delete: count_delete += 1 - print("! 删除追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data']-1)]['view'])) + print("! 
删除追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data'])-1]['view'])) self.coll.delete_one({'aid':each_doc['aid']}) elif focus: count_focus += 1 - print("√ 持续追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data']-1)]['view'])) + print("√ 持续追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data'])-1]['view'])) else: count_unfocus += 1 - print("× 不再追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data']-1)]['view'])) + print("× 不再追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data'])-1]['view'])) self.coll.update_one({'aid':each_doc['aid']},{'$set':{'focus':False}}) pre_view = -1 c_view = -1 diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 5b34351..959d5c9 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -123,6 +123,7 @@ def process_item(self, item, spider): "official": item['official'], "level": item['level'], "sex": item['sex'], + "focus":True }, "$push": { 'data': { diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index c393225..bb6dc2c 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -53,18 +53,20 @@ def detailParse(self, response): article = j['data']['article_count'] face = j['data']['card']['face'] item = AuthorItem() - item['mid'] = int(mid) - item['name'] = name - item['face'] = face - item['official'] = official - item['sex'] = sex - item['focus'] = True - item['level'] = int(level) - item['data'] = { - 'fans': int(fans), - 'attention': int(attention), - 'archive': int(archive), - 'article': int(article), - 'datetime': datetime.datetime.now() - } - yield item + # 粉丝数大于1000才加入 + if int(fans) > 1000: + item['mid'] = int(mid) + item['name'] = name + item['face'] = face + item['official'] = official + item['sex'] = sex + item['focus'] = True + item['level'] = int(level) + item['data'] = { + 'fans': int(fans), + 'attention': int(attention), + 'archive': int(archive), + 'article': int(article), + 'datetime': datetime.datetime.now() + } + yield item From 9cad63edbca17aec1c986947a16f93c369bca1b8 Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 26 Oct 2018 22:41:04 +0800 Subject: [PATCH 047/469] =?UTF-8?q?=E6=AF=8F=E6=97=A5=E5=88=86=E6=9E=90?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=EF=BC=8C=E8=87=AA=E5=8A=A8=E5=88=A0=E5=8E=BB?= =?UTF-8?q?=E4=B8=8D=E5=A4=9F=E7=9E=A9=E7=9B=AE=E7=9A=84=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_analyzer/author_analyzer.py | 4 +--- biliob_analyzer/video_analyzer.py | 6 ++---- run.py | 6 ++++++ run_analyzer.py | 8 ++++++++ 4 files changed, 17 insertions(+), 7 deletions(-) create mode 100644 run_analyzer.py diff --git a/biliob_analyzer/author_analyzer.py b/biliob_analyzer/author_analyzer.py index 90dd078..d9761fa 100644 --- a/biliob_analyzer/author_analyzer.py +++ b/biliob_analyzer/author_analyzer.py @@ -11,7 +11,7 @@ def __init__(self): settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['author'] # 获得collection的句柄 - def focus_filter(self): + def author_filter(self): pre_fans = -1 c_fans = -1 delta = timedelta(1) @@ -69,5 +69,3 @@ def focus_filter(self): def fans_variation(self): pass -author_analyzer = AuthorAnalyzer() -author_analyzer.focus_filter() \ No newline at end of file diff --git a/biliob_analyzer/video_analyzer.py b/biliob_analyzer/video_analyzer.py index d394a4b..fc1fef1 100644 --- 
a/biliob_analyzer/video_analyzer.py +++ b/biliob_analyzer/video_analyzer.py @@ -2,7 +2,7 @@ from pymongo import MongoClient from datetime import datetime from datetime import timedelta -class AuthorAnalyzer(object): +class VideoAnalyzer(object): def __init__(self): # 链接mongoDB self.client = MongoClient(settings['MINGO_HOST'], 27017) @@ -11,7 +11,7 @@ def __init__(self): settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['video'] # 获得collection的句柄 - def video_cleaner(self): + def video_filter(self): pre_view = -1 c_view = -1 delta = timedelta(1) @@ -71,5 +71,3 @@ def video_cleaner(self): print("× 不再追踪总数:"+str(count_unfocus)) print("√ 持续追踪总数:"+str(count_focus)) -author_analyzer = AuthorAnalyzer() -author_analyzer.video_cleaner() \ No newline at end of file diff --git a/run.py b/run.py index 3b9ffcf..6d5b761 100644 --- a/run.py +++ b/run.py @@ -46,10 +46,15 @@ def video_spider(): def online(): Popen(['scrapy','crawl','online']) +def data_analyze(): + Popen(['python','run_analyzer.py']) + def run_threaded(job_func): job_thread = threading.Thread(target=job_func) job_thread.start() +schedule.every().day.at('12:00').do(run_threaded,data_analyze) + schedule.every().day.at('01:00').do(run_threaded,update_author) schedule.every(120).minutes.do(run_threaded,video_watcher) schedule.every().day.at('07:00').do(run_threaded,video_spider) @@ -58,6 +63,7 @@ def run_threaded(job_func): schedule.every().day.at('16:30').do(run_threaded,donghua) schedule.every().minute.do(run_threaded,online) + logging.info('开始运行计划任务..') while True: schedule.run_pending() diff --git a/run_analyzer.py b/run_analyzer.py new file mode 100644 index 0000000..fffda00 --- /dev/null +++ b/run_analyzer.py @@ -0,0 +1,8 @@ +from biliob_analyzer.author_analyzer import AuthorAnalyzer +from biliob_analyzer.video_analyzer import VideoAnalyzer + +author_analyzer = AuthorAnalyzer() +video_analyzer = VideoAnalyzer() + +author_analyzer.author_filter() +video_analyzer.video_filter() \ No newline at end of file From 2a9fa63cad07d4027f2777e3f9eae2003b9b9563 Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 26 Oct 2018 22:41:04 +0800 Subject: [PATCH 048/469] =?UTF-8?q?=E6=AF=8F=E6=97=A5=E5=88=86=E6=9E=90?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=EF=BC=8C=E8=87=AA=E5=8A=A8=E5=88=A0=E5=8E=BB?= =?UTF-8?q?=E4=B8=8D=E5=A4=9F=E7=9E=A9=E7=9B=AE=E7=9A=84=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_analyzer/author_analyzer.py | 4 +--- biliob_analyzer/video_analyzer.py | 6 ++---- run.py | 6 ++++++ run_analyzer.py | 8 ++++++++ 4 files changed, 17 insertions(+), 7 deletions(-) create mode 100644 run_analyzer.py diff --git a/biliob_analyzer/author_analyzer.py b/biliob_analyzer/author_analyzer.py index 90dd078..d9761fa 100644 --- a/biliob_analyzer/author_analyzer.py +++ b/biliob_analyzer/author_analyzer.py @@ -11,7 +11,7 @@ def __init__(self): settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['author'] # 获得collection的句柄 - def focus_filter(self): + def author_filter(self): pre_fans = -1 c_fans = -1 delta = timedelta(1) @@ -69,5 +69,3 @@ def focus_filter(self): def fans_variation(self): pass -author_analyzer = AuthorAnalyzer() -author_analyzer.focus_filter() \ No newline at end of file diff --git a/biliob_analyzer/video_analyzer.py b/biliob_analyzer/video_analyzer.py index d394a4b..fc1fef1 100644 --- a/biliob_analyzer/video_analyzer.py +++ b/biliob_analyzer/video_analyzer.py @@ -2,7 +2,7 @@ from pymongo 
import MongoClient from datetime import datetime from datetime import timedelta -class AuthorAnalyzer(object): +class VideoAnalyzer(object): def __init__(self): # 链接mongoDB self.client = MongoClient(settings['MINGO_HOST'], 27017) @@ -11,7 +11,7 @@ def __init__(self): settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['video'] # 获得collection的句柄 - def video_cleaner(self): + def video_filter(self): pre_view = -1 c_view = -1 delta = timedelta(1) @@ -71,5 +71,3 @@ def video_cleaner(self): print("× 不再追踪总数:"+str(count_unfocus)) print("√ 持续追踪总数:"+str(count_focus)) -author_analyzer = AuthorAnalyzer() -author_analyzer.video_cleaner() \ No newline at end of file diff --git a/run.py b/run.py index 3b9ffcf..6d5b761 100644 --- a/run.py +++ b/run.py @@ -46,10 +46,15 @@ def video_spider(): def online(): Popen(['scrapy','crawl','online']) +def data_analyze(): + Popen(['python','run_analyzer.py']) + def run_threaded(job_func): job_thread = threading.Thread(target=job_func) job_thread.start() +schedule.every().day.at('12:00').do(run_threaded,data_analyze) + schedule.every().day.at('01:00').do(run_threaded,update_author) schedule.every(120).minutes.do(run_threaded,video_watcher) schedule.every().day.at('07:00').do(run_threaded,video_spider) @@ -58,6 +63,7 @@ def run_threaded(job_func): schedule.every().day.at('16:30').do(run_threaded,donghua) schedule.every().minute.do(run_threaded,online) + logging.info('开始运行计划任务..') while True: schedule.run_pending() diff --git a/run_analyzer.py b/run_analyzer.py new file mode 100644 index 0000000..fffda00 --- /dev/null +++ b/run_analyzer.py @@ -0,0 +1,8 @@ +from biliob_analyzer.author_analyzer import AuthorAnalyzer +from biliob_analyzer.video_analyzer import VideoAnalyzer + +author_analyzer = AuthorAnalyzer() +video_analyzer = VideoAnalyzer() + +author_analyzer.author_filter() +video_analyzer.video_filter() \ No newline at end of file From 7331fad0e418fe55dc40fa5d58d7365f33d0bf91 Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 26 Oct 2018 22:43:33 +0800 Subject: [PATCH 049/469] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E5=90=AF=E5=8A=A8?= =?UTF-8?q?=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/run.py b/run.py index 3b9ffcf..63c651a 100644 --- a/run.py +++ b/run.py @@ -54,8 +54,8 @@ def run_threaded(job_func): schedule.every(120).minutes.do(run_threaded,video_watcher) schedule.every().day.at('07:00').do(run_threaded,video_spider) schedule.every().day.at('14:00').do(run_threaded,auto_add_author) -schedule.every().day.at('16:30').do(run_threaded,bangumi) -schedule.every().day.at('16:30').do(run_threaded,donghua) +schedule.every().day.at('16:50').do(run_threaded,bangumi) +schedule.every().day.at('16:50').do(run_threaded,donghua) schedule.every().minute.do(run_threaded,online) logging.info('开始运行计划任务..') From 73ac9c2b38c24ad43abbe9fef1d13566d95802e8 Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 26 Oct 2018 22:43:33 +0800 Subject: [PATCH 050/469] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E5=90=AF=E5=8A=A8?= =?UTF-8?q?=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/run.py b/run.py index 3b9ffcf..63c651a 100644 --- a/run.py +++ b/run.py @@ -54,8 +54,8 @@ def run_threaded(job_func): schedule.every(120).minutes.do(run_threaded,video_watcher) 
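# A minimal sketch of the scheduling pattern run.py uses above: `schedule` decides when
# a job is due, and run_threaded starts it on its own thread so a long crawl cannot
# block later ticks. `crawl_site` is a hypothetical stand-in for the Popen-based
# helpers defined in run.py; the loop below runs forever, like the original.
import threading
import time
import schedule

def crawl_site():
    print('crawling...')

def run_threaded(job_func):
    threading.Thread(target=job_func).start()

schedule.every().hour.do(run_threaded, crawl_site)

while True:
    schedule.run_pending()
    time.sleep(1)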
schedule.every().day.at('07:00').do(run_threaded,video_spider) schedule.every().day.at('14:00').do(run_threaded,auto_add_author) -schedule.every().day.at('16:30').do(run_threaded,bangumi) -schedule.every().day.at('16:30').do(run_threaded,donghua) +schedule.every().day.at('16:50').do(run_threaded,bangumi) +schedule.every().day.at('16:50').do(run_threaded,donghua) schedule.every().minute.do(run_threaded,online) logging.info('开始运行计划任务..') From a239d8825377acedfa96f4df578fa3e38941f61d Mon Sep 17 00:00:00 2001 From: jannchie Date: Wed, 31 Oct 2018 22:56:29 +0800 Subject: [PATCH 051/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=85=A8=E7=AB=99?= =?UTF-8?q?=E4=BF=A1=E6=81=AF=E7=88=AC=E8=99=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_analyzer/add_focus.py | 9 ++++---- biliob_spider/items.py | 8 +++++-- biliob_spider/pipelines.py | 23 +++++++++++++++++++ biliob_spider/spiders/site_info.py | 37 ++++++++++++++++++++++++++++++ run.py | 4 ++++ 5 files changed, 75 insertions(+), 6 deletions(-) create mode 100644 biliob_spider/spiders/site_info.py diff --git a/biliob_analyzer/add_focus.py b/biliob_analyzer/add_focus.py index 5d4d529..f23de18 100644 --- a/biliob_analyzer/add_focus.py +++ b/biliob_analyzer/add_focus.py @@ -5,9 +5,10 @@ # 数据库登录需要帐号密码 client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) db = client['biliob'] # 获得数据库的句柄 -coll = db['author'] # 获得collection的句柄 +coll = db['video'] # 获得collection的句柄 docs = coll.find({'focus': {'$exists': False}}).batch_size(60) for each_doc in docs: - each_doc['focus'] = True - coll.update_one({'mid': each_doc['mid']}, {'$set': each_doc}) - print('已修复mid' + str(each_doc['mid'])) + if 'aid' in each_doc: + each_doc['focus'] = True + coll.update_one({'aid': each_doc['aid']}, {'$set': each_doc}) + print('已修复aid' + str(each_doc['aid'])) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index a6dcd2e..ac21cff 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -7,12 +7,17 @@ import scrapy +class SiteItem(scrapy.Item): + region_count = scrapy.Field() + all_count = scrapy.Field() + web_online = scrapy.Field() + play_online = scrapy.Field() + class BangumiItem(scrapy.Item): title = scrapy.Field() tag = scrapy.Field() data = scrapy.Field() - class VideoItem(scrapy.Item): channel = scrapy.Field() aid = scrapy.Field() @@ -24,7 +29,6 @@ class VideoItem(scrapy.Item): mid = scrapy.Field() pic = scrapy.Field() - class AuthorItem(scrapy.Item): mid = scrapy.Field() name = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 959d5c9..ef3603a 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -101,6 +101,29 @@ def process_item(self, item, spider): except Exception as error: # 出现错误时打印错误日志 logging.error(error) +class SiteInfoPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['site_info'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.insert_one({ + "region_count": item['region_count'], + "all_count": item['all_count'], + "web_online": item['web_online'], + "play_online": item['play_online'], + "datetime":datetime.datetime.now() + }) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) class AuthorPipeline(object): def __init__(self): diff --git 
a/biliob_spider/spiders/site_info.py b/biliob_spider/spiders/site_info.py new file mode 100644 index 0000000..6faafc6 --- /dev/null +++ b/biliob_spider/spiders/site_info.py @@ -0,0 +1,37 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import SiteItem +import time +import json +import logging +from pymongo import MongoClient +import datetime + + +class OnlineSpider(scrapy.spiders.Spider): + name = "site" + allowed_domains = ["bilibili.com"] + start_urls = ['https://api.bilibili.com/x/web-interface/online'] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.SiteInfoPipeline': 300 + } + } + + def parse(self, response): + try: + r = json.loads(response.body) + d = r["data"] + item = SiteItem() + item['region_count'] = d['region_count'] + item['all_count'] = d['all_count'] + item['web_online'] = d['web_online'] + item['play_online'] = d['play_online'] + yield item + + except Exception as error: + # 出现错误时打印错误日志 + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) diff --git a/run.py b/run.py index 6d5b761..3a583f1 100644 --- a/run.py +++ b/run.py @@ -25,6 +25,9 @@ # 第四步,将logger添加到handler里面 logger.addHandler(fh) +def site(): + Popen(["scrapy","crawl","site"]) + def bangumi(): Popen(["scrapy","crawl","bangumi"]) @@ -61,6 +64,7 @@ def run_threaded(job_func): schedule.every().day.at('14:00').do(run_threaded,auto_add_author) schedule.every().day.at('16:30').do(run_threaded,bangumi) schedule.every().day.at('16:30').do(run_threaded,donghua) +schedule.every().hour.do(run_threaded,site) schedule.every().minute.do(run_threaded,online) From cba80b7f7b4f7b658071a0eb81a1e1a613c0507b Mon Sep 17 00:00:00 2001 From: jannchie Date: Wed, 31 Oct 2018 22:56:29 +0800 Subject: [PATCH 052/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=85=A8=E7=AB=99?= =?UTF-8?q?=E4=BF=A1=E6=81=AF=E7=88=AC=E8=99=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_analyzer/add_focus.py | 9 ++++---- biliob_spider/items.py | 8 +++++-- biliob_spider/pipelines.py | 23 +++++++++++++++++++ biliob_spider/spiders/site_info.py | 37 ++++++++++++++++++++++++++++++ run.py | 4 ++++ 5 files changed, 75 insertions(+), 6 deletions(-) create mode 100644 biliob_spider/spiders/site_info.py diff --git a/biliob_analyzer/add_focus.py b/biliob_analyzer/add_focus.py index 5d4d529..f23de18 100644 --- a/biliob_analyzer/add_focus.py +++ b/biliob_analyzer/add_focus.py @@ -5,9 +5,10 @@ # 数据库登录需要帐号密码 client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) db = client['biliob'] # 获得数据库的句柄 -coll = db['author'] # 获得collection的句柄 +coll = db['video'] # 获得collection的句柄 docs = coll.find({'focus': {'$exists': False}}).batch_size(60) for each_doc in docs: - each_doc['focus'] = True - coll.update_one({'mid': each_doc['mid']}, {'$set': each_doc}) - print('已修复mid' + str(each_doc['mid'])) + if 'aid' in each_doc: + each_doc['focus'] = True + coll.update_one({'aid': each_doc['aid']}, {'$set': each_doc}) + print('已修复aid' + str(each_doc['aid'])) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index a6dcd2e..ac21cff 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -7,12 +7,17 @@ import scrapy +class SiteItem(scrapy.Item): + region_count = scrapy.Field() + all_count = scrapy.Field() + web_online = scrapy.Field() + play_online = scrapy.Field() + class BangumiItem(scrapy.Item): title = scrapy.Field() tag = scrapy.Field() data = scrapy.Field() - class VideoItem(scrapy.Item): channel = scrapy.Field() 
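# A rough standalone check of the endpoint polled by the site spider above, assuming the
# response keeps the shape the pipeline stores (data.region_count, data.all_count,
# data.web_online, data.play_online). `requests` is used here only for brevity; the
# project itself fetches this URL through Scrapy.
import requests

resp = requests.get('https://api.bilibili.com/x/web-interface/online', timeout=10)
data = resp.json()['data']
print({k: data[k] for k in ('region_count', 'all_count', 'web_online', 'play_online')})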
aid = scrapy.Field() @@ -24,7 +29,6 @@ class VideoItem(scrapy.Item): mid = scrapy.Field() pic = scrapy.Field() - class AuthorItem(scrapy.Item): mid = scrapy.Field() name = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 959d5c9..ef3603a 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -101,6 +101,29 @@ def process_item(self, item, spider): except Exception as error: # 出现错误时打印错误日志 logging.error(error) +class SiteInfoPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['site_info'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.insert_one({ + "region_count": item['region_count'], + "all_count": item['all_count'], + "web_online": item['web_online'], + "play_online": item['play_online'], + "datetime":datetime.datetime.now() + }) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) class AuthorPipeline(object): def __init__(self): diff --git a/biliob_spider/spiders/site_info.py b/biliob_spider/spiders/site_info.py new file mode 100644 index 0000000..6faafc6 --- /dev/null +++ b/biliob_spider/spiders/site_info.py @@ -0,0 +1,37 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import SiteItem +import time +import json +import logging +from pymongo import MongoClient +import datetime + + +class OnlineSpider(scrapy.spiders.Spider): + name = "site" + allowed_domains = ["bilibili.com"] + start_urls = ['https://api.bilibili.com/x/web-interface/online'] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.SiteInfoPipeline': 300 + } + } + + def parse(self, response): + try: + r = json.loads(response.body) + d = r["data"] + item = SiteItem() + item['region_count'] = d['region_count'] + item['all_count'] = d['all_count'] + item['web_online'] = d['web_online'] + item['play_online'] = d['play_online'] + yield item + + except Exception as error: + # 出现错误时打印错误日志 + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) diff --git a/run.py b/run.py index 6d5b761..3a583f1 100644 --- a/run.py +++ b/run.py @@ -25,6 +25,9 @@ # 第四步,将logger添加到handler里面 logger.addHandler(fh) +def site(): + Popen(["scrapy","crawl","site"]) + def bangumi(): Popen(["scrapy","crawl","bangumi"]) @@ -61,6 +64,7 @@ def run_threaded(job_func): schedule.every().day.at('14:00').do(run_threaded,auto_add_author) schedule.every().day.at('16:30').do(run_threaded,bangumi) schedule.every().day.at('16:30').do(run_threaded,donghua) +schedule.every().hour.do(run_threaded,site) schedule.every().minute.do(run_threaded,online) From fe9c014c5a035dec0656762f4e8799a65e4e670c Mon Sep 17 00:00:00 2001 From: jannchie Date: Wed, 31 Oct 2018 23:04:29 +0800 Subject: [PATCH 053/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=BC=BA=E5=88=B6?= =?UTF-8?q?=E8=BF=BD=E8=B8=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/items.py | 1 + biliob_spider/spiders/author_update.py | 2 +- biliob_spider/spiders/video_spider.py | 2 +- biliob_spider/spiders/video_watcher.py | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index ac21cff..682b34c 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -46,6 +46,7 @@ 
class VideoOnline(scrapy.Item): aid = scrapy.Field() subChannel = scrapy.Field() channel = scrapy.Field() + class VideoWatcherItem(scrapy.Item): mid = scrapy.Field() aid = scrapy.Field() diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index d96710c..87b8ba2 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -31,7 +31,7 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find({'focus':True}, {"mid": 1}) + c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {"mid": 1}) for each_doc in c: yield Request( "https://api.bilibili.com/x/web-interface/card?mid=" + str( diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index b1fd5b9..ab3dda4 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -124,7 +124,7 @@ def __init__(self): def start_requests(self): # 只需要aid - c = self.coll.find({'focus':True}, {'aid': 1}) + c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {'aid': 1}) x = 0 diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index 3febfbc..10cca6f 100644 --- a/biliob_spider/spiders/video_watcher.py +++ b/biliob_spider/spiders/video_watcher.py @@ -32,7 +32,7 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find({'focus':True}, {'mid': 1}) + c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {'mid': 1}) for each_doc in c: yield Request( 'http://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + From 8c458c6a7fddbbc9c0e72af4331a609f4e96bcf0 Mon Sep 17 00:00:00 2001 From: jannchie Date: Wed, 31 Oct 2018 23:04:29 +0800 Subject: [PATCH 054/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=BC=BA=E5=88=B6?= =?UTF-8?q?=E8=BF=BD=E8=B8=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/items.py | 1 + biliob_spider/spiders/author_update.py | 2 +- biliob_spider/spiders/video_spider.py | 2 +- biliob_spider/spiders/video_watcher.py | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index ac21cff..682b34c 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -46,6 +46,7 @@ class VideoOnline(scrapy.Item): aid = scrapy.Field() subChannel = scrapy.Field() channel = scrapy.Field() + class VideoWatcherItem(scrapy.Item): mid = scrapy.Field() aid = scrapy.Field() diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index d96710c..87b8ba2 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -31,7 +31,7 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find({'focus':True}, {"mid": 1}) + c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {"mid": 1}) for each_doc in c: yield Request( "https://api.bilibili.com/x/web-interface/card?mid=" + str( diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index b1fd5b9..ab3dda4 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -124,7 +124,7 @@ def __init__(self): def start_requests(self): # 只需要aid - c = self.coll.find({'focus':True}, {'aid': 1}) + c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {'aid': 1}) x 
= 0 diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index 3febfbc..10cca6f 100644 --- a/biliob_spider/spiders/video_watcher.py +++ b/biliob_spider/spiders/video_watcher.py @@ -32,7 +32,7 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find({'focus':True}, {'mid': 1}) + c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {'mid': 1}) for each_doc in c: yield Request( 'http://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + From e7b5374df7fcfa36ca7ba2d4755ef3da1e35dcbc Mon Sep 17 00:00:00 2001 From: jannchie Date: Thu, 1 Nov 2018 19:57:55 +0800 Subject: [PATCH 055/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0tag=E7=88=AC=E5=8F=96?= =?UTF-8?q?=EF=BC=8C=E6=B7=BB=E5=8A=A0=E5=85=A8=E7=AB=99=E6=89=80=E6=9C=89?= =?UTF-8?q?=E6=8E=92=E8=A1=8C=E7=9A=84=E7=88=AC=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/items.py | 6 ++++ biliob_spider/pipelines.py | 31 +++++++++++++++++ biliob_spider/settings.py | 4 +-- biliob_spider/spiders/author_auto_add.py | 16 ++++++++- biliob_spider/spiders/tag.py | 42 ++++++++++++++++++++++++ 5 files changed, 96 insertions(+), 3 deletions(-) create mode 100644 biliob_spider/spiders/tag.py diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 682b34c..de904c0 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -13,6 +13,12 @@ class SiteItem(scrapy.Item): web_online = scrapy.Field() play_online = scrapy.Field() +class TagItem(scrapy.Item): + tag_id = scrapy.Field() + tag_name = scrapy.Field() + use = scrapy.Field() + atten = scrapy.Field() + ctime = scrapy.Field() class BangumiItem(scrapy.Item): title = scrapy.Field() tag = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index ef3603a..bc11b7b 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -191,6 +191,37 @@ def process_item(self, item, spider): # 出现错误时打印错误日志 logging.error(error) + +class TagPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['tag'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + + self.coll.update_one({ + "tag_id": item["tag_id"] + }, { + "$set": { + "tag_name": item['tag_name'], + "ctime": item['ctime'], + }, + "$addToSet": { + 'use': item['use'], + 'atten': item['atten'], + 'datetime': datetime.datetime.now() + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) class VideoAddPipeline(object): def __init__(self): # 链接mongoDB diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 80ba4ad..4981fb1 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -11,8 +11,8 @@ import random -LOG_FILE = "biliob_spider.log" -LOG_LEVEL = "ERROR" +# LOG_FILE = "biliob_spider.log" +# LOG_LEVEL = "ERROR" BOT_NAME = 'biliob_spider' diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index bb6dc2c..af6a451 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -12,7 +12,21 @@ class AuthorAutoAddSpider(scrapy.spiders.Spider): name = "authorAutoAdd" allowed_domains = ["bilibili.com"] - start_urls = ['https://www.bilibili.com/ranking'] + 
start_urls = [ + 'https://www.bilibili.com/ranking', + 'https://www.bilibili.com/ranking/all/1/0/3', + 'https://www.bilibili.com/ranking/all/168/0/3', + 'https://www.bilibili.com/ranking/all/3/0/3', + 'https://www.bilibili.com/ranking/all/129/0/3', + 'https://www.bilibili.com/ranking/all/4/0/3', + 'https://www.bilibili.com/ranking/all/36/0/3', + 'https://www.bilibili.com/ranking/all/160/0/3', + 'https://www.bilibili.com/ranking/all/119/0/3', + 'https://www.bilibili.com/ranking/all/155/0/3', + 'https://www.bilibili.com/ranking/all/5/0/3', + 'https://www.bilibili.com/ranking/all/181/0/3' + ] + custom_settings = { 'ITEM_PIPELINES': { 'biliob_spider.pipelines.AuthorPipeline': 300 diff --git a/biliob_spider/spiders/tag.py b/biliob_spider/spiders/tag.py new file mode 100644 index 0000000..afe61b7 --- /dev/null +++ b/biliob_spider/spiders/tag.py @@ -0,0 +1,42 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import TagItem +import time +import json +import logging +from pymongo import MongoClient +import datetime + + +class TagSpider(scrapy.spiders.Spider): + name = "tag" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.TagPipeLine': 300 + }, + 'DOWNLOAD_DELAY': 1 + } + def start_requests(self): + for i in range(1000,9999999): + url = 'https://api.bilibili.com/x/tag/info?tag_id={tag_id}'.format(tag_id=i) + yield Request(url) + def parse(self, response): + try: + r = json.loads(response.body) + d = r["data"] + item = TagItem() + item['tag_id'] = d['tag_id'] + item['tag_name'] = d['tag_name'] + item['ctime'] = d['ctime'] + item['use'] = d['count']['use'] + item['atten'] = d['atten']['atten'] + yield item + + except Exception as error: + # 出现错误时打印错误日志 + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) From ff51ff8544d495b73a6f8c16f3e91c35dc95be75 Mon Sep 17 00:00:00 2001 From: jannchie Date: Thu, 1 Nov 2018 19:57:55 +0800 Subject: [PATCH 056/469] =?UTF-8?q?=E6=B7=BB=E5=8A=A0tag=E7=88=AC=E5=8F=96?= =?UTF-8?q?=EF=BC=8C=E6=B7=BB=E5=8A=A0=E5=85=A8=E7=AB=99=E6=89=80=E6=9C=89?= =?UTF-8?q?=E6=8E=92=E8=A1=8C=E7=9A=84=E7=88=AC=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_spider/items.py | 6 ++++ biliob_spider/pipelines.py | 31 +++++++++++++++++ biliob_spider/settings.py | 4 +-- biliob_spider/spiders/author_auto_add.py | 16 ++++++++- biliob_spider/spiders/tag.py | 42 ++++++++++++++++++++++++ 5 files changed, 96 insertions(+), 3 deletions(-) create mode 100644 biliob_spider/spiders/tag.py diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 682b34c..de904c0 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -13,6 +13,12 @@ class SiteItem(scrapy.Item): web_online = scrapy.Field() play_online = scrapy.Field() +class TagItem(scrapy.Item): + tag_id = scrapy.Field() + tag_name = scrapy.Field() + use = scrapy.Field() + atten = scrapy.Field() + ctime = scrapy.Field() class BangumiItem(scrapy.Item): title = scrapy.Field() tag = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index ef3603a..bc11b7b 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -191,6 +191,37 @@ def process_item(self, item, spider): # 出现错误时打印错误日志 logging.error(error) + +class TagPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + 
self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['tag'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + + self.coll.update_one({ + "tag_id": item["tag_id"] + }, { + "$set": { + "tag_name": item['tag_name'], + "ctime": item['ctime'], + }, + "$addToSet": { + 'use': item['use'], + 'atten': item['atten'], + 'datetime': datetime.datetime.now() + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) class VideoAddPipeline(object): def __init__(self): # 链接mongoDB diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 80ba4ad..4981fb1 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -11,8 +11,8 @@ import random -LOG_FILE = "biliob_spider.log" -LOG_LEVEL = "ERROR" +# LOG_FILE = "biliob_spider.log" +# LOG_LEVEL = "ERROR" BOT_NAME = 'biliob_spider' diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index bb6dc2c..af6a451 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -12,7 +12,21 @@ class AuthorAutoAddSpider(scrapy.spiders.Spider): name = "authorAutoAdd" allowed_domains = ["bilibili.com"] - start_urls = ['https://www.bilibili.com/ranking'] + start_urls = [ + 'https://www.bilibili.com/ranking', + 'https://www.bilibili.com/ranking/all/1/0/3', + 'https://www.bilibili.com/ranking/all/168/0/3', + 'https://www.bilibili.com/ranking/all/3/0/3', + 'https://www.bilibili.com/ranking/all/129/0/3', + 'https://www.bilibili.com/ranking/all/4/0/3', + 'https://www.bilibili.com/ranking/all/36/0/3', + 'https://www.bilibili.com/ranking/all/160/0/3', + 'https://www.bilibili.com/ranking/all/119/0/3', + 'https://www.bilibili.com/ranking/all/155/0/3', + 'https://www.bilibili.com/ranking/all/5/0/3', + 'https://www.bilibili.com/ranking/all/181/0/3' + ] + custom_settings = { 'ITEM_PIPELINES': { 'biliob_spider.pipelines.AuthorPipeline': 300 diff --git a/biliob_spider/spiders/tag.py b/biliob_spider/spiders/tag.py new file mode 100644 index 0000000..afe61b7 --- /dev/null +++ b/biliob_spider/spiders/tag.py @@ -0,0 +1,42 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import TagItem +import time +import json +import logging +from pymongo import MongoClient +import datetime + + +class TagSpider(scrapy.spiders.Spider): + name = "tag" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.TagPipeLine': 300 + }, + 'DOWNLOAD_DELAY': 1 + } + def start_requests(self): + for i in range(1000,9999999): + url = 'https://api.bilibili.com/x/tag/info?tag_id={tag_id}'.format(tag_id=i) + yield Request(url) + def parse(self, response): + try: + r = json.loads(response.body) + d = r["data"] + item = TagItem() + item['tag_id'] = d['tag_id'] + item['tag_name'] = d['tag_name'] + item['ctime'] = d['ctime'] + item['use'] = d['count']['use'] + item['atten'] = d['atten']['atten'] + yield item + + except Exception as error: + # 出现错误时打印错误日志 + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) From bb034c0d3f84c213ee4e33ffe49692b31e788ba1 Mon Sep 17 00:00:00 2001 From: jannchie Date: Sat, 3 Nov 2018 19:03:01 +0800 Subject: [PATCH 057/469] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=88=86=E6=9E=90bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
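# A minimal sketch of the upsert pattern TagPipeLine uses above: one document per
# tag_id, static fields written with $set and per-crawl values appended as arrays. The
# host, credentials and field values are placeholders, not the project's real settings.
# Note that $addToSet only appends a value that is not already present in the array, so
# repeated identical counts are collapsed; $push would keep every sample.
import datetime
from pymongo import MongoClient

coll = MongoClient('localhost', 27017)['biliob']['tag']
coll.update_one(
    {'tag_id': 807},  # placeholder tag id: match or create the document for this tag
    {
        '$set': {'tag_name': 'example', 'ctime': 0},
        '$addToSet': {'use': 1234, 'atten': 56, 'datetime': datetime.datetime.now()},
    },
    upsert=True,
)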
biliob_analyzer/author_analyzer.py | 79 +++++++++++++++--------------- biliob_analyzer/video_analyzer.py | 4 +- biliob_spider/spiders/tag.py | 4 +- 3 files changed, 45 insertions(+), 42 deletions(-) diff --git a/biliob_analyzer/author_analyzer.py b/biliob_analyzer/author_analyzer.py index d9761fa..5bbf00e 100644 --- a/biliob_analyzer/author_analyzer.py +++ b/biliob_analyzer/author_analyzer.py @@ -21,47 +21,48 @@ def author_filter(self): count_focus = 0 for each_doc in self.coll.find({'focus':True}): flag_cool = 0 - each_doc['data'].reverse() - for each_data in each_doc['data']: - if pre_fans == -1: - pre_fans = each_data['fans'] - pre_date = each_data['datetime'] - continue - c_fans = each_data['fans'] - c_date = each_data['datetime'] - if pre_date + delta > c_date: - continue - rate = (c_fans-pre_fans)/((c_date-pre_date).seconds*60*60*24+1) - pre_fans = c_fans - pre_date = c_date - if abs(rate) < 100: - flag_cool += 1 - else: - flag_cool = 0 + if 'data' in each_doc: + each_doc['data'].reverse() + for each_data in each_doc['data']: + if pre_fans == -1: + pre_fans = each_data['fans'] + pre_date = each_data['datetime'] + continue + c_fans = each_data['fans'] + c_date = each_data['datetime'] + if pre_date + delta > c_date: + continue + rate = (c_fans-pre_fans)/((c_date-pre_date).seconds*60*60*24+1) + pre_fans = c_fans + pre_date = c_date + if abs(rate) < 100: + flag_cool += 1 + else: + flag_cool = 0 - # 连续30日日均涨粉小于100且粉丝数小于100000则不追踪 - if flag_cool > 30 and each_data['fans'] < 100000: - focus = False - break - elif flag_cool > 15 and each_data['fans'] < 5000: - focus = False - break - elif flag_cool > 7 and each_data['fans'] < 1000: - focus = False - break - else: - focus = True + # 连续30日日均涨粉小于100且粉丝数小于100000则不追踪 + if flag_cool > 30 and each_data['fans'] < 100000: + focus = False + break + elif flag_cool > 15 and each_data['fans'] < 5000: + focus = False + break + elif flag_cool > 7 and each_data['fans'] < 1000: + focus = False + break + else: + focus = True - if focus: - count_focus += 1 - print("√ 持续追踪:"+each_doc['name']) - self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':True}}) - else: - count_unfocus += 1 - print("× 不再追踪:"+each_doc['name']) - self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':False}}) - pre_fans = -1 - c_fans = -1 + if focus: + count_focus += 1 + print("√ 持续追踪:"+each_doc['name']) + self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':True}}) + else: + count_unfocus += 1 + print("× 不再追踪:"+each_doc['name']) + self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':False}}) + pre_fans = -1 + c_fans = -1 print("· 本轮筛选结果:") print("× 不再追踪总数:"+str(count_unfocus)) print("√ 持续追踪总数:"+str(count_focus)) diff --git a/biliob_analyzer/video_analyzer.py b/biliob_analyzer/video_analyzer.py index fc1fef1..022ed5a 100644 --- a/biliob_analyzer/video_analyzer.py +++ b/biliob_analyzer/video_analyzer.py @@ -41,15 +41,17 @@ def video_filter(self): rate = (c_view-pre_view) pre_view = c_view pre_date = c_date - + # 三天内播放增长小于3000则被认定为低质量 if live_time == 3 and c_view < 3000: delete = True focus = False break + # 大于三天后每日播放增长小于100则停止追踪 elif live_time > 3 and rate < 100: focus = False delete = False break + # 除此之外的情况则持续追踪 else: focus = True delete = False diff --git a/biliob_spider/spiders/tag.py b/biliob_spider/spiders/tag.py index afe61b7..e4f756d 100644 --- a/biliob_spider/spiders/tag.py +++ b/biliob_spider/spiders/tag.py @@ -20,7 +20,7 @@ class TagSpider(scrapy.spiders.Spider): 'DOWNLOAD_DELAY': 1 } def start_requests(self): - for i in range(1000,9999999): + 
for i in range(0,9999999): url = 'https://api.bilibili.com/x/tag/info?tag_id={tag_id}'.format(tag_id=i) yield Request(url) def parse(self, response): @@ -32,7 +32,7 @@ def parse(self, response): item['tag_name'] = d['tag_name'] item['ctime'] = d['ctime'] item['use'] = d['count']['use'] - item['atten'] = d['atten']['atten'] + item['atten'] = d['count']['atten'] yield item except Exception as error: From 29d8765eea463a10d08a9aea1d8818f5321701f2 Mon Sep 17 00:00:00 2001 From: jannchie Date: Sat, 3 Nov 2018 19:03:01 +0800 Subject: [PATCH 058/469] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=88=86=E6=9E=90bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- biliob_analyzer/author_analyzer.py | 79 +++++++++++++++--------------- biliob_analyzer/video_analyzer.py | 4 +- biliob_spider/spiders/tag.py | 4 +- 3 files changed, 45 insertions(+), 42 deletions(-) diff --git a/biliob_analyzer/author_analyzer.py b/biliob_analyzer/author_analyzer.py index d9761fa..5bbf00e 100644 --- a/biliob_analyzer/author_analyzer.py +++ b/biliob_analyzer/author_analyzer.py @@ -21,47 +21,48 @@ def author_filter(self): count_focus = 0 for each_doc in self.coll.find({'focus':True}): flag_cool = 0 - each_doc['data'].reverse() - for each_data in each_doc['data']: - if pre_fans == -1: - pre_fans = each_data['fans'] - pre_date = each_data['datetime'] - continue - c_fans = each_data['fans'] - c_date = each_data['datetime'] - if pre_date + delta > c_date: - continue - rate = (c_fans-pre_fans)/((c_date-pre_date).seconds*60*60*24+1) - pre_fans = c_fans - pre_date = c_date - if abs(rate) < 100: - flag_cool += 1 - else: - flag_cool = 0 + if 'data' in each_doc: + each_doc['data'].reverse() + for each_data in each_doc['data']: + if pre_fans == -1: + pre_fans = each_data['fans'] + pre_date = each_data['datetime'] + continue + c_fans = each_data['fans'] + c_date = each_data['datetime'] + if pre_date + delta > c_date: + continue + rate = (c_fans-pre_fans)/((c_date-pre_date).seconds*60*60*24+1) + pre_fans = c_fans + pre_date = c_date + if abs(rate) < 100: + flag_cool += 1 + else: + flag_cool = 0 - # 连续30日日均涨粉小于100且粉丝数小于100000则不追踪 - if flag_cool > 30 and each_data['fans'] < 100000: - focus = False - break - elif flag_cool > 15 and each_data['fans'] < 5000: - focus = False - break - elif flag_cool > 7 and each_data['fans'] < 1000: - focus = False - break - else: - focus = True + # 连续30日日均涨粉小于100且粉丝数小于100000则不追踪 + if flag_cool > 30 and each_data['fans'] < 100000: + focus = False + break + elif flag_cool > 15 and each_data['fans'] < 5000: + focus = False + break + elif flag_cool > 7 and each_data['fans'] < 1000: + focus = False + break + else: + focus = True - if focus: - count_focus += 1 - print("√ 持续追踪:"+each_doc['name']) - self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':True}}) - else: - count_unfocus += 1 - print("× 不再追踪:"+each_doc['name']) - self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':False}}) - pre_fans = -1 - c_fans = -1 + if focus: + count_focus += 1 + print("√ 持续追踪:"+each_doc['name']) + self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':True}}) + else: + count_unfocus += 1 + print("× 不再追踪:"+each_doc['name']) + self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':False}}) + pre_fans = -1 + c_fans = -1 print("· 本轮筛选结果:") print("× 不再追踪总数:"+str(count_unfocus)) print("√ 持续追踪总数:"+str(count_focus)) diff --git a/biliob_analyzer/video_analyzer.py b/biliob_analyzer/video_analyzer.py index fc1fef1..022ed5a 100644 --- a/biliob_analyzer/video_analyzer.py +++ 
b/biliob_analyzer/video_analyzer.py @@ -41,15 +41,17 @@ def video_filter(self): rate = (c_view-pre_view) pre_view = c_view pre_date = c_date - + # 三天内播放增长小于3000则被认定为低质量 if live_time == 3 and c_view < 3000: delete = True focus = False break + # 大于三天后每日播放增长小于100则停止追踪 elif live_time > 3 and rate < 100: focus = False delete = False break + # 除此之外的情况则持续追踪 else: focus = True delete = False diff --git a/biliob_spider/spiders/tag.py b/biliob_spider/spiders/tag.py index afe61b7..e4f756d 100644 --- a/biliob_spider/spiders/tag.py +++ b/biliob_spider/spiders/tag.py @@ -20,7 +20,7 @@ class TagSpider(scrapy.spiders.Spider): 'DOWNLOAD_DELAY': 1 } def start_requests(self): - for i in range(1000,9999999): + for i in range(0,9999999): url = 'https://api.bilibili.com/x/tag/info?tag_id={tag_id}'.format(tag_id=i) yield Request(url) def parse(self, response): @@ -32,7 +32,7 @@ def parse(self, response): item['tag_name'] = d['tag_name'] item['ctime'] = d['ctime'] item['use'] = d['count']['use'] - item['atten'] = d['atten']['atten'] + item['atten'] = d['count']['atten'] yield item except Exception as error: From a030b756145fff65a32f2d05518c254453044d90 Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 16 Nov 2018 20:57:14 +0800 Subject: [PATCH 059/469] fixbug: wrongly set unfocus --- biliob_analyzer/author_analyzer.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/biliob_analyzer/author_analyzer.py b/biliob_analyzer/author_analyzer.py index 5bbf00e..3001f23 100644 --- a/biliob_analyzer/author_analyzer.py +++ b/biliob_analyzer/author_analyzer.py @@ -19,7 +19,7 @@ def author_filter(self): c_date = datetime count_unfocus = 0 count_focus = 0 - for each_doc in self.coll.find({'focus':True}): + for each_doc in self.coll.find(): flag_cool = 0 if 'data' in each_doc: each_doc['data'].reverse() @@ -32,13 +32,12 @@ def author_filter(self): c_date = each_data['datetime'] if pre_date + delta > c_date: continue - rate = (c_fans-pre_fans)/((c_date-pre_date).seconds*60*60*24+1) - pre_fans = c_fans - pre_date = c_date - if abs(rate) < 100: + if abs(c_fans-pre_fans) < 100: flag_cool += 1 else: flag_cool = 0 + pre_fans = c_fans + pre_date = c_date # 连续30日日均涨粉小于100且粉丝数小于100000则不追踪 if flag_cool > 30 and each_data['fans'] < 100000: From 736bbaf936bff6908141a788cd2ac845cda673de Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 16 Nov 2018 20:57:14 +0800 Subject: [PATCH 060/469] fixbug: wrongly set unfocus --- biliob_analyzer/author_analyzer.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/biliob_analyzer/author_analyzer.py b/biliob_analyzer/author_analyzer.py index 5bbf00e..3001f23 100644 --- a/biliob_analyzer/author_analyzer.py +++ b/biliob_analyzer/author_analyzer.py @@ -19,7 +19,7 @@ def author_filter(self): c_date = datetime count_unfocus = 0 count_focus = 0 - for each_doc in self.coll.find({'focus':True}): + for each_doc in self.coll.find(): flag_cool = 0 if 'data' in each_doc: each_doc['data'].reverse() @@ -32,13 +32,12 @@ def author_filter(self): c_date = each_data['datetime'] if pre_date + delta > c_date: continue - rate = (c_fans-pre_fans)/((c_date-pre_date).seconds*60*60*24+1) - pre_fans = c_fans - pre_date = c_date - if abs(rate) < 100: + if abs(c_fans-pre_fans) < 100: flag_cool += 1 else: flag_cool = 0 + pre_fans = c_fans + pre_date = c_date # 连续30日日均涨粉小于100且粉丝数小于100000则不追踪 if flag_cool > 30 and each_data['fans'] < 100000: From a5e1076a3e797dc23822c184ddcee51633b5e207 Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 16 Nov 2018 20:59:41 +0800 Subject: [PATCH 
061/469] feature: only find focusing author --- biliob_analyzer/author_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biliob_analyzer/author_analyzer.py b/biliob_analyzer/author_analyzer.py index 3001f23..5835229 100644 --- a/biliob_analyzer/author_analyzer.py +++ b/biliob_analyzer/author_analyzer.py @@ -19,7 +19,7 @@ def author_filter(self): c_date = datetime count_unfocus = 0 count_focus = 0 - for each_doc in self.coll.find(): + for each_doc in self.coll.find({'focus':True}): flag_cool = 0 if 'data' in each_doc: each_doc['data'].reverse() From 583a354620e345f8dea49afd64182694b39b8161 Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 16 Nov 2018 20:59:41 +0800 Subject: [PATCH 062/469] feature: only find focusing author --- biliob_analyzer/author_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biliob_analyzer/author_analyzer.py b/biliob_analyzer/author_analyzer.py index 3001f23..5835229 100644 --- a/biliob_analyzer/author_analyzer.py +++ b/biliob_analyzer/author_analyzer.py @@ -19,7 +19,7 @@ def author_filter(self): c_date = datetime count_unfocus = 0 count_focus = 0 - for each_doc in self.coll.find(): + for each_doc in self.coll.find({'focus':True}): flag_cool = 0 if 'data' in each_doc: each_doc['data'].reverse() From c94df35b2cca18ab6b263f1dcadc56bd8e1a2c3d Mon Sep 17 00:00:00 2001 From: jannchie Date: Wed, 21 Nov 2018 16:53:57 +0800 Subject: [PATCH 063/469] observe the amount of articles and archives viewed --- biliob_spider/spiders/author_update.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 87b8ba2..70b03e2 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -66,9 +66,18 @@ def parse(self, response): 'article': int(article), 'datetime': datetime.datetime.now() } - yield item + yield Request("http://api.bilibili.com/x/space/upstat?mid={mid}".format(mid=str(mid)),meta={'item': item},method='GET',callback=self.parse_view) except Exception as error: # 出现错误时打印错误日志 logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) + + def parse_view(self,response): + j = json.loads(response.body) + archiveView = j['data']['archive']['view'] + articleView = j['data']['article']['view'] + item = response.meta['item'] + item['data']['archiveView'] = archiveView + item['data']['articleView'] = articleView + yield item From e313d9cf4163cf43ff21bbb2fc69c6b570d653a3 Mon Sep 17 00:00:00 2001 From: jannchie Date: Wed, 21 Nov 2018 16:53:57 +0800 Subject: [PATCH 064/469] observe the amount of articles and archives viewed --- biliob_spider/spiders/author_update.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 87b8ba2..70b03e2 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -66,9 +66,18 @@ def parse(self, response): 'article': int(article), 'datetime': datetime.datetime.now() } - yield item + yield Request("http://api.bilibili.com/x/space/upstat?mid={mid}".format(mid=str(mid)),meta={'item': item},method='GET',callback=self.parse_view) except Exception as error: # 出现错误时打印错误日志 logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) + + def parse_view(self,response): + j = json.loads(response.body) + archiveView = j['data']['archive']['view'] + articleView = 
j['data']['article']['view'] + item = response.meta['item'] + item['data']['archiveView'] = archiveView + item['data']['articleView'] = articleView + yield item From b7a2a3230722dfbcb367cc96ba2556dcab235ac2 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 24 Nov 2018 21:58:35 +0800 Subject: [PATCH 065/469] update strategy --- biliob_analyzer/add_focus.py | 8 ++++---- biliob_spider/pipelines.py | 17 +++++++++-------- biliob_spider/settings.py | 4 ++-- biliob_spider/spiders/author_update.py | 2 +- biliob_spider/spiders/video_spider.py | 3 +-- biliob_spider/spiders/video_watcher.py | 14 ++++++++------ run.py | 2 +- 7 files changed, 26 insertions(+), 24 deletions(-) diff --git a/biliob_analyzer/add_focus.py b/biliob_analyzer/add_focus.py index f23de18..370095a 100644 --- a/biliob_analyzer/add_focus.py +++ b/biliob_analyzer/add_focus.py @@ -5,10 +5,10 @@ # 数据库登录需要帐号密码 client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) db = client['biliob'] # 获得数据库的句柄 -coll = db['video'] # 获得collection的句柄 +coll = db['author'] # 获得collection的句柄 docs = coll.find({'focus': {'$exists': False}}).batch_size(60) for each_doc in docs: - if 'aid' in each_doc: + if 'mid' in each_doc: each_doc['focus'] = True - coll.update_one({'aid': each_doc['aid']}, {'$set': each_doc}) - print('已修复aid' + str(each_doc['aid'])) + coll.update_one({'mid': each_doc['mid']}, {'$set': each_doc}) + print('已修复mid' + str(each_doc['mid'])) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index bc11b7b..b688fee 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -234,14 +234,15 @@ def __init__(self): def process_item(self, item, spider): try: - self.coll.update_one({ - "aid": item["aid"] - }, { - "$set": { - 'aid': item['aid'], - 'focus': True - }, - }, True) + for each_aid in item["aid"]: + self.coll.update_one({ + "aid": each_aid + }, { + "$set": { + 'aid': each_aid, + 'focus': True + }, + }, True) return item except Exception as error: # 出现错误时打印错误日志 diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 4981fb1..79aeeaf 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -11,8 +11,8 @@ import random -# LOG_FILE = "biliob_spider.log" -# LOG_LEVEL = "ERROR" +LOG_FILE = "biliob_spider.log" +LOG_LEVEL = "INFO" BOT_NAME = 'biliob_spider' diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 70b03e2..4eadcf2 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -66,7 +66,7 @@ def parse(self, response): 'article': int(article), 'datetime': datetime.datetime.now() } - yield Request("http://api.bilibili.com/x/space/upstat?mid={mid}".format(mid=str(mid)),meta={'item': item},method='GET',callback=self.parse_view) + yield Request("https://api.bilibili.com/x/space/upstat?mid={mid}".format(mid=str(mid)),meta={'item': item},method='GET',callback=self.parse_view) except Exception as error: # 出现错误时打印错误日志 logging.error("视频爬虫在解析时发生错误") diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index ab3dda4..f072934 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -109,8 +109,7 @@ class VideoSpider(scrapy.spiders.Spider): custom_settings = { 'ITEM_PIPELINES': { 'biliob_spider.pipelines.VideoPipeline': 300, - }, - 'DOWNLOAD_DELAY': 1 + } } def __init__(self): diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index 10cca6f..511597b 100644 --- 
a/biliob_spider/spiders/video_watcher.py +++ b/biliob_spider/spiders/video_watcher.py @@ -19,7 +19,7 @@ class VideoWatch(scrapy.spiders.Spider): 'biliob_spider.pipelines.VideoAddPipeline': 300, 'biliob_spider.pipelines.AuthorChannelPipeline': 301 }, - 'DOWNLOAD_DELAY': 1 + # 'DOWNLOAD_DELAY': 0.5 } def __init__(self): @@ -35,8 +35,8 @@ def start_requests(self): c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {'mid': 1}) for each_doc in c: yield Request( - 'http://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + - str(each_doc['mid']) + '&pagesize=1&page=1&order=pubdate', + 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + + str(each_doc['mid']) + '&pagesize=10&page=1&order=pubdate', method='GET') def parse(self, response): @@ -48,10 +48,12 @@ def parse(self, response): list_channel = [] for each_channel in channels: list_channel.append(channels[each_channel]) - aid = j['data']['vlist'][0]['aid'] - mid = j['data']['vlist'][0]['mid'] + aid = [] + for each in j['data']['vlist']: + aid.append(int(each['aid'])) + mid = each['mid'] item = VideoWatcherItem() - item['aid'] = int(aid) + item['aid'] = aid item['channels'] = list_channel item['mid'] = mid yield item diff --git a/run.py b/run.py index 3a583f1..fcd2517 100644 --- a/run.py +++ b/run.py @@ -59,11 +59,11 @@ def run_threaded(job_func): schedule.every().day.at('12:00').do(run_threaded,data_analyze) schedule.every().day.at('01:00').do(run_threaded,update_author) -schedule.every(120).minutes.do(run_threaded,video_watcher) schedule.every().day.at('07:00').do(run_threaded,video_spider) schedule.every().day.at('14:00').do(run_threaded,auto_add_author) schedule.every().day.at('16:30').do(run_threaded,bangumi) schedule.every().day.at('16:30').do(run_threaded,donghua) +schedule.every().day.at('22:00').do(run_threaded,video_watcher) schedule.every().hour.do(run_threaded,site) schedule.every().minute.do(run_threaded,online) From 6f93fe8e8202f45986f8e8e0e0bed283dd6a288f Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 24 Nov 2018 21:58:35 +0800 Subject: [PATCH 066/469] update strategy --- biliob_analyzer/add_focus.py | 8 ++++---- biliob_spider/pipelines.py | 17 +++++++++-------- biliob_spider/settings.py | 4 ++-- biliob_spider/spiders/author_update.py | 2 +- biliob_spider/spiders/video_spider.py | 3 +-- biliob_spider/spiders/video_watcher.py | 14 ++++++++------ run.py | 2 +- 7 files changed, 26 insertions(+), 24 deletions(-) diff --git a/biliob_analyzer/add_focus.py b/biliob_analyzer/add_focus.py index f23de18..370095a 100644 --- a/biliob_analyzer/add_focus.py +++ b/biliob_analyzer/add_focus.py @@ -5,10 +5,10 @@ # 数据库登录需要帐号密码 client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) db = client['biliob'] # 获得数据库的句柄 -coll = db['video'] # 获得collection的句柄 +coll = db['author'] # 获得collection的句柄 docs = coll.find({'focus': {'$exists': False}}).batch_size(60) for each_doc in docs: - if 'aid' in each_doc: + if 'mid' in each_doc: each_doc['focus'] = True - coll.update_one({'aid': each_doc['aid']}, {'$set': each_doc}) - print('已修复aid' + str(each_doc['aid'])) + coll.update_one({'mid': each_doc['mid']}, {'$set': each_doc}) + print('已修复mid' + str(each_doc['mid'])) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index bc11b7b..b688fee 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -234,14 +234,15 @@ def __init__(self): def process_item(self, item, spider): try: - self.coll.update_one({ - "aid": item["aid"] - }, { - "$set": { - 'aid': item['aid'], - 'focus': True 
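# A small sketch of the query shared by the spiders above after the forceFocus change:
# an author (or video) is crawled when either flag is set, so documents pinned with
# forceFocus keep being crawled even after the analyzer sets focus to False. Connection
# details are placeholders.
from pymongo import MongoClient

coll = MongoClient('localhost', 27017)['biliob']['author']
for doc in coll.find({'$or': [{'focus': True}, {'forceFocus': True}]}, {'mid': 1}):
    print(doc['mid'])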
- }, - }, True) + for each_aid in item["aid"]: + self.coll.update_one({ + "aid": each_aid + }, { + "$set": { + 'aid': each_aid, + 'focus': True + }, + }, True) return item except Exception as error: # 出现错误时打印错误日志 diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 4981fb1..79aeeaf 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -11,8 +11,8 @@ import random -# LOG_FILE = "biliob_spider.log" -# LOG_LEVEL = "ERROR" +LOG_FILE = "biliob_spider.log" +LOG_LEVEL = "INFO" BOT_NAME = 'biliob_spider' diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 70b03e2..4eadcf2 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -66,7 +66,7 @@ def parse(self, response): 'article': int(article), 'datetime': datetime.datetime.now() } - yield Request("http://api.bilibili.com/x/space/upstat?mid={mid}".format(mid=str(mid)),meta={'item': item},method='GET',callback=self.parse_view) + yield Request("https://api.bilibili.com/x/space/upstat?mid={mid}".format(mid=str(mid)),meta={'item': item},method='GET',callback=self.parse_view) except Exception as error: # 出现错误时打印错误日志 logging.error("视频爬虫在解析时发生错误") diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index ab3dda4..f072934 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -109,8 +109,7 @@ class VideoSpider(scrapy.spiders.Spider): custom_settings = { 'ITEM_PIPELINES': { 'biliob_spider.pipelines.VideoPipeline': 300, - }, - 'DOWNLOAD_DELAY': 1 + } } def __init__(self): diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index 10cca6f..511597b 100644 --- a/biliob_spider/spiders/video_watcher.py +++ b/biliob_spider/spiders/video_watcher.py @@ -19,7 +19,7 @@ class VideoWatch(scrapy.spiders.Spider): 'biliob_spider.pipelines.VideoAddPipeline': 300, 'biliob_spider.pipelines.AuthorChannelPipeline': 301 }, - 'DOWNLOAD_DELAY': 1 + # 'DOWNLOAD_DELAY': 0.5 } def __init__(self): @@ -35,8 +35,8 @@ def start_requests(self): c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {'mid': 1}) for each_doc in c: yield Request( - 'http://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + - str(each_doc['mid']) + '&pagesize=1&page=1&order=pubdate', + 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + + str(each_doc['mid']) + '&pagesize=10&page=1&order=pubdate', method='GET') def parse(self, response): @@ -48,10 +48,12 @@ def parse(self, response): list_channel = [] for each_channel in channels: list_channel.append(channels[each_channel]) - aid = j['data']['vlist'][0]['aid'] - mid = j['data']['vlist'][0]['mid'] + aid = [] + for each in j['data']['vlist']: + aid.append(int(each['aid'])) + mid = each['mid'] item = VideoWatcherItem() - item['aid'] = int(aid) + item['aid'] = aid item['channels'] = list_channel item['mid'] = mid yield item diff --git a/run.py b/run.py index 3a583f1..fcd2517 100644 --- a/run.py +++ b/run.py @@ -59,11 +59,11 @@ def run_threaded(job_func): schedule.every().day.at('12:00').do(run_threaded,data_analyze) schedule.every().day.at('01:00').do(run_threaded,update_author) -schedule.every(120).minutes.do(run_threaded,video_watcher) schedule.every().day.at('07:00').do(run_threaded,video_spider) schedule.every().day.at('14:00').do(run_threaded,auto_add_author) schedule.every().day.at('16:30').do(run_threaded,bangumi) schedule.every().day.at('16:30').do(run_threaded,donghua) 
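run.py drives every spider from the schedule library, and each job is handed to run_threaded so a long crawl cannot block the scheduler tick. A minimal sketch of the pattern; the polling loop at the end is assumed, since it sits outside the hunks shown here:

    import threading
    import time
    import schedule

    def video_watcher():
        print('launch the video_watcher spider here')    # placeholder job body

    def run_threaded(job_func):
        threading.Thread(target=job_func).start()

    schedule.every().day.at('22:00').do(run_threaded, video_watcher)

    while True:                      # assumed polling loop
        schedule.run_pending()
        time.sleep(1)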
+schedule.every().day.at('22:00').do(run_threaded,video_watcher) schedule.every().hour.do(run_threaded,site) schedule.every().minute.do(run_threaded,online) From 3072bcfcdd2261410e1b131e6672908647de5015 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 3 Dec 2018 15:45:12 +0800 Subject: [PATCH 067/469] feature: Bilibili monthly rank spider. --- biliob_analyzer/delete_dump.py | 21 ++++ biliob_analyzer/delete_wrong_favorite.py | 7 +- biliob_spider/items.py | 9 ++ biliob_spider/pipelines.py | 133 +++++++++++++-------- biliob_spider/settings.py | 4 +- biliob_spider/spiders/bili_monthly_rank.py | 62 ++++++++++ run.py | 5 + 7 files changed, 183 insertions(+), 58 deletions(-) create mode 100644 biliob_analyzer/delete_dump.py create mode 100644 biliob_spider/spiders/bili_monthly_rank.py diff --git a/biliob_analyzer/delete_dump.py b/biliob_analyzer/delete_dump.py new file mode 100644 index 0000000..7f2e8ba --- /dev/null +++ b/biliob_analyzer/delete_dump.py @@ -0,0 +1,21 @@ +from db import db +import functools +coll = db['user'] +f = coll.find() +names = set() +for each in f: + names.add(each['name']) +for each_name in names: + f = coll.find({'name': each_name}) + while f.count() > 1: + a= coll.delete_one({ + 'name': each_name, + 'favoriteMid': { + '$exists': False + }, + 'favoriteAid': { + '$exists': False + } + }) + print(a) + f = coll.find({'name': each_name}) diff --git a/biliob_analyzer/delete_wrong_favorite.py b/biliob_analyzer/delete_wrong_favorite.py index c9c4bc9..6f1c1fb 100644 --- a/biliob_analyzer/delete_wrong_favorite.py +++ b/biliob_analyzer/delete_wrong_favorite.py @@ -1,10 +1,5 @@ from db import settings -from pymongo import MongoClient -# 链接mongoDB -client = MongoClient(settings['MINGO_HOST'], 27017) -# 数据库登录需要帐号密码 -client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) -db = client['biliob'] # 获得数据库的句柄 +from db import db coll = db['video'] # 获得collection的句柄 docs = coll.find().batch_size(60) for each_doc in docs: diff --git a/biliob_spider/items.py b/biliob_spider/items.py index de904c0..25f9c31 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -44,6 +44,15 @@ class AuthorItem(scrapy.Item): data = scrapy.Field() level = scrapy.Field() focus = scrapy.Field() + pts = scrapy.Field() + +class RankItem(scrapy.Item): + title = scrapy.Field() + author = scrapy.Field() + aid = scrapy.Field() + pts = scrapy.Field() + mid = scrapy.Field() + channel = scrapy.Field() class VideoOnline(scrapy.Item): title = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index b688fee..d652130 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -22,19 +22,19 @@ def __init__(self): def process_item(self, item, spider): try: self.coll.update_one({ - "aid": int(item["aid"]) + 'aid': int(item['aid']) }, { - "$set": { - "author": item['author'], - "subChannel": item['subChannel'], - "channel": item['channel'], - "mid": item['mid'], - "pic": item['pic'], - "title": item['title'], - "datetime": datetime.datetime.fromtimestamp( + '$set': { + 'author': item['author'], + 'subChannel': item['subChannel'], + 'channel': item['channel'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': datetime.datetime.fromtimestamp( item['datetime']) }, - "$push": { + '$push': { 'data': { '$each':[item['data']], '$position':0 @@ -59,13 +59,13 @@ def __init__(self): def process_item(self, item, spider): try: self.coll.update_one({ - "title": item['title'] + 'title': item['title'] }, { - "$set": { + '$set': { 
'tag':item['tag'], - "title": item['title'], + 'title': item['title'], }, - "$addToSet": { + '$addToSet': { 'data': item['data'] } }, True) @@ -87,13 +87,13 @@ def __init__(self): def process_item(self, item, spider): try: self.coll.update_one({ - "title": item['title'] + 'title': item['title'] }, { - "$set": { + '$set': { 'tag':item['tag'], - "title": item['title'], + 'title': item['title'], }, - "$addToSet": { + '$addToSet': { 'data': item['data'] } }, True) @@ -114,11 +114,11 @@ def __init__(self): def process_item(self, item, spider): try: self.coll.insert_one({ - "region_count": item['region_count'], - "all_count": item['all_count'], - "web_online": item['web_online'], - "play_online": item['play_online'], - "datetime":datetime.datetime.now() + 'region_count': item['region_count'], + 'all_count': item['all_count'], + 'web_online': item['web_online'], + 'play_online': item['play_online'], + 'datetime':datetime.datetime.now() }) return item except Exception as error: @@ -138,17 +138,17 @@ def __init__(self): def process_item(self, item, spider): try: self.coll.update_one({ - "mid": item["mid"] + 'mid': item['mid'] }, { - "$set": { - "name": item['name'], - "face": item['face'], - "official": item['official'], - "level": item['level'], - "sex": item['sex'], - "focus":True + '$set': { + 'name': item['name'], + 'face': item['face'], + 'official': item['official'], + 'level': item['level'], + 'sex': item['sex'], + 'focus':True }, - "$push": { + '$push': { 'data': { '$each':[item['data']], '$position':0 @@ -174,15 +174,15 @@ def process_item(self, item, spider): try: self.coll.update_one({ - "title": item["title"] + 'title': item['title'] }, { - "$set": { - "title": item['title'], - "author": item['author'], - "channel": item['channel'], - "subChannel": item['subChannel'], + '$set': { + 'title': item['title'], + 'author': item['author'], + 'channel': item['channel'], + 'subChannel': item['subChannel'], }, - "$addToSet": { + '$addToSet': { 'data': item['data'] } }, True) @@ -206,13 +206,13 @@ def process_item(self, item, spider): try: self.coll.update_one({ - "tag_id": item["tag_id"] + 'tag_id': item['tag_id'] }, { - "$set": { - "tag_name": item['tag_name'], - "ctime": item['ctime'], + '$set': { + 'tag_name': item['tag_name'], + 'ctime': item['ctime'], }, - "$addToSet": { + '$addToSet': { 'use': item['use'], 'atten': item['atten'], 'datetime': datetime.datetime.now() @@ -234,11 +234,11 @@ def __init__(self): def process_item(self, item, spider): try: - for each_aid in item["aid"]: + for each_aid in item['aid']: self.coll.update_one({ - "aid": each_aid + 'aid': each_aid }, { - "$set": { + '$set': { 'aid': each_aid, 'focus': True }, @@ -261,13 +261,46 @@ def __init__(self): def process_item(self, item, spider): try: self.coll.update_one({ - "mid": item["mid"] + 'mid': item['mid'] }, { - "$set": { - "channels": item['channels'] + '$set': { + 'channels': item['channels'] }, }, True) return item except Exception as error: # 出现错误时打印错误日志 logging.error(error) + +class BiliMonthlyRankPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['monthly_rank'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': item['aid'] + }, { + '$addToSet': { + 'pts': item['pts'], + 'datetime': datetime.datetime.now() + }, + '$set':{ + 'title': 
item['title'], + 'author': item['author'], + 'aid': item['aid'], + 'mid': item['mid'], + 'channel': item['channel'], + 'currentPts':item['pts'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 79aeeaf..1d0d6d5 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -11,8 +11,8 @@ import random -LOG_FILE = "biliob_spider.log" -LOG_LEVEL = "INFO" +# LOG_FILE = "biliob_spider.log" +# LOG_LEVEL = "INFO" BOT_NAME = 'biliob_spider' diff --git a/biliob_spider/spiders/bili_monthly_rank.py b/biliob_spider/spiders/bili_monthly_rank.py new file mode 100644 index 0000000..35eae38 --- /dev/null +++ b/biliob_spider/spiders/bili_monthly_rank.py @@ -0,0 +1,62 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import RankItem +import time +import json +import logging +from pymongo import MongoClient +import datetime + + +class BiliMonthlyRankSpider(scrapy.spiders.Spider): + name = "biliMonthlyRank" + allowed_domains = ["bilibili.com"] + start_urls = [ + 'https://www.bilibili.com/ranking/all/0/0/30', + 'https://www.bilibili.com/ranking/all/1/0/30', + 'https://www.bilibili.com/ranking/all/168/0/30', + 'https://www.bilibili.com/ranking/all/3/0/30', + 'https://www.bilibili.com/ranking/all/129/0/30', + 'https://www.bilibili.com/ranking/all/4/0/30', + 'https://www.bilibili.com/ranking/all/36/0/30', + 'https://www.bilibili.com/ranking/all/160/0/30', + 'https://www.bilibili.com/ranking/all/119/0/30', + 'https://www.bilibili.com/ranking/all/155/0/30', + 'https://www.bilibili.com/ranking/all/5/0/30', + 'https://www.bilibili.com/ranking/all/181/0/30' + ] + + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.BiliMonthlyRankPipeline': 300 + }, + } + + def parse(self, response): + try: + url_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/@href').extract() + pts_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[2]/div/text()').extract() + mid_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/@href').extract() + + title_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/text()').extract() + author_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/span/text()').extract() + aid_list = list(map(lambda x: int(x[27:-1]),url_list)) + pts_list = list(map(lambda x : int(x),pts_list)) + mid_list = list(map(lambda x : int(x.lstrip('//space.bilibili.com/').rstrip('/')),mid_list)) + channel = response.xpath("//li[@class='active']/text()").extract()[0] + # 为了爬取分区、粉丝数等数据,需要进入每一个视频的详情页面进行抓取 + for each in zip(title_list,author_list,aid_list,pts_list,mid_list): + item = RankItem() + item['title'] = each[0] + item['author'] = each[1] + item['aid'] = each[2] + item['pts'] = each[3] + item['mid'] = each[4] + item['channel'] = channel + yield item + except Exception as error: + # 出现错误时打印错误日志 + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) \ No newline at end of file diff --git a/run.py b/run.py index fcd2517..7ac325e 100644 --- a/run.py +++ b/run.py @@ -52,6 +52,10 @@ def online(): def data_analyze(): Popen(['python','run_analyzer.py']) +def bili_monthly_rank(): + Popen(['python','crawl','biliMonthlyRank']) + + def run_threaded(job_func): job_thread = threading.Thread(target=job_func) 
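The ranking spider above recovers aid and mid by slicing hrefs (x[27:-1] and lstrip('//space.bilibili.com/')). str.lstrip removes a set of characters rather than a prefix, so this only works because the ids are purely numeric; a regex is a less brittle way to pull the numbers out. A sketch assuming the same two URL shapes:

    import re

    def parse_aid(href):
        # e.g. '//www.bilibili.com/video/av12345/' (assumed shape)
        m = re.search(r'/av(\d+)', href)
        return int(m.group(1)) if m else None

    def parse_mid(href):
        # e.g. '//space.bilibili.com/678910/' (assumed shape)
        m = re.search(r'space\.bilibili\.com/(\d+)', href)
        return int(m.group(1)) if m else None

    assert parse_aid('//www.bilibili.com/video/av12345/') == 12345
    assert parse_mid('//space.bilibili.com/678910/') == 678910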
job_thread.start() @@ -64,6 +68,7 @@ def run_threaded(job_func): schedule.every().day.at('16:30').do(run_threaded,bangumi) schedule.every().day.at('16:30').do(run_threaded,donghua) schedule.every().day.at('22:00').do(run_threaded,video_watcher) +schedule.every().day.at('21:00').do(run_threaded,bili_monthly_rank) schedule.every().hour.do(run_threaded,site) schedule.every().minute.do(run_threaded,online) From e4927076057bb3121fb0840465cf015cb4e6cc9f Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 3 Dec 2018 15:45:12 +0800 Subject: [PATCH 068/469] feature: Bilibili monthly rank spider. --- biliob_analyzer/delete_dump.py | 21 ++++ biliob_analyzer/delete_wrong_favorite.py | 7 +- biliob_spider/items.py | 9 ++ biliob_spider/pipelines.py | 133 +++++++++++++-------- biliob_spider/settings.py | 4 +- biliob_spider/spiders/bili_monthly_rank.py | 62 ++++++++++ run.py | 5 + 7 files changed, 183 insertions(+), 58 deletions(-) create mode 100644 biliob_analyzer/delete_dump.py create mode 100644 biliob_spider/spiders/bili_monthly_rank.py diff --git a/biliob_analyzer/delete_dump.py b/biliob_analyzer/delete_dump.py new file mode 100644 index 0000000..7f2e8ba --- /dev/null +++ b/biliob_analyzer/delete_dump.py @@ -0,0 +1,21 @@ +from db import db +import functools +coll = db['user'] +f = coll.find() +names = set() +for each in f: + names.add(each['name']) +for each_name in names: + f = coll.find({'name': each_name}) + while f.count() > 1: + a= coll.delete_one({ + 'name': each_name, + 'favoriteMid': { + '$exists': False + }, + 'favoriteAid': { + '$exists': False + } + }) + print(a) + f = coll.find({'name': each_name}) diff --git a/biliob_analyzer/delete_wrong_favorite.py b/biliob_analyzer/delete_wrong_favorite.py index c9c4bc9..6f1c1fb 100644 --- a/biliob_analyzer/delete_wrong_favorite.py +++ b/biliob_analyzer/delete_wrong_favorite.py @@ -1,10 +1,5 @@ from db import settings -from pymongo import MongoClient -# 链接mongoDB -client = MongoClient(settings['MINGO_HOST'], 27017) -# 数据库登录需要帐号密码 -client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) -db = client['biliob'] # 获得数据库的句柄 +from db import db coll = db['video'] # 获得collection的句柄 docs = coll.find().batch_size(60) for each_doc in docs: diff --git a/biliob_spider/items.py b/biliob_spider/items.py index de904c0..25f9c31 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -44,6 +44,15 @@ class AuthorItem(scrapy.Item): data = scrapy.Field() level = scrapy.Field() focus = scrapy.Field() + pts = scrapy.Field() + +class RankItem(scrapy.Item): + title = scrapy.Field() + author = scrapy.Field() + aid = scrapy.Field() + pts = scrapy.Field() + mid = scrapy.Field() + channel = scrapy.Field() class VideoOnline(scrapy.Item): title = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index b688fee..d652130 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -22,19 +22,19 @@ def __init__(self): def process_item(self, item, spider): try: self.coll.update_one({ - "aid": int(item["aid"]) + 'aid': int(item['aid']) }, { - "$set": { - "author": item['author'], - "subChannel": item['subChannel'], - "channel": item['channel'], - "mid": item['mid'], - "pic": item['pic'], - "title": item['title'], - "datetime": datetime.datetime.fromtimestamp( + '$set': { + 'author': item['author'], + 'subChannel': item['subChannel'], + 'channel': item['channel'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': datetime.datetime.fromtimestamp( item['datetime']) }, - "$push": { 
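The history arrays in these pipelines are prepended rather than appended: $push combined with $each and $position: 0 inserts the new snapshot at the front of data, so data[0] is always the latest reading. A minimal sketch of the operator combination, with a hypothetical collection and values:

    from datetime import datetime
    from pymongo import MongoClient

    coll = MongoClient()['biliob']['video']              # assumed local instance

    snapshot = {'view': 1000, 'datetime': datetime.now()}    # hypothetical data point
    coll.update_one(
        {'aid': 170001},                                 # hypothetical aid
        {'$push': {'data': {'$each': [snapshot], '$position': 0}}},
        upsert=True)

    latest = coll.find_one({'aid': 170001})['data'][0]   # newest snapshot sits first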
+ '$push': { 'data': { '$each':[item['data']], '$position':0 @@ -59,13 +59,13 @@ def __init__(self): def process_item(self, item, spider): try: self.coll.update_one({ - "title": item['title'] + 'title': item['title'] }, { - "$set": { + '$set': { 'tag':item['tag'], - "title": item['title'], + 'title': item['title'], }, - "$addToSet": { + '$addToSet': { 'data': item['data'] } }, True) @@ -87,13 +87,13 @@ def __init__(self): def process_item(self, item, spider): try: self.coll.update_one({ - "title": item['title'] + 'title': item['title'] }, { - "$set": { + '$set': { 'tag':item['tag'], - "title": item['title'], + 'title': item['title'], }, - "$addToSet": { + '$addToSet': { 'data': item['data'] } }, True) @@ -114,11 +114,11 @@ def __init__(self): def process_item(self, item, spider): try: self.coll.insert_one({ - "region_count": item['region_count'], - "all_count": item['all_count'], - "web_online": item['web_online'], - "play_online": item['play_online'], - "datetime":datetime.datetime.now() + 'region_count': item['region_count'], + 'all_count': item['all_count'], + 'web_online': item['web_online'], + 'play_online': item['play_online'], + 'datetime':datetime.datetime.now() }) return item except Exception as error: @@ -138,17 +138,17 @@ def __init__(self): def process_item(self, item, spider): try: self.coll.update_one({ - "mid": item["mid"] + 'mid': item['mid'] }, { - "$set": { - "name": item['name'], - "face": item['face'], - "official": item['official'], - "level": item['level'], - "sex": item['sex'], - "focus":True + '$set': { + 'name': item['name'], + 'face': item['face'], + 'official': item['official'], + 'level': item['level'], + 'sex': item['sex'], + 'focus':True }, - "$push": { + '$push': { 'data': { '$each':[item['data']], '$position':0 @@ -174,15 +174,15 @@ def process_item(self, item, spider): try: self.coll.update_one({ - "title": item["title"] + 'title': item['title'] }, { - "$set": { - "title": item['title'], - "author": item['author'], - "channel": item['channel'], - "subChannel": item['subChannel'], + '$set': { + 'title': item['title'], + 'author': item['author'], + 'channel': item['channel'], + 'subChannel': item['subChannel'], }, - "$addToSet": { + '$addToSet': { 'data': item['data'] } }, True) @@ -206,13 +206,13 @@ def process_item(self, item, spider): try: self.coll.update_one({ - "tag_id": item["tag_id"] + 'tag_id': item['tag_id'] }, { - "$set": { - "tag_name": item['tag_name'], - "ctime": item['ctime'], + '$set': { + 'tag_name': item['tag_name'], + 'ctime': item['ctime'], }, - "$addToSet": { + '$addToSet': { 'use': item['use'], 'atten': item['atten'], 'datetime': datetime.datetime.now() @@ -234,11 +234,11 @@ def __init__(self): def process_item(self, item, spider): try: - for each_aid in item["aid"]: + for each_aid in item['aid']: self.coll.update_one({ - "aid": each_aid + 'aid': each_aid }, { - "$set": { + '$set': { 'aid': each_aid, 'focus': True }, @@ -261,13 +261,46 @@ def __init__(self): def process_item(self, item, spider): try: self.coll.update_one({ - "mid": item["mid"] + 'mid': item['mid'] }, { - "$set": { - "channels": item['channels'] + '$set': { + 'channels': item['channels'] }, }, True) return item except Exception as error: # 出现错误时打印错误日志 logging.error(error) + +class BiliMonthlyRankPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = 
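BiliMonthlyRankPipeline records its history with $addToSet, which only appends a value that is not already in the array; two days with an identical pts therefore collapse into one entry while the parallel datetime array keeps growing, so the two arrays can drift out of step. $push would keep every reading. A small demonstration with hypothetical values:

    from pymongo import MongoClient

    coll = MongoClient()['biliob']['monthly_rank']       # assumed local instance

    for pts in (4321, 4321, 5000):                       # hypothetical daily scores
        coll.update_one({'aid': 170001},                 # hypothetical aid
                        {'$addToSet': {'pts': pts}},
                        upsert=True)

    print(coll.find_one({'aid': 170001})['pts'])   # [4321, 5000], the repeat was dropped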
self.db['monthly_rank'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': item['aid'] + }, { + '$addToSet': { + 'pts': item['pts'], + 'datetime': datetime.datetime.now() + }, + '$set':{ + 'title': item['title'], + 'author': item['author'], + 'aid': item['aid'], + 'mid': item['mid'], + 'channel': item['channel'], + 'currentPts':item['pts'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 79aeeaf..1d0d6d5 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -11,8 +11,8 @@ import random -LOG_FILE = "biliob_spider.log" -LOG_LEVEL = "INFO" +# LOG_FILE = "biliob_spider.log" +# LOG_LEVEL = "INFO" BOT_NAME = 'biliob_spider' diff --git a/biliob_spider/spiders/bili_monthly_rank.py b/biliob_spider/spiders/bili_monthly_rank.py new file mode 100644 index 0000000..35eae38 --- /dev/null +++ b/biliob_spider/spiders/bili_monthly_rank.py @@ -0,0 +1,62 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import RankItem +import time +import json +import logging +from pymongo import MongoClient +import datetime + + +class BiliMonthlyRankSpider(scrapy.spiders.Spider): + name = "biliMonthlyRank" + allowed_domains = ["bilibili.com"] + start_urls = [ + 'https://www.bilibili.com/ranking/all/0/0/30', + 'https://www.bilibili.com/ranking/all/1/0/30', + 'https://www.bilibili.com/ranking/all/168/0/30', + 'https://www.bilibili.com/ranking/all/3/0/30', + 'https://www.bilibili.com/ranking/all/129/0/30', + 'https://www.bilibili.com/ranking/all/4/0/30', + 'https://www.bilibili.com/ranking/all/36/0/30', + 'https://www.bilibili.com/ranking/all/160/0/30', + 'https://www.bilibili.com/ranking/all/119/0/30', + 'https://www.bilibili.com/ranking/all/155/0/30', + 'https://www.bilibili.com/ranking/all/5/0/30', + 'https://www.bilibili.com/ranking/all/181/0/30' + ] + + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.BiliMonthlyRankPipeline': 300 + }, + } + + def parse(self, response): + try: + url_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/@href').extract() + pts_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[2]/div/text()').extract() + mid_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/@href').extract() + + title_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/text()').extract() + author_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/span/text()').extract() + aid_list = list(map(lambda x: int(x[27:-1]),url_list)) + pts_list = list(map(lambda x : int(x),pts_list)) + mid_list = list(map(lambda x : int(x.lstrip('//space.bilibili.com/').rstrip('/')),mid_list)) + channel = response.xpath("//li[@class='active']/text()").extract()[0] + # 为了爬取分区、粉丝数等数据,需要进入每一个视频的详情页面进行抓取 + for each in zip(title_list,author_list,aid_list,pts_list,mid_list): + item = RankItem() + item['title'] = each[0] + item['author'] = each[1] + item['aid'] = each[2] + item['pts'] = each[3] + item['mid'] = each[4] + item['channel'] = channel + yield item + except Exception as error: + # 出现错误时打印错误日志 + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) \ No newline at end of file diff --git a/run.py b/run.py index fcd2517..7ac325e 100644 --- a/run.py +++ b/run.py @@ -52,6 
+52,10 @@ def online(): def data_analyze(): Popen(['python','run_analyzer.py']) +def bili_monthly_rank(): + Popen(['python','crawl','biliMonthlyRank']) + + def run_threaded(job_func): job_thread = threading.Thread(target=job_func) job_thread.start() @@ -64,6 +68,7 @@ def run_threaded(job_func): schedule.every().day.at('16:30').do(run_threaded,bangumi) schedule.every().day.at('16:30').do(run_threaded,donghua) schedule.every().day.at('22:00').do(run_threaded,video_watcher) +schedule.every().day.at('21:00').do(run_threaded,bili_monthly_rank) schedule.every().hour.do(run_threaded,site) schedule.every().minute.do(run_threaded,online) From d8a26876b83999e5a9f11f73a6fea5abd35ee5b5 Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 3 Dec 2018 23:42:21 +0800 Subject: [PATCH 069/469] fix the script --- run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.py b/run.py index 9e068e1..3f7dc74 100644 --- a/run.py +++ b/run.py @@ -53,7 +53,7 @@ def data_analyze(): Popen(['python','run_analyzer.py']) def bili_monthly_rank(): - Popen(['python','crawl','biliMonthlyRank']) + Popen(['scrapy','crawl','biliMonthlyRank']) def run_threaded(job_func): From f2998b46b69bab4e2ec922fe6ca7acc8fb657e90 Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 3 Dec 2018 23:42:21 +0800 Subject: [PATCH 070/469] fix the script --- run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.py b/run.py index 9e068e1..3f7dc74 100644 --- a/run.py +++ b/run.py @@ -53,7 +53,7 @@ def data_analyze(): Popen(['python','run_analyzer.py']) def bili_monthly_rank(): - Popen(['python','crawl','biliMonthlyRank']) + Popen(['scrapy','crawl','biliMonthlyRank']) def run_threaded(job_func): From eec8312cb8b60e648c4b3dec7070eed428e65ac4 Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 3 Dec 2018 23:46:29 +0800 Subject: [PATCH 071/469] fix: remove the duplication --- biliob_spider/spiders/bili_monthly_rank.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/biliob_spider/spiders/bili_monthly_rank.py b/biliob_spider/spiders/bili_monthly_rank.py index 35eae38..eb7fcf9 100644 --- a/biliob_spider/spiders/bili_monthly_rank.py +++ b/biliob_spider/spiders/bili_monthly_rank.py @@ -13,7 +13,6 @@ class BiliMonthlyRankSpider(scrapy.spiders.Spider): name = "biliMonthlyRank" allowed_domains = ["bilibili.com"] start_urls = [ - 'https://www.bilibili.com/ranking/all/0/0/30', 'https://www.bilibili.com/ranking/all/1/0/30', 'https://www.bilibili.com/ranking/all/168/0/30', 'https://www.bilibili.com/ranking/all/3/0/30', @@ -59,4 +58,4 @@ def parse(self, response): # 出现错误时打印错误日志 logging.error("视频爬虫在解析时发生错误") logging.error(response.url) - logging.error(error) \ No newline at end of file + logging.error(error) From 9ab0dbda1c38eb9132cc6a10fa2cc7550f2ab955 Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 3 Dec 2018 23:46:29 +0800 Subject: [PATCH 072/469] fix: remove the duplication --- biliob_spider/spiders/bili_monthly_rank.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/biliob_spider/spiders/bili_monthly_rank.py b/biliob_spider/spiders/bili_monthly_rank.py index 35eae38..eb7fcf9 100644 --- a/biliob_spider/spiders/bili_monthly_rank.py +++ b/biliob_spider/spiders/bili_monthly_rank.py @@ -13,7 +13,6 @@ class BiliMonthlyRankSpider(scrapy.spiders.Spider): name = "biliMonthlyRank" allowed_domains = ["bilibili.com"] start_urls = [ - 'https://www.bilibili.com/ranking/all/0/0/30', 'https://www.bilibili.com/ranking/all/1/0/30', 'https://www.bilibili.com/ranking/all/168/0/30', 
'https://www.bilibili.com/ranking/all/3/0/30', @@ -59,4 +58,4 @@ def parse(self, response): # 出现错误时打印错误日志 logging.error("视频爬虫在解析时发生错误") logging.error(response.url) - logging.error(error) \ No newline at end of file + logging.error(error) From 89a415d7e6762de438bf662d9683167067321ad8 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 4 Dec 2018 12:54:11 +0800 Subject: [PATCH 073/469] Feature: Add current data field for donghua and bangumi --- biliob_spider/pipelines.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index d652130..64c467f 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -64,6 +64,10 @@ def process_item(self, item, spider): '$set': { 'tag':item['tag'], 'title': item['title'], + 'currentPts': item['data']['pts'], + 'currentPlay': item['data']['play'], + 'currentWatch': item['data']['watch'], + 'currentDanmaku': item['data']['danmaku'] }, '$addToSet': { 'data': item['data'] @@ -92,6 +96,10 @@ def process_item(self, item, spider): '$set': { 'tag':item['tag'], 'title': item['title'], + 'currentPts': item['data']['pts'], + 'currentPlay': item['data']['play'], + 'currentWatch': item['data']['watch'], + 'currentDanmaku': item['data']['danmaku'] }, '$addToSet': { 'data': item['data'] From 7c2c243cef9d7920b5d6eaf56274906137cede18 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 4 Dec 2018 12:54:11 +0800 Subject: [PATCH 074/469] Feature: Add current data field for donghua and bangumi --- biliob_spider/pipelines.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index d652130..64c467f 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -64,6 +64,10 @@ def process_item(self, item, spider): '$set': { 'tag':item['tag'], 'title': item['title'], + 'currentPts': item['data']['pts'], + 'currentPlay': item['data']['play'], + 'currentWatch': item['data']['watch'], + 'currentDanmaku': item['data']['danmaku'] }, '$addToSet': { 'data': item['data'] @@ -92,6 +96,10 @@ def process_item(self, item, spider): '$set': { 'tag':item['tag'], 'title': item['title'], + 'currentPts': item['data']['pts'], + 'currentPlay': item['data']['play'], + 'currentWatch': item['data']['watch'], + 'currentDanmaku': item['data']['danmaku'] }, '$addToSet': { 'data': item['data'] From a04af13866a2a75bdbb2c1b9d398bba3c98bbd97 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 4 Dec 2018 20:21:40 +0800 Subject: [PATCH 075/469] update: dangumi and donghua spider --- biliob_spider/items.py | 5 +++ biliob_spider/pipelines.py | 14 ++++++-- biliob_spider/spiders/bangumi.py | 55 +++++++++++-------------------- biliob_spider/spiders/donghua.py | 56 +++++++++++--------------------- 4 files changed, 55 insertions(+), 75 deletions(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 25f9c31..41a154e 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -22,6 +22,11 @@ class TagItem(scrapy.Item): class BangumiItem(scrapy.Item): title = scrapy.Field() tag = scrapy.Field() + cover = scrapy.Field() + square_cover = scrapy.Field() + is_finish = scrapy.Field() + is_started = scrapy.Field() + newest_ep_index = scrapy.Field() data = scrapy.Field() class VideoItem(scrapy.Item): diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 64c467f..bca2cb1 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -62,11 +62,16 @@ def process_item(self, item, spider): 'title': item['title'] }, { '$set': { 
- 'tag':item['tag'], 'title': item['title'], + 'cover': item['cover'], + 'isFinish': item['is_finish'], + 'isStarted': item['is_started'], + 'newest': item['newest_ep_index'], 'currentPts': item['data']['pts'], 'currentPlay': item['data']['play'], + 'squareCover': item['square_cover'], 'currentWatch': item['data']['watch'], + 'currentReview': item['data']['review'], 'currentDanmaku': item['data']['danmaku'] }, '$addToSet': { @@ -94,11 +99,16 @@ def process_item(self, item, spider): 'title': item['title'] }, { '$set': { - 'tag':item['tag'], 'title': item['title'], + 'cover': item['cover'], + 'isFinish': item['is_finish'], + 'isStarted': item['is_started'], + 'newest': item['newest_ep_index'], 'currentPts': item['data']['pts'], 'currentPlay': item['data']['play'], + 'squareCover': item['square_cover'], 'currentWatch': item['data']['watch'], + 'currentReview': item['data']['review'], 'currentDanmaku': item['data']['danmaku'] }, '$addToSet': { diff --git a/biliob_spider/spiders/bangumi.py b/biliob_spider/spiders/bangumi.py index 5cd4da7..d9d155b 100644 --- a/biliob_spider/spiders/bangumi.py +++ b/biliob_spider/spiders/bangumi.py @@ -4,6 +4,7 @@ from biliob_spider.items import BangumiItem import time import datetime +import json class BangumiSpider(scrapy.spiders.Spider): @@ -17,39 +18,21 @@ class BangumiSpider(scrapy.spiders.Spider): } def parse(self, response): - detail_href = response.xpath("//div[@class='img']/a/@href").extract() - - pts = response.xpath("//div[@class='pts']/div/text()").extract() - for (each_href, each_pts) in zip(detail_href, pts): - yield Request( - "https:" + each_href, - meta={'pts': each_pts}, - callback=self.detail_parse) - - def detail_parse(self, response): - pts = response.meta['pts'] - play = response.xpath( - '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[1]/em/text()' - ).extract()[0] - watch = response.xpath( - '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[2]/em/text()' - ).extract()[0] - danmaku = response.xpath( - '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[3]/em/text()' - ).extract()[0] - title = response.xpath( - '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()' - ).extract()[0] - tag = response.xpath('//span[@class="media-tag"]/text()').extract() - data = { - 'danmaku': danmaku, - 'watch': watch, - 'play': play, - 'pts': int(pts), - 'datetime': datetime.datetime.now() - } - item = BangumiItem() - item['tag'] = tag - item['title'] = title - item['data'] = data - yield item \ No newline at end of file + j = json.loads(response.xpath("//script[3]/text()").extract()[0][len('window.__INITIAL_STATE__='):].split(';')[0]) + for each in j['rankList']: + item = BangumiItem() + item['title'] = each['title'] + item['cover'] = each['cover'] + item['square_cover'] = each['square_cover'] + item['is_finish'] = each['is_finish'] + item['is_started'] = each['is_started'] + item['newest_ep_index'] = each['newest_ep_index'] + item['data'] = { + 'danmaku': each['dm_count'], + 'watch': each['fav'], + 'play': each['play'], + 'pts': each['pts'], + 'review': each['video_review'], + 'datetime': datetime.datetime.now() + } + yield item \ No newline at end of file diff --git a/biliob_spider/spiders/donghua.py b/biliob_spider/spiders/donghua.py index 0f856f1..63bb5af 100644 --- a/biliob_spider/spiders/donghua.py +++ b/biliob_spider/spiders/donghua.py @@ -4,7 +4,7 @@ from biliob_spider.items import BangumiItem import time import datetime - +import json class DonghuaSpider(scrapy.spiders.Spider): name = "donghua" @@ -17,39 +17,21 
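The rewritten bangumi and donghua parsers read the window.__INITIAL_STATE__ = {...}; blob that the page embeds in a script tag and take rankList straight from it. Splitting on ';' truncates the blob if any title happens to contain a semicolon; json.JSONDecoder.raw_decode parses the first complete JSON value and ignores the trailing script, which avoids that edge case. A sketch of the extraction step, assuming the same script layout:

    import json

    def initial_state(script_text):
        # Parse the first complete JSON value after the assignment and ignore
        # whatever follows it, so a ';' inside a title cannot cut the blob short.
        prefix = 'window.__INITIAL_STATE__='
        start = script_text.index(prefix) + len(prefix)
        obj, _ = json.JSONDecoder().raw_decode(script_text[start:])
        return obj

    demo = 'window.__INITIAL_STATE__={"rankList":[{"title":"a;b","pts":42}]};(function(){}());'
    print(initial_state(demo)['rankList'][0]['pts'])     # 42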
@@ class DonghuaSpider(scrapy.spiders.Spider): } def parse(self, response): - detail_href = response.xpath("//div[@class='img']/a/@href").extract() - - pts = response.xpath("//div[@class='pts']/div/text()").extract() - for (each_href, each_pts) in zip(detail_href, pts): - yield Request( - "https:" + each_href, - meta={'pts': each_pts}, - callback=self.detail_parse) - - def detail_parse(self, response): - pts = response.meta['pts'] - play = response.xpath( - '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[1]/em/text()' - ).extract()[0] - watch = response.xpath( - '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[2]/em/text()' - ).extract()[0] - danmaku = response.xpath( - '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[3]/em/text()' - ).extract()[0] - title = response.xpath( - '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()' - ).extract()[0] - tag = response.xpath('//span[@class="media-tag"]/text()').extract() - data = { - 'danmaku': danmaku, - 'watch': watch, - 'play': play, - 'pts': int(pts), - 'datetime': datetime.datetime.now() - } - item = BangumiItem() - item['tag'] = tag - item['title'] = title - item['data'] = data - yield item \ No newline at end of file + j = json.loads(response.xpath("//script[3]/text()").extract()[0][len('window.__INITIAL_STATE__='):].split(';')[0]) + for each in j['rankList']: + item = BangumiItem() + item['title'] = each['title'] + item['cover'] = each['cover'] + item['square_cover'] = each['square_cover'] + item['is_finish'] = each['is_finish'] + item['is_started'] = each['is_started'] + item['newest_ep_index'] = each['newest_ep_index'] + item['data'] = { + 'danmaku': each['dm_count'], + 'watch': each['fav'], + 'play': each['play'], + 'pts': each['pts'], + 'review': each['video_review'], + 'datetime': datetime.datetime.now() + } + yield item From cf1d7bb63810145d6652108f0d9dc92193108f20 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 4 Dec 2018 20:21:40 +0800 Subject: [PATCH 076/469] update: dangumi and donghua spider --- biliob_spider/items.py | 5 +++ biliob_spider/pipelines.py | 14 ++++++-- biliob_spider/spiders/bangumi.py | 55 +++++++++++-------------------- biliob_spider/spiders/donghua.py | 56 +++++++++++--------------------- 4 files changed, 55 insertions(+), 75 deletions(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 25f9c31..41a154e 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -22,6 +22,11 @@ class TagItem(scrapy.Item): class BangumiItem(scrapy.Item): title = scrapy.Field() tag = scrapy.Field() + cover = scrapy.Field() + square_cover = scrapy.Field() + is_finish = scrapy.Field() + is_started = scrapy.Field() + newest_ep_index = scrapy.Field() data = scrapy.Field() class VideoItem(scrapy.Item): diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 64c467f..bca2cb1 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -62,11 +62,16 @@ def process_item(self, item, spider): 'title': item['title'] }, { '$set': { - 'tag':item['tag'], 'title': item['title'], + 'cover': item['cover'], + 'isFinish': item['is_finish'], + 'isStarted': item['is_started'], + 'newest': item['newest_ep_index'], 'currentPts': item['data']['pts'], 'currentPlay': item['data']['play'], + 'squareCover': item['square_cover'], 'currentWatch': item['data']['watch'], + 'currentReview': item['data']['review'], 'currentDanmaku': item['data']['danmaku'] }, '$addToSet': { @@ -94,11 +99,16 @@ def process_item(self, item, spider): 'title': item['title'] }, { 
'$set': { - 'tag':item['tag'], 'title': item['title'], + 'cover': item['cover'], + 'isFinish': item['is_finish'], + 'isStarted': item['is_started'], + 'newest': item['newest_ep_index'], 'currentPts': item['data']['pts'], 'currentPlay': item['data']['play'], + 'squareCover': item['square_cover'], 'currentWatch': item['data']['watch'], + 'currentReview': item['data']['review'], 'currentDanmaku': item['data']['danmaku'] }, '$addToSet': { diff --git a/biliob_spider/spiders/bangumi.py b/biliob_spider/spiders/bangumi.py index 5cd4da7..d9d155b 100644 --- a/biliob_spider/spiders/bangumi.py +++ b/biliob_spider/spiders/bangumi.py @@ -4,6 +4,7 @@ from biliob_spider.items import BangumiItem import time import datetime +import json class BangumiSpider(scrapy.spiders.Spider): @@ -17,39 +18,21 @@ class BangumiSpider(scrapy.spiders.Spider): } def parse(self, response): - detail_href = response.xpath("//div[@class='img']/a/@href").extract() - - pts = response.xpath("//div[@class='pts']/div/text()").extract() - for (each_href, each_pts) in zip(detail_href, pts): - yield Request( - "https:" + each_href, - meta={'pts': each_pts}, - callback=self.detail_parse) - - def detail_parse(self, response): - pts = response.meta['pts'] - play = response.xpath( - '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[1]/em/text()' - ).extract()[0] - watch = response.xpath( - '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[2]/em/text()' - ).extract()[0] - danmaku = response.xpath( - '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[3]/em/text()' - ).extract()[0] - title = response.xpath( - '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()' - ).extract()[0] - tag = response.xpath('//span[@class="media-tag"]/text()').extract() - data = { - 'danmaku': danmaku, - 'watch': watch, - 'play': play, - 'pts': int(pts), - 'datetime': datetime.datetime.now() - } - item = BangumiItem() - item['tag'] = tag - item['title'] = title - item['data'] = data - yield item \ No newline at end of file + j = json.loads(response.xpath("//script[3]/text()").extract()[0][len('window.__INITIAL_STATE__='):].split(';')[0]) + for each in j['rankList']: + item = BangumiItem() + item['title'] = each['title'] + item['cover'] = each['cover'] + item['square_cover'] = each['square_cover'] + item['is_finish'] = each['is_finish'] + item['is_started'] = each['is_started'] + item['newest_ep_index'] = each['newest_ep_index'] + item['data'] = { + 'danmaku': each['dm_count'], + 'watch': each['fav'], + 'play': each['play'], + 'pts': each['pts'], + 'review': each['video_review'], + 'datetime': datetime.datetime.now() + } + yield item \ No newline at end of file diff --git a/biliob_spider/spiders/donghua.py b/biliob_spider/spiders/donghua.py index 0f856f1..63bb5af 100644 --- a/biliob_spider/spiders/donghua.py +++ b/biliob_spider/spiders/donghua.py @@ -4,7 +4,7 @@ from biliob_spider.items import BangumiItem import time import datetime - +import json class DonghuaSpider(scrapy.spiders.Spider): name = "donghua" @@ -17,39 +17,21 @@ class DonghuaSpider(scrapy.spiders.Spider): } def parse(self, response): - detail_href = response.xpath("//div[@class='img']/a/@href").extract() - - pts = response.xpath("//div[@class='pts']/div/text()").extract() - for (each_href, each_pts) in zip(detail_href, pts): - yield Request( - "https:" + each_href, - meta={'pts': each_pts}, - callback=self.detail_parse) - - def detail_parse(self, response): - pts = response.meta['pts'] - play = response.xpath( - 
'//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[1]/em/text()' - ).extract()[0] - watch = response.xpath( - '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[2]/em/text()' - ).extract()[0] - danmaku = response.xpath( - '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[2]/div[1]/span[3]/em/text()' - ).extract()[0] - title = response.xpath( - '//*[@id="app"]/div[1]/div[2]/div/div[2]/div[1]/span[1]/text()' - ).extract()[0] - tag = response.xpath('//span[@class="media-tag"]/text()').extract() - data = { - 'danmaku': danmaku, - 'watch': watch, - 'play': play, - 'pts': int(pts), - 'datetime': datetime.datetime.now() - } - item = BangumiItem() - item['tag'] = tag - item['title'] = title - item['data'] = data - yield item \ No newline at end of file + j = json.loads(response.xpath("//script[3]/text()").extract()[0][len('window.__INITIAL_STATE__='):].split(';')[0]) + for each in j['rankList']: + item = BangumiItem() + item['title'] = each['title'] + item['cover'] = each['cover'] + item['square_cover'] = each['square_cover'] + item['is_finish'] = each['is_finish'] + item['is_started'] = each['is_started'] + item['newest_ep_index'] = each['newest_ep_index'] + item['data'] = { + 'danmaku': each['dm_count'], + 'watch': each['fav'], + 'play': each['play'], + 'pts': each['pts'], + 'review': each['video_review'], + 'datetime': datetime.datetime.now() + } + yield item From f21012be739c8a58652a2fa570ee413acbd908b5 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 9 Dec 2018 16:53:10 +0800 Subject: [PATCH 077/469] feature: save current data as fields --- biliob_spider/items.py | 15 ++++++++++++ biliob_spider/pipelines.py | 16 +++++++++++- biliob_spider/spiders/author_update.py | 34 ++++++++++++++++++++------ biliob_spider/spiders/video_spider.py | 27 +++++++++++++------- 4 files changed, 74 insertions(+), 18 deletions(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 41a154e..e85c431 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -39,6 +39,14 @@ class VideoItem(scrapy.Item): title = scrapy.Field() mid = scrapy.Field() pic = scrapy.Field() + current_view = scrapy.Field() + current_favorite = scrapy.Field() + current_danmaku = scrapy.Field() + current_coin = scrapy.Field() + current_share = scrapy.Field() + current_like = scrapy.Field() + current_dislike = scrapy.Field() + current_datetime = scrapy.Field() class AuthorItem(scrapy.Item): mid = scrapy.Field() @@ -50,6 +58,13 @@ class AuthorItem(scrapy.Item): level = scrapy.Field() focus = scrapy.Field() pts = scrapy.Field() + c_fans = scrapy.Field() + c_attention = scrapy.Field() + c_archive = scrapy.Field() + c_article = scrapy.Field() + c_archive_view = scrapy.Field() + c_article_view = scrapy.Field() + c_datetime = scrapy.Field() class RankItem(scrapy.Item): title = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index bca2cb1..5103273 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -25,6 +25,14 @@ def process_item(self, item, spider): 'aid': int(item['aid']) }, { '$set': { + 'c_view':item['current_view'], + 'c_favorite':item['current_favorite'], + 'c_danmaku':item['current_danmaku'] , + 'c_coin':item['current_coin'], + 'c_share':item['current_share'] , + 'c_like':item['current_like'], + 'c_dislike':item['current_dislike'], + 'c_datetime':item['current_datetime'], 'author': item['author'], 'subChannel': item['subChannel'], 'channel': item['channel'], @@ -164,7 +172,13 @@ def process_item(self, item, spider): 'official': item['official'], 
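This patch mirrors the newest snapshot into flat current fields (c_view and friends on videos, c_fans and friends on authors) next to the data history, so list pages can sort and filter on a plain indexed field instead of reaching into data[0]; a later patch in this series renames them to camelCase (cFans, cView). A hedged query sketch against the author collection, assuming a local instance:

    from pymongo import MongoClient

    coll = MongoClient()['biliob']['author']             # assumed local instance

    # Top authors by current fan count: a plain sort on the denormalised field,
    # no need to unwind the data array of every document.
    top = coll.find({'focus': True}, {'name': 1, 'c_fans': 1}) \
              .sort('c_fans', -1).limit(10)
    for doc in top:
        print(doc.get('name'), doc.get('c_fans'))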
'level': item['level'], 'sex': item['sex'], - 'focus':True + 'focus':True, + 'c_fans':item['c_fans'], + 'c_attention':item['c_attention'] , + 'c_archive':item['c_archive'] , + 'c_article':item['c_article'] , + 'c_archive_view':item['c_archive_view'], + 'c_article_view':item['c_article_view'], }, '$push': { 'data': { diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 4eadcf2..12f5280 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -31,7 +31,13 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {"mid": 1}) + c = self.coll.find({ + '$or': [{ + 'focus': True + }, { + 'forceFocus': True + }] + }, {"mid": 1}) for each_doc in c: yield Request( "https://api.bilibili.com/x/web-interface/card?mid=" + str( @@ -66,18 +72,30 @@ def parse(self, response): 'article': int(article), 'datetime': datetime.datetime.now() } - yield Request("https://api.bilibili.com/x/space/upstat?mid={mid}".format(mid=str(mid)),meta={'item': item},method='GET',callback=self.parse_view) + item['c_fans'] = int(fans) + item['c_attention'] = int(attention) + item['c_archive'] = int(archive) + item['c_article'] = int(article) + yield Request( + "https://api.bilibili.com/x/space/upstat?mid={mid}".format( + mid=str(mid)), + meta={'item': item}, + method='GET', + callback=self.parse_view) except Exception as error: # 出现错误时打印错误日志 logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) - - def parse_view(self,response): + + def parse_view(self, response): j = json.loads(response.body) - archiveView = j['data']['archive']['view'] - articleView = j['data']['article']['view'] + archive_view = j['data']['archive']['view'] + article_view = j['data']['article']['view'] item = response.meta['item'] - item['data']['archiveView'] = archiveView - item['data']['articleView'] = articleView + item['data']['archiveView'] = archive_view + item['data']['articleView'] = article_view + item['c_archive_view'] = int(archive_view) + item['c_article_view'] = int(article_view) + yield item diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index f072934..30a4c55 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -160,24 +160,33 @@ def parse(self, response): share = d[each_key]['stat']['share'] like = d[each_key]['stat']['like'] dislike = d[each_key]['stat']['dislike'] - + current_date = datetime.now() data = { - 'view': int(view), - 'favorite': int(favorite), - 'danmaku': int(danmaku), - 'coin': int(coin), - 'share': int(share), - 'like': int(like), - 'dislike': int(dislike), - 'datetime': datetime.now() + 'view': view, + 'favorite': favorite, + 'danmaku': danmaku, + 'coin': coin, + 'share': share, + 'like': like, + 'dislike': dislike, + 'datetime': current_date } + subChannel = d[each_key]['tname'] title = d[each_key]['title'] date = d[each_key]['pubdate'] tid = d[each_key]['tid'] pic = d[each_key]['pic'] item = VideoItem() + item['current_view'] = view + item['current_favorite'] = favorite + item['current_danmaku'] = danmaku + item['current_coin'] = coin + item['current_share'] = share + item['current_like'] = like + item['current_dislike'] = dislike + item['current_datetime'] = current_date item['aid'] = aid item['mid'] = mid item['pic'] = pic From 9e7ca75a38c3fa02a7f8cdd95933aa029de35fc3 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 
9 Dec 2018 16:53:10 +0800 Subject: [PATCH 078/469] feature: save current data as fields --- biliob_spider/items.py | 15 ++++++++++++ biliob_spider/pipelines.py | 16 +++++++++++- biliob_spider/spiders/author_update.py | 34 ++++++++++++++++++++------ biliob_spider/spiders/video_spider.py | 27 +++++++++++++------- 4 files changed, 74 insertions(+), 18 deletions(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 41a154e..e85c431 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -39,6 +39,14 @@ class VideoItem(scrapy.Item): title = scrapy.Field() mid = scrapy.Field() pic = scrapy.Field() + current_view = scrapy.Field() + current_favorite = scrapy.Field() + current_danmaku = scrapy.Field() + current_coin = scrapy.Field() + current_share = scrapy.Field() + current_like = scrapy.Field() + current_dislike = scrapy.Field() + current_datetime = scrapy.Field() class AuthorItem(scrapy.Item): mid = scrapy.Field() @@ -50,6 +58,13 @@ class AuthorItem(scrapy.Item): level = scrapy.Field() focus = scrapy.Field() pts = scrapy.Field() + c_fans = scrapy.Field() + c_attention = scrapy.Field() + c_archive = scrapy.Field() + c_article = scrapy.Field() + c_archive_view = scrapy.Field() + c_article_view = scrapy.Field() + c_datetime = scrapy.Field() class RankItem(scrapy.Item): title = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index bca2cb1..5103273 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -25,6 +25,14 @@ def process_item(self, item, spider): 'aid': int(item['aid']) }, { '$set': { + 'c_view':item['current_view'], + 'c_favorite':item['current_favorite'], + 'c_danmaku':item['current_danmaku'] , + 'c_coin':item['current_coin'], + 'c_share':item['current_share'] , + 'c_like':item['current_like'], + 'c_dislike':item['current_dislike'], + 'c_datetime':item['current_datetime'], 'author': item['author'], 'subChannel': item['subChannel'], 'channel': item['channel'], @@ -164,7 +172,13 @@ def process_item(self, item, spider): 'official': item['official'], 'level': item['level'], 'sex': item['sex'], - 'focus':True + 'focus':True, + 'c_fans':item['c_fans'], + 'c_attention':item['c_attention'] , + 'c_archive':item['c_archive'] , + 'c_article':item['c_article'] , + 'c_archive_view':item['c_archive_view'], + 'c_article_view':item['c_article_view'], }, '$push': { 'data': { diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 4eadcf2..12f5280 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -31,7 +31,13 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {"mid": 1}) + c = self.coll.find({ + '$or': [{ + 'focus': True + }, { + 'forceFocus': True + }] + }, {"mid": 1}) for each_doc in c: yield Request( "https://api.bilibili.com/x/web-interface/card?mid=" + str( @@ -66,18 +72,30 @@ def parse(self, response): 'article': int(article), 'datetime': datetime.datetime.now() } - yield Request("https://api.bilibili.com/x/space/upstat?mid={mid}".format(mid=str(mid)),meta={'item': item},method='GET',callback=self.parse_view) + item['c_fans'] = int(fans) + item['c_attention'] = int(attention) + item['c_archive'] = int(archive) + item['c_article'] = int(article) + yield Request( + "https://api.bilibili.com/x/space/upstat?mid={mid}".format( + mid=str(mid)), + meta={'item': item}, + method='GET', + callback=self.parse_view) except 
Exception as error: # 出现错误时打印错误日志 logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) - - def parse_view(self,response): + + def parse_view(self, response): j = json.loads(response.body) - archiveView = j['data']['archive']['view'] - articleView = j['data']['article']['view'] + archive_view = j['data']['archive']['view'] + article_view = j['data']['article']['view'] item = response.meta['item'] - item['data']['archiveView'] = archiveView - item['data']['articleView'] = articleView + item['data']['archiveView'] = archive_view + item['data']['articleView'] = article_view + item['c_archive_view'] = int(archive_view) + item['c_article_view'] = int(article_view) + yield item diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index f072934..30a4c55 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -160,24 +160,33 @@ def parse(self, response): share = d[each_key]['stat']['share'] like = d[each_key]['stat']['like'] dislike = d[each_key]['stat']['dislike'] - + current_date = datetime.now() data = { - 'view': int(view), - 'favorite': int(favorite), - 'danmaku': int(danmaku), - 'coin': int(coin), - 'share': int(share), - 'like': int(like), - 'dislike': int(dislike), - 'datetime': datetime.now() + 'view': view, + 'favorite': favorite, + 'danmaku': danmaku, + 'coin': coin, + 'share': share, + 'like': like, + 'dislike': dislike, + 'datetime': current_date } + subChannel = d[each_key]['tname'] title = d[each_key]['title'] date = d[each_key]['pubdate'] tid = d[each_key]['tid'] pic = d[each_key]['pic'] item = VideoItem() + item['current_view'] = view + item['current_favorite'] = favorite + item['current_danmaku'] = danmaku + item['current_coin'] = coin + item['current_share'] = share + item['current_like'] = like + item['current_dislike'] = dislike + item['current_datetime'] = current_date item['aid'] = aid item['mid'] = mid item['pic'] = pic From 8ce2213e6adac8e239a457a207e0a8e68a3b69a4 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 11 Dec 2018 15:24:44 +0800 Subject: [PATCH 079/469] feature: auto caculate author fans change rant --- biliob_analyzer/author_rate_caculate.py | 38 +++++++++++++++++++++++++ biliob_spider/pipelines.py | 34 +++++++++++----------- run_analyzer.py | 1 + 3 files changed, 56 insertions(+), 17 deletions(-) create mode 100644 biliob_analyzer/author_rate_caculate.py diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py new file mode 100644 index 0000000..445fc12 --- /dev/null +++ b/biliob_analyzer/author_rate_caculate.py @@ -0,0 +1,38 @@ +from db import settings +from db import db +import datetime +coll = db['author'] # 获得collection的句柄 +for each_author in coll.find({'$or':[{'focus':True},{'forceFocus':True}]}): + rate = [] + i = 0 + if('data' not in each_author or len(each_author['data'])<(i+2)): + continue + c_fans = each_author['data'][i]['fans'] + c_date = each_author['data'][i]['datetime'] + p_fans = each_author['data'][i+1]['fans'] + p_date = each_author['data'][i+1]['datetime'] + while i < len(each_author['data']): + delta_seconds = (c_date-p_date).seconds + delta_day = (c_date-p_date).days + while (delta_day < 1) and i < len(each_author['data'])-2: + i = i + 1 + p_fans = each_author['data'][i+1]['fans'] + p_date = each_author['data'][i+1]['datetime'] + delta_day = (c_date-p_date).days + delta_seconds = (c_date-p_date).seconds + if(i >= len(each_author['data'])-2) and len(rate) != 0: + coll.update_one({ + 'mid': 
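author_update.py above builds most of the AuthorItem in parse and then issues a second request to the upstat endpoint, carrying the half-built item in meta so parse_view can finish it. A stripped-down sketch of that hand-off; the spider name and mid are hypothetical, while the two endpoints and the archive/article JSON paths are the ones used above:

    import json
    import scrapy
    from scrapy.http import Request

    class UpstatChainSpider(scrapy.Spider):
        name = 'upstat_chain_demo'                       # hypothetical demo spider
        start_urls = ['https://api.bilibili.com/x/web-interface/card?mid=1']

        def parse(self, response):
            item = {'mid': 1, 'data': {}}                # stand-in for AuthorItem
            yield Request(
                'https://api.bilibili.com/x/space/upstat?mid=1',
                meta={'item': item},                     # carry the item along
                callback=self.parse_view)

        def parse_view(self, response):
            item = response.meta['item']                 # same dict, now enriched
            j = json.loads(response.body)
            item['data']['archiveView'] = j['data']['archive']['view']
            item['data']['articleView'] = j['data']['article']['view']
            yield item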
each_author['mid'] + }, { + '$set': { + 'fansRate': rate, + 'cRate': rate[0]['rate'] + } + }, True) + break + delta_fans = c_fans-p_fans + day = delta_day+delta_seconds/(60*24*60) + rate.append({'rate':int(delta_fans/day),'datetime':c_date}) + c_fans = p_fans + c_date = p_date + pass diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 5103273..1402f68 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -25,14 +25,14 @@ def process_item(self, item, spider): 'aid': int(item['aid']) }, { '$set': { - 'c_view':item['current_view'], - 'c_favorite':item['current_favorite'], - 'c_danmaku':item['current_danmaku'] , - 'c_coin':item['current_coin'], - 'c_share':item['current_share'] , - 'c_like':item['current_like'], - 'c_dislike':item['current_dislike'], - 'c_datetime':item['current_datetime'], + 'cView':item['current_view'], + 'cFavorite':item['current_favorite'], + 'cDanmaku':item['current_danmaku'] , + 'cCoin':item['current_coin'], + 'cShare':item['current_share'] , + 'cLike':item['current_like'], + 'cDislike':item['current_dislike'], + 'cDatetime':item['current_datetime'], 'author': item['author'], 'subChannel': item['subChannel'], 'channel': item['channel'], @@ -167,18 +167,18 @@ def process_item(self, item, spider): 'mid': item['mid'] }, { '$set': { + 'focus':True, + 'sex': item['sex'], 'name': item['name'], 'face': item['face'], - 'official': item['official'], 'level': item['level'], - 'sex': item['sex'], - 'focus':True, - 'c_fans':item['c_fans'], - 'c_attention':item['c_attention'] , - 'c_archive':item['c_archive'] , - 'c_article':item['c_article'] , - 'c_archive_view':item['c_archive_view'], - 'c_article_view':item['c_article_view'], + 'cFans':item['c_fans'], + 'official': item['official'], + 'cArchive':item['c_archive'] , + 'cArticle':item['c_article'] , + 'cAttention':item['c_attention'] , + 'cArchive_view':item['c_archive_view'], + 'cArticle_view':item['c_article_view'], }, '$push': { 'data': { diff --git a/run_analyzer.py b/run_analyzer.py index fffda00..7673448 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -1,5 +1,6 @@ from biliob_analyzer.author_analyzer import AuthorAnalyzer from biliob_analyzer.video_analyzer import VideoAnalyzer +import biliob_analyzer.author_rate_caculate author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 6df463397f316d944467cd00bc937c05a988bca3 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 11 Dec 2018 15:24:44 +0800 Subject: [PATCH 080/469] feature: auto caculate author fans change rant --- biliob_analyzer/author_rate_caculate.py | 38 +++++++++++++++++++++++++ biliob_spider/pipelines.py | 34 +++++++++++----------- run_analyzer.py | 1 + 3 files changed, 56 insertions(+), 17 deletions(-) create mode 100644 biliob_analyzer/author_rate_caculate.py diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py new file mode 100644 index 0000000..445fc12 --- /dev/null +++ b/biliob_analyzer/author_rate_caculate.py @@ -0,0 +1,38 @@ +from db import settings +from db import db +import datetime +coll = db['author'] # 获得collection的句柄 +for each_author in coll.find({'$or':[{'focus':True},{'forceFocus':True}]}): + rate = [] + i = 0 + if('data' not in each_author or len(each_author['data'])<(i+2)): + continue + c_fans = each_author['data'][i]['fans'] + c_date = each_author['data'][i]['datetime'] + p_fans = each_author['data'][i+1]['fans'] + p_date = each_author['data'][i+1]['datetime'] + while i < len(each_author['data']): + delta_seconds = (c_date-p_date).seconds + 
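author_rate_caculate.py walks the data history newest-first (the pipelines prepend, so data[0] is the latest reading) and turns pairs of snapshots into a fans-per-day rate, skipping pairs that are less than a day apart. A simplified standalone sketch of the same arithmetic; it pairs adjacent snapshots instead of reproducing the index bookkeeping above, and the snapshots are synthetic:

    from datetime import datetime

    def fans_rates(data):
        # data is newest-first, matching how the pipelines store it.
        rates = []
        for newer, older in zip(data, data[1:]):
            days = (newer['datetime'] - older['datetime']).total_seconds() / 86400
            if days >= 1:                                # ignore gaps under a day
                rates.append({'rate': int((newer['fans'] - older['fans']) / days),
                              'datetime': newer['datetime']})
        return rates

    demo = [{'fans': 10500, 'datetime': datetime(2018, 12, 11)},
            {'fans': 10000, 'datetime': datetime(2018, 12, 10)},
            {'fans':  9000, 'datetime': datetime(2018, 12,  8)}]
    print(fans_rates(demo))      # 500 fans/day, then 500 fans/day over a two-day gap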
delta_day = (c_date-p_date).days + while (delta_day < 1) and i < len(each_author['data'])-2: + i = i + 1 + p_fans = each_author['data'][i+1]['fans'] + p_date = each_author['data'][i+1]['datetime'] + delta_day = (c_date-p_date).days + delta_seconds = (c_date-p_date).seconds + if(i >= len(each_author['data'])-2) and len(rate) != 0: + coll.update_one({ + 'mid': each_author['mid'] + }, { + '$set': { + 'fansRate': rate, + 'cRate': rate[0]['rate'] + } + }, True) + break + delta_fans = c_fans-p_fans + day = delta_day+delta_seconds/(60*24*60) + rate.append({'rate':int(delta_fans/day),'datetime':c_date}) + c_fans = p_fans + c_date = p_date + pass diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 5103273..1402f68 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -25,14 +25,14 @@ def process_item(self, item, spider): 'aid': int(item['aid']) }, { '$set': { - 'c_view':item['current_view'], - 'c_favorite':item['current_favorite'], - 'c_danmaku':item['current_danmaku'] , - 'c_coin':item['current_coin'], - 'c_share':item['current_share'] , - 'c_like':item['current_like'], - 'c_dislike':item['current_dislike'], - 'c_datetime':item['current_datetime'], + 'cView':item['current_view'], + 'cFavorite':item['current_favorite'], + 'cDanmaku':item['current_danmaku'] , + 'cCoin':item['current_coin'], + 'cShare':item['current_share'] , + 'cLike':item['current_like'], + 'cDislike':item['current_dislike'], + 'cDatetime':item['current_datetime'], 'author': item['author'], 'subChannel': item['subChannel'], 'channel': item['channel'], @@ -167,18 +167,18 @@ def process_item(self, item, spider): 'mid': item['mid'] }, { '$set': { + 'focus':True, + 'sex': item['sex'], 'name': item['name'], 'face': item['face'], - 'official': item['official'], 'level': item['level'], - 'sex': item['sex'], - 'focus':True, - 'c_fans':item['c_fans'], - 'c_attention':item['c_attention'] , - 'c_archive':item['c_archive'] , - 'c_article':item['c_article'] , - 'c_archive_view':item['c_archive_view'], - 'c_article_view':item['c_article_view'], + 'cFans':item['c_fans'], + 'official': item['official'], + 'cArchive':item['c_archive'] , + 'cArticle':item['c_article'] , + 'cAttention':item['c_attention'] , + 'cArchive_view':item['c_archive_view'], + 'cArticle_view':item['c_article_view'], }, '$push': { 'data': { diff --git a/run_analyzer.py b/run_analyzer.py index fffda00..7673448 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -1,5 +1,6 @@ from biliob_analyzer.author_analyzer import AuthorAnalyzer from biliob_analyzer.video_analyzer import VideoAnalyzer +import biliob_analyzer.author_rate_caculate author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 359789f8262de74ce157c9810919270299c5cc68 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 19 Dec 2018 22:18:18 +0800 Subject: [PATCH 081/469] hotfix: author fans rate --- biliob_analyzer/author_fans_watcher.py | 39 +++++++++ biliob_analyzer/author_rate_caculate.py | 104 +++++++++++++++++------- biliob_analyzer/delete_dump.py | 19 +---- biliob_analyzer/delete_robot.py | 25 ++++++ run.py | 1 - 5 files changed, 139 insertions(+), 49 deletions(-) create mode 100644 biliob_analyzer/author_fans_watcher.py create mode 100644 biliob_analyzer/delete_robot.py diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py new file mode 100644 index 0000000..3068c2f --- /dev/null +++ b/biliob_analyzer/author_fans_watcher.py @@ -0,0 +1,39 @@ +from db import settings +from db import db +import datetime +coll = 
db['author'] # 获得collection的句柄 + +MAGNIFICATION_INCREASE = 5 +MAGNIFICATION_DECREASE = 2 +FANS_INCREASE_THRESHOLD = 10000 +FANS_DECREASE_THRESHOLD = -3000 +for each_author in coll.find(): + if 'fansRate' in each_author and len(each_author['fansRate']) > 1: + index = 1 + while index < len(each_author['fansRate']): + # 涨粉超高 + c_index = index - 1 + if each_author['fansRate'][c_index][ + 'rate'] > FANS_INCREASE_THRESHOLD and each_author[ + 'fansRate'][c_index]['rate'] > each_author['fansRate'][ + index]['rate'] * MAGNIFICATION_INCREASE: + print('检测到大量涨粉:{name},速率:{rate},时间:{datetime}'.format( + name=each_author['name'], + rate=each_author['fansRate'][c_index]['rate'], + datetime=each_author['fansRate'][c_index]['datetime'])) + + if each_author['fansRate'][c_index]['rate'] < FANS_DECREASE_THRESHOLD and each_author['fansRate'][index]['rate'] > 1000: + print('检测到突然掉粉:{name},速率:{rate},时间:{datetime}'.format( + name=each_author['name'], + rate=each_author['fansRate'][c_index]['rate'], + datetime=each_author['fansRate'][c_index]['datetime'])) + + elif each_author['fansRate'][c_index][ + 'rate'] < FANS_DECREASE_THRESHOLD and abs( + each_author['fansRate'][c_index]['rate']) > abs( + each_author['fansRate'][index]['rate']) * MAGNIFICATION_DECREASE: + print('检测到大量掉粉:{name},速率:{rate},时间:{datetime}'.format( + name=each_author['name'], + rate=each_author['fansRate'][c_index]['rate'], + datetime=each_author['fansRate'][c_index]['datetime'])) + index += 1 diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 445fc12..72126a7 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -2,37 +2,79 @@ from db import db import datetime coll = db['author'] # 获得collection的句柄 -for each_author in coll.find({'$or':[{'focus':True},{'forceFocus':True}]}): +for each_author in coll.find().batch_size(4): rate = [] i = 0 - if('data' not in each_author or len(each_author['data'])<(i+2)): + # 数据量小于等于2条 + if ('data' not in each_author or len(each_author['data']) < (i + 2)): continue - c_fans = each_author['data'][i]['fans'] - c_date = each_author['data'][i]['datetime'] - p_fans = each_author['data'][i+1]['fans'] - p_date = each_author['data'][i+1]['datetime'] - while i < len(each_author['data']): - delta_seconds = (c_date-p_date).seconds - delta_day = (c_date-p_date).days - while (delta_day < 1) and i < len(each_author['data'])-2: - i = i + 1 - p_fans = each_author['data'][i+1]['fans'] - p_date = each_author['data'][i+1]['datetime'] - delta_day = (c_date-p_date).days - delta_seconds = (c_date-p_date).seconds - if(i >= len(each_author['data'])-2) and len(rate) != 0: - coll.update_one({ - 'mid': each_author['mid'] - }, { - '$set': { - 'fansRate': rate, - 'cRate': rate[0]['rate'] - } - }, True) - break - delta_fans = c_fans-p_fans - day = delta_day+delta_seconds/(60*24*60) - rate.append({'rate':int(delta_fans/day),'datetime':c_date}) - c_fans = p_fans - c_date = p_date - pass + + def next_c(i): + return each_author['data'][i]['fans'], each_author['data'][i][ + 'datetime'], each_author['data'][i][ + 'datetime'] - datetime.timedelta( + hours=each_author['data'][i][ + 'datetime'].hour, seconds=each_author['data'][i][ + 'datetime'].second,microseconds=each_author['data'][i]['datetime'].microsecond,minutes=each_author['data'][i]['datetime'].minute) + + c_fans, c_datetime, c_date = next_c(i) + + def next_p(i): + return each_author['data'][i + 1]['fans'], each_author['data'][ + i + 1]['datetime'], each_author['data'][ + i + 1]['datetime'] - 
datetime.timedelta( + hours=each_author['data'][i + 1]['datetime'].hour, + seconds=each_author['data'][i + 1]['datetime'].second, + microseconds=each_author['data'][i + 1]['datetime']. + microsecond, + minutes=each_author['data'][i + 1]['datetime'].minute) + + p_fans, p_datetime, p_date = next_p(i) + + + # 相差粉丝数 + delta_fans = c_fans - p_fans + # 相差日期数 + days = c_datetime.day - p_datetime.day + # 相差秒数 + seconds = days + (c_datetime.second - p_datetime.second) + + while i < len(each_author['data']) - 2: + # 是同一天 + if c_datetime.day == p_datetime.day: + i += 1 + p_fans, p_datetime, p_date = next_p(i) + continue + + # 相差一天 + if (c_date - p_date).days == 1: + delta_fans = c_fans - p_fans + seconds = days + (c_datetime.second - p_datetime.second) + rate.append({'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}) + i += 1 + c_fans, c_datetime, c_date = next_c(i) + p_fans, p_datetime, p_date = next_p(i) + delta_fans = c_fans - p_fans + seconds = days + (c_datetime.second - p_datetime.second) + continue + + # 相差多天 + days = (c_date - p_date).days + while days > 1: + t_rate = delta_fans/(days + seconds/(60*60*24)) + t_date = c_date - datetime.timedelta(1) + t_fans = c_fans - t_rate + delta_fans = c_fans - t_fans + rate.append({'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}) + c_fans = t_fans + c_date = t_date + days -= 1 + + if len(rate) != 0: + coll.update_one({ + 'mid': each_author['mid'] + }, {'$set': { + 'fansRate': rate, + 'cRate': rate[0]['rate'] + }}, True) + pass diff --git a/biliob_analyzer/delete_dump.py b/biliob_analyzer/delete_dump.py index 7f2e8ba..777533a 100644 --- a/biliob_analyzer/delete_dump.py +++ b/biliob_analyzer/delete_dump.py @@ -1,21 +1,6 @@ from db import db import functools coll = db['user'] -f = coll.find() -names = set() +f = coll.find({"name": {"$exists": True, "$regex": '/^.{25,}$/'}}) for each in f: - names.add(each['name']) -for each_name in names: - f = coll.find({'name': each_name}) - while f.count() > 1: - a= coll.delete_one({ - 'name': each_name, - 'favoriteMid': { - '$exists': False - }, - 'favoriteAid': { - '$exists': False - } - }) - print(a) - f = coll.find({'name': each_name}) + print(each) diff --git a/biliob_analyzer/delete_robot.py b/biliob_analyzer/delete_robot.py new file mode 100644 index 0000000..2470548 --- /dev/null +++ b/biliob_analyzer/delete_robot.py @@ -0,0 +1,25 @@ +from db import settings +from db import db +from bson import ObjectId +coll = db['user'] # 获得collection的句柄 +d = coll.find({ + 'favoriteAid': { + '$exists': False + }, + 'favoriteMid': { + '$exists': False + }, + '_id': { + '$gt': ObjectId('5c139d1ca3d20a2e31d717f3') + } +}).batch_size(100) +# for each in d: +# if len(each['name'])>20: +# coll.delete_one({'name':each['name']}) +# pass + +s = [';','\\',']','[',',','_','<','`','.','\'','!','~','>',':','/',':','#','(',')'] +for each in d: + print(each['name']) + coll.delete_one({'name':each['name']}) + pass \ No newline at end of file diff --git a/run.py b/run.py index 3f7dc74..a53ba34 100644 --- a/run.py +++ b/run.py @@ -61,7 +61,6 @@ def run_threaded(job_func): job_thread.start() schedule.every().day.at('12:00').do(run_threaded,data_analyze) - schedule.every().day.at('01:00').do(run_threaded,update_author) schedule.every().day.at('07:00').do(run_threaded,video_spider) schedule.every().day.at('14:00').do(run_threaded,auto_add_author) From 34c12325b8b913b93f3590b2fa09b726ec71b1d8 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 19 Dec 2018 22:18:18 +0800 Subject: [PATCH 082/469] hotfix: author fans 
rate --- biliob_analyzer/author_fans_watcher.py | 39 +++++++++ biliob_analyzer/author_rate_caculate.py | 104 +++++++++++++++++------- biliob_analyzer/delete_dump.py | 19 +---- biliob_analyzer/delete_robot.py | 25 ++++++ run.py | 1 - 5 files changed, 139 insertions(+), 49 deletions(-) create mode 100644 biliob_analyzer/author_fans_watcher.py create mode 100644 biliob_analyzer/delete_robot.py diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py new file mode 100644 index 0000000..3068c2f --- /dev/null +++ b/biliob_analyzer/author_fans_watcher.py @@ -0,0 +1,39 @@ +from db import settings +from db import db +import datetime +coll = db['author'] # 获得collection的句柄 + +MAGNIFICATION_INCREASE = 5 +MAGNIFICATION_DECREASE = 2 +FANS_INCREASE_THRESHOLD = 10000 +FANS_DECREASE_THRESHOLD = -3000 +for each_author in coll.find(): + if 'fansRate' in each_author and len(each_author['fansRate']) > 1: + index = 1 + while index < len(each_author['fansRate']): + # 涨粉超高 + c_index = index - 1 + if each_author['fansRate'][c_index][ + 'rate'] > FANS_INCREASE_THRESHOLD and each_author[ + 'fansRate'][c_index]['rate'] > each_author['fansRate'][ + index]['rate'] * MAGNIFICATION_INCREASE: + print('检测到大量涨粉:{name},速率:{rate},时间:{datetime}'.format( + name=each_author['name'], + rate=each_author['fansRate'][c_index]['rate'], + datetime=each_author['fansRate'][c_index]['datetime'])) + + if each_author['fansRate'][c_index]['rate'] < FANS_DECREASE_THRESHOLD and each_author['fansRate'][index]['rate'] > 1000: + print('检测到突然掉粉:{name},速率:{rate},时间:{datetime}'.format( + name=each_author['name'], + rate=each_author['fansRate'][c_index]['rate'], + datetime=each_author['fansRate'][c_index]['datetime'])) + + elif each_author['fansRate'][c_index][ + 'rate'] < FANS_DECREASE_THRESHOLD and abs( + each_author['fansRate'][c_index]['rate']) > abs( + each_author['fansRate'][index]['rate']) * MAGNIFICATION_DECREASE: + print('检测到大量掉粉:{name},速率:{rate},时间:{datetime}'.format( + name=each_author['name'], + rate=each_author['fansRate'][c_index]['rate'], + datetime=each_author['fansRate'][c_index]['datetime'])) + index += 1 diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 445fc12..72126a7 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -2,37 +2,79 @@ from db import db import datetime coll = db['author'] # 获得collection的句柄 -for each_author in coll.find({'$or':[{'focus':True},{'forceFocus':True}]}): +for each_author in coll.find().batch_size(4): rate = [] i = 0 - if('data' not in each_author or len(each_author['data'])<(i+2)): + # 数据量小于等于2条 + if ('data' not in each_author or len(each_author['data']) < (i + 2)): continue - c_fans = each_author['data'][i]['fans'] - c_date = each_author['data'][i]['datetime'] - p_fans = each_author['data'][i+1]['fans'] - p_date = each_author['data'][i+1]['datetime'] - while i < len(each_author['data']): - delta_seconds = (c_date-p_date).seconds - delta_day = (c_date-p_date).days - while (delta_day < 1) and i < len(each_author['data'])-2: - i = i + 1 - p_fans = each_author['data'][i+1]['fans'] - p_date = each_author['data'][i+1]['datetime'] - delta_day = (c_date-p_date).days - delta_seconds = (c_date-p_date).seconds - if(i >= len(each_author['data'])-2) and len(rate) != 0: - coll.update_one({ - 'mid': each_author['mid'] - }, { - '$set': { - 'fansRate': rate, - 'cRate': rate[0]['rate'] - } - }, True) - break - delta_fans = c_fans-p_fans - day = delta_day+delta_seconds/(60*24*60) - 
rate.append({'rate':int(delta_fans/day),'datetime':c_date}) - c_fans = p_fans - c_date = p_date - pass + + def next_c(i): + return each_author['data'][i]['fans'], each_author['data'][i][ + 'datetime'], each_author['data'][i][ + 'datetime'] - datetime.timedelta( + hours=each_author['data'][i][ + 'datetime'].hour, seconds=each_author['data'][i][ + 'datetime'].second,microseconds=each_author['data'][i]['datetime'].microsecond,minutes=each_author['data'][i]['datetime'].minute) + + c_fans, c_datetime, c_date = next_c(i) + + def next_p(i): + return each_author['data'][i + 1]['fans'], each_author['data'][ + i + 1]['datetime'], each_author['data'][ + i + 1]['datetime'] - datetime.timedelta( + hours=each_author['data'][i + 1]['datetime'].hour, + seconds=each_author['data'][i + 1]['datetime'].second, + microseconds=each_author['data'][i + 1]['datetime']. + microsecond, + minutes=each_author['data'][i + 1]['datetime'].minute) + + p_fans, p_datetime, p_date = next_p(i) + + + # 相差粉丝数 + delta_fans = c_fans - p_fans + # 相差日期数 + days = c_datetime.day - p_datetime.day + # 相差秒数 + seconds = days + (c_datetime.second - p_datetime.second) + + while i < len(each_author['data']) - 2: + # 是同一天 + if c_datetime.day == p_datetime.day: + i += 1 + p_fans, p_datetime, p_date = next_p(i) + continue + + # 相差一天 + if (c_date - p_date).days == 1: + delta_fans = c_fans - p_fans + seconds = days + (c_datetime.second - p_datetime.second) + rate.append({'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}) + i += 1 + c_fans, c_datetime, c_date = next_c(i) + p_fans, p_datetime, p_date = next_p(i) + delta_fans = c_fans - p_fans + seconds = days + (c_datetime.second - p_datetime.second) + continue + + # 相差多天 + days = (c_date - p_date).days + while days > 1: + t_rate = delta_fans/(days + seconds/(60*60*24)) + t_date = c_date - datetime.timedelta(1) + t_fans = c_fans - t_rate + delta_fans = c_fans - t_fans + rate.append({'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}) + c_fans = t_fans + c_date = t_date + days -= 1 + + if len(rate) != 0: + coll.update_one({ + 'mid': each_author['mid'] + }, {'$set': { + 'fansRate': rate, + 'cRate': rate[0]['rate'] + }}, True) + pass diff --git a/biliob_analyzer/delete_dump.py b/biliob_analyzer/delete_dump.py index 7f2e8ba..777533a 100644 --- a/biliob_analyzer/delete_dump.py +++ b/biliob_analyzer/delete_dump.py @@ -1,21 +1,6 @@ from db import db import functools coll = db['user'] -f = coll.find() -names = set() +f = coll.find({"name": {"$exists": True, "$regex": '/^.{25,}$/'}}) for each in f: - names.add(each['name']) -for each_name in names: - f = coll.find({'name': each_name}) - while f.count() > 1: - a= coll.delete_one({ - 'name': each_name, - 'favoriteMid': { - '$exists': False - }, - 'favoriteAid': { - '$exists': False - } - }) - print(a) - f = coll.find({'name': each_name}) + print(each) diff --git a/biliob_analyzer/delete_robot.py b/biliob_analyzer/delete_robot.py new file mode 100644 index 0000000..2470548 --- /dev/null +++ b/biliob_analyzer/delete_robot.py @@ -0,0 +1,25 @@ +from db import settings +from db import db +from bson import ObjectId +coll = db['user'] # 获得collection的句柄 +d = coll.find({ + 'favoriteAid': { + '$exists': False + }, + 'favoriteMid': { + '$exists': False + }, + '_id': { + '$gt': ObjectId('5c139d1ca3d20a2e31d717f3') + } +}).batch_size(100) +# for each in d: +# if len(each['name'])>20: +# coll.delete_one({'name':each['name']}) +# pass + +s = [';','\\',']','[',',','_','<','`','.','\'','!','~','>',':','/',':','#','(',')'] +for each in d: + 
print(each['name']) + coll.delete_one({'name':each['name']}) + pass \ No newline at end of file diff --git a/run.py b/run.py index 3f7dc74..a53ba34 100644 --- a/run.py +++ b/run.py @@ -61,7 +61,6 @@ def run_threaded(job_func): job_thread.start() schedule.every().day.at('12:00').do(run_threaded,data_analyze) - schedule.every().day.at('01:00').do(run_threaded,update_author) schedule.every().day.at('07:00').do(run_threaded,video_spider) schedule.every().day.at('14:00').do(run_threaded,auto_add_author) From 425afd96c32a670af88a461bfc2f74a884f771fc Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 20 Dec 2018 16:22:31 +0800 Subject: [PATCH 083/469] feature: author fans rate of change caculate --- biliob_analyzer/author_rate_caculate.py | 39 +++++++++++++++++-------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 72126a7..607b068 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -2,12 +2,17 @@ from db import db import datetime coll = db['author'] # 获得collection的句柄 -for each_author in coll.find().batch_size(4): +for each_author in coll.find().batch_size(8): rate = [] i = 0 # 数据量小于等于2条 if ('data' not in each_author or len(each_author['data']) < (i + 2)): continue + if ('fansRate' in each_author): + lastest_date = each_author['fansRate'][0]['datetime'] + + def getDate(date): + return date - datetime.timedelta(hours=date.hour, seconds=date.second,microseconds=date.microsecond,minutes=date.minute) def next_c(i): return each_author['data'][i]['fans'], each_author['data'][i][ @@ -40,6 +45,11 @@ def next_p(i): seconds = days + (c_datetime.second - p_datetime.second) while i < len(each_author['data']) - 2: + + # 已经有了该日期的数据 + if 'fansRate' in each_author and c_date <= lastest_date: + break + # 是同一天 if c_datetime.day == p_datetime.day: i += 1 @@ -50,7 +60,14 @@ def next_p(i): if (c_date - p_date).days == 1: delta_fans = c_fans - p_fans seconds = days + (c_datetime.second - p_datetime.second) - rate.append({'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}) + coll.update_one({ + 'mid': each_author['mid'] + }, {'$push': { + 'fansRate': { + '$each': [{'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}], + '$position': 0 + } + }}, True) i += 1 c_fans, c_datetime, c_date = next_c(i) p_fans, p_datetime, p_date = next_p(i) @@ -65,16 +82,14 @@ def next_p(i): t_date = c_date - datetime.timedelta(1) t_fans = c_fans - t_rate delta_fans = c_fans - t_fans - rate.append({'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}) + coll.update_one({ + 'mid': each_author['mid'] + }, {'$push': { + 'fansRate': { + '$each': [{'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}], + '$position': 0 + } + }}, True) c_fans = t_fans c_date = t_date days -= 1 - - if len(rate) != 0: - coll.update_one({ - 'mid': each_author['mid'] - }, {'$set': { - 'fansRate': rate, - 'cRate': rate[0]['rate'] - }}, True) - pass From fa687f98b93ced509ff02f4ba81b7ba0525dbfbd Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 20 Dec 2018 16:22:31 +0800 Subject: [PATCH 084/469] feature: author fans rate of change caculate --- biliob_analyzer/author_rate_caculate.py | 39 +++++++++++++++++-------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 72126a7..607b068 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ 
b/biliob_analyzer/author_rate_caculate.py @@ -2,12 +2,17 @@ from db import db import datetime coll = db['author'] # 获得collection的句柄 -for each_author in coll.find().batch_size(4): +for each_author in coll.find().batch_size(8): rate = [] i = 0 # 数据量小于等于2条 if ('data' not in each_author or len(each_author['data']) < (i + 2)): continue + if ('fansRate' in each_author): + lastest_date = each_author['fansRate'][0]['datetime'] + + def getDate(date): + return date - datetime.timedelta(hours=date.hour, seconds=date.second,microseconds=date.microsecond,minutes=date.minute) def next_c(i): return each_author['data'][i]['fans'], each_author['data'][i][ @@ -40,6 +45,11 @@ def next_p(i): seconds = days + (c_datetime.second - p_datetime.second) while i < len(each_author['data']) - 2: + + # 已经有了该日期的数据 + if 'fansRate' in each_author and c_date <= lastest_date: + break + # 是同一天 if c_datetime.day == p_datetime.day: i += 1 @@ -50,7 +60,14 @@ def next_p(i): if (c_date - p_date).days == 1: delta_fans = c_fans - p_fans seconds = days + (c_datetime.second - p_datetime.second) - rate.append({'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}) + coll.update_one({ + 'mid': each_author['mid'] + }, {'$push': { + 'fansRate': { + '$each': [{'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}], + '$position': 0 + } + }}, True) i += 1 c_fans, c_datetime, c_date = next_c(i) p_fans, p_datetime, p_date = next_p(i) @@ -65,16 +82,14 @@ def next_p(i): t_date = c_date - datetime.timedelta(1) t_fans = c_fans - t_rate delta_fans = c_fans - t_fans - rate.append({'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}) + coll.update_one({ + 'mid': each_author['mid'] + }, {'$push': { + 'fansRate': { + '$each': [{'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}], + '$position': 0 + } + }}, True) c_fans = t_fans c_date = t_date days -= 1 - - if len(rate) != 0: - coll.update_one({ - 'mid': each_author['mid'] - }, {'$set': { - 'fansRate': rate, - 'cRate': rate[0]['rate'] - }}, True) - pass From 4599cb2b471b54211c42eda5154dd73000832716 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 20 Dec 2018 16:38:39 +0800 Subject: [PATCH 085/469] feature: author fans watcher --- biliob_analyzer/author_fans_watcher.py | 135 +++++++++++++++++++++---- run_analyzer.py | 1 + 2 files changed, 119 insertions(+), 17 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index 3068c2f..87ebfae 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -1,39 +1,140 @@ from db import settings from db import db import datetime +from enum import Enum coll = db['author'] # 获得collection的句柄 +monitor = db['monitor'] # 获得collection的句柄 +video = db['video'] # 获得collection的句柄 -MAGNIFICATION_INCREASE = 5 +# 对于上升趋势的UP主,较上日增加多少倍,才算是巨量涨粉 +WTF_INCREASE = 10 + +# 对于上升趋势的UP主,较上日增加多少倍,才算是超量涨粉 +AMAZING_INCREASE = 5 + +# 对于上升趋势的UP主,较上日增加多少倍,才算是大量涨粉 +MAGNIFICATION_INCREASE = 3 + +# 对于下降趋势的UP主,较上日减少多少倍,才算是大量掉粉 MAGNIFICATION_DECREASE = 2 + +# 对于上升趋势的UP主,较上日增加多少倍,才算是超量涨粉 +AMAZING_DECREASE = 5 + +# 对于下降趋势的UP主,较上日减少多少倍,才算是巨量掉粉 +WTF_DECREASE = 8 + +# 粉丝增加多少,才算大量涨粉 FANS_INCREASE_THRESHOLD = 10000 +# 粉丝减少多少,算作大量掉粉 FANS_DECREASE_THRESHOLD = -3000 -for each_author in coll.find(): +# 多少粉丝以上才关注掉粉 +WATCH_DECREASE = 1000 + +class Event(Enum): + increase_1 = 'I级增长' + increase_2 = 'II级猛增' + increase_3 = 'III级激增' + sudden_fall = 'SF级骤减' + decrease_1 = 'I级减少' + decrease_2 = 'II级锐减' + decrease_3 = 'III级暴减' + + +for each_author in 
coll.find().batch_size(8): if 'fansRate' in each_author and len(each_author['fansRate']) > 1: index = 1 + + def print_data(each_author): + return '{name},速率:{rate},时间:{datetime}'.format( + name=each_author['name'], + rate=each_author['fansRate'][c_index]['rate'], + datetime=each_author['fansRate'][c_index]['datetime']) + + def insert_event(event_type): + videos = video.find({'mid':each_author['mid']}) + temp_video = {} + cause = {} + for each_v in videos: + if abs(each_v['datetime'] - each_author['fansRate'][c_index]['datetime']).days <= 1: + if 'cView' not in temp_video or each_v['cView'] > temp_video['cView']: + temp_video['aid'] = each_v['aid'] + temp_video['title'] = each_v['title'] + temp_video['cView'] = each_v['cView'] + cause = { + 'type': 'video', + 'aid': temp_video['aid'], + 'title': temp_video['title'], + 'cView': temp_video['cView'] + } + + monitor.insert_one({ + 'type': + event_type.value, + 'mid': + each_author['mid'], + 'author': + each_author['name'], + 'rate': + each_author['fansRate'][c_index]['rate'], + 'datetime': + each_author['fansRate'][c_index]['datetime'], + 'cause': cause + }) + while index < len(each_author['fansRate']): # 涨粉超高 c_index = index - 1 if each_author['fansRate'][c_index][ 'rate'] > FANS_INCREASE_THRESHOLD and each_author[ 'fansRate'][c_index]['rate'] > each_author['fansRate'][ - index]['rate'] * MAGNIFICATION_INCREASE: - print('检测到大量涨粉:{name},速率:{rate},时间:{datetime}'.format( - name=each_author['name'], - rate=each_author['fansRate'][c_index]['rate'], - datetime=each_author['fansRate'][c_index]['datetime'])) + index]['rate'] * WTF_INCREASE: + insert_event(Event.increase_3) + print(Event.increase_3.value + print_data(each_author)) - if each_author['fansRate'][c_index]['rate'] < FANS_DECREASE_THRESHOLD and each_author['fansRate'][index]['rate'] > 1000: - print('检测到突然掉粉:{name},速率:{rate},时间:{datetime}'.format( - name=each_author['name'], - rate=each_author['fansRate'][c_index]['rate'], - datetime=each_author['fansRate'][c_index]['datetime'])) + elif each_author['fansRate'][c_index][ + 'rate'] > FANS_INCREASE_THRESHOLD and each_author[ + 'fansRate'][c_index]['rate'] > each_author['fansRate'][ + index]['rate'] * AMAZING_INCREASE: + insert_event(Event.increase_2) + print(Event.increase_2.value + print_data(each_author)) + elif each_author['fansRate'][c_index][ + 'rate'] > FANS_INCREASE_THRESHOLD and each_author[ + 'fansRate'][c_index]['rate'] > each_author['fansRate'][ + index]['rate'] * MAGNIFICATION_INCREASE: + insert_event(Event.increase_1) + print(Event.increase_1.value + print_data(each_author)) + + # 突然出现大量的掉粉 + if each_author['fansRate'][c_index][ + 'rate'] < FANS_DECREASE_THRESHOLD and each_author[ + 'fansRate'][index]['rate'] > WATCH_DECREASE: + insert_event(Event.sudden_fall) + print(Event.sudden_fall.value + print_data(each_author)) + # 一掉再掉 elif each_author['fansRate'][c_index][ 'rate'] < FANS_DECREASE_THRESHOLD and abs( each_author['fansRate'][c_index]['rate']) > abs( - each_author['fansRate'][index]['rate']) * MAGNIFICATION_DECREASE: - print('检测到大量掉粉:{name},速率:{rate},时间:{datetime}'.format( - name=each_author['name'], - rate=each_author['fansRate'][c_index]['rate'], - datetime=each_author['fansRate'][c_index]['datetime'])) + each_author['fansRate'][index] + ['rate']) * WTF_DECREASE: + insert_event(Event.decrease_3) + print(Event.decrease_3.value + print_data(each_author)) + # 一掉再掉 + elif each_author['fansRate'][c_index][ + 'rate'] < FANS_DECREASE_THRESHOLD and abs( + each_author['fansRate'][c_index]['rate']) > abs( + 
each_author['fansRate'][index] + ['rate']) * AMAZING_DECREASE: + insert_event(Event.decrease_2) + print(Event.decrease_2.value + print_data(each_author)) + # 一掉再掉 + elif each_author['fansRate'][c_index][ + 'rate'] < FANS_DECREASE_THRESHOLD and abs( + each_author['fansRate'][c_index]['rate']) > abs( + each_author['fansRate'][index] + ['rate']) * MAGNIFICATION_DECREASE: + insert_event(Event.decrease_1) + print(Event.decrease_1.value + print_data(each_author)) + index += 1 diff --git a/run_analyzer.py b/run_analyzer.py index 7673448..11754a4 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -1,6 +1,7 @@ from biliob_analyzer.author_analyzer import AuthorAnalyzer from biliob_analyzer.video_analyzer import VideoAnalyzer import biliob_analyzer.author_rate_caculate +import biliob_analyzer.author_fans_watcher author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From d2e650138d4e9483bc010d745bab3e6de5c59a07 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 20 Dec 2018 16:38:39 +0800 Subject: [PATCH 086/469] feature: author fans watcher --- biliob_analyzer/author_fans_watcher.py | 135 +++++++++++++++++++++---- run_analyzer.py | 1 + 2 files changed, 119 insertions(+), 17 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index 3068c2f..87ebfae 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -1,39 +1,140 @@ from db import settings from db import db import datetime +from enum import Enum coll = db['author'] # 获得collection的句柄 +monitor = db['monitor'] # 获得collection的句柄 +video = db['video'] # 获得collection的句柄 -MAGNIFICATION_INCREASE = 5 +# 对于上升趋势的UP主,较上日增加多少倍,才算是巨量涨粉 +WTF_INCREASE = 10 + +# 对于上升趋势的UP主,较上日增加多少倍,才算是超量涨粉 +AMAZING_INCREASE = 5 + +# 对于上升趋势的UP主,较上日增加多少倍,才算是大量涨粉 +MAGNIFICATION_INCREASE = 3 + +# 对于下降趋势的UP主,较上日减少多少倍,才算是大量掉粉 MAGNIFICATION_DECREASE = 2 + +# 对于上升趋势的UP主,较上日增加多少倍,才算是超量涨粉 +AMAZING_DECREASE = 5 + +# 对于下降趋势的UP主,较上日减少多少倍,才算是巨量掉粉 +WTF_DECREASE = 8 + +# 粉丝增加多少,才算大量涨粉 FANS_INCREASE_THRESHOLD = 10000 +# 粉丝减少多少,算作大量掉粉 FANS_DECREASE_THRESHOLD = -3000 -for each_author in coll.find(): +# 多少粉丝以上才关注掉粉 +WATCH_DECREASE = 1000 + +class Event(Enum): + increase_1 = 'I级增长' + increase_2 = 'II级猛增' + increase_3 = 'III级激增' + sudden_fall = 'SF级骤减' + decrease_1 = 'I级减少' + decrease_2 = 'II级锐减' + decrease_3 = 'III级暴减' + + +for each_author in coll.find().batch_size(8): if 'fansRate' in each_author and len(each_author['fansRate']) > 1: index = 1 + + def print_data(each_author): + return '{name},速率:{rate},时间:{datetime}'.format( + name=each_author['name'], + rate=each_author['fansRate'][c_index]['rate'], + datetime=each_author['fansRate'][c_index]['datetime']) + + def insert_event(event_type): + videos = video.find({'mid':each_author['mid']}) + temp_video = {} + cause = {} + for each_v in videos: + if abs(each_v['datetime'] - each_author['fansRate'][c_index]['datetime']).days <= 1: + if 'cView' not in temp_video or each_v['cView'] > temp_video['cView']: + temp_video['aid'] = each_v['aid'] + temp_video['title'] = each_v['title'] + temp_video['cView'] = each_v['cView'] + cause = { + 'type': 'video', + 'aid': temp_video['aid'], + 'title': temp_video['title'], + 'cView': temp_video['cView'] + } + + monitor.insert_one({ + 'type': + event_type.value, + 'mid': + each_author['mid'], + 'author': + each_author['name'], + 'rate': + each_author['fansRate'][c_index]['rate'], + 'datetime': + each_author['fansRate'][c_index]['datetime'], + 'cause': cause + }) + while index < len(each_author['fansRate']): # 
涨粉超高 c_index = index - 1 if each_author['fansRate'][c_index][ 'rate'] > FANS_INCREASE_THRESHOLD and each_author[ 'fansRate'][c_index]['rate'] > each_author['fansRate'][ - index]['rate'] * MAGNIFICATION_INCREASE: - print('检测到大量涨粉:{name},速率:{rate},时间:{datetime}'.format( - name=each_author['name'], - rate=each_author['fansRate'][c_index]['rate'], - datetime=each_author['fansRate'][c_index]['datetime'])) + index]['rate'] * WTF_INCREASE: + insert_event(Event.increase_3) + print(Event.increase_3.value + print_data(each_author)) - if each_author['fansRate'][c_index]['rate'] < FANS_DECREASE_THRESHOLD and each_author['fansRate'][index]['rate'] > 1000: - print('检测到突然掉粉:{name},速率:{rate},时间:{datetime}'.format( - name=each_author['name'], - rate=each_author['fansRate'][c_index]['rate'], - datetime=each_author['fansRate'][c_index]['datetime'])) + elif each_author['fansRate'][c_index][ + 'rate'] > FANS_INCREASE_THRESHOLD and each_author[ + 'fansRate'][c_index]['rate'] > each_author['fansRate'][ + index]['rate'] * AMAZING_INCREASE: + insert_event(Event.increase_2) + print(Event.increase_2.value + print_data(each_author)) + elif each_author['fansRate'][c_index][ + 'rate'] > FANS_INCREASE_THRESHOLD and each_author[ + 'fansRate'][c_index]['rate'] > each_author['fansRate'][ + index]['rate'] * MAGNIFICATION_INCREASE: + insert_event(Event.increase_1) + print(Event.increase_1.value + print_data(each_author)) + + # 突然出现大量的掉粉 + if each_author['fansRate'][c_index][ + 'rate'] < FANS_DECREASE_THRESHOLD and each_author[ + 'fansRate'][index]['rate'] > WATCH_DECREASE: + insert_event(Event.sudden_fall) + print(Event.sudden_fall.value + print_data(each_author)) + # 一掉再掉 elif each_author['fansRate'][c_index][ 'rate'] < FANS_DECREASE_THRESHOLD and abs( each_author['fansRate'][c_index]['rate']) > abs( - each_author['fansRate'][index]['rate']) * MAGNIFICATION_DECREASE: - print('检测到大量掉粉:{name},速率:{rate},时间:{datetime}'.format( - name=each_author['name'], - rate=each_author['fansRate'][c_index]['rate'], - datetime=each_author['fansRate'][c_index]['datetime'])) + each_author['fansRate'][index] + ['rate']) * WTF_DECREASE: + insert_event(Event.decrease_3) + print(Event.decrease_3.value + print_data(each_author)) + # 一掉再掉 + elif each_author['fansRate'][c_index][ + 'rate'] < FANS_DECREASE_THRESHOLD and abs( + each_author['fansRate'][c_index]['rate']) > abs( + each_author['fansRate'][index] + ['rate']) * AMAZING_DECREASE: + insert_event(Event.decrease_2) + print(Event.decrease_2.value + print_data(each_author)) + # 一掉再掉 + elif each_author['fansRate'][c_index][ + 'rate'] < FANS_DECREASE_THRESHOLD and abs( + each_author['fansRate'][c_index]['rate']) > abs( + each_author['fansRate'][index] + ['rate']) * MAGNIFICATION_DECREASE: + insert_event(Event.decrease_1) + print(Event.decrease_1.value + print_data(each_author)) + index += 1 diff --git a/run_analyzer.py b/run_analyzer.py index 7673448..11754a4 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -1,6 +1,7 @@ from biliob_analyzer.author_analyzer import AuthorAnalyzer from biliob_analyzer.video_analyzer import VideoAnalyzer import biliob_analyzer.author_rate_caculate +import biliob_analyzer.author_fans_watcher author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 0e0ce790c4c73747dec9782aa2739394e43c1c55 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 20 Dec 2018 19:22:55 +0800 Subject: [PATCH 087/469] feature: incremental computation --- biliob_analyzer/author_fans_watcher.py | 32 +++++++++++++++----------- 1 file changed, 18 insertions(+), 14 deletions(-) diff 
--git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index 87ebfae..82194d6 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -40,7 +40,10 @@ class Event(Enum): decrease_2 = 'II级锐减' decrease_3 = 'III级暴减' - +last_datetime = datetime.datetime(2000,1,1) +if monitor.count() != 0: + last_datetime = next(monitor.find().sort([('datetime',-1)]).limit(1))['datetime'] + for each_author in coll.find().batch_size(8): if 'fansRate' in each_author and len(each_author['fansRate']) > 1: index = 1 @@ -54,20 +57,18 @@ def print_data(each_author): def insert_event(event_type): videos = video.find({'mid':each_author['mid']}) temp_video = {} - cause = {} + cause = {'type':'video'} for each_v in videos: - if abs(each_v['datetime'] - each_author['fansRate'][c_index]['datetime']).days <= 1: - if 'cView' not in temp_video or each_v['cView'] > temp_video['cView']: - temp_video['aid'] = each_v['aid'] - temp_video['title'] = each_v['title'] - temp_video['cView'] = each_v['cView'] - cause = { - 'type': 'video', - 'aid': temp_video['aid'], - 'title': temp_video['title'], - 'cView': temp_video['cView'] - } - + # 相差一日之内 + if (each_author['fansRate'][c_index]['datetime'] - each_v['datetime']).days <= 1: + temp_video['aid'] = each_v['aid'] + temp_video['title'] = each_v['title'] + temp_video['cView'] = each_v['data'][0]['view'] + if 'cView' not in temp_video or 'aid' not in cause or temp_video['cView'] > cause['cView']: + cause['aid'] = temp_video['aid'] + cause['title'] = temp_video['title'] + cause['cView'] = temp_video['cView'] + monitor.insert_one({ 'type': event_type.value, @@ -83,6 +84,9 @@ def insert_event(event_type): }) while index < len(each_author['fansRate']): + c_datetime = each_author['fansRate'][index]['datetime'] + if c_datetime <= last_datetime: + break # 涨粉超高 c_index = index - 1 if each_author['fansRate'][c_index][ From 4961b63a1c7e8d3a810f97aea0911ca6411e294c Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 20 Dec 2018 19:22:55 +0800 Subject: [PATCH 088/469] feature: incremental computation --- biliob_analyzer/author_fans_watcher.py | 32 +++++++++++++++----------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index 87ebfae..82194d6 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -40,7 +40,10 @@ class Event(Enum): decrease_2 = 'II级锐减' decrease_3 = 'III级暴减' - +last_datetime = datetime.datetime(2000,1,1) +if monitor.count() != 0: + last_datetime = next(monitor.find().sort([('datetime',-1)]).limit(1))['datetime'] + for each_author in coll.find().batch_size(8): if 'fansRate' in each_author and len(each_author['fansRate']) > 1: index = 1 @@ -54,20 +57,18 @@ def print_data(each_author): def insert_event(event_type): videos = video.find({'mid':each_author['mid']}) temp_video = {} - cause = {} + cause = {'type':'video'} for each_v in videos: - if abs(each_v['datetime'] - each_author['fansRate'][c_index]['datetime']).days <= 1: - if 'cView' not in temp_video or each_v['cView'] > temp_video['cView']: - temp_video['aid'] = each_v['aid'] - temp_video['title'] = each_v['title'] - temp_video['cView'] = each_v['cView'] - cause = { - 'type': 'video', - 'aid': temp_video['aid'], - 'title': temp_video['title'], - 'cView': temp_video['cView'] - } - + # 相差一日之内 + if (each_author['fansRate'][c_index]['datetime'] - each_v['datetime']).days <= 1: + temp_video['aid'] = each_v['aid'] + 
temp_video['title'] = each_v['title'] + temp_video['cView'] = each_v['data'][0]['view'] + if 'cView' not in temp_video or 'aid' not in cause or temp_video['cView'] > cause['cView']: + cause['aid'] = temp_video['aid'] + cause['title'] = temp_video['title'] + cause['cView'] = temp_video['cView'] + monitor.insert_one({ 'type': event_type.value, @@ -83,6 +84,9 @@ def insert_event(event_type): }) while index < len(each_author['fansRate']): + c_datetime = each_author['fansRate'][index]['datetime'] + if c_datetime <= last_datetime: + break # 涨粉超高 c_index = index - 1 if each_author['fansRate'][c_index][ From f63efdd695528343ea853bfe3bc294227f20d9f6 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 20 Dec 2018 22:01:00 +0800 Subject: [PATCH 089/469] feature: rename collection name ,monitor => event --- biliob_analyzer/author_fans_watcher.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index 82194d6..de809f2 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -3,7 +3,7 @@ import datetime from enum import Enum coll = db['author'] # 获得collection的句柄 -monitor = db['monitor'] # 获得collection的句柄 +event = db['event'] # 获得collection的句柄 video = db['video'] # 获得collection的句柄 # 对于上升趋势的UP主,较上日增加多少倍,才算是巨量涨粉 @@ -41,8 +41,8 @@ class Event(Enum): decrease_3 = 'III级暴减' last_datetime = datetime.datetime(2000,1,1) -if monitor.count() != 0: - last_datetime = next(monitor.find().sort([('datetime',-1)]).limit(1))['datetime'] +if event.count() != 0: + last_datetime = next(event.find().sort([('datetime',-1)]).limit(1))['datetime'] for each_author in coll.find().batch_size(8): if 'fansRate' in each_author and len(each_author['fansRate']) > 1: @@ -69,7 +69,7 @@ def insert_event(event_type): cause['title'] = temp_video['title'] cause['cView'] = temp_video['cView'] - monitor.insert_one({ + event.insert_one({ 'type': event_type.value, 'mid': From bc7f8de8c376493d382dd7eaec5384f25dd220d6 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 20 Dec 2018 22:01:00 +0800 Subject: [PATCH 090/469] feature: rename collection name ,monitor => event --- biliob_analyzer/author_fans_watcher.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index 82194d6..de809f2 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -3,7 +3,7 @@ import datetime from enum import Enum coll = db['author'] # 获得collection的句柄 -monitor = db['monitor'] # 获得collection的句柄 +event = db['event'] # 获得collection的句柄 video = db['video'] # 获得collection的句柄 # 对于上升趋势的UP主,较上日增加多少倍,才算是巨量涨粉 @@ -41,8 +41,8 @@ class Event(Enum): decrease_3 = 'III级暴减' last_datetime = datetime.datetime(2000,1,1) -if monitor.count() != 0: - last_datetime = next(monitor.find().sort([('datetime',-1)]).limit(1))['datetime'] +if event.count() != 0: + last_datetime = next(event.find().sort([('datetime',-1)]).limit(1))['datetime'] for each_author in coll.find().batch_size(8): if 'fansRate' in each_author and len(each_author['fansRate']) > 1: @@ -69,7 +69,7 @@ def insert_event(event_type): cause['title'] = temp_video['title'] cause['cView'] = temp_video['cView'] - monitor.insert_one({ + event.insert_one({ 'type': event_type.value, 'mid': From 52e26d9490bd56c65dd05f590ebf05e682f935ae Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 21 Dec 2018 14:33:08 +0800 Subject: [PATCH 091/469] hotfix: author rate 
caculate --- biliob_analyzer/author_rate_caculate.py | 36 ++++++++++++++----------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 607b068..2fa0a26 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -60,14 +60,7 @@ def next_p(i): if (c_date - p_date).days == 1: delta_fans = c_fans - p_fans seconds = days + (c_datetime.second - p_datetime.second) - coll.update_one({ - 'mid': each_author['mid'] - }, {'$push': { - 'fansRate': { - '$each': [{'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}], - '$position': 0 - } - }}, True) + rate.append({'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}) i += 1 c_fans, c_datetime, c_date = next_c(i) p_fans, p_datetime, p_date = next_p(i) @@ -82,14 +75,25 @@ def next_p(i): t_date = c_date - datetime.timedelta(1) t_fans = c_fans - t_rate delta_fans = c_fans - t_fans - coll.update_one({ - 'mid': each_author['mid'] - }, {'$push': { - 'fansRate': { - '$each': [{'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}], - '$position': 0 - } - }}, True) + rate.append({'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}) c_fans = t_fans c_date = t_date days -= 1 + coll.update_one({ + 'mid': each_author['mid'] + }, { + '$push': { + 'fansRate': { + '$each': rate, + '$position': 0 + } + } + }, True) + coll.update_one({ + 'mid': each_author['mid'] + }, { + '$set': { + 'cRate': each_author['fansRate'][0]['rate'] + } + }, True) + pass From d219e8381864abeb219e07b265372ad03f51b9cb Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 21 Dec 2018 14:33:08 +0800 Subject: [PATCH 092/469] hotfix: author rate caculate --- biliob_analyzer/author_rate_caculate.py | 36 ++++++++++++++----------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 607b068..2fa0a26 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -60,14 +60,7 @@ def next_p(i): if (c_date - p_date).days == 1: delta_fans = c_fans - p_fans seconds = days + (c_datetime.second - p_datetime.second) - coll.update_one({ - 'mid': each_author['mid'] - }, {'$push': { - 'fansRate': { - '$each': [{'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}], - '$position': 0 - } - }}, True) + rate.append({'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}) i += 1 c_fans, c_datetime, c_date = next_c(i) p_fans, p_datetime, p_date = next_p(i) @@ -82,14 +75,25 @@ def next_p(i): t_date = c_date - datetime.timedelta(1) t_fans = c_fans - t_rate delta_fans = c_fans - t_fans - coll.update_one({ - 'mid': each_author['mid'] - }, {'$push': { - 'fansRate': { - '$each': [{'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}], - '$position': 0 - } - }}, True) + rate.append({'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}) c_fans = t_fans c_date = t_date days -= 1 + coll.update_one({ + 'mid': each_author['mid'] + }, { + '$push': { + 'fansRate': { + '$each': rate, + '$position': 0 + } + } + }, True) + coll.update_one({ + 'mid': each_author['mid'] + }, { + '$set': { + 'cRate': each_author['fansRate'][0]['rate'] + } + }, True) + pass From 7fb620159dc4e6dc3390192793d532aee4585de1 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 21 Dec 2018 14:46:11 +0800 Subject: [PATCH 093/469] hotfix: author rate caculate --- 
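Note: this commit (a repeat of PATCH 093 above) stops updating MongoDB inside the loop and instead collects the per-day values in `rate`, writes them with a single `$push` using `$each` and `$position: 0` so the newest entry stays at `fansRate[0]`, and then mirrors that newest value into `cRate`. A minimal, self-contained pymongo sketch of that update pattern is below; the `mid` value and the rate entry are made-up examples, and `db` is the handle exposed by the project's own db.py helper.

import datetime
from db import db  # project helper exposing the MongoDB database handle

coll = db['author']

# Hypothetical freshly computed entries, newest first (values are examples only)
new_rates = [{'rate': 1500, 'datetime': datetime.datetime(2018, 12, 20)}]

# Prepend so index 0 remains the most recent rate
coll.update_one(
    {'mid': 12345},  # example mid
    {'$push': {'fansRate': {'$each': new_rates, '$position': 0}}},
    upsert=True)

# Keep cRate in sync with the newest rate
# (the patch itself reads this back from the stored fansRate array)
coll.update_one(
    {'mid': 12345},
    {'$set': {'cRate': new_rates[0]['rate']}},
    upsert=True)
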
biliob_analyzer/author_rate_caculate.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 2fa0a26..3548f22 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -89,11 +89,12 @@ def next_p(i): } } }, True) - coll.update_one({ - 'mid': each_author['mid'] - }, { - '$set': { - 'cRate': each_author['fansRate'][0]['rate'] - } - }, True) + if len(each_author['fansRate'] != 0): + coll.update_one({ + 'mid': each_author['mid'] + }, { + '$set': { + 'cRate': each_author['fansRate'][0]['rate'] + } + }, True) pass From 334ce5f10441b80c1e22003723e4b4f9d8f54e1a Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 21 Dec 2018 14:46:11 +0800 Subject: [PATCH 094/469] hotfix: author rate caculate --- biliob_analyzer/author_rate_caculate.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 2fa0a26..3548f22 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -89,11 +89,12 @@ def next_p(i): } } }, True) - coll.update_one({ - 'mid': each_author['mid'] - }, { - '$set': { - 'cRate': each_author['fansRate'][0]['rate'] - } - }, True) + if len(each_author['fansRate'] != 0): + coll.update_one({ + 'mid': each_author['mid'] + }, { + '$set': { + 'cRate': each_author['fansRate'][0]['rate'] + } + }, True) pass From 1b2947a9ad1e2b82f93fd72b6ae15200a7ec39bc Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 21 Dec 2018 14:47:38 +0800 Subject: [PATCH 095/469] hotfix: author rate caculate --- biliob_analyzer/author_rate_caculate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 3548f22..d765068 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -89,7 +89,7 @@ def next_p(i): } } }, True) - if len(each_author['fansRate'] != 0): + if len(each_author['fansRate']) != 0: coll.update_one({ 'mid': each_author['mid'] }, { From 61dda2ab5b8369545bb1278eb73973adbeecec3c Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 21 Dec 2018 14:47:38 +0800 Subject: [PATCH 096/469] hotfix: author rate caculate --- biliob_analyzer/author_rate_caculate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 3548f22..d765068 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -89,7 +89,7 @@ def next_p(i): } } }, True) - if len(each_author['fansRate'] != 0): + if len(each_author['fansRate']) != 0: coll.update_one({ 'mid': each_author['mid'] }, { From ead2e4c5acd50d183a71cb27249e50ae77b10d19 Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 21 Dec 2018 20:47:32 +0800 Subject: [PATCH 097/469] fix bug --- biliob_analyzer/author_rate_caculate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index d765068..50d80de 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -8,7 +8,7 @@ # 数据量小于等于2条 if ('data' not in each_author or len(each_author['data']) < (i + 2)): continue - if ('fansRate' in each_author): + if ('fansRate' in each_author and len(each_author['fansRate']) >= 1): 
lastest_date = each_author['fansRate'][0]['datetime'] def getDate(date): @@ -89,7 +89,7 @@ def next_p(i): } } }, True) - if len(each_author['fansRate']) != 0: + if 'fansRate' in each_author and len(each_author['fansRate']) != 0: coll.update_one({ 'mid': each_author['mid'] }, { From fd378e687980a3fd36c0c506cc1166f886d2b2e6 Mon Sep 17 00:00:00 2001 From: jannchie Date: Fri, 21 Dec 2018 20:47:32 +0800 Subject: [PATCH 098/469] fix bug --- biliob_analyzer/author_rate_caculate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index d765068..50d80de 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -8,7 +8,7 @@ # 数据量小于等于2条 if ('data' not in each_author or len(each_author['data']) < (i + 2)): continue - if ('fansRate' in each_author): + if ('fansRate' in each_author and len(each_author['fansRate']) >= 1): lastest_date = each_author['fansRate'][0]['datetime'] def getDate(date): @@ -89,7 +89,7 @@ def next_p(i): } } }, True) - if len(each_author['fansRate']) != 0: + if 'fansRate' in each_author and len(each_author['fansRate']) != 0: coll.update_one({ 'mid': each_author['mid'] }, { From 2528e8e86caa92c56a83269d0a43a40c076f49c2 Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 24 Dec 2018 14:06:43 +0800 Subject: [PATCH 099/469] fix pug --- run_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_analyzer.py b/run_analyzer.py index 11754a4..1d56976 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -7,4 +7,4 @@ video_analyzer = VideoAnalyzer() author_analyzer.author_filter() -video_analyzer.video_filter() \ No newline at end of file +video_analyzer.video_filter() From f19910c44c3c8a7668bb20cfad2fcceb46cf2d5b Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 24 Dec 2018 14:06:43 +0800 Subject: [PATCH 100/469] fix pug --- run_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_analyzer.py b/run_analyzer.py index 11754a4..1d56976 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -7,4 +7,4 @@ video_analyzer = VideoAnalyzer() author_analyzer.author_filter() -video_analyzer.video_filter() \ No newline at end of file +video_analyzer.video_filter() From f5c60c15247c6f207a1665a06ac01b3f42f6022d Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 24 Dec 2018 14:23:09 +0800 Subject: [PATCH 101/469] fix bug --- biliob_analyzer/coin.py | 29 ++++++++++++++++++++ biliob_spider/spiders/dynamic.py | 45 ++++++++++++++++++++++++++++++++ run.py | 20 +------------- 3 files changed, 75 insertions(+), 19 deletions(-) create mode 100644 biliob_analyzer/coin.py create mode 100644 biliob_spider/spiders/dynamic.py diff --git a/biliob_analyzer/coin.py b/biliob_analyzer/coin.py new file mode 100644 index 0000000..e8d7027 --- /dev/null +++ b/biliob_analyzer/coin.py @@ -0,0 +1,29 @@ +from db import settings +from db import db +import datetime +coll = db['video'] # 获得collection的句柄 +start_date = datetime.datetime(2018,11,22) +end_date = datetime.datetime(2018,12,22) +value = 'view' +d = {} +for each in coll.find(): + author_name = each['author'] + d[author_name] = [] + each['data'].reverse() + s_value = None + s_date = None + for each_data in each['data']: + if each_data['datetime'] < start_date: + continue + if each_data['datetime'] > end_date: + continue + if s_value == None: + s_value = each_data[value] + s_date = each_data['datetime'] + d[author_name] = [{'value':0,'date':s_date.date()}] + continue + c_value = 
each_data[value] - s_value + c_date = each_data['datetime'] + d[author_name].append({'value':c_value,'date':c_date.date()}) + pass + pass \ No newline at end of file diff --git a/biliob_spider/spiders/dynamic.py b/biliob_spider/spiders/dynamic.py new file mode 100644 index 0000000..058481e --- /dev/null +++ b/biliob_spider/spiders/dynamic.py @@ -0,0 +1,45 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import SiteItem +import time +import json +import logging +from pymongo import MongoClient +import datetime + + +class DynamicSpider(scrapy.spiders.Spider): + name = "dynamic" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + } + } + + def start_requests(self): + yield Request( + "https://api.vc.bilibili.com/dynamic_svr/v1/dynamic_svr/space_history?host_uid=221648", + method='GET', + callback=self.parse) + + def parse(self, response): + try: + j = json.loads(response.body) + cards = j['data']['cards'] + for each_card in cards: + + print('点赞数:{}'.format(each_card['desc']['like'])) + print('UP主ID:{}'.format(each_card['desc']['uid'])) + card = json.loads(each_card['card']) + if('title' in card): + print('标题:{}'.format(card['title'])) + if('description' in card): + print('内容:{}'.format(card['description'])) + + except Exception as error: + # 出现错误时打印错误日志 + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) diff --git a/run.py b/run.py index a53ba34..00e0700 100644 --- a/run.py +++ b/run.py @@ -7,24 +7,6 @@ import logging import threading - -# 第一步,创建一个logger -logger = logging.getLogger() -logger.setLevel(logging.INFO) # Log等级总开关 -# 第二步,创建一个handler,用于写入日志文件 -rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) -log_path = './' -log_name = log_path + rq + '.log' -logfile = log_name -fh = logging.FileHandler(logfile, mode='w') -fh.setLevel(logging.DEBUG) # 输出到file的log等级的开关 -# 第三步,定义handler的输出格式 -formatter = logging.Formatter( - "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s") -fh.setFormatter(formatter) -# 第四步,将logger添加到handler里面 -logger.addHandler(fh) - def site(): Popen(["scrapy","crawl","site"]) @@ -60,7 +42,7 @@ def run_threaded(job_func): job_thread = threading.Thread(target=job_func) job_thread.start() -schedule.every().day.at('12:00').do(run_threaded,data_analyze) +schedule.every().day.at('14:25').do(run_threaded,data_analyze) schedule.every().day.at('01:00').do(run_threaded,update_author) schedule.every().day.at('07:00').do(run_threaded,video_spider) schedule.every().day.at('14:00').do(run_threaded,auto_add_author) From 007c7aef673378f8cae52c57e3c5304cdef0806f Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 24 Dec 2018 14:23:09 +0800 Subject: [PATCH 102/469] fix bug --- biliob_analyzer/coin.py | 29 ++++++++++++++++++++ biliob_spider/spiders/dynamic.py | 45 ++++++++++++++++++++++++++++++++ run.py | 20 +------------- 3 files changed, 75 insertions(+), 19 deletions(-) create mode 100644 biliob_analyzer/coin.py create mode 100644 biliob_spider/spiders/dynamic.py diff --git a/biliob_analyzer/coin.py b/biliob_analyzer/coin.py new file mode 100644 index 0000000..e8d7027 --- /dev/null +++ b/biliob_analyzer/coin.py @@ -0,0 +1,29 @@ +from db import settings +from db import db +import datetime +coll = db['video'] # 获得collection的句柄 +start_date = datetime.datetime(2018,11,22) +end_date = datetime.datetime(2018,12,22) +value = 'view' +d = {} +for each in coll.find(): + author_name = each['author'] + d[author_name] = [] + each['data'].reverse() 
+ s_value = None + s_date = None + for each_data in each['data']: + if each_data['datetime'] < start_date: + continue + if each_data['datetime'] > end_date: + continue + if s_value == None: + s_value = each_data[value] + s_date = each_data['datetime'] + d[author_name] = [{'value':0,'date':s_date.date()}] + continue + c_value = each_data[value] - s_value + c_date = each_data['datetime'] + d[author_name].append({'value':c_value,'date':c_date.date()}) + pass + pass \ No newline at end of file diff --git a/biliob_spider/spiders/dynamic.py b/biliob_spider/spiders/dynamic.py new file mode 100644 index 0000000..058481e --- /dev/null +++ b/biliob_spider/spiders/dynamic.py @@ -0,0 +1,45 @@ +#coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import SiteItem +import time +import json +import logging +from pymongo import MongoClient +import datetime + + +class DynamicSpider(scrapy.spiders.Spider): + name = "dynamic" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + } + } + + def start_requests(self): + yield Request( + "https://api.vc.bilibili.com/dynamic_svr/v1/dynamic_svr/space_history?host_uid=221648", + method='GET', + callback=self.parse) + + def parse(self, response): + try: + j = json.loads(response.body) + cards = j['data']['cards'] + for each_card in cards: + + print('点赞数:{}'.format(each_card['desc']['like'])) + print('UP主ID:{}'.format(each_card['desc']['uid'])) + card = json.loads(each_card['card']) + if('title' in card): + print('标题:{}'.format(card['title'])) + if('description' in card): + print('内容:{}'.format(card['description'])) + + except Exception as error: + # 出现错误时打印错误日志 + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) diff --git a/run.py b/run.py index a53ba34..00e0700 100644 --- a/run.py +++ b/run.py @@ -7,24 +7,6 @@ import logging import threading - -# 第一步,创建一个logger -logger = logging.getLogger() -logger.setLevel(logging.INFO) # Log等级总开关 -# 第二步,创建一个handler,用于写入日志文件 -rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time())) -log_path = './' -log_name = log_path + rq + '.log' -logfile = log_name -fh = logging.FileHandler(logfile, mode='w') -fh.setLevel(logging.DEBUG) # 输出到file的log等级的开关 -# 第三步,定义handler的输出格式 -formatter = logging.Formatter( - "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s") -fh.setFormatter(formatter) -# 第四步,将logger添加到handler里面 -logger.addHandler(fh) - def site(): Popen(["scrapy","crawl","site"]) @@ -60,7 +42,7 @@ def run_threaded(job_func): job_thread = threading.Thread(target=job_func) job_thread.start() -schedule.every().day.at('12:00').do(run_threaded,data_analyze) +schedule.every().day.at('14:25').do(run_threaded,data_analyze) schedule.every().day.at('01:00').do(run_threaded,update_author) schedule.every().day.at('07:00').do(run_threaded,video_spider) schedule.every().day.at('14:00').do(run_threaded,auto_add_author) From 5086a7eec04362f672043603980523bbec270ae2 Mon Sep 17 00:00:00 2001 From: jannchie Date: Tue, 25 Dec 2018 13:53:10 +0800 Subject: [PATCH 103/469] debug --- biliob_analyzer/author_fans_watcher.py | 1 + biliob_analyzer/author_rate_caculate.py | 1 + biliob_spider/settings.py | 2 +- run.py | 5 +++-- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index de809f2..b243bce 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -41,6 +41,7 @@ class Event(Enum): 
decrease_3 = 'III级暴减' last_datetime = datetime.datetime(2000,1,1) +print('开始捕捉事件') if event.count() != 0: last_datetime = next(event.find().sort([('datetime',-1)]).limit(1))['datetime'] diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 50d80de..6123538 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -2,6 +2,7 @@ from db import db import datetime coll = db['author'] # 获得collection的句柄 +print('开始计算粉丝增速') for each_author in coll.find().batch_size(8): rate = [] i = 0 diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 1d0d6d5..bd07c94 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -12,7 +12,7 @@ import random # LOG_FILE = "biliob_spider.log" -# LOG_LEVEL = "INFO" +LOG_LEVEL = "WARNING" BOT_NAME = 'biliob_spider' diff --git a/run.py b/run.py index 00e0700..4b149b3 100644 --- a/run.py +++ b/run.py @@ -32,6 +32,7 @@ def online(): Popen(['scrapy','crawl','online']) def data_analyze(): + print('执行data_analyzer') Popen(['python','run_analyzer.py']) def bili_monthly_rank(): @@ -42,7 +43,7 @@ def run_threaded(job_func): job_thread = threading.Thread(target=job_func) job_thread.start() -schedule.every().day.at('14:25').do(run_threaded,data_analyze) +schedule.every().day.at('11:40').do(run_threaded,data_analyze) schedule.every().day.at('01:00').do(run_threaded,update_author) schedule.every().day.at('07:00').do(run_threaded,video_spider) schedule.every().day.at('14:00').do(run_threaded,auto_add_author) @@ -54,7 +55,7 @@ def run_threaded(job_func): schedule.every().minute.do(run_threaded,online) -logging.info('开始运行计划任务..') +print('开始运行计划任务..') while True: schedule.run_pending() time.sleep(60) From a280473f3ad7535165d1bdcd3565e4855bb30e73 Mon Sep 17 00:00:00 2001 From: jannchie Date: Tue, 25 Dec 2018 13:53:10 +0800 Subject: [PATCH 104/469] debug --- biliob_analyzer/author_fans_watcher.py | 1 + biliob_analyzer/author_rate_caculate.py | 1 + biliob_spider/settings.py | 2 +- run.py | 5 +++-- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index de809f2..b243bce 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -41,6 +41,7 @@ class Event(Enum): decrease_3 = 'III级暴减' last_datetime = datetime.datetime(2000,1,1) +print('开始捕捉事件') if event.count() != 0: last_datetime = next(event.find().sort([('datetime',-1)]).limit(1))['datetime'] diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 50d80de..6123538 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -2,6 +2,7 @@ from db import db import datetime coll = db['author'] # 获得collection的句柄 +print('开始计算粉丝增速') for each_author in coll.find().batch_size(8): rate = [] i = 0 diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index 1d0d6d5..bd07c94 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -12,7 +12,7 @@ import random # LOG_FILE = "biliob_spider.log" -# LOG_LEVEL = "INFO" +LOG_LEVEL = "WARNING" BOT_NAME = 'biliob_spider' diff --git a/run.py b/run.py index 00e0700..4b149b3 100644 --- a/run.py +++ b/run.py @@ -32,6 +32,7 @@ def online(): Popen(['scrapy','crawl','online']) def data_analyze(): + print('执行data_analyzer') Popen(['python','run_analyzer.py']) def bili_monthly_rank(): @@ -42,7 +43,7 @@ def run_threaded(job_func): job_thread = 
threading.Thread(target=job_func) job_thread.start() -schedule.every().day.at('14:25').do(run_threaded,data_analyze) +schedule.every().day.at('11:40').do(run_threaded,data_analyze) schedule.every().day.at('01:00').do(run_threaded,update_author) schedule.every().day.at('07:00').do(run_threaded,video_spider) schedule.every().day.at('14:00').do(run_threaded,auto_add_author) @@ -54,7 +55,7 @@ def run_threaded(job_func): schedule.every().minute.do(run_threaded,online) -logging.info('开始运行计划任务..') +print('开始运行计划任务..') while True: schedule.run_pending() time.sleep(60) From 3564692d2141859c69af591c1b0e06ff76c18c17 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 5 Jan 2019 21:34:01 +0800 Subject: [PATCH 105/469] update channels --- biliob_spider/spiders/video_spider.py | 7 +++++-- biliob_spider/spiders/video_watcher.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index 30a4c55..dda0884 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -44,7 +44,7 @@ '手机游戏': '游戏', '搞笑': '生活', '教程演示': '鬼畜', - '数码': '科技', + '数码': '数码', '日常': '生活', '明星': '娱乐', '星海': '科技', @@ -98,7 +98,10 @@ '生活': '生活', '音乐': '音乐', '纪录片': '纪录片', - '游戏': '游戏' + '手机平板': '数码', + '电脑装机': '数码', + '影音智能': '数码', + '摄影摄像': '数码', } diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index 511597b..51e2939 100644 --- a/biliob_spider/spiders/video_watcher.py +++ b/biliob_spider/spiders/video_watcher.py @@ -19,7 +19,7 @@ class VideoWatch(scrapy.spiders.Spider): 'biliob_spider.pipelines.VideoAddPipeline': 300, 'biliob_spider.pipelines.AuthorChannelPipeline': 301 }, - # 'DOWNLOAD_DELAY': 0.5 + 'DOWNLOAD_DELAY': 0.5 } def __init__(self): From 269742a05b98d32d4bf67cba8788af899ac8871c Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 5 Jan 2019 21:34:01 +0800 Subject: [PATCH 106/469] update channels --- biliob_spider/spiders/video_spider.py | 7 +++++-- biliob_spider/spiders/video_watcher.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index 30a4c55..dda0884 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -44,7 +44,7 @@ '手机游戏': '游戏', '搞笑': '生活', '教程演示': '鬼畜', - '数码': '科技', + '数码': '数码', '日常': '生活', '明星': '娱乐', '星海': '科技', @@ -98,7 +98,10 @@ '生活': '生活', '音乐': '音乐', '纪录片': '纪录片', - '游戏': '游戏' + '手机平板': '数码', + '电脑装机': '数码', + '影音智能': '数码', + '摄影摄像': '数码', } diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index 511597b..51e2939 100644 --- a/biliob_spider/spiders/video_watcher.py +++ b/biliob_spider/spiders/video_watcher.py @@ -19,7 +19,7 @@ class VideoWatch(scrapy.spiders.Spider): 'biliob_spider.pipelines.VideoAddPipeline': 300, 'biliob_spider.pipelines.AuthorChannelPipeline': 301 }, - # 'DOWNLOAD_DELAY': 0.5 + 'DOWNLOAD_DELAY': 0.5 } def __init__(self): From 231a59a685fe6fe1f0d99c971f89f320e4118a15 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 5 Jan 2019 22:46:03 +0800 Subject: [PATCH 107/469] crawl all videos --- biliob_spider/spiders/video_spider.py | 8 + biliob_spider/spiders/video_spider_all.py | 230 ++++++++++++++++++++++ 2 files changed, 238 insertions(+) create mode 100644 biliob_spider/spiders/video_spider_all.py diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index dda0884..131cded 100644 --- 
a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -102,6 +102,14 @@ '电脑装机': '数码', '影音智能': '数码', '摄影摄像': '数码', + '摄影摄像': '数码', + '风尚标': '时尚', + '电音': '音乐', + '音乐综合': '音乐', + 'MV': '音乐', + '音乐现场': '音乐', + '游戏': '游戏', + 'T台': '时尚', } diff --git a/biliob_spider/spiders/video_spider_all.py b/biliob_spider/spiders/video_spider_all.py new file mode 100644 index 0000000..98f58f3 --- /dev/null +++ b/biliob_spider/spiders/video_spider_all.py @@ -0,0 +1,230 @@ +# coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import VideoItem +from datetime import datetime +import time +import json +import logging +from pymongo import MongoClient +from db import settings + +sub_channel_2_channel = { + 'ASMR': '生活', + 'GMV': '游戏', + 'Korea相关': '娱乐', + 'MAD·AMV': '动画', + 'MMD·3D': '动画', + 'Mugen': '游戏', + 'OP/ED/OST': '音乐', + 'VOCALOID·UTAU': '音乐', + '三次元舞蹈': '舞蹈', + '三次元音乐': '音乐', + '人力VOCALOID': '鬼畜', + '人文·历史': '纪录片', + '健身': '时尚', + '其他': '生活', + '其他国家': '电影', + '军事': '纪录片', + '动物圈': '生活', + '华语电影': '电影', + '单机游戏': '游戏', + '原创音乐': '音乐', + '国产剧': '电视剧', + '国产动画': '国创', + '国产原创相关': '国创', + '宅舞': '舞蹈', + '完结动画': '番剧', + '官方延伸': '番剧', + '布袋戏': '国创', + '广告': '广告', + '影视剪辑': '影视', + '影视杂谈': '影视', + '手工': '生活', + '手机游戏': '游戏', + '搞笑': '生活', + '教程演示': '鬼畜', + '数码': '数码', + '日常': '生活', + '明星': '娱乐', + '星海': '科技', + '服饰': '时尚', + '机械': '科技', + '桌游棋牌': '游戏', + '欧美电影': '电影', + '汽车': '科技', + '游戏': '游戏', + '海外剧': '电视剧', + '演奏': '音乐', + '演讲·公开课': '科技', + '特摄': '影视', + '电子竞技': '游戏', + '短片': '影视', + '短片·手书·配音': '动画', + '社会·美食·旅行': '纪录片', + '科学·探索·自然': '纪录片', + '绘画': '生活', + '综合': '动画', + '综艺': '娱乐', + '网络游戏': '游戏', + '美妆': '时尚', + '美食圈': '生活', + '翻唱': '音乐', + '舞蹈教程': '舞蹈', + '资讯': '国创', + '趣味科普人文': '科技', + '运动': '生活', + '连载动画': '番剧', + '野生技术协会': '科技', + '音MAD': '鬼畜', + '音乐选集': '音乐', + '音游': '游戏', + '预告 资讯': '影视', + '预告·资讯': '影视', + '单机联机': '游戏', + '鬼畜调教': '鬼畜', + '演讲• 公开课': '科技', + '国产电影': '电影', + '日本电影': '电影', + '番剧': '番剧', + '国创': '国创', + '鬼畜': '鬼畜', + '电视剧': '电视剧', + '动画': '动画', + '时尚': '时尚', + '娱乐': '娱乐', + '电影': '电影', + '舞蹈': '舞蹈', + '科技': '科技', + '生活': '生活', + '音乐': '音乐', + '纪录片': '纪录片', + '手机平板': '数码', + '电脑装机': '数码', + '影音智能': '数码', + '摄影摄像': '数码', + '摄影摄像': '数码', + '风尚标': '时尚', + '电音': '音乐', + '音乐综合': '音乐', + 'MV': '音乐', + '音乐现场': '音乐', + 'T台': '时尚', +} + + +class VideoSpider(scrapy.spiders.Spider): + name = "videoSpiderAll" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.VideoPipeline': 300, + } + } + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def start_requests(self): + # 只需要aid + c = self.coll.find({}, {'aid': 1}) + + x = 0 + + aid_list = [] + for each_doc in c: + x = x + 1 + aid_list.append(each_doc['aid']) + i = 0 + while aid_list != []: + if i == 0: + aid_str = '' + aid_str += str(aid_list.pop()) + ',' + i = i + 1 + if i == 100 or aid_list == []: + i = 0 + yield Request( + "https://api.bilibili.com/x/article/archives?ids=" + + aid_str.rstrip(',')) + + def parse(self, response): + try: + r = json.loads(response.body) + d = r["data"] + keys = list(d.keys()) + for each_key in keys: + + aid = d[each_key]['stat']['aid'] + author = d[each_key]['owner']['name'] + mid = d[each_key]['owner']['mid'] + view = 
d[each_key]['stat']['view'] + favorite = d[each_key]['stat']['favorite'] + danmaku = d[each_key]['stat']['danmaku'] + coin = d[each_key]['stat']['coin'] + share = d[each_key]['stat']['share'] + like = d[each_key]['stat']['like'] + dislike = d[each_key]['stat']['dislike'] + current_date = datetime.now() + data = { + 'view': view, + 'favorite': favorite, + 'danmaku': danmaku, + 'coin': coin, + 'share': share, + 'like': like, + 'dislike': dislike, + 'datetime': current_date + } + + + subChannel = d[each_key]['tname'] + title = d[each_key]['title'] + date = d[each_key]['pubdate'] + tid = d[each_key]['tid'] + pic = d[each_key]['pic'] + item = VideoItem() + item['current_view'] = view + item['current_favorite'] = favorite + item['current_danmaku'] = danmaku + item['current_coin'] = coin + item['current_share'] = share + item['current_like'] = like + item['current_dislike'] = dislike + item['current_datetime'] = current_date + item['aid'] = aid + item['mid'] = mid + item['pic'] = pic + item['author'] = author + item['data'] = data + item['title'] = title + item['subChannel'] = subChannel + item['datetime'] = date + + if subChannel != '': + item['channel'] = sub_channel_2_channel[subChannel] + elif subChannel == '资讯': + if tid == 51: + item['channel'] == '番剧' + if tid == 170: + item['channel'] == '国创' + if tid == 159: + item['channel'] == '娱乐' + else: + item['channel'] = None + yield item + + except Exception as error: + # 出现错误时打印错误日志 + if r['code'] == -404: + return + logging.error("视频爬虫在解析时发生错误") + logging.error(item) + logging.error(response.url) + logging.error(error) From 9ee3c1c8210a498821b0fbe435d12d79a99a26ef Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 5 Jan 2019 22:46:03 +0800 Subject: [PATCH 108/469] crawl all videos --- biliob_spider/spiders/video_spider.py | 8 + biliob_spider/spiders/video_spider_all.py | 230 ++++++++++++++++++++++ 2 files changed, 238 insertions(+) create mode 100644 biliob_spider/spiders/video_spider_all.py diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index dda0884..131cded 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -102,6 +102,14 @@ '电脑装机': '数码', '影音智能': '数码', '摄影摄像': '数码', + '摄影摄像': '数码', + '风尚标': '时尚', + '电音': '音乐', + '音乐综合': '音乐', + 'MV': '音乐', + '音乐现场': '音乐', + '游戏': '游戏', + 'T台': '时尚', } diff --git a/biliob_spider/spiders/video_spider_all.py b/biliob_spider/spiders/video_spider_all.py new file mode 100644 index 0000000..98f58f3 --- /dev/null +++ b/biliob_spider/spiders/video_spider_all.py @@ -0,0 +1,230 @@ +# coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import VideoItem +from datetime import datetime +import time +import json +import logging +from pymongo import MongoClient +from db import settings + +sub_channel_2_channel = { + 'ASMR': '生活', + 'GMV': '游戏', + 'Korea相关': '娱乐', + 'MAD·AMV': '动画', + 'MMD·3D': '动画', + 'Mugen': '游戏', + 'OP/ED/OST': '音乐', + 'VOCALOID·UTAU': '音乐', + '三次元舞蹈': '舞蹈', + '三次元音乐': '音乐', + '人力VOCALOID': '鬼畜', + '人文·历史': '纪录片', + '健身': '时尚', + '其他': '生活', + '其他国家': '电影', + '军事': '纪录片', + '动物圈': '生活', + '华语电影': '电影', + '单机游戏': '游戏', + '原创音乐': '音乐', + '国产剧': '电视剧', + '国产动画': '国创', + '国产原创相关': '国创', + '宅舞': '舞蹈', + '完结动画': '番剧', + '官方延伸': '番剧', + '布袋戏': '国创', + '广告': '广告', + '影视剪辑': '影视', + '影视杂谈': '影视', + '手工': '生活', + '手机游戏': '游戏', + '搞笑': '生活', + '教程演示': '鬼畜', + '数码': '数码', + '日常': '生活', + '明星': '娱乐', + '星海': '科技', + '服饰': '时尚', + '机械': '科技', + '桌游棋牌': '游戏', + '欧美电影': '电影', + '汽车': '科技', + '游戏': 
'游戏', + '海外剧': '电视剧', + '演奏': '音乐', + '演讲·公开课': '科技', + '特摄': '影视', + '电子竞技': '游戏', + '短片': '影视', + '短片·手书·配音': '动画', + '社会·美食·旅行': '纪录片', + '科学·探索·自然': '纪录片', + '绘画': '生活', + '综合': '动画', + '综艺': '娱乐', + '网络游戏': '游戏', + '美妆': '时尚', + '美食圈': '生活', + '翻唱': '音乐', + '舞蹈教程': '舞蹈', + '资讯': '国创', + '趣味科普人文': '科技', + '运动': '生活', + '连载动画': '番剧', + '野生技术协会': '科技', + '音MAD': '鬼畜', + '音乐选集': '音乐', + '音游': '游戏', + '预告 资讯': '影视', + '预告·资讯': '影视', + '单机联机': '游戏', + '鬼畜调教': '鬼畜', + '演讲• 公开课': '科技', + '国产电影': '电影', + '日本电影': '电影', + '番剧': '番剧', + '国创': '国创', + '鬼畜': '鬼畜', + '电视剧': '电视剧', + '动画': '动画', + '时尚': '时尚', + '娱乐': '娱乐', + '电影': '电影', + '舞蹈': '舞蹈', + '科技': '科技', + '生活': '生活', + '音乐': '音乐', + '纪录片': '纪录片', + '手机平板': '数码', + '电脑装机': '数码', + '影音智能': '数码', + '摄影摄像': '数码', + '摄影摄像': '数码', + '风尚标': '时尚', + '电音': '音乐', + '音乐综合': '音乐', + 'MV': '音乐', + '音乐现场': '音乐', + 'T台': '时尚', +} + + +class VideoSpider(scrapy.spiders.Spider): + name = "videoSpiderAll" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.VideoPipeline': 300, + } + } + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def start_requests(self): + # 只需要aid + c = self.coll.find({}, {'aid': 1}) + + x = 0 + + aid_list = [] + for each_doc in c: + x = x + 1 + aid_list.append(each_doc['aid']) + i = 0 + while aid_list != []: + if i == 0: + aid_str = '' + aid_str += str(aid_list.pop()) + ',' + i = i + 1 + if i == 100 or aid_list == []: + i = 0 + yield Request( + "https://api.bilibili.com/x/article/archives?ids=" + + aid_str.rstrip(',')) + + def parse(self, response): + try: + r = json.loads(response.body) + d = r["data"] + keys = list(d.keys()) + for each_key in keys: + + aid = d[each_key]['stat']['aid'] + author = d[each_key]['owner']['name'] + mid = d[each_key]['owner']['mid'] + view = d[each_key]['stat']['view'] + favorite = d[each_key]['stat']['favorite'] + danmaku = d[each_key]['stat']['danmaku'] + coin = d[each_key]['stat']['coin'] + share = d[each_key]['stat']['share'] + like = d[each_key]['stat']['like'] + dislike = d[each_key]['stat']['dislike'] + current_date = datetime.now() + data = { + 'view': view, + 'favorite': favorite, + 'danmaku': danmaku, + 'coin': coin, + 'share': share, + 'like': like, + 'dislike': dislike, + 'datetime': current_date + } + + + subChannel = d[each_key]['tname'] + title = d[each_key]['title'] + date = d[each_key]['pubdate'] + tid = d[each_key]['tid'] + pic = d[each_key]['pic'] + item = VideoItem() + item['current_view'] = view + item['current_favorite'] = favorite + item['current_danmaku'] = danmaku + item['current_coin'] = coin + item['current_share'] = share + item['current_like'] = like + item['current_dislike'] = dislike + item['current_datetime'] = current_date + item['aid'] = aid + item['mid'] = mid + item['pic'] = pic + item['author'] = author + item['data'] = data + item['title'] = title + item['subChannel'] = subChannel + item['datetime'] = date + + if subChannel != '': + item['channel'] = sub_channel_2_channel[subChannel] + elif subChannel == '资讯': + if tid == 51: + item['channel'] == '番剧' + if tid == 170: + item['channel'] == '国创' + if tid == 159: + item['channel'] == '娱乐' + else: + item['channel'] = None + yield item + + except Exception as error: + # 出现错误时打印错误日志 + if r['code'] == -404: + 
return + logging.error("视频爬虫在解析时发生错误") + logging.error(item) + logging.error(response.url) + logging.error(error) From ae62f36f4aba3cf6fe34996d74bc17c5651cfd9f Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 5 Jan 2019 23:00:46 +0800 Subject: [PATCH 109/469] send email --- biliob_spider/items.py | 1 + biliob_spider/spiders/author_auto_add.py | 3 +++ biliob_spider/spiders/author_update.py | 3 +++ biliob_spider/spiders/bangumi.py | 1 + biliob_spider/spiders/bili_monthly_rank.py | 3 +++ biliob_spider/spiders/donghua.py | 1 + biliob_spider/spiders/dynamic.py | 3 +++ biliob_spider/spiders/online.py | 3 +++ biliob_spider/spiders/site_info.py | 3 +++ biliob_spider/spiders/tag.py | 3 +++ biliob_spider/spiders/video_spider.py | 4 +++- biliob_spider/spiders/video_spider_all.py | 3 +++ biliob_spider/spiders/video_watcher.py | 3 +++ 13 files changed, 33 insertions(+), 1 deletion(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index e85c431..88d9571 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -6,6 +6,7 @@ # https://doc.scrapy.org/en/latest/topics/items.html import scrapy +from scrapy.mail import MailSender class SiteItem(scrapy.Item): region_count = scrapy.Field() diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index af6a451..03fe0a7 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import AuthorItem import time @@ -49,6 +50,8 @@ def parse(self, response): callback=self.detailParse) except Exception as error: # 出现错误时打印错误日志 + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}".format(response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 12f5280..a181359 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import AuthorItem import time @@ -84,6 +85,8 @@ def parse(self, response): callback=self.parse_view) except Exception as error: # 出现错误时打印错误日志 + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/bangumi.py b/biliob_spider/spiders/bangumi.py index d9d155b..b2f6d7e 100644 --- a/biliob_spider/spiders/bangumi.py +++ b/biliob_spider/spiders/bangumi.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import BangumiItem import time diff --git a/biliob_spider/spiders/bili_monthly_rank.py b/biliob_spider/spiders/bili_monthly_rank.py index eb7fcf9..c0d8c15 100644 --- a/biliob_spider/spiders/bili_monthly_rank.py +++ b/biliob_spider/spiders/bili_monthly_rank.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import RankItem import time @@ -56,6 +57,8 @@ def parse(self, response): yield item except Exception as error: # 出现错误时打印错误日志 + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], 
subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/donghua.py b/biliob_spider/spiders/donghua.py index 63bb5af..bcd3449 100644 --- a/biliob_spider/spiders/donghua.py +++ b/biliob_spider/spiders/donghua.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import BangumiItem import time diff --git a/biliob_spider/spiders/dynamic.py b/biliob_spider/spiders/dynamic.py index 058481e..62a99e7 100644 --- a/biliob_spider/spiders/dynamic.py +++ b/biliob_spider/spiders/dynamic.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import SiteItem import time @@ -40,6 +41,8 @@ def parse(self, response): except Exception as error: # 出现错误时打印错误日志 + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}".format(response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/online.py b/biliob_spider/spiders/online.py index ed6e35d..4378a2a 100644 --- a/biliob_spider/spiders/online.py +++ b/biliob_spider/spiders/online.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import VideoOnline import time @@ -44,6 +45,8 @@ def parse(self, response): callback=self.detailParse) except Exception as error: # 出现错误时打印错误日志 + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/site_info.py b/biliob_spider/spiders/site_info.py index 6faafc6..b8fffb5 100644 --- a/biliob_spider/spiders/site_info.py +++ b/biliob_spider/spiders/site_info.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import SiteItem import time @@ -32,6 +33,8 @@ def parse(self, response): except Exception as error: # 出现错误时打印错误日志 + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/tag.py b/biliob_spider/spiders/tag.py index e4f756d..a2236ea 100644 --- a/biliob_spider/spiders/tag.py +++ b/biliob_spider/spiders/tag.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import TagItem import time @@ -37,6 +38,8 @@ def parse(self, response): except Exception as error: # 出现错误时打印错误日志 + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index 131cded..fc25a83 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -1,5 +1,6 @@ # coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import VideoItem from datetime import datetime @@ 
-8,7 +9,6 @@ import logging from pymongo import MongoClient from db import settings - sub_channel_2_channel = { 'ASMR': '生活', 'GMV': '游戏', @@ -224,6 +224,8 @@ def parse(self, response): # 出现错误时打印错误日志 if r['code'] == -404: return + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(item) logging.error(response.url) diff --git a/biliob_spider/spiders/video_spider_all.py b/biliob_spider/spiders/video_spider_all.py index 98f58f3..05f0980 100644 --- a/biliob_spider/spiders/video_spider_all.py +++ b/biliob_spider/spiders/video_spider_all.py @@ -1,5 +1,6 @@ # coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import VideoItem from datetime import datetime @@ -224,6 +225,8 @@ def parse(self, response): # 出现错误时打印错误日志 if r['code'] == -404: return + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(item) logging.error(response.url) diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index 51e2939..3da6202 100644 --- a/biliob_spider/spiders/video_watcher.py +++ b/biliob_spider/spiders/video_watcher.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import VideoWatcherItem import time @@ -59,6 +60,8 @@ def parse(self, response): yield item except Exception as error: # 出现错误时打印错误日志 + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) From 838b9bad8be734ea65bd129dbcac5981c12a3238 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 5 Jan 2019 23:00:46 +0800 Subject: [PATCH 110/469] send email --- biliob_spider/items.py | 1 + biliob_spider/spiders/author_auto_add.py | 3 +++ biliob_spider/spiders/author_update.py | 3 +++ biliob_spider/spiders/bangumi.py | 1 + biliob_spider/spiders/bili_monthly_rank.py | 3 +++ biliob_spider/spiders/donghua.py | 1 + biliob_spider/spiders/dynamic.py | 3 +++ biliob_spider/spiders/online.py | 3 +++ biliob_spider/spiders/site_info.py | 3 +++ biliob_spider/spiders/tag.py | 3 +++ biliob_spider/spiders/video_spider.py | 4 +++- biliob_spider/spiders/video_spider_all.py | 3 +++ biliob_spider/spiders/video_watcher.py | 3 +++ 13 files changed, 33 insertions(+), 1 deletion(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index e85c431..88d9571 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -6,6 +6,7 @@ # https://doc.scrapy.org/en/latest/topics/items.html import scrapy +from scrapy.mail import MailSender class SiteItem(scrapy.Item): region_count = scrapy.Field() diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index af6a451..03fe0a7 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import AuthorItem import time @@ -49,6 +50,8 @@ def parse(self, response): callback=self.detailParse) except Exception as error: # 出现错误时打印错误日志 + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], 
subject="BiliobSpiderError", body="{}{}".format(response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 12f5280..a181359 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import AuthorItem import time @@ -84,6 +85,8 @@ def parse(self, response): callback=self.parse_view) except Exception as error: # 出现错误时打印错误日志 + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/bangumi.py b/biliob_spider/spiders/bangumi.py index d9d155b..b2f6d7e 100644 --- a/biliob_spider/spiders/bangumi.py +++ b/biliob_spider/spiders/bangumi.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import BangumiItem import time diff --git a/biliob_spider/spiders/bili_monthly_rank.py b/biliob_spider/spiders/bili_monthly_rank.py index eb7fcf9..c0d8c15 100644 --- a/biliob_spider/spiders/bili_monthly_rank.py +++ b/biliob_spider/spiders/bili_monthly_rank.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import RankItem import time @@ -56,6 +57,8 @@ def parse(self, response): yield item except Exception as error: # 出现错误时打印错误日志 + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/donghua.py b/biliob_spider/spiders/donghua.py index 63bb5af..bcd3449 100644 --- a/biliob_spider/spiders/donghua.py +++ b/biliob_spider/spiders/donghua.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import BangumiItem import time diff --git a/biliob_spider/spiders/dynamic.py b/biliob_spider/spiders/dynamic.py index 058481e..62a99e7 100644 --- a/biliob_spider/spiders/dynamic.py +++ b/biliob_spider/spiders/dynamic.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import SiteItem import time @@ -40,6 +41,8 @@ def parse(self, response): except Exception as error: # 出现错误时打印错误日志 + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}".format(response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/online.py b/biliob_spider/spiders/online.py index ed6e35d..4378a2a 100644 --- a/biliob_spider/spiders/online.py +++ b/biliob_spider/spiders/online.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import VideoOnline import time @@ -44,6 +45,8 @@ def parse(self, response): callback=self.detailParse) except Exception as error: # 出现错误时打印错误日志 + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) 
logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/site_info.py b/biliob_spider/spiders/site_info.py index 6faafc6..b8fffb5 100644 --- a/biliob_spider/spiders/site_info.py +++ b/biliob_spider/spiders/site_info.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import SiteItem import time @@ -32,6 +33,8 @@ def parse(self, response): except Exception as error: # 出现错误时打印错误日志 + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/tag.py b/biliob_spider/spiders/tag.py index e4f756d..a2236ea 100644 --- a/biliob_spider/spiders/tag.py +++ b/biliob_spider/spiders/tag.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import TagItem import time @@ -37,6 +38,8 @@ def parse(self, response): except Exception as error: # 出现错误时打印错误日志 + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index 131cded..fc25a83 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -1,5 +1,6 @@ # coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import VideoItem from datetime import datetime @@ -8,7 +9,6 @@ import logging from pymongo import MongoClient from db import settings - sub_channel_2_channel = { 'ASMR': '生活', 'GMV': '游戏', @@ -224,6 +224,8 @@ def parse(self, response): # 出现错误时打印错误日志 if r['code'] == -404: return + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(item) logging.error(response.url) diff --git a/biliob_spider/spiders/video_spider_all.py b/biliob_spider/spiders/video_spider_all.py index 98f58f3..05f0980 100644 --- a/biliob_spider/spiders/video_spider_all.py +++ b/biliob_spider/spiders/video_spider_all.py @@ -1,5 +1,6 @@ # coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import VideoItem from datetime import datetime @@ -224,6 +225,8 @@ def parse(self, response): # 出现错误时打印错误日志 if r['code'] == -404: return + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(item) logging.error(response.url) diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index 51e2939..3da6202 100644 --- a/biliob_spider/spiders/video_watcher.py +++ b/biliob_spider/spiders/video_watcher.py @@ -1,5 +1,6 @@ #coding=utf-8 import scrapy +from scrapy.mail import MailSender from scrapy.http import Request from biliob_spider.items import VideoWatcherItem import time @@ -59,6 +60,8 @@ def parse(self, response): yield item except Exception as error: # 出现错误时打印错误日志 + mailer = MailSender() + mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", 
body="{}{}{}".format(item,response.url,error)) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) From 1c477431eea52eef3762695f66842c61dc7fae4e Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 5 Jan 2019 23:06:24 +0800 Subject: [PATCH 111/469] every week crawl all video --- run.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/run.py b/run.py index 4b149b3..ea00ba9 100644 --- a/run.py +++ b/run.py @@ -28,11 +28,13 @@ def video_watcher(): def video_spider(): Popen(["scrapy","crawl","videoSpider"]) +def video_spider_all(): + Popen(["scrapy","crawl","videoSpiderAll"]) + def online(): Popen(['scrapy','crawl','online']) def data_analyze(): - print('执行data_analyzer') Popen(['python','run_analyzer.py']) def bili_monthly_rank(): @@ -51,6 +53,7 @@ def run_threaded(job_func): schedule.every().day.at('16:30').do(run_threaded,donghua) schedule.every().day.at('22:00').do(run_threaded,video_watcher) schedule.every().day.at('21:00').do(run_threaded,bili_monthly_rank) +schedule.every().week.do(run_threaded,video_spider_all) schedule.every().hour.do(run_threaded,site) schedule.every().minute.do(run_threaded,online) From 37e4ea687473498a6cd70bd61fbdf5aa20041898 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 5 Jan 2019 23:06:24 +0800 Subject: [PATCH 112/469] every week crawl all video --- run.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/run.py b/run.py index 4b149b3..ea00ba9 100644 --- a/run.py +++ b/run.py @@ -28,11 +28,13 @@ def video_watcher(): def video_spider(): Popen(["scrapy","crawl","videoSpider"]) +def video_spider_all(): + Popen(["scrapy","crawl","videoSpiderAll"]) + def online(): Popen(['scrapy','crawl','online']) def data_analyze(): - print('执行data_analyzer') Popen(['python','run_analyzer.py']) def bili_monthly_rank(): @@ -51,6 +53,7 @@ def run_threaded(job_func): schedule.every().day.at('16:30').do(run_threaded,donghua) schedule.every().day.at('22:00').do(run_threaded,video_watcher) schedule.every().day.at('21:00').do(run_threaded,bili_monthly_rank) +schedule.every().week.do(run_threaded,video_spider_all) schedule.every().hour.do(run_threaded,site) schedule.every().minute.do(run_threaded,online) From 08dfd4f7706ecd56df84efa0193b4dc6202d230c Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 5 Jan 2019 23:08:00 +0800 Subject: [PATCH 113/469] remove dislike --- biliob_spider/items.py | 1 - biliob_spider/pipelines.py | 1 - biliob_spider/spiders/video_spider.py | 3 --- biliob_spider/spiders/video_spider_all.py | 3 --- 4 files changed, 8 deletions(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 88d9571..1dc2a12 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -46,7 +46,6 @@ class VideoItem(scrapy.Item): current_coin = scrapy.Field() current_share = scrapy.Field() current_like = scrapy.Field() - current_dislike = scrapy.Field() current_datetime = scrapy.Field() class AuthorItem(scrapy.Item): diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 1402f68..14f3bc9 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -31,7 +31,6 @@ def process_item(self, item, spider): 'cCoin':item['current_coin'], 'cShare':item['current_share'] , 'cLike':item['current_like'], - 'cDislike':item['current_dislike'], 'cDatetime':item['current_datetime'], 'author': item['author'], 'subChannel': item['subChannel'], diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index fc25a83..4235b9b 100644 --- 
a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -170,7 +170,6 @@ def parse(self, response): coin = d[each_key]['stat']['coin'] share = d[each_key]['stat']['share'] like = d[each_key]['stat']['like'] - dislike = d[each_key]['stat']['dislike'] current_date = datetime.now() data = { 'view': view, @@ -179,7 +178,6 @@ def parse(self, response): 'coin': coin, 'share': share, 'like': like, - 'dislike': dislike, 'datetime': current_date } @@ -196,7 +194,6 @@ def parse(self, response): item['current_coin'] = coin item['current_share'] = share item['current_like'] = like - item['current_dislike'] = dislike item['current_datetime'] = current_date item['aid'] = aid item['mid'] = mid diff --git a/biliob_spider/spiders/video_spider_all.py b/biliob_spider/spiders/video_spider_all.py index 05f0980..d23f5bb 100644 --- a/biliob_spider/spiders/video_spider_all.py +++ b/biliob_spider/spiders/video_spider_all.py @@ -171,7 +171,6 @@ def parse(self, response): coin = d[each_key]['stat']['coin'] share = d[each_key]['stat']['share'] like = d[each_key]['stat']['like'] - dislike = d[each_key]['stat']['dislike'] current_date = datetime.now() data = { 'view': view, @@ -180,7 +179,6 @@ def parse(self, response): 'coin': coin, 'share': share, 'like': like, - 'dislike': dislike, 'datetime': current_date } @@ -197,7 +195,6 @@ def parse(self, response): item['current_coin'] = coin item['current_share'] = share item['current_like'] = like - item['current_dislike'] = dislike item['current_datetime'] = current_date item['aid'] = aid item['mid'] = mid From a7d6d12793bb2078ed29f5db11e4cde10cbaf0a4 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 5 Jan 2019 23:08:00 +0800 Subject: [PATCH 114/469] remove dislike --- biliob_spider/items.py | 1 - biliob_spider/pipelines.py | 1 - biliob_spider/spiders/video_spider.py | 3 --- biliob_spider/spiders/video_spider_all.py | 3 --- 4 files changed, 8 deletions(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 88d9571..1dc2a12 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -46,7 +46,6 @@ class VideoItem(scrapy.Item): current_coin = scrapy.Field() current_share = scrapy.Field() current_like = scrapy.Field() - current_dislike = scrapy.Field() current_datetime = scrapy.Field() class AuthorItem(scrapy.Item): diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 1402f68..14f3bc9 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -31,7 +31,6 @@ def process_item(self, item, spider): 'cCoin':item['current_coin'], 'cShare':item['current_share'] , 'cLike':item['current_like'], - 'cDislike':item['current_dislike'], 'cDatetime':item['current_datetime'], 'author': item['author'], 'subChannel': item['subChannel'], diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index fc25a83..4235b9b 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -170,7 +170,6 @@ def parse(self, response): coin = d[each_key]['stat']['coin'] share = d[each_key]['stat']['share'] like = d[each_key]['stat']['like'] - dislike = d[each_key]['stat']['dislike'] current_date = datetime.now() data = { 'view': view, @@ -179,7 +178,6 @@ def parse(self, response): 'coin': coin, 'share': share, 'like': like, - 'dislike': dislike, 'datetime': current_date } @@ -196,7 +194,6 @@ def parse(self, response): item['current_coin'] = coin item['current_share'] = share item['current_like'] = like - item['current_dislike'] = dislike 
item['current_datetime'] = current_date item['aid'] = aid item['mid'] = mid diff --git a/biliob_spider/spiders/video_spider_all.py b/biliob_spider/spiders/video_spider_all.py index 05f0980..d23f5bb 100644 --- a/biliob_spider/spiders/video_spider_all.py +++ b/biliob_spider/spiders/video_spider_all.py @@ -171,7 +171,6 @@ def parse(self, response): coin = d[each_key]['stat']['coin'] share = d[each_key]['stat']['share'] like = d[each_key]['stat']['like'] - dislike = d[each_key]['stat']['dislike'] current_date = datetime.now() data = { 'view': view, @@ -180,7 +179,6 @@ def parse(self, response): 'coin': coin, 'share': share, 'like': like, - 'dislike': dislike, 'datetime': current_date } @@ -197,7 +195,6 @@ def parse(self, response): item['current_coin'] = coin item['current_share'] = share item['current_like'] = like - item['current_dislike'] = dislike item['current_datetime'] = current_date item['aid'] = aid item['mid'] = mid From 72d9c157fae24b164af668a804ef13dbc554fdcb Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 9 Jan 2019 16:47:37 +0800 Subject: [PATCH 115/469] feature: do not delete, change the constant --- biliob_analyzer/author_fans_watcher.py | 4 ++-- biliob_analyzer/video_analyzer.py | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index b243bce..bd0d4d5 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -23,9 +23,9 @@ # 对于下降趋势的UP主,较上日减少多少倍,才算是巨量掉粉 WTF_DECREASE = 8 - + # 粉丝增加多少,才算大量涨粉 -FANS_INCREASE_THRESHOLD = 10000 +FANS_INCREASE_THRESHOLD = 8000 # 粉丝减少多少,算作大量掉粉 FANS_DECREASE_THRESHOLD = -3000 # 多少粉丝以上才关注掉粉 diff --git a/biliob_analyzer/video_analyzer.py b/biliob_analyzer/video_analyzer.py index 022ed5a..92d58db 100644 --- a/biliob_analyzer/video_analyzer.py +++ b/biliob_analyzer/video_analyzer.py @@ -41,13 +41,13 @@ def video_filter(self): rate = (c_view-pre_view) pre_view = c_view pre_date = c_date - # 三天内播放增长小于3000则被认定为低质量 - if live_time == 3 and c_view < 3000: - delete = True - focus = False - break - # 大于三天后每日播放增长小于100则停止追踪 - elif live_time > 3 and rate < 100: + # # 三天内播放增长小于3000则被认定为低质量 + # if live_time == 3 and c_view < 3000: + # delete = True + # focus = False + # break + # 大于7天后每日播放增长小于100则停止追踪 + if live_time > 7 and rate < 100: focus = False delete = False break From 976ac8b776de2f003f21cf5e517968fb16d3253b Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 9 Jan 2019 16:47:37 +0800 Subject: [PATCH 116/469] feature: do not delete, change the constant --- biliob_analyzer/author_fans_watcher.py | 4 ++-- biliob_analyzer/video_analyzer.py | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index b243bce..bd0d4d5 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -23,9 +23,9 @@ # 对于下降趋势的UP主,较上日减少多少倍,才算是巨量掉粉 WTF_DECREASE = 8 - + # 粉丝增加多少,才算大量涨粉 -FANS_INCREASE_THRESHOLD = 10000 +FANS_INCREASE_THRESHOLD = 8000 # 粉丝减少多少,算作大量掉粉 FANS_DECREASE_THRESHOLD = -3000 # 多少粉丝以上才关注掉粉 diff --git a/biliob_analyzer/video_analyzer.py b/biliob_analyzer/video_analyzer.py index 022ed5a..92d58db 100644 --- a/biliob_analyzer/video_analyzer.py +++ b/biliob_analyzer/video_analyzer.py @@ -41,13 +41,13 @@ def video_filter(self): rate = (c_view-pre_view) pre_view = c_view pre_date = c_date - # 三天内播放增长小于3000则被认定为低质量 - if live_time == 3 and c_view < 3000: - delete = True - focus = False - 
break - # 大于三天后每日播放增长小于100则停止追踪 - elif live_time > 3 and rate < 100: + # # 三天内播放增长小于3000则被认定为低质量 + # if live_time == 3 and c_view < 3000: + # delete = True + # focus = False + # break + # 大于7天后每日播放增长小于100则停止追踪 + if live_time > 7 and rate < 100: focus = False delete = False break From 635a553016607fbbb8f755c49eeaf404cee21bcd Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 12 Jan 2019 14:24:03 +0800 Subject: [PATCH 117/469] hotfix: current fans change rate. --- biliob_analyzer/author_fans_watcher.py | 30 ++++++++------ biliob_analyzer/author_rate_caculate.py | 53 +++++++++++++++---------- 2 files changed, 50 insertions(+), 33 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index bd0d4d5..6adcdf4 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -23,7 +23,7 @@ # 对于下降趋势的UP主,较上日减少多少倍,才算是巨量掉粉 WTF_DECREASE = 8 - + # 粉丝增加多少,才算大量涨粉 FANS_INCREASE_THRESHOLD = 8000 # 粉丝减少多少,算作大量掉粉 @@ -31,6 +31,7 @@ # 多少粉丝以上才关注掉粉 WATCH_DECREASE = 1000 + class Event(Enum): increase_1 = 'I级增长' increase_2 = 'II级猛增' @@ -40,11 +41,13 @@ class Event(Enum): decrease_2 = 'II级锐减' decrease_3 = 'III级暴减' -last_datetime = datetime.datetime(2000,1,1) + +last_datetime = datetime.datetime(2000, 1, 1) print('开始捕捉事件') if event.count() != 0: - last_datetime = next(event.find().sort([('datetime',-1)]).limit(1))['datetime'] - + last_datetime = next(event.find().sort([('datetime', + -1)]).limit(1))['datetime'] + for each_author in coll.find().batch_size(8): if 'fansRate' in each_author and len(each_author['fansRate']) > 1: index = 1 @@ -56,16 +59,18 @@ def print_data(each_author): datetime=each_author['fansRate'][c_index]['datetime']) def insert_event(event_type): - videos = video.find({'mid':each_author['mid']}) + videos = video.find({'mid': each_author['mid']}) temp_video = {} - cause = {'type':'video'} + cause = {'type': 'video'} for each_v in videos: # 相差一日之内 - if (each_author['fansRate'][c_index]['datetime'] - each_v['datetime']).days <= 1: + if (each_author['fansRate'][c_index]['datetime'] - + each_v['datetime']).days <= 1: temp_video['aid'] = each_v['aid'] temp_video['title'] = each_v['title'] temp_video['cView'] = each_v['data'][0]['view'] - if 'cView' not in temp_video or 'aid' not in cause or temp_video['cView'] > cause['cView']: + if 'cView' not in temp_video or 'aid' not in cause or temp_video[ + 'cView'] > cause['cView']: cause['aid'] = temp_video['aid'] cause['title'] = temp_video['title'] cause['cView'] = temp_video['cView'] @@ -81,7 +86,8 @@ def insert_event(event_type): each_author['fansRate'][c_index]['rate'], 'datetime': each_author['fansRate'][c_index]['datetime'], - 'cause': cause + 'cause': + cause }) while index < len(each_author['fansRate']): @@ -120,9 +126,9 @@ def insert_event(event_type): # 一掉再掉 elif each_author['fansRate'][c_index][ 'rate'] < FANS_DECREASE_THRESHOLD and abs( - each_author['fansRate'][c_index]['rate']) > abs( - each_author['fansRate'][index] - ['rate']) * WTF_DECREASE: + each_author['fansRate'][c_index]['rate'] + ) > abs( + each_author['fansRate'][index]['rate']) * WTF_DECREASE: insert_event(Event.decrease_3) print(Event.decrease_3.value + print_data(each_author)) # 一掉再掉 diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 6123538..2cb7acb 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -13,15 +13,21 @@ lastest_date = each_author['fansRate'][0]['datetime'] def getDate(date): - return date 
- datetime.timedelta(hours=date.hour, seconds=date.second,microseconds=date.microsecond,minutes=date.minute) + return date - datetime.timedelta( + hours=date.hour, + seconds=date.second, + microseconds=date.microsecond, + minutes=date.minute) def next_c(i): return each_author['data'][i]['fans'], each_author['data'][i][ 'datetime'], each_author['data'][i][ 'datetime'] - datetime.timedelta( - hours=each_author['data'][i][ - 'datetime'].hour, seconds=each_author['data'][i][ - 'datetime'].second,microseconds=each_author['data'][i]['datetime'].microsecond,minutes=each_author['data'][i]['datetime'].minute) + hours=each_author['data'][i]['datetime'].hour, + seconds=each_author['data'][i]['datetime'].second, + microseconds=each_author['data'][i]['datetime']. + microsecond, + minutes=each_author['data'][i]['datetime'].minute) c_fans, c_datetime, c_date = next_c(i) @@ -37,7 +43,6 @@ def next_p(i): p_fans, p_datetime, p_date = next_p(i) - # 相差粉丝数 delta_fans = c_fans - p_fans # 相差日期数 @@ -61,7 +66,12 @@ def next_p(i): if (c_date - p_date).days == 1: delta_fans = c_fans - p_fans seconds = days + (c_datetime.second - p_datetime.second) - rate.append({'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}) + rate.append({ + 'rate': + int(delta_fans / (1 + seconds / (60 * 60 * 24))), + 'datetime': + c_date + }) i += 1 c_fans, c_datetime, c_date = next_c(i) p_fans, p_datetime, p_date = next_p(i) @@ -72,30 +82,31 @@ def next_p(i): # 相差多天 days = (c_date - p_date).days while days > 1: - t_rate = delta_fans/(days + seconds/(60*60*24)) + t_rate = delta_fans / (days + seconds / (60 * 60 * 24)) t_date = c_date - datetime.timedelta(1) t_fans = c_fans - t_rate delta_fans = c_fans - t_fans - rate.append({'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}) + rate.append({ + 'rate': + int(delta_fans / (1 + seconds / (60 * 60 * 24))), + 'datetime': + c_date + }) c_fans = t_fans c_date = t_date days -= 1 coll.update_one({ 'mid': each_author['mid'] - }, { - '$push': { - 'fansRate': { - '$each': rate, - '$position': 0 - } + }, {'$push': { + 'fansRate': { + '$each': rate, + '$position': 0 } - }, True) - if 'fansRate' in each_author and len(each_author['fansRate']) != 0: + }}, True) + if len(rate) != 0: coll.update_one({ 'mid': each_author['mid'] - }, { - '$set': { - 'cRate': each_author['fansRate'][0]['rate'] - } - }, True) + }, {'$set': { + 'cRate': rate[0]['rate'] + }}, True) pass From 918686cf176189470f3af1f67e7fa585d3faccef Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 12 Jan 2019 14:24:03 +0800 Subject: [PATCH 118/469] hotfix: current fans change rate. 
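
The functional change sits in the final hunk of author_rate_caculate.py: cRate is now written from the rate list computed in this pass (rate[0]['rate'], guarded by "if len(rate) != 0") instead of each_author['fansRate'][0]['rate'], which comes from the document fetched before the new points were pushed and therefore lags one update behind. Most of the remaining hunks are whitespace re-wrapping. Side by side, keeping the guards as they appear in the patch (the snippet is illustrative, not an extra hunk):

    # before: stale, because each_author was read prior to this run's $push,
    # so fansRate[0] still holds the previous run's newest rate
    if 'fansRate' in each_author and len(each_author['fansRate']) != 0:
        coll.update_one({'mid': each_author['mid']},
                        {'$set': {'cRate': each_author['fansRate'][0]['rate']}}, True)

    # after: take the freshest rate computed in this pass
    if len(rate) != 0:
        coll.update_one({'mid': each_author['mid']},
                        {'$set': {'cRate': rate[0]['rate']}}, True)
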
--- biliob_analyzer/author_fans_watcher.py | 30 ++++++++------ biliob_analyzer/author_rate_caculate.py | 53 +++++++++++++++---------- 2 files changed, 50 insertions(+), 33 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index bd0d4d5..6adcdf4 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -23,7 +23,7 @@ # 对于下降趋势的UP主,较上日减少多少倍,才算是巨量掉粉 WTF_DECREASE = 8 - + # 粉丝增加多少,才算大量涨粉 FANS_INCREASE_THRESHOLD = 8000 # 粉丝减少多少,算作大量掉粉 @@ -31,6 +31,7 @@ # 多少粉丝以上才关注掉粉 WATCH_DECREASE = 1000 + class Event(Enum): increase_1 = 'I级增长' increase_2 = 'II级猛增' @@ -40,11 +41,13 @@ class Event(Enum): decrease_2 = 'II级锐减' decrease_3 = 'III级暴减' -last_datetime = datetime.datetime(2000,1,1) + +last_datetime = datetime.datetime(2000, 1, 1) print('开始捕捉事件') if event.count() != 0: - last_datetime = next(event.find().sort([('datetime',-1)]).limit(1))['datetime'] - + last_datetime = next(event.find().sort([('datetime', + -1)]).limit(1))['datetime'] + for each_author in coll.find().batch_size(8): if 'fansRate' in each_author and len(each_author['fansRate']) > 1: index = 1 @@ -56,16 +59,18 @@ def print_data(each_author): datetime=each_author['fansRate'][c_index]['datetime']) def insert_event(event_type): - videos = video.find({'mid':each_author['mid']}) + videos = video.find({'mid': each_author['mid']}) temp_video = {} - cause = {'type':'video'} + cause = {'type': 'video'} for each_v in videos: # 相差一日之内 - if (each_author['fansRate'][c_index]['datetime'] - each_v['datetime']).days <= 1: + if (each_author['fansRate'][c_index]['datetime'] - + each_v['datetime']).days <= 1: temp_video['aid'] = each_v['aid'] temp_video['title'] = each_v['title'] temp_video['cView'] = each_v['data'][0]['view'] - if 'cView' not in temp_video or 'aid' not in cause or temp_video['cView'] > cause['cView']: + if 'cView' not in temp_video or 'aid' not in cause or temp_video[ + 'cView'] > cause['cView']: cause['aid'] = temp_video['aid'] cause['title'] = temp_video['title'] cause['cView'] = temp_video['cView'] @@ -81,7 +86,8 @@ def insert_event(event_type): each_author['fansRate'][c_index]['rate'], 'datetime': each_author['fansRate'][c_index]['datetime'], - 'cause': cause + 'cause': + cause }) while index < len(each_author['fansRate']): @@ -120,9 +126,9 @@ def insert_event(event_type): # 一掉再掉 elif each_author['fansRate'][c_index][ 'rate'] < FANS_DECREASE_THRESHOLD and abs( - each_author['fansRate'][c_index]['rate']) > abs( - each_author['fansRate'][index] - ['rate']) * WTF_DECREASE: + each_author['fansRate'][c_index]['rate'] + ) > abs( + each_author['fansRate'][index]['rate']) * WTF_DECREASE: insert_event(Event.decrease_3) print(Event.decrease_3.value + print_data(each_author)) # 一掉再掉 diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 6123538..2cb7acb 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -13,15 +13,21 @@ lastest_date = each_author['fansRate'][0]['datetime'] def getDate(date): - return date - datetime.timedelta(hours=date.hour, seconds=date.second,microseconds=date.microsecond,minutes=date.minute) + return date - datetime.timedelta( + hours=date.hour, + seconds=date.second, + microseconds=date.microsecond, + minutes=date.minute) def next_c(i): return each_author['data'][i]['fans'], each_author['data'][i][ 'datetime'], each_author['data'][i][ 'datetime'] - datetime.timedelta( - hours=each_author['data'][i][ - 'datetime'].hour, 
seconds=each_author['data'][i][ - 'datetime'].second,microseconds=each_author['data'][i]['datetime'].microsecond,minutes=each_author['data'][i]['datetime'].minute) + hours=each_author['data'][i]['datetime'].hour, + seconds=each_author['data'][i]['datetime'].second, + microseconds=each_author['data'][i]['datetime']. + microsecond, + minutes=each_author['data'][i]['datetime'].minute) c_fans, c_datetime, c_date = next_c(i) @@ -37,7 +43,6 @@ def next_p(i): p_fans, p_datetime, p_date = next_p(i) - # 相差粉丝数 delta_fans = c_fans - p_fans # 相差日期数 @@ -61,7 +66,12 @@ def next_p(i): if (c_date - p_date).days == 1: delta_fans = c_fans - p_fans seconds = days + (c_datetime.second - p_datetime.second) - rate.append({'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}) + rate.append({ + 'rate': + int(delta_fans / (1 + seconds / (60 * 60 * 24))), + 'datetime': + c_date + }) i += 1 c_fans, c_datetime, c_date = next_c(i) p_fans, p_datetime, p_date = next_p(i) @@ -72,30 +82,31 @@ def next_p(i): # 相差多天 days = (c_date - p_date).days while days > 1: - t_rate = delta_fans/(days + seconds/(60*60*24)) + t_rate = delta_fans / (days + seconds / (60 * 60 * 24)) t_date = c_date - datetime.timedelta(1) t_fans = c_fans - t_rate delta_fans = c_fans - t_fans - rate.append({'rate':int(delta_fans/(1 + seconds/(60*60*24))),'datetime':c_date}) + rate.append({ + 'rate': + int(delta_fans / (1 + seconds / (60 * 60 * 24))), + 'datetime': + c_date + }) c_fans = t_fans c_date = t_date days -= 1 coll.update_one({ 'mid': each_author['mid'] - }, { - '$push': { - 'fansRate': { - '$each': rate, - '$position': 0 - } + }, {'$push': { + 'fansRate': { + '$each': rate, + '$position': 0 } - }, True) - if 'fansRate' in each_author and len(each_author['fansRate']) != 0: + }}, True) + if len(rate) != 0: coll.update_one({ 'mid': each_author['mid'] - }, { - '$set': { - 'cRate': each_author['fansRate'][0]['rate'] - } - }, True) + }, {'$set': { + 'cRate': rate[0]['rate'] + }}, True) pass From a4c5ec8d58259329b8663740d6fa66337d8cb9fe Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 20 Jan 2019 21:00:24 +0800 Subject: [PATCH 119/469] mail me --- .gitignore | 3 ++- biliob_analyzer/add_focus_video.py | 14 ++++++++++++++ biliob_spider/items.py | 2 +- biliob_spider/spiders/author_auto_add.py | 9 ++++++--- biliob_spider/spiders/author_update.py | 9 ++++++--- biliob_spider/spiders/bili_monthly_rank.py | 9 ++++++--- biliob_spider/spiders/dynamic.py | 9 ++++++--- biliob_spider/spiders/online.py | 9 ++++++--- biliob_spider/spiders/site_info.py | 9 ++++++--- biliob_spider/spiders/tag.py | 9 ++++++--- biliob_spider/spiders/video_spider_all.py | 10 ++++++---- biliob_spider/spiders/video_watcher.py | 9 ++++++--- 12 files changed, 71 insertions(+), 30 deletions(-) create mode 100644 biliob_analyzer/add_focus_video.py diff --git a/.gitignore b/.gitignore index f6806e5..1a40d66 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ db.py nohup.out biliob_spider.log -debug.py \ No newline at end of file +debug.py +mail.py diff --git a/biliob_analyzer/add_focus_video.py b/biliob_analyzer/add_focus_video.py new file mode 100644 index 0000000..f23de18 --- /dev/null +++ b/biliob_analyzer/add_focus_video.py @@ -0,0 +1,14 @@ +from db import settings +from pymongo import MongoClient +# 链接mongoDB +client = MongoClient(settings['MINGO_HOST'], 27017) +# 数据库登录需要帐号密码 +client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) +db = client['biliob'] # 获得数据库的句柄 +coll = db['video'] # 获得collection的句柄 +docs = coll.find({'focus': {'$exists': 
False}}).batch_size(60) +for each_doc in docs: + if 'aid' in each_doc: + each_doc['focus'] = True + coll.update_one({'aid': each_doc['aid']}, {'$set': each_doc}) + print('已修复aid' + str(each_doc['aid'])) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 1dc2a12..39ce9ec 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -6,7 +6,7 @@ # https://doc.scrapy.org/en/latest/topics/items.html import scrapy -from scrapy.mail import MailSender +from mail import mailer class SiteItem(scrapy.Item): region_count = scrapy.Field() diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index 03fe0a7..a06d075 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import AuthorItem import time @@ -50,8 +50,11 @@ def parse(self, response): callback=self.detailParse) except Exception as error: # 出现错误时打印错误日志 - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}".format(response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}".format(response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index a181359..8b819fd 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import AuthorItem import time @@ -85,8 +85,11 @@ def parse(self, response): callback=self.parse_view) except Exception as error: # 出现错误时打印错误日志 - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/bili_monthly_rank.py b/biliob_spider/spiders/bili_monthly_rank.py index c0d8c15..338ac31 100644 --- a/biliob_spider/spiders/bili_monthly_rank.py +++ b/biliob_spider/spiders/bili_monthly_rank.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import RankItem import time @@ -57,8 +57,11 @@ def parse(self, response): yield item except Exception as error: # 出现错误时打印错误日志 - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/dynamic.py b/biliob_spider/spiders/dynamic.py index 62a99e7..4fa4b65 100644 --- a/biliob_spider/spiders/dynamic.py +++ b/biliob_spider/spiders/dynamic.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import SiteItem import time @@ -41,8 +41,11 @@ def 
parse(self, response): except Exception as error: # 出现错误时打印错误日志 - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}".format(response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}".format(response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/online.py b/biliob_spider/spiders/online.py index 4378a2a..07fcdd2 100644 --- a/biliob_spider/spiders/online.py +++ b/biliob_spider/spiders/online.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import VideoOnline import time @@ -45,8 +45,11 @@ def parse(self, response): callback=self.detailParse) except Exception as error: # 出现错误时打印错误日志 - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/site_info.py b/biliob_spider/spiders/site_info.py index b8fffb5..305f888 100644 --- a/biliob_spider/spiders/site_info.py +++ b/biliob_spider/spiders/site_info.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import SiteItem import time @@ -33,8 +33,11 @@ def parse(self, response): except Exception as error: # 出现错误时打印错误日志 - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/tag.py b/biliob_spider/spiders/tag.py index a2236ea..b18c866 100644 --- a/biliob_spider/spiders/tag.py +++ b/biliob_spider/spiders/tag.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import TagItem import time @@ -38,8 +38,11 @@ def parse(self, response): except Exception as error: # 出现错误时打印错误日志 - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/video_spider_all.py b/biliob_spider/spiders/video_spider_all.py index d23f5bb..0900acb 100644 --- a/biliob_spider/spiders/video_spider_all.py +++ b/biliob_spider/spiders/video_spider_all.py @@ -1,6 +1,6 @@ # coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import VideoItem from datetime import datetime @@ -104,7 +104,6 @@ '电脑装机': '数码', '影音智能': '数码', '摄影摄像': '数码', - '摄影摄像': '数码', '风尚标': '时尚', '电音': '音乐', '音乐综合': '音乐', @@ -222,8 +221,11 @@ def parse(self, response): # 出现错误时打印错误日志 if r['code'] == -404: return - mailer = MailSender() - 
mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(item) logging.error(response.url) diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index 3da6202..e24a865 100644 --- a/biliob_spider/spiders/video_watcher.py +++ b/biliob_spider/spiders/video_watcher.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import VideoWatcherItem import time @@ -60,8 +60,11 @@ def parse(self, response): yield item except Exception as error: # 出现错误时打印错误日志 - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) From b422bea622f0fbb63ead80d45194e752001c890f Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 20 Jan 2019 21:00:24 +0800 Subject: [PATCH 120/469] mail me --- .gitignore | 3 ++- biliob_analyzer/add_focus_video.py | 14 ++++++++++++++ biliob_spider/items.py | 2 +- biliob_spider/spiders/author_auto_add.py | 9 ++++++--- biliob_spider/spiders/author_update.py | 9 ++++++--- biliob_spider/spiders/bili_monthly_rank.py | 9 ++++++--- biliob_spider/spiders/dynamic.py | 9 ++++++--- biliob_spider/spiders/online.py | 9 ++++++--- biliob_spider/spiders/site_info.py | 9 ++++++--- biliob_spider/spiders/tag.py | 9 ++++++--- biliob_spider/spiders/video_spider_all.py | 10 ++++++---- biliob_spider/spiders/video_watcher.py | 9 ++++++--- 12 files changed, 71 insertions(+), 30 deletions(-) create mode 100644 biliob_analyzer/add_focus_video.py diff --git a/.gitignore b/.gitignore index f6806e5..1a40d66 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ db.py nohup.out biliob_spider.log -debug.py \ No newline at end of file +debug.py +mail.py diff --git a/biliob_analyzer/add_focus_video.py b/biliob_analyzer/add_focus_video.py new file mode 100644 index 0000000..f23de18 --- /dev/null +++ b/biliob_analyzer/add_focus_video.py @@ -0,0 +1,14 @@ +from db import settings +from pymongo import MongoClient +# 链接mongoDB +client = MongoClient(settings['MINGO_HOST'], 27017) +# 数据库登录需要帐号密码 +client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) +db = client['biliob'] # 获得数据库的句柄 +coll = db['video'] # 获得collection的句柄 +docs = coll.find({'focus': {'$exists': False}}).batch_size(60) +for each_doc in docs: + if 'aid' in each_doc: + each_doc['focus'] = True + coll.update_one({'aid': each_doc['aid']}, {'$set': each_doc}) + print('已修复aid' + str(each_doc['aid'])) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 1dc2a12..39ce9ec 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -6,7 +6,7 @@ # https://doc.scrapy.org/en/latest/topics/items.html import scrapy -from scrapy.mail import MailSender +from mail import mailer class SiteItem(scrapy.Item): region_count = scrapy.Field() diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index 03fe0a7..a06d075 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -1,6 +1,6 @@ #coding=utf-8 
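# The import change that follows swaps the per-spider `MailSender()` instantiation for a
# shared `mailer` object imported from a local mail.py. mail.py itself is added to
# .gitignore above, so its contents never appear in this patch series. A minimal sketch of
# what such a module might look like, assuming it simply pre-configures Scrapy's
# MailSender; the SMTP host, account and password below are placeholders, not values taken
# from this repository.
from scrapy.mail import MailSender

mailer = MailSender(
    smtphost='smtp.example.com',        # assumed SMTP server
    mailfrom='biliob-bot@example.com',  # assumed sender address
    smtpuser='biliob-bot@example.com',
    smtppass='********',                # real credentials stay out of version control
    smtpport=465,
    smtpssl=True,
)
# Every spider can then share the one instance: mailer.send(to=[...], subject=..., body=...)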
import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import AuthorItem import time @@ -50,8 +50,11 @@ def parse(self, response): callback=self.detailParse) except Exception as error: # 出现错误时打印错误日志 - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}".format(response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}".format(response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index a181359..8b819fd 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import AuthorItem import time @@ -85,8 +85,11 @@ def parse(self, response): callback=self.parse_view) except Exception as error: # 出现错误时打印错误日志 - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/bili_monthly_rank.py b/biliob_spider/spiders/bili_monthly_rank.py index c0d8c15..338ac31 100644 --- a/biliob_spider/spiders/bili_monthly_rank.py +++ b/biliob_spider/spiders/bili_monthly_rank.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import RankItem import time @@ -57,8 +57,11 @@ def parse(self, response): yield item except Exception as error: # 出现错误时打印错误日志 - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/dynamic.py b/biliob_spider/spiders/dynamic.py index 62a99e7..4fa4b65 100644 --- a/biliob_spider/spiders/dynamic.py +++ b/biliob_spider/spiders/dynamic.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import SiteItem import time @@ -41,8 +41,11 @@ def parse(self, response): except Exception as error: # 出现错误时打印错误日志 - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}".format(response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}".format(response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/online.py b/biliob_spider/spiders/online.py index 4378a2a..07fcdd2 100644 --- a/biliob_spider/spiders/online.py +++ b/biliob_spider/spiders/online.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import VideoOnline import time @@ -45,8 +45,11 @@ def 
parse(self, response): callback=self.detailParse) except Exception as error: # 出现错误时打印错误日志 - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/site_info.py b/biliob_spider/spiders/site_info.py index b8fffb5..305f888 100644 --- a/biliob_spider/spiders/site_info.py +++ b/biliob_spider/spiders/site_info.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import SiteItem import time @@ -33,8 +33,11 @@ def parse(self, response): except Exception as error: # 出现错误时打印错误日志 - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/tag.py b/biliob_spider/spiders/tag.py index a2236ea..b18c866 100644 --- a/biliob_spider/spiders/tag.py +++ b/biliob_spider/spiders/tag.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import TagItem import time @@ -38,8 +38,11 @@ def parse(self, response): except Exception as error: # 出现错误时打印错误日志 - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) diff --git a/biliob_spider/spiders/video_spider_all.py b/biliob_spider/spiders/video_spider_all.py index d23f5bb..0900acb 100644 --- a/biliob_spider/spiders/video_spider_all.py +++ b/biliob_spider/spiders/video_spider_all.py @@ -1,6 +1,6 @@ # coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import VideoItem from datetime import datetime @@ -104,7 +104,6 @@ '电脑装机': '数码', '影音智能': '数码', '摄影摄像': '数码', - '摄影摄像': '数码', '风尚标': '时尚', '电音': '音乐', '音乐综合': '音乐', @@ -222,8 +221,11 @@ def parse(self, response): # 出现错误时打印错误日志 if r['code'] == -404: return - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(item) logging.error(response.url) diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index 3da6202..e24a865 100644 --- a/biliob_spider/spiders/video_watcher.py +++ b/biliob_spider/spiders/video_watcher.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import VideoWatcherItem import time @@ -60,8 +60,11 @@ def parse(self, response): yield item except Exception as error: # 
出现错误时打印错误日志 - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error) From c04f5ccd36eb1d875389177e31a6c5749b25fbad Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 20 Jan 2019 21:00:47 +0800 Subject: [PATCH 121/469] bangumi & donghua --- biliob_spider/pipelines.py | 113 ++++++++++++++++++-------- biliob_spider/spiders/bangumi.py | 48 ++++++----- biliob_spider/spiders/donghua.py | 48 ++++++----- biliob_spider/spiders/video_spider.py | 9 +- 4 files changed, 143 insertions(+), 75 deletions(-) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 14f3bc9..1678ea2 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -6,9 +6,11 @@ # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html from pymongo import MongoClient from db import settings +from db import mysql_connect import datetime import logging + class VideoPipeline(object): def __init__(self): # 链接mongoDB @@ -25,13 +27,13 @@ def process_item(self, item, spider): 'aid': int(item['aid']) }, { '$set': { - 'cView':item['current_view'], - 'cFavorite':item['current_favorite'], - 'cDanmaku':item['current_danmaku'] , - 'cCoin':item['current_coin'], - 'cShare':item['current_share'] , - 'cLike':item['current_like'], - 'cDatetime':item['current_datetime'], + 'cView': item['current_view'], + 'cFavorite': item['current_favorite'], + 'cDanmaku': item['current_danmaku'], + 'cCoin': item['current_coin'], + 'cShare': item['current_share'], + 'cLike': item['current_like'], + 'cDatetime': item['current_datetime'], 'author': item['author'], 'subChannel': item['subChannel'], 'channel': item['channel'], @@ -43,8 +45,42 @@ def process_item(self, item, spider): }, '$push': { 'data': { - '$each':[item['data']], - '$position':0 + '$each': [item['data']], + '$position': 0 + } + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) + +class VideoPipelineFromKan(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'author': item['author'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': item['datetime'] + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 } } }, True) @@ -71,12 +107,12 @@ def process_item(self, item, spider): '$set': { 'title': item['title'], 'cover': item['cover'], - 'isFinish': item['is_finish'], - 'isStarted': item['is_started'], + # 'isFinish': item['is_finish'], + # 'isStarted': item['is_started'], 'newest': item['newest_ep_index'], 'currentPts': item['data']['pts'], 'currentPlay': item['data']['play'], - 'squareCover': item['square_cover'], + # 'squareCover': item['square_cover'], 'currentWatch': item['data']['watch'], 'currentReview': item['data']['review'], 'currentDanmaku': item['data']['danmaku'] @@ -90,6 +126,7 @@ def process_item(self, item, spider): # 出现错误时打印错误日志 logging.error(error) + class DonghuaPipeLine(object): def __init__(self): # 
链接mongoDB @@ -108,12 +145,12 @@ def process_item(self, item, spider): '$set': { 'title': item['title'], 'cover': item['cover'], - 'isFinish': item['is_finish'], - 'isStarted': item['is_started'], + # 'isFinish': item['is_finish'], + # 'isStarted': item['is_started'], 'newest': item['newest_ep_index'], 'currentPts': item['data']['pts'], 'currentPlay': item['data']['play'], - 'squareCover': item['square_cover'], + # 'squareCover': item['square_cover'], 'currentWatch': item['data']['watch'], 'currentReview': item['data']['review'], 'currentDanmaku': item['data']['danmaku'] @@ -126,6 +163,8 @@ def process_item(self, item, spider): except Exception as error: # 出现错误时打印错误日志 logging.error(error) + + class SiteInfoPipeline(object): def __init__(self): # 链接mongoDB @@ -139,17 +178,18 @@ def __init__(self): def process_item(self, item, spider): try: self.coll.insert_one({ - 'region_count': item['region_count'], - 'all_count': item['all_count'], - 'web_online': item['web_online'], - 'play_online': item['play_online'], - 'datetime':datetime.datetime.now() + 'region_count': item['region_count'], + 'all_count': item['all_count'], + 'web_online': item['web_online'], + 'play_online': item['play_online'], + 'datetime': datetime.datetime.now() }) return item except Exception as error: # 出现错误时打印错误日志 logging.error(error) + class AuthorPipeline(object): def __init__(self): # 链接mongoDB @@ -166,23 +206,23 @@ def process_item(self, item, spider): 'mid': item['mid'] }, { '$set': { - 'focus':True, + 'focus': True, 'sex': item['sex'], 'name': item['name'], 'face': item['face'], 'level': item['level'], - 'cFans':item['c_fans'], + 'cFans': item['c_fans'], 'official': item['official'], - 'cArchive':item['c_archive'] , - 'cArticle':item['c_article'] , - 'cAttention':item['c_attention'] , - 'cArchive_view':item['c_archive_view'], - 'cArticle_view':item['c_article_view'], + 'cArchive': item['c_archive'], + 'cArticle': item['c_article'], + 'cAttention': item['c_attention'], + 'cArchive_view': item['c_archive_view'], + 'cArticle_view': item['c_article_view'], }, '$push': { 'data': { - '$each':[item['data']], - '$position':0 + '$each': [item['data']], + '$position': 0 } } }, True) @@ -190,7 +230,8 @@ def process_item(self, item, spider): except Exception as error: # 出现错误时打印错误日志 logging.error(error) - + + class OnlinePipeline(object): def __init__(self): # 链接mongoDB @@ -203,7 +244,7 @@ def __init__(self): def process_item(self, item, spider): try: - + self.coll.update_one({ 'title': item['title'] }, { @@ -235,7 +276,7 @@ def __init__(self): def process_item(self, item, spider): try: - + self.coll.update_one({ 'tag_id': item['tag_id'] }, { @@ -253,6 +294,8 @@ def process_item(self, item, spider): except Exception as error: # 出现错误时打印错误日志 logging.error(error) + + class VideoAddPipeline(object): def __init__(self): # 链接mongoDB @@ -279,6 +322,7 @@ def process_item(self, item, spider): # 出现错误时打印错误日志 logging.error(error) + class AuthorChannelPipeline(object): def __init__(self): # 链接mongoDB @@ -303,6 +347,7 @@ def process_item(self, item, spider): # 出现错误时打印错误日志 logging.error(error) + class BiliMonthlyRankPipeline(object): def __init__(self): # 链接mongoDB @@ -322,13 +367,13 @@ def process_item(self, item, spider): 'pts': item['pts'], 'datetime': datetime.datetime.now() }, - '$set':{ + '$set': { 'title': item['title'], 'author': item['author'], 'aid': item['aid'], 'mid': item['mid'], 'channel': item['channel'], - 'currentPts':item['pts'] + 'currentPts': item['pts'] } }, True) return item diff --git a/biliob_spider/spiders/bangumi.py 
b/biliob_spider/spiders/bangumi.py index b2f6d7e..a5b3df0 100644 --- a/biliob_spider/spiders/bangumi.py +++ b/biliob_spider/spiders/bangumi.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import BangumiItem import time @@ -19,21 +19,31 @@ class BangumiSpider(scrapy.spiders.Spider): } def parse(self, response): - j = json.loads(response.xpath("//script[3]/text()").extract()[0][len('window.__INITIAL_STATE__='):].split(';')[0]) - for each in j['rankList']: - item = BangumiItem() - item['title'] = each['title'] - item['cover'] = each['cover'] - item['square_cover'] = each['square_cover'] - item['is_finish'] = each['is_finish'] - item['is_started'] = each['is_started'] - item['newest_ep_index'] = each['newest_ep_index'] - item['data'] = { - 'danmaku': each['dm_count'], - 'watch': each['fav'], - 'play': each['play'], - 'pts': each['pts'], - 'review': each['video_review'], - 'datetime': datetime.datetime.now() - } - yield item \ No newline at end of file + try: + j = json.loads(response.xpath("//script[3]/text()").extract()[0][len('window.__INITIAL_STATE__='):].split(';')[0]) + for each in j['rankList']: + item = BangumiItem() + item['title'] = each['title'] + item['cover'] = each['cover'] + # item['square_cover'] = each['square_cover'] + # item['is_finish'] = each['is_finish'] + # item['is_started'] = each['is_started'] + item['newest_ep_index'] = each['new_ep']['index_show'] + item['data'] = { + 'danmaku': each['stat']['danmaku'], + 'watch': each['stat']['follow'], + 'play': each['stat']['view'], + 'pts': each['pts'], + 'review': each['video_review'], + 'datetime': datetime.datetime.now() + } + yield item + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}".format(response.url, error), + ) + + diff --git a/biliob_spider/spiders/donghua.py b/biliob_spider/spiders/donghua.py index bcd3449..dc54f10 100644 --- a/biliob_spider/spiders/donghua.py +++ b/biliob_spider/spiders/donghua.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import BangumiItem import time @@ -18,21 +18,31 @@ class DonghuaSpider(scrapy.spiders.Spider): } def parse(self, response): - j = json.loads(response.xpath("//script[3]/text()").extract()[0][len('window.__INITIAL_STATE__='):].split(';')[0]) - for each in j['rankList']: - item = BangumiItem() - item['title'] = each['title'] - item['cover'] = each['cover'] - item['square_cover'] = each['square_cover'] - item['is_finish'] = each['is_finish'] - item['is_started'] = each['is_started'] - item['newest_ep_index'] = each['newest_ep_index'] - item['data'] = { - 'danmaku': each['dm_count'], - 'watch': each['fav'], - 'play': each['play'], - 'pts': each['pts'], - 'review': each['video_review'], - 'datetime': datetime.datetime.now() - } - yield item + try: + j = json.loads(response.xpath("//script[3]/text()").extract()[0][len('window.__INITIAL_STATE__='):].split(';')[0]) + for each in j['rankList']: + item = BangumiItem() + item['title'] = each['title'] + item['cover'] = each['cover'] + # item['square_cover'] = each['square_cover'] + # item['is_finish'] = each['is_finish'] + # item['is_started'] = each['is_started'] + item['newest_ep_index'] = each['new_ep']['index_show'] + item['data'] = { + 'danmaku': each['stat']['danmaku'], + 'watch': each['stat']['follow'], + 'play': 
each['stat']['view'], + 'pts': each['pts'], + 'review': each['video_review'], + 'datetime': datetime.datetime.now() + } + yield item + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}".format(response.url, error), + ) + + diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index 4235b9b..b70c3d3 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -1,6 +1,6 @@ # coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import VideoItem from datetime import datetime @@ -221,8 +221,11 @@ def parse(self, response): # 出现错误时打印错误日志 if r['code'] == -404: return - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(item) logging.error(response.url) From 62465ed69ba7ec7c53e9b1b555757e37db939c50 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 20 Jan 2019 21:00:47 +0800 Subject: [PATCH 122/469] bangumi & donghua --- biliob_spider/pipelines.py | 113 ++++++++++++++++++-------- biliob_spider/spiders/bangumi.py | 48 ++++++----- biliob_spider/spiders/donghua.py | 48 ++++++----- biliob_spider/spiders/video_spider.py | 9 +- 4 files changed, 143 insertions(+), 75 deletions(-) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 14f3bc9..1678ea2 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -6,9 +6,11 @@ # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html from pymongo import MongoClient from db import settings +from db import mysql_connect import datetime import logging + class VideoPipeline(object): def __init__(self): # 链接mongoDB @@ -25,13 +27,13 @@ def process_item(self, item, spider): 'aid': int(item['aid']) }, { '$set': { - 'cView':item['current_view'], - 'cFavorite':item['current_favorite'], - 'cDanmaku':item['current_danmaku'] , - 'cCoin':item['current_coin'], - 'cShare':item['current_share'] , - 'cLike':item['current_like'], - 'cDatetime':item['current_datetime'], + 'cView': item['current_view'], + 'cFavorite': item['current_favorite'], + 'cDanmaku': item['current_danmaku'], + 'cCoin': item['current_coin'], + 'cShare': item['current_share'], + 'cLike': item['current_like'], + 'cDatetime': item['current_datetime'], 'author': item['author'], 'subChannel': item['subChannel'], 'channel': item['channel'], @@ -43,8 +45,42 @@ def process_item(self, item, spider): }, '$push': { 'data': { - '$each':[item['data']], - '$position':0 + '$each': [item['data']], + '$position': 0 + } + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) + +class VideoPipelineFromKan(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'author': item['author'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': item['datetime'] + 
}, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 } } }, True) @@ -71,12 +107,12 @@ def process_item(self, item, spider): '$set': { 'title': item['title'], 'cover': item['cover'], - 'isFinish': item['is_finish'], - 'isStarted': item['is_started'], + # 'isFinish': item['is_finish'], + # 'isStarted': item['is_started'], 'newest': item['newest_ep_index'], 'currentPts': item['data']['pts'], 'currentPlay': item['data']['play'], - 'squareCover': item['square_cover'], + # 'squareCover': item['square_cover'], 'currentWatch': item['data']['watch'], 'currentReview': item['data']['review'], 'currentDanmaku': item['data']['danmaku'] @@ -90,6 +126,7 @@ def process_item(self, item, spider): # 出现错误时打印错误日志 logging.error(error) + class DonghuaPipeLine(object): def __init__(self): # 链接mongoDB @@ -108,12 +145,12 @@ def process_item(self, item, spider): '$set': { 'title': item['title'], 'cover': item['cover'], - 'isFinish': item['is_finish'], - 'isStarted': item['is_started'], + # 'isFinish': item['is_finish'], + # 'isStarted': item['is_started'], 'newest': item['newest_ep_index'], 'currentPts': item['data']['pts'], 'currentPlay': item['data']['play'], - 'squareCover': item['square_cover'], + # 'squareCover': item['square_cover'], 'currentWatch': item['data']['watch'], 'currentReview': item['data']['review'], 'currentDanmaku': item['data']['danmaku'] @@ -126,6 +163,8 @@ def process_item(self, item, spider): except Exception as error: # 出现错误时打印错误日志 logging.error(error) + + class SiteInfoPipeline(object): def __init__(self): # 链接mongoDB @@ -139,17 +178,18 @@ def __init__(self): def process_item(self, item, spider): try: self.coll.insert_one({ - 'region_count': item['region_count'], - 'all_count': item['all_count'], - 'web_online': item['web_online'], - 'play_online': item['play_online'], - 'datetime':datetime.datetime.now() + 'region_count': item['region_count'], + 'all_count': item['all_count'], + 'web_online': item['web_online'], + 'play_online': item['play_online'], + 'datetime': datetime.datetime.now() }) return item except Exception as error: # 出现错误时打印错误日志 logging.error(error) + class AuthorPipeline(object): def __init__(self): # 链接mongoDB @@ -166,23 +206,23 @@ def process_item(self, item, spider): 'mid': item['mid'] }, { '$set': { - 'focus':True, + 'focus': True, 'sex': item['sex'], 'name': item['name'], 'face': item['face'], 'level': item['level'], - 'cFans':item['c_fans'], + 'cFans': item['c_fans'], 'official': item['official'], - 'cArchive':item['c_archive'] , - 'cArticle':item['c_article'] , - 'cAttention':item['c_attention'] , - 'cArchive_view':item['c_archive_view'], - 'cArticle_view':item['c_article_view'], + 'cArchive': item['c_archive'], + 'cArticle': item['c_article'], + 'cAttention': item['c_attention'], + 'cArchive_view': item['c_archive_view'], + 'cArticle_view': item['c_article_view'], }, '$push': { 'data': { - '$each':[item['data']], - '$position':0 + '$each': [item['data']], + '$position': 0 } } }, True) @@ -190,7 +230,8 @@ def process_item(self, item, spider): except Exception as error: # 出现错误时打印错误日志 logging.error(error) - + + class OnlinePipeline(object): def __init__(self): # 链接mongoDB @@ -203,7 +244,7 @@ def __init__(self): def process_item(self, item, spider): try: - + self.coll.update_one({ 'title': item['title'] }, { @@ -235,7 +276,7 @@ def __init__(self): def process_item(self, item, spider): try: - + self.coll.update_one({ 'tag_id': item['tag_id'] }, { @@ -253,6 +294,8 @@ def process_item(self, item, spider): except Exception as error: # 出现错误时打印错误日志 
logging.error(error) + + class VideoAddPipeline(object): def __init__(self): # 链接mongoDB @@ -279,6 +322,7 @@ def process_item(self, item, spider): # 出现错误时打印错误日志 logging.error(error) + class AuthorChannelPipeline(object): def __init__(self): # 链接mongoDB @@ -303,6 +347,7 @@ def process_item(self, item, spider): # 出现错误时打印错误日志 logging.error(error) + class BiliMonthlyRankPipeline(object): def __init__(self): # 链接mongoDB @@ -322,13 +367,13 @@ def process_item(self, item, spider): 'pts': item['pts'], 'datetime': datetime.datetime.now() }, - '$set':{ + '$set': { 'title': item['title'], 'author': item['author'], 'aid': item['aid'], 'mid': item['mid'], 'channel': item['channel'], - 'currentPts':item['pts'] + 'currentPts': item['pts'] } }, True) return item diff --git a/biliob_spider/spiders/bangumi.py b/biliob_spider/spiders/bangumi.py index b2f6d7e..a5b3df0 100644 --- a/biliob_spider/spiders/bangumi.py +++ b/biliob_spider/spiders/bangumi.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import BangumiItem import time @@ -19,21 +19,31 @@ class BangumiSpider(scrapy.spiders.Spider): } def parse(self, response): - j = json.loads(response.xpath("//script[3]/text()").extract()[0][len('window.__INITIAL_STATE__='):].split(';')[0]) - for each in j['rankList']: - item = BangumiItem() - item['title'] = each['title'] - item['cover'] = each['cover'] - item['square_cover'] = each['square_cover'] - item['is_finish'] = each['is_finish'] - item['is_started'] = each['is_started'] - item['newest_ep_index'] = each['newest_ep_index'] - item['data'] = { - 'danmaku': each['dm_count'], - 'watch': each['fav'], - 'play': each['play'], - 'pts': each['pts'], - 'review': each['video_review'], - 'datetime': datetime.datetime.now() - } - yield item \ No newline at end of file + try: + j = json.loads(response.xpath("//script[3]/text()").extract()[0][len('window.__INITIAL_STATE__='):].split(';')[0]) + for each in j['rankList']: + item = BangumiItem() + item['title'] = each['title'] + item['cover'] = each['cover'] + # item['square_cover'] = each['square_cover'] + # item['is_finish'] = each['is_finish'] + # item['is_started'] = each['is_started'] + item['newest_ep_index'] = each['new_ep']['index_show'] + item['data'] = { + 'danmaku': each['stat']['danmaku'], + 'watch': each['stat']['follow'], + 'play': each['stat']['view'], + 'pts': each['pts'], + 'review': each['video_review'], + 'datetime': datetime.datetime.now() + } + yield item + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}".format(response.url, error), + ) + + diff --git a/biliob_spider/spiders/donghua.py b/biliob_spider/spiders/donghua.py index bcd3449..dc54f10 100644 --- a/biliob_spider/spiders/donghua.py +++ b/biliob_spider/spiders/donghua.py @@ -1,6 +1,6 @@ #coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import BangumiItem import time @@ -18,21 +18,31 @@ class DonghuaSpider(scrapy.spiders.Spider): } def parse(self, response): - j = json.loads(response.xpath("//script[3]/text()").extract()[0][len('window.__INITIAL_STATE__='):].split(';')[0]) - for each in j['rankList']: - item = BangumiItem() - item['title'] = each['title'] - item['cover'] = each['cover'] - item['square_cover'] = each['square_cover'] - item['is_finish'] = each['is_finish'] - item['is_started'] = each['is_started'] 
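# The removed assignments here drop the old flat rankList fields (square_cover, is_finish,
# is_started, newest_ep_index, dm_count, fav, play) in favour of the nested structure the
# ranking page now embeds in window.__INITIAL_STATE__. A sketch of one rankList entry under
# that reading -- the key names mirror what the added lines read, the values are invented
# for illustration only:
example_entry = {
    'title': '...',
    'cover': 'http://i0.hdslb.com/bfs/bangumi/example.jpg',
    'new_ep': {'index_show': '全12话'},  # consumed as newest_ep_index
    'stat': {'danmaku': 120000, 'follow': 350000, 'view': 8000000},
    'pts': 2400000,
    'video_review': 120000,
}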
- item['newest_ep_index'] = each['newest_ep_index'] - item['data'] = { - 'danmaku': each['dm_count'], - 'watch': each['fav'], - 'play': each['play'], - 'pts': each['pts'], - 'review': each['video_review'], - 'datetime': datetime.datetime.now() - } - yield item + try: + j = json.loads(response.xpath("//script[3]/text()").extract()[0][len('window.__INITIAL_STATE__='):].split(';')[0]) + for each in j['rankList']: + item = BangumiItem() + item['title'] = each['title'] + item['cover'] = each['cover'] + # item['square_cover'] = each['square_cover'] + # item['is_finish'] = each['is_finish'] + # item['is_started'] = each['is_started'] + item['newest_ep_index'] = each['new_ep']['index_show'] + item['data'] = { + 'danmaku': each['stat']['danmaku'], + 'watch': each['stat']['follow'], + 'play': each['stat']['view'], + 'pts': each['pts'], + 'review': each['video_review'], + 'datetime': datetime.datetime.now() + } + yield item + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}".format(response.url, error), + ) + + diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index 4235b9b..b70c3d3 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -1,6 +1,6 @@ # coding=utf-8 import scrapy -from scrapy.mail import MailSender +from mail import mailer from scrapy.http import Request from biliob_spider.items import VideoItem from datetime import datetime @@ -221,8 +221,11 @@ def parse(self, response): # 出现错误时打印错误日志 if r['code'] == -404: return - mailer = MailSender() - mailer.send(to=["604264970@qq.com"], subject="BiliobSpiderError", body="{}{}{}".format(item,response.url,error)) + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) logging.error("视频爬虫在解析时发生错误") logging.error(item) logging.error(response.url) From 7a8f9058854fb6ff7af987c1f35487d353a8a04c Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 20 Jan 2019 21:01:01 +0800 Subject: [PATCH 123/469] add data --- .../spiders/video_from_kanbilibili.py | 233 ++++++++++++++++++ 1 file changed, 233 insertions(+) create mode 100644 biliob_spider/spiders/video_from_kanbilibili.py diff --git a/biliob_spider/spiders/video_from_kanbilibili.py b/biliob_spider/spiders/video_from_kanbilibili.py new file mode 100644 index 0000000..f7f6b25 --- /dev/null +++ b/biliob_spider/spiders/video_from_kanbilibili.py @@ -0,0 +1,233 @@ +# coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import VideoItem +import datetime +import time +import json +import logging +from pymongo import MongoClient +from db import settings +from mail import mailer +sub_channel_2_channel = { + 'ASMR': '生活', + 'GMV': '游戏', + 'Korea相关': '娱乐', + 'MAD·AMV': '动画', + 'MMD·3D': '动画', + 'Mugen': '游戏', + 'OP/ED/OST': '音乐', + 'VOCALOID·UTAU': '音乐', + '三次元舞蹈': '舞蹈', + '三次元音乐': '音乐', + '人力VOCALOID': '鬼畜', + '人文·历史': '纪录片', + '健身': '时尚', + '其他': '生活', + '其他国家': '电影', + '军事': '纪录片', + '动物圈': '生活', + '华语电影': '电影', + '单机游戏': '游戏', + '原创音乐': '音乐', + '国产剧': '电视剧', + '国产动画': '国创', + '国产原创相关': '国创', + '宅舞': '舞蹈', + '完结动画': '番剧', + '官方延伸': '番剧', + '布袋戏': '国创', + '广告': '广告', + '影视剪辑': '影视', + '影视杂谈': '影视', + '手工': '生活', + '手机游戏': '游戏', + '搞笑': '生活', + '教程演示': '鬼畜', + '数码': '数码', + '日常': '生活', + '明星': '娱乐', + '星海': '科技', + '服饰': '时尚', + '机械': '科技', + '桌游棋牌': '游戏', + '欧美电影': '电影', + '汽车': '科技', + '海外剧': '电视剧', + '演奏': '音乐', + '演讲·公开课': '科技', + '特摄': '影视', 
+ '电子竞技': '游戏', + '短片': '影视', + '短片·手书·配音': '动画', + '社会·美食·旅行': '纪录片', + '科学·探索·自然': '纪录片', + '绘画': '生活', + '综合': '动画', + '综艺': '娱乐', + '网络游戏': '游戏', + '美妆': '时尚', + '美食圈': '生活', + '翻唱': '音乐', + '舞蹈教程': '舞蹈', + '资讯': '国创', + '趣味科普人文': '科技', + '运动': '生活', + '连载动画': '番剧', + '野生技术协会': '科技', + '音MAD': '鬼畜', + '音乐选集': '音乐', + '音游': '游戏', + '预告 资讯': '影视', + '预告·资讯': '影视', + '单机联机': '游戏', + '鬼畜调教': '鬼畜', + '演讲• 公开课': '科技', + '国产电影': '电影', + '日本电影': '电影', + '番剧': '番剧', + '国创': '国创', + '鬼畜': '鬼畜', + '电视剧': '电视剧', + '动画': '动画', + '时尚': '时尚', + '娱乐': '娱乐', + '电影': '电影', + '舞蹈': '舞蹈', + '科技': '科技', + '生活': '生活', + '音乐': '音乐', + '纪录片': '纪录片', + '手机平板': '数码', + '电脑装机': '数码', + '影音智能': '数码', + '摄影摄像': '数码', + '摄影摄像': '数码', + '风尚标': '时尚', + '电音': '音乐', + '音乐综合': '音乐', + 'MV': '音乐', + '音乐现场': '音乐', + '游戏': '游戏', + 'T台': '时尚', +} + + +class FromKan(scrapy.spiders.Spider): + name = "fromkan" + allowed_domains = ["kanbilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.VideoPipelineFromKan': 300, + }, + 'DOWNLOAD_DELAY': 0.5 + } + + def dateRange(self, beginDate, endDate): + dates = [] + dt = datetime.datetime.strptime(beginDate, "%Y%m%d") + date = beginDate[:] + while date <= endDate: + dates.append(date) + dt = dt + datetime.timedelta(1) + date = dt.strftime("%Y%m%d") + return dates + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def start_requests(self): + dates = self.dateRange('20181001', '20190120') + for each in dates: + yield Request( + 'https://www.kanbilibili.com/json/all/{}/0_play_0.json'.format( + each), + meta={'date': each}) + + def parse(self, response): + try: + if response.status == 404: + return + r = json.loads(response.body) + for each in r: + aid = each['aid'] + author = each['name'] + mid = each['mid'] + view = each['playTotal'] + favorite = each['favoritesTotal'] + danmaku = each['danmakuTotal'] + coin = None + share = None + like = None + date = response.meta['date'] + date_str = '{}-{}-{}'.format(date[:4], date[4:6], date[6:8]) + current_date = datetime.datetime.strptime(date_str, "%Y-%m-%d") + + data = { + 'view': view, + 'favorite': favorite, + 'danmaku': danmaku, + 'coin': coin, + 'share': share, + 'like': like, + 'datetime': current_date + } + + subChannel = None + tid = None + title = each['title'] + date = each['created'] + pic = 'http:' + each['pic'] + item = VideoItem() + item['current_view'] = view + item['current_favorite'] = favorite + item['current_danmaku'] = danmaku + item['current_coin'] = coin + item['current_share'] = share + item['current_like'] = like + item['current_datetime'] = current_date + item['aid'] = aid + item['mid'] = mid + item['pic'] = pic + item['author'] = author + item['data'] = data + item['title'] = title + item['subChannel'] = subChannel + item['datetime'] = date + if author == '腾讯动漫' or author == '哔哩哔哩番剧': + continue + self.coll.find_one({'aid': aid}) + d = self.coll.find_one({'aid': aid}) + flag = 0 + if d != None and 'data' in d: + if 'subChannel' in d: + item['subChannel'] = d['subChannel'] + if 'channel' in d: + item['channel'] = d['channel'] + for each_data in d['data']: + data_date = each_data['datetime'].strftime("%Y-%m-%d") + if data_date == date_str: + flag = 1 + break + if flag == 0: + yield item + + except Exception as error: + # 出现错误时打印错误日志 + + 
mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(item) + logging.error(response.url) + logging.error(error) From c62bb9d77905f6026d1241bb07712008e8fd8a22 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 20 Jan 2019 21:01:01 +0800 Subject: [PATCH 124/469] add data --- .../spiders/video_from_kanbilibili.py | 233 ++++++++++++++++++ 1 file changed, 233 insertions(+) create mode 100644 biliob_spider/spiders/video_from_kanbilibili.py diff --git a/biliob_spider/spiders/video_from_kanbilibili.py b/biliob_spider/spiders/video_from_kanbilibili.py new file mode 100644 index 0000000..f7f6b25 --- /dev/null +++ b/biliob_spider/spiders/video_from_kanbilibili.py @@ -0,0 +1,233 @@ +# coding=utf-8 +import scrapy +from scrapy.http import Request +from biliob_spider.items import VideoItem +import datetime +import time +import json +import logging +from pymongo import MongoClient +from db import settings +from mail import mailer +sub_channel_2_channel = { + 'ASMR': '生活', + 'GMV': '游戏', + 'Korea相关': '娱乐', + 'MAD·AMV': '动画', + 'MMD·3D': '动画', + 'Mugen': '游戏', + 'OP/ED/OST': '音乐', + 'VOCALOID·UTAU': '音乐', + '三次元舞蹈': '舞蹈', + '三次元音乐': '音乐', + '人力VOCALOID': '鬼畜', + '人文·历史': '纪录片', + '健身': '时尚', + '其他': '生活', + '其他国家': '电影', + '军事': '纪录片', + '动物圈': '生活', + '华语电影': '电影', + '单机游戏': '游戏', + '原创音乐': '音乐', + '国产剧': '电视剧', + '国产动画': '国创', + '国产原创相关': '国创', + '宅舞': '舞蹈', + '完结动画': '番剧', + '官方延伸': '番剧', + '布袋戏': '国创', + '广告': '广告', + '影视剪辑': '影视', + '影视杂谈': '影视', + '手工': '生活', + '手机游戏': '游戏', + '搞笑': '生活', + '教程演示': '鬼畜', + '数码': '数码', + '日常': '生活', + '明星': '娱乐', + '星海': '科技', + '服饰': '时尚', + '机械': '科技', + '桌游棋牌': '游戏', + '欧美电影': '电影', + '汽车': '科技', + '海外剧': '电视剧', + '演奏': '音乐', + '演讲·公开课': '科技', + '特摄': '影视', + '电子竞技': '游戏', + '短片': '影视', + '短片·手书·配音': '动画', + '社会·美食·旅行': '纪录片', + '科学·探索·自然': '纪录片', + '绘画': '生活', + '综合': '动画', + '综艺': '娱乐', + '网络游戏': '游戏', + '美妆': '时尚', + '美食圈': '生活', + '翻唱': '音乐', + '舞蹈教程': '舞蹈', + '资讯': '国创', + '趣味科普人文': '科技', + '运动': '生活', + '连载动画': '番剧', + '野生技术协会': '科技', + '音MAD': '鬼畜', + '音乐选集': '音乐', + '音游': '游戏', + '预告 资讯': '影视', + '预告·资讯': '影视', + '单机联机': '游戏', + '鬼畜调教': '鬼畜', + '演讲• 公开课': '科技', + '国产电影': '电影', + '日本电影': '电影', + '番剧': '番剧', + '国创': '国创', + '鬼畜': '鬼畜', + '电视剧': '电视剧', + '动画': '动画', + '时尚': '时尚', + '娱乐': '娱乐', + '电影': '电影', + '舞蹈': '舞蹈', + '科技': '科技', + '生活': '生活', + '音乐': '音乐', + '纪录片': '纪录片', + '手机平板': '数码', + '电脑装机': '数码', + '影音智能': '数码', + '摄影摄像': '数码', + '摄影摄像': '数码', + '风尚标': '时尚', + '电音': '音乐', + '音乐综合': '音乐', + 'MV': '音乐', + '音乐现场': '音乐', + '游戏': '游戏', + 'T台': '时尚', +} + + +class FromKan(scrapy.spiders.Spider): + name = "fromkan" + allowed_domains = ["kanbilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.VideoPipelineFromKan': 300, + }, + 'DOWNLOAD_DELAY': 0.5 + } + + def dateRange(self, beginDate, endDate): + dates = [] + dt = datetime.datetime.strptime(beginDate, "%Y%m%d") + date = beginDate[:] + while date <= endDate: + dates.append(date) + dt = dt + datetime.timedelta(1) + date = dt.strftime("%Y%m%d") + return dates + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def start_requests(self): + dates = self.dateRange('20181001', '20190120') + for each in 
dates: + yield Request( + 'https://www.kanbilibili.com/json/all/{}/0_play_0.json'.format( + each), + meta={'date': each}) + + def parse(self, response): + try: + if response.status == 404: + return + r = json.loads(response.body) + for each in r: + aid = each['aid'] + author = each['name'] + mid = each['mid'] + view = each['playTotal'] + favorite = each['favoritesTotal'] + danmaku = each['danmakuTotal'] + coin = None + share = None + like = None + date = response.meta['date'] + date_str = '{}-{}-{}'.format(date[:4], date[4:6], date[6:8]) + current_date = datetime.datetime.strptime(date_str, "%Y-%m-%d") + + data = { + 'view': view, + 'favorite': favorite, + 'danmaku': danmaku, + 'coin': coin, + 'share': share, + 'like': like, + 'datetime': current_date + } + + subChannel = None + tid = None + title = each['title'] + date = each['created'] + pic = 'http:' + each['pic'] + item = VideoItem() + item['current_view'] = view + item['current_favorite'] = favorite + item['current_danmaku'] = danmaku + item['current_coin'] = coin + item['current_share'] = share + item['current_like'] = like + item['current_datetime'] = current_date + item['aid'] = aid + item['mid'] = mid + item['pic'] = pic + item['author'] = author + item['data'] = data + item['title'] = title + item['subChannel'] = subChannel + item['datetime'] = date + if author == '腾讯动漫' or author == '哔哩哔哩番剧': + continue + self.coll.find_one({'aid': aid}) + d = self.coll.find_one({'aid': aid}) + flag = 0 + if d != None and 'data' in d: + if 'subChannel' in d: + item['subChannel'] = d['subChannel'] + if 'channel' in d: + item['channel'] = d['channel'] + for each_data in d['data']: + data_date = each_data['datetime'].strftime("%Y-%m-%d") + if data_date == date_str: + flag = 1 + break + if flag == 0: + yield item + + except Exception as error: + # 出现错误时打印错误日志 + + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(item) + logging.error(response.url) + logging.error(error) From 330a182165a70c0551b9c85befe7cc13c01a193f Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 1 Feb 2019 17:20:57 +0800 Subject: [PATCH 125/469] move the table relating to user --- .gitignore | 2 + biliob_spider/spiders/online.py | 2 +- biliob_to_mysql/move_data.py | 77 +++++++++++++++++++++++++++++++++ run.py | 64 +++++++++++++++------------ run_move.py | 1 + 5 files changed, 118 insertions(+), 28 deletions(-) create mode 100644 biliob_to_mysql/move_data.py create mode 100644 run_move.py diff --git a/.gitignore b/.gitignore index 1a40d66..fabcde9 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ nohup.out biliob_spider.log debug.py mail.py +.vscode/settings.json +.vscode/launch.json diff --git a/biliob_spider/spiders/online.py b/biliob_spider/spiders/online.py index 07fcdd2..683d807 100644 --- a/biliob_spider/spiders/online.py +++ b/biliob_spider/spiders/online.py @@ -1,4 +1,4 @@ -#coding=utf-8 +# coding=utf-8 import scrapy from mail import mailer from scrapy.http import Request diff --git a/biliob_to_mysql/move_data.py b/biliob_to_mysql/move_data.py new file mode 100644 index 0000000..88447ef --- /dev/null +++ b/biliob_to_mysql/move_data.py @@ -0,0 +1,77 @@ +from db import cursor +from db import db as mongodb +from pymongo import ASCENDING + +mongo_user = mongodb['user'] +mongo_video = mongodb['video'] +mongo_author = mongodb['author'] + +# 用户相关 + +INSERT_USER_SQL = """ +INSERT INTO `user` (`name`, `password`, `credit`, `exp`, `gmt_create`, 
`role`) +VALUES (%(name)s, %(password)s, %(credit)s, %(exp)s, %(gen_time)s, %(role)s) +ON DUPLICATE KEY UPDATE `name` = VALUES(`name`), `exp` = VALUES(`exp`), `credit` = VALUES(`credit`), `password` = VALUES(`password`), `role` = VALUES(`role`); +""" + +GET_USER_ID_SQL = """ +SELECT `user_id` FROM `user` WHERE `name` = %s +""" + +DELETE_USER_FOCUS_VIDEO_SQL = """ +DELETE FROM biliob.user_focus_video +WHERE + `user_id` = %s; +""" + +DELETE_USER_FOCUS_AUTHOR_SQL = """ +DELETE FROM biliob.user_focus_author +WHERE + `user_id` = %s; +""" + +INSERT_USER_FOCUS_VIDEO_SQL = """ +INSERT INTO `user_focus_video` (`user_id`, `video_id`) +VALUES (%(user_id)s, %(video_id)s); +""" + +INSERT_USER_FOCUS_AUTHOR_SQL = """ +INSERT INTO `user_focus_author` (`user_id`, `author_id`) +VALUES (%(user_id)s, %(author_id)s) +""" + + +def move_user(): + for each_doc in mongo_user.find().sort('_id', direction=ASCENDING): + item = dict() + item['gen_time'] = each_doc.pop('_id').generation_time + item['name'] = each_doc['name'] + item['credit'] = each_doc['credit'] if 'credit' in each_doc else 0 + item['password'] = each_doc['password'] if 'password' in each_doc else 0 + item['exp'] = each_doc['exp'] if 'exp' in each_doc else 0 + item['role'] = each_doc['role'] if 'role' in each_doc else 0 + if len(item['name']) > 45: + print(item['name']) + continue + cursor.execute(INSERT_USER_SQL, item) + cursor.execute(GET_USER_ID_SQL, (each_doc['name'])) + + user_id = cursor.fetchone()['user_id'] + cursor.execute(DELETE_USER_FOCUS_VIDEO_SQL, (user_id)) + cursor.execute(DELETE_USER_FOCUS_AUTHOR_SQL, (user_id)) + if 'favoriteAid' in each_doc: + for each_aid in each_doc['favoriteAid']: + if each_aid == None or each_aid > 4294967295: + continue + item = {} + item['user_id'] = int(user_id) + item['video_id'] = int(each_aid) + cursor.execute(INSERT_USER_FOCUS_VIDEO_SQL, item) + if 'favoriteMid' in each_doc: + for each_mid in each_doc['favoriteMid']: + if each_mid == None or each_mid > 4294967295: + continue + item = {} + item['user_id'] = int(user_id) + item['author_id'] = int(each_mid) + cursor.execute(INSERT_USER_FOCUS_AUTHOR_SQL, item) diff --git a/run.py b/run.py index ea00ba9..db784be 100644 --- a/run.py +++ b/run.py @@ -7,59 +7,69 @@ import logging import threading + def site(): - Popen(["scrapy","crawl","site"]) + Popen(["scrapy", "crawl", "site"]) + def bangumi(): - Popen(["scrapy","crawl","bangumi"]) + Popen(["scrapy", "crawl", "bangumi"]) + def donghua(): - Popen(["scrapy","crawl","donghua"]) + Popen(["scrapy", "crawl", "donghua"]) + def update_author(): - Popen(["scrapy","crawl","authorUpdate"]) + Popen(["scrapy", "crawl", "authorUpdate"]) + def auto_add_author(): - Popen(["scrapy","crawl","authorAutoAdd"]) + Popen(["scrapy", "crawl", "authorAutoAdd"]) + def video_watcher(): - Popen(["scrapy","crawl","videoWatcher"]) + Popen(["scrapy", "crawl", "videoWatcher"]) + def video_spider(): - Popen(["scrapy","crawl","videoSpider"]) + Popen(["scrapy", "crawl", "videoSpider"]) + def video_spider_all(): - Popen(["scrapy","crawl","videoSpiderAll"]) + Popen(["scrapy", "crawl", "videoSpiderAll"]) + def online(): - Popen(['scrapy','crawl','online']) + Popen(['scrapy', 'crawl', 'online']) + def data_analyze(): - Popen(['python','run_analyzer.py']) + Popen(['python', 'run_analyzer.py']) + def bili_monthly_rank(): - Popen(['scrapy','crawl','biliMonthlyRank']) + Popen(['scrapy', 'crawl', 'biliMonthlyRank']) def run_threaded(job_func): - job_thread = threading.Thread(target=job_func) - job_thread.start() - 
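# run_threaded follows the usual recipe for the schedule library: run_pending() executes
# each due job synchronously on the scheduler loop, so a slow job would delay every job
# queued behind it; handing the job to its own Thread keeps the loop in run.py responsive.
# Illustrative registration (the time shown is an example, not one of the project's actual
# schedules):
#
#     schedule.every().day.at('03:00').do(run_threaded, video_spider)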
-schedule.every().day.at('11:40').do(run_threaded,data_analyze) -schedule.every().day.at('01:00').do(run_threaded,update_author) -schedule.every().day.at('07:00').do(run_threaded,video_spider) -schedule.every().day.at('14:00').do(run_threaded,auto_add_author) -schedule.every().day.at('16:50').do(run_threaded,bangumi) -schedule.every().day.at('16:30').do(run_threaded,donghua) -schedule.every().day.at('22:00').do(run_threaded,video_watcher) -schedule.every().day.at('21:00').do(run_threaded,bili_monthly_rank) -schedule.every().week.do(run_threaded,video_spider_all) -schedule.every().hour.do(run_threaded,site) -schedule.every().minute.do(run_threaded,online) - + job_thread = threading.Thread(target=job_func) + job_thread.start() + + +schedule.every().day.at('11:40').do(run_threaded, data_analyze) +schedule.every().day.at('01:00').do(run_threaded, update_author) +schedule.every().day.at('07:00').do(run_threaded, video_spider) +schedule.every().day.at('14:00').do(run_threaded, auto_add_author) +schedule.every().day.at('16:50').do(run_threaded, bangumi) +schedule.every().day.at('16:30').do(run_threaded, donghua) +schedule.every().day.at('22:00').do(run_threaded, video_watcher) +schedule.every().day.at('21:00').do(run_threaded, bili_monthly_rank) +schedule.every().week.do(run_threaded, video_spider_all) +schedule.every().hour.do(run_threaded, site) +schedule.every().minute.do(run_threaded, online) print('开始运行计划任务..') while True: schedule.run_pending() time.sleep(60) - diff --git a/run_move.py b/run_move.py new file mode 100644 index 0000000..fbc6001 --- /dev/null +++ b/run_move.py @@ -0,0 +1 @@ +import biliob_to_mysql.move_data From 35b96410402af6bf05c30674cf09fa43f4a0f118 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 1 Feb 2019 17:20:57 +0800 Subject: [PATCH 126/469] move the table relating to user --- .gitignore | 2 + biliob_spider/spiders/online.py | 2 +- biliob_to_mysql/move_data.py | 77 +++++++++++++++++++++++++++++++++ run.py | 64 +++++++++++++++------------ run_move.py | 1 + 5 files changed, 118 insertions(+), 28 deletions(-) create mode 100644 biliob_to_mysql/move_data.py create mode 100644 run_move.py diff --git a/.gitignore b/.gitignore index 1a40d66..fabcde9 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ nohup.out biliob_spider.log debug.py mail.py +.vscode/settings.json +.vscode/launch.json diff --git a/biliob_spider/spiders/online.py b/biliob_spider/spiders/online.py index 07fcdd2..683d807 100644 --- a/biliob_spider/spiders/online.py +++ b/biliob_spider/spiders/online.py @@ -1,4 +1,4 @@ -#coding=utf-8 +# coding=utf-8 import scrapy from mail import mailer from scrapy.http import Request diff --git a/biliob_to_mysql/move_data.py b/biliob_to_mysql/move_data.py new file mode 100644 index 0000000..88447ef --- /dev/null +++ b/biliob_to_mysql/move_data.py @@ -0,0 +1,77 @@ +from db import cursor +from db import db as mongodb +from pymongo import ASCENDING + +mongo_user = mongodb['user'] +mongo_video = mongodb['video'] +mongo_author = mongodb['author'] + +# 用户相关 + +INSERT_USER_SQL = """ +INSERT INTO `user` (`name`, `password`, `credit`, `exp`, `gmt_create`, `role`) +VALUES (%(name)s, %(password)s, %(credit)s, %(exp)s, %(gen_time)s, %(role)s) +ON DUPLICATE KEY UPDATE `name` = VALUES(`name`), `exp` = VALUES(`exp`), `credit` = VALUES(`credit`), `password` = VALUES(`password`), `role` = VALUES(`role`); +""" + +GET_USER_ID_SQL = """ +SELECT `user_id` FROM `user` WHERE `name` = %s +""" + +DELETE_USER_FOCUS_VIDEO_SQL = """ +DELETE FROM biliob.user_focus_video +WHERE + `user_id` = %s; 
+""" + +DELETE_USER_FOCUS_AUTHOR_SQL = """ +DELETE FROM biliob.user_focus_author +WHERE + `user_id` = %s; +""" + +INSERT_USER_FOCUS_VIDEO_SQL = """ +INSERT INTO `user_focus_video` (`user_id`, `video_id`) +VALUES (%(user_id)s, %(video_id)s); +""" + +INSERT_USER_FOCUS_AUTHOR_SQL = """ +INSERT INTO `user_focus_author` (`user_id`, `author_id`) +VALUES (%(user_id)s, %(author_id)s) +""" + + +def move_user(): + for each_doc in mongo_user.find().sort('_id', direction=ASCENDING): + item = dict() + item['gen_time'] = each_doc.pop('_id').generation_time + item['name'] = each_doc['name'] + item['credit'] = each_doc['credit'] if 'credit' in each_doc else 0 + item['password'] = each_doc['password'] if 'password' in each_doc else 0 + item['exp'] = each_doc['exp'] if 'exp' in each_doc else 0 + item['role'] = each_doc['role'] if 'role' in each_doc else 0 + if len(item['name']) > 45: + print(item['name']) + continue + cursor.execute(INSERT_USER_SQL, item) + cursor.execute(GET_USER_ID_SQL, (each_doc['name'])) + + user_id = cursor.fetchone()['user_id'] + cursor.execute(DELETE_USER_FOCUS_VIDEO_SQL, (user_id)) + cursor.execute(DELETE_USER_FOCUS_AUTHOR_SQL, (user_id)) + if 'favoriteAid' in each_doc: + for each_aid in each_doc['favoriteAid']: + if each_aid == None or each_aid > 4294967295: + continue + item = {} + item['user_id'] = int(user_id) + item['video_id'] = int(each_aid) + cursor.execute(INSERT_USER_FOCUS_VIDEO_SQL, item) + if 'favoriteMid' in each_doc: + for each_mid in each_doc['favoriteMid']: + if each_mid == None or each_mid > 4294967295: + continue + item = {} + item['user_id'] = int(user_id) + item['author_id'] = int(each_mid) + cursor.execute(INSERT_USER_FOCUS_AUTHOR_SQL, item) diff --git a/run.py b/run.py index ea00ba9..db784be 100644 --- a/run.py +++ b/run.py @@ -7,59 +7,69 @@ import logging import threading + def site(): - Popen(["scrapy","crawl","site"]) + Popen(["scrapy", "crawl", "site"]) + def bangumi(): - Popen(["scrapy","crawl","bangumi"]) + Popen(["scrapy", "crawl", "bangumi"]) + def donghua(): - Popen(["scrapy","crawl","donghua"]) + Popen(["scrapy", "crawl", "donghua"]) + def update_author(): - Popen(["scrapy","crawl","authorUpdate"]) + Popen(["scrapy", "crawl", "authorUpdate"]) + def auto_add_author(): - Popen(["scrapy","crawl","authorAutoAdd"]) + Popen(["scrapy", "crawl", "authorAutoAdd"]) + def video_watcher(): - Popen(["scrapy","crawl","videoWatcher"]) + Popen(["scrapy", "crawl", "videoWatcher"]) + def video_spider(): - Popen(["scrapy","crawl","videoSpider"]) + Popen(["scrapy", "crawl", "videoSpider"]) + def video_spider_all(): - Popen(["scrapy","crawl","videoSpiderAll"]) + Popen(["scrapy", "crawl", "videoSpiderAll"]) + def online(): - Popen(['scrapy','crawl','online']) + Popen(['scrapy', 'crawl', 'online']) + def data_analyze(): - Popen(['python','run_analyzer.py']) + Popen(['python', 'run_analyzer.py']) + def bili_monthly_rank(): - Popen(['scrapy','crawl','biliMonthlyRank']) + Popen(['scrapy', 'crawl', 'biliMonthlyRank']) def run_threaded(job_func): - job_thread = threading.Thread(target=job_func) - job_thread.start() - -schedule.every().day.at('11:40').do(run_threaded,data_analyze) -schedule.every().day.at('01:00').do(run_threaded,update_author) -schedule.every().day.at('07:00').do(run_threaded,video_spider) -schedule.every().day.at('14:00').do(run_threaded,auto_add_author) -schedule.every().day.at('16:50').do(run_threaded,bangumi) -schedule.every().day.at('16:30').do(run_threaded,donghua) -schedule.every().day.at('22:00').do(run_threaded,video_watcher) 
-schedule.every().day.at('21:00').do(run_threaded,bili_monthly_rank) -schedule.every().week.do(run_threaded,video_spider_all) -schedule.every().hour.do(run_threaded,site) -schedule.every().minute.do(run_threaded,online) - + job_thread = threading.Thread(target=job_func) + job_thread.start() + + +schedule.every().day.at('11:40').do(run_threaded, data_analyze) +schedule.every().day.at('01:00').do(run_threaded, update_author) +schedule.every().day.at('07:00').do(run_threaded, video_spider) +schedule.every().day.at('14:00').do(run_threaded, auto_add_author) +schedule.every().day.at('16:50').do(run_threaded, bangumi) +schedule.every().day.at('16:30').do(run_threaded, donghua) +schedule.every().day.at('22:00').do(run_threaded, video_watcher) +schedule.every().day.at('21:00').do(run_threaded, bili_monthly_rank) +schedule.every().week.do(run_threaded, video_spider_all) +schedule.every().hour.do(run_threaded, site) +schedule.every().minute.do(run_threaded, online) print('开始运行计划任务..') while True: schedule.run_pending() time.sleep(60) - diff --git a/run_move.py b/run_move.py new file mode 100644 index 0000000..fbc6001 --- /dev/null +++ b/run_move.py @@ -0,0 +1 @@ +import biliob_to_mysql.move_data From b07cc138a50d6f8f46aa62644365f0fc83345d13 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 1 Feb 2019 21:05:32 +0800 Subject: [PATCH 127/469] move the table relating to video --- biliob_to_mysql/move_data.py | 65 +++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/biliob_to_mysql/move_data.py b/biliob_to_mysql/move_data.py index 88447ef..4569fb6 100644 --- a/biliob_to_mysql/move_data.py +++ b/biliob_to_mysql/move_data.py @@ -1,7 +1,8 @@ from db import cursor from db import db as mongodb from pymongo import ASCENDING - +import bson +import datetime mongo_user = mongodb['user'] mongo_video = mongodb['video'] mongo_author = mongodb['author'] @@ -41,6 +42,12 @@ """ +def translate_int64(item): + for each_key in item: + if type(item[each_key]) is bson.int64.Int64: + item[each_key] = int(item[each_key]) + + def move_user(): for each_doc in mongo_user.find().sort('_id', direction=ASCENDING): item = dict() @@ -75,3 +82,59 @@ def move_user(): item['user_id'] = int(user_id) item['author_id'] = int(each_mid) cursor.execute(INSERT_USER_FOCUS_AUTHOR_SQL, item) + + +# 视频相关 + +INSERT_VIDEO_SQL = """ +INSERT INTO `video` (`video_id`, `author_id`, `title`, `pic`, `is_observe`, `gmt_create`, `channel`, `subchannel`, `pub_datetime`) +VALUES (%(video_id)s, %(author_id)s, %(title)s, %(pic)s, %(is_observe)s, %(gen_time)s, %(channel)s, %(subchannel)s, %(pub_datetime)s) +ON DUPLICATE KEY UPDATE `title` = VALUES(`title`), `pic` = VALUES(`pic`), `is_observe` = VALUES(`is_observe`), `channel` = VALUES(`channel`), `subchannel` = VALUES(`subchannel`), `pub_datetime` = VALUES(`pub_datetime`); +""" + +INSERT_VIDEO_RECORD_SQL = """ +INSERT INTO `video_record` (`video_id`, `view`, `danmaku`, `favorite`, `coin`, `share`, `like`, `dislike`, `gmt_create`) +VALUES (%(video_id)s, %(view)s, %(danmaku)s, %(favorite)s, %(coin)s, %(share)s, %(like)s, %(dislike)s, %(gmt_create)s) +ON DUPLICATE KEY UPDATE +`video_id` = VALUES(`video_id`), +`view` = VALUES(`view`), +`danmaku` = VALUES(`danmaku`), +`favorite` = VALUES(`favorite`), +`coin` = VALUES(`coin`), +`share` = VALUES(`share`); +`like` = VALUES(`like`); +`dislike` = VALUES(`dislike`); +""" + + +def move_video(): + for each_doc in mongo_video.find().batch_size(8): + translate_int64(each_doc) + item = {} + item['video_id'] = each_doc['aid'] if 'aid' 
in each_doc else None + print(item['video_id']) + item['author_id'] = each_doc['mid'] if 'mid' in each_doc else None + item['title'] = each_doc['title'] if 'title' in each_doc else None + item['pic'] = each_doc['pic'] if 'pic' in each_doc else None + item['is_observe'] = each_doc['focus'] if 'focus' in each_doc else 1 + item['channel'] = each_doc['channel'] if 'channel' in each_doc else None + item['subchannel'] = each_doc['subChannel'] if 'subChannel' in each_doc else None + item['gen_time'] = each_doc.pop('_id').generation_time + item['pub_datetime'] = each_doc['datetime'] if 'datetime' in each_doc else None + cursor.execute(INSERT_VIDEO_SQL, item) + if 'data' in each_doc: + item_list = [] + for each_record in each_doc['data']: + translate_int64(each_record) + item = {} + item['video_id'] = each_doc['aid'] if 'aid' in each_doc else None + item['view'] = each_record['view'] if 'view' in each_record else None + item['danmaku'] = each_record['danmaku'] if 'danmaku' in each_record else None + item['favorite'] = each_record['favorite'] if 'favorite' in each_record else None + item['coin'] = each_record['coin'] if 'coin' in each_record else None + item['share'] = each_record['share'] if 'share' in each_record else None + item['like'] = each_record['like'] if 'like' in each_record else None + item['dislike'] = each_record['dislike'] if 'dislike' in each_record else None + item['gmt_create'] = each_record['datetime'] if 'datetime' in each_record else None + item_list.append(item) + cursor.executemany(INSERT_VIDEO_RECORD_SQL, item_list)
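A note on the INSERT_VIDEO_RECORD_SQL constant added in the hunk above: the ON DUPLICATE KEY UPDATE list terminates the `share`, `like` and `dislike` assignments with semicolons instead of commas, so MySQL rejects the whole statement with a syntax error on every execute, not only when a duplicate key is hit. A minimal corrected sketch, keeping the table and column names from the patch and changing only the punctuation:

# Sketch of a corrected INSERT_VIDEO_RECORD_SQL: every assignment in the
# ON DUPLICATE KEY UPDATE list is separated by a comma, and only the
# statement itself ends with a semicolon.
INSERT_VIDEO_RECORD_SQL = """
INSERT INTO `video_record` (`video_id`, `view`, `danmaku`, `favorite`, `coin`, `share`, `like`, `dislike`, `gmt_create`)
VALUES (%(video_id)s, %(view)s, %(danmaku)s, %(favorite)s, %(coin)s, %(share)s, %(like)s, %(dislike)s, %(gmt_create)s)
ON DUPLICATE KEY UPDATE
`video_id` = VALUES(`video_id`),
`view` = VALUES(`view`),
`danmaku` = VALUES(`danmaku`),
`favorite` = VALUES(`favorite`),
`coin` = VALUES(`coin`),
`share` = VALUES(`share`),
`like` = VALUES(`like`),
`dislike` = VALUES(`dislike`);
"""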
From 311716294818f69781dd5f5065e8ab6cf9a88959 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 2 Feb 2019 00:21:07 +0800 Subject: [PATCH 129/469] feature: strong observe spider --- biliob_spider/items.py | 49 ++++- biliob_spider/pipelines.py | 76 +++++++ biliob_spider/spiders/author_update.py | 2 +- biliob_spider/spiders/strong_focus.py | 285 +++++++++++++++++++++++++ biliob_spider/spiders/video_spider.py | 2 - run.py | 68 +++--- 6 files changed, 451 insertions(+), 31 deletions(-) create mode 100644 biliob_spider/spiders/strong_focus.py diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 39ce9ec..490dfe6 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -8,18 +8,22 @@ import scrapy from mail import mailer + class SiteItem(scrapy.Item): region_count = scrapy.Field() all_count = scrapy.Field() web_online = scrapy.Field() play_online = scrapy.Field() + class TagItem(scrapy.Item): tag_id = scrapy.Field() tag_name = scrapy.Field() use = scrapy.Field() atten = scrapy.Field() ctime = scrapy.Field() + + class BangumiItem(scrapy.Item): title = scrapy.Field() tag = scrapy.Field() @@ -30,6 +34,43 @@ class BangumiItem(scrapy.Item): newest_ep_index = scrapy.Field() data = scrapy.Field() + +class VideoAndAuthorItem(scrapy.Item): + mid = scrapy.Field() + name = scrapy.Field() + face = scrapy.Field() + official = scrapy.Field() + sex = scrapy.Field() + data_video = scrapy.Field() + data_author = scrapy.Field() + level = scrapy.Field() + focus = scrapy.Field() + pts = scrapy.Field() + c_fans = scrapy.Field() + c_attention = scrapy.Field() + c_archive = scrapy.Field() + c_article = 
scrapy.Field() + c_archive_view = scrapy.Field() + c_article_view = scrapy.Field() + c_datetime = scrapy.Field() + channel = scrapy.Field() + aid = scrapy.Field() + datetime = scrapy.Field() + author = scrapy.Field() + data = scrapy.Field() + subChannel = scrapy.Field() + title = scrapy.Field() + mid = scrapy.Field() + pic = scrapy.Field() + current_view = scrapy.Field() + current_favorite = scrapy.Field() + current_danmaku = scrapy.Field() + current_coin = scrapy.Field() + current_share = scrapy.Field() + current_like = scrapy.Field() + current_datetime = scrapy.Field() + + class VideoItem(scrapy.Item): channel = scrapy.Field() aid = scrapy.Field() @@ -48,6 +89,7 @@ class VideoItem(scrapy.Item): current_like = scrapy.Field() current_datetime = scrapy.Field() + class AuthorItem(scrapy.Item): mid = scrapy.Field() name = scrapy.Field() @@ -66,6 +108,7 @@ class AuthorItem(scrapy.Item): c_article_view = scrapy.Field() c_datetime = scrapy.Field() + class RankItem(scrapy.Item): title = scrapy.Field() author = scrapy.Field() @@ -74,6 +117,7 @@ class RankItem(scrapy.Item): mid = scrapy.Field() channel = scrapy.Field() + class VideoOnline(scrapy.Item): title = scrapy.Field() author = scrapy.Field() @@ -81,8 +125,9 @@ class VideoOnline(scrapy.Item): aid = scrapy.Field() subChannel = scrapy.Field() channel = scrapy.Field() - + + class VideoWatcherItem(scrapy.Item): mid = scrapy.Field() aid = scrapy.Field() - channels = scrapy.Field() \ No newline at end of file + channels = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 1678ea2..da2d371 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -11,6 +11,80 @@ import logging +class StrongPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + + def process_item(self, item, spider): + try: + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'cView': item['current_view'], + 'cFavorite': item['current_favorite'], + 'cDanmaku': item['current_danmaku'], + 'cCoin': item['current_coin'], + 'cShare': item['current_share'], + 'cLike': item['current_like'], + 'cDatetime': item['current_datetime'], + 'author': item['author'], + 'subChannel': item['subChannel'], + 'channel': item['channel'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': datetime.datetime.fromtimestamp( + item['datetime']) + }, + '$push': { + 'data': { + '$each': [item['data_video']], + '$position': 0 + } + } + }, True) + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) + try: + self.coll = self.db['author'] # 获得collection的句柄 + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'focus': True, + 'sex': item['sex'], + 'name': item['name'], + 'face': item['face'], + 'level': item['level'], + 'cFans': item['c_fans'], + 'official': item['official'], + 'cArchive': item['c_archive'], + 'cArticle': item['c_article'], + 'cAttention': item['c_attention'], + 'cArchive_view': item['c_archive_view'], + 'cArticle_view': item['c_article_view'], + }, + '$push': { + 'data': { + '$each': [item['data_author']], + '$position': 0 + } + } + }, True) + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) + return item + + class VideoPipeline(object): def __init__(self): # 链接mongoDB @@ -55,6 +129,7 @@ def process_item(self, item, 
spider): # 出现错误时打印错误日志 logging.error(error) + class VideoPipelineFromKan(object): def __init__(self): # 链接mongoDB @@ -89,6 +164,7 @@ def process_item(self, item, spider): # 出现错误时打印错误日志 logging.error(error) + class BangumiPipeLine(object): def __init__(self): # 链接mongoDB diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 8b819fd..399d341 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -1,4 +1,4 @@ -#coding=utf-8 +# coding=utf-8 import scrapy from mail import mailer from scrapy.http import Request diff --git a/biliob_spider/spiders/strong_focus.py b/biliob_spider/spiders/strong_focus.py new file mode 100644 index 0000000..8d353eb --- /dev/null +++ b/biliob_spider/spiders/strong_focus.py @@ -0,0 +1,285 @@ +# coding=utf-8 +import scrapy +from mail import mailer +from scrapy.http import Request +from biliob_spider.items import VideoAndAuthorItem +import time +import json +import logging +from pymongo import MongoClient +import datetime + +sub_channel_2_channel = { + 'ASMR': '生活', + 'GMV': '游戏', + 'Korea相关': '娱乐', + 'MAD·AMV': '动画', + 'MMD·3D': '动画', + 'Mugen': '游戏', + 'OP/ED/OST': '音乐', + 'VOCALOID·UTAU': '音乐', + '三次元舞蹈': '舞蹈', + '三次元音乐': '音乐', + '人力VOCALOID': '鬼畜', + '人文·历史': '纪录片', + '健身': '时尚', + '其他': '生活', + '其他国家': '电影', + '军事': '纪录片', + '动物圈': '生活', + '华语电影': '电影', + '单机游戏': '游戏', + '原创音乐': '音乐', + '国产剧': '电视剧', + '国产动画': '国创', + '国产原创相关': '国创', + '宅舞': '舞蹈', + '完结动画': '番剧', + '官方延伸': '番剧', + '布袋戏': '国创', + '广告': '广告', + '影视剪辑': '影视', + '影视杂谈': '影视', + '手工': '生活', + '手机游戏': '游戏', + '搞笑': '生活', + '教程演示': '鬼畜', + '数码': '数码', + '日常': '生活', + '明星': '娱乐', + '星海': '科技', + '服饰': '时尚', + '机械': '科技', + '桌游棋牌': '游戏', + '欧美电影': '电影', + '汽车': '科技', + '游戏': '游戏', + '海外剧': '电视剧', + '演奏': '音乐', + '演讲·公开课': '科技', + '特摄': '影视', + '电子竞技': '游戏', + '短片': '影视', + '短片·手书·配音': '动画', + '社会·美食·旅行': '纪录片', + '科学·探索·自然': '纪录片', + '绘画': '生活', + '综合': '动画', + '综艺': '娱乐', + '网络游戏': '游戏', + '美妆': '时尚', + '美食圈': '生活', + '翻唱': '音乐', + '舞蹈教程': '舞蹈', + '资讯': '国创', + '趣味科普人文': '科技', + '运动': '生活', + '连载动画': '番剧', + '野生技术协会': '科技', + '音MAD': '鬼畜', + '音乐选集': '音乐', + '音游': '游戏', + '预告 资讯': '影视', + '预告·资讯': '影视', + '单机联机': '游戏', + '鬼畜调教': '鬼畜', + '演讲• 公开课': '科技', + '国产电影': '电影', + '日本电影': '电影', + '番剧': '番剧', + '国创': '国创', + '鬼畜': '鬼畜', + '电视剧': '电视剧', + '动画': '动画', + '时尚': '时尚', + '娱乐': '娱乐', + '电影': '电影', + '舞蹈': '舞蹈', + '科技': '科技', + '生活': '生活', + '音乐': '音乐', + '纪录片': '纪录片', + '手机平板': '数码', + '电脑装机': '数码', + '影音智能': '数码', + '摄影摄像': '数码', + '风尚标': '时尚', + '电音': '音乐', + '音乐综合': '音乐', + 'MV': '音乐', + '音乐现场': '音乐', + 'T台': '时尚', +} + + +class StrongSpider(scrapy.spiders.Spider): + name = "strong" + allowed_domains = ["bilibili.com"] + start_urls = ['https://www.bilibili.com/video/online.html'] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.StrongPipeline': 300 + }, + 'DOWNLOAD_DELAY': 2 + } + + def parse(self, response): + try: + video_list = response.xpath('//*[@id="app"]/div[2]/div[2]/div') + # 为了爬取分区、粉丝数等数据,需要进入每一个视频的详情页面进行抓取 + href_list = video_list.xpath('./a/@href').extract() + for i in range(len(href_list)): + # 为了爬取分区等数据,需要进入每一个视频的详情页面进行抓取 + yield Request( + "https://api.bilibili.com/x/article/archives?ids=" + + href_list[i][9:-1], + callback=self.detailParse) + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}".format(response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + 
logging.error(error) + + def detailParse(self, response): + try: + r = json.loads(response.body) + d = r["data"] + keys = list(d.keys()) + for each_key in keys: + + aid = d[each_key]['stat']['aid'] + author = d[each_key]['owner']['name'] + mid = d[each_key]['owner']['mid'] + view = d[each_key]['stat']['view'] + favorite = d[each_key]['stat']['favorite'] + danmaku = d[each_key]['stat']['danmaku'] + coin = d[each_key]['stat']['coin'] + share = d[each_key]['stat']['share'] + like = d[each_key]['stat']['like'] + current_date = datetime.datetime.now() + data = { + 'view': view, + 'favorite': favorite, + 'danmaku': danmaku, + 'coin': coin, + 'share': share, + 'like': like, + 'datetime': current_date + } + + subChannel = d[each_key]['tname'] + title = d[each_key]['title'] + date = d[each_key]['pubdate'] + tid = d[each_key]['tid'] + pic = d[each_key]['pic'] + item = VideoAndAuthorItem() + item['current_view'] = view + item['current_favorite'] = favorite + item['current_danmaku'] = danmaku + item['current_coin'] = coin + item['current_share'] = share + item['current_like'] = like + item['current_datetime'] = current_date + item['aid'] = aid + item['mid'] = mid + item['pic'] = pic + item['author'] = author + item['data_video'] = data + item['title'] = title + item['subChannel'] = subChannel + item['datetime'] = date + + if subChannel != '': + item['channel'] = sub_channel_2_channel[subChannel] + elif subChannel == '资讯': + if tid == 51: + item['channel'] == '番剧' + if tid == 170: + item['channel'] == '国创' + if tid == 159: + item['channel'] == '娱乐' + else: + item['channel'] = None + yield Request( + "https://api.bilibili.com/x/web-interface/card?mid=" + + str(mid), meta={'item': item}, + method='GET', callback=self.authorParse) + + except Exception as error: + # 出现错误时打印错误日志 + if r['code'] == -404: + return + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(item) + logging.error(response.url) + logging.error(error) + + def authorParse(self, response): + try: + item = response.meta['item'] + j = json.loads(response.body) + name = j['data']['card']['name'] + mid = j['data']['card']['mid'] + sex = j['data']['card']['sex'] + face = j['data']['card']['face'] + fans = j['data']['card']['fans'] + attention = j['data']['card']['attention'] + level = j['data']['card']['level_info']['current_level'] + official = j['data']['card']['Official']['title'] + archive = j['data']['archive_count'] + article = j['data']['article_count'] + face = j['data']['card']['face'] + item['mid'] = int(mid) + item['name'] = name + item['face'] = face + item['official'] = official + item['sex'] = sex + item['level'] = int(level) + item['data_author'] = { + 'fans': int(fans), + 'attention': int(attention), + 'archive': int(archive), + 'article': int(article), + 'datetime': datetime.datetime.now() + } + item['c_fans'] = int(fans) + item['c_attention'] = int(attention) + item['c_archive'] = int(archive) + item['c_article'] = int(article) + yield Request( + "https://api.bilibili.com/x/space/upstat?mid={mid}".format( + mid=str(mid)), + meta={'item': item}, + method='GET', + callback=self.parse_view) + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) + + def parse_view(self, response): + j = 
json.loads(response.body) + archive_view = j['data']['archive']['view'] + article_view = j['data']['article']['view'] + item = response.meta['item'] + item['data_author']['archiveView'] = archive_view + item['data_author']['articleView'] = article_view + item['c_archive_view'] = int(archive_view) + item['c_article_view'] = int(article_view) + yield item diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index b70c3d3..aa54a53 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -135,9 +135,7 @@ def __init__(self): def start_requests(self): # 只需要aid c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {'aid': 1}) - x = 0 - aid_list = [] for each_doc in c: x = x + 1 diff --git a/run.py b/run.py index ea00ba9..1fa68df 100644 --- a/run.py +++ b/run.py @@ -7,59 +7,75 @@ import logging import threading + def site(): - Popen(["scrapy","crawl","site"]) + Popen(["scrapy", "crawl", "site"]) + def bangumi(): - Popen(["scrapy","crawl","bangumi"]) + Popen(["scrapy", "crawl", "bangumi"]) + def donghua(): - Popen(["scrapy","crawl","donghua"]) + Popen(["scrapy", "crawl", "donghua"]) + def update_author(): - Popen(["scrapy","crawl","authorUpdate"]) + Popen(["scrapy", "crawl", "authorUpdate"]) + def auto_add_author(): - Popen(["scrapy","crawl","authorAutoAdd"]) + Popen(["scrapy", "crawl", "authorAutoAdd"]) + def video_watcher(): - Popen(["scrapy","crawl","videoWatcher"]) + Popen(["scrapy", "crawl", "videoWatcher"]) + def video_spider(): - Popen(["scrapy","crawl","videoSpider"]) + Popen(["scrapy", "crawl", "videoSpider"]) + def video_spider_all(): - Popen(["scrapy","crawl","videoSpiderAll"]) + Popen(["scrapy", "crawl", "videoSpiderAll"]) + def online(): - Popen(['scrapy','crawl','online']) + Popen(['scrapy', 'crawl', 'online']) + + +def strong(): + Popen(['scrapy', 'crawl', 'strong']) + def data_analyze(): - Popen(['python','run_analyzer.py']) + Popen(['python', 'run_analyzer.py']) + def bili_monthly_rank(): - Popen(['scrapy','crawl','biliMonthlyRank']) + Popen(['scrapy', 'crawl', 'biliMonthlyRank']) def run_threaded(job_func): - job_thread = threading.Thread(target=job_func) - job_thread.start() - -schedule.every().day.at('11:40').do(run_threaded,data_analyze) -schedule.every().day.at('01:00').do(run_threaded,update_author) -schedule.every().day.at('07:00').do(run_threaded,video_spider) -schedule.every().day.at('14:00').do(run_threaded,auto_add_author) -schedule.every().day.at('16:50').do(run_threaded,bangumi) -schedule.every().day.at('16:30').do(run_threaded,donghua) -schedule.every().day.at('22:00').do(run_threaded,video_watcher) -schedule.every().day.at('21:00').do(run_threaded,bili_monthly_rank) -schedule.every().week.do(run_threaded,video_spider_all) -schedule.every().hour.do(run_threaded,site) -schedule.every().minute.do(run_threaded,online) + job_thread = threading.Thread(target=job_func) + job_thread.start() + + +schedule.every().day.at('11:40').do(run_threaded, data_analyze) +schedule.every().day.at('01:00').do(run_threaded, update_author) +schedule.every().day.at('07:00').do(run_threaded, video_spider) +schedule.every().day.at('14:00').do(run_threaded, auto_add_author) +schedule.every().day.at('16:50').do(run_threaded, bangumi) +schedule.every().day.at('16:30').do(run_threaded, donghua) +schedule.every().day.at('22:00').do(run_threaded, video_watcher) +schedule.every().day.at('21:00').do(run_threaded, bili_monthly_rank) +schedule.every().week.do(run_threaded, video_spider_all) 
+schedule.every().hour.do(run_threaded, site) +schedule.every(5).minutes.do(run_threaded, online) +schedule.every().minute.do(run_threaded, strong) print('开始运行计划任务..') while True: schedule.run_pending() time.sleep(60) - From 554b5e7cd820fed0c08f327db4e8f23a2809d70b Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 2 Feb 2019 00:21:07 +0800 Subject: [PATCH 130/469] feature: strong observe spider --- .vscode/launch.json | 67 ++++++ .vscode/settings.json | 0 biliob_spider/items.py | 49 ++++- biliob_spider/pipelines.py | 76 +++++++ biliob_spider/spiders/author_update.py | 2 +- biliob_spider/spiders/strong_focus.py | 285 +++++++++++++++++++++++++ biliob_spider/spiders/video_spider.py | 2 - run.py | 68 +++--- 8 files changed, 518 insertions(+), 31 deletions(-) create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json create mode 100644 biliob_spider/spiders/strong_focus.py diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..dd7901a --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,67 @@ +{ + // 使用 IntelliSense 了解相关属性。 + // 悬停以查看现有属性的描述。 + // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [{ + "name": "Python: Current File (Integrated Terminal)", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal" + }, + { + "name": "Python: Remote Attach", + "type": "python", + "request": "attach", + "port": 5678, + "host": "localhost", + "pathMappings": [{ + "localRoot": "${workspaceFolder}", + "remoteRoot": "." + }] + }, + { + "name": "Python: Module", + "type": "python", + "request": "launch", + "module": "enter-your-module-name-here", + "console": "integratedTerminal" + }, + { + "name": "Python: Django", + "type": "python", + "request": "launch", + "program": "${workspaceFolder}/manage.py", + "console": "integratedTerminal", + "args": [ + "runserver", + "--noreload", + "--nothreading" + ], + "django": true + }, + { + "name": "Python: Flask", + "type": "python", + "request": "launch", + "module": "flask", + "env": { + "FLASK_APP": "app.py" + }, + "args": [ + "run", + "--no-debugger", + "--no-reload" + ], + "jinja": true + }, + { + "name": "Python: Current File (External Terminal)", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "externalTerminal" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..e69de29 diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 39ce9ec..490dfe6 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -8,18 +8,22 @@ import scrapy from mail import mailer + class SiteItem(scrapy.Item): region_count = scrapy.Field() all_count = scrapy.Field() web_online = scrapy.Field() play_online = scrapy.Field() + class TagItem(scrapy.Item): tag_id = scrapy.Field() tag_name = scrapy.Field() use = scrapy.Field() atten = scrapy.Field() ctime = scrapy.Field() + + class BangumiItem(scrapy.Item): title = scrapy.Field() tag = scrapy.Field() @@ -30,6 +34,43 @@ class BangumiItem(scrapy.Item): newest_ep_index = scrapy.Field() data = scrapy.Field() + +class VideoAndAuthorItem(scrapy.Item): + mid = scrapy.Field() + name = scrapy.Field() + face = scrapy.Field() + official = scrapy.Field() + sex = scrapy.Field() + data_video = scrapy.Field() + data_author = scrapy.Field() + level = scrapy.Field() + focus = scrapy.Field() + pts = scrapy.Field() + c_fans = scrapy.Field() + c_attention = scrapy.Field() + 
c_archive = scrapy.Field() + c_article = scrapy.Field() + c_archive_view = scrapy.Field() + c_article_view = scrapy.Field() + c_datetime = scrapy.Field() + channel = scrapy.Field() + aid = scrapy.Field() + datetime = scrapy.Field() + author = scrapy.Field() + data = scrapy.Field() + subChannel = scrapy.Field() + title = scrapy.Field() + mid = scrapy.Field() + pic = scrapy.Field() + current_view = scrapy.Field() + current_favorite = scrapy.Field() + current_danmaku = scrapy.Field() + current_coin = scrapy.Field() + current_share = scrapy.Field() + current_like = scrapy.Field() + current_datetime = scrapy.Field() + + class VideoItem(scrapy.Item): channel = scrapy.Field() aid = scrapy.Field() @@ -48,6 +89,7 @@ class VideoItem(scrapy.Item): current_like = scrapy.Field() current_datetime = scrapy.Field() + class AuthorItem(scrapy.Item): mid = scrapy.Field() name = scrapy.Field() @@ -66,6 +108,7 @@ class AuthorItem(scrapy.Item): c_article_view = scrapy.Field() c_datetime = scrapy.Field() + class RankItem(scrapy.Item): title = scrapy.Field() author = scrapy.Field() @@ -74,6 +117,7 @@ class RankItem(scrapy.Item): mid = scrapy.Field() channel = scrapy.Field() + class VideoOnline(scrapy.Item): title = scrapy.Field() author = scrapy.Field() @@ -81,8 +125,9 @@ class VideoOnline(scrapy.Item): aid = scrapy.Field() subChannel = scrapy.Field() channel = scrapy.Field() - + + class VideoWatcherItem(scrapy.Item): mid = scrapy.Field() aid = scrapy.Field() - channels = scrapy.Field() \ No newline at end of file + channels = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 1678ea2..da2d371 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -11,6 +11,80 @@ import logging +class StrongPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + + def process_item(self, item, spider): + try: + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'cView': item['current_view'], + 'cFavorite': item['current_favorite'], + 'cDanmaku': item['current_danmaku'], + 'cCoin': item['current_coin'], + 'cShare': item['current_share'], + 'cLike': item['current_like'], + 'cDatetime': item['current_datetime'], + 'author': item['author'], + 'subChannel': item['subChannel'], + 'channel': item['channel'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': datetime.datetime.fromtimestamp( + item['datetime']) + }, + '$push': { + 'data': { + '$each': [item['data_video']], + '$position': 0 + } + } + }, True) + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) + try: + self.coll = self.db['author'] # 获得collection的句柄 + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'focus': True, + 'sex': item['sex'], + 'name': item['name'], + 'face': item['face'], + 'level': item['level'], + 'cFans': item['c_fans'], + 'official': item['official'], + 'cArchive': item['c_archive'], + 'cArticle': item['c_article'], + 'cAttention': item['c_attention'], + 'cArchive_view': item['c_archive_view'], + 'cArticle_view': item['c_article_view'], + }, + '$push': { + 'data': { + '$each': [item['data_author']], + '$position': 0 + } + } + }, True) + except Exception as error: + # 出现错误时打印错误日志 + logging.error(error) + return item + + class VideoPipeline(object): def __init__(self): # 链接mongoDB @@ -55,6 
+129,7 @@ def process_item(self, item, spider): # 出现错误时打印错误日志 logging.error(error) + class VideoPipelineFromKan(object): def __init__(self): # 链接mongoDB @@ -89,6 +164,7 @@ def process_item(self, item, spider): # 出现错误时打印错误日志 logging.error(error) + class BangumiPipeLine(object): def __init__(self): # 链接mongoDB diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 8b819fd..399d341 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -1,4 +1,4 @@ -#coding=utf-8 +# coding=utf-8 import scrapy from mail import mailer from scrapy.http import Request diff --git a/biliob_spider/spiders/strong_focus.py b/biliob_spider/spiders/strong_focus.py new file mode 100644 index 0000000..8d353eb --- /dev/null +++ b/biliob_spider/spiders/strong_focus.py @@ -0,0 +1,285 @@ +# coding=utf-8 +import scrapy +from mail import mailer +from scrapy.http import Request +from biliob_spider.items import VideoAndAuthorItem +import time +import json +import logging +from pymongo import MongoClient +import datetime + +sub_channel_2_channel = { + 'ASMR': '生活', + 'GMV': '游戏', + 'Korea相关': '娱乐', + 'MAD·AMV': '动画', + 'MMD·3D': '动画', + 'Mugen': '游戏', + 'OP/ED/OST': '音乐', + 'VOCALOID·UTAU': '音乐', + '三次元舞蹈': '舞蹈', + '三次元音乐': '音乐', + '人力VOCALOID': '鬼畜', + '人文·历史': '纪录片', + '健身': '时尚', + '其他': '生活', + '其他国家': '电影', + '军事': '纪录片', + '动物圈': '生活', + '华语电影': '电影', + '单机游戏': '游戏', + '原创音乐': '音乐', + '国产剧': '电视剧', + '国产动画': '国创', + '国产原创相关': '国创', + '宅舞': '舞蹈', + '完结动画': '番剧', + '官方延伸': '番剧', + '布袋戏': '国创', + '广告': '广告', + '影视剪辑': '影视', + '影视杂谈': '影视', + '手工': '生活', + '手机游戏': '游戏', + '搞笑': '生活', + '教程演示': '鬼畜', + '数码': '数码', + '日常': '生活', + '明星': '娱乐', + '星海': '科技', + '服饰': '时尚', + '机械': '科技', + '桌游棋牌': '游戏', + '欧美电影': '电影', + '汽车': '科技', + '游戏': '游戏', + '海外剧': '电视剧', + '演奏': '音乐', + '演讲·公开课': '科技', + '特摄': '影视', + '电子竞技': '游戏', + '短片': '影视', + '短片·手书·配音': '动画', + '社会·美食·旅行': '纪录片', + '科学·探索·自然': '纪录片', + '绘画': '生活', + '综合': '动画', + '综艺': '娱乐', + '网络游戏': '游戏', + '美妆': '时尚', + '美食圈': '生活', + '翻唱': '音乐', + '舞蹈教程': '舞蹈', + '资讯': '国创', + '趣味科普人文': '科技', + '运动': '生活', + '连载动画': '番剧', + '野生技术协会': '科技', + '音MAD': '鬼畜', + '音乐选集': '音乐', + '音游': '游戏', + '预告 资讯': '影视', + '预告·资讯': '影视', + '单机联机': '游戏', + '鬼畜调教': '鬼畜', + '演讲• 公开课': '科技', + '国产电影': '电影', + '日本电影': '电影', + '番剧': '番剧', + '国创': '国创', + '鬼畜': '鬼畜', + '电视剧': '电视剧', + '动画': '动画', + '时尚': '时尚', + '娱乐': '娱乐', + '电影': '电影', + '舞蹈': '舞蹈', + '科技': '科技', + '生活': '生活', + '音乐': '音乐', + '纪录片': '纪录片', + '手机平板': '数码', + '电脑装机': '数码', + '影音智能': '数码', + '摄影摄像': '数码', + '风尚标': '时尚', + '电音': '音乐', + '音乐综合': '音乐', + 'MV': '音乐', + '音乐现场': '音乐', + 'T台': '时尚', +} + + +class StrongSpider(scrapy.spiders.Spider): + name = "strong" + allowed_domains = ["bilibili.com"] + start_urls = ['https://www.bilibili.com/video/online.html'] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.StrongPipeline': 300 + }, + 'DOWNLOAD_DELAY': 2 + } + + def parse(self, response): + try: + video_list = response.xpath('//*[@id="app"]/div[2]/div[2]/div') + # 为了爬取分区、粉丝数等数据,需要进入每一个视频的详情页面进行抓取 + href_list = video_list.xpath('./a/@href').extract() + for i in range(len(href_list)): + # 为了爬取分区等数据,需要进入每一个视频的详情页面进行抓取 + yield Request( + "https://api.bilibili.com/x/article/archives?ids=" + + href_list[i][9:-1], + callback=self.detailParse) + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}".format(response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + 
logging.error(response.url) + logging.error(error) + + def detailParse(self, response): + try: + r = json.loads(response.body) + d = r["data"] + keys = list(d.keys()) + for each_key in keys: + + aid = d[each_key]['stat']['aid'] + author = d[each_key]['owner']['name'] + mid = d[each_key]['owner']['mid'] + view = d[each_key]['stat']['view'] + favorite = d[each_key]['stat']['favorite'] + danmaku = d[each_key]['stat']['danmaku'] + coin = d[each_key]['stat']['coin'] + share = d[each_key]['stat']['share'] + like = d[each_key]['stat']['like'] + current_date = datetime.datetime.now() + data = { + 'view': view, + 'favorite': favorite, + 'danmaku': danmaku, + 'coin': coin, + 'share': share, + 'like': like, + 'datetime': current_date + } + + subChannel = d[each_key]['tname'] + title = d[each_key]['title'] + date = d[each_key]['pubdate'] + tid = d[each_key]['tid'] + pic = d[each_key]['pic'] + item = VideoAndAuthorItem() + item['current_view'] = view + item['current_favorite'] = favorite + item['current_danmaku'] = danmaku + item['current_coin'] = coin + item['current_share'] = share + item['current_like'] = like + item['current_datetime'] = current_date + item['aid'] = aid + item['mid'] = mid + item['pic'] = pic + item['author'] = author + item['data_video'] = data + item['title'] = title + item['subChannel'] = subChannel + item['datetime'] = date + + if subChannel != '': + item['channel'] = sub_channel_2_channel[subChannel] + elif subChannel == '资讯': + if tid == 51: + item['channel'] == '番剧' + if tid == 170: + item['channel'] == '国创' + if tid == 159: + item['channel'] == '娱乐' + else: + item['channel'] = None + yield Request( + "https://api.bilibili.com/x/web-interface/card?mid=" + + str(mid), meta={'item': item}, + method='GET', callback=self.authorParse) + + except Exception as error: + # 出现错误时打印错误日志 + if r['code'] == -404: + return + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(item) + logging.error(response.url) + logging.error(error) + + def authorParse(self, response): + try: + item = response.meta['item'] + j = json.loads(response.body) + name = j['data']['card']['name'] + mid = j['data']['card']['mid'] + sex = j['data']['card']['sex'] + face = j['data']['card']['face'] + fans = j['data']['card']['fans'] + attention = j['data']['card']['attention'] + level = j['data']['card']['level_info']['current_level'] + official = j['data']['card']['Official']['title'] + archive = j['data']['archive_count'] + article = j['data']['article_count'] + face = j['data']['card']['face'] + item['mid'] = int(mid) + item['name'] = name + item['face'] = face + item['official'] = official + item['sex'] = sex + item['level'] = int(level) + item['data_author'] = { + 'fans': int(fans), + 'attention': int(attention), + 'archive': int(archive), + 'article': int(article), + 'datetime': datetime.datetime.now() + } + item['c_fans'] = int(fans) + item['c_attention'] = int(attention) + item['c_archive'] = int(archive) + item['c_article'] = int(article) + yield Request( + "https://api.bilibili.com/x/space/upstat?mid={mid}".format( + mid=str(mid)), + meta={'item': item}, + method='GET', + callback=self.parse_view) + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) + + def parse_view(self, 
response): + j = json.loads(response.body) + archive_view = j['data']['archive']['view'] + article_view = j['data']['article']['view'] + item = response.meta['item'] + item['data_author']['archiveView'] = archive_view + item['data_author']['articleView'] = article_view + item['c_archive_view'] = int(archive_view) + item['c_article_view'] = int(article_view) + yield item diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index b70c3d3..aa54a53 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -135,9 +135,7 @@ def __init__(self): def start_requests(self): # 只需要aid c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {'aid': 1}) - x = 0 - aid_list = [] for each_doc in c: x = x + 1 diff --git a/run.py b/run.py index ea00ba9..1fa68df 100644 --- a/run.py +++ b/run.py @@ -7,59 +7,75 @@ import logging import threading + def site(): - Popen(["scrapy","crawl","site"]) + Popen(["scrapy", "crawl", "site"]) + def bangumi(): - Popen(["scrapy","crawl","bangumi"]) + Popen(["scrapy", "crawl", "bangumi"]) + def donghua(): - Popen(["scrapy","crawl","donghua"]) + Popen(["scrapy", "crawl", "donghua"]) + def update_author(): - Popen(["scrapy","crawl","authorUpdate"]) + Popen(["scrapy", "crawl", "authorUpdate"]) + def auto_add_author(): - Popen(["scrapy","crawl","authorAutoAdd"]) + Popen(["scrapy", "crawl", "authorAutoAdd"]) + def video_watcher(): - Popen(["scrapy","crawl","videoWatcher"]) + Popen(["scrapy", "crawl", "videoWatcher"]) + def video_spider(): - Popen(["scrapy","crawl","videoSpider"]) + Popen(["scrapy", "crawl", "videoSpider"]) + def video_spider_all(): - Popen(["scrapy","crawl","videoSpiderAll"]) + Popen(["scrapy", "crawl", "videoSpiderAll"]) + def online(): - Popen(['scrapy','crawl','online']) + Popen(['scrapy', 'crawl', 'online']) + + +def strong(): + Popen(['scrapy', 'crawl', 'strong']) + def data_analyze(): - Popen(['python','run_analyzer.py']) + Popen(['python', 'run_analyzer.py']) + def bili_monthly_rank(): - Popen(['scrapy','crawl','biliMonthlyRank']) + Popen(['scrapy', 'crawl', 'biliMonthlyRank']) def run_threaded(job_func): - job_thread = threading.Thread(target=job_func) - job_thread.start() - -schedule.every().day.at('11:40').do(run_threaded,data_analyze) -schedule.every().day.at('01:00').do(run_threaded,update_author) -schedule.every().day.at('07:00').do(run_threaded,video_spider) -schedule.every().day.at('14:00').do(run_threaded,auto_add_author) -schedule.every().day.at('16:50').do(run_threaded,bangumi) -schedule.every().day.at('16:30').do(run_threaded,donghua) -schedule.every().day.at('22:00').do(run_threaded,video_watcher) -schedule.every().day.at('21:00').do(run_threaded,bili_monthly_rank) -schedule.every().week.do(run_threaded,video_spider_all) -schedule.every().hour.do(run_threaded,site) -schedule.every().minute.do(run_threaded,online) + job_thread = threading.Thread(target=job_func) + job_thread.start() + + +schedule.every().day.at('11:40').do(run_threaded, data_analyze) +schedule.every().day.at('01:00').do(run_threaded, update_author) +schedule.every().day.at('07:00').do(run_threaded, video_spider) +schedule.every().day.at('14:00').do(run_threaded, auto_add_author) +schedule.every().day.at('16:50').do(run_threaded, bangumi) +schedule.every().day.at('16:30').do(run_threaded, donghua) +schedule.every().day.at('22:00').do(run_threaded, video_watcher) +schedule.every().day.at('21:00').do(run_threaded, bili_monthly_rank) +schedule.every().week.do(run_threaded, video_spider_all) 
+schedule.every().hour.do(run_threaded, site) +schedule.every(5).minutes.do(run_threaded, online) +schedule.every().minute.do(run_threaded, strong) print('开始运行计划任务..') while True: schedule.run_pending() time.sleep(60) -
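One behavioural note on the strong-focus spider introduced above: in detailParse the elif branch for the 资讯 sub-channel can never run, because the preceding check subChannel != '' already matches every non-empty name, and inside that dead branch item['channel'] == '番剧' compares instead of assigns, so nothing is stored. A hedged sketch of the apparent intent follows; the helper name resolve_channel is hypothetical, while the mapping dict, the tid values 51/170/159 and the 资讯 -> 国创 default are taken from the patch above.

def resolve_channel(sub_channel, tid, mapping):
    # 资讯 appears under several parent channels on bilibili, so it is
    # disambiguated by the numeric tid; the default mirrors the dict's
    # own 资讯 -> 国创 entry.
    if sub_channel == '资讯':
        return {51: '番剧', 170: '国创', 159: '娱乐'}.get(tid, '国创')
    if sub_channel:
        # .get avoids a KeyError when a new sub-channel is missing from the table.
        return mapping.get(sub_channel)
    return None

With such a helper, the whole if/elif/else block in detailParse could be reduced to a single item['channel'] = resolve_channel(subChannel, tid, sub_channel_2_channel).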
+schedule.every().hour.do(run_threaded, site) +schedule.every(5).minutes.do(run_threaded, online) +schedule.every().minute.do(run_threaded, strong) print('开始运行计划任务..') while True: schedule.run_pending() time.sleep(60) -
From 6f5847920321fada2f53eeec3ac4e565dc876973 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 2 Feb 2019 13:30:39 +0800 Subject: [PATCH 132/469] feature: modify the strong focus rate from 1/min to 1/10min --- biliob_spider/spiders/strong_focus.py | 2 +- run.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/biliob_spider/spiders/strong_focus.py b/biliob_spider/spiders/strong_focus.py index 8d353eb..9ee44a6 100644 --- a/biliob_spider/spiders/strong_focus.py +++ b/biliob_spider/spiders/strong_focus.py @@ -120,7 +120,7 @@ class StrongSpider(scrapy.spiders.Spider): 'ITEM_PIPELINES': { 'biliob_spider.pipelines.StrongPipeline': 300 }, - 'DOWNLOAD_DELAY': 2 + 'DOWNLOAD_DELAY': 20 } def parse(self, response): diff --git a/run.py b/run.py index 1fa68df..fb99b5d 100644 --- a/run.py +++ b/run.py @@ -71,8 +71,8 @@ def run_threaded(job_func): schedule.every().day.at('21:00').do(run_threaded, bili_monthly_rank) schedule.every().week.do(run_threaded, video_spider_all) schedule.every().hour.do(run_threaded, site) -schedule.every(5).minutes.do(run_threaded, online) -schedule.every().minute.do(run_threaded, strong) +schedule.every(15).minutes.do(run_threaded, online) +schedule.every(10).minute.do(run_threaded, strong) print('开始运行计划任务..')
From eb011a8992dd4fbf6e072db2e66b28bbe56a7f0f Mon Sep 17 00:00:00 2001 From: jannchie Date: Sat, 2 Feb 2019 13:48:28 +0800 Subject: [PATCH 135/469] hotfix: syntax error --- run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.py b/run.py index fb99b5d..13b2933 100644 --- a/run.py +++ b/run.py @@ -72,7 +72,7 @@ def run_threaded(job_func): schedule.every().week.do(run_threaded, video_spider_all) schedule.every().hour.do(run_threaded, site) schedule.every(15).minutes.do(run_threaded, online) -schedule.every(10).minute.do(run_threaded, strong) +schedule.every(10).minutes.do(run_threaded, strong) print('开始运行计划任务..')
From 7ad761160f038baef33803df10ecc9e104291935 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 2 Feb 2019 14:29:55 +0800 Subject: [PATCH 138/469] adjust download delay for strong spider --- biliob_spider/spiders/strong_focus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biliob_spider/spiders/strong_focus.py b/biliob_spider/spiders/strong_focus.py index 9ee44a6..06a267c 100644 --- a/biliob_spider/spiders/strong_focus.py +++ b/biliob_spider/spiders/strong_focus.py @@ -120,7 +120,7 @@ class StrongSpider(scrapy.spiders.Spider): 'ITEM_PIPELINES': { 'biliob_spider.pipelines.StrongPipeline': 300 }, - 'DOWNLOAD_DELAY': 20 + 'DOWNLOAD_DELAY': 10 } def parse(self, response):
From 4d60b0c490279bc12bd87d0a93ae7fe052f515f4 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 2 Feb 2019 14:39:11 +0800 Subject: [PATCH 141/469] feature: output error spider name --- biliob_spider/pipelines.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index da2d371..225814a 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -52,7 +52,7 @@ def process_item(self, item, spider): }, True) except Exception as error: # 出现错误时打印错误日志 - logging.error(error) + logging.error('{}: {}'.format(spider.name, error)) try: self.coll = self.db['author'] # 获得collection的句柄 self.coll.update_one({ @@ -81,7 +81,7 @@ def process_item(self, item, spider): }, True) except Exception as error: # 出现错误时打印错误日志 - logging.error(error) + logging.error('{}: {}'.format(spider.name, error)) return item @@ -127,7 +127,7 @@ def process_item(self, item, spider): return item except Exception as error: # 出现错误时打印错误日志 - logging.error(error) + logging.error('{}: {}'.format(spider.name, error)) class VideoPipelineFromKan(object): @@ -162,7 +162,7 @@ def process_item(self, item, spider): return item except Exception as error: # 出现错误时打印错误日志 - logging.error(error) + logging.error('{}: {}'.format(spider.name, error)) class BangumiPipeLine(object): @@ -200,7 +200,7 @@ def process_item(self, item, spider): return item except Exception as error: # 出现错误时打印错误日志 - logging.error(error) + logging.error('{}: {}'.format(spider.name, error)) class DonghuaPipeLine(object): @@ -238,7 +238,7 @@ def process_item(self, item, spider): return item except Exception as error: # 出现错误时打印错误日志 - logging.error(error) + logging.error('{}: {}'.format(spider.name, error)) class SiteInfoPipeline(object): @@ -263,7 +263,7 @@ def process_item(self, item, spider): return item except Exception as error: # 出现错误时打印错误日志 - logging.error(error) + logging.error('{}: {}'.format(spider.name, error)) class AuthorPipeline(object): @@ -305,7 +305,7 @@ def process_item(self, item, spider): return item except Exception as error: # 出现错误时打印错误日志 - logging.error(error) + logging.error('{}: {}'.format(spider.name, error)) class OnlinePipeline(object): @@ -337,7 +337,7 @@ def process_item(self, item, spider): return item except Exception as error: # 出现错误时打印错误日志 - logging.error(error) + logging.error('{}: {}'.format(spider.name, error)) class TagPipeLine(object): @@ -369,7 +369,7 @@ def process_item(self, item, spider): return item except Exception as error: # 出现错误时打印错误日志 - logging.error(error) + logging.error('{}: {}'.format(spider.name, error)) class VideoAddPipeline(object): @@ -396,7 +396,7 @@ def process_item(self, item, spider): return item except Exception as error: # 出现错误时打印错误日志 - logging.error(error) + logging.error('{}: {}'.format(spider.name, error)) class AuthorChannelPipeline(object): @@ -421,7 +421,7 @@ def process_item(self, item, spider): return item except Exception as error: # 出现错误时打印错误日志 - logging.error(error) + logging.error('{}: {}'.format(spider.name, error)) class BiliMonthlyRankPipeline(object): @@ -455,4 +455,4 @@ def process_item(self, item, spider): return item except Exception as error: # 出现错误时打印错误日志 - logging.error(error) + logging.error('{}: {}'.format(spider.name, error))
From f4256e93a7059c0a92d2132c7346b7173e6ba735 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 11 Feb 2019 16:47:35 +0800 Subject: [PATCH 144/469] fix --- biliob_spider/pipelines.py | 5 - ...nes.py.1f305fa5e47c4f7467edd6e4cb280dc2.py | 453 ++++++++++++++++++ biliob_spider/settings.py | 21 +- biliob_spider/spiders/author_auto_add.py | 78 +-- .../spiders/author_update_with_redis.py | 94 ++++ biliob_spider/spiders/strong_focus.py | 103 +--- .../spiders/video_from_kanbilibili.py | 103 +--- biliob_spider/spiders/video_spider.py | 107 +---- biliob_spider/spiders/video_spider_all.py | 183 +------ .../spiders/video_spider_with_redis.py | 109 +++++ biliob_spider/spiders/video_watcher.py | 5 +- util.py | 101 ++++ 12 files changed, 827 insertions(+), 535 deletions(-) create mode 100644 biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py create mode 100644 biliob_spider/spiders/author_update_with_redis.py create mode 100644 biliob_spider/spiders/video_spider_with_redis.py create mode 100644 util.py diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 225814a..38252ba 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -289,11 +289,6 @@ def process_item(self, item, spider): 'level': item['level'], 'cFans': item['c_fans'], 'official': item['official'], - 'cArchive': item['c_archive'], - 'cArticle': item['c_article'], - 'cAttention': item['c_attention'], - 'cArchive_view': item['c_archive_view'], - 'cArticle_view': item['c_article_view'], }, '$push': { 'data': { diff --git a/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py b/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py new file mode 100644 index 0000000..38252ba --- /dev/null +++ b/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py @@ -0,0 +1,453 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html +from pymongo import MongoClient +from db import settings +from db import mysql_connect +import datetime +import logging + + +class StrongPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + + def process_item(self, item, spider): + try: + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'cView': item['current_view'], + 'cFavorite': item['current_favorite'], + 'cDanmaku': item['current_danmaku'], + 'cCoin': item['current_coin'], + 'cShare': item['current_share'], + 'cLike': item['current_like'], + 'cDatetime': item['current_datetime'], + 'author': item['author'], + 'subChannel': item['subChannel'], + 'channel': item['channel'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': datetime.datetime.fromtimestamp( + item['datetime']) + }, + '$push': { + 'data': { + '$each': [item['data_video']], + '$position': 0 + } + } + }, True) + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + try: + self.coll = self.db['author'] # 获得collection的句柄 + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { +
'focus': True, + 'sex': item['sex'], + 'name': item['name'], + 'face': item['face'], + 'level': item['level'], + 'cFans': item['c_fans'], + 'official': item['official'], + 'cArchive': item['c_archive'], + 'cArticle': item['c_article'], + 'cAttention': item['c_attention'], + 'cArchive_view': item['c_archive_view'], + 'cArticle_view': item['c_article_view'], + }, + '$push': { + 'data': { + '$each': [item['data_author']], + '$position': 0 + } + } + }, True) + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + return item + + +class VideoPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'cView': item['current_view'], + 'cFavorite': item['current_favorite'], + 'cDanmaku': item['current_danmaku'], + 'cCoin': item['current_coin'], + 'cShare': item['current_share'], + 'cLike': item['current_like'], + 'cDatetime': item['current_datetime'], + 'author': item['author'], + 'subChannel': item['subChannel'], + 'channel': item['channel'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': datetime.datetime.fromtimestamp( + item['datetime']) + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class VideoPipelineFromKan(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'author': item['author'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': item['datetime'] + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class BangumiPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['bangumi'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'cover': item['cover'], + # 'isFinish': item['is_finish'], + # 'isStarted': item['is_started'], + 'newest': item['newest_ep_index'], + 'currentPts': item['data']['pts'], + 'currentPlay': item['data']['play'], + # 'squareCover': item['square_cover'], + 'currentWatch': item['data']['watch'], + 'currentReview': item['data']['review'], + 'currentDanmaku': item['data']['danmaku'] + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class DonghuaPipeLine(object): + def 
__init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['donghua'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'cover': item['cover'], + # 'isFinish': item['is_finish'], + # 'isStarted': item['is_started'], + 'newest': item['newest_ep_index'], + 'currentPts': item['data']['pts'], + 'currentPlay': item['data']['play'], + # 'squareCover': item['square_cover'], + 'currentWatch': item['data']['watch'], + 'currentReview': item['data']['review'], + 'currentDanmaku': item['data']['danmaku'] + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class SiteInfoPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['site_info'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.insert_one({ + 'region_count': item['region_count'], + 'all_count': item['all_count'], + 'web_online': item['web_online'], + 'play_online': item['play_online'], + 'datetime': datetime.datetime.now() + }) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class AuthorPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'focus': True, + 'sex': item['sex'], + 'name': item['name'], + 'face': item['face'], + 'level': item['level'], + 'cFans': item['c_fans'], + 'official': item['official'], + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class OnlinePipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video_online'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'author': item['author'], + 'channel': item['channel'], + 'subChannel': item['subChannel'], + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class TagPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['tag'] # 
获得collection的句柄 + + def process_item(self, item, spider): + try: + + self.coll.update_one({ + 'tag_id': item['tag_id'] + }, { + '$set': { + 'tag_name': item['tag_name'], + 'ctime': item['ctime'], + }, + '$addToSet': { + 'use': item['use'], + 'atten': item['atten'], + 'datetime': datetime.datetime.now() + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class VideoAddPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + for each_aid in item['aid']: + self.coll.update_one({ + 'aid': each_aid + }, { + '$set': { + 'aid': each_aid, + 'focus': True + }, + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class AuthorChannelPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'channels': item['channels'] + }, + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class BiliMonthlyRankPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['monthly_rank'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': item['aid'] + }, { + '$addToSet': { + 'pts': item['pts'], + 'datetime': datetime.datetime.now() + }, + '$set': { + 'title': item['title'], + 'author': item['author'], + 'aid': item['aid'], + 'mid': item['mid'], + 'channel': item['channel'], + 'currentPts': item['pts'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index bd07c94..d4eb73f 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -10,6 +10,9 @@ # https://doc.scrapy.org/en/latest/topics/spider-middleware.html import random +from db import redis_connect_string + +REDIS_URL = redis_connect_string # LOG_FILE = "biliob_spider.log" LOG_LEVEL = "WARNING" @@ -28,7 +31,7 @@ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36' ] # 随机生成user agent -USER_AGENT = random.choice(USER_AGENT_LIST) +USER_AGENT = random.choice(USER_AGENT_LIST) # Obey robots.txt rules @@ -52,28 +55,28 @@ #TELNETCONSOLE_ENABLED = False # Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { +# DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', -#} +# } # Enable or disable spider middlewares # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { +# 
SPIDER_MIDDLEWARES = { # 'BilibiliRankListSpider.middlewares.BilibiliranklistspiderSpiderMiddleware': 543, -#} +# } # Enable or disable downloader middlewares # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { +# DOWNLOADER_MIDDLEWARES = { # 'BilibiliRankListSpider.middlewares.BilibiliranklistspiderDownloaderMiddleware': 543, -#} +# } # Enable or disable extensions # See https://doc.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { +# EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, -#} +# } # Configure item pipelines # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index a06d075..1475fe6 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -1,4 +1,4 @@ -#coding=utf-8 +# coding=utf-8 import scrapy from mail import mailer from scrapy.http import Request @@ -60,33 +60,49 @@ def parse(self, response): logging.error(error) def detailParse(self, response): - j = json.loads(response.body) - name = j['data']['card']['name'] - mid = j['data']['card']['mid'] - sex = j['data']['card']['sex'] - face = j['data']['card']['face'] - fans = j['data']['card']['fans'] - attention = j['data']['card']['attention'] - level = j['data']['card']['level_info']['current_level'] - official = j['data']['card']['Official']['title'] - archive = j['data']['archive_count'] - article = j['data']['article_count'] - face = j['data']['card']['face'] - item = AuthorItem() - # 粉丝数大于1000才加入 - if int(fans) > 1000: - item['mid'] = int(mid) - item['name'] = name - item['face'] = face - item['official'] = official - item['sex'] = sex - item['focus'] = True - item['level'] = int(level) - item['data'] = { - 'fans': int(fans), - 'attention': int(attention), - 'archive': int(archive), - 'article': int(article), - 'datetime': datetime.datetime.now() - } - yield item + try: + j = json.loads(response.body) + name = j['data']['card']['name'] + mid = j['data']['card']['mid'] + sex = j['data']['card']['sex'] + face = j['data']['card']['face'] + fans = j['data']['card']['fans'] + attention = j['data']['card']['attention'] + level = j['data']['card']['level_info']['current_level'] + official = j['data']['card']['Official']['title'] + archive = j['data']['archive_count'] + article = j['data']['article_count'] + face = j['data']['card']['face'] + item = AuthorItem() + + # 粉丝数大于1000才加入 + if int(fans) > 1000: + item['c_fans'] = int(fans) + item['c_attention'] = int(attention) + item['c_archive'] = int(archive) + item['c_article'] = int(article) + item['mid'] = int(mid) + item['name'] = name + item['face'] = face + item['official'] = official + item['sex'] = sex + item['focus'] = True + item['level'] = int(level) + item['data'] = { + 'fans': int(fans), + 'attention': int(attention), + 'archive': int(archive), + 'article': int(article), + 'datetime': datetime.datetime.now() + } + yield item + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}".format(response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) diff --git a/biliob_spider/spiders/author_update_with_redis.py b/biliob_spider/spiders/author_update_with_redis.py new file mode 100644 index 0000000..2ca321c --- /dev/null +++ b/biliob_spider/spiders/author_update_with_redis.py @@ -0,0 +1,94 @@ +# coding=utf-8 +import scrapy 
+from mail import mailer +from scrapy.http import Request +from biliob_spider.items import AuthorItem +import time +import json +import logging +from pymongo import MongoClient +import datetime +from db import settings +from scrapy_redis.spiders import RedisSpider + + +class AuthorUpdateWithRedis(RedisSpider): + name = "AuthorUpdateWithRedis" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.AuthorPipeline': 300 + }, + 'DOWNLOAD_DELAY': 2 + } + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + + def parse(self, response): + try: + j = json.loads(response.body) + name = j['data']['card']['name'] + mid = j['data']['card']['mid'] + sex = j['data']['card']['sex'] + face = j['data']['card']['face'] + fans = j['data']['card']['fans'] + attention = j['data']['card']['attention'] + level = j['data']['card']['level_info']['current_level'] + official = j['data']['card']['Official']['title'] + archive = j['data']['archive_count'] + article = j['data']['article_count'] + face = j['data']['card']['face'] + item = AuthorItem() + item['mid'] = int(mid) + item['name'] = name + item['face'] = face + item['official'] = official + item['sex'] = sex + item['level'] = int(level) + item['data'] = { + 'fans': int(fans), + 'attention': int(attention), + 'archive': int(archive), + 'article': int(article), + 'datetime': datetime.datetime.now() + } + item['c_fans'] = int(fans) + item['c_attention'] = int(attention) + item['c_archive'] = int(archive) + item['c_article'] = int(article) + yield Request( + "https://api.bilibili.com/x/space/upstat?mid={mid}".format( + mid=str(mid)), + meta={'item': item}, + method='GET', + callback=self.parse_view) + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) + + def parse_view(self, response): + j = json.loads(response.body) + archive_view = j['data']['archive']['view'] + article_view = j['data']['article']['view'] + item = response.meta['item'] + item['data']['archiveView'] = archive_view + item['data']['articleView'] = article_view + item['c_archive_view'] = int(archive_view) + item['c_article_view'] = int(article_view) + + yield item diff --git a/biliob_spider/spiders/strong_focus.py b/biliob_spider/spiders/strong_focus.py index 06a267c..e4609ce 100644 --- a/biliob_spider/spiders/strong_focus.py +++ b/biliob_spider/spiders/strong_focus.py @@ -8,108 +8,7 @@ import logging from pymongo import MongoClient import datetime - -sub_channel_2_channel = { - 'ASMR': '生活', - 'GMV': '游戏', - 'Korea相关': '娱乐', - 'MAD·AMV': '动画', - 'MMD·3D': '动画', - 'Mugen': '游戏', - 'OP/ED/OST': '音乐', - 'VOCALOID·UTAU': '音乐', - '三次元舞蹈': '舞蹈', - '三次元音乐': '音乐', - '人力VOCALOID': '鬼畜', - '人文·历史': '纪录片', - '健身': '时尚', - '其他': '生活', - '其他国家': '电影', - '军事': '纪录片', - '动物圈': '生活', - '华语电影': '电影', - '单机游戏': '游戏', - '原创音乐': '音乐', - '国产剧': '电视剧', - '国产动画': '国创', - '国产原创相关': '国创', - '宅舞': '舞蹈', - '完结动画': '番剧', - '官方延伸': '番剧', - '布袋戏': '国创', - '广告': '广告', - '影视剪辑': '影视', - '影视杂谈': '影视', - '手工': '生活', - '手机游戏': '游戏', - '搞笑': '生活', - '教程演示': '鬼畜', - '数码': '数码', - '日常': '生活', - '明星': '娱乐', - '星海': '科技', - '服饰': 
'时尚', - '机械': '科技', - '桌游棋牌': '游戏', - '欧美电影': '电影', - '汽车': '科技', - '游戏': '游戏', - '海外剧': '电视剧', - '演奏': '音乐', - '演讲·公开课': '科技', - '特摄': '影视', - '电子竞技': '游戏', - '短片': '影视', - '短片·手书·配音': '动画', - '社会·美食·旅行': '纪录片', - '科学·探索·自然': '纪录片', - '绘画': '生活', - '综合': '动画', - '综艺': '娱乐', - '网络游戏': '游戏', - '美妆': '时尚', - '美食圈': '生活', - '翻唱': '音乐', - '舞蹈教程': '舞蹈', - '资讯': '国创', - '趣味科普人文': '科技', - '运动': '生活', - '连载动画': '番剧', - '野生技术协会': '科技', - '音MAD': '鬼畜', - '音乐选集': '音乐', - '音游': '游戏', - '预告 资讯': '影视', - '预告·资讯': '影视', - '单机联机': '游戏', - '鬼畜调教': '鬼畜', - '演讲• 公开课': '科技', - '国产电影': '电影', - '日本电影': '电影', - '番剧': '番剧', - '国创': '国创', - '鬼畜': '鬼畜', - '电视剧': '电视剧', - '动画': '动画', - '时尚': '时尚', - '娱乐': '娱乐', - '电影': '电影', - '舞蹈': '舞蹈', - '科技': '科技', - '生活': '生活', - '音乐': '音乐', - '纪录片': '纪录片', - '手机平板': '数码', - '电脑装机': '数码', - '影音智能': '数码', - '摄影摄像': '数码', - '风尚标': '时尚', - '电音': '音乐', - '音乐综合': '音乐', - 'MV': '音乐', - '音乐现场': '音乐', - 'T台': '时尚', -} +from util import sub_channel_2_channel class StrongSpider(scrapy.spiders.Spider): diff --git a/biliob_spider/spiders/video_from_kanbilibili.py b/biliob_spider/spiders/video_from_kanbilibili.py index f7f6b25..b59a958 100644 --- a/biliob_spider/spiders/video_from_kanbilibili.py +++ b/biliob_spider/spiders/video_from_kanbilibili.py @@ -9,108 +9,7 @@ from pymongo import MongoClient from db import settings from mail import mailer -sub_channel_2_channel = { - 'ASMR': '生活', - 'GMV': '游戏', - 'Korea相关': '娱乐', - 'MAD·AMV': '动画', - 'MMD·3D': '动画', - 'Mugen': '游戏', - 'OP/ED/OST': '音乐', - 'VOCALOID·UTAU': '音乐', - '三次元舞蹈': '舞蹈', - '三次元音乐': '音乐', - '人力VOCALOID': '鬼畜', - '人文·历史': '纪录片', - '健身': '时尚', - '其他': '生活', - '其他国家': '电影', - '军事': '纪录片', - '动物圈': '生活', - '华语电影': '电影', - '单机游戏': '游戏', - '原创音乐': '音乐', - '国产剧': '电视剧', - '国产动画': '国创', - '国产原创相关': '国创', - '宅舞': '舞蹈', - '完结动画': '番剧', - '官方延伸': '番剧', - '布袋戏': '国创', - '广告': '广告', - '影视剪辑': '影视', - '影视杂谈': '影视', - '手工': '生活', - '手机游戏': '游戏', - '搞笑': '生活', - '教程演示': '鬼畜', - '数码': '数码', - '日常': '生活', - '明星': '娱乐', - '星海': '科技', - '服饰': '时尚', - '机械': '科技', - '桌游棋牌': '游戏', - '欧美电影': '电影', - '汽车': '科技', - '海外剧': '电视剧', - '演奏': '音乐', - '演讲·公开课': '科技', - '特摄': '影视', - '电子竞技': '游戏', - '短片': '影视', - '短片·手书·配音': '动画', - '社会·美食·旅行': '纪录片', - '科学·探索·自然': '纪录片', - '绘画': '生活', - '综合': '动画', - '综艺': '娱乐', - '网络游戏': '游戏', - '美妆': '时尚', - '美食圈': '生活', - '翻唱': '音乐', - '舞蹈教程': '舞蹈', - '资讯': '国创', - '趣味科普人文': '科技', - '运动': '生活', - '连载动画': '番剧', - '野生技术协会': '科技', - '音MAD': '鬼畜', - '音乐选集': '音乐', - '音游': '游戏', - '预告 资讯': '影视', - '预告·资讯': '影视', - '单机联机': '游戏', - '鬼畜调教': '鬼畜', - '演讲• 公开课': '科技', - '国产电影': '电影', - '日本电影': '电影', - '番剧': '番剧', - '国创': '国创', - '鬼畜': '鬼畜', - '电视剧': '电视剧', - '动画': '动画', - '时尚': '时尚', - '娱乐': '娱乐', - '电影': '电影', - '舞蹈': '舞蹈', - '科技': '科技', - '生活': '生活', - '音乐': '音乐', - '纪录片': '纪录片', - '手机平板': '数码', - '电脑装机': '数码', - '影音智能': '数码', - '摄影摄像': '数码', - '摄影摄像': '数码', - '风尚标': '时尚', - '电音': '音乐', - '音乐综合': '音乐', - 'MV': '音乐', - '音乐现场': '音乐', - '游戏': '游戏', - 'T台': '时尚', -} +from util import sub_channel_2_channel class FromKan(scrapy.spiders.Spider): diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index aa54a53..b07b92b 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -9,108 +9,7 @@ import logging from pymongo import MongoClient from db import settings -sub_channel_2_channel = { - 'ASMR': '生活', - 'GMV': '游戏', - 'Korea相关': '娱乐', - 'MAD·AMV': '动画', - 'MMD·3D': '动画', - 'Mugen': '游戏', - 'OP/ED/OST': '音乐', - 'VOCALOID·UTAU': '音乐', - '三次元舞蹈': '舞蹈', - '三次元音乐': 
'音乐', - '人力VOCALOID': '鬼畜', - '人文·历史': '纪录片', - '健身': '时尚', - '其他': '生活', - '其他国家': '电影', - '军事': '纪录片', - '动物圈': '生活', - '华语电影': '电影', - '单机游戏': '游戏', - '原创音乐': '音乐', - '国产剧': '电视剧', - '国产动画': '国创', - '国产原创相关': '国创', - '宅舞': '舞蹈', - '完结动画': '番剧', - '官方延伸': '番剧', - '布袋戏': '国创', - '广告': '广告', - '影视剪辑': '影视', - '影视杂谈': '影视', - '手工': '生活', - '手机游戏': '游戏', - '搞笑': '生活', - '教程演示': '鬼畜', - '数码': '数码', - '日常': '生活', - '明星': '娱乐', - '星海': '科技', - '服饰': '时尚', - '机械': '科技', - '桌游棋牌': '游戏', - '欧美电影': '电影', - '汽车': '科技', - '海外剧': '电视剧', - '演奏': '音乐', - '演讲·公开课': '科技', - '特摄': '影视', - '电子竞技': '游戏', - '短片': '影视', - '短片·手书·配音': '动画', - '社会·美食·旅行': '纪录片', - '科学·探索·自然': '纪录片', - '绘画': '生活', - '综合': '动画', - '综艺': '娱乐', - '网络游戏': '游戏', - '美妆': '时尚', - '美食圈': '生活', - '翻唱': '音乐', - '舞蹈教程': '舞蹈', - '资讯': '国创', - '趣味科普人文': '科技', - '运动': '生活', - '连载动画': '番剧', - '野生技术协会': '科技', - '音MAD': '鬼畜', - '音乐选集': '音乐', - '音游': '游戏', - '预告 资讯': '影视', - '预告·资讯': '影视', - '单机联机': '游戏', - '鬼畜调教': '鬼畜', - '演讲• 公开课': '科技', - '国产电影': '电影', - '日本电影': '电影', - '番剧': '番剧', - '国创': '国创', - '鬼畜': '鬼畜', - '电视剧': '电视剧', - '动画': '动画', - '时尚': '时尚', - '娱乐': '娱乐', - '电影': '电影', - '舞蹈': '舞蹈', - '科技': '科技', - '生活': '生活', - '音乐': '音乐', - '纪录片': '纪录片', - '手机平板': '数码', - '电脑装机': '数码', - '影音智能': '数码', - '摄影摄像': '数码', - '摄影摄像': '数码', - '风尚标': '时尚', - '电音': '音乐', - '音乐综合': '音乐', - 'MV': '音乐', - '音乐现场': '音乐', - '游戏': '游戏', - 'T台': '时尚', -} +from util import sub_channel_2_channel class VideoSpider(scrapy.spiders.Spider): @@ -134,7 +33,8 @@ def __init__(self): def start_requests(self): # 只需要aid - c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {'aid': 1}) + c = self.coll.find( + {'$or': [{'focus': True}, {'forceFocus': True}]}, {'aid': 1}) x = 0 aid_list = [] for each_doc in c: @@ -179,7 +79,6 @@ def parse(self, response): 'datetime': current_date } - subChannel = d[each_key]['tname'] title = d[each_key]['title'] date = d[each_key]['pubdate'] diff --git a/biliob_spider/spiders/video_spider_all.py b/biliob_spider/spiders/video_spider_all.py index 0900acb..56118be 100644 --- a/biliob_spider/spiders/video_spider_all.py +++ b/biliob_spider/spiders/video_spider_all.py @@ -9,111 +9,11 @@ import logging from pymongo import MongoClient from db import settings +from util import sub_channel_2_channel +from biliob_spider.spiders.video_spider import VideoSpider -sub_channel_2_channel = { - 'ASMR': '生活', - 'GMV': '游戏', - 'Korea相关': '娱乐', - 'MAD·AMV': '动画', - 'MMD·3D': '动画', - 'Mugen': '游戏', - 'OP/ED/OST': '音乐', - 'VOCALOID·UTAU': '音乐', - '三次元舞蹈': '舞蹈', - '三次元音乐': '音乐', - '人力VOCALOID': '鬼畜', - '人文·历史': '纪录片', - '健身': '时尚', - '其他': '生活', - '其他国家': '电影', - '军事': '纪录片', - '动物圈': '生活', - '华语电影': '电影', - '单机游戏': '游戏', - '原创音乐': '音乐', - '国产剧': '电视剧', - '国产动画': '国创', - '国产原创相关': '国创', - '宅舞': '舞蹈', - '完结动画': '番剧', - '官方延伸': '番剧', - '布袋戏': '国创', - '广告': '广告', - '影视剪辑': '影视', - '影视杂谈': '影视', - '手工': '生活', - '手机游戏': '游戏', - '搞笑': '生活', - '教程演示': '鬼畜', - '数码': '数码', - '日常': '生活', - '明星': '娱乐', - '星海': '科技', - '服饰': '时尚', - '机械': '科技', - '桌游棋牌': '游戏', - '欧美电影': '电影', - '汽车': '科技', - '游戏': '游戏', - '海外剧': '电视剧', - '演奏': '音乐', - '演讲·公开课': '科技', - '特摄': '影视', - '电子竞技': '游戏', - '短片': '影视', - '短片·手书·配音': '动画', - '社会·美食·旅行': '纪录片', - '科学·探索·自然': '纪录片', - '绘画': '生活', - '综合': '动画', - '综艺': '娱乐', - '网络游戏': '游戏', - '美妆': '时尚', - '美食圈': '生活', - '翻唱': '音乐', - '舞蹈教程': '舞蹈', - '资讯': '国创', - '趣味科普人文': '科技', - '运动': '生活', - '连载动画': '番剧', - '野生技术协会': '科技', - '音MAD': '鬼畜', - '音乐选集': '音乐', - '音游': '游戏', - '预告 资讯': '影视', - '预告·资讯': '影视', - '单机联机': '游戏', - '鬼畜调教': '鬼畜', - '演讲• 公开课': 
'科技', - '国产电影': '电影', - '日本电影': '电影', - '番剧': '番剧', - '国创': '国创', - '鬼畜': '鬼畜', - '电视剧': '电视剧', - '动画': '动画', - '时尚': '时尚', - '娱乐': '娱乐', - '电影': '电影', - '舞蹈': '舞蹈', - '科技': '科技', - '生活': '生活', - '音乐': '音乐', - '纪录片': '纪录片', - '手机平板': '数码', - '电脑装机': '数码', - '影音智能': '数码', - '摄影摄像': '数码', - '风尚标': '时尚', - '电音': '音乐', - '音乐综合': '音乐', - 'MV': '音乐', - '音乐现场': '音乐', - 'T台': '时尚', -} - -class VideoSpider(scrapy.spiders.Spider): +class VideoSpiderAll(VideoSpider): name = "videoSpiderAll" allowed_domains = ["bilibili.com"] start_urls = [] @@ -153,80 +53,3 @@ def start_requests(self): yield Request( "https://api.bilibili.com/x/article/archives?ids=" + aid_str.rstrip(',')) - - def parse(self, response): - try: - r = json.loads(response.body) - d = r["data"] - keys = list(d.keys()) - for each_key in keys: - - aid = d[each_key]['stat']['aid'] - author = d[each_key]['owner']['name'] - mid = d[each_key]['owner']['mid'] - view = d[each_key]['stat']['view'] - favorite = d[each_key]['stat']['favorite'] - danmaku = d[each_key]['stat']['danmaku'] - coin = d[each_key]['stat']['coin'] - share = d[each_key]['stat']['share'] - like = d[each_key]['stat']['like'] - current_date = datetime.now() - data = { - 'view': view, - 'favorite': favorite, - 'danmaku': danmaku, - 'coin': coin, - 'share': share, - 'like': like, - 'datetime': current_date - } - - - subChannel = d[each_key]['tname'] - title = d[each_key]['title'] - date = d[each_key]['pubdate'] - tid = d[each_key]['tid'] - pic = d[each_key]['pic'] - item = VideoItem() - item['current_view'] = view - item['current_favorite'] = favorite - item['current_danmaku'] = danmaku - item['current_coin'] = coin - item['current_share'] = share - item['current_like'] = like - item['current_datetime'] = current_date - item['aid'] = aid - item['mid'] = mid - item['pic'] = pic - item['author'] = author - item['data'] = data - item['title'] = title - item['subChannel'] = subChannel - item['datetime'] = date - - if subChannel != '': - item['channel'] = sub_channel_2_channel[subChannel] - elif subChannel == '资讯': - if tid == 51: - item['channel'] == '番剧' - if tid == 170: - item['channel'] == '国创' - if tid == 159: - item['channel'] == '娱乐' - else: - item['channel'] = None - yield item - - except Exception as error: - # 出现错误时打印错误日志 - if r['code'] == -404: - return - mailer.send( - to=["604264970@qq.com"], - subject="BiliobSpiderError", - body="{}\n{}\n{}".format(item, response.url, error), - ) - logging.error("视频爬虫在解析时发生错误") - logging.error(item) - logging.error(response.url) - logging.error(error) diff --git a/biliob_spider/spiders/video_spider_with_redis.py b/biliob_spider/spiders/video_spider_with_redis.py new file mode 100644 index 0000000..b4cea54 --- /dev/null +++ b/biliob_spider/spiders/video_spider_with_redis.py @@ -0,0 +1,109 @@ +# coding=utf-8 +import scrapy +from mail import mailer +from scrapy.http import Request +from biliob_spider.items import VideoItem +from datetime import datetime +import time +import json +import logging +from pymongo import MongoClient +from db import settings +from util import sub_channel_2_channel +from scrapy_redis.spiders import RedisSpider + + +class VideoSpiderWithRedis(RedisSpider): + name = "VideoSpiderWithRedis" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.VideoPipeline': 300, + } + } + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + 
settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def parse(self, response): + try: + r = json.loads(response.body) + d = r["data"] + keys = list(d.keys()) + for each_key in keys: + + aid = d[each_key]['stat']['aid'] + author = d[each_key]['owner']['name'] + mid = d[each_key]['owner']['mid'] + view = d[each_key]['stat']['view'] + favorite = d[each_key]['stat']['favorite'] + danmaku = d[each_key]['stat']['danmaku'] + coin = d[each_key]['stat']['coin'] + share = d[each_key]['stat']['share'] + like = d[each_key]['stat']['like'] + current_date = datetime.now() + data = { + 'view': view, + 'favorite': favorite, + 'danmaku': danmaku, + 'coin': coin, + 'share': share, + 'like': like, + 'datetime': current_date + } + + subChannel = d[each_key]['tname'] + title = d[each_key]['title'] + date = d[each_key]['pubdate'] + tid = d[each_key]['tid'] + pic = d[each_key]['pic'] + item = VideoItem() + item['current_view'] = view + item['current_favorite'] = favorite + item['current_danmaku'] = danmaku + item['current_coin'] = coin + item['current_share'] = share + item['current_like'] = like + item['current_datetime'] = current_date + item['aid'] = aid + item['mid'] = mid + item['pic'] = pic + item['author'] = author + item['data'] = data + item['title'] = title + item['subChannel'] = subChannel + item['datetime'] = date + + if subChannel != '': + item['channel'] = sub_channel_2_channel[subChannel] + elif subChannel == '资讯': + if tid == 51: + item['channel'] == '番剧' + if tid == 170: + item['channel'] == '国创' + if tid == 159: + item['channel'] == '娱乐' + else: + item['channel'] = None + yield item + + except Exception as error: + # 出现错误时打印错误日志 + if r['code'] == -404: + return + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(item) + logging.error(response.url) + logging.error(error) diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index e24a865..2ef475b 100644 --- a/biliob_spider/spiders/video_watcher.py +++ b/biliob_spider/spiders/video_watcher.py @@ -1,4 +1,4 @@ -#coding=utf-8 +# coding=utf-8 import scrapy from mail import mailer from scrapy.http import Request @@ -33,7 +33,8 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {'mid': 1}) + c = self.coll.find( + {'$or': [{'focus': True}, {'forceFocus': True}]}, {'mid': 1}) for each_doc in c: yield Request( 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + diff --git a/util.py b/util.py new file mode 100644 index 0000000..4713cf3 --- /dev/null +++ b/util.py @@ -0,0 +1,101 @@ +sub_channel_2_channel = { + 'ASMR': '生活', + 'GMV': '游戏', + 'Korea相关': '娱乐', + 'MAD·AMV': '动画', + 'MMD·3D': '动画', + 'Mugen': '游戏', + 'OP/ED/OST': '音乐', + 'VOCALOID·UTAU': '音乐', + '三次元舞蹈': '舞蹈', + '三次元音乐': '音乐', + '人力VOCALOID': '鬼畜', + '人文·历史': '纪录片', + '健身': '时尚', + '其他': '生活', + '其他国家': '电影', + '军事': '纪录片', + '动物圈': '生活', + '华语电影': '电影', + '单机游戏': '游戏', + '原创音乐': '音乐', + '国产剧': '电视剧', + '国产动画': '国创', + '国产原创相关': '国创', + '宅舞': '舞蹈', + '完结动画': '番剧', + '官方延伸': '番剧', + '布袋戏': '国创', + '广告': '广告', + '影视剪辑': '影视', + '影视杂谈': '影视', + '手工': '生活', + '手机游戏': '游戏', + '搞笑': '生活', + '教程演示': '鬼畜', + '数码': '数码', + '日常': '生活', + '明星': '娱乐', + '星海': '科技', + '服饰': '时尚', + '机械': '科技', + '桌游棋牌': '游戏', + '欧美电影': '电影', + '汽车': '科技', + '海外剧': '电视剧', 
+ '演奏': '音乐', + '演讲·公开课': '科技', + '特摄': '影视', + '电子竞技': '游戏', + '短片': '影视', + '短片·手书·配音': '动画', + '社会·美食·旅行': '纪录片', + '科学·探索·自然': '纪录片', + '绘画': '生活', + '综合': '动画', + '综艺': '娱乐', + '网络游戏': '游戏', + '美妆': '时尚', + '美食圈': '生活', + '翻唱': '音乐', + '舞蹈教程': '舞蹈', + '资讯': '国创', + '趣味科普人文': '科技', + '运动': '生活', + '连载动画': '番剧', + '野生技术协会': '科技', + '音MAD': '鬼畜', + '音乐选集': '音乐', + '音游': '游戏', + '预告 资讯': '影视', + '预告·资讯': '影视', + '单机联机': '游戏', + '鬼畜调教': '鬼畜', + '演讲• 公开课': '科技', + '国产电影': '电影', + '日本电影': '电影', + '番剧': '番剧', + '国创': '国创', + '鬼畜': '鬼畜', + '电视剧': '电视剧', + '动画': '动画', + '时尚': '时尚', + '娱乐': '娱乐', + '电影': '电影', + '舞蹈': '舞蹈', + '科技': '科技', + '生活': '生活', + '音乐': '音乐', + '纪录片': '纪录片', + '手机平板': '数码', + '电脑装机': '数码', + '影音智能': '数码', + '摄影摄像': '数码', + '风尚标': '时尚', + '电音': '音乐', + '音乐综合': '音乐', + 'MV': '音乐', + '音乐现场': '音乐', + '游戏': '游戏', + 'T台': '时尚', +} From e585c0051280dde711e799272f840ca9d6583c71 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 11 Feb 2019 16:47:35 +0800 Subject: [PATCH 145/469] fix --- biliob_spider/pipelines.py | 5 - ...nes.py.1f305fa5e47c4f7467edd6e4cb280dc2.py | 453 ++++++++++++++++++ biliob_spider/settings.py | 21 +- biliob_spider/spiders/author_auto_add.py | 78 +-- .../spiders/author_update_with_redis.py | 94 ++++ biliob_spider/spiders/strong_focus.py | 103 +--- .../spiders/video_from_kanbilibili.py | 103 +--- biliob_spider/spiders/video_spider.py | 107 +---- biliob_spider/spiders/video_spider_all.py | 183 +------ .../spiders/video_spider_with_redis.py | 109 +++++ biliob_spider/spiders/video_watcher.py | 5 +- util.py | 101 ++++ 12 files changed, 827 insertions(+), 535 deletions(-) create mode 100644 biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py create mode 100644 biliob_spider/spiders/author_update_with_redis.py create mode 100644 biliob_spider/spiders/video_spider_with_redis.py create mode 100644 util.py diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 225814a..38252ba 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -289,11 +289,6 @@ def process_item(self, item, spider): 'level': item['level'], 'cFans': item['c_fans'], 'official': item['official'], - 'cArchive': item['c_archive'], - 'cArticle': item['c_article'], - 'cAttention': item['c_attention'], - 'cArchive_view': item['c_archive_view'], - 'cArticle_view': item['c_article_view'], }, '$push': { 'data': { diff --git a/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py b/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py new file mode 100644 index 0000000..38252ba --- /dev/null +++ b/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py @@ -0,0 +1,453 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html +from pymongo import MongoClient +from db import settings +from db import mysql_connect +import datetime +import logging + + +class StrongPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + + def process_item(self, item, spider): + try: + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'cView': item['current_view'], + 'cFavorite': item['current_favorite'], + 'cDanmaku': item['current_danmaku'], + 'cCoin': 
item['current_coin'], + 'cShare': item['current_share'], + 'cLike': item['current_like'], + 'cDatetime': item['current_datetime'], + 'author': item['author'], + 'subChannel': item['subChannel'], + 'channel': item['channel'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': datetime.datetime.fromtimestamp( + item['datetime']) + }, + '$push': { + 'data': { + '$each': [item['data_video']], + '$position': 0 + } + } + }, True) + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + try: + self.coll = self.db['author'] # 获得collection的句柄 + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'focus': True, + 'sex': item['sex'], + 'name': item['name'], + 'face': item['face'], + 'level': item['level'], + 'cFans': item['c_fans'], + 'official': item['official'], + 'cArchive': item['c_archive'], + 'cArticle': item['c_article'], + 'cAttention': item['c_attention'], + 'cArchive_view': item['c_archive_view'], + 'cArticle_view': item['c_article_view'], + }, + '$push': { + 'data': { + '$each': [item['data_author']], + '$position': 0 + } + } + }, True) + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + return item + + +class VideoPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'cView': item['current_view'], + 'cFavorite': item['current_favorite'], + 'cDanmaku': item['current_danmaku'], + 'cCoin': item['current_coin'], + 'cShare': item['current_share'], + 'cLike': item['current_like'], + 'cDatetime': item['current_datetime'], + 'author': item['author'], + 'subChannel': item['subChannel'], + 'channel': item['channel'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': datetime.datetime.fromtimestamp( + item['datetime']) + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class VideoPipelineFromKan(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'author': item['author'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': item['datetime'] + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class BangumiPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['bangumi'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + 
self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'cover': item['cover'], + # 'isFinish': item['is_finish'], + # 'isStarted': item['is_started'], + 'newest': item['newest_ep_index'], + 'currentPts': item['data']['pts'], + 'currentPlay': item['data']['play'], + # 'squareCover': item['square_cover'], + 'currentWatch': item['data']['watch'], + 'currentReview': item['data']['review'], + 'currentDanmaku': item['data']['danmaku'] + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class DonghuaPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['donghua'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'cover': item['cover'], + # 'isFinish': item['is_finish'], + # 'isStarted': item['is_started'], + 'newest': item['newest_ep_index'], + 'currentPts': item['data']['pts'], + 'currentPlay': item['data']['play'], + # 'squareCover': item['square_cover'], + 'currentWatch': item['data']['watch'], + 'currentReview': item['data']['review'], + 'currentDanmaku': item['data']['danmaku'] + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class SiteInfoPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['site_info'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.insert_one({ + 'region_count': item['region_count'], + 'all_count': item['all_count'], + 'web_online': item['web_online'], + 'play_online': item['play_online'], + 'datetime': datetime.datetime.now() + }) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class AuthorPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'focus': True, + 'sex': item['sex'], + 'name': item['name'], + 'face': item['face'], + 'level': item['level'], + 'cFans': item['c_fans'], + 'official': item['official'], + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class OnlinePipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video_online'] # 获得collection的句柄 + + def process_item(self, item, 
spider): + try: + + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'author': item['author'], + 'channel': item['channel'], + 'subChannel': item['subChannel'], + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class TagPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['tag'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + + self.coll.update_one({ + 'tag_id': item['tag_id'] + }, { + '$set': { + 'tag_name': item['tag_name'], + 'ctime': item['ctime'], + }, + '$addToSet': { + 'use': item['use'], + 'atten': item['atten'], + 'datetime': datetime.datetime.now() + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class VideoAddPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + for each_aid in item['aid']: + self.coll.update_one({ + 'aid': each_aid + }, { + '$set': { + 'aid': each_aid, + 'focus': True + }, + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class AuthorChannelPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'channels': item['channels'] + }, + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class BiliMonthlyRankPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['monthly_rank'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': item['aid'] + }, { + '$addToSet': { + 'pts': item['pts'], + 'datetime': datetime.datetime.now() + }, + '$set': { + 'title': item['title'], + 'author': item['author'], + 'aid': item['aid'], + 'mid': item['mid'], + 'channel': item['channel'], + 'currentPts': item['pts'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index bd07c94..d4eb73f 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -10,6 +10,9 @@ # https://doc.scrapy.org/en/latest/topics/spider-middleware.html import random +from db import redis_connect_string + +REDIS_URL = redis_connect_string # LOG_FILE = "biliob_spider.log" LOG_LEVEL = "WARNING" @@ -28,7 
+31,7 @@ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36' ] # 随机生成user agent -USER_AGENT = random.choice(USER_AGENT_LIST) +USER_AGENT = random.choice(USER_AGENT_LIST) # Obey robots.txt rules @@ -52,28 +55,28 @@ #TELNETCONSOLE_ENABLED = False # Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { +# DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', -#} +# } # Enable or disable spider middlewares # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { +# SPIDER_MIDDLEWARES = { # 'BilibiliRankListSpider.middlewares.BilibiliranklistspiderSpiderMiddleware': 543, -#} +# } # Enable or disable downloader middlewares # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { +# DOWNLOADER_MIDDLEWARES = { # 'BilibiliRankListSpider.middlewares.BilibiliranklistspiderDownloaderMiddleware': 543, -#} +# } # Enable or disable extensions # See https://doc.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { +# EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, -#} +# } # Configure item pipelines # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index a06d075..1475fe6 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -1,4 +1,4 @@ -#coding=utf-8 +# coding=utf-8 import scrapy from mail import mailer from scrapy.http import Request @@ -60,33 +60,49 @@ def parse(self, response): logging.error(error) def detailParse(self, response): - j = json.loads(response.body) - name = j['data']['card']['name'] - mid = j['data']['card']['mid'] - sex = j['data']['card']['sex'] - face = j['data']['card']['face'] - fans = j['data']['card']['fans'] - attention = j['data']['card']['attention'] - level = j['data']['card']['level_info']['current_level'] - official = j['data']['card']['Official']['title'] - archive = j['data']['archive_count'] - article = j['data']['article_count'] - face = j['data']['card']['face'] - item = AuthorItem() - # 粉丝数大于1000才加入 - if int(fans) > 1000: - item['mid'] = int(mid) - item['name'] = name - item['face'] = face - item['official'] = official - item['sex'] = sex - item['focus'] = True - item['level'] = int(level) - item['data'] = { - 'fans': int(fans), - 'attention': int(attention), - 'archive': int(archive), - 'article': int(article), - 'datetime': datetime.datetime.now() - } - yield item + try: + j = json.loads(response.body) + name = j['data']['card']['name'] + mid = j['data']['card']['mid'] + sex = j['data']['card']['sex'] + face = j['data']['card']['face'] + fans = j['data']['card']['fans'] + attention = j['data']['card']['attention'] + level = j['data']['card']['level_info']['current_level'] + official = j['data']['card']['Official']['title'] + archive = j['data']['archive_count'] + article = j['data']['article_count'] + face = j['data']['card']['face'] + item = AuthorItem() + + # 粉丝数大于1000才加入 + if int(fans) > 1000: + item['c_fans'] = int(fans) + item['c_attention'] = int(attention) + item['c_archive'] = int(archive) + item['c_article'] = int(article) + item['mid'] = int(mid) + item['name'] = name + item['face'] = face + item['official'] = official + item['sex'] = sex + item['focus'] = True + item['level'] = int(level) + item['data'] = { + 'fans': int(fans), + 
'attention': int(attention), + 'archive': int(archive), + 'article': int(article), + 'datetime': datetime.datetime.now() + } + yield item + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}".format(response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) diff --git a/biliob_spider/spiders/author_update_with_redis.py b/biliob_spider/spiders/author_update_with_redis.py new file mode 100644 index 0000000..2ca321c --- /dev/null +++ b/biliob_spider/spiders/author_update_with_redis.py @@ -0,0 +1,94 @@ +# coding=utf-8 +import scrapy +from mail import mailer +from scrapy.http import Request +from biliob_spider.items import AuthorItem +import time +import json +import logging +from pymongo import MongoClient +import datetime +from db import settings +from scrapy_redis.spiders import RedisSpider + + +class AuthorUpdateWithRedis(RedisSpider): + name = "AuthorUpdateWithRedis" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.AuthorPipeline': 300 + }, + 'DOWNLOAD_DELAY': 2 + } + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + + def parse(self, response): + try: + j = json.loads(response.body) + name = j['data']['card']['name'] + mid = j['data']['card']['mid'] + sex = j['data']['card']['sex'] + face = j['data']['card']['face'] + fans = j['data']['card']['fans'] + attention = j['data']['card']['attention'] + level = j['data']['card']['level_info']['current_level'] + official = j['data']['card']['Official']['title'] + archive = j['data']['archive_count'] + article = j['data']['article_count'] + face = j['data']['card']['face'] + item = AuthorItem() + item['mid'] = int(mid) + item['name'] = name + item['face'] = face + item['official'] = official + item['sex'] = sex + item['level'] = int(level) + item['data'] = { + 'fans': int(fans), + 'attention': int(attention), + 'archive': int(archive), + 'article': int(article), + 'datetime': datetime.datetime.now() + } + item['c_fans'] = int(fans) + item['c_attention'] = int(attention) + item['c_archive'] = int(archive) + item['c_article'] = int(article) + yield Request( + "https://api.bilibili.com/x/space/upstat?mid={mid}".format( + mid=str(mid)), + meta={'item': item}, + method='GET', + callback=self.parse_view) + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) + + def parse_view(self, response): + j = json.loads(response.body) + archive_view = j['data']['archive']['view'] + article_view = j['data']['article']['view'] + item = response.meta['item'] + item['data']['archiveView'] = archive_view + item['data']['articleView'] = article_view + item['c_archive_view'] = int(archive_view) + item['c_article_view'] = int(article_view) + + yield item diff --git a/biliob_spider/spiders/strong_focus.py b/biliob_spider/spiders/strong_focus.py index 06a267c..e4609ce 100644 --- a/biliob_spider/spiders/strong_focus.py +++ b/biliob_spider/spiders/strong_focus.py @@ -8,108 +8,7 @@ import logging from pymongo import MongoClient 
import datetime - -sub_channel_2_channel = { - 'ASMR': '生活', - 'GMV': '游戏', - 'Korea相关': '娱乐', - 'MAD·AMV': '动画', - 'MMD·3D': '动画', - 'Mugen': '游戏', - 'OP/ED/OST': '音乐', - 'VOCALOID·UTAU': '音乐', - '三次元舞蹈': '舞蹈', - '三次元音乐': '音乐', - '人力VOCALOID': '鬼畜', - '人文·历史': '纪录片', - '健身': '时尚', - '其他': '生活', - '其他国家': '电影', - '军事': '纪录片', - '动物圈': '生活', - '华语电影': '电影', - '单机游戏': '游戏', - '原创音乐': '音乐', - '国产剧': '电视剧', - '国产动画': '国创', - '国产原创相关': '国创', - '宅舞': '舞蹈', - '完结动画': '番剧', - '官方延伸': '番剧', - '布袋戏': '国创', - '广告': '广告', - '影视剪辑': '影视', - '影视杂谈': '影视', - '手工': '生活', - '手机游戏': '游戏', - '搞笑': '生活', - '教程演示': '鬼畜', - '数码': '数码', - '日常': '生活', - '明星': '娱乐', - '星海': '科技', - '服饰': '时尚', - '机械': '科技', - '桌游棋牌': '游戏', - '欧美电影': '电影', - '汽车': '科技', - '游戏': '游戏', - '海外剧': '电视剧', - '演奏': '音乐', - '演讲·公开课': '科技', - '特摄': '影视', - '电子竞技': '游戏', - '短片': '影视', - '短片·手书·配音': '动画', - '社会·美食·旅行': '纪录片', - '科学·探索·自然': '纪录片', - '绘画': '生活', - '综合': '动画', - '综艺': '娱乐', - '网络游戏': '游戏', - '美妆': '时尚', - '美食圈': '生活', - '翻唱': '音乐', - '舞蹈教程': '舞蹈', - '资讯': '国创', - '趣味科普人文': '科技', - '运动': '生活', - '连载动画': '番剧', - '野生技术协会': '科技', - '音MAD': '鬼畜', - '音乐选集': '音乐', - '音游': '游戏', - '预告 资讯': '影视', - '预告·资讯': '影视', - '单机联机': '游戏', - '鬼畜调教': '鬼畜', - '演讲• 公开课': '科技', - '国产电影': '电影', - '日本电影': '电影', - '番剧': '番剧', - '国创': '国创', - '鬼畜': '鬼畜', - '电视剧': '电视剧', - '动画': '动画', - '时尚': '时尚', - '娱乐': '娱乐', - '电影': '电影', - '舞蹈': '舞蹈', - '科技': '科技', - '生活': '生活', - '音乐': '音乐', - '纪录片': '纪录片', - '手机平板': '数码', - '电脑装机': '数码', - '影音智能': '数码', - '摄影摄像': '数码', - '风尚标': '时尚', - '电音': '音乐', - '音乐综合': '音乐', - 'MV': '音乐', - '音乐现场': '音乐', - 'T台': '时尚', -} +from util import sub_channel_2_channel class StrongSpider(scrapy.spiders.Spider): diff --git a/biliob_spider/spiders/video_from_kanbilibili.py b/biliob_spider/spiders/video_from_kanbilibili.py index f7f6b25..b59a958 100644 --- a/biliob_spider/spiders/video_from_kanbilibili.py +++ b/biliob_spider/spiders/video_from_kanbilibili.py @@ -9,108 +9,7 @@ from pymongo import MongoClient from db import settings from mail import mailer -sub_channel_2_channel = { - 'ASMR': '生活', - 'GMV': '游戏', - 'Korea相关': '娱乐', - 'MAD·AMV': '动画', - 'MMD·3D': '动画', - 'Mugen': '游戏', - 'OP/ED/OST': '音乐', - 'VOCALOID·UTAU': '音乐', - '三次元舞蹈': '舞蹈', - '三次元音乐': '音乐', - '人力VOCALOID': '鬼畜', - '人文·历史': '纪录片', - '健身': '时尚', - '其他': '生活', - '其他国家': '电影', - '军事': '纪录片', - '动物圈': '生活', - '华语电影': '电影', - '单机游戏': '游戏', - '原创音乐': '音乐', - '国产剧': '电视剧', - '国产动画': '国创', - '国产原创相关': '国创', - '宅舞': '舞蹈', - '完结动画': '番剧', - '官方延伸': '番剧', - '布袋戏': '国创', - '广告': '广告', - '影视剪辑': '影视', - '影视杂谈': '影视', - '手工': '生活', - '手机游戏': '游戏', - '搞笑': '生活', - '教程演示': '鬼畜', - '数码': '数码', - '日常': '生活', - '明星': '娱乐', - '星海': '科技', - '服饰': '时尚', - '机械': '科技', - '桌游棋牌': '游戏', - '欧美电影': '电影', - '汽车': '科技', - '海外剧': '电视剧', - '演奏': '音乐', - '演讲·公开课': '科技', - '特摄': '影视', - '电子竞技': '游戏', - '短片': '影视', - '短片·手书·配音': '动画', - '社会·美食·旅行': '纪录片', - '科学·探索·自然': '纪录片', - '绘画': '生活', - '综合': '动画', - '综艺': '娱乐', - '网络游戏': '游戏', - '美妆': '时尚', - '美食圈': '生活', - '翻唱': '音乐', - '舞蹈教程': '舞蹈', - '资讯': '国创', - '趣味科普人文': '科技', - '运动': '生活', - '连载动画': '番剧', - '野生技术协会': '科技', - '音MAD': '鬼畜', - '音乐选集': '音乐', - '音游': '游戏', - '预告 资讯': '影视', - '预告·资讯': '影视', - '单机联机': '游戏', - '鬼畜调教': '鬼畜', - '演讲• 公开课': '科技', - '国产电影': '电影', - '日本电影': '电影', - '番剧': '番剧', - '国创': '国创', - '鬼畜': '鬼畜', - '电视剧': '电视剧', - '动画': '动画', - '时尚': '时尚', - '娱乐': '娱乐', - '电影': '电影', - '舞蹈': '舞蹈', - '科技': '科技', - '生活': '生活', - '音乐': '音乐', - '纪录片': '纪录片', - '手机平板': '数码', - '电脑装机': '数码', - '影音智能': '数码', - '摄影摄像': '数码', - '摄影摄像': '数码', - '风尚标': '时尚', - 
'电音': '音乐', - '音乐综合': '音乐', - 'MV': '音乐', - '音乐现场': '音乐', - '游戏': '游戏', - 'T台': '时尚', -} +from util import sub_channel_2_channel class FromKan(scrapy.spiders.Spider): diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index aa54a53..b07b92b 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -9,108 +9,7 @@ import logging from pymongo import MongoClient from db import settings -sub_channel_2_channel = { - 'ASMR': '生活', - 'GMV': '游戏', - 'Korea相关': '娱乐', - 'MAD·AMV': '动画', - 'MMD·3D': '动画', - 'Mugen': '游戏', - 'OP/ED/OST': '音乐', - 'VOCALOID·UTAU': '音乐', - '三次元舞蹈': '舞蹈', - '三次元音乐': '音乐', - '人力VOCALOID': '鬼畜', - '人文·历史': '纪录片', - '健身': '时尚', - '其他': '生活', - '其他国家': '电影', - '军事': '纪录片', - '动物圈': '生活', - '华语电影': '电影', - '单机游戏': '游戏', - '原创音乐': '音乐', - '国产剧': '电视剧', - '国产动画': '国创', - '国产原创相关': '国创', - '宅舞': '舞蹈', - '完结动画': '番剧', - '官方延伸': '番剧', - '布袋戏': '国创', - '广告': '广告', - '影视剪辑': '影视', - '影视杂谈': '影视', - '手工': '生活', - '手机游戏': '游戏', - '搞笑': '生活', - '教程演示': '鬼畜', - '数码': '数码', - '日常': '生活', - '明星': '娱乐', - '星海': '科技', - '服饰': '时尚', - '机械': '科技', - '桌游棋牌': '游戏', - '欧美电影': '电影', - '汽车': '科技', - '海外剧': '电视剧', - '演奏': '音乐', - '演讲·公开课': '科技', - '特摄': '影视', - '电子竞技': '游戏', - '短片': '影视', - '短片·手书·配音': '动画', - '社会·美食·旅行': '纪录片', - '科学·探索·自然': '纪录片', - '绘画': '生活', - '综合': '动画', - '综艺': '娱乐', - '网络游戏': '游戏', - '美妆': '时尚', - '美食圈': '生活', - '翻唱': '音乐', - '舞蹈教程': '舞蹈', - '资讯': '国创', - '趣味科普人文': '科技', - '运动': '生活', - '连载动画': '番剧', - '野生技术协会': '科技', - '音MAD': '鬼畜', - '音乐选集': '音乐', - '音游': '游戏', - '预告 资讯': '影视', - '预告·资讯': '影视', - '单机联机': '游戏', - '鬼畜调教': '鬼畜', - '演讲• 公开课': '科技', - '国产电影': '电影', - '日本电影': '电影', - '番剧': '番剧', - '国创': '国创', - '鬼畜': '鬼畜', - '电视剧': '电视剧', - '动画': '动画', - '时尚': '时尚', - '娱乐': '娱乐', - '电影': '电影', - '舞蹈': '舞蹈', - '科技': '科技', - '生活': '生活', - '音乐': '音乐', - '纪录片': '纪录片', - '手机平板': '数码', - '电脑装机': '数码', - '影音智能': '数码', - '摄影摄像': '数码', - '摄影摄像': '数码', - '风尚标': '时尚', - '电音': '音乐', - '音乐综合': '音乐', - 'MV': '音乐', - '音乐现场': '音乐', - '游戏': '游戏', - 'T台': '时尚', -} +from util import sub_channel_2_channel class VideoSpider(scrapy.spiders.Spider): @@ -134,7 +33,8 @@ def __init__(self): def start_requests(self): # 只需要aid - c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {'aid': 1}) + c = self.coll.find( + {'$or': [{'focus': True}, {'forceFocus': True}]}, {'aid': 1}) x = 0 aid_list = [] for each_doc in c: @@ -179,7 +79,6 @@ def parse(self, response): 'datetime': current_date } - subChannel = d[each_key]['tname'] title = d[each_key]['title'] date = d[each_key]['pubdate'] diff --git a/biliob_spider/spiders/video_spider_all.py b/biliob_spider/spiders/video_spider_all.py index 0900acb..56118be 100644 --- a/biliob_spider/spiders/video_spider_all.py +++ b/biliob_spider/spiders/video_spider_all.py @@ -9,111 +9,11 @@ import logging from pymongo import MongoClient from db import settings +from util import sub_channel_2_channel +from biliob_spider.spiders.video_spider import VideoSpider -sub_channel_2_channel = { - 'ASMR': '生活', - 'GMV': '游戏', - 'Korea相关': '娱乐', - 'MAD·AMV': '动画', - 'MMD·3D': '动画', - 'Mugen': '游戏', - 'OP/ED/OST': '音乐', - 'VOCALOID·UTAU': '音乐', - '三次元舞蹈': '舞蹈', - '三次元音乐': '音乐', - '人力VOCALOID': '鬼畜', - '人文·历史': '纪录片', - '健身': '时尚', - '其他': '生活', - '其他国家': '电影', - '军事': '纪录片', - '动物圈': '生活', - '华语电影': '电影', - '单机游戏': '游戏', - '原创音乐': '音乐', - '国产剧': '电视剧', - '国产动画': '国创', - '国产原创相关': '国创', - '宅舞': '舞蹈', - '完结动画': '番剧', - '官方延伸': '番剧', - '布袋戏': '国创', - '广告': '广告', - '影视剪辑': '影视', - '影视杂谈': '影视', - '手工': 
'生活', - '手机游戏': '游戏', - '搞笑': '生活', - '教程演示': '鬼畜', - '数码': '数码', - '日常': '生活', - '明星': '娱乐', - '星海': '科技', - '服饰': '时尚', - '机械': '科技', - '桌游棋牌': '游戏', - '欧美电影': '电影', - '汽车': '科技', - '游戏': '游戏', - '海外剧': '电视剧', - '演奏': '音乐', - '演讲·公开课': '科技', - '特摄': '影视', - '电子竞技': '游戏', - '短片': '影视', - '短片·手书·配音': '动画', - '社会·美食·旅行': '纪录片', - '科学·探索·自然': '纪录片', - '绘画': '生活', - '综合': '动画', - '综艺': '娱乐', - '网络游戏': '游戏', - '美妆': '时尚', - '美食圈': '生活', - '翻唱': '音乐', - '舞蹈教程': '舞蹈', - '资讯': '国创', - '趣味科普人文': '科技', - '运动': '生活', - '连载动画': '番剧', - '野生技术协会': '科技', - '音MAD': '鬼畜', - '音乐选集': '音乐', - '音游': '游戏', - '预告 资讯': '影视', - '预告·资讯': '影视', - '单机联机': '游戏', - '鬼畜调教': '鬼畜', - '演讲• 公开课': '科技', - '国产电影': '电影', - '日本电影': '电影', - '番剧': '番剧', - '国创': '国创', - '鬼畜': '鬼畜', - '电视剧': '电视剧', - '动画': '动画', - '时尚': '时尚', - '娱乐': '娱乐', - '电影': '电影', - '舞蹈': '舞蹈', - '科技': '科技', - '生活': '生活', - '音乐': '音乐', - '纪录片': '纪录片', - '手机平板': '数码', - '电脑装机': '数码', - '影音智能': '数码', - '摄影摄像': '数码', - '风尚标': '时尚', - '电音': '音乐', - '音乐综合': '音乐', - 'MV': '音乐', - '音乐现场': '音乐', - 'T台': '时尚', -} - -class VideoSpider(scrapy.spiders.Spider): +class VideoSpiderAll(VideoSpider): name = "videoSpiderAll" allowed_domains = ["bilibili.com"] start_urls = [] @@ -153,80 +53,3 @@ def start_requests(self): yield Request( "https://api.bilibili.com/x/article/archives?ids=" + aid_str.rstrip(',')) - - def parse(self, response): - try: - r = json.loads(response.body) - d = r["data"] - keys = list(d.keys()) - for each_key in keys: - - aid = d[each_key]['stat']['aid'] - author = d[each_key]['owner']['name'] - mid = d[each_key]['owner']['mid'] - view = d[each_key]['stat']['view'] - favorite = d[each_key]['stat']['favorite'] - danmaku = d[each_key]['stat']['danmaku'] - coin = d[each_key]['stat']['coin'] - share = d[each_key]['stat']['share'] - like = d[each_key]['stat']['like'] - current_date = datetime.now() - data = { - 'view': view, - 'favorite': favorite, - 'danmaku': danmaku, - 'coin': coin, - 'share': share, - 'like': like, - 'datetime': current_date - } - - - subChannel = d[each_key]['tname'] - title = d[each_key]['title'] - date = d[each_key]['pubdate'] - tid = d[each_key]['tid'] - pic = d[each_key]['pic'] - item = VideoItem() - item['current_view'] = view - item['current_favorite'] = favorite - item['current_danmaku'] = danmaku - item['current_coin'] = coin - item['current_share'] = share - item['current_like'] = like - item['current_datetime'] = current_date - item['aid'] = aid - item['mid'] = mid - item['pic'] = pic - item['author'] = author - item['data'] = data - item['title'] = title - item['subChannel'] = subChannel - item['datetime'] = date - - if subChannel != '': - item['channel'] = sub_channel_2_channel[subChannel] - elif subChannel == '资讯': - if tid == 51: - item['channel'] == '番剧' - if tid == 170: - item['channel'] == '国创' - if tid == 159: - item['channel'] == '娱乐' - else: - item['channel'] = None - yield item - - except Exception as error: - # 出现错误时打印错误日志 - if r['code'] == -404: - return - mailer.send( - to=["604264970@qq.com"], - subject="BiliobSpiderError", - body="{}\n{}\n{}".format(item, response.url, error), - ) - logging.error("视频爬虫在解析时发生错误") - logging.error(item) - logging.error(response.url) - logging.error(error) diff --git a/biliob_spider/spiders/video_spider_with_redis.py b/biliob_spider/spiders/video_spider_with_redis.py new file mode 100644 index 0000000..b4cea54 --- /dev/null +++ b/biliob_spider/spiders/video_spider_with_redis.py @@ -0,0 +1,109 @@ +# coding=utf-8 +import scrapy +from mail import mailer +from scrapy.http import Request 
+from biliob_spider.items import VideoItem +from datetime import datetime +import time +import json +import logging +from pymongo import MongoClient +from db import settings +from util import sub_channel_2_channel +from scrapy_redis.spiders import RedisSpider + + +class VideoSpiderWithRedis(RedisSpider): + name = "VideoSpiderWithRedis" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.VideoPipeline': 300, + } + } + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def parse(self, response): + try: + r = json.loads(response.body) + d = r["data"] + keys = list(d.keys()) + for each_key in keys: + + aid = d[each_key]['stat']['aid'] + author = d[each_key]['owner']['name'] + mid = d[each_key]['owner']['mid'] + view = d[each_key]['stat']['view'] + favorite = d[each_key]['stat']['favorite'] + danmaku = d[each_key]['stat']['danmaku'] + coin = d[each_key]['stat']['coin'] + share = d[each_key]['stat']['share'] + like = d[each_key]['stat']['like'] + current_date = datetime.now() + data = { + 'view': view, + 'favorite': favorite, + 'danmaku': danmaku, + 'coin': coin, + 'share': share, + 'like': like, + 'datetime': current_date + } + + subChannel = d[each_key]['tname'] + title = d[each_key]['title'] + date = d[each_key]['pubdate'] + tid = d[each_key]['tid'] + pic = d[each_key]['pic'] + item = VideoItem() + item['current_view'] = view + item['current_favorite'] = favorite + item['current_danmaku'] = danmaku + item['current_coin'] = coin + item['current_share'] = share + item['current_like'] = like + item['current_datetime'] = current_date + item['aid'] = aid + item['mid'] = mid + item['pic'] = pic + item['author'] = author + item['data'] = data + item['title'] = title + item['subChannel'] = subChannel + item['datetime'] = date + + if subChannel != '': + item['channel'] = sub_channel_2_channel[subChannel] + elif subChannel == '资讯': + if tid == 51: + item['channel'] == '番剧' + if tid == 170: + item['channel'] == '国创' + if tid == 159: + item['channel'] == '娱乐' + else: + item['channel'] = None + yield item + + except Exception as error: + # 出现错误时打印错误日志 + if r['code'] == -404: + return + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(item) + logging.error(response.url) + logging.error(error) diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index e24a865..2ef475b 100644 --- a/biliob_spider/spiders/video_watcher.py +++ b/biliob_spider/spiders/video_watcher.py @@ -1,4 +1,4 @@ -#coding=utf-8 +# coding=utf-8 import scrapy from mail import mailer from scrapy.http import Request @@ -33,7 +33,8 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {'mid': 1}) + c = self.coll.find( + {'$or': [{'focus': True}, {'forceFocus': True}]}, {'mid': 1}) for each_doc in c: yield Request( 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + diff --git a/util.py b/util.py new file mode 100644 index 0000000..4713cf3 --- /dev/null +++ b/util.py @@ -0,0 +1,101 @@ +sub_channel_2_channel = { + 'ASMR': '生活', + 'GMV': '游戏', + 'Korea相关': 
'娱乐', + 'MAD·AMV': '动画', + 'MMD·3D': '动画', + 'Mugen': '游戏', + 'OP/ED/OST': '音乐', + 'VOCALOID·UTAU': '音乐', + '三次元舞蹈': '舞蹈', + '三次元音乐': '音乐', + '人力VOCALOID': '鬼畜', + '人文·历史': '纪录片', + '健身': '时尚', + '其他': '生活', + '其他国家': '电影', + '军事': '纪录片', + '动物圈': '生活', + '华语电影': '电影', + '单机游戏': '游戏', + '原创音乐': '音乐', + '国产剧': '电视剧', + '国产动画': '国创', + '国产原创相关': '国创', + '宅舞': '舞蹈', + '完结动画': '番剧', + '官方延伸': '番剧', + '布袋戏': '国创', + '广告': '广告', + '影视剪辑': '影视', + '影视杂谈': '影视', + '手工': '生活', + '手机游戏': '游戏', + '搞笑': '生活', + '教程演示': '鬼畜', + '数码': '数码', + '日常': '生活', + '明星': '娱乐', + '星海': '科技', + '服饰': '时尚', + '机械': '科技', + '桌游棋牌': '游戏', + '欧美电影': '电影', + '汽车': '科技', + '海外剧': '电视剧', + '演奏': '音乐', + '演讲·公开课': '科技', + '特摄': '影视', + '电子竞技': '游戏', + '短片': '影视', + '短片·手书·配音': '动画', + '社会·美食·旅行': '纪录片', + '科学·探索·自然': '纪录片', + '绘画': '生活', + '综合': '动画', + '综艺': '娱乐', + '网络游戏': '游戏', + '美妆': '时尚', + '美食圈': '生活', + '翻唱': '音乐', + '舞蹈教程': '舞蹈', + '资讯': '国创', + '趣味科普人文': '科技', + '运动': '生活', + '连载动画': '番剧', + '野生技术协会': '科技', + '音MAD': '鬼畜', + '音乐选集': '音乐', + '音游': '游戏', + '预告 资讯': '影视', + '预告·资讯': '影视', + '单机联机': '游戏', + '鬼畜调教': '鬼畜', + '演讲• 公开课': '科技', + '国产电影': '电影', + '日本电影': '电影', + '番剧': '番剧', + '国创': '国创', + '鬼畜': '鬼畜', + '电视剧': '电视剧', + '动画': '动画', + '时尚': '时尚', + '娱乐': '娱乐', + '电影': '电影', + '舞蹈': '舞蹈', + '科技': '科技', + '生活': '生活', + '音乐': '音乐', + '纪录片': '纪录片', + '手机平板': '数码', + '电脑装机': '数码', + '影音智能': '数码', + '摄影摄像': '数码', + '风尚标': '时尚', + '电音': '音乐', + '音乐综合': '音乐', + 'MV': '音乐', + '音乐现场': '音乐', + '游戏': '游戏', + 'T台': '时尚', +} From 2c2ae523ac12912f11de0a026a23431ffcbb944f Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 11 Feb 2019 16:47:35 +0800 Subject: [PATCH 146/469] fix --- biliob_spider/pipelines.py | 5 - ...nes.py.1f305fa5e47c4f7467edd6e4cb280dc2.py | 453 ++++++++++++++++++ biliob_spider/settings.py | 21 +- biliob_spider/spiders/author_auto_add.py | 78 +-- .../spiders/author_update_with_redis.py | 94 ++++ biliob_spider/spiders/strong_focus.py | 103 +--- .../spiders/video_from_kanbilibili.py | 103 +--- biliob_spider/spiders/video_spider.py | 107 +---- biliob_spider/spiders/video_spider_all.py | 183 +------ .../spiders/video_spider_with_redis.py | 109 +++++ biliob_spider/spiders/video_watcher.py | 5 +- util.py | 101 ++++ 12 files changed, 827 insertions(+), 535 deletions(-) create mode 100644 biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py create mode 100644 biliob_spider/spiders/author_update_with_redis.py create mode 100644 biliob_spider/spiders/video_spider_with_redis.py create mode 100644 util.py diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 225814a..38252ba 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -289,11 +289,6 @@ def process_item(self, item, spider): 'level': item['level'], 'cFans': item['c_fans'], 'official': item['official'], - 'cArchive': item['c_archive'], - 'cArticle': item['c_article'], - 'cAttention': item['c_attention'], - 'cArchive_view': item['c_archive_view'], - 'cArticle_view': item['c_article_view'], }, '$push': { 'data': { diff --git a/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py b/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py new file mode 100644 index 0000000..38252ba --- /dev/null +++ b/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py @@ -0,0 +1,453 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html +from pymongo 
import MongoClient +from db import settings +from db import mysql_connect +import datetime +import logging + + +class StrongPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + + def process_item(self, item, spider): + try: + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'cView': item['current_view'], + 'cFavorite': item['current_favorite'], + 'cDanmaku': item['current_danmaku'], + 'cCoin': item['current_coin'], + 'cShare': item['current_share'], + 'cLike': item['current_like'], + 'cDatetime': item['current_datetime'], + 'author': item['author'], + 'subChannel': item['subChannel'], + 'channel': item['channel'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': datetime.datetime.fromtimestamp( + item['datetime']) + }, + '$push': { + 'data': { + '$each': [item['data_video']], + '$position': 0 + } + } + }, True) + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + try: + self.coll = self.db['author'] # 获得collection的句柄 + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'focus': True, + 'sex': item['sex'], + 'name': item['name'], + 'face': item['face'], + 'level': item['level'], + 'cFans': item['c_fans'], + 'official': item['official'], + 'cArchive': item['c_archive'], + 'cArticle': item['c_article'], + 'cAttention': item['c_attention'], + 'cArchive_view': item['c_archive_view'], + 'cArticle_view': item['c_article_view'], + }, + '$push': { + 'data': { + '$each': [item['data_author']], + '$position': 0 + } + } + }, True) + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + return item + + +class VideoPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'cView': item['current_view'], + 'cFavorite': item['current_favorite'], + 'cDanmaku': item['current_danmaku'], + 'cCoin': item['current_coin'], + 'cShare': item['current_share'], + 'cLike': item['current_like'], + 'cDatetime': item['current_datetime'], + 'author': item['author'], + 'subChannel': item['subChannel'], + 'channel': item['channel'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': datetime.datetime.fromtimestamp( + item['datetime']) + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class VideoPipelineFromKan(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'author': item['author'], + 'mid': item['mid'], + 'pic': 
item['pic'], + 'title': item['title'], + 'datetime': item['datetime'] + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class BangumiPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['bangumi'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'cover': item['cover'], + # 'isFinish': item['is_finish'], + # 'isStarted': item['is_started'], + 'newest': item['newest_ep_index'], + 'currentPts': item['data']['pts'], + 'currentPlay': item['data']['play'], + # 'squareCover': item['square_cover'], + 'currentWatch': item['data']['watch'], + 'currentReview': item['data']['review'], + 'currentDanmaku': item['data']['danmaku'] + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class DonghuaPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['donghua'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'cover': item['cover'], + # 'isFinish': item['is_finish'], + # 'isStarted': item['is_started'], + 'newest': item['newest_ep_index'], + 'currentPts': item['data']['pts'], + 'currentPlay': item['data']['play'], + # 'squareCover': item['square_cover'], + 'currentWatch': item['data']['watch'], + 'currentReview': item['data']['review'], + 'currentDanmaku': item['data']['danmaku'] + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class SiteInfoPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['site_info'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.insert_one({ + 'region_count': item['region_count'], + 'all_count': item['all_count'], + 'web_online': item['web_online'], + 'play_online': item['play_online'], + 'datetime': datetime.datetime.now() + }) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class AuthorPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'focus': True, + 'sex': item['sex'], + 'name': item['name'], + 'face': item['face'], + 
'level': item['level'], + 'cFans': item['c_fans'], + 'official': item['official'], + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class OnlinePipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video_online'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'author': item['author'], + 'channel': item['channel'], + 'subChannel': item['subChannel'], + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class TagPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['tag'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + + self.coll.update_one({ + 'tag_id': item['tag_id'] + }, { + '$set': { + 'tag_name': item['tag_name'], + 'ctime': item['ctime'], + }, + '$addToSet': { + 'use': item['use'], + 'atten': item['atten'], + 'datetime': datetime.datetime.now() + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class VideoAddPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + for each_aid in item['aid']: + self.coll.update_one({ + 'aid': each_aid + }, { + '$set': { + 'aid': each_aid, + 'focus': True + }, + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class AuthorChannelPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'channels': item['channels'] + }, + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class BiliMonthlyRankPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['monthly_rank'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': item['aid'] + }, { + '$addToSet': { + 'pts': item['pts'], + 'datetime': datetime.datetime.now() + }, + '$set': { + 'title': item['title'], + 
'author': item['author'], + 'aid': item['aid'], + 'mid': item['mid'], + 'channel': item['channel'], + 'currentPts': item['pts'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index bd07c94..d4eb73f 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -10,6 +10,9 @@ # https://doc.scrapy.org/en/latest/topics/spider-middleware.html import random +from db import redis_connect_string + +REDIS_URL = redis_connect_string # LOG_FILE = "biliob_spider.log" LOG_LEVEL = "WARNING" @@ -28,7 +31,7 @@ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36' ] # 随机生成user agent -USER_AGENT = random.choice(USER_AGENT_LIST) +USER_AGENT = random.choice(USER_AGENT_LIST) # Obey robots.txt rules @@ -52,28 +55,28 @@ #TELNETCONSOLE_ENABLED = False # Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { +# DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', -#} +# } # Enable or disable spider middlewares # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { +# SPIDER_MIDDLEWARES = { # 'BilibiliRankListSpider.middlewares.BilibiliranklistspiderSpiderMiddleware': 543, -#} +# } # Enable or disable downloader middlewares # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { +# DOWNLOADER_MIDDLEWARES = { # 'BilibiliRankListSpider.middlewares.BilibiliranklistspiderDownloaderMiddleware': 543, -#} +# } # Enable or disable extensions # See https://doc.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { +# EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, -#} +# } # Configure item pipelines # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/biliob_spider/spiders/author_auto_add.py b/biliob_spider/spiders/author_auto_add.py index a06d075..1475fe6 100644 --- a/biliob_spider/spiders/author_auto_add.py +++ b/biliob_spider/spiders/author_auto_add.py @@ -1,4 +1,4 @@ -#coding=utf-8 +# coding=utf-8 import scrapy from mail import mailer from scrapy.http import Request @@ -60,33 +60,49 @@ def parse(self, response): logging.error(error) def detailParse(self, response): - j = json.loads(response.body) - name = j['data']['card']['name'] - mid = j['data']['card']['mid'] - sex = j['data']['card']['sex'] - face = j['data']['card']['face'] - fans = j['data']['card']['fans'] - attention = j['data']['card']['attention'] - level = j['data']['card']['level_info']['current_level'] - official = j['data']['card']['Official']['title'] - archive = j['data']['archive_count'] - article = j['data']['article_count'] - face = j['data']['card']['face'] - item = AuthorItem() - # 粉丝数大于1000才加入 - if int(fans) > 1000: - item['mid'] = int(mid) - item['name'] = name - item['face'] = face - item['official'] = official - item['sex'] = sex - item['focus'] = True - item['level'] = int(level) - item['data'] = { - 'fans': int(fans), - 'attention': int(attention), - 'archive': int(archive), - 'article': int(article), - 'datetime': datetime.datetime.now() - } - yield item + try: + j = json.loads(response.body) + name = j['data']['card']['name'] + mid = j['data']['card']['mid'] + sex = j['data']['card']['sex'] + face = j['data']['card']['face'] + fans = j['data']['card']['fans'] + attention = 
j['data']['card']['attention'] + level = j['data']['card']['level_info']['current_level'] + official = j['data']['card']['Official']['title'] + archive = j['data']['archive_count'] + article = j['data']['article_count'] + face = j['data']['card']['face'] + item = AuthorItem() + + # 粉丝数大于1000才加入 + if int(fans) > 1000: + item['c_fans'] = int(fans) + item['c_attention'] = int(attention) + item['c_archive'] = int(archive) + item['c_article'] = int(article) + item['mid'] = int(mid) + item['name'] = name + item['face'] = face + item['official'] = official + item['sex'] = sex + item['focus'] = True + item['level'] = int(level) + item['data'] = { + 'fans': int(fans), + 'attention': int(attention), + 'archive': int(archive), + 'article': int(article), + 'datetime': datetime.datetime.now() + } + yield item + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}".format(response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) diff --git a/biliob_spider/spiders/author_update_with_redis.py b/biliob_spider/spiders/author_update_with_redis.py new file mode 100644 index 0000000..2ca321c --- /dev/null +++ b/biliob_spider/spiders/author_update_with_redis.py @@ -0,0 +1,94 @@ +# coding=utf-8 +import scrapy +from mail import mailer +from scrapy.http import Request +from biliob_spider.items import AuthorItem +import time +import json +import logging +from pymongo import MongoClient +import datetime +from db import settings +from scrapy_redis.spiders import RedisSpider + + +class AuthorUpdateWithRedis(RedisSpider): + name = "AuthorUpdateWithRedis" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.AuthorPipeline': 300 + }, + 'DOWNLOAD_DELAY': 2 + } + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + + def parse(self, response): + try: + j = json.loads(response.body) + name = j['data']['card']['name'] + mid = j['data']['card']['mid'] + sex = j['data']['card']['sex'] + face = j['data']['card']['face'] + fans = j['data']['card']['fans'] + attention = j['data']['card']['attention'] + level = j['data']['card']['level_info']['current_level'] + official = j['data']['card']['Official']['title'] + archive = j['data']['archive_count'] + article = j['data']['article_count'] + face = j['data']['card']['face'] + item = AuthorItem() + item['mid'] = int(mid) + item['name'] = name + item['face'] = face + item['official'] = official + item['sex'] = sex + item['level'] = int(level) + item['data'] = { + 'fans': int(fans), + 'attention': int(attention), + 'archive': int(archive), + 'article': int(article), + 'datetime': datetime.datetime.now() + } + item['c_fans'] = int(fans) + item['c_attention'] = int(attention) + item['c_archive'] = int(archive) + item['c_article'] = int(article) + yield Request( + "https://api.bilibili.com/x/space/upstat?mid={mid}".format( + mid=str(mid)), + meta={'item': item}, + method='GET', + callback=self.parse_view) + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + 
logging.error(error) + + def parse_view(self, response): + j = json.loads(response.body) + archive_view = j['data']['archive']['view'] + article_view = j['data']['article']['view'] + item = response.meta['item'] + item['data']['archiveView'] = archive_view + item['data']['articleView'] = article_view + item['c_archive_view'] = int(archive_view) + item['c_article_view'] = int(article_view) + + yield item diff --git a/biliob_spider/spiders/strong_focus.py b/biliob_spider/spiders/strong_focus.py index 06a267c..e4609ce 100644 --- a/biliob_spider/spiders/strong_focus.py +++ b/biliob_spider/spiders/strong_focus.py @@ -8,108 +8,7 @@ import logging from pymongo import MongoClient import datetime - -sub_channel_2_channel = { - 'ASMR': '生活', - 'GMV': '游戏', - 'Korea相关': '娱乐', - 'MAD·AMV': '动画', - 'MMD·3D': '动画', - 'Mugen': '游戏', - 'OP/ED/OST': '音乐', - 'VOCALOID·UTAU': '音乐', - '三次元舞蹈': '舞蹈', - '三次元音乐': '音乐', - '人力VOCALOID': '鬼畜', - '人文·历史': '纪录片', - '健身': '时尚', - '其他': '生活', - '其他国家': '电影', - '军事': '纪录片', - '动物圈': '生活', - '华语电影': '电影', - '单机游戏': '游戏', - '原创音乐': '音乐', - '国产剧': '电视剧', - '国产动画': '国创', - '国产原创相关': '国创', - '宅舞': '舞蹈', - '完结动画': '番剧', - '官方延伸': '番剧', - '布袋戏': '国创', - '广告': '广告', - '影视剪辑': '影视', - '影视杂谈': '影视', - '手工': '生活', - '手机游戏': '游戏', - '搞笑': '生活', - '教程演示': '鬼畜', - '数码': '数码', - '日常': '生活', - '明星': '娱乐', - '星海': '科技', - '服饰': '时尚', - '机械': '科技', - '桌游棋牌': '游戏', - '欧美电影': '电影', - '汽车': '科技', - '游戏': '游戏', - '海外剧': '电视剧', - '演奏': '音乐', - '演讲·公开课': '科技', - '特摄': '影视', - '电子竞技': '游戏', - '短片': '影视', - '短片·手书·配音': '动画', - '社会·美食·旅行': '纪录片', - '科学·探索·自然': '纪录片', - '绘画': '生活', - '综合': '动画', - '综艺': '娱乐', - '网络游戏': '游戏', - '美妆': '时尚', - '美食圈': '生活', - '翻唱': '音乐', - '舞蹈教程': '舞蹈', - '资讯': '国创', - '趣味科普人文': '科技', - '运动': '生活', - '连载动画': '番剧', - '野生技术协会': '科技', - '音MAD': '鬼畜', - '音乐选集': '音乐', - '音游': '游戏', - '预告 资讯': '影视', - '预告·资讯': '影视', - '单机联机': '游戏', - '鬼畜调教': '鬼畜', - '演讲• 公开课': '科技', - '国产电影': '电影', - '日本电影': '电影', - '番剧': '番剧', - '国创': '国创', - '鬼畜': '鬼畜', - '电视剧': '电视剧', - '动画': '动画', - '时尚': '时尚', - '娱乐': '娱乐', - '电影': '电影', - '舞蹈': '舞蹈', - '科技': '科技', - '生活': '生活', - '音乐': '音乐', - '纪录片': '纪录片', - '手机平板': '数码', - '电脑装机': '数码', - '影音智能': '数码', - '摄影摄像': '数码', - '风尚标': '时尚', - '电音': '音乐', - '音乐综合': '音乐', - 'MV': '音乐', - '音乐现场': '音乐', - 'T台': '时尚', -} +from util import sub_channel_2_channel class StrongSpider(scrapy.spiders.Spider): diff --git a/biliob_spider/spiders/video_from_kanbilibili.py b/biliob_spider/spiders/video_from_kanbilibili.py index f7f6b25..b59a958 100644 --- a/biliob_spider/spiders/video_from_kanbilibili.py +++ b/biliob_spider/spiders/video_from_kanbilibili.py @@ -9,108 +9,7 @@ from pymongo import MongoClient from db import settings from mail import mailer -sub_channel_2_channel = { - 'ASMR': '生活', - 'GMV': '游戏', - 'Korea相关': '娱乐', - 'MAD·AMV': '动画', - 'MMD·3D': '动画', - 'Mugen': '游戏', - 'OP/ED/OST': '音乐', - 'VOCALOID·UTAU': '音乐', - '三次元舞蹈': '舞蹈', - '三次元音乐': '音乐', - '人力VOCALOID': '鬼畜', - '人文·历史': '纪录片', - '健身': '时尚', - '其他': '生活', - '其他国家': '电影', - '军事': '纪录片', - '动物圈': '生活', - '华语电影': '电影', - '单机游戏': '游戏', - '原创音乐': '音乐', - '国产剧': '电视剧', - '国产动画': '国创', - '国产原创相关': '国创', - '宅舞': '舞蹈', - '完结动画': '番剧', - '官方延伸': '番剧', - '布袋戏': '国创', - '广告': '广告', - '影视剪辑': '影视', - '影视杂谈': '影视', - '手工': '生活', - '手机游戏': '游戏', - '搞笑': '生活', - '教程演示': '鬼畜', - '数码': '数码', - '日常': '生活', - '明星': '娱乐', - '星海': '科技', - '服饰': '时尚', - '机械': '科技', - '桌游棋牌': '游戏', - '欧美电影': '电影', - '汽车': '科技', - '海外剧': '电视剧', - '演奏': '音乐', - '演讲·公开课': '科技', - '特摄': '影视', - '电子竞技': '游戏', - '短片': '影视', - '短片·手书·配音': '动画', - 
'社会·美食·旅行': '纪录片', - '科学·探索·自然': '纪录片', - '绘画': '生活', - '综合': '动画', - '综艺': '娱乐', - '网络游戏': '游戏', - '美妆': '时尚', - '美食圈': '生活', - '翻唱': '音乐', - '舞蹈教程': '舞蹈', - '资讯': '国创', - '趣味科普人文': '科技', - '运动': '生活', - '连载动画': '番剧', - '野生技术协会': '科技', - '音MAD': '鬼畜', - '音乐选集': '音乐', - '音游': '游戏', - '预告 资讯': '影视', - '预告·资讯': '影视', - '单机联机': '游戏', - '鬼畜调教': '鬼畜', - '演讲• 公开课': '科技', - '国产电影': '电影', - '日本电影': '电影', - '番剧': '番剧', - '国创': '国创', - '鬼畜': '鬼畜', - '电视剧': '电视剧', - '动画': '动画', - '时尚': '时尚', - '娱乐': '娱乐', - '电影': '电影', - '舞蹈': '舞蹈', - '科技': '科技', - '生活': '生活', - '音乐': '音乐', - '纪录片': '纪录片', - '手机平板': '数码', - '电脑装机': '数码', - '影音智能': '数码', - '摄影摄像': '数码', - '摄影摄像': '数码', - '风尚标': '时尚', - '电音': '音乐', - '音乐综合': '音乐', - 'MV': '音乐', - '音乐现场': '音乐', - '游戏': '游戏', - 'T台': '时尚', -} +from util import sub_channel_2_channel class FromKan(scrapy.spiders.Spider): diff --git a/biliob_spider/spiders/video_spider.py b/biliob_spider/spiders/video_spider.py index aa54a53..b07b92b 100644 --- a/biliob_spider/spiders/video_spider.py +++ b/biliob_spider/spiders/video_spider.py @@ -9,108 +9,7 @@ import logging from pymongo import MongoClient from db import settings -sub_channel_2_channel = { - 'ASMR': '生活', - 'GMV': '游戏', - 'Korea相关': '娱乐', - 'MAD·AMV': '动画', - 'MMD·3D': '动画', - 'Mugen': '游戏', - 'OP/ED/OST': '音乐', - 'VOCALOID·UTAU': '音乐', - '三次元舞蹈': '舞蹈', - '三次元音乐': '音乐', - '人力VOCALOID': '鬼畜', - '人文·历史': '纪录片', - '健身': '时尚', - '其他': '生活', - '其他国家': '电影', - '军事': '纪录片', - '动物圈': '生活', - '华语电影': '电影', - '单机游戏': '游戏', - '原创音乐': '音乐', - '国产剧': '电视剧', - '国产动画': '国创', - '国产原创相关': '国创', - '宅舞': '舞蹈', - '完结动画': '番剧', - '官方延伸': '番剧', - '布袋戏': '国创', - '广告': '广告', - '影视剪辑': '影视', - '影视杂谈': '影视', - '手工': '生活', - '手机游戏': '游戏', - '搞笑': '生活', - '教程演示': '鬼畜', - '数码': '数码', - '日常': '生活', - '明星': '娱乐', - '星海': '科技', - '服饰': '时尚', - '机械': '科技', - '桌游棋牌': '游戏', - '欧美电影': '电影', - '汽车': '科技', - '海外剧': '电视剧', - '演奏': '音乐', - '演讲·公开课': '科技', - '特摄': '影视', - '电子竞技': '游戏', - '短片': '影视', - '短片·手书·配音': '动画', - '社会·美食·旅行': '纪录片', - '科学·探索·自然': '纪录片', - '绘画': '生活', - '综合': '动画', - '综艺': '娱乐', - '网络游戏': '游戏', - '美妆': '时尚', - '美食圈': '生活', - '翻唱': '音乐', - '舞蹈教程': '舞蹈', - '资讯': '国创', - '趣味科普人文': '科技', - '运动': '生活', - '连载动画': '番剧', - '野生技术协会': '科技', - '音MAD': '鬼畜', - '音乐选集': '音乐', - '音游': '游戏', - '预告 资讯': '影视', - '预告·资讯': '影视', - '单机联机': '游戏', - '鬼畜调教': '鬼畜', - '演讲• 公开课': '科技', - '国产电影': '电影', - '日本电影': '电影', - '番剧': '番剧', - '国创': '国创', - '鬼畜': '鬼畜', - '电视剧': '电视剧', - '动画': '动画', - '时尚': '时尚', - '娱乐': '娱乐', - '电影': '电影', - '舞蹈': '舞蹈', - '科技': '科技', - '生活': '生活', - '音乐': '音乐', - '纪录片': '纪录片', - '手机平板': '数码', - '电脑装机': '数码', - '影音智能': '数码', - '摄影摄像': '数码', - '摄影摄像': '数码', - '风尚标': '时尚', - '电音': '音乐', - '音乐综合': '音乐', - 'MV': '音乐', - '音乐现场': '音乐', - '游戏': '游戏', - 'T台': '时尚', -} +from util import sub_channel_2_channel class VideoSpider(scrapy.spiders.Spider): @@ -134,7 +33,8 @@ def __init__(self): def start_requests(self): # 只需要aid - c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {'aid': 1}) + c = self.coll.find( + {'$or': [{'focus': True}, {'forceFocus': True}]}, {'aid': 1}) x = 0 aid_list = [] for each_doc in c: @@ -179,7 +79,6 @@ def parse(self, response): 'datetime': current_date } - subChannel = d[each_key]['tname'] title = d[each_key]['title'] date = d[each_key]['pubdate'] diff --git a/biliob_spider/spiders/video_spider_all.py b/biliob_spider/spiders/video_spider_all.py index 0900acb..56118be 100644 --- a/biliob_spider/spiders/video_spider_all.py +++ b/biliob_spider/spiders/video_spider_all.py @@ -9,111 +9,11 @@ import logging from pymongo 
import MongoClient from db import settings +from util import sub_channel_2_channel +from biliob_spider.spiders.video_spider import VideoSpider -sub_channel_2_channel = { - 'ASMR': '生活', - 'GMV': '游戏', - 'Korea相关': '娱乐', - 'MAD·AMV': '动画', - 'MMD·3D': '动画', - 'Mugen': '游戏', - 'OP/ED/OST': '音乐', - 'VOCALOID·UTAU': '音乐', - '三次元舞蹈': '舞蹈', - '三次元音乐': '音乐', - '人力VOCALOID': '鬼畜', - '人文·历史': '纪录片', - '健身': '时尚', - '其他': '生活', - '其他国家': '电影', - '军事': '纪录片', - '动物圈': '生活', - '华语电影': '电影', - '单机游戏': '游戏', - '原创音乐': '音乐', - '国产剧': '电视剧', - '国产动画': '国创', - '国产原创相关': '国创', - '宅舞': '舞蹈', - '完结动画': '番剧', - '官方延伸': '番剧', - '布袋戏': '国创', - '广告': '广告', - '影视剪辑': '影视', - '影视杂谈': '影视', - '手工': '生活', - '手机游戏': '游戏', - '搞笑': '生活', - '教程演示': '鬼畜', - '数码': '数码', - '日常': '生活', - '明星': '娱乐', - '星海': '科技', - '服饰': '时尚', - '机械': '科技', - '桌游棋牌': '游戏', - '欧美电影': '电影', - '汽车': '科技', - '游戏': '游戏', - '海外剧': '电视剧', - '演奏': '音乐', - '演讲·公开课': '科技', - '特摄': '影视', - '电子竞技': '游戏', - '短片': '影视', - '短片·手书·配音': '动画', - '社会·美食·旅行': '纪录片', - '科学·探索·自然': '纪录片', - '绘画': '生活', - '综合': '动画', - '综艺': '娱乐', - '网络游戏': '游戏', - '美妆': '时尚', - '美食圈': '生活', - '翻唱': '音乐', - '舞蹈教程': '舞蹈', - '资讯': '国创', - '趣味科普人文': '科技', - '运动': '生活', - '连载动画': '番剧', - '野生技术协会': '科技', - '音MAD': '鬼畜', - '音乐选集': '音乐', - '音游': '游戏', - '预告 资讯': '影视', - '预告·资讯': '影视', - '单机联机': '游戏', - '鬼畜调教': '鬼畜', - '演讲• 公开课': '科技', - '国产电影': '电影', - '日本电影': '电影', - '番剧': '番剧', - '国创': '国创', - '鬼畜': '鬼畜', - '电视剧': '电视剧', - '动画': '动画', - '时尚': '时尚', - '娱乐': '娱乐', - '电影': '电影', - '舞蹈': '舞蹈', - '科技': '科技', - '生活': '生活', - '音乐': '音乐', - '纪录片': '纪录片', - '手机平板': '数码', - '电脑装机': '数码', - '影音智能': '数码', - '摄影摄像': '数码', - '风尚标': '时尚', - '电音': '音乐', - '音乐综合': '音乐', - 'MV': '音乐', - '音乐现场': '音乐', - 'T台': '时尚', -} - -class VideoSpider(scrapy.spiders.Spider): +class VideoSpiderAll(VideoSpider): name = "videoSpiderAll" allowed_domains = ["bilibili.com"] start_urls = [] @@ -153,80 +53,3 @@ def start_requests(self): yield Request( "https://api.bilibili.com/x/article/archives?ids=" + aid_str.rstrip(',')) - - def parse(self, response): - try: - r = json.loads(response.body) - d = r["data"] - keys = list(d.keys()) - for each_key in keys: - - aid = d[each_key]['stat']['aid'] - author = d[each_key]['owner']['name'] - mid = d[each_key]['owner']['mid'] - view = d[each_key]['stat']['view'] - favorite = d[each_key]['stat']['favorite'] - danmaku = d[each_key]['stat']['danmaku'] - coin = d[each_key]['stat']['coin'] - share = d[each_key]['stat']['share'] - like = d[each_key]['stat']['like'] - current_date = datetime.now() - data = { - 'view': view, - 'favorite': favorite, - 'danmaku': danmaku, - 'coin': coin, - 'share': share, - 'like': like, - 'datetime': current_date - } - - - subChannel = d[each_key]['tname'] - title = d[each_key]['title'] - date = d[each_key]['pubdate'] - tid = d[each_key]['tid'] - pic = d[each_key]['pic'] - item = VideoItem() - item['current_view'] = view - item['current_favorite'] = favorite - item['current_danmaku'] = danmaku - item['current_coin'] = coin - item['current_share'] = share - item['current_like'] = like - item['current_datetime'] = current_date - item['aid'] = aid - item['mid'] = mid - item['pic'] = pic - item['author'] = author - item['data'] = data - item['title'] = title - item['subChannel'] = subChannel - item['datetime'] = date - - if subChannel != '': - item['channel'] = sub_channel_2_channel[subChannel] - elif subChannel == '资讯': - if tid == 51: - item['channel'] == '番剧' - if tid == 170: - item['channel'] == '国创' - if tid == 159: - item['channel'] == '娱乐' - else: - 
item['channel'] = None - yield item - - except Exception as error: - # 出现错误时打印错误日志 - if r['code'] == -404: - return - mailer.send( - to=["604264970@qq.com"], - subject="BiliobSpiderError", - body="{}\n{}\n{}".format(item, response.url, error), - ) - logging.error("视频爬虫在解析时发生错误") - logging.error(item) - logging.error(response.url) - logging.error(error) diff --git a/biliob_spider/spiders/video_spider_with_redis.py b/biliob_spider/spiders/video_spider_with_redis.py new file mode 100644 index 0000000..b4cea54 --- /dev/null +++ b/biliob_spider/spiders/video_spider_with_redis.py @@ -0,0 +1,109 @@ +# coding=utf-8 +import scrapy +from mail import mailer +from scrapy.http import Request +from biliob_spider.items import VideoItem +from datetime import datetime +import time +import json +import logging +from pymongo import MongoClient +from db import settings +from util import sub_channel_2_channel +from scrapy_redis.spiders import RedisSpider + + +class VideoSpiderWithRedis(RedisSpider): + name = "VideoSpiderWithRedis" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.VideoPipeline': 300, + } + } + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def parse(self, response): + try: + r = json.loads(response.body) + d = r["data"] + keys = list(d.keys()) + for each_key in keys: + + aid = d[each_key]['stat']['aid'] + author = d[each_key]['owner']['name'] + mid = d[each_key]['owner']['mid'] + view = d[each_key]['stat']['view'] + favorite = d[each_key]['stat']['favorite'] + danmaku = d[each_key]['stat']['danmaku'] + coin = d[each_key]['stat']['coin'] + share = d[each_key]['stat']['share'] + like = d[each_key]['stat']['like'] + current_date = datetime.now() + data = { + 'view': view, + 'favorite': favorite, + 'danmaku': danmaku, + 'coin': coin, + 'share': share, + 'like': like, + 'datetime': current_date + } + + subChannel = d[each_key]['tname'] + title = d[each_key]['title'] + date = d[each_key]['pubdate'] + tid = d[each_key]['tid'] + pic = d[each_key]['pic'] + item = VideoItem() + item['current_view'] = view + item['current_favorite'] = favorite + item['current_danmaku'] = danmaku + item['current_coin'] = coin + item['current_share'] = share + item['current_like'] = like + item['current_datetime'] = current_date + item['aid'] = aid + item['mid'] = mid + item['pic'] = pic + item['author'] = author + item['data'] = data + item['title'] = title + item['subChannel'] = subChannel + item['datetime'] = date + + if subChannel != '': + item['channel'] = sub_channel_2_channel[subChannel] + elif subChannel == '资讯': + if tid == 51: + item['channel'] == '番剧' + if tid == 170: + item['channel'] == '国创' + if tid == 159: + item['channel'] == '娱乐' + else: + item['channel'] = None + yield item + + except Exception as error: + # 出现错误时打印错误日志 + if r['code'] == -404: + return + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(item) + logging.error(response.url) + logging.error(error) diff --git a/biliob_spider/spiders/video_watcher.py b/biliob_spider/spiders/video_watcher.py index e24a865..2ef475b 100644 --- a/biliob_spider/spiders/video_watcher.py +++ 
b/biliob_spider/spiders/video_watcher.py @@ -1,4 +1,4 @@ -#coding=utf-8 +# coding=utf-8 import scrapy from mail import mailer from scrapy.http import Request @@ -33,7 +33,8 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 def start_requests(self): - c = self.coll.find({'$or':[{'focus':True},{'forceFocus':True}]}, {'mid': 1}) + c = self.coll.find( + {'$or': [{'focus': True}, {'forceFocus': True}]}, {'mid': 1}) for each_doc in c: yield Request( 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + diff --git a/util.py b/util.py new file mode 100644 index 0000000..4713cf3 --- /dev/null +++ b/util.py @@ -0,0 +1,101 @@ +sub_channel_2_channel = { + 'ASMR': '生活', + 'GMV': '游戏', + 'Korea相关': '娱乐', + 'MAD·AMV': '动画', + 'MMD·3D': '动画', + 'Mugen': '游戏', + 'OP/ED/OST': '音乐', + 'VOCALOID·UTAU': '音乐', + '三次元舞蹈': '舞蹈', + '三次元音乐': '音乐', + '人力VOCALOID': '鬼畜', + '人文·历史': '纪录片', + '健身': '时尚', + '其他': '生活', + '其他国家': '电影', + '军事': '纪录片', + '动物圈': '生活', + '华语电影': '电影', + '单机游戏': '游戏', + '原创音乐': '音乐', + '国产剧': '电视剧', + '国产动画': '国创', + '国产原创相关': '国创', + '宅舞': '舞蹈', + '完结动画': '番剧', + '官方延伸': '番剧', + '布袋戏': '国创', + '广告': '广告', + '影视剪辑': '影视', + '影视杂谈': '影视', + '手工': '生活', + '手机游戏': '游戏', + '搞笑': '生活', + '教程演示': '鬼畜', + '数码': '数码', + '日常': '生活', + '明星': '娱乐', + '星海': '科技', + '服饰': '时尚', + '机械': '科技', + '桌游棋牌': '游戏', + '欧美电影': '电影', + '汽车': '科技', + '海外剧': '电视剧', + '演奏': '音乐', + '演讲·公开课': '科技', + '特摄': '影视', + '电子竞技': '游戏', + '短片': '影视', + '短片·手书·配音': '动画', + '社会·美食·旅行': '纪录片', + '科学·探索·自然': '纪录片', + '绘画': '生活', + '综合': '动画', + '综艺': '娱乐', + '网络游戏': '游戏', + '美妆': '时尚', + '美食圈': '生活', + '翻唱': '音乐', + '舞蹈教程': '舞蹈', + '资讯': '国创', + '趣味科普人文': '科技', + '运动': '生活', + '连载动画': '番剧', + '野生技术协会': '科技', + '音MAD': '鬼畜', + '音乐选集': '音乐', + '音游': '游戏', + '预告 资讯': '影视', + '预告·资讯': '影视', + '单机联机': '游戏', + '鬼畜调教': '鬼畜', + '演讲• 公开课': '科技', + '国产电影': '电影', + '日本电影': '电影', + '番剧': '番剧', + '国创': '国创', + '鬼畜': '鬼畜', + '电视剧': '电视剧', + '动画': '动画', + '时尚': '时尚', + '娱乐': '娱乐', + '电影': '电影', + '舞蹈': '舞蹈', + '科技': '科技', + '生活': '生活', + '音乐': '音乐', + '纪录片': '纪录片', + '手机平板': '数码', + '电脑装机': '数码', + '影音智能': '数码', + '摄影摄像': '数码', + '风尚标': '时尚', + '电音': '音乐', + '音乐综合': '音乐', + 'MV': '音乐', + '音乐现场': '音乐', + '游戏': '游戏', + 'T台': '时尚', +} From 88de7b733d0ad47fc21a4084449cc2b19b75b038 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 12 Feb 2019 23:47:33 +0800 Subject: [PATCH 147/469] feature: caculate author's rank --- biliob_analyzer/author_rank.py | 77 ++++++++++++++++++++++++++++++++++ run_analyzer.py | 1 + 2 files changed, 78 insertions(+) create mode 100644 biliob_analyzer/author_rank.py diff --git a/biliob_analyzer/author_rank.py b/biliob_analyzer/author_rank.py new file mode 100644 index 0000000..9200a40 --- /dev/null +++ b/biliob_analyzer/author_rank.py @@ -0,0 +1,77 @@ +from db import settings +from db import db +import datetime +import logging +from pymongo import DESCENDING +coll = db['author'] # 获得collection的句柄 + +logging.basicConfig(level=logging.INFO, + format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') +logger = logging.getLogger(__name__) + +logger.info("开始计算作者粉丝排名") +i = 1 +authors = coll.find({}, {'mid': 1, 'rank': 1, 'cFans': 1}).batch_size( + 20).sort('cFans', DESCENDING) +for each_author in authors: + # 如果没有data 直接下一个 + if 'cFans' in each_author: + if 'rank' in each_author: + rank = each_author['rank'] + rank['fansRank'] = i + else: + rank = { + 'fansRank': i + } + i += 1 + coll.update_one({'mid': each_author['mid']}, { + '$set': { + 'rank': rank, + } + }) + pass + +logger.info("开始计算作者播放排名") 
+authors = coll.find({}, {'mid': 1, 'rank': 1, 'cArchive_view': 1}).batch_size( + 20).sort('cArchive_view', DESCENDING) +for each_author in authors: + # 如果没有data 直接下一个 + if 'cArchive_view' in each_author: + if 'rank' in each_author: + rank = each_author['rank'] + rank['archiveViewRank'] = i + else: + rank = { + 'archiveViewRank': i + } + i += 1 + coll.update_one({'mid': each_author['mid']}, { + '$set': { + 'rank': rank, + } + }) + pass + +logger.info("开始计算作者专栏排名") +i = 1 +authors = coll.find({}, {'mid': 1, 'rank': 1, 'cArticle_view': 1}).batch_size( + 20).sort('cArticle_view', DESCENDING) +for each_author in authors: + # 如果没有data 直接下一个 + if 'cArticle_view' in each_author: + if 'rank' in each_author: + rank = each_author['rank'] + rank['articleViewRank'] = i + else: + rank = { + 'articleViewRank': i + } + i += 1 + coll.update_one({'mid': each_author['mid']}, { + '$set': { + 'rank': rank, + } + }) + pass + +logger.info("计算作者排名结束") diff --git a/run_analyzer.py b/run_analyzer.py index 1d56976..a8b171d 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -2,6 +2,7 @@ from biliob_analyzer.video_analyzer import VideoAnalyzer import biliob_analyzer.author_rate_caculate import biliob_analyzer.author_fans_watcher +import biliob_analyzer.author_rank author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 8679569e69fdf64e6b3cca8be806e60a753e6be4 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 12 Feb 2019 23:47:33 +0800 Subject: [PATCH 148/469] feature: caculate author's rank --- biliob_analyzer/author_rank.py | 77 ++++++++++++++++++++++++++++++++++ run_analyzer.py | 1 + 2 files changed, 78 insertions(+) create mode 100644 biliob_analyzer/author_rank.py diff --git a/biliob_analyzer/author_rank.py b/biliob_analyzer/author_rank.py new file mode 100644 index 0000000..9200a40 --- /dev/null +++ b/biliob_analyzer/author_rank.py @@ -0,0 +1,77 @@ +from db import settings +from db import db +import datetime +import logging +from pymongo import DESCENDING +coll = db['author'] # 获得collection的句柄 + +logging.basicConfig(level=logging.INFO, + format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') +logger = logging.getLogger(__name__) + +logger.info("开始计算作者粉丝排名") +i = 1 +authors = coll.find({}, {'mid': 1, 'rank': 1, 'cFans': 1}).batch_size( + 20).sort('cFans', DESCENDING) +for each_author in authors: + # 如果没有data 直接下一个 + if 'cFans' in each_author: + if 'rank' in each_author: + rank = each_author['rank'] + rank['fansRank'] = i + else: + rank = { + 'fansRank': i + } + i += 1 + coll.update_one({'mid': each_author['mid']}, { + '$set': { + 'rank': rank, + } + }) + pass + +logger.info("开始计算作者播放排名") +authors = coll.find({}, {'mid': 1, 'rank': 1, 'cArchive_view': 1}).batch_size( + 20).sort('cArchive_view', DESCENDING) +for each_author in authors: + # 如果没有data 直接下一个 + if 'cArchive_view' in each_author: + if 'rank' in each_author: + rank = each_author['rank'] + rank['archiveViewRank'] = i + else: + rank = { + 'archiveViewRank': i + } + i += 1 + coll.update_one({'mid': each_author['mid']}, { + '$set': { + 'rank': rank, + } + }) + pass + +logger.info("开始计算作者专栏排名") +i = 1 +authors = coll.find({}, {'mid': 1, 'rank': 1, 'cArticle_view': 1}).batch_size( + 20).sort('cArticle_view', DESCENDING) +for each_author in authors: + # 如果没有data 直接下一个 + if 'cArticle_view' in each_author: + if 'rank' in each_author: + rank = each_author['rank'] + rank['articleViewRank'] = i + else: + rank = { + 'articleViewRank': i + } + i += 1 + coll.update_one({'mid': each_author['mid']}, { + '$set': { + 'rank': rank, + } + }) + pass 
+ +logger.info("计算作者排名结束") diff --git a/run_analyzer.py b/run_analyzer.py index 1d56976..a8b171d 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -2,6 +2,7 @@ from biliob_analyzer.video_analyzer import VideoAnalyzer import biliob_analyzer.author_rate_caculate import biliob_analyzer.author_fans_watcher +import biliob_analyzer.author_rank author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 1dd444ec6319909ea38f71f77a05fb086d69c853 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 12 Feb 2019 23:47:33 +0800 Subject: [PATCH 149/469] feature: caculate author's rank --- biliob_analyzer/author_rank.py | 77 ++++++++++++++++++++++++++++++++++ run_analyzer.py | 1 + 2 files changed, 78 insertions(+) create mode 100644 biliob_analyzer/author_rank.py diff --git a/biliob_analyzer/author_rank.py b/biliob_analyzer/author_rank.py new file mode 100644 index 0000000..9200a40 --- /dev/null +++ b/biliob_analyzer/author_rank.py @@ -0,0 +1,77 @@ +from db import settings +from db import db +import datetime +import logging +from pymongo import DESCENDING +coll = db['author'] # 获得collection的句柄 + +logging.basicConfig(level=logging.INFO, + format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') +logger = logging.getLogger(__name__) + +logger.info("开始计算作者粉丝排名") +i = 1 +authors = coll.find({}, {'mid': 1, 'rank': 1, 'cFans': 1}).batch_size( + 20).sort('cFans', DESCENDING) +for each_author in authors: + # 如果没有data 直接下一个 + if 'cFans' in each_author: + if 'rank' in each_author: + rank = each_author['rank'] + rank['fansRank'] = i + else: + rank = { + 'fansRank': i + } + i += 1 + coll.update_one({'mid': each_author['mid']}, { + '$set': { + 'rank': rank, + } + }) + pass + +logger.info("开始计算作者播放排名") +authors = coll.find({}, {'mid': 1, 'rank': 1, 'cArchive_view': 1}).batch_size( + 20).sort('cArchive_view', DESCENDING) +for each_author in authors: + # 如果没有data 直接下一个 + if 'cArchive_view' in each_author: + if 'rank' in each_author: + rank = each_author['rank'] + rank['archiveViewRank'] = i + else: + rank = { + 'archiveViewRank': i + } + i += 1 + coll.update_one({'mid': each_author['mid']}, { + '$set': { + 'rank': rank, + } + }) + pass + +logger.info("开始计算作者专栏排名") +i = 1 +authors = coll.find({}, {'mid': 1, 'rank': 1, 'cArticle_view': 1}).batch_size( + 20).sort('cArticle_view', DESCENDING) +for each_author in authors: + # 如果没有data 直接下一个 + if 'cArticle_view' in each_author: + if 'rank' in each_author: + rank = each_author['rank'] + rank['articleViewRank'] = i + else: + rank = { + 'articleViewRank': i + } + i += 1 + coll.update_one({'mid': each_author['mid']}, { + '$set': { + 'rank': rank, + } + }) + pass + +logger.info("计算作者排名结束") diff --git a/run_analyzer.py b/run_analyzer.py index 1d56976..a8b171d 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -2,6 +2,7 @@ from biliob_analyzer.video_analyzer import VideoAnalyzer import biliob_analyzer.author_rate_caculate import biliob_analyzer.author_fans_watcher +import biliob_analyzer.author_rank author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 4f60517fe226999a4c4d77be63117e15280af0c0 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 13 Feb 2019 14:38:43 +0800 Subject: [PATCH 150/469] feature: use logger --- biliob_analyzer/author_analyzer.py | 27 ++++++++-------- biliob_analyzer/author_rate_caculate.py | 8 ++++- biliob_analyzer/video_analyzer.py | 41 ++++++++++++++++--------- biliob_analyzer/video_rank.py | 39 +++++++++++++++++++++++ 4 files changed, 87 insertions(+), 28 deletions(-) create mode 100644 biliob_analyzer/video_rank.py 
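Note on the author_rank.py script introduced in PATCH 147-149 above: the counter `i` is reset to 1 before the fans loop and before the article-view loop, but not before the archive-view loop, so archive-view ranks continue counting from wherever the fans ranking stopped. The refactor in PATCH 153 further below avoids this by resetting the counter for every key. A minimal per-key ranking sketch follows, for illustration only: the helper name compute_rank is hypothetical, and it writes a dotted $set path for brevity instead of rewriting the whole rank subdocument as the script above does.

from pymongo import DESCENDING

def compute_rank(coll, key, rank_field):
    # Illustrative helper (hypothetical name): rank documents by one counter field.
    i = 1  # reset for every key so each ranking starts at 1
    cursor = coll.find({key: {'$exists': 1}}, {'mid': 1, key: 1}).sort(key, DESCENDING)
    for author in cursor:
        coll.update_one({'mid': author['mid']},
                        {'$set': {'rank.' + rank_field: i}})
        i += 1

# Usage, assuming the same collection handle as in the script above:
# compute_rank(db['author'], 'cFans', 'fansRank')
# compute_rank(db['author'], 'cArchive_view', 'archiveViewRank')
# compute_rank(db['author'], 'cArticle_view', 'articleViewRank')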
diff --git a/biliob_analyzer/author_analyzer.py b/biliob_analyzer/author_analyzer.py index 5835229..0dec5d4 100644 --- a/biliob_analyzer/author_analyzer.py +++ b/biliob_analyzer/author_analyzer.py @@ -1,16 +1,24 @@ from db import settings -from pymongo import MongoClient +from pymongo import MongoClient from datetime import datetime from datetime import timedelta +import logging + +logging.basicConfig(level=logging.INFO, + format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') +logger = logging.getLogger(__name__) + + class AuthorAnalyzer(object): def __init__(self): # 链接mongoDB self.client = MongoClient(settings['MINGO_HOST'], 27017) # 数据库登录需要帐号密码 self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) + settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['author'] # 获得collection的句柄 + def author_filter(self): pre_fans = -1 c_fans = -1 @@ -19,7 +27,7 @@ def author_filter(self): c_date = datetime count_unfocus = 0 count_focus = 0 - for each_doc in self.coll.find({'focus':True}): + for each_doc in self.coll.find({'focus': True}): flag_cool = 0 if 'data' in each_doc: each_doc['data'].reverse() @@ -54,18 +62,13 @@ def author_filter(self): if focus: count_focus += 1 - print("√ 持续追踪:"+each_doc['name']) - self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':True}}) else: count_unfocus += 1 - print("× 不再追踪:"+each_doc['name']) - self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':False}}) pre_fans = -1 c_fans = -1 - print("· 本轮筛选结果:") - print("× 不再追踪总数:"+str(count_unfocus)) - print("√ 持续追踪总数:"+str(count_focus)) - + logger.info("· 本轮筛选结果:") + logger.info("× 不再追踪总数:"+str(count_unfocus)) + logger.info("√ 持续追踪总数:"+str(count_focus)) + def fans_variation(self): pass - diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 2cb7acb..26cbbb1 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -1,8 +1,14 @@ from db import settings from db import db import datetime +import logging + +logging.basicConfig(level=logging.INFO, + format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') +logger = logging.getLogger(__name__) + coll = db['author'] # 获得collection的句柄 -print('开始计算粉丝增速') +logger.info('开始计算粉丝增速') for each_author in coll.find().batch_size(8): rate = [] i = 0 diff --git a/biliob_analyzer/video_analyzer.py b/biliob_analyzer/video_analyzer.py index 92d58db..25a41f7 100644 --- a/biliob_analyzer/video_analyzer.py +++ b/biliob_analyzer/video_analyzer.py @@ -1,16 +1,24 @@ from db import settings -from pymongo import MongoClient +from pymongo import MongoClient from datetime import datetime from datetime import timedelta +import logging + +logging.basicConfig(level=logging.INFO, + format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') +logger = logging.getLogger(__name__) + + class VideoAnalyzer(object): def __init__(self): # 链接mongoDB self.client = MongoClient(settings['MINGO_HOST'], 27017) # 数据库登录需要帐号密码 self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) + settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['video'] # 获得collection的句柄 + def video_filter(self): pre_view = -1 c_view = -1 @@ -20,14 +28,14 @@ def video_filter(self): count_delete = 0 count_unfocus = 0 count_focus = 0 - for each_doc in self.coll.find({'focus':True}): + for each_doc in self.coll.find({'focus': True}): live_time = 0 delete = False focus = True if 'data' in each_doc: 
each_doc['data'].reverse() for each_data in each_doc['data']: - + if pre_view == -1: pre_view = each_data['view'] pre_date = each_data['datetime'] @@ -37,7 +45,7 @@ def video_filter(self): if pre_date + delta > c_date: continue - live_time +=1 + live_time += 1 rate = (c_view-pre_view) pre_view = c_view pre_date = c_date @@ -57,19 +65,22 @@ def video_filter(self): delete = False if delete: count_delete += 1 - print("! 删除追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data'])-1]['view'])) - self.coll.delete_one({'aid':each_doc['aid']}) + logger.info("! 删除追踪:"+each_doc['title']+' 当前播放:'+str( + each_doc['data'][len(each_doc['data'])-1]['view'])) + self.coll.delete_one({'aid': each_doc['aid']}) elif focus: count_focus += 1 - print("√ 持续追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data'])-1]['view'])) + logger.info("√ 持续追踪:"+each_doc['title']+' 当前播放:'+str( + each_doc['data'][len(each_doc['data'])-1]['view'])) else: count_unfocus += 1 - print("× 不再追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data'])-1]['view'])) - self.coll.update_one({'aid':each_doc['aid']},{'$set':{'focus':False}}) + logger.info("× 不再追踪:"+each_doc['title']+' 当前播放:'+str( + each_doc['data'][len(each_doc['data'])-1]['view'])) + self.coll.update_one({'aid': each_doc['aid']}, { + '$set': {'focus': False}}) pre_view = -1 c_view = -1 - print("· 本轮筛选结果:") - print("! 删除辣鸡总数:"+str(count_delete)) - print("× 不再追踪总数:"+str(count_unfocus)) - print("√ 持续追踪总数:"+str(count_focus)) - + logger.info("· 本轮筛选结果:") + logger.info("! 删除辣鸡总数:"+str(count_delete)) + logger.info("× 不再追踪总数:"+str(count_unfocus)) + logger.info("√ 持续追踪总数:"+str(count_focus)) diff --git a/biliob_analyzer/video_rank.py b/biliob_analyzer/video_rank.py new file mode 100644 index 0000000..3b1d679 --- /dev/null +++ b/biliob_analyzer/video_rank.py @@ -0,0 +1,39 @@ +from db import settings +from db import db +import datetime +import logging +from pymongo import DESCENDING +coll = db['video'] # 获得collection的句柄 + +logging.basicConfig(level=logging.INFO, + format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') +logger = logging.getLogger(__name__) + +logger.info("开始计算视频数据排名") + +i = 1 + +keys = ['cView', 'cLike', 'cDanmaku', 'cFavorite', 'cCoin', 'cShare'] +for each_key in keys: + rank_key = each_key[1:] + 'Rank' + logger.info("开始计算视频{}排名".format(each_key)) + videos = coll.find({}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( + 20).sort(each_key, DESCENDING) + for each_video in videos: + # 如果没有data 直接下一个 + if each_key in each_video: + if 'rank' in each_video: + rank = each_video['rank'] + rank[rank_key] = i + else: + rank = { + rank_key: i + } + i += 1 + coll.update_one({'aid': each_video['aid']}, { + '$set': { + 'rank': rank, + } + }) + + logger.info("完成计算视频数据排名") From faa66ff3cd5ea5a69a58aedcb1bdc726131d86b0 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 13 Feb 2019 14:38:43 +0800 Subject: [PATCH 151/469] feature: use logger --- biliob_analyzer/author_analyzer.py | 27 ++++++++-------- biliob_analyzer/author_rate_caculate.py | 8 ++++- biliob_analyzer/video_analyzer.py | 41 ++++++++++++++++--------- biliob_analyzer/video_rank.py | 39 +++++++++++++++++++++++ 4 files changed, 87 insertions(+), 28 deletions(-) create mode 100644 biliob_analyzer/video_rank.py diff --git a/biliob_analyzer/author_analyzer.py b/biliob_analyzer/author_analyzer.py index 5835229..0dec5d4 100644 --- a/biliob_analyzer/author_analyzer.py +++ b/biliob_analyzer/author_analyzer.py @@ -1,16 +1,24 @@ from db import settings -from pymongo import 
MongoClient +from pymongo import MongoClient from datetime import datetime from datetime import timedelta +import logging + +logging.basicConfig(level=logging.INFO, + format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') +logger = logging.getLogger(__name__) + + class AuthorAnalyzer(object): def __init__(self): # 链接mongoDB self.client = MongoClient(settings['MINGO_HOST'], 27017) # 数据库登录需要帐号密码 self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) + settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['author'] # 获得collection的句柄 + def author_filter(self): pre_fans = -1 c_fans = -1 @@ -19,7 +27,7 @@ def author_filter(self): c_date = datetime count_unfocus = 0 count_focus = 0 - for each_doc in self.coll.find({'focus':True}): + for each_doc in self.coll.find({'focus': True}): flag_cool = 0 if 'data' in each_doc: each_doc['data'].reverse() @@ -54,18 +62,13 @@ def author_filter(self): if focus: count_focus += 1 - print("√ 持续追踪:"+each_doc['name']) - self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':True}}) else: count_unfocus += 1 - print("× 不再追踪:"+each_doc['name']) - self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':False}}) pre_fans = -1 c_fans = -1 - print("· 本轮筛选结果:") - print("× 不再追踪总数:"+str(count_unfocus)) - print("√ 持续追踪总数:"+str(count_focus)) - + logger.info("· 本轮筛选结果:") + logger.info("× 不再追踪总数:"+str(count_unfocus)) + logger.info("√ 持续追踪总数:"+str(count_focus)) + def fans_variation(self): pass - diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 2cb7acb..26cbbb1 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -1,8 +1,14 @@ from db import settings from db import db import datetime +import logging + +logging.basicConfig(level=logging.INFO, + format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') +logger = logging.getLogger(__name__) + coll = db['author'] # 获得collection的句柄 -print('开始计算粉丝增速') +logger.info('开始计算粉丝增速') for each_author in coll.find().batch_size(8): rate = [] i = 0 diff --git a/biliob_analyzer/video_analyzer.py b/biliob_analyzer/video_analyzer.py index 92d58db..25a41f7 100644 --- a/biliob_analyzer/video_analyzer.py +++ b/biliob_analyzer/video_analyzer.py @@ -1,16 +1,24 @@ from db import settings -from pymongo import MongoClient +from pymongo import MongoClient from datetime import datetime from datetime import timedelta +import logging + +logging.basicConfig(level=logging.INFO, + format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') +logger = logging.getLogger(__name__) + + class VideoAnalyzer(object): def __init__(self): # 链接mongoDB self.client = MongoClient(settings['MINGO_HOST'], 27017) # 数据库登录需要帐号密码 self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) + settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['video'] # 获得collection的句柄 + def video_filter(self): pre_view = -1 c_view = -1 @@ -20,14 +28,14 @@ def video_filter(self): count_delete = 0 count_unfocus = 0 count_focus = 0 - for each_doc in self.coll.find({'focus':True}): + for each_doc in self.coll.find({'focus': True}): live_time = 0 delete = False focus = True if 'data' in each_doc: each_doc['data'].reverse() for each_data in each_doc['data']: - + if pre_view == -1: pre_view = each_data['view'] pre_date = each_data['datetime'] @@ -37,7 +45,7 @@ def video_filter(self): if pre_date + delta > c_date: continue - live_time +=1 + live_time += 1 rate = 
(c_view-pre_view) pre_view = c_view pre_date = c_date @@ -57,19 +65,22 @@ def video_filter(self): delete = False if delete: count_delete += 1 - print("! 删除追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data'])-1]['view'])) - self.coll.delete_one({'aid':each_doc['aid']}) + logger.info("! 删除追踪:"+each_doc['title']+' 当前播放:'+str( + each_doc['data'][len(each_doc['data'])-1]['view'])) + self.coll.delete_one({'aid': each_doc['aid']}) elif focus: count_focus += 1 - print("√ 持续追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data'])-1]['view'])) + logger.info("√ 持续追踪:"+each_doc['title']+' 当前播放:'+str( + each_doc['data'][len(each_doc['data'])-1]['view'])) else: count_unfocus += 1 - print("× 不再追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data'])-1]['view'])) - self.coll.update_one({'aid':each_doc['aid']},{'$set':{'focus':False}}) + logger.info("× 不再追踪:"+each_doc['title']+' 当前播放:'+str( + each_doc['data'][len(each_doc['data'])-1]['view'])) + self.coll.update_one({'aid': each_doc['aid']}, { + '$set': {'focus': False}}) pre_view = -1 c_view = -1 - print("· 本轮筛选结果:") - print("! 删除辣鸡总数:"+str(count_delete)) - print("× 不再追踪总数:"+str(count_unfocus)) - print("√ 持续追踪总数:"+str(count_focus)) - + logger.info("· 本轮筛选结果:") + logger.info("! 删除辣鸡总数:"+str(count_delete)) + logger.info("× 不再追踪总数:"+str(count_unfocus)) + logger.info("√ 持续追踪总数:"+str(count_focus)) diff --git a/biliob_analyzer/video_rank.py b/biliob_analyzer/video_rank.py new file mode 100644 index 0000000..3b1d679 --- /dev/null +++ b/biliob_analyzer/video_rank.py @@ -0,0 +1,39 @@ +from db import settings +from db import db +import datetime +import logging +from pymongo import DESCENDING +coll = db['video'] # 获得collection的句柄 + +logging.basicConfig(level=logging.INFO, + format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') +logger = logging.getLogger(__name__) + +logger.info("开始计算视频数据排名") + +i = 1 + +keys = ['cView', 'cLike', 'cDanmaku', 'cFavorite', 'cCoin', 'cShare'] +for each_key in keys: + rank_key = each_key[1:] + 'Rank' + logger.info("开始计算视频{}排名".format(each_key)) + videos = coll.find({}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( + 20).sort(each_key, DESCENDING) + for each_video in videos: + # 如果没有data 直接下一个 + if each_key in each_video: + if 'rank' in each_video: + rank = each_video['rank'] + rank[rank_key] = i + else: + rank = { + rank_key: i + } + i += 1 + coll.update_one({'aid': each_video['aid']}, { + '$set': { + 'rank': rank, + } + }) + + logger.info("完成计算视频数据排名") From cfff07a6d0c6abf69ca57d3ee1f6d2fc7c11779a Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 13 Feb 2019 14:38:43 +0800 Subject: [PATCH 152/469] feature: use logger --- biliob_analyzer/author_analyzer.py | 27 ++++++++-------- biliob_analyzer/author_rate_caculate.py | 8 ++++- biliob_analyzer/video_analyzer.py | 41 ++++++++++++++++--------- biliob_analyzer/video_rank.py | 39 +++++++++++++++++++++++ 4 files changed, 87 insertions(+), 28 deletions(-) create mode 100644 biliob_analyzer/video_rank.py diff --git a/biliob_analyzer/author_analyzer.py b/biliob_analyzer/author_analyzer.py index 5835229..0dec5d4 100644 --- a/biliob_analyzer/author_analyzer.py +++ b/biliob_analyzer/author_analyzer.py @@ -1,16 +1,24 @@ from db import settings -from pymongo import MongoClient +from pymongo import MongoClient from datetime import datetime from datetime import timedelta +import logging + +logging.basicConfig(level=logging.INFO, + format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') +logger = logging.getLogger(__name__) + + 
class AuthorAnalyzer(object): def __init__(self): # 链接mongoDB self.client = MongoClient(settings['MINGO_HOST'], 27017) # 数据库登录需要帐号密码 self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) + settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['author'] # 获得collection的句柄 + def author_filter(self): pre_fans = -1 c_fans = -1 @@ -19,7 +27,7 @@ def author_filter(self): c_date = datetime count_unfocus = 0 count_focus = 0 - for each_doc in self.coll.find({'focus':True}): + for each_doc in self.coll.find({'focus': True}): flag_cool = 0 if 'data' in each_doc: each_doc['data'].reverse() @@ -54,18 +62,13 @@ def author_filter(self): if focus: count_focus += 1 - print("√ 持续追踪:"+each_doc['name']) - self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':True}}) else: count_unfocus += 1 - print("× 不再追踪:"+each_doc['name']) - self.coll.update_one({'mid':each_doc['mid']},{'$set':{'focus':False}}) pre_fans = -1 c_fans = -1 - print("· 本轮筛选结果:") - print("× 不再追踪总数:"+str(count_unfocus)) - print("√ 持续追踪总数:"+str(count_focus)) - + logger.info("· 本轮筛选结果:") + logger.info("× 不再追踪总数:"+str(count_unfocus)) + logger.info("√ 持续追踪总数:"+str(count_focus)) + def fans_variation(self): pass - diff --git a/biliob_analyzer/author_rate_caculate.py b/biliob_analyzer/author_rate_caculate.py index 2cb7acb..26cbbb1 100644 --- a/biliob_analyzer/author_rate_caculate.py +++ b/biliob_analyzer/author_rate_caculate.py @@ -1,8 +1,14 @@ from db import settings from db import db import datetime +import logging + +logging.basicConfig(level=logging.INFO, + format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') +logger = logging.getLogger(__name__) + coll = db['author'] # 获得collection的句柄 -print('开始计算粉丝增速') +logger.info('开始计算粉丝增速') for each_author in coll.find().batch_size(8): rate = [] i = 0 diff --git a/biliob_analyzer/video_analyzer.py b/biliob_analyzer/video_analyzer.py index 92d58db..25a41f7 100644 --- a/biliob_analyzer/video_analyzer.py +++ b/biliob_analyzer/video_analyzer.py @@ -1,16 +1,24 @@ from db import settings -from pymongo import MongoClient +from pymongo import MongoClient from datetime import datetime from datetime import timedelta +import logging + +logging.basicConfig(level=logging.INFO, + format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') +logger = logging.getLogger(__name__) + + class VideoAnalyzer(object): def __init__(self): # 链接mongoDB self.client = MongoClient(settings['MINGO_HOST'], 27017) # 数据库登录需要帐号密码 self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) + settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['video'] # 获得collection的句柄 + def video_filter(self): pre_view = -1 c_view = -1 @@ -20,14 +28,14 @@ def video_filter(self): count_delete = 0 count_unfocus = 0 count_focus = 0 - for each_doc in self.coll.find({'focus':True}): + for each_doc in self.coll.find({'focus': True}): live_time = 0 delete = False focus = True if 'data' in each_doc: each_doc['data'].reverse() for each_data in each_doc['data']: - + if pre_view == -1: pre_view = each_data['view'] pre_date = each_data['datetime'] @@ -37,7 +45,7 @@ def video_filter(self): if pre_date + delta > c_date: continue - live_time +=1 + live_time += 1 rate = (c_view-pre_view) pre_view = c_view pre_date = c_date @@ -57,19 +65,22 @@ def video_filter(self): delete = False if delete: count_delete += 1 - print("! 
删除追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data'])-1]['view'])) - self.coll.delete_one({'aid':each_doc['aid']}) + logger.info("! 删除追踪:"+each_doc['title']+' 当前播放:'+str( + each_doc['data'][len(each_doc['data'])-1]['view'])) + self.coll.delete_one({'aid': each_doc['aid']}) elif focus: count_focus += 1 - print("√ 持续追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data'])-1]['view'])) + logger.info("√ 持续追踪:"+each_doc['title']+' 当前播放:'+str( + each_doc['data'][len(each_doc['data'])-1]['view'])) else: count_unfocus += 1 - print("× 不再追踪:"+each_doc['title']+' 当前播放:'+str(each_doc['data'][len(each_doc['data'])-1]['view'])) - self.coll.update_one({'aid':each_doc['aid']},{'$set':{'focus':False}}) + logger.info("× 不再追踪:"+each_doc['title']+' 当前播放:'+str( + each_doc['data'][len(each_doc['data'])-1]['view'])) + self.coll.update_one({'aid': each_doc['aid']}, { + '$set': {'focus': False}}) pre_view = -1 c_view = -1 - print("· 本轮筛选结果:") - print("! 删除辣鸡总数:"+str(count_delete)) - print("× 不再追踪总数:"+str(count_unfocus)) - print("√ 持续追踪总数:"+str(count_focus)) - + logger.info("· 本轮筛选结果:") + logger.info("! 删除辣鸡总数:"+str(count_delete)) + logger.info("× 不再追踪总数:"+str(count_unfocus)) + logger.info("√ 持续追踪总数:"+str(count_focus)) diff --git a/biliob_analyzer/video_rank.py b/biliob_analyzer/video_rank.py new file mode 100644 index 0000000..3b1d679 --- /dev/null +++ b/biliob_analyzer/video_rank.py @@ -0,0 +1,39 @@ +from db import settings +from db import db +import datetime +import logging +from pymongo import DESCENDING +coll = db['video'] # 获得collection的句柄 + +logging.basicConfig(level=logging.INFO, + format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') +logger = logging.getLogger(__name__) + +logger.info("开始计算视频数据排名") + +i = 1 + +keys = ['cView', 'cLike', 'cDanmaku', 'cFavorite', 'cCoin', 'cShare'] +for each_key in keys: + rank_key = each_key[1:] + 'Rank' + logger.info("开始计算视频{}排名".format(each_key)) + videos = coll.find({}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( + 20).sort(each_key, DESCENDING) + for each_video in videos: + # 如果没有data 直接下一个 + if each_key in each_video: + if 'rank' in each_video: + rank = each_video['rank'] + rank[rank_key] = i + else: + rank = { + rank_key: i + } + i += 1 + coll.update_one({'aid': each_video['aid']}, { + '$set': { + 'rank': rank, + } + }) + + logger.info("完成计算视频数据排名") From 8a38e67d9dafba78839edba41230aa9172816a39 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 14 Feb 2019 14:03:10 +0800 Subject: [PATCH 153/469] feature: refactor the author rank algorithm --- biliob_analyzer/author_rank.py | 100 ++++++++++++++------------------- 1 file changed, 41 insertions(+), 59 deletions(-) diff --git a/biliob_analyzer/author_rank.py b/biliob_analyzer/author_rank.py index 9200a40..43f463d 100644 --- a/biliob_analyzer/author_rank.py +++ b/biliob_analyzer/author_rank.py @@ -8,70 +8,52 @@ logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') logger = logging.getLogger(__name__) - -logger.info("开始计算作者粉丝排名") -i = 1 -authors = coll.find({}, {'mid': 1, 'rank': 1, 'cFans': 1}).batch_size( - 20).sort('cFans', DESCENDING) -for each_author in authors: - # 如果没有data 直接下一个 - if 'cFans' in each_author: - if 'rank' in each_author: - rank = each_author['rank'] - rank['fansRank'] = i - else: - rank = { - 'fansRank': i - } - i += 1 - coll.update_one({'mid': each_author['mid']}, { - '$set': { - 'rank': rank, - } - }) - pass - -logger.info("开始计算作者播放排名") -authors = coll.find({}, {'mid': 1, 'rank': 1, 'cArchive_view': 
1}).batch_size( - 20).sort('cArchive_view', DESCENDING) -for each_author in authors: - # 如果没有data 直接下一个 - if 'cArchive_view' in each_author: - if 'rank' in each_author: - rank = each_author['rank'] - rank['archiveViewRank'] = i - else: - rank = { - 'archiveViewRank': i - } - i += 1 +for each_key in ['cFans', 'cArchive_view', 'cArticle_view']: + logger.info("开始计算作者{}排名".format(each_key)) + i = 1 + authors = coll.find({each_key: {'$exists': 1}}, {'mid': 1, 'rank': 1, each_key: 1}).batch_size( + 300).sort(each_key, DESCENDING) + if each_key == 'cFans': + each_rank = 'fansRank' + each_d_rank = 'dFansRank' + elif each_key == 'cArchive_view': + each_rank = 'archiveViewRank' + each_d_rank = 'dArchiveViewRank' + elif each_key == 'cArticle_view': + each_rank = 'articleViewRank' + each_d_rank = 'dArticleViewRank' + for each_author in authors: + # 如果没有data 直接下一个 + if each_key in each_author: + if 'rank' in each_author: + rank = each_author['rank'] + if each_rank in each_author['rank']: + rank[each_d_rank] = each_author['rank'][each_rank] - i + else: + rank[each_d_rank] = -1 + rank[each_rank] = i + else: + rank = { + each_rank: i, + each_d_rank: -1 + } + if each_author[each_key] == 0: + if 'rank' in each_author: + rank = each_author['rank'] + rank[each_d_rank] = 0 + rank[each_rank] = -1 + else: + rank = { + each_rank: -1, + each_d_rank: 0 + } + if each_key == 'cArticle_view': + rank['updateTime'] = datetime.datetime.now() coll.update_one({'mid': each_author['mid']}, { '$set': { 'rank': rank, } }) - pass - -logger.info("开始计算作者专栏排名") -i = 1 -authors = coll.find({}, {'mid': 1, 'rank': 1, 'cArticle_view': 1}).batch_size( - 20).sort('cArticle_view', DESCENDING) -for each_author in authors: - # 如果没有data 直接下一个 - if 'cArticle_view' in each_author: - if 'rank' in each_author: - rank = each_author['rank'] - rank['articleViewRank'] = i - else: - rank = { - 'articleViewRank': i - } i += 1 - coll.update_one({'mid': each_author['mid']}, { - '$set': { - 'rank': rank, - } - }) - pass logger.info("计算作者排名结束") From d2d3f0b421740777792036fcb5c99989da3b8fee Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 14 Feb 2019 14:03:10 +0800 Subject: [PATCH 154/469] feature: refactor the author rank algorithm --- biliob_analyzer/author_rank.py | 100 ++++++++++++++------------------- 1 file changed, 41 insertions(+), 59 deletions(-) diff --git a/biliob_analyzer/author_rank.py b/biliob_analyzer/author_rank.py index 9200a40..43f463d 100644 --- a/biliob_analyzer/author_rank.py +++ b/biliob_analyzer/author_rank.py @@ -8,70 +8,52 @@ logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') logger = logging.getLogger(__name__) - -logger.info("开始计算作者粉丝排名") -i = 1 -authors = coll.find({}, {'mid': 1, 'rank': 1, 'cFans': 1}).batch_size( - 20).sort('cFans', DESCENDING) -for each_author in authors: - # 如果没有data 直接下一个 - if 'cFans' in each_author: - if 'rank' in each_author: - rank = each_author['rank'] - rank['fansRank'] = i - else: - rank = { - 'fansRank': i - } - i += 1 - coll.update_one({'mid': each_author['mid']}, { - '$set': { - 'rank': rank, - } - }) - pass - -logger.info("开始计算作者播放排名") -authors = coll.find({}, {'mid': 1, 'rank': 1, 'cArchive_view': 1}).batch_size( - 20).sort('cArchive_view', DESCENDING) -for each_author in authors: - # 如果没有data 直接下一个 - if 'cArchive_view' in each_author: - if 'rank' in each_author: - rank = each_author['rank'] - rank['archiveViewRank'] = i - else: - rank = { - 'archiveViewRank': i - } - i += 1 +for each_key in ['cFans', 'cArchive_view', 'cArticle_view']: + 
logger.info("开始计算作者{}排名".format(each_key)) + i = 1 + authors = coll.find({each_key: {'$exists': 1}}, {'mid': 1, 'rank': 1, each_key: 1}).batch_size( + 300).sort(each_key, DESCENDING) + if each_key == 'cFans': + each_rank = 'fansRank' + each_d_rank = 'dFansRank' + elif each_key == 'cArchive_view': + each_rank = 'archiveViewRank' + each_d_rank = 'dArchiveViewRank' + elif each_key == 'cArticle_view': + each_rank = 'articleViewRank' + each_d_rank = 'dArticleViewRank' + for each_author in authors: + # 如果没有data 直接下一个 + if each_key in each_author: + if 'rank' in each_author: + rank = each_author['rank'] + if each_rank in each_author['rank']: + rank[each_d_rank] = each_author['rank'][each_rank] - i + else: + rank[each_d_rank] = -1 + rank[each_rank] = i + else: + rank = { + each_rank: i, + each_d_rank: -1 + } + if each_author[each_key] == 0: + if 'rank' in each_author: + rank = each_author['rank'] + rank[each_d_rank] = 0 + rank[each_rank] = -1 + else: + rank = { + each_rank: -1, + each_d_rank: 0 + } + if each_key == 'cArticle_view': + rank['updateTime'] = datetime.datetime.now() coll.update_one({'mid': each_author['mid']}, { '$set': { 'rank': rank, } }) - pass - -logger.info("开始计算作者专栏排名") -i = 1 -authors = coll.find({}, {'mid': 1, 'rank': 1, 'cArticle_view': 1}).batch_size( - 20).sort('cArticle_view', DESCENDING) -for each_author in authors: - # 如果没有data 直接下一个 - if 'cArticle_view' in each_author: - if 'rank' in each_author: - rank = each_author['rank'] - rank['articleViewRank'] = i - else: - rank = { - 'articleViewRank': i - } i += 1 - coll.update_one({'mid': each_author['mid']}, { - '$set': { - 'rank': rank, - } - }) - pass logger.info("计算作者排名结束") From 71741884c356c635326afe7bb745a96f6798c31b Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 14 Feb 2019 14:03:10 +0800 Subject: [PATCH 155/469] feature: refactor the author rank algorithm --- biliob_analyzer/author_rank.py | 100 ++++++++++++++------------------- 1 file changed, 41 insertions(+), 59 deletions(-) diff --git a/biliob_analyzer/author_rank.py b/biliob_analyzer/author_rank.py index 9200a40..43f463d 100644 --- a/biliob_analyzer/author_rank.py +++ b/biliob_analyzer/author_rank.py @@ -8,70 +8,52 @@ logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') logger = logging.getLogger(__name__) - -logger.info("开始计算作者粉丝排名") -i = 1 -authors = coll.find({}, {'mid': 1, 'rank': 1, 'cFans': 1}).batch_size( - 20).sort('cFans', DESCENDING) -for each_author in authors: - # 如果没有data 直接下一个 - if 'cFans' in each_author: - if 'rank' in each_author: - rank = each_author['rank'] - rank['fansRank'] = i - else: - rank = { - 'fansRank': i - } - i += 1 - coll.update_one({'mid': each_author['mid']}, { - '$set': { - 'rank': rank, - } - }) - pass - -logger.info("开始计算作者播放排名") -authors = coll.find({}, {'mid': 1, 'rank': 1, 'cArchive_view': 1}).batch_size( - 20).sort('cArchive_view', DESCENDING) -for each_author in authors: - # 如果没有data 直接下一个 - if 'cArchive_view' in each_author: - if 'rank' in each_author: - rank = each_author['rank'] - rank['archiveViewRank'] = i - else: - rank = { - 'archiveViewRank': i - } - i += 1 +for each_key in ['cFans', 'cArchive_view', 'cArticle_view']: + logger.info("开始计算作者{}排名".format(each_key)) + i = 1 + authors = coll.find({each_key: {'$exists': 1}}, {'mid': 1, 'rank': 1, each_key: 1}).batch_size( + 300).sort(each_key, DESCENDING) + if each_key == 'cFans': + each_rank = 'fansRank' + each_d_rank = 'dFansRank' + elif each_key == 'cArchive_view': + each_rank = 'archiveViewRank' + each_d_rank = 
'dArchiveViewRank' + elif each_key == 'cArticle_view': + each_rank = 'articleViewRank' + each_d_rank = 'dArticleViewRank' + for each_author in authors: + # 如果没有data 直接下一个 + if each_key in each_author: + if 'rank' in each_author: + rank = each_author['rank'] + if each_rank in each_author['rank']: + rank[each_d_rank] = each_author['rank'][each_rank] - i + else: + rank[each_d_rank] = -1 + rank[each_rank] = i + else: + rank = { + each_rank: i, + each_d_rank: -1 + } + if each_author[each_key] == 0: + if 'rank' in each_author: + rank = each_author['rank'] + rank[each_d_rank] = 0 + rank[each_rank] = -1 + else: + rank = { + each_rank: -1, + each_d_rank: 0 + } + if each_key == 'cArticle_view': + rank['updateTime'] = datetime.datetime.now() coll.update_one({'mid': each_author['mid']}, { '$set': { 'rank': rank, } }) - pass - -logger.info("开始计算作者专栏排名") -i = 1 -authors = coll.find({}, {'mid': 1, 'rank': 1, 'cArticle_view': 1}).batch_size( - 20).sort('cArticle_view', DESCENDING) -for each_author in authors: - # 如果没有data 直接下一个 - if 'cArticle_view' in each_author: - if 'rank' in each_author: - rank = each_author['rank'] - rank['articleViewRank'] = i - else: - rank = { - 'articleViewRank': i - } i += 1 - coll.update_one({'mid': each_author['mid']}, { - '$set': { - 'rank': rank, - } - }) - pass logger.info("计算作者排名结束") From ad082afb9497a5ce1b5ecd05820394ca73024c65 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 14 Feb 2019 14:35:24 +0800 Subject: [PATCH 156/469] feature: video rank algorithm --- biliob_analyzer/video_rank.py | 44 +++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/biliob_analyzer/video_rank.py b/biliob_analyzer/video_rank.py index 3b1d679..7466c3e 100644 --- a/biliob_analyzer/video_rank.py +++ b/biliob_analyzer/video_rank.py @@ -15,25 +15,49 @@ keys = ['cView', 'cLike', 'cDanmaku', 'cFavorite', 'cCoin', 'cShare'] for each_key in keys: - rank_key = each_key[1:] + 'Rank' logger.info("开始计算视频{}排名".format(each_key)) - videos = coll.find({}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( - 20).sort(each_key, DESCENDING) + i = 1 + videos = coll.find({each_key: {'$exists': 1}}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( + 300).sort(each_key, DESCENDING) + if each_key == 'cView': + each_rank = 'cViewRank' + each_d_rank = 'dViewRank' + + each_rank = each_key + 'Rank' + each_d_rank = 'd' + each_key[1:] + 'Rank' + for each_video in videos: # 如果没有data 直接下一个 if each_key in each_video: if 'rank' in each_video: rank = each_video['rank'] - rank[rank_key] = i + if each_rank in each_video['rank']: + rank[each_d_rank] = each_video['rank'][each_rank] - i + else: + rank[each_d_rank] = -1 + rank[each_rank] = i else: rank = { - rank_key: i + each_rank: i, + each_d_rank: -1 } - i += 1 - coll.update_one({'aid': each_video['aid']}, { - '$set': { - 'rank': rank, + if each_video[each_key] == 0: + if 'rank' in each_video: + rank = each_video['rank'] + rank[each_d_rank] = 0 + rank[each_rank] = -1 + else: + rank = { + each_rank: -1, + each_d_rank: 0 } - }) + if each_key == keys[-1]: + rank['updateTime'] = datetime.datetime.now() + coll.update_one({'aid': each_video['aid']}, { + '$set': { + 'rank': rank, + } + }) + i += 1 logger.info("完成计算视频数据排名") From 134bab3fece77716cab3275782e7d596c20abee5 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 14 Feb 2019 14:35:24 +0800 Subject: [PATCH 157/469] feature: video rank algorithm --- biliob_analyzer/video_rank.py | 44 +++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git 
a/biliob_analyzer/video_rank.py b/biliob_analyzer/video_rank.py index 3b1d679..7466c3e 100644 --- a/biliob_analyzer/video_rank.py +++ b/biliob_analyzer/video_rank.py @@ -15,25 +15,49 @@ keys = ['cView', 'cLike', 'cDanmaku', 'cFavorite', 'cCoin', 'cShare'] for each_key in keys: - rank_key = each_key[1:] + 'Rank' logger.info("开始计算视频{}排名".format(each_key)) - videos = coll.find({}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( - 20).sort(each_key, DESCENDING) + i = 1 + videos = coll.find({each_key: {'$exists': 1}}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( + 300).sort(each_key, DESCENDING) + if each_key == 'cView': + each_rank = 'cViewRank' + each_d_rank = 'dViewRank' + + each_rank = each_key + 'Rank' + each_d_rank = 'd' + each_key[1:] + 'Rank' + for each_video in videos: # 如果没有data 直接下一个 if each_key in each_video: if 'rank' in each_video: rank = each_video['rank'] - rank[rank_key] = i + if each_rank in each_video['rank']: + rank[each_d_rank] = each_video['rank'][each_rank] - i + else: + rank[each_d_rank] = -1 + rank[each_rank] = i else: rank = { - rank_key: i + each_rank: i, + each_d_rank: -1 } - i += 1 - coll.update_one({'aid': each_video['aid']}, { - '$set': { - 'rank': rank, + if each_video[each_key] == 0: + if 'rank' in each_video: + rank = each_video['rank'] + rank[each_d_rank] = 0 + rank[each_rank] = -1 + else: + rank = { + each_rank: -1, + each_d_rank: 0 } - }) + if each_key == keys[-1]: + rank['updateTime'] = datetime.datetime.now() + coll.update_one({'aid': each_video['aid']}, { + '$set': { + 'rank': rank, + } + }) + i += 1 logger.info("完成计算视频数据排名") From a8177b7ca6fe859200b59d09dbb5125b9183df2c Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 14 Feb 2019 14:35:24 +0800 Subject: [PATCH 158/469] feature: video rank algorithm --- biliob_analyzer/video_rank.py | 44 +++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/biliob_analyzer/video_rank.py b/biliob_analyzer/video_rank.py index 3b1d679..7466c3e 100644 --- a/biliob_analyzer/video_rank.py +++ b/biliob_analyzer/video_rank.py @@ -15,25 +15,49 @@ keys = ['cView', 'cLike', 'cDanmaku', 'cFavorite', 'cCoin', 'cShare'] for each_key in keys: - rank_key = each_key[1:] + 'Rank' logger.info("开始计算视频{}排名".format(each_key)) - videos = coll.find({}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( - 20).sort(each_key, DESCENDING) + i = 1 + videos = coll.find({each_key: {'$exists': 1}}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( + 300).sort(each_key, DESCENDING) + if each_key == 'cView': + each_rank = 'cViewRank' + each_d_rank = 'dViewRank' + + each_rank = each_key + 'Rank' + each_d_rank = 'd' + each_key[1:] + 'Rank' + for each_video in videos: # 如果没有data 直接下一个 if each_key in each_video: if 'rank' in each_video: rank = each_video['rank'] - rank[rank_key] = i + if each_rank in each_video['rank']: + rank[each_d_rank] = each_video['rank'][each_rank] - i + else: + rank[each_d_rank] = -1 + rank[each_rank] = i else: rank = { - rank_key: i + each_rank: i, + each_d_rank: -1 } - i += 1 - coll.update_one({'aid': each_video['aid']}, { - '$set': { - 'rank': rank, + if each_video[each_key] == 0: + if 'rank' in each_video: + rank = each_video['rank'] + rank[each_d_rank] = 0 + rank[each_rank] = -1 + else: + rank = { + each_rank: -1, + each_d_rank: 0 } - }) + if each_key == keys[-1]: + rank['updateTime'] = datetime.datetime.now() + coll.update_one({'aid': each_video['aid']}, { + '$set': { + 'rank': rank, + } + }) + i += 1 logger.info("完成计算视频数据排名") From 5059391d2222c296f1a55335fb4bc58ed66bf507 Mon Sep 17 
00:00:00 2001 From: Jannchie Date: Fri, 15 Feb 2019 16:48:26 +0800 Subject: [PATCH 159/469] feature: video rank --- run_analyzer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/run_analyzer.py b/run_analyzer.py index a8b171d..787dd46 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -3,6 +3,7 @@ import biliob_analyzer.author_rate_caculate import biliob_analyzer.author_fans_watcher import biliob_analyzer.author_rank +import biliob_analyzer.video_rank author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 869ef723647c0a7faf2c77214349ee1e7b9d3903 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 15 Feb 2019 16:48:26 +0800 Subject: [PATCH 160/469] feature: video rank --- run_analyzer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/run_analyzer.py b/run_analyzer.py index a8b171d..787dd46 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -3,6 +3,7 @@ import biliob_analyzer.author_rate_caculate import biliob_analyzer.author_fans_watcher import biliob_analyzer.author_rank +import biliob_analyzer.video_rank author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From bf06ca9f5e982461cc760c7b95c32e171d9370df Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 15 Feb 2019 16:48:26 +0800 Subject: [PATCH 161/469] feature: video rank --- run_analyzer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/run_analyzer.py b/run_analyzer.py index a8b171d..787dd46 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -3,6 +3,7 @@ import biliob_analyzer.author_rate_caculate import biliob_analyzer.author_fans_watcher import biliob_analyzer.author_rank +import biliob_analyzer.video_rank author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 579903bf3f0b6392d64fd35d35477c17cfe9fa3a Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 15 Feb 2019 17:20:06 +0800 Subject: [PATCH 162/469] feature: video rank --- run_analyzer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/run_analyzer.py b/run_analyzer.py index a8b171d..787dd46 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -3,6 +3,7 @@ import biliob_analyzer.author_rate_caculate import biliob_analyzer.author_fans_watcher import biliob_analyzer.author_rank +import biliob_analyzer.video_rank author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 4c0635703789b6f8f45156df5ba05c654d0ab93a Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 15 Feb 2019 17:20:06 +0800 Subject: [PATCH 163/469] feature: video rank --- run_analyzer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/run_analyzer.py b/run_analyzer.py index a8b171d..787dd46 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -3,6 +3,7 @@ import biliob_analyzer.author_rate_caculate import biliob_analyzer.author_fans_watcher import biliob_analyzer.author_rank +import biliob_analyzer.video_rank author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 5350f7a993f5cdb65a3d254a118eb45fb8c27700 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 15 Feb 2019 17:20:06 +0800 Subject: [PATCH 164/469] feature: video rank --- run_analyzer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/run_analyzer.py b/run_analyzer.py index a8b171d..787dd46 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -3,6 +3,7 @@ import biliob_analyzer.author_rate_caculate import biliob_analyzer.author_fans_watcher import biliob_analyzer.author_rank +import biliob_analyzer.video_rank author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From c7d187df08da1d3d8f904fb188193fada203566e Mon Sep 17 00:00:00 2001 From: 
Jannchie Date: Sat, 16 Feb 2019 21:08:26 +0800 Subject: [PATCH 165/469] feature: calculate video rank weekly --- biliob_analyzer/video_analyzer.py | 6 -- biliob_analyzer/video_rank.py | 105 +++++++++++++++--------------- run.py | 10 +++ run_analyzer.py | 2 +- 4 files changed, 65 insertions(+), 58 deletions(-) diff --git a/biliob_analyzer/video_analyzer.py b/biliob_analyzer/video_analyzer.py index 25a41f7..f0b04b6 100644 --- a/biliob_analyzer/video_analyzer.py +++ b/biliob_analyzer/video_analyzer.py @@ -65,17 +65,11 @@ def video_filter(self): delete = False if delete: count_delete += 1 - logger.info("! 删除追踪:"+each_doc['title']+' 当前播放:'+str( - each_doc['data'][len(each_doc['data'])-1]['view'])) self.coll.delete_one({'aid': each_doc['aid']}) elif focus: count_focus += 1 - logger.info("√ 持续追踪:"+each_doc['title']+' 当前播放:'+str( - each_doc['data'][len(each_doc['data'])-1]['view'])) else: count_unfocus += 1 - logger.info("× 不再追踪:"+each_doc['title']+' 当前播放:'+str( - each_doc['data'][len(each_doc['data'])-1]['view'])) self.coll.update_one({'aid': each_doc['aid']}, { '$set': {'focus': False}}) pre_view = -1 diff --git a/biliob_analyzer/video_rank.py b/biliob_analyzer/video_rank.py index 7466c3e..c7c3622 100644 --- a/biliob_analyzer/video_rank.py +++ b/biliob_analyzer/video_rank.py @@ -3,61 +3,64 @@ import datetime import logging from pymongo import DESCENDING -coll = db['video'] # 获得collection的句柄 -logging.basicConfig(level=logging.INFO, - format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') -logger = logging.getLogger(__name__) -logger.info("开始计算视频数据排名") +def computeVideoRank(): + coll = db['video'] # 获得collection的句柄 -i = 1 + logging.basicConfig(level=logging.INFO, + format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') + logger = logging.getLogger(__name__) + + logger.info("开始计算视频数据排名") -keys = ['cView', 'cLike', 'cDanmaku', 'cFavorite', 'cCoin', 'cShare'] -for each_key in keys: - logger.info("开始计算视频{}排名".format(each_key)) i = 1 - videos = coll.find({each_key: {'$exists': 1}}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( - 300).sort(each_key, DESCENDING) - if each_key == 'cView': - each_rank = 'cViewRank' - each_d_rank = 'dViewRank' - - each_rank = each_key + 'Rank' - each_d_rank = 'd' + each_key[1:] + 'Rank' - - for each_video in videos: - # 如果没有data 直接下一个 - if each_key in each_video: - if 'rank' in each_video: - rank = each_video['rank'] - if each_rank in each_video['rank']: - rank[each_d_rank] = each_video['rank'][each_rank] - i + + keys = ['cView', 'cLike', 'cDanmaku', 'cFavorite', 'cCoin', 'cShare'] + for each_key in keys: + logger.info("开始计算视频{}排名".format(each_key)) + i = 1 + videos = coll.find({each_key: {'$exists': 1}}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( + 300).sort(each_key, DESCENDING) + if each_key == 'cView': + each_rank = 'cViewRank' + each_d_rank = 'dViewRank' + + each_rank = each_key + 'Rank' + each_d_rank = 'd' + each_key[1:] + 'Rank' + + for each_video in videos: + # 如果没有data 直接下一个 + if each_key in each_video: + if 'rank' in each_video: + rank = each_video['rank'] + if each_rank in each_video['rank']: + rank[each_d_rank] = each_video['rank'][each_rank] - i + else: + rank[each_d_rank] = -1 + rank[each_rank] = i else: - rank[each_d_rank] = -1 - rank[each_rank] = i - else: - rank = { - each_rank: i, - each_d_rank: -1 - } - if each_video[each_key] == 0: - if 'rank' in each_video: - rank = each_video['rank'] - rank[each_d_rank] = 0 - rank[each_rank] = -1 - else: - rank = { - each_rank: -1, - each_d_rank: 0 + rank = { + each_rank: i, + each_d_rank: -1 + } + 
if each_video[each_key] == 0: + if 'rank' in each_video: + rank = each_video['rank'] + rank[each_d_rank] = 0 + rank[each_rank] = -1 + else: + rank = { + each_rank: -1, + each_d_rank: 0 + } + if each_key == keys[-1]: + rank['updateTime'] = datetime.datetime.now() + coll.update_one({'aid': each_video['aid']}, { + '$set': { + 'rank': rank, } - if each_key == keys[-1]: - rank['updateTime'] = datetime.datetime.now() - coll.update_one({'aid': each_video['aid']}, { - '$set': { - 'rank': rank, - } - }) - i += 1 - - logger.info("完成计算视频数据排名") + }) + i += 1 + + logger.info("完成计算视频数据排名")
diff --git a/run.py b/run.py index 13b2933..8377714 100644 --- a/run.py +++ b/run.py @@ -6,6 +6,11 @@ from subprocess import Popen import logging import threading +from biliob_analyzer.video_rank import computeVideoRank + + +def videoRank(): + computeVideoRank() def site(): @@ -52,6 +57,10 @@ def data_analyze(): Popen(['python', 'run_analyzer.py']) +def weekly_analyze(): + Popen(['python', 'run_weekly_analyzer.py']) + + def bili_monthly_rank(): Popen(['scrapy', 'crawl', 'biliMonthlyRank']) @@ -70,6 +79,7 @@ def run_threaded(job_func): schedule.every().day.at('22:00').do(run_threaded, video_watcher) schedule.every().day.at('21:00').do(run_threaded, bili_monthly_rank) schedule.every().week.do(run_threaded, video_spider_all) +schedule.every().week.do(run_threaded, videoRank) schedule.every().hour.do(run_threaded, site) schedule.every(15).minutes.do(run_threaded, online) schedule.every(10).minutes.do(run_threaded, strong)
diff --git a/run_analyzer.py b/run_analyzer.py index 787dd46..f5fc193 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -3,7 +3,7 @@ import biliob_analyzer.author_rate_caculate import biliob_analyzer.author_fans_watcher import biliob_analyzer.author_rank -import biliob_analyzer.video_rank +# import biliob_analyzer.video_rank author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer()
From a25da401cb43835257f1a0961b12acd1a161a632 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 18 Feb 2019 18:57:12 +0800 Subject: [PATCH 168/469] feature: redis distributed spider --- biliob_spider/filter.py | 6 + ...nes.py.1f305fa5e47c4f7467edd6e4cb280dc2.py | 453 ------------------ biliob_spider/settings.py | 7 +- .../spiders/author_update_with_redis.py | 2 +- biliob_spider/spiders/bili_monthly_rank.py | 31 +- .../spiders/video_spider_with_redis.py | 2 +- 6 files changed, 33 insertions(+), 468 deletions(-) create mode 100644 biliob_spider/filter.py delete mode 100644 biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py
diff --git a/biliob_spider/filter.py b/biliob_spider/filter.py new file mode 100644 index 0000000..24537f9 --- /dev/null +++ b/biliob_spider/filter.py @@ -0,0 +1,6 @@ +from scrapy.dupefilter import RFPDupeFilter + + +class CloseDupefilter(RFPDupeFilter): + def request_seen(self, request): + return False
diff --git a/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py b/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py deleted file mode 100644 index 38252ba..0000000 --- a/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py +++ /dev/null @@ -1,453 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html -from pymongo import MongoClient -from db import settings -from db import mysql_connect -import datetime -import logging - - -class StrongPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - - def process_item(self, item, spider): - try: - self.coll = self.db['video'] - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'cView': item['current_view'], - 'cFavorite': item['current_favorite'], - 'cDanmaku': item['current_danmaku'], - 'cCoin': item['current_coin'], - 'cShare': item['current_share'], - 'cLike': item['current_like'], - 'cDatetime': item['current_datetime'], -
'author': item['author'], - 'subChannel': item['subChannel'], - 'channel': item['channel'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': datetime.datetime.fromtimestamp( - item['datetime']) - }, - '$push': { - 'data': { - '$each': [item['data_video']], - '$position': 0 - } - } - }, True) - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - try: - self.coll = self.db['author'] # 获得collection的句柄 - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'focus': True, - 'sex': item['sex'], - 'name': item['name'], - 'face': item['face'], - 'level': item['level'], - 'cFans': item['c_fans'], - 'official': item['official'], - 'cArchive': item['c_archive'], - 'cArticle': item['c_article'], - 'cAttention': item['c_attention'], - 'cArchive_view': item['c_archive_view'], - 'cArticle_view': item['c_article_view'], - }, - '$push': { - 'data': { - '$each': [item['data_author']], - '$position': 0 - } - } - }, True) - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - return item - - -class VideoPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'cView': item['current_view'], - 'cFavorite': item['current_favorite'], - 'cDanmaku': item['current_danmaku'], - 'cCoin': item['current_coin'], - 'cShare': item['current_share'], - 'cLike': item['current_like'], - 'cDatetime': item['current_datetime'], - 'author': item['author'], - 'subChannel': item['subChannel'], - 'channel': item['channel'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': datetime.datetime.fromtimestamp( - item['datetime']) - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class VideoPipelineFromKan(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'author': item['author'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': item['datetime'] - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class BangumiPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['bangumi'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'cover': item['cover'], - # 'isFinish': 
item['is_finish'], - # 'isStarted': item['is_started'], - 'newest': item['newest_ep_index'], - 'currentPts': item['data']['pts'], - 'currentPlay': item['data']['play'], - # 'squareCover': item['square_cover'], - 'currentWatch': item['data']['watch'], - 'currentReview': item['data']['review'], - 'currentDanmaku': item['data']['danmaku'] - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class DonghuaPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['donghua'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'cover': item['cover'], - # 'isFinish': item['is_finish'], - # 'isStarted': item['is_started'], - 'newest': item['newest_ep_index'], - 'currentPts': item['data']['pts'], - 'currentPlay': item['data']['play'], - # 'squareCover': item['square_cover'], - 'currentWatch': item['data']['watch'], - 'currentReview': item['data']['review'], - 'currentDanmaku': item['data']['danmaku'] - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class SiteInfoPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['site_info'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.insert_one({ - 'region_count': item['region_count'], - 'all_count': item['all_count'], - 'web_online': item['web_online'], - 'play_online': item['play_online'], - 'datetime': datetime.datetime.now() - }) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class AuthorPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['author'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'focus': True, - 'sex': item['sex'], - 'name': item['name'], - 'face': item['face'], - 'level': item['level'], - 'cFans': item['c_fans'], - 'official': item['official'], - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class OnlinePipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video_online'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'author': item['author'], - 
'channel': item['channel'], - 'subChannel': item['subChannel'], - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class TagPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['tag'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - - self.coll.update_one({ - 'tag_id': item['tag_id'] - }, { - '$set': { - 'tag_name': item['tag_name'], - 'ctime': item['ctime'], - }, - '$addToSet': { - 'use': item['use'], - 'atten': item['atten'], - 'datetime': datetime.datetime.now() - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class VideoAddPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - for each_aid in item['aid']: - self.coll.update_one({ - 'aid': each_aid - }, { - '$set': { - 'aid': each_aid, - 'focus': True - }, - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class AuthorChannelPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['author'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'channels': item['channels'] - }, - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class BiliMonthlyRankPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['monthly_rank'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': item['aid'] - }, { - '$addToSet': { - 'pts': item['pts'], - 'datetime': datetime.datetime.now() - }, - '$set': { - 'title': item['title'], - 'author': item['author'], - 'aid': item['aid'], - 'mid': item['mid'], - 'channel': item['channel'], - 'currentPts': item['pts'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index d4eb73f..b394f85 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -12,9 +12,14 @@ import random from db import redis_connect_string +DUPEFILTER_CLASS = 'biliob_spider.filter.CloseDupefilter' + +SCHEDULER_PERSIST = True +SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue' + REDIS_URL = redis_connect_string -# LOG_FILE = "biliob_spider.log" +LOG_FILE = "biliob_spider.log" LOG_LEVEL = "WARNING" BOT_NAME = 'biliob_spider' diff --git 
a/biliob_spider/spiders/author_update_with_redis.py b/biliob_spider/spiders/author_update_with_redis.py index 2ca321c..b7941b5 100644 --- a/biliob_spider/spiders/author_update_with_redis.py +++ b/biliob_spider/spiders/author_update_with_redis.py @@ -13,7 +13,7 @@ class AuthorUpdateWithRedis(RedisSpider): - name = "AuthorUpdateWithRedis" + name = "authorRedis" allowed_domains = ["bilibili.com"] start_urls = [] custom_settings = { diff --git a/biliob_spider/spiders/bili_monthly_rank.py b/biliob_spider/spiders/bili_monthly_rank.py index 338ac31..c51a1f1 100644 --- a/biliob_spider/spiders/bili_monthly_rank.py +++ b/biliob_spider/spiders/bili_monthly_rank.py @@ -1,4 +1,4 @@ -#coding=utf-8 +# coding=utf-8 import scrapy from mail import mailer from scrapy.http import Request @@ -35,18 +35,25 @@ class BiliMonthlyRankSpider(scrapy.spiders.Spider): def parse(self, response): try: - url_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/@href').extract() - pts_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[2]/div/text()').extract() - mid_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/@href').extract() - - title_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/text()').extract() - author_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/span/text()').extract() - aid_list = list(map(lambda x: int(x[27:-1]),url_list)) - pts_list = list(map(lambda x : int(x),pts_list)) - mid_list = list(map(lambda x : int(x.lstrip('//space.bilibili.com/').rstrip('/')),mid_list)) - channel = response.xpath("//li[@class='active']/text()").extract()[0] + url_list = response.xpath( + '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/@href').extract() + pts_list = response.xpath( + '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[2]/div/text()').extract() + mid_list = response.xpath( + '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/@href').extract() + + title_list = response.xpath( + '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/text()').extract() + author_list = response.xpath( + '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/span/text()').extract() + aid_list = list(map(lambda x: int(x[27:-1]), url_list)) + pts_list = list(map(lambda x: int(x), pts_list)) + mid_list = list( + map(lambda x: int(x.lstrip('//space.bilibili.com/').rstrip('/')), mid_list)) + channel = response.xpath( + "//li[@class='active']/text()").extract()[0] # 为了爬取分区、粉丝数等数据,需要进入每一个视频的详情页面进行抓取 - for each in zip(title_list,author_list,aid_list,pts_list,mid_list): + for each in zip(title_list, author_list, aid_list, pts_list, mid_list): item = RankItem() item['title'] = each[0] item['author'] = each[1] diff --git a/biliob_spider/spiders/video_spider_with_redis.py b/biliob_spider/spiders/video_spider_with_redis.py index b4cea54..4b1d230 100644 --- a/biliob_spider/spiders/video_spider_with_redis.py +++ b/biliob_spider/spiders/video_spider_with_redis.py @@ -14,7 +14,7 @@ class VideoSpiderWithRedis(RedisSpider): - name = "VideoSpiderWithRedis" + name = "videoRedis" allowed_domains = ["bilibili.com"] start_urls = [] custom_settings = { From db6a3b93c46dd6a0efb3dc9bec470fe23d5ac248 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 18 Feb 2019 18:57:12 +0800 Subject: [PATCH 169/469] feature: redis distributed spider --- biliob_spider/filter.py 
| 6 + ...nes.py.1f305fa5e47c4f7467edd6e4cb280dc2.py | 453 ------------------ biliob_spider/settings.py | 7 +- .../spiders/author_update_with_redis.py | 2 +- biliob_spider/spiders/bili_monthly_rank.py | 31 +- .../spiders/video_spider_with_redis.py | 2 +- 6 files changed, 33 insertions(+), 468 deletions(-) create mode 100644 biliob_spider/filter.py delete mode 100644 biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py diff --git a/biliob_spider/filter.py b/biliob_spider/filter.py new file mode 100644 index 0000000..24537f9 --- /dev/null +++ b/biliob_spider/filter.py @@ -0,0 +1,6 @@ +from scrapy.dupefilter import RFPDupeFilter + + +class CloseDupefilter(RFPDupeFilter): + def request_seen(self, request): + return False diff --git a/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py b/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py deleted file mode 100644 index 38252ba..0000000 --- a/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py +++ /dev/null @@ -1,453 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html -from pymongo import MongoClient -from db import settings -from db import mysql_connect -import datetime -import logging - - -class StrongPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - - def process_item(self, item, spider): - try: - self.coll = self.db['video'] - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'cView': item['current_view'], - 'cFavorite': item['current_favorite'], - 'cDanmaku': item['current_danmaku'], - 'cCoin': item['current_coin'], - 'cShare': item['current_share'], - 'cLike': item['current_like'], - 'cDatetime': item['current_datetime'], - 'author': item['author'], - 'subChannel': item['subChannel'], - 'channel': item['channel'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': datetime.datetime.fromtimestamp( - item['datetime']) - }, - '$push': { - 'data': { - '$each': [item['data_video']], - '$position': 0 - } - } - }, True) - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - try: - self.coll = self.db['author'] # 获得collection的句柄 - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'focus': True, - 'sex': item['sex'], - 'name': item['name'], - 'face': item['face'], - 'level': item['level'], - 'cFans': item['c_fans'], - 'official': item['official'], - 'cArchive': item['c_archive'], - 'cArticle': item['c_article'], - 'cAttention': item['c_attention'], - 'cArchive_view': item['c_archive_view'], - 'cArticle_view': item['c_article_view'], - }, - '$push': { - 'data': { - '$each': [item['data_author']], - '$position': 0 - } - } - }, True) - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - return item - - -class VideoPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': 
int(item['aid']) - }, { - '$set': { - 'cView': item['current_view'], - 'cFavorite': item['current_favorite'], - 'cDanmaku': item['current_danmaku'], - 'cCoin': item['current_coin'], - 'cShare': item['current_share'], - 'cLike': item['current_like'], - 'cDatetime': item['current_datetime'], - 'author': item['author'], - 'subChannel': item['subChannel'], - 'channel': item['channel'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': datetime.datetime.fromtimestamp( - item['datetime']) - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class VideoPipelineFromKan(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'author': item['author'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': item['datetime'] - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class BangumiPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['bangumi'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'cover': item['cover'], - # 'isFinish': item['is_finish'], - # 'isStarted': item['is_started'], - 'newest': item['newest_ep_index'], - 'currentPts': item['data']['pts'], - 'currentPlay': item['data']['play'], - # 'squareCover': item['square_cover'], - 'currentWatch': item['data']['watch'], - 'currentReview': item['data']['review'], - 'currentDanmaku': item['data']['danmaku'] - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class DonghuaPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['donghua'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'cover': item['cover'], - # 'isFinish': item['is_finish'], - # 'isStarted': item['is_started'], - 'newest': item['newest_ep_index'], - 'currentPts': item['data']['pts'], - 'currentPlay': item['data']['play'], - # 'squareCover': item['square_cover'], - 'currentWatch': item['data']['watch'], - 'currentReview': item['data']['review'], - 'currentDanmaku': item['data']['danmaku'] - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - 
-class SiteInfoPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['site_info'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.insert_one({ - 'region_count': item['region_count'], - 'all_count': item['all_count'], - 'web_online': item['web_online'], - 'play_online': item['play_online'], - 'datetime': datetime.datetime.now() - }) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class AuthorPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['author'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'focus': True, - 'sex': item['sex'], - 'name': item['name'], - 'face': item['face'], - 'level': item['level'], - 'cFans': item['c_fans'], - 'official': item['official'], - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class OnlinePipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video_online'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'author': item['author'], - 'channel': item['channel'], - 'subChannel': item['subChannel'], - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class TagPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['tag'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - - self.coll.update_one({ - 'tag_id': item['tag_id'] - }, { - '$set': { - 'tag_name': item['tag_name'], - 'ctime': item['ctime'], - }, - '$addToSet': { - 'use': item['use'], - 'atten': item['atten'], - 'datetime': datetime.datetime.now() - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class VideoAddPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - for each_aid in item['aid']: - self.coll.update_one({ - 'aid': each_aid - }, { - '$set': { - 'aid': each_aid, - 'focus': True - }, - }, True) - return item - except Exception as error: - # 
出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class AuthorChannelPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['author'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'channels': item['channels'] - }, - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class BiliMonthlyRankPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['monthly_rank'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': item['aid'] - }, { - '$addToSet': { - 'pts': item['pts'], - 'datetime': datetime.datetime.now() - }, - '$set': { - 'title': item['title'], - 'author': item['author'], - 'aid': item['aid'], - 'mid': item['mid'], - 'channel': item['channel'], - 'currentPts': item['pts'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index d4eb73f..b394f85 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -12,9 +12,14 @@ import random from db import redis_connect_string +DUPEFILTER_CLASS = 'biliob_spider.filter.CloseDupefilter' + +SCHEDULER_PERSIST = True +SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue' + REDIS_URL = redis_connect_string -# LOG_FILE = "biliob_spider.log" +LOG_FILE = "biliob_spider.log" LOG_LEVEL = "WARNING" BOT_NAME = 'biliob_spider' diff --git a/biliob_spider/spiders/author_update_with_redis.py b/biliob_spider/spiders/author_update_with_redis.py index 2ca321c..b7941b5 100644 --- a/biliob_spider/spiders/author_update_with_redis.py +++ b/biliob_spider/spiders/author_update_with_redis.py @@ -13,7 +13,7 @@ class AuthorUpdateWithRedis(RedisSpider): - name = "AuthorUpdateWithRedis" + name = "authorRedis" allowed_domains = ["bilibili.com"] start_urls = [] custom_settings = { diff --git a/biliob_spider/spiders/bili_monthly_rank.py b/biliob_spider/spiders/bili_monthly_rank.py index 338ac31..c51a1f1 100644 --- a/biliob_spider/spiders/bili_monthly_rank.py +++ b/biliob_spider/spiders/bili_monthly_rank.py @@ -1,4 +1,4 @@ -#coding=utf-8 +# coding=utf-8 import scrapy from mail import mailer from scrapy.http import Request @@ -35,18 +35,25 @@ class BiliMonthlyRankSpider(scrapy.spiders.Spider): def parse(self, response): try: - url_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/@href').extract() - pts_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[2]/div/text()').extract() - mid_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/@href').extract() - - title_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/text()').extract() - author_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/span/text()').extract() - aid_list = list(map(lambda x: 
int(x[27:-1]),url_list)) - pts_list = list(map(lambda x : int(x),pts_list)) - mid_list = list(map(lambda x : int(x.lstrip('//space.bilibili.com/').rstrip('/')),mid_list)) - channel = response.xpath("//li[@class='active']/text()").extract()[0] + url_list = response.xpath( + '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/@href').extract() + pts_list = response.xpath( + '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[2]/div/text()').extract() + mid_list = response.xpath( + '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/@href').extract() + + title_list = response.xpath( + '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/text()').extract() + author_list = response.xpath( + '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/span/text()').extract() + aid_list = list(map(lambda x: int(x[27:-1]), url_list)) + pts_list = list(map(lambda x: int(x), pts_list)) + mid_list = list( + map(lambda x: int(x.lstrip('//space.bilibili.com/').rstrip('/')), mid_list)) + channel = response.xpath( + "//li[@class='active']/text()").extract()[0] # 为了爬取分区、粉丝数等数据,需要进入每一个视频的详情页面进行抓取 - for each in zip(title_list,author_list,aid_list,pts_list,mid_list): + for each in zip(title_list, author_list, aid_list, pts_list, mid_list): item = RankItem() item['title'] = each[0] item['author'] = each[1] diff --git a/biliob_spider/spiders/video_spider_with_redis.py b/biliob_spider/spiders/video_spider_with_redis.py index b4cea54..4b1d230 100644 --- a/biliob_spider/spiders/video_spider_with_redis.py +++ b/biliob_spider/spiders/video_spider_with_redis.py @@ -14,7 +14,7 @@ class VideoSpiderWithRedis(RedisSpider): - name = "VideoSpiderWithRedis" + name = "videoRedis" allowed_domains = ["bilibili.com"] start_urls = [] custom_settings = { From ca4dadfe58e3752b5860a73af58ddf95394a9f7d Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 18 Feb 2019 18:57:12 +0800 Subject: [PATCH 170/469] feature: redis distributed spider --- biliob_spider/filter.py | 6 + ...nes.py.1f305fa5e47c4f7467edd6e4cb280dc2.py | 453 ------------------ biliob_spider/settings.py | 7 +- .../spiders/author_update_with_redis.py | 2 +- biliob_spider/spiders/bili_monthly_rank.py | 31 +- .../spiders/video_spider_with_redis.py | 2 +- 6 files changed, 33 insertions(+), 468 deletions(-) create mode 100644 biliob_spider/filter.py delete mode 100644 biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py diff --git a/biliob_spider/filter.py b/biliob_spider/filter.py new file mode 100644 index 0000000..24537f9 --- /dev/null +++ b/biliob_spider/filter.py @@ -0,0 +1,6 @@ +from scrapy.dupefilter import RFPDupeFilter + + +class CloseDupefilter(RFPDupeFilter): + def request_seen(self, request): + return False diff --git a/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py b/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py deleted file mode 100644 index 38252ba..0000000 --- a/biliob_spider/pipelines.py.1f305fa5e47c4f7467edd6e4cb280dc2.py +++ /dev/null @@ -1,453 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html -from pymongo import MongoClient -from db import settings -from db import mysql_connect -import datetime -import logging - - -class StrongPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - 
self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - - def process_item(self, item, spider): - try: - self.coll = self.db['video'] - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'cView': item['current_view'], - 'cFavorite': item['current_favorite'], - 'cDanmaku': item['current_danmaku'], - 'cCoin': item['current_coin'], - 'cShare': item['current_share'], - 'cLike': item['current_like'], - 'cDatetime': item['current_datetime'], - 'author': item['author'], - 'subChannel': item['subChannel'], - 'channel': item['channel'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': datetime.datetime.fromtimestamp( - item['datetime']) - }, - '$push': { - 'data': { - '$each': [item['data_video']], - '$position': 0 - } - } - }, True) - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - try: - self.coll = self.db['author'] # 获得collection的句柄 - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'focus': True, - 'sex': item['sex'], - 'name': item['name'], - 'face': item['face'], - 'level': item['level'], - 'cFans': item['c_fans'], - 'official': item['official'], - 'cArchive': item['c_archive'], - 'cArticle': item['c_article'], - 'cAttention': item['c_attention'], - 'cArchive_view': item['c_archive_view'], - 'cArticle_view': item['c_article_view'], - }, - '$push': { - 'data': { - '$each': [item['data_author']], - '$position': 0 - } - } - }, True) - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - return item - - -class VideoPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'cView': item['current_view'], - 'cFavorite': item['current_favorite'], - 'cDanmaku': item['current_danmaku'], - 'cCoin': item['current_coin'], - 'cShare': item['current_share'], - 'cLike': item['current_like'], - 'cDatetime': item['current_datetime'], - 'author': item['author'], - 'subChannel': item['subChannel'], - 'channel': item['channel'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': datetime.datetime.fromtimestamp( - item['datetime']) - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class VideoPipelineFromKan(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'author': item['author'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': item['datetime'] - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: 
{}'.format(spider.name, error)) - - -class BangumiPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['bangumi'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'cover': item['cover'], - # 'isFinish': item['is_finish'], - # 'isStarted': item['is_started'], - 'newest': item['newest_ep_index'], - 'currentPts': item['data']['pts'], - 'currentPlay': item['data']['play'], - # 'squareCover': item['square_cover'], - 'currentWatch': item['data']['watch'], - 'currentReview': item['data']['review'], - 'currentDanmaku': item['data']['danmaku'] - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class DonghuaPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['donghua'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'cover': item['cover'], - # 'isFinish': item['is_finish'], - # 'isStarted': item['is_started'], - 'newest': item['newest_ep_index'], - 'currentPts': item['data']['pts'], - 'currentPlay': item['data']['play'], - # 'squareCover': item['square_cover'], - 'currentWatch': item['data']['watch'], - 'currentReview': item['data']['review'], - 'currentDanmaku': item['data']['danmaku'] - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class SiteInfoPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['site_info'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.insert_one({ - 'region_count': item['region_count'], - 'all_count': item['all_count'], - 'web_online': item['web_online'], - 'play_online': item['play_online'], - 'datetime': datetime.datetime.now() - }) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class AuthorPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['author'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'focus': True, - 'sex': item['sex'], - 'name': item['name'], - 'face': item['face'], - 'level': item['level'], - 'cFans': item['c_fans'], - 'official': item['official'], - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - 
logging.error('{}: {}'.format(spider.name, error)) - - -class OnlinePipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video_online'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'author': item['author'], - 'channel': item['channel'], - 'subChannel': item['subChannel'], - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class TagPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['tag'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - - self.coll.update_one({ - 'tag_id': item['tag_id'] - }, { - '$set': { - 'tag_name': item['tag_name'], - 'ctime': item['ctime'], - }, - '$addToSet': { - 'use': item['use'], - 'atten': item['atten'], - 'datetime': datetime.datetime.now() - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class VideoAddPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - for each_aid in item['aid']: - self.coll.update_one({ - 'aid': each_aid - }, { - '$set': { - 'aid': each_aid, - 'focus': True - }, - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class AuthorChannelPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['author'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'channels': item['channels'] - }, - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class BiliMonthlyRankPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['monthly_rank'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': item['aid'] - }, { - '$addToSet': { - 'pts': item['pts'], - 'datetime': datetime.datetime.now() - }, - '$set': { - 'title': item['title'], - 'author': item['author'], - 'aid': item['aid'], - 'mid': item['mid'], - 'channel': item['channel'], - 'currentPts': item['pts'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: 
{}'.format(spider.name, error)) diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index d4eb73f..b394f85 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -12,9 +12,14 @@ import random from db import redis_connect_string +DUPEFILTER_CLASS = 'biliob_spider.filter.CloseDupefilter' + +SCHEDULER_PERSIST = True +SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue' + REDIS_URL = redis_connect_string -# LOG_FILE = "biliob_spider.log" +LOG_FILE = "biliob_spider.log" LOG_LEVEL = "WARNING" BOT_NAME = 'biliob_spider' diff --git a/biliob_spider/spiders/author_update_with_redis.py b/biliob_spider/spiders/author_update_with_redis.py index 2ca321c..b7941b5 100644 --- a/biliob_spider/spiders/author_update_with_redis.py +++ b/biliob_spider/spiders/author_update_with_redis.py @@ -13,7 +13,7 @@ class AuthorUpdateWithRedis(RedisSpider): - name = "AuthorUpdateWithRedis" + name = "authorRedis" allowed_domains = ["bilibili.com"] start_urls = [] custom_settings = { diff --git a/biliob_spider/spiders/bili_monthly_rank.py b/biliob_spider/spiders/bili_monthly_rank.py index 338ac31..c51a1f1 100644 --- a/biliob_spider/spiders/bili_monthly_rank.py +++ b/biliob_spider/spiders/bili_monthly_rank.py @@ -1,4 +1,4 @@ -#coding=utf-8 +# coding=utf-8 import scrapy from mail import mailer from scrapy.http import Request @@ -35,18 +35,25 @@ class BiliMonthlyRankSpider(scrapy.spiders.Spider): def parse(self, response): try: - url_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/@href').extract() - pts_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[2]/div/text()').extract() - mid_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/@href').extract() - - title_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/text()').extract() - author_list = response.xpath('//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/span/text()').extract() - aid_list = list(map(lambda x: int(x[27:-1]),url_list)) - pts_list = list(map(lambda x : int(x),pts_list)) - mid_list = list(map(lambda x : int(x.lstrip('//space.bilibili.com/').rstrip('/')),mid_list)) - channel = response.xpath("//li[@class='active']/text()").extract()[0] + url_list = response.xpath( + '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/@href').extract() + pts_list = response.xpath( + '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[2]/div/text()').extract() + mid_list = response.xpath( + '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/@href').extract() + + title_list = response.xpath( + '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/text()').extract() + author_list = response.xpath( + '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/span/text()').extract() + aid_list = list(map(lambda x: int(x[27:-1]), url_list)) + pts_list = list(map(lambda x: int(x), pts_list)) + mid_list = list( + map(lambda x: int(x.lstrip('//space.bilibili.com/').rstrip('/')), mid_list)) + channel = response.xpath( + "//li[@class='active']/text()").extract()[0] # 为了爬取分区、粉丝数等数据,需要进入每一个视频的详情页面进行抓取 - for each in zip(title_list,author_list,aid_list,pts_list,mid_list): + for each in zip(title_list, author_list, aid_list, pts_list, mid_list): item = RankItem() item['title'] = each[0] item['author'] = each[1] diff --git a/biliob_spider/spiders/video_spider_with_redis.py 
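# A minimal sketch (not shown in the diffs) of how a crawl is seeded once the
# scrapy_redis scheduler configured above is in use: RedisSpider subclasses
# such as the renamed "authorRedis" and "videoRedis" pop their start URLs from
# Redis, by default from the list "<spider name>:start_urls". The connection
# URL and the bilibili endpoint below are placeholder assumptions.
import redis

r = redis.from_url('redis://localhost:6379/0')
r.lpush('authorRedis:start_urls',
        'https://api.bilibili.com/x/web-interface/card?mid=12345')  # placeholder URL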
b/biliob_spider/spiders/video_spider_with_redis.py index b4cea54..4b1d230 100644 --- a/biliob_spider/spiders/video_spider_with_redis.py +++ b/biliob_spider/spiders/video_spider_with_redis.py @@ -14,7 +14,7 @@ class VideoSpiderWithRedis(RedisSpider): - name = "VideoSpiderWithRedis" + name = "videoRedis" allowed_domains = ["bilibili.com"] start_urls = [] custom_settings = { From da2530f1464272b6bc3b9bcf35dcb86cae4f422c Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 18 Feb 2019 21:46:41 +0800 Subject: [PATCH 171/469] feature: refresh cache --- biliob_spider/pipelines.py | 18 ++++++++++++++++++ .../spiders/author_update_with_redis.py | 9 ++++++++- .../spiders/video_spider_with_redis.py | 1 + 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 38252ba..515a02f 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -9,6 +9,8 @@ from db import mysql_connect import datetime import logging +import redis +from db import redis_connect_string class StrongPipeline(object): @@ -19,9 +21,11 @@ def __init__(self): self.client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 + self.redis_connection = redis.from_url(redis_connect_string) def process_item(self, item, spider): try: + self.coll = self.db['video'] self.coll.update_one({ 'aid': int(item['aid']) @@ -50,6 +54,9 @@ def process_item(self, item, spider): } } }, True) + # 刷新redis数据缓存 + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) except Exception as error: # 出现错误时打印错误日志 logging.error('{}: {}'.format(spider.name, error)) @@ -79,6 +86,8 @@ def process_item(self, item, spider): } } }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) except Exception as error: # 出现错误时打印错误日志 logging.error('{}: {}'.format(spider.name, error)) @@ -94,6 +103,7 @@ def __init__(self): settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['video'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) def process_item(self, item, spider): try: @@ -124,6 +134,8 @@ def process_item(self, item, spider): } } }, True) + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) return item except Exception as error: # 出现错误时打印错误日志 @@ -275,6 +287,7 @@ def __init__(self): settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) def process_item(self, item, spider): try: @@ -297,6 +310,8 @@ def process_item(self, item, spider): } } }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) return item except Exception as error: # 出现错误时打印错误日志 @@ -403,6 +418,7 @@ def __init__(self): settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) def process_item(self, item, spider): try: @@ -413,6 +429,8 @@ def process_item(self, item, spider): 'channels': item['channels'] }, }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) return item except Exception as error: # 出现错误时打印错误日志 diff --git a/biliob_spider/spiders/author_update_with_redis.py b/biliob_spider/spiders/author_update_with_redis.py index b7941b5..d947134 100644 --- a/biliob_spider/spiders/author_update_with_redis.py +++ 
b/biliob_spider/spiders/author_update_with_redis.py @@ -9,7 +9,9 @@ from pymongo import MongoClient import datetime from db import settings +from db import redis_connect_string from scrapy_redis.spiders import RedisSpider +import redis class AuthorUpdateWithRedis(RedisSpider): @@ -20,7 +22,7 @@ class AuthorUpdateWithRedis(RedisSpider): 'ITEM_PIPELINES': { 'biliob_spider.pipelines.AuthorPipeline': 300 }, - 'DOWNLOAD_DELAY': 2 + 'DOWNLOAD_DELAY': 10 } def __init__(self): @@ -31,12 +33,17 @@ def __init__(self): settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) def parse(self, response): try: j = json.loads(response.body) name = j['data']['card']['name'] mid = j['data']['card']['mid'] + + # 刷新redis数据缓存 + self.redis_connection.delete("author_detail::{}".format(mid)) + sex = j['data']['card']['sex'] face = j['data']['card']['face'] fans = j['data']['card']['fans'] diff --git a/biliob_spider/spiders/video_spider_with_redis.py b/biliob_spider/spiders/video_spider_with_redis.py index 4b1d230..4fe87fe 100644 --- a/biliob_spider/spiders/video_spider_with_redis.py +++ b/biliob_spider/spiders/video_spider_with_redis.py @@ -11,6 +11,7 @@ from db import settings from util import sub_channel_2_channel from scrapy_redis.spiders import RedisSpider +from db import redis_connect_string class VideoSpiderWithRedis(RedisSpider): From 7065a756f900daada2aa5019bee539ba981d4976 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 18 Feb 2019 21:46:41 +0800 Subject: [PATCH 172/469] feature: refresh cache --- biliob_spider/pipelines.py | 18 ++++++++++++++++++ .../spiders/author_update_with_redis.py | 9 ++++++++- .../spiders/video_spider_with_redis.py | 1 + 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 38252ba..515a02f 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -9,6 +9,8 @@ from db import mysql_connect import datetime import logging +import redis +from db import redis_connect_string class StrongPipeline(object): @@ -19,9 +21,11 @@ def __init__(self): self.client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 + self.redis_connection = redis.from_url(redis_connect_string) def process_item(self, item, spider): try: + self.coll = self.db['video'] self.coll.update_one({ 'aid': int(item['aid']) @@ -50,6 +54,9 @@ def process_item(self, item, spider): } } }, True) + # 刷新redis数据缓存 + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) except Exception as error: # 出现错误时打印错误日志 logging.error('{}: {}'.format(spider.name, error)) @@ -79,6 +86,8 @@ def process_item(self, item, spider): } } }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) except Exception as error: # 出现错误时打印错误日志 logging.error('{}: {}'.format(spider.name, error)) @@ -94,6 +103,7 @@ def __init__(self): settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['video'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) def process_item(self, item, spider): try: @@ -124,6 +134,8 @@ def process_item(self, item, spider): } } }, True) + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) return item except Exception as error: # 出现错误时打印错误日志 @@ -275,6 +287,7 @@ def __init__(self): settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll 
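# A minimal sketch (not part of the patch) of the cache-refresh idea this
# commit adds: after a document is rewritten in MongoDB, the matching
# "video_detail::<aid>" / "author_detail::<mid>" keys are deleted from Redis so
# the web side rebuilds them on the next read. The key names follow the diff;
# the connection URL and the ids are placeholder assumptions.
import redis

r = redis.from_url('redis://localhost:6379/0')

def refresh_detail_cache(aid=None, mid=None):
    """Drop cached detail documents after the underlying data changes."""
    if aid is not None:
        r.delete('video_detail::{}'.format(aid))
    if mid is not None:
        r.delete('author_detail::{}'.format(mid))

refresh_detail_cache(aid=170001, mid=12345)  # placeholder ids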
= self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) def process_item(self, item, spider): try: @@ -297,6 +310,8 @@ def process_item(self, item, spider): } } }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) return item except Exception as error: # 出现错误时打印错误日志 @@ -403,6 +418,7 @@ def __init__(self): settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) def process_item(self, item, spider): try: @@ -413,6 +429,8 @@ def process_item(self, item, spider): 'channels': item['channels'] }, }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) return item except Exception as error: # 出现错误时打印错误日志 diff --git a/biliob_spider/spiders/author_update_with_redis.py b/biliob_spider/spiders/author_update_with_redis.py index b7941b5..d947134 100644 --- a/biliob_spider/spiders/author_update_with_redis.py +++ b/biliob_spider/spiders/author_update_with_redis.py @@ -9,7 +9,9 @@ from pymongo import MongoClient import datetime from db import settings +from db import redis_connect_string from scrapy_redis.spiders import RedisSpider +import redis class AuthorUpdateWithRedis(RedisSpider): @@ -20,7 +22,7 @@ class AuthorUpdateWithRedis(RedisSpider): 'ITEM_PIPELINES': { 'biliob_spider.pipelines.AuthorPipeline': 300 }, - 'DOWNLOAD_DELAY': 2 + 'DOWNLOAD_DELAY': 10 } def __init__(self): @@ -31,12 +33,17 @@ def __init__(self): settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) def parse(self, response): try: j = json.loads(response.body) name = j['data']['card']['name'] mid = j['data']['card']['mid'] + + # 刷新redis数据缓存 + self.redis_connection.delete("author_detail::{}".format(mid)) + sex = j['data']['card']['sex'] face = j['data']['card']['face'] fans = j['data']['card']['fans'] diff --git a/biliob_spider/spiders/video_spider_with_redis.py b/biliob_spider/spiders/video_spider_with_redis.py index 4b1d230..4fe87fe 100644 --- a/biliob_spider/spiders/video_spider_with_redis.py +++ b/biliob_spider/spiders/video_spider_with_redis.py @@ -11,6 +11,7 @@ from db import settings from util import sub_channel_2_channel from scrapy_redis.spiders import RedisSpider +from db import redis_connect_string class VideoSpiderWithRedis(RedisSpider): From eccdc003c3704c55b38b3980306a483f65816873 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 18 Feb 2019 21:46:41 +0800 Subject: [PATCH 173/469] feature: refresh cache --- biliob_spider/pipelines.py | 18 ++++++++++++++++++ .../spiders/author_update_with_redis.py | 9 ++++++++- .../spiders/video_spider_with_redis.py | 1 + 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 38252ba..515a02f 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -9,6 +9,8 @@ from db import mysql_connect import datetime import logging +import redis +from db import redis_connect_string class StrongPipeline(object): @@ -19,9 +21,11 @@ def __init__(self): self.client.admin.authenticate(settings['MINGO_USER'], settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 + self.redis_connection = redis.from_url(redis_connect_string) def process_item(self, item, spider): try: + self.coll = self.db['video'] self.coll.update_one({ 'aid': int(item['aid']) @@ -50,6 +54,9 @@ def 
process_item(self, item, spider): } } }, True) + # 刷新redis数据缓存 + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) except Exception as error: # 出现错误时打印错误日志 logging.error('{}: {}'.format(spider.name, error)) @@ -79,6 +86,8 @@ def process_item(self, item, spider): } } }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) except Exception as error: # 出现错误时打印错误日志 logging.error('{}: {}'.format(spider.name, error)) @@ -94,6 +103,7 @@ def __init__(self): settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['video'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) def process_item(self, item, spider): try: @@ -124,6 +134,8 @@ def process_item(self, item, spider): } } }, True) + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) return item except Exception as error: # 出现错误时打印错误日志 @@ -275,6 +287,7 @@ def __init__(self): settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) def process_item(self, item, spider): try: @@ -297,6 +310,8 @@ def process_item(self, item, spider): } } }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) return item except Exception as error: # 出现错误时打印错误日志 @@ -403,6 +418,7 @@ def __init__(self): settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) def process_item(self, item, spider): try: @@ -413,6 +429,8 @@ def process_item(self, item, spider): 'channels': item['channels'] }, }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) return item except Exception as error: # 出现错误时打印错误日志 diff --git a/biliob_spider/spiders/author_update_with_redis.py b/biliob_spider/spiders/author_update_with_redis.py index b7941b5..d947134 100644 --- a/biliob_spider/spiders/author_update_with_redis.py +++ b/biliob_spider/spiders/author_update_with_redis.py @@ -9,7 +9,9 @@ from pymongo import MongoClient import datetime from db import settings +from db import redis_connect_string from scrapy_redis.spiders import RedisSpider +import redis class AuthorUpdateWithRedis(RedisSpider): @@ -20,7 +22,7 @@ class AuthorUpdateWithRedis(RedisSpider): 'ITEM_PIPELINES': { 'biliob_spider.pipelines.AuthorPipeline': 300 }, - 'DOWNLOAD_DELAY': 2 + 'DOWNLOAD_DELAY': 10 } def __init__(self): @@ -31,12 +33,17 @@ def __init__(self): settings['MONGO_PSW']) self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) def parse(self, response): try: j = json.loads(response.body) name = j['data']['card']['name'] mid = j['data']['card']['mid'] + + # 刷新redis数据缓存 + self.redis_connection.delete("author_detail::{}".format(mid)) + sex = j['data']['card']['sex'] face = j['data']['card']['face'] fans = j['data']['card']['fans'] diff --git a/biliob_spider/spiders/video_spider_with_redis.py b/biliob_spider/spiders/video_spider_with_redis.py index 4b1d230..4fe87fe 100644 --- a/biliob_spider/spiders/video_spider_with_redis.py +++ b/biliob_spider/spiders/video_spider_with_redis.py @@ -11,6 +11,7 @@ from db import settings from util import sub_channel_2_channel from scrapy_redis.spiders import RedisSpider +from db import redis_connect_string class VideoSpiderWithRedis(RedisSpider): From 
b5034f3f41eefe59a5210c63f38c27e7cbcfde20 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 24 Feb 2019 14:54:04 +0800 Subject: [PATCH 174/469] fix: scrapy.dupfilter -> scrapy.dupfilters --- biliob_spider/filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biliob_spider/filter.py b/biliob_spider/filter.py index 24537f9..a2b9f15 100644 --- a/biliob_spider/filter.py +++ b/biliob_spider/filter.py @@ -1,4 +1,4 @@ -from scrapy.dupefilter import RFPDupeFilter +from scrapy.dupefilters import RFPDupeFilter class CloseDupefilter(RFPDupeFilter): From e0beabf59d00c6903eee4937805d34e6d0f763b6 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 24 Feb 2019 14:54:04 +0800 Subject: [PATCH 175/469] fix: scrapy.dupfilter -> scrapy.dupfilters --- biliob_spider/filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biliob_spider/filter.py b/biliob_spider/filter.py index 24537f9..a2b9f15 100644 --- a/biliob_spider/filter.py +++ b/biliob_spider/filter.py @@ -1,4 +1,4 @@ -from scrapy.dupefilter import RFPDupeFilter +from scrapy.dupefilters import RFPDupeFilter class CloseDupefilter(RFPDupeFilter): From 7aef8421746db751a14a626bd0fbe43f7f17cdc4 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 24 Feb 2019 14:54:04 +0800 Subject: [PATCH 176/469] fix: scrapy.dupfilter -> scrapy.dupfilters --- biliob_spider/filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biliob_spider/filter.py b/biliob_spider/filter.py index 24537f9..a2b9f15 100644 --- a/biliob_spider/filter.py +++ b/biliob_spider/filter.py @@ -1,4 +1,4 @@ -from scrapy.dupefilter import RFPDupeFilter +from scrapy.dupefilters import RFPDupeFilter class CloseDupefilter(RFPDupeFilter): From 7dfe17c7830be5282e290faa3f54286336bf7c27 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 24 Feb 2019 23:13:38 +0800 Subject: [PATCH 177/469] feature: add keywords --- biliob_analyzer/add_keyword.py | 92 ++++++++++++++++++++++++++++++++++ biliob_analyzer/dict.txt | 2 + biliob_analyzer/online.py | 45 +++++++++++++++++ run_add_kw.py | 3 ++ run_analyzer.py | 3 ++ 5 files changed, 145 insertions(+) create mode 100644 biliob_analyzer/add_keyword.py create mode 100644 biliob_analyzer/dict.txt create mode 100644 biliob_analyzer/online.py create mode 100644 run_add_kw.py diff --git a/biliob_analyzer/add_keyword.py b/biliob_analyzer/add_keyword.py new file mode 100644 index 0000000..d18f359 --- /dev/null +++ b/biliob_analyzer/add_keyword.py @@ -0,0 +1,92 @@ +from pymongo import ReturnDocument +import jieba +from db import db + +# 载入字典 +jieba.load_userdict('./biliob_analyzer/dict.txt') + + +class AddKeyword(): + + def __init__(self): + self.mongo_author = db['author'] + self.mongo_video = db['video'] + + def get_video_kw_list(self, aid): + # 关键字从name和official中提取 + video = self.mongo_video.find_one( + {'aid': aid}, {'_id': 0, 'title': 1, 'channel': 1, 'subChannel': 1, 'author': 1, 'tag': 1}) + kw = [] + for each_key in video: + if each_key != 'keyword' or each_key != 'tag': + kw.append(str(video[each_key]).lower()) + elif each_key == 'tag': + kw += video['tag'] + else: + kw += video['keyword'] + seg_list = jieba.lcut_for_search( + ' '.join(kw), True) # 搜索引擎模式 + + # 全名算作关键字 + if 'author' in video and video['author'].lower() not in seg_list: + seg_list.append(video['author'].lower()) + + while ' ' in seg_list: + seg_list.remove(' ') + while '、' in seg_list: + seg_list.remove('、') + return list(set(seg_list)) + + def add_to_video(self, aid, seg_list): + self.mongo_video.update_one({'aid': aid}, {'$set': { + 'keyword': 
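# The dupefilter commits above only correct the import path (the module is
# named scrapy.dupefilters from Scrapy 1.0 on); the body of CloseDupefilter is
# not shown in those hunks. A dupefilter that effectively switches duplicate
# filtering off would look roughly like the sketch below; treat the method
# body as an assumption, not the project's actual implementation.
from scrapy.dupefilters import RFPDupeFilter

class CloseDupefilter(RFPDupeFilter):
    def request_seen(self, request):
        # never report a request as already seen, so nothing is filtered out
        return False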
seg_list + }}) + + def add_video_kw(self, aid): + self.add_to_video(aid, self.get_video_kw_list(aid)) + return True + + def get_author_kw_list(self, mid): + # 关键字从name和official中提取 + author = self.mongo_author.find_one( + {'mid': mid}, {'_id': 0, 'name': 1, 'official': 1, 'keyword': 1}) + kw = [] + for each_key in author: + if each_key != 'keyword': + kw.append(str(author[each_key]).lower()) + else: + kw += author['keyword'] + seg_list = jieba.lcut_for_search( + ' '.join(kw), True) # 搜索引擎模式 + + # 全名算作关键字 + if 'name' in author and author['name'].lower() not in seg_list: + seg_list.append(author['name'].lower()) + + while ' ' in seg_list: + seg_list.remove(' ') + while '、' in seg_list: + seg_list.remove('、') + return list(set(seg_list)) + + def add_author_kw(self, mid): + self.add_to_author(mid, self.get_author_kw_list(mid)) + return True + + def add_to_author(self, mid, seg_list): + self.mongo_author.update_one( + {'mid': mid}, {'$set': {'keyword': seg_list}}) + + def add_all_author(self): + authors = self.mongo_author.find( + {'keyword': {'$exists': False}}, {'_id': 0, 'mid': 1}) + for each_author in authors: + mid = each_author['mid'] + self.add_author_kw(mid) + + def add_all_video(self): + videos = self.mongo_video.find( + {'keyword': {'$exists': False}}, {'_id': 0, 'aid': 1}) + for each_video in videos: + aid = each_video['aid'] + self.add_video_kw(aid) diff --git a/biliob_analyzer/dict.txt b/biliob_analyzer/dict.txt new file mode 100644 index 0000000..496402b --- /dev/null +++ b/biliob_analyzer/dict.txt @@ -0,0 +1,2 @@ +高能联盟 +lex \ No newline at end of file diff --git a/biliob_analyzer/online.py b/biliob_analyzer/online.py new file mode 100644 index 0000000..72de82a --- /dev/null +++ b/biliob_analyzer/online.py @@ -0,0 +1,45 @@ +from db import settings +from pymongo import MongoClient +import datetime +client = MongoClient(settings['MINGO_HOST'], 27017) +# 数据库登录需要帐号密码 +client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) +db = client['biliob'] # 获得数据库的句柄 +video_online = db['video_online'] +pass +d = video_online.aggregate([ + { + '$match': + { + 'data.datetime': + { + '$gt': datetime.datetime(2019, 2, 21) + } + } + }, + { + '$limit': 1 + }, + { + '$project': + { + "title": 1, + "author": 1, + "data": + { + '$filter': + { + 'input': "$data", + 'as': "item", + 'cond': + { + '$gt': ["$$item.datetime", datetime.datetime(2019, 2, 22)] + } + } + } + } + } +]) +print(len(next(d)['data'])) +pass diff --git a/run_add_kw.py b/run_add_kw.py new file mode 100644 index 0000000..a1a131a --- /dev/null +++ b/run_add_kw.py @@ -0,0 +1,3 @@ +from biliob_analyzer.add_keyword import AddKeyword +AddKeyword().add_all_author() +AddKeyword().add_all_video() diff --git a/run_analyzer.py b/run_analyzer.py index f5fc193..4664268 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -4,6 +4,9 @@ import biliob_analyzer.author_fans_watcher import biliob_analyzer.author_rank # import biliob_analyzer.video_rank +from biliob_analyzer.add_keyword import AddKeyword +AddKeyword().add_all_author() +AddKeyword().add_all_video() author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 9a58d44fade073d2787f1b0eb6de0086742e2bcf Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 24 Feb 2019 23:13:38 +0800 Subject: [PATCH 178/469] feature: add keywords --- biliob_analyzer/add_keyword.py | 92 ++++++++++++++++++++++++++++++++++ biliob_analyzer/dict.txt | 2 + biliob_analyzer/online.py | 45 +++++++++++++++++ run_add_kw.py | 3 ++ run_analyzer.py | 3 ++ 5 files changed, 145 insertions(+) create mode 
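# A minimal sketch (not part of the patch) of the keyword extraction idea in
# add_keyword.py above: cut the title and author text with jieba's
# search-engine mode, keep the full author name as a keyword of its own, then
# de-duplicate and drop separators. The sample strings are made-up
# placeholders, not data from the project.
import jieba

title = '示例视频标题'
author = '示例UP主'
words = jieba.lcut_for_search('{} {}'.format(title, author).lower())
if author.lower() not in words:
    words.append(author.lower())
keywords = [w for w in set(words) if w.strip() and w != '、']
print(keywords)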
100644 biliob_analyzer/add_keyword.py create mode 100644 biliob_analyzer/dict.txt create mode 100644 biliob_analyzer/online.py create mode 100644 run_add_kw.py diff --git a/biliob_analyzer/add_keyword.py b/biliob_analyzer/add_keyword.py new file mode 100644 index 0000000..d18f359 --- /dev/null +++ b/biliob_analyzer/add_keyword.py @@ -0,0 +1,92 @@ +from pymongo import ReturnDocument +import jieba +from db import db + +# 载入字典 +jieba.load_userdict('./biliob_analyzer/dict.txt') + + +class AddKeyword(): + + def __init__(self): + self.mongo_author = db['author'] + self.mongo_video = db['video'] + + def get_video_kw_list(self, aid): + # 关键字从name和official中提取 + video = self.mongo_video.find_one( + {'aid': aid}, {'_id': 0, 'title': 1, 'channel': 1, 'subChannel': 1, 'author': 1, 'tag': 1}) + kw = [] + for each_key in video: + if each_key != 'keyword' or each_key != 'tag': + kw.append(str(video[each_key]).lower()) + elif each_key == 'tag': + kw += video['tag'] + else: + kw += video['keyword'] + seg_list = jieba.lcut_for_search( + ' '.join(kw), True) # 搜索引擎模式 + + # 全名算作关键字 + if 'author' in video and video['author'].lower() not in seg_list: + seg_list.append(video['author'].lower()) + + while ' ' in seg_list: + seg_list.remove(' ') + while '、' in seg_list: + seg_list.remove('、') + return list(set(seg_list)) + + def add_to_video(self, aid, seg_list): + self.mongo_video.update_one({'aid': aid}, {'$set': { + 'keyword': seg_list + }}) + + def add_video_kw(self, aid): + self.add_to_video(aid, self.get_video_kw_list(aid)) + return True + + def get_author_kw_list(self, mid): + # 关键字从name和official中提取 + author = self.mongo_author.find_one( + {'mid': mid}, {'_id': 0, 'name': 1, 'official': 1, 'keyword': 1}) + kw = [] + for each_key in author: + if each_key != 'keyword': + kw.append(str(author[each_key]).lower()) + else: + kw += author['keyword'] + seg_list = jieba.lcut_for_search( + ' '.join(kw), True) # 搜索引擎模式 + + # 全名算作关键字 + if 'name' in author and author['name'].lower() not in seg_list: + seg_list.append(author['name'].lower()) + + while ' ' in seg_list: + seg_list.remove(' ') + while '、' in seg_list: + seg_list.remove('、') + return list(set(seg_list)) + + def add_author_kw(self, mid): + self.add_to_author(mid, self.get_author_kw_list(mid)) + return True + + def add_to_author(self, mid, seg_list): + self.mongo_author.update_one( + {'mid': mid}, {'$set': {'keyword': seg_list}}) + + def add_all_author(self): + authors = self.mongo_author.find( + {'keyword': {'$exists': False}}, {'_id': 0, 'mid': 1}) + for each_author in authors: + mid = each_author['mid'] + self.add_author_kw(mid) + + def add_all_video(self): + videos = self.mongo_video.find( + {'keyword': {'$exists': False}}, {'_id': 0, 'aid': 1}) + for each_video in videos: + aid = each_video['aid'] + self.add_video_kw(aid) diff --git a/biliob_analyzer/dict.txt b/biliob_analyzer/dict.txt new file mode 100644 index 0000000..496402b --- /dev/null +++ b/biliob_analyzer/dict.txt @@ -0,0 +1,2 @@ +高能联盟 +lex \ No newline at end of file diff --git a/biliob_analyzer/online.py b/biliob_analyzer/online.py new file mode 100644 index 0000000..72de82a --- /dev/null +++ b/biliob_analyzer/online.py @@ -0,0 +1,45 @@ +from db import settings +from pymongo import MongoClient +import datetime +client = MongoClient(settings['MINGO_HOST'], 27017) +# 数据库登录需要帐号密码 +client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) +db = client['biliob'] # 获得数据库的句柄 +video_online = db['video_online'] +pass +d = video_online.aggregate([ + { + '$match': + { + 'data.datetime': + { 
+ '$gt': datetime.datetime(2019, 2, 21) + } + } + }, + { + '$limit': 1 + }, + { + '$project': + { + "title": 1, + "author": 1, + "data": + { + '$filter': + { + 'input': "$data", + 'as': "item", + 'cond': + { + '$gt': ["$$item.datetime", datetime.datetime(2019, 2, 22)] + } + } + } + } + } +]) +print(len(next(d)['data'])) +pass diff --git a/run_add_kw.py b/run_add_kw.py new file mode 100644 index 0000000..a1a131a --- /dev/null +++ b/run_add_kw.py @@ -0,0 +1,3 @@ +from biliob_analyzer.add_keyword import AddKeyword +AddKeyword().add_all_author() +AddKeyword().add_all_video() diff --git a/run_analyzer.py b/run_analyzer.py index f5fc193..4664268 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -4,6 +4,9 @@ import biliob_analyzer.author_fans_watcher import biliob_analyzer.author_rank # import biliob_analyzer.video_rank +from biliob_analyzer.add_keyword import AddKeyword +AddKeyword().add_all_author() +AddKeyword().add_all_video() author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 893a41b278d7cd2f0031bdfb293d14851c2e233b Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 24 Feb 2019 23:13:38 +0800 Subject: [PATCH 179/469] feature: add keywords --- biliob_analyzer/add_keyword.py | 92 ++++++++++++++++++++++++++++++++++ biliob_analyzer/dict.txt | 2 + biliob_analyzer/online.py | 45 +++++++++++++++++ run_add_kw.py | 3 ++ run_analyzer.py | 3 ++ 5 files changed, 145 insertions(+) create mode 100644 biliob_analyzer/add_keyword.py create mode 100644 biliob_analyzer/dict.txt create mode 100644 biliob_analyzer/online.py create mode 100644 run_add_kw.py diff --git a/biliob_analyzer/add_keyword.py b/biliob_analyzer/add_keyword.py new file mode 100644 index 0000000..d18f359 --- /dev/null +++ b/biliob_analyzer/add_keyword.py @@ -0,0 +1,92 @@ +from pymongo import ReturnDocument +import jieba +from db import db + +# 载入字典 +jieba.load_userdict('./biliob_analyzer/dict.txt') + + +class AddKeyword(): + + def __init__(self): + self.mongo_author = db['author'] + self.mongo_video = db['video'] + + def get_video_kw_list(self, aid): + # 关键字从name和official中提取 + video = self.mongo_video.find_one( + {'aid': aid}, {'_id': 0, 'title': 1, 'channel': 1, 'subChannel': 1, 'author': 1, 'tag': 1}) + kw = [] + for each_key in video: + if each_key != 'keyword' or each_key != 'tag': + kw.append(str(video[each_key]).lower()) + elif each_key == 'tag': + kw += video['tag'] + else: + kw += video['keyword'] + seg_list = jieba.lcut_for_search( + ' '.join(kw), True) # 搜索引擎模式 + + # 全名算作关键字 + if 'author' in video and video['author'].lower() not in seg_list: + seg_list.append(video['author'].lower()) + + while ' ' in seg_list: + seg_list.remove(' ') + while '、' in seg_list: + seg_list.remove('、') + return list(set(seg_list)) + + def add_to_video(self, aid, seg_list): + self.mongo_video.update_one({'aid': aid}, {'$set': { + 'keyword': seg_list + }}) + + def add_video_kw(self, aid): + self.add_to_video(aid, self.get_video_kw_list(aid)) + return True + + def get_author_kw_list(self, mid): + # 关键字从name和official中提取 + author = self.mongo_author.find_one( + {'mid': mid}, {'_id': 0, 'name': 1, 'official': 1, 'keyword': 1}) + kw = [] + for each_key in author: + if each_key != 'keyword': + kw.append(str(author[each_key]).lower()) + else: + kw += author['keyword'] + seg_list = jieba.lcut_for_search( + ' '.join(kw), True) # 搜索引擎模式 + + # 全名算作关键字 + if 'name' in author and author['name'].lower() not in seg_list: + seg_list.append(author['name'].lower()) + + while ' ' in seg_list: + seg_list.remove(' ') + while '、' in seg_list: + 
seg_list.remove('、') + return list(set(seg_list)) + + def add_author_kw(self, mid): + self.add_to_author(mid, self.get_author_kw_list(mid)) + return True + + def add_to_author(self, mid, seg_list): + self.mongo_author.update_one( + {'mid': mid}, {'$set': {'keyword': seg_list}}) + + def add_all_author(self): + authors = self.mongo_author.find( + {'keyword': {'$exists': False}}, {'_id': 0, 'mid': 1}) + for each_author in authors: + mid = each_author['mid'] + self.add_author_kw(mid) + + def add_all_video(self): + videos = self.mongo_video.find( + {'keyword': {'$exists': False}}, {'_id': 0, 'aid': 1}) + for each_video in videos: + aid = each_video['aid'] + self.add_video_kw(aid) diff --git a/biliob_analyzer/dict.txt b/biliob_analyzer/dict.txt new file mode 100644 index 0000000..496402b --- /dev/null +++ b/biliob_analyzer/dict.txt @@ -0,0 +1,2 @@ +高能联盟 +lex \ No newline at end of file diff --git a/biliob_analyzer/online.py b/biliob_analyzer/online.py new file mode 100644 index 0000000..72de82a --- /dev/null +++ b/biliob_analyzer/online.py @@ -0,0 +1,45 @@ +from db import settings +from pymongo import MongoClient +import datetime +client = MongoClient(settings['MINGO_HOST'], 27017) +# 数据库登录需要帐号密码 +client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) +db = client['biliob'] # 获得数据库的句柄 +video_online = db['video_online'] +pass +d = video_online.aggregate([ + { + '$match': + { + 'data.datetime': + { + '$gt': datetime.datetime(2019, 2, 21) + } + } + }, + { + '$limit': 1 + }, + { + '$project': + { + "title": 1, + "author": 1, + "data": + { + '$filter': + { + 'input': "$data", + 'as': "item", + 'cond': + { + '$gt': ["$$item.datetime", datetime.datetime(2019, 2, 22)] + } + } + } + } + } +]) +print(len(next(d)['data'])) +pass diff --git a/run_add_kw.py b/run_add_kw.py new file mode 100644 index 0000000..a1a131a --- /dev/null +++ b/run_add_kw.py @@ -0,0 +1,3 @@ +from biliob_analyzer.add_keyword import AddKeyword +AddKeyword().add_all_author() +AddKeyword().add_all_video() diff --git a/run_analyzer.py b/run_analyzer.py index f5fc193..4664268 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -4,6 +4,9 @@ import biliob_analyzer.author_fans_watcher import biliob_analyzer.author_rank # import biliob_analyzer.video_rank +from biliob_analyzer.add_keyword import AddKeyword +AddKeyword().add_all_author() +AddKeyword().add_all_video() author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From f8490dddc05d7aecd21b5b0af974b660c5ae7f75 Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 25 Feb 2019 14:31:08 +0800 Subject: [PATCH 180/469] update --- biliob_spider/pipelines.py | 924 +++++++++++++++++++------------------ 1 file changed, 471 insertions(+), 453 deletions(-) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 38252ba..b7ef183 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -1,453 +1,471 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html -from pymongo import MongoClient -from db import settings -from db import mysql_connect -import datetime -import logging - - -class StrongPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - - def process_item(self, item, spider): - try: 
- self.coll = self.db['video'] - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'cView': item['current_view'], - 'cFavorite': item['current_favorite'], - 'cDanmaku': item['current_danmaku'], - 'cCoin': item['current_coin'], - 'cShare': item['current_share'], - 'cLike': item['current_like'], - 'cDatetime': item['current_datetime'], - 'author': item['author'], - 'subChannel': item['subChannel'], - 'channel': item['channel'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': datetime.datetime.fromtimestamp( - item['datetime']) - }, - '$push': { - 'data': { - '$each': [item['data_video']], - '$position': 0 - } - } - }, True) - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - try: - self.coll = self.db['author'] # 获得collection的句柄 - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'focus': True, - 'sex': item['sex'], - 'name': item['name'], - 'face': item['face'], - 'level': item['level'], - 'cFans': item['c_fans'], - 'official': item['official'], - 'cArchive': item['c_archive'], - 'cArticle': item['c_article'], - 'cAttention': item['c_attention'], - 'cArchive_view': item['c_archive_view'], - 'cArticle_view': item['c_article_view'], - }, - '$push': { - 'data': { - '$each': [item['data_author']], - '$position': 0 - } - } - }, True) - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - return item - - -class VideoPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'cView': item['current_view'], - 'cFavorite': item['current_favorite'], - 'cDanmaku': item['current_danmaku'], - 'cCoin': item['current_coin'], - 'cShare': item['current_share'], - 'cLike': item['current_like'], - 'cDatetime': item['current_datetime'], - 'author': item['author'], - 'subChannel': item['subChannel'], - 'channel': item['channel'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': datetime.datetime.fromtimestamp( - item['datetime']) - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class VideoPipelineFromKan(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'author': item['author'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': item['datetime'] - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class BangumiPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - 
self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['bangumi'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'cover': item['cover'], - # 'isFinish': item['is_finish'], - # 'isStarted': item['is_started'], - 'newest': item['newest_ep_index'], - 'currentPts': item['data']['pts'], - 'currentPlay': item['data']['play'], - # 'squareCover': item['square_cover'], - 'currentWatch': item['data']['watch'], - 'currentReview': item['data']['review'], - 'currentDanmaku': item['data']['danmaku'] - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class DonghuaPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['donghua'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'cover': item['cover'], - # 'isFinish': item['is_finish'], - # 'isStarted': item['is_started'], - 'newest': item['newest_ep_index'], - 'currentPts': item['data']['pts'], - 'currentPlay': item['data']['play'], - # 'squareCover': item['square_cover'], - 'currentWatch': item['data']['watch'], - 'currentReview': item['data']['review'], - 'currentDanmaku': item['data']['danmaku'] - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class SiteInfoPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['site_info'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.insert_one({ - 'region_count': item['region_count'], - 'all_count': item['all_count'], - 'web_online': item['web_online'], - 'play_online': item['play_online'], - 'datetime': datetime.datetime.now() - }) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class AuthorPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['author'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'focus': True, - 'sex': item['sex'], - 'name': item['name'], - 'face': item['face'], - 'level': item['level'], - 'cFans': item['c_fans'], - 'official': item['official'], - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class OnlinePipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 
数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video_online'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'author': item['author'], - 'channel': item['channel'], - 'subChannel': item['subChannel'], - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class TagPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['tag'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - - self.coll.update_one({ - 'tag_id': item['tag_id'] - }, { - '$set': { - 'tag_name': item['tag_name'], - 'ctime': item['ctime'], - }, - '$addToSet': { - 'use': item['use'], - 'atten': item['atten'], - 'datetime': datetime.datetime.now() - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class VideoAddPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - for each_aid in item['aid']: - self.coll.update_one({ - 'aid': each_aid - }, { - '$set': { - 'aid': each_aid, - 'focus': True - }, - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class AuthorChannelPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['author'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'channels': item['channels'] - }, - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class BiliMonthlyRankPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['monthly_rank'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': item['aid'] - }, { - '$addToSet': { - 'pts': item['pts'], - 'datetime': datetime.datetime.now() - }, - '$set': { - 'title': item['title'], - 'author': item['author'], - 'aid': item['aid'], - 'mid': item['mid'], - 'channel': item['channel'], - 'currentPts': item['pts'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: 
https://doc.scrapy.org/en/latest/topics/item-pipeline.html +from pymongo import MongoClient +from db import settings +from db import mysql_connect +import datetime +import logging +import redis +from db import redis_connect_string + + +class StrongPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def process_item(self, item, spider): + try: + + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'cView': item['current_view'], + 'cFavorite': item['current_favorite'], + 'cDanmaku': item['current_danmaku'], + 'cCoin': item['current_coin'], + 'cShare': item['current_share'], + 'cLike': item['current_like'], + 'cDatetime': item['current_datetime'], + 'author': item['author'], + 'subChannel': item['subChannel'], + 'channel': item['channel'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': datetime.datetime.fromtimestamp( + item['datetime']) + }, + '$push': { + 'data': { + '$each': [item['data_video']], + '$position': 0 + } + } + }, True) + # 刷新redis数据缓存 + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + try: + self.coll = self.db['author'] # 获得collection的句柄 + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'focus': True, + 'sex': item['sex'], + 'name': item['name'], + 'face': item['face'], + 'level': item['level'], + 'cFans': item['c_fans'], + 'official': item['official'], + 'cArchive': item['c_archive'], + 'cArticle': item['c_article'], + 'cAttention': item['c_attention'], + 'cArchive_view': item['c_archive_view'], + 'cArticle_view': item['c_article_view'], + }, + '$push': { + 'data': { + '$each': [item['data_author']], + '$position': 0 + } + } + }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + return item + + +class VideoPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'cView': item['current_view'], + 'cFavorite': item['current_favorite'], + 'cDanmaku': item['current_danmaku'], + 'cCoin': item['current_coin'], + 'cShare': item['current_share'], + 'cLike': item['current_like'], + 'cDatetime': item['current_datetime'], + 'author': item['author'], + 'subChannel': item['subChannel'], + 'channel': item['channel'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': datetime.datetime.fromtimestamp( + item['datetime']) + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class 
VideoPipelineFromKan(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'author': item['author'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': item['datetime'] + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class BangumiPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['bangumi'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'cover': item['cover'], + # 'isFinish': item['is_finish'], + # 'isStarted': item['is_started'], + 'newest': item['newest_ep_index'], + 'currentPts': item['data']['pts'], + 'currentPlay': item['data']['play'], + # 'squareCover': item['square_cover'], + 'currentWatch': item['data']['watch'], + 'currentReview': item['data']['review'], + 'currentDanmaku': item['data']['danmaku'] + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class DonghuaPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['donghua'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'cover': item['cover'], + # 'isFinish': item['is_finish'], + # 'isStarted': item['is_started'], + 'newest': item['newest_ep_index'], + 'currentPts': item['data']['pts'], + 'currentPlay': item['data']['play'], + # 'squareCover': item['square_cover'], + 'currentWatch': item['data']['watch'], + 'currentReview': item['data']['review'], + 'currentDanmaku': item['data']['danmaku'] + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class SiteInfoPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['site_info'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.insert_one({ + 'region_count': item['region_count'], + 'all_count': item['all_count'], + 'web_online': item['web_online'], + 'play_online': item['play_online'], + 'datetime': datetime.datetime.now() + }) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class AuthorPipeline(object): + 
def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'focus': True, + 'sex': item['sex'], + 'name': item['name'], + 'face': item['face'], + 'level': item['level'], + 'cFans': item['c_fans'], + 'official': item['official'], + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class OnlinePipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video_online'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'author': item['author'], + 'channel': item['channel'], + 'subChannel': item['subChannel'], + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class TagPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['tag'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + + self.coll.update_one({ + 'tag_id': item['tag_id'] + }, { + '$set': { + 'tag_name': item['tag_name'], + 'ctime': item['ctime'], + }, + '$addToSet': { + 'use': item['use'], + 'atten': item['atten'], + 'datetime': datetime.datetime.now() + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class VideoAddPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + for each_aid in item['aid']: + self.coll.update_one({ + 'aid': each_aid + }, { + '$set': { + 'aid': each_aid, + 'focus': True + }, + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class AuthorChannelPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'channels': item['channels'] 
+ }, + }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class BiliMonthlyRankPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['monthly_rank'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': item['aid'] + }, { + '$addToSet': { + 'pts': item['pts'], + 'datetime': datetime.datetime.now() + }, + '$set': { + 'title': item['title'], + 'author': item['author'], + 'aid': item['aid'], + 'mid': item['mid'], + 'channel': item['channel'], + 'currentPts': item['pts'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) From 7a362b1da9296091cba4112ae1da34768fcc86fa Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 25 Feb 2019 14:31:08 +0800 Subject: [PATCH 181/469] update --- biliob_spider/pipelines.py | 924 +++++++++++++++++++------------------ 1 file changed, 471 insertions(+), 453 deletions(-) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 38252ba..b7ef183 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -1,453 +1,471 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html -from pymongo import MongoClient -from db import settings -from db import mysql_connect -import datetime -import logging - - -class StrongPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - - def process_item(self, item, spider): - try: - self.coll = self.db['video'] - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'cView': item['current_view'], - 'cFavorite': item['current_favorite'], - 'cDanmaku': item['current_danmaku'], - 'cCoin': item['current_coin'], - 'cShare': item['current_share'], - 'cLike': item['current_like'], - 'cDatetime': item['current_datetime'], - 'author': item['author'], - 'subChannel': item['subChannel'], - 'channel': item['channel'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': datetime.datetime.fromtimestamp( - item['datetime']) - }, - '$push': { - 'data': { - '$each': [item['data_video']], - '$position': 0 - } - } - }, True) - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - try: - self.coll = self.db['author'] # 获得collection的句柄 - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'focus': True, - 'sex': item['sex'], - 'name': item['name'], - 'face': item['face'], - 'level': item['level'], - 'cFans': item['c_fans'], - 'official': item['official'], - 'cArchive': item['c_archive'], - 'cArticle': item['c_article'], - 'cAttention': item['c_attention'], - 'cArchive_view': item['c_archive_view'], - 'cArticle_view': item['c_article_view'], - }, - '$push': { - 'data': { - '$each': [item['data_author']], - '$position': 0 - } - } - }, True) - except Exception as error: - # 出现错误时打印错误日志 - 
logging.error('{}: {}'.format(spider.name, error)) - return item - - -class VideoPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'cView': item['current_view'], - 'cFavorite': item['current_favorite'], - 'cDanmaku': item['current_danmaku'], - 'cCoin': item['current_coin'], - 'cShare': item['current_share'], - 'cLike': item['current_like'], - 'cDatetime': item['current_datetime'], - 'author': item['author'], - 'subChannel': item['subChannel'], - 'channel': item['channel'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': datetime.datetime.fromtimestamp( - item['datetime']) - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class VideoPipelineFromKan(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'author': item['author'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': item['datetime'] - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class BangumiPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['bangumi'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'cover': item['cover'], - # 'isFinish': item['is_finish'], - # 'isStarted': item['is_started'], - 'newest': item['newest_ep_index'], - 'currentPts': item['data']['pts'], - 'currentPlay': item['data']['play'], - # 'squareCover': item['square_cover'], - 'currentWatch': item['data']['watch'], - 'currentReview': item['data']['review'], - 'currentDanmaku': item['data']['danmaku'] - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class DonghuaPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['donghua'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'cover': item['cover'], - # 'isFinish': item['is_finish'], - # 'isStarted': 
item['is_started'], - 'newest': item['newest_ep_index'], - 'currentPts': item['data']['pts'], - 'currentPlay': item['data']['play'], - # 'squareCover': item['square_cover'], - 'currentWatch': item['data']['watch'], - 'currentReview': item['data']['review'], - 'currentDanmaku': item['data']['danmaku'] - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class SiteInfoPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['site_info'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.insert_one({ - 'region_count': item['region_count'], - 'all_count': item['all_count'], - 'web_online': item['web_online'], - 'play_online': item['play_online'], - 'datetime': datetime.datetime.now() - }) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class AuthorPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['author'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'focus': True, - 'sex': item['sex'], - 'name': item['name'], - 'face': item['face'], - 'level': item['level'], - 'cFans': item['c_fans'], - 'official': item['official'], - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class OnlinePipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video_online'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'author': item['author'], - 'channel': item['channel'], - 'subChannel': item['subChannel'], - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class TagPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['tag'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - - self.coll.update_one({ - 'tag_id': item['tag_id'] - }, { - '$set': { - 'tag_name': item['tag_name'], - 'ctime': item['ctime'], - }, - '$addToSet': { - 'use': item['use'], - 'atten': item['atten'], - 'datetime': datetime.datetime.now() - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class VideoAddPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = 
MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - for each_aid in item['aid']: - self.coll.update_one({ - 'aid': each_aid - }, { - '$set': { - 'aid': each_aid, - 'focus': True - }, - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class AuthorChannelPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['author'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'channels': item['channels'] - }, - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class BiliMonthlyRankPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['monthly_rank'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': item['aid'] - }, { - '$addToSet': { - 'pts': item['pts'], - 'datetime': datetime.datetime.now() - }, - '$set': { - 'title': item['title'], - 'author': item['author'], - 'aid': item['aid'], - 'mid': item['mid'], - 'channel': item['channel'], - 'currentPts': item['pts'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html +from pymongo import MongoClient +from db import settings +from db import mysql_connect +import datetime +import logging +import redis +from db import redis_connect_string + + +class StrongPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def process_item(self, item, spider): + try: + + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'cView': item['current_view'], + 'cFavorite': item['current_favorite'], + 'cDanmaku': item['current_danmaku'], + 'cCoin': item['current_coin'], + 'cShare': item['current_share'], + 'cLike': item['current_like'], + 'cDatetime': item['current_datetime'], + 'author': item['author'], + 'subChannel': item['subChannel'], + 'channel': item['channel'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': datetime.datetime.fromtimestamp( + item['datetime']) + }, + '$push': { + 'data': { + '$each': [item['data_video']], + '$position': 0 + } + } + }, True) + # 刷新redis数据缓存 + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) + except Exception as error: + # 出现错误时打印错误日志 + 
logging.error('{}: {}'.format(spider.name, error)) + try: + self.coll = self.db['author'] # 获得collection的句柄 + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'focus': True, + 'sex': item['sex'], + 'name': item['name'], + 'face': item['face'], + 'level': item['level'], + 'cFans': item['c_fans'], + 'official': item['official'], + 'cArchive': item['c_archive'], + 'cArticle': item['c_article'], + 'cAttention': item['c_attention'], + 'cArchive_view': item['c_archive_view'], + 'cArticle_view': item['c_article_view'], + }, + '$push': { + 'data': { + '$each': [item['data_author']], + '$position': 0 + } + } + }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + return item + + +class VideoPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'cView': item['current_view'], + 'cFavorite': item['current_favorite'], + 'cDanmaku': item['current_danmaku'], + 'cCoin': item['current_coin'], + 'cShare': item['current_share'], + 'cLike': item['current_like'], + 'cDatetime': item['current_datetime'], + 'author': item['author'], + 'subChannel': item['subChannel'], + 'channel': item['channel'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': datetime.datetime.fromtimestamp( + item['datetime']) + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class VideoPipelineFromKan(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'author': item['author'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': item['datetime'] + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class BangumiPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['bangumi'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'cover': item['cover'], + # 'isFinish': item['is_finish'], + # 'isStarted': item['is_started'], + 'newest': item['newest_ep_index'], + 'currentPts': item['data']['pts'], + 'currentPlay': item['data']['play'], + # 
'squareCover': item['square_cover'], + 'currentWatch': item['data']['watch'], + 'currentReview': item['data']['review'], + 'currentDanmaku': item['data']['danmaku'] + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class DonghuaPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['donghua'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'cover': item['cover'], + # 'isFinish': item['is_finish'], + # 'isStarted': item['is_started'], + 'newest': item['newest_ep_index'], + 'currentPts': item['data']['pts'], + 'currentPlay': item['data']['play'], + # 'squareCover': item['square_cover'], + 'currentWatch': item['data']['watch'], + 'currentReview': item['data']['review'], + 'currentDanmaku': item['data']['danmaku'] + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class SiteInfoPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['site_info'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.insert_one({ + 'region_count': item['region_count'], + 'all_count': item['all_count'], + 'web_online': item['web_online'], + 'play_online': item['play_online'], + 'datetime': datetime.datetime.now() + }) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class AuthorPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'focus': True, + 'sex': item['sex'], + 'name': item['name'], + 'face': item['face'], + 'level': item['level'], + 'cFans': item['c_fans'], + 'official': item['official'], + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class OnlinePipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video_online'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'author': item['author'], + 'channel': item['channel'], + 
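# The per-crawl history arrays are accumulated two different ways in this file: several
# pipelines (online / bangumi / donghua / tag / monthly rank) use '$addToSet', so a
# snapshot is appended to 'data' only if an identical value is not already there, while
# the video/author pipelines use '$push' with '$position': 0. A minimal illustration of
# the '$addToSet' behaviour, assuming a local unauthenticated MongoDB (the real host and
# credentials come from db.settings); add_online_snapshot is a hypothetical helper, not
# project code:
from pymongo import MongoClient

def add_online_snapshot(title, snapshot, mongo_uri='mongodb://localhost:27017'):
    coll = MongoClient(mongo_uri)['biliob']['video_online']
    # Running this twice with the same snapshot leaves only one copy in 'data'.
    coll.update_one({'title': title},
                    {'$addToSet': {'data': snapshot}},
                    upsert=True)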
'subChannel': item['subChannel'], + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class TagPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['tag'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + + self.coll.update_one({ + 'tag_id': item['tag_id'] + }, { + '$set': { + 'tag_name': item['tag_name'], + 'ctime': item['ctime'], + }, + '$addToSet': { + 'use': item['use'], + 'atten': item['atten'], + 'datetime': datetime.datetime.now() + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class VideoAddPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + for each_aid in item['aid']: + self.coll.update_one({ + 'aid': each_aid + }, { + '$set': { + 'aid': each_aid, + 'focus': True + }, + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class AuthorChannelPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'channels': item['channels'] + }, + }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class BiliMonthlyRankPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['monthly_rank'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': item['aid'] + }, { + '$addToSet': { + 'pts': item['pts'], + 'datetime': datetime.datetime.now() + }, + '$set': { + 'title': item['title'], + 'author': item['author'], + 'aid': item['aid'], + 'mid': item['mid'], + 'channel': item['channel'], + 'currentPts': item['pts'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) From 7c48ce8d3b510c87f37364cffe711456db6dcb81 Mon Sep 17 00:00:00 2001 From: jannchie Date: Mon, 25 Feb 2019 14:31:08 +0800 Subject: [PATCH 182/469] update --- biliob_spider/pipelines.py | 924 +++++++++++++++++++------------------ 1 file changed, 471 insertions(+), 453 deletions(-) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 38252ba..b7ef183 100644 --- a/biliob_spider/pipelines.py +++ 
b/biliob_spider/pipelines.py @@ -1,453 +1,471 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html -from pymongo import MongoClient -from db import settings -from db import mysql_connect -import datetime -import logging - - -class StrongPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - - def process_item(self, item, spider): - try: - self.coll = self.db['video'] - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'cView': item['current_view'], - 'cFavorite': item['current_favorite'], - 'cDanmaku': item['current_danmaku'], - 'cCoin': item['current_coin'], - 'cShare': item['current_share'], - 'cLike': item['current_like'], - 'cDatetime': item['current_datetime'], - 'author': item['author'], - 'subChannel': item['subChannel'], - 'channel': item['channel'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': datetime.datetime.fromtimestamp( - item['datetime']) - }, - '$push': { - 'data': { - '$each': [item['data_video']], - '$position': 0 - } - } - }, True) - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - try: - self.coll = self.db['author'] # 获得collection的句柄 - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'focus': True, - 'sex': item['sex'], - 'name': item['name'], - 'face': item['face'], - 'level': item['level'], - 'cFans': item['c_fans'], - 'official': item['official'], - 'cArchive': item['c_archive'], - 'cArticle': item['c_article'], - 'cAttention': item['c_attention'], - 'cArchive_view': item['c_archive_view'], - 'cArticle_view': item['c_article_view'], - }, - '$push': { - 'data': { - '$each': [item['data_author']], - '$position': 0 - } - } - }, True) - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - return item - - -class VideoPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'cView': item['current_view'], - 'cFavorite': item['current_favorite'], - 'cDanmaku': item['current_danmaku'], - 'cCoin': item['current_coin'], - 'cShare': item['current_share'], - 'cLike': item['current_like'], - 'cDatetime': item['current_datetime'], - 'author': item['author'], - 'subChannel': item['subChannel'], - 'channel': item['channel'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': datetime.datetime.fromtimestamp( - item['datetime']) - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class VideoPipelineFromKan(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = 
self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'author': item['author'], - 'mid': item['mid'], - 'pic': item['pic'], - 'title': item['title'], - 'datetime': item['datetime'] - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class BangumiPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['bangumi'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'cover': item['cover'], - # 'isFinish': item['is_finish'], - # 'isStarted': item['is_started'], - 'newest': item['newest_ep_index'], - 'currentPts': item['data']['pts'], - 'currentPlay': item['data']['play'], - # 'squareCover': item['square_cover'], - 'currentWatch': item['data']['watch'], - 'currentReview': item['data']['review'], - 'currentDanmaku': item['data']['danmaku'] - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class DonghuaPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['donghua'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'cover': item['cover'], - # 'isFinish': item['is_finish'], - # 'isStarted': item['is_started'], - 'newest': item['newest_ep_index'], - 'currentPts': item['data']['pts'], - 'currentPlay': item['data']['play'], - # 'squareCover': item['square_cover'], - 'currentWatch': item['data']['watch'], - 'currentReview': item['data']['review'], - 'currentDanmaku': item['data']['danmaku'] - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class SiteInfoPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['site_info'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.insert_one({ - 'region_count': item['region_count'], - 'all_count': item['all_count'], - 'web_online': item['web_online'], - 'play_online': item['play_online'], - 'datetime': datetime.datetime.now() - }) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class AuthorPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 
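# Most collections here are written with one pattern: update_one() keyed on a stable id
# (mid / aid / title) with upsert=True, '$set' for the latest scalar fields, and '$push'
# using '$each' + '$position': 0 so the newest snapshot always ends up at data[0].
# A self-contained sketch of that pattern, assuming a local unauthenticated MongoDB
# (the real host and credentials come from db.settings, not shown here);
# upsert_author_snapshot is a hypothetical helper, not project code:
from pymongo import MongoClient

def upsert_author_snapshot(mid, snapshot, mongo_uri='mongodb://localhost:27017'):
    coll = MongoClient(mongo_uri)['biliob']['author']
    coll.update_one(
        {'mid': mid},                                  # match on the author id
        {
            '$set': {'focus': True},                   # latest scalar fields go here
            '$push': {'data': {'$each': [snapshot],    # prepend the newest snapshot
                               '$position': 0}},
        },
        upsert=True)                                   # create the document if missing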
- self.coll = self.db['author'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'focus': True, - 'sex': item['sex'], - 'name': item['name'], - 'face': item['face'], - 'level': item['level'], - 'cFans': item['c_fans'], - 'official': item['official'], - }, - '$push': { - 'data': { - '$each': [item['data']], - '$position': 0 - } - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class OnlinePipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video_online'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - - self.coll.update_one({ - 'title': item['title'] - }, { - '$set': { - 'title': item['title'], - 'author': item['author'], - 'channel': item['channel'], - 'subChannel': item['subChannel'], - }, - '$addToSet': { - 'data': item['data'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class TagPipeLine(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['tag'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - - self.coll.update_one({ - 'tag_id': item['tag_id'] - }, { - '$set': { - 'tag_name': item['tag_name'], - 'ctime': item['ctime'], - }, - '$addToSet': { - 'use': item['use'], - 'atten': item['atten'], - 'datetime': datetime.datetime.now() - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class VideoAddPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - for each_aid in item['aid']: - self.coll.update_one({ - 'aid': each_aid - }, { - '$set': { - 'aid': each_aid, - 'focus': True - }, - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class AuthorChannelPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['author'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'mid': item['mid'] - }, { - '$set': { - 'channels': item['channels'] - }, - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) - - -class BiliMonthlyRankPipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = 
self.db['monthly_rank'] # 获得collection的句柄 - - def process_item(self, item, spider): - try: - self.coll.update_one({ - 'aid': item['aid'] - }, { - '$addToSet': { - 'pts': item['pts'], - 'datetime': datetime.datetime.now() - }, - '$set': { - 'title': item['title'], - 'author': item['author'], - 'aid': item['aid'], - 'mid': item['mid'], - 'channel': item['channel'], - 'currentPts': item['pts'] - } - }, True) - return item - except Exception as error: - # 出现错误时打印错误日志 - logging.error('{}: {}'.format(spider.name, error)) +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html +from pymongo import MongoClient +from db import settings +from db import mysql_connect +import datetime +import logging +import redis +from db import redis_connect_string + + +class StrongPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def process_item(self, item, spider): + try: + + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'cView': item['current_view'], + 'cFavorite': item['current_favorite'], + 'cDanmaku': item['current_danmaku'], + 'cCoin': item['current_coin'], + 'cShare': item['current_share'], + 'cLike': item['current_like'], + 'cDatetime': item['current_datetime'], + 'author': item['author'], + 'subChannel': item['subChannel'], + 'channel': item['channel'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': datetime.datetime.fromtimestamp( + item['datetime']) + }, + '$push': { + 'data': { + '$each': [item['data_video']], + '$position': 0 + } + } + }, True) + # 刷新redis数据缓存 + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + try: + self.coll = self.db['author'] # 获得collection的句柄 + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'focus': True, + 'sex': item['sex'], + 'name': item['name'], + 'face': item['face'], + 'level': item['level'], + 'cFans': item['c_fans'], + 'official': item['official'], + 'cArchive': item['c_archive'], + 'cArticle': item['c_article'], + 'cAttention': item['c_attention'], + 'cArchive_view': item['c_archive_view'], + 'cArticle_view': item['c_article_view'], + }, + '$push': { + 'data': { + '$each': [item['data_author']], + '$position': 0 + } + } + }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + return item + + +class VideoPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'cView': item['current_view'], + 'cFavorite': item['current_favorite'], + 'cDanmaku': item['current_danmaku'], + 'cCoin': 
item['current_coin'], + 'cShare': item['current_share'], + 'cLike': item['current_like'], + 'cDatetime': item['current_datetime'], + 'author': item['author'], + 'subChannel': item['subChannel'], + 'channel': item['channel'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': datetime.datetime.fromtimestamp( + item['datetime']) + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class VideoPipelineFromKan(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'author': item['author'], + 'mid': item['mid'], + 'pic': item['pic'], + 'title': item['title'], + 'datetime': item['datetime'] + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class BangumiPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['bangumi'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'cover': item['cover'], + # 'isFinish': item['is_finish'], + # 'isStarted': item['is_started'], + 'newest': item['newest_ep_index'], + 'currentPts': item['data']['pts'], + 'currentPlay': item['data']['play'], + # 'squareCover': item['square_cover'], + 'currentWatch': item['data']['watch'], + 'currentReview': item['data']['review'], + 'currentDanmaku': item['data']['danmaku'] + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class DonghuaPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['donghua'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'cover': item['cover'], + # 'isFinish': item['is_finish'], + # 'isStarted': item['is_started'], + 'newest': item['newest_ep_index'], + 'currentPts': item['data']['pts'], + 'currentPlay': item['data']['play'], + # 'squareCover': item['square_cover'], + 'currentWatch': item['data']['watch'], + 'currentReview': item['data']['review'], + 'currentDanmaku': item['data']['danmaku'] + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class SiteInfoPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = 
MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['site_info'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.insert_one({ + 'region_count': item['region_count'], + 'all_count': item['all_count'], + 'web_online': item['web_online'], + 'play_online': item['play_online'], + 'datetime': datetime.datetime.now() + }) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class AuthorPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'focus': True, + 'sex': item['sex'], + 'name': item['name'], + 'face': item['face'], + 'level': item['level'], + 'cFans': item['c_fans'], + 'official': item['official'], + }, + '$push': { + 'data': { + '$each': [item['data']], + '$position': 0 + } + } + }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class OnlinePipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video_online'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + + self.coll.update_one({ + 'title': item['title'] + }, { + '$set': { + 'title': item['title'], + 'author': item['author'], + 'channel': item['channel'], + 'subChannel': item['subChannel'], + }, + '$addToSet': { + 'data': item['data'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class TagPipeLine(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['tag'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + + self.coll.update_one({ + 'tag_id': item['tag_id'] + }, { + '$set': { + 'tag_name': item['tag_name'], + 'ctime': item['ctime'], + }, + '$addToSet': { + 'use': item['use'], + 'atten': item['atten'], + 'datetime': datetime.datetime.now() + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class VideoAddPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + for each_aid in item['aid']: + self.coll.update_one({ + 'aid': each_aid + }, { + '$set': { + 'aid': each_aid, + 'focus': True + }, + }, 
True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class AuthorChannelPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'mid': item['mid'] + }, { + '$set': { + 'channels': item['channels'] + }, + }, True) + self.redis_connection.delete( + "author_detail::{}".format(item['mid'])) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) + + +class BiliMonthlyRankPipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['monthly_rank'] # 获得collection的句柄 + + def process_item(self, item, spider): + try: + self.coll.update_one({ + 'aid': item['aid'] + }, { + '$addToSet': { + 'pts': item['pts'], + 'datetime': datetime.datetime.now() + }, + '$set': { + 'title': item['title'], + 'author': item['author'], + 'aid': item['aid'], + 'mid': item['mid'], + 'channel': item['channel'], + 'currentPts': item['pts'] + } + }, True) + return item + except Exception as error: + # 出现错误时打印错误日志 + logging.error('{}: {}'.format(spider.name, error)) From c9e82bba44b62ae3c17150122f816597071db779 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 26 Feb 2019 19:29:04 +0800 Subject: [PATCH 183/469] feature: danmaku aggregation --- biliob_spider/items.py | 9 ++ biliob_spider/pipelines.py | 30 ++++- .../spiders/danmaku_aggregate_spider.py | 125 ++++++++++++++++++ 3 files changed, 163 insertions(+), 1 deletion(-) create mode 100644 biliob_spider/spiders/danmaku_aggregate_spider.py diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 490dfe6..c6b221d 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -131,3 +131,12 @@ class VideoWatcherItem(scrapy.Item): mid = scrapy.Field() aid = scrapy.Field() channels = scrapy.Field() + + +class DanmakuAggregateItem(scrapy.Item): + aid = scrapy.Field() + p_name = scrapy.Field() + page_number = scrapy.Field() + word_frequency = scrapy.Field() + danmaku_density = scrapy.Field() + duration = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 515a02f..c03195d 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -13,6 +13,35 @@ from db import redis_connect_string +class DanmakuAggregatePipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def process_item(self, item, spider): + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'danmaku_aggregate.{}'.format(item['page_number']): { + 'duration': item['duration'], + 'p_name': item['p_name'], + 'danmaku_density': item['danmaku_density'], + 'word_frequency': item['word_frequency'] + }, + } + }, True) + # 刷新redis数据缓存 
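# Deleting the "video_detail::<aid>" / "author_detail::<mid>" keys right after a Mongo
# write invalidates the cached detail entry, presumably so the web backend rebuilds it
# from the fresh data on the next request. A minimal sketch of that cache-busting step,
# assuming a local Redis at redis://localhost:6379/0 (the real URL comes from
# db.redis_connect_string); invalidate_video_cache is a hypothetical helper, not
# project code:
import redis

def invalidate_video_cache(aid, redis_url='redis://localhost:6379/0'):
    connection = redis.from_url(redis_url)              # same connection style as the pipelines
    connection.delete('video_detail::{}'.format(aid))   # drop the stale cached entry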
+ self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) + + class StrongPipeline(object): def __init__(self): # 链接mongoDB @@ -25,7 +54,6 @@ def __init__(self): def process_item(self, item, spider): try: - self.coll = self.db['video'] self.coll.update_one({ 'aid': int(item['aid']) diff --git a/biliob_spider/spiders/danmaku_aggregate_spider.py b/biliob_spider/spiders/danmaku_aggregate_spider.py new file mode 100644 index 0000000..4c1fdde --- /dev/null +++ b/biliob_spider/spiders/danmaku_aggregate_spider.py @@ -0,0 +1,125 @@ +# coding=utf-8 +import scrapy +from mail import mailer +from scrapy.http import Request +from biliob_spider.items import DanmakuAggregateItem +from datetime import datetime +import time +import json +import logging +from pymongo import MongoClient +from db import settings +from util import sub_channel_2_channel +from scrapy_redis.spiders import RedisSpider +from db import redis_connect_string +import jieba +import jieba.analyse +import re + +jieba.load_userdict('./biliob_analyzer/dict.txt') + + +def q_to_b(q_str): + """全角转半角""" + b_str = "" + for uchar in q_str: + inside_code = ord(uchar) + if inside_code == 12288: # 全角空格直接转换 + inside_code = 32 + elif 65374 >= inside_code >= 65281: # 全角字符(除空格)根据关系转化 + inside_code -= 65248 + b_str += chr(inside_code) + return b_str + + +class DanmakuAggregateSpider(RedisSpider): + name = "DanmakuAggregate" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.DanmakuAggregatePipeline': 300, + }, + 'DOWNLOAD_DELAY': 1 + } + CID_API = "https://api.bilibili.com/x/web-interface/view?aid={aid}" + DANMAKU_API = "https://api.bilibili.com/x/v1/dm/list.so?oid={oid}" + PATTERN = r"[0-9a-zA-Z\u4e00-\u9fa5\u30a1-\u30f6\u3041-\u3093\uFF00-\uFFFF\u4e00-\u9fa5]+" + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def parse(self, response): + try: + j = json.loads(response.body) + if j['code'] == -403: + aid = response.url[50:] + print('https://api.bilibili.com/x/article/archives?ids={}'.format(aid)) + yield Request('https://api.bilibili.com/x/article/archives?ids={}'.format(aid), + callback=self.getCidPlanB, meta={'aid': aid}) + else: + aid = j['data']['aid'] + pages = j['data']['pages'] + for each_page in pages: + duration = each_page['duration'] + p_name = each_page['part'] + page_number = each_page['page'] + cid = each_page['cid'] + yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, + meta={'duration': duration, + 'p_name': p_name, + 'page_number': page_number, + 'aid': aid}) + except Exception as error: + # 出现错误时打印错误日志 + if response['code'] == -404: + return + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}".format(response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) + + def getCidPlanB(self, response): + aid = response.meta['aid'] + cid = json.loads(response.body)['data'][aid]['cid'] + duration = json.loads(response.body)['data'][aid]['duration'] + yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, meta={'duration': duration, 'p_name': '', 'page_number': 1, 'aid': int(aid)}) + + def parseDanmaku(self, response): + duration = response.meta['duration'] + danmaku_text = q_to_b( + " 
".join(response.xpath("d/text()").extract()).upper()) + # 自实现太low,使用自带关键字 + word_frequency = dict(jieba.analyse.extract_tags(danmaku_text, topK=50, withWeight=True, allowPOS=( + 'ns', 'n', 'vn', 'v', 'nr', 'un', 'x', 'j', 'i', 'l', 'nz', 'eng', 'o'))) + # 计算弹幕密度 + danmaku_attr = list(map(lambda x: x.split( + ","), response.xpath("d/@p").extract())) + tick = duration / 50 + danmaku_density = {} + danmaku_density = [0 for i in range(50)] + for each_attr in danmaku_attr: + t = float(each_attr[0]) + if t > duration: + continue + index = int(t // tick) + danmaku_density[index] += 1 + item = DanmakuAggregateItem() + + item['aid'] = response.meta['aid'] + item['duration'] = duration + item['word_frequency'] = word_frequency + item['p_name'] = response.meta['p_name'] + item['danmaku_density'] = danmaku_density + item['page_number'] = response.meta['page_number'] + yield item From afaeea32af4c8e17e4879005df3290dd3a84ee33 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 26 Feb 2019 19:29:04 +0800 Subject: [PATCH 184/469] feature: danmaku aggregation --- biliob_spider/items.py | 9 ++ biliob_spider/pipelines.py | 30 ++++- .../spiders/danmaku_aggregate_spider.py | 125 ++++++++++++++++++ 3 files changed, 163 insertions(+), 1 deletion(-) create mode 100644 biliob_spider/spiders/danmaku_aggregate_spider.py diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 490dfe6..c6b221d 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -131,3 +131,12 @@ class VideoWatcherItem(scrapy.Item): mid = scrapy.Field() aid = scrapy.Field() channels = scrapy.Field() + + +class DanmakuAggregateItem(scrapy.Item): + aid = scrapy.Field() + p_name = scrapy.Field() + page_number = scrapy.Field() + word_frequency = scrapy.Field() + danmaku_density = scrapy.Field() + duration = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 515a02f..c03195d 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -13,6 +13,35 @@ from db import redis_connect_string +class DanmakuAggregatePipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def process_item(self, item, spider): + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'danmaku_aggregate.{}'.format(item['page_number']): { + 'duration': item['duration'], + 'p_name': item['p_name'], + 'danmaku_density': item['danmaku_density'], + 'word_frequency': item['word_frequency'] + }, + } + }, True) + # 刷新redis数据缓存 + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) + + class StrongPipeline(object): def __init__(self): # 链接mongoDB @@ -25,7 +54,6 @@ def __init__(self): def process_item(self, item, spider): try: - self.coll = self.db['video'] self.coll.update_one({ 'aid': int(item['aid']) diff --git a/biliob_spider/spiders/danmaku_aggregate_spider.py b/biliob_spider/spiders/danmaku_aggregate_spider.py new file mode 100644 index 0000000..4c1fdde --- /dev/null +++ b/biliob_spider/spiders/danmaku_aggregate_spider.py @@ -0,0 +1,125 @@ +# coding=utf-8 +import scrapy +from mail import mailer +from scrapy.http import Request +from biliob_spider.items import DanmakuAggregateItem +from datetime import datetime +import time +import json +import logging +from pymongo import MongoClient 
+from db import settings +from util import sub_channel_2_channel +from scrapy_redis.spiders import RedisSpider +from db import redis_connect_string +import jieba +import jieba.analyse +import re + +jieba.load_userdict('./biliob_analyzer/dict.txt') + + +def q_to_b(q_str): + """全角转半角""" + b_str = "" + for uchar in q_str: + inside_code = ord(uchar) + if inside_code == 12288: # 全角空格直接转换 + inside_code = 32 + elif 65374 >= inside_code >= 65281: # 全角字符(除空格)根据关系转化 + inside_code -= 65248 + b_str += chr(inside_code) + return b_str + + +class DanmakuAggregateSpider(RedisSpider): + name = "DanmakuAggregate" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.DanmakuAggregatePipeline': 300, + }, + 'DOWNLOAD_DELAY': 1 + } + CID_API = "https://api.bilibili.com/x/web-interface/view?aid={aid}" + DANMAKU_API = "https://api.bilibili.com/x/v1/dm/list.so?oid={oid}" + PATTERN = r"[0-9a-zA-Z\u4e00-\u9fa5\u30a1-\u30f6\u3041-\u3093\uFF00-\uFFFF\u4e00-\u9fa5]+" + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def parse(self, response): + try: + j = json.loads(response.body) + if j['code'] == -403: + aid = response.url[50:] + print('https://api.bilibili.com/x/article/archives?ids={}'.format(aid)) + yield Request('https://api.bilibili.com/x/article/archives?ids={}'.format(aid), + callback=self.getCidPlanB, meta={'aid': aid}) + else: + aid = j['data']['aid'] + pages = j['data']['pages'] + for each_page in pages: + duration = each_page['duration'] + p_name = each_page['part'] + page_number = each_page['page'] + cid = each_page['cid'] + yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, + meta={'duration': duration, + 'p_name': p_name, + 'page_number': page_number, + 'aid': aid}) + except Exception as error: + # 出现错误时打印错误日志 + if response['code'] == -404: + return + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}".format(response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) + + def getCidPlanB(self, response): + aid = response.meta['aid'] + cid = json.loads(response.body)['data'][aid]['cid'] + duration = json.loads(response.body)['data'][aid]['duration'] + yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, meta={'duration': duration, 'p_name': '', 'page_number': 1, 'aid': int(aid)}) + + def parseDanmaku(self, response): + duration = response.meta['duration'] + danmaku_text = q_to_b( + " ".join(response.xpath("d/text()").extract()).upper()) + # 自实现太low,使用自带关键字 + word_frequency = dict(jieba.analyse.extract_tags(danmaku_text, topK=50, withWeight=True, allowPOS=( + 'ns', 'n', 'vn', 'v', 'nr', 'un', 'x', 'j', 'i', 'l', 'nz', 'eng', 'o'))) + # 计算弹幕密度 + danmaku_attr = list(map(lambda x: x.split( + ","), response.xpath("d/@p").extract())) + tick = duration / 50 + danmaku_density = {} + danmaku_density = [0 for i in range(50)] + for each_attr in danmaku_attr: + t = float(each_attr[0]) + if t > duration: + continue + index = int(t // tick) + danmaku_density[index] += 1 + item = DanmakuAggregateItem() + + item['aid'] = response.meta['aid'] + item['duration'] = duration + item['word_frequency'] = word_frequency + item['p_name'] = response.meta['p_name'] + item['danmaku_density'] 
= danmaku_density + item['page_number'] = response.meta['page_number'] + yield item From 860d5317f5d6aba95357974ac2091329145b6ab5 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 26 Feb 2019 19:29:04 +0800 Subject: [PATCH 185/469] feature: danmaku aggregation --- biliob_spider/items.py | 9 ++ biliob_spider/pipelines.py | 30 ++++- .../spiders/danmaku_aggregate_spider.py | 125 ++++++++++++++++++ 3 files changed, 163 insertions(+), 1 deletion(-) create mode 100644 biliob_spider/spiders/danmaku_aggregate_spider.py diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 490dfe6..c6b221d 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -131,3 +131,12 @@ class VideoWatcherItem(scrapy.Item): mid = scrapy.Field() aid = scrapy.Field() channels = scrapy.Field() + + +class DanmakuAggregateItem(scrapy.Item): + aid = scrapy.Field() + p_name = scrapy.Field() + page_number = scrapy.Field() + word_frequency = scrapy.Field() + danmaku_density = scrapy.Field() + duration = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index 515a02f..c03195d 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -13,6 +13,35 @@ from db import redis_connect_string +class DanmakuAggregatePipeline(object): + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def process_item(self, item, spider): + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'danmaku_aggregate.{}'.format(item['page_number']): { + 'duration': item['duration'], + 'p_name': item['p_name'], + 'danmaku_density': item['danmaku_density'], + 'word_frequency': item['word_frequency'] + }, + } + }, True) + # 刷新redis数据缓存 + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) + + class StrongPipeline(object): def __init__(self): # 链接mongoDB @@ -25,7 +54,6 @@ def __init__(self): def process_item(self, item, spider): try: - self.coll = self.db['video'] self.coll.update_one({ 'aid': int(item['aid']) diff --git a/biliob_spider/spiders/danmaku_aggregate_spider.py b/biliob_spider/spiders/danmaku_aggregate_spider.py new file mode 100644 index 0000000..4c1fdde --- /dev/null +++ b/biliob_spider/spiders/danmaku_aggregate_spider.py @@ -0,0 +1,125 @@ +# coding=utf-8 +import scrapy +from mail import mailer +from scrapy.http import Request +from biliob_spider.items import DanmakuAggregateItem +from datetime import datetime +import time +import json +import logging +from pymongo import MongoClient +from db import settings +from util import sub_channel_2_channel +from scrapy_redis.spiders import RedisSpider +from db import redis_connect_string +import jieba +import jieba.analyse +import re + +jieba.load_userdict('./biliob_analyzer/dict.txt') + + +def q_to_b(q_str): + """全角转半角""" + b_str = "" + for uchar in q_str: + inside_code = ord(uchar) + if inside_code == 12288: # 全角空格直接转换 + inside_code = 32 + elif 65374 >= inside_code >= 65281: # 全角字符(除空格)根据关系转化 + inside_code -= 65248 + b_str += chr(inside_code) + return b_str + + +class DanmakuAggregateSpider(RedisSpider): + name = "DanmakuAggregate" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.DanmakuAggregatePipeline': 300, + }, + 'DOWNLOAD_DELAY': 1 + } + 
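# parseDanmaku() below reduces the raw danmaku XML to two aggregates. word_frequency is
# not a plain term count: the joined danmaku text is upper-cased, normalised to
# half-width via the q_to_b() helper above, and fed to jieba's TF-IDF keyword extractor.
# danmaku_density is a 50-bucket histogram of danmaku offsets (the first comma-separated
# field of each <d p="..."> attribute, in seconds), with bucket width duration / 50.
# A stand-alone sketch of both steps, assuming jieba is installed; the custom dictionary
# loaded from ./biliob_analyzer/dict.txt and the spider's full POS whitelist are omitted,
# and both helpers below are hypothetical, not project code:
import jieba.analyse

def sketch_danmaku_keywords(danmaku_lines, top_k=50):
    text = ' '.join(danmaku_lines).upper()
    pairs = jieba.analyse.extract_tags(
        text, topK=top_k, withWeight=True,              # (word, tf-idf weight) pairs
        allowPOS=('n', 'vn', 'v', 'nr', 'nz', 'eng'))   # shortened, illustrative whitelist
    return dict(pairs)                                  # same shape as item['word_frequency']

def sketch_danmaku_density(offsets, duration, buckets=50):
    tick = duration / buckets
    density = [0] * buckets
    for t in offsets:
        if t > duration:                                # ignore danmaku stamped past the end
            continue
        density[min(int(t // tick), buckets - 1)] += 1  # clamp the final edge
    return density

# e.g. sketch_danmaku_density([1.0, 2.5, 119.0], duration=120.0) fills buckets 0, 1 and 49.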
CID_API = "https://api.bilibili.com/x/web-interface/view?aid={aid}" + DANMAKU_API = "https://api.bilibili.com/x/v1/dm/list.so?oid={oid}" + PATTERN = r"[0-9a-zA-Z\u4e00-\u9fa5\u30a1-\u30f6\u3041-\u3093\uFF00-\uFFFF\u4e00-\u9fa5]+" + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def parse(self, response): + try: + j = json.loads(response.body) + if j['code'] == -403: + aid = response.url[50:] + print('https://api.bilibili.com/x/article/archives?ids={}'.format(aid)) + yield Request('https://api.bilibili.com/x/article/archives?ids={}'.format(aid), + callback=self.getCidPlanB, meta={'aid': aid}) + else: + aid = j['data']['aid'] + pages = j['data']['pages'] + for each_page in pages: + duration = each_page['duration'] + p_name = each_page['part'] + page_number = each_page['page'] + cid = each_page['cid'] + yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, + meta={'duration': duration, + 'p_name': p_name, + 'page_number': page_number, + 'aid': aid}) + except Exception as error: + # 出现错误时打印错误日志 + if response['code'] == -404: + return + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}".format(response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) + + def getCidPlanB(self, response): + aid = response.meta['aid'] + cid = json.loads(response.body)['data'][aid]['cid'] + duration = json.loads(response.body)['data'][aid]['duration'] + yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, meta={'duration': duration, 'p_name': '', 'page_number': 1, 'aid': int(aid)}) + + def parseDanmaku(self, response): + duration = response.meta['duration'] + danmaku_text = q_to_b( + " ".join(response.xpath("d/text()").extract()).upper()) + # 自实现太low,使用自带关键字 + word_frequency = dict(jieba.analyse.extract_tags(danmaku_text, topK=50, withWeight=True, allowPOS=( + 'ns', 'n', 'vn', 'v', 'nr', 'un', 'x', 'j', 'i', 'l', 'nz', 'eng', 'o'))) + # 计算弹幕密度 + danmaku_attr = list(map(lambda x: x.split( + ","), response.xpath("d/@p").extract())) + tick = duration / 50 + danmaku_density = {} + danmaku_density = [0 for i in range(50)] + for each_attr in danmaku_attr: + t = float(each_attr[0]) + if t > duration: + continue + index = int(t // tick) + danmaku_density[index] += 1 + item = DanmakuAggregateItem() + + item['aid'] = response.meta['aid'] + item['duration'] = duration + item['word_frequency'] = word_frequency + item['p_name'] = response.meta['p_name'] + item['danmaku_density'] = danmaku_density + item['page_number'] = response.meta['page_number'] + yield item From 84fa9c30361648b8349f26b044d3e5e75644c412 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 26 Feb 2019 20:04:25 +0800 Subject: [PATCH 186/469] feature: put update time of danmaku aggregate --- biliob_spider/pipelines.py | 1 + 1 file changed, 1 insertion(+) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index c03195d..d2bd012 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -35,6 +35,7 @@ def process_item(self, item, spider): 'danmaku_density': item['danmaku_density'], 'word_frequency': item['word_frequency'] }, + 'danmaku_aggregate.updatetime':datetime.datetime.now() } }, True) # 刷新redis数据缓存 From de1bcdb3409ea9ab3b85393afda21a1e659234eb Mon Sep 
17 00:00:00 2001 From: Jannchie Date: Tue, 26 Feb 2019 20:04:25 +0800 Subject: [PATCH 187/469] feature: put update time of danmaku aggregate --- biliob_spider/pipelines.py | 1 + 1 file changed, 1 insertion(+) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index c03195d..d2bd012 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -35,6 +35,7 @@ def process_item(self, item, spider): 'danmaku_density': item['danmaku_density'], 'word_frequency': item['word_frequency'] }, + 'danmaku_aggregate.updatetime':datetime.datetime.now() } }, True) # 刷新redis数据缓存 From 9cda87c2edc2065f8fb53985dc224798c2ad9687 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 26 Feb 2019 20:04:25 +0800 Subject: [PATCH 188/469] feature: put update time of danmaku aggregate --- biliob_spider/pipelines.py | 1 + 1 file changed, 1 insertion(+) diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index c03195d..d2bd012 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -35,6 +35,7 @@ def process_item(self, item, spider): 'danmaku_density': item['danmaku_density'], 'word_frequency': item['word_frequency'] }, + 'danmaku_aggregate.updatetime':datetime.datetime.now() } }, True) # 刷新redis数据缓存 From 5d2f4f321f2fb76d44612a23bbbc6338140d8186 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 26 Feb 2019 21:57:24 +0800 Subject: [PATCH 189/469] update analyzer --- run_analyzer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/run_analyzer.py b/run_analyzer.py index 4664268..9c920e0 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -4,9 +4,9 @@ import biliob_analyzer.author_fans_watcher import biliob_analyzer.author_rank # import biliob_analyzer.video_rank -from biliob_analyzer.add_keyword import AddKeyword -AddKeyword().add_all_author() -AddKeyword().add_all_video() +# from biliob_analyzer.add_keyword import AddKeyword +# AddKeyword().add_all_author() +# AddKeyword().add_all_video() author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From a4f7b057d222a707c20f21789efee2ad47d5a6fd Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 26 Feb 2019 21:57:24 +0800 Subject: [PATCH 190/469] update analyzer --- run_analyzer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/run_analyzer.py b/run_analyzer.py index 4664268..9c920e0 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -4,9 +4,9 @@ import biliob_analyzer.author_fans_watcher import biliob_analyzer.author_rank # import biliob_analyzer.video_rank -from biliob_analyzer.add_keyword import AddKeyword -AddKeyword().add_all_author() -AddKeyword().add_all_video() +# from biliob_analyzer.add_keyword import AddKeyword +# AddKeyword().add_all_author() +# AddKeyword().add_all_video() author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 167f19e16a745a5f850d1e1303fc4cf7320fdb12 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 26 Feb 2019 21:57:24 +0800 Subject: [PATCH 191/469] update analyzer --- run_analyzer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/run_analyzer.py b/run_analyzer.py index 4664268..9c920e0 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -4,9 +4,9 @@ import biliob_analyzer.author_fans_watcher import biliob_analyzer.author_rank # import biliob_analyzer.video_rank -from biliob_analyzer.add_keyword import AddKeyword -AddKeyword().add_all_author() -AddKeyword().add_all_video() +# from biliob_analyzer.add_keyword import AddKeyword +# AddKeyword().add_all_author() +# 
AddKeyword().add_all_video() author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From f487453788dfcde444c836932a2e4be4c4d822f8 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 26 Feb 2019 22:38:54 +0800 Subject: [PATCH 192/469] fix --- .../spiders/danmaku_aggregate_spider.py | 29 ++++++++++--------- run.py | 4 +-- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/biliob_spider/spiders/danmaku_aggregate_spider.py b/biliob_spider/spiders/danmaku_aggregate_spider.py index 4c1fdde..2e04fce 100644 --- a/biliob_spider/spiders/danmaku_aggregate_spider.py +++ b/biliob_spider/spiders/danmaku_aggregate_spider.py @@ -16,23 +16,13 @@ import jieba.analyse import re -jieba.load_userdict('./biliob_analyzer/dict.txt') -def q_to_b(q_str): - """全角转半角""" - b_str = "" - for uchar in q_str: - inside_code = ord(uchar) - if inside_code == 12288: # 全角空格直接转换 - inside_code = 32 - elif 65374 >= inside_code >= 65281: # 全角字符(除空格)根据关系转化 - inside_code -= 65248 - b_str += chr(inside_code) - return b_str + class DanmakuAggregateSpider(RedisSpider): + name = "DanmakuAggregate" allowed_domains = ["bilibili.com"] start_urls = [] @@ -48,6 +38,7 @@ class DanmakuAggregateSpider(RedisSpider): def __init__(self): # 链接mongoDB + jieba.load_userdict('./biliob_analyzer/dict.txt') self.client = MongoClient(settings['MINGO_HOST'], 27017) # 数据库登录需要帐号密码 self.client.admin.authenticate(settings['MINGO_USER'], @@ -55,6 +46,18 @@ def __init__(self): self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['video'] # 获得collection的句柄 + def q_to_b(self,q_str): + """全角转半角""" + b_str = "" + for uchar in q_str: + inside_code = ord(uchar) + if inside_code == 12288: # 全角空格直接转换 + inside_code = 32 + elif 65374 >= inside_code >= 65281: # 全角字符(除空格)根据关系转化 + inside_code -= 65248 + b_str += chr(inside_code) + return b_str + def parse(self, response): try: j = json.loads(response.body) @@ -97,7 +100,7 @@ def getCidPlanB(self, response): def parseDanmaku(self, response): duration = response.meta['duration'] - danmaku_text = q_to_b( + danmaku_text = self.q_to_b( " ".join(response.xpath("d/text()").extract()).upper()) # 自实现太low,使用自带关键字 word_frequency = dict(jieba.analyse.extract_tags(danmaku_text, topK=50, withWeight=True, allowPOS=( diff --git a/run.py b/run.py index 8377714..0897e3b 100644 --- a/run.py +++ b/run.py @@ -57,8 +57,8 @@ def data_analyze(): Popen(['python', 'run_analyzer.py']) -def weekly_analyze(): - Popen(['python', 'run_weekly_analyzer.py']) +# def weekly_analyze(): +# Popen(['python', 'run_weekly_analyzer.py']) def bili_monthly_rank(): From b26e2439435c6eca296060c1423cec414ccf1378 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 26 Feb 2019 22:38:54 +0800 Subject: [PATCH 193/469] fix --- .../spiders/danmaku_aggregate_spider.py | 29 ++++++++++--------- run.py | 4 +-- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/biliob_spider/spiders/danmaku_aggregate_spider.py b/biliob_spider/spiders/danmaku_aggregate_spider.py index 4c1fdde..2e04fce 100644 --- a/biliob_spider/spiders/danmaku_aggregate_spider.py +++ b/biliob_spider/spiders/danmaku_aggregate_spider.py @@ -16,23 +16,13 @@ import jieba.analyse import re -jieba.load_userdict('./biliob_analyzer/dict.txt') -def q_to_b(q_str): - """全角转半角""" - b_str = "" - for uchar in q_str: - inside_code = ord(uchar) - if inside_code == 12288: # 全角空格直接转换 - inside_code = 32 - elif 65374 >= inside_code >= 65281: # 全角字符(除空格)根据关系转化 - inside_code -= 65248 - b_str += chr(inside_code) - return b_str + class DanmakuAggregateSpider(RedisSpider): + name = 
"DanmakuAggregate" allowed_domains = ["bilibili.com"] start_urls = [] @@ -48,6 +38,7 @@ class DanmakuAggregateSpider(RedisSpider): def __init__(self): # 链接mongoDB + jieba.load_userdict('./biliob_analyzer/dict.txt') self.client = MongoClient(settings['MINGO_HOST'], 27017) # 数据库登录需要帐号密码 self.client.admin.authenticate(settings['MINGO_USER'], @@ -55,6 +46,18 @@ def __init__(self): self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['video'] # 获得collection的句柄 + def q_to_b(self,q_str): + """全角转半角""" + b_str = "" + for uchar in q_str: + inside_code = ord(uchar) + if inside_code == 12288: # 全角空格直接转换 + inside_code = 32 + elif 65374 >= inside_code >= 65281: # 全角字符(除空格)根据关系转化 + inside_code -= 65248 + b_str += chr(inside_code) + return b_str + def parse(self, response): try: j = json.loads(response.body) @@ -97,7 +100,7 @@ def getCidPlanB(self, response): def parseDanmaku(self, response): duration = response.meta['duration'] - danmaku_text = q_to_b( + danmaku_text = self.q_to_b( " ".join(response.xpath("d/text()").extract()).upper()) # 自实现太low,使用自带关键字 word_frequency = dict(jieba.analyse.extract_tags(danmaku_text, topK=50, withWeight=True, allowPOS=( diff --git a/run.py b/run.py index 8377714..0897e3b 100644 --- a/run.py +++ b/run.py @@ -57,8 +57,8 @@ def data_analyze(): Popen(['python', 'run_analyzer.py']) -def weekly_analyze(): - Popen(['python', 'run_weekly_analyzer.py']) +# def weekly_analyze(): +# Popen(['python', 'run_weekly_analyzer.py']) def bili_monthly_rank(): From 91d8d689329eef985998749bef26dba67e901dec Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 26 Feb 2019 22:38:54 +0800 Subject: [PATCH 194/469] fix --- .../spiders/danmaku_aggregate_spider.py | 29 ++++++++++--------- run.py | 4 +-- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/biliob_spider/spiders/danmaku_aggregate_spider.py b/biliob_spider/spiders/danmaku_aggregate_spider.py index 4c1fdde..2e04fce 100644 --- a/biliob_spider/spiders/danmaku_aggregate_spider.py +++ b/biliob_spider/spiders/danmaku_aggregate_spider.py @@ -16,23 +16,13 @@ import jieba.analyse import re -jieba.load_userdict('./biliob_analyzer/dict.txt') -def q_to_b(q_str): - """全角转半角""" - b_str = "" - for uchar in q_str: - inside_code = ord(uchar) - if inside_code == 12288: # 全角空格直接转换 - inside_code = 32 - elif 65374 >= inside_code >= 65281: # 全角字符(除空格)根据关系转化 - inside_code -= 65248 - b_str += chr(inside_code) - return b_str + class DanmakuAggregateSpider(RedisSpider): + name = "DanmakuAggregate" allowed_domains = ["bilibili.com"] start_urls = [] @@ -48,6 +38,7 @@ class DanmakuAggregateSpider(RedisSpider): def __init__(self): # 链接mongoDB + jieba.load_userdict('./biliob_analyzer/dict.txt') self.client = MongoClient(settings['MINGO_HOST'], 27017) # 数据库登录需要帐号密码 self.client.admin.authenticate(settings['MINGO_USER'], @@ -55,6 +46,18 @@ def __init__(self): self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['video'] # 获得collection的句柄 + def q_to_b(self,q_str): + """全角转半角""" + b_str = "" + for uchar in q_str: + inside_code = ord(uchar) + if inside_code == 12288: # 全角空格直接转换 + inside_code = 32 + elif 65374 >= inside_code >= 65281: # 全角字符(除空格)根据关系转化 + inside_code -= 65248 + b_str += chr(inside_code) + return b_str + def parse(self, response): try: j = json.loads(response.body) @@ -97,7 +100,7 @@ def getCidPlanB(self, response): def parseDanmaku(self, response): duration = response.meta['duration'] - danmaku_text = q_to_b( + danmaku_text = self.q_to_b( " ".join(response.xpath("d/text()").extract()).upper()) # 自实现太low,使用自带关键字 word_frequency = 
dict(jieba.analyse.extract_tags(danmaku_text, topK=50, withWeight=True, allowPOS=( diff --git a/run.py b/run.py index 8377714..0897e3b 100644 --- a/run.py +++ b/run.py @@ -57,8 +57,8 @@ def data_analyze(): Popen(['python', 'run_analyzer.py']) -def weekly_analyze(): - Popen(['python', 'run_weekly_analyzer.py']) +# def weekly_analyze(): +# Popen(['python', 'run_weekly_analyzer.py']) def bili_monthly_rank(): From 11d9252de7841324c881bdf88437447bc1d82e82 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 13:08:40 +0800 Subject: [PATCH 195/469] update settings --- biliob_spider/settings.py | 4 +- .../spiders/author_update_with_redis.1.py | 101 ++++++++++++++++++ 2 files changed, 103 insertions(+), 2 deletions(-) create mode 100644 biliob_spider/spiders/author_update_with_redis.1.py diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index b394f85..4934e6b 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -43,14 +43,14 @@ ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) -# CONCURRENT_REQUESTS = 32 +CONCURRENT_REQUESTS = 8 # Configure a delay for requests for the same website (default: 0) # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs # DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: -# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +CONCURRENT_REQUESTS_PER_DOMAIN = 8 # CONCURRENT_REQUESTS_PER_IP = 64 # Disable cookies (enabled by default) diff --git a/biliob_spider/spiders/author_update_with_redis.1.py b/biliob_spider/spiders/author_update_with_redis.1.py new file mode 100644 index 0000000..5b8de55 --- /dev/null +++ b/biliob_spider/spiders/author_update_with_redis.1.py @@ -0,0 +1,101 @@ +# coding=utf-8 +import scrapy +from mail import mailer +from scrapy.http import Request +from biliob_spider.items import AuthorItem +import time +import json +import logging +from pymongo import MongoClient +import datetime +from db import settings +from db import redis_connect_string +from scrapy_redis.spiders import RedisSpider +import redis + + +class BiliobSpider(RedisSpider): + name = "BiliobSpider" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.BiliobPipeline': 300 + }, + 'DOWNLOAD_DELAY': 10 + } + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def parse(self, response): + try: + j = json.loads(response.body) + name = j['data']['card']['name'] + mid = j['data']['card']['mid'] + + # 刷新redis数据缓存 + self.redis_connection.delete("author_detail::{}".format(mid)) + + sex = j['data']['card']['sex'] + face = j['data']['card']['face'] + fans = j['data']['card']['fans'] + attention = j['data']['card']['attention'] + level = j['data']['card']['level_info']['current_level'] + official = j['data']['card']['Official']['title'] + archive = j['data']['archive_count'] + article = j['data']['article_count'] + face = j['data']['card']['face'] + item = AuthorItem() + item['mid'] = int(mid) + item['name'] = name + item['face'] = face + item['official'] = official + item['sex'] = sex + item['level'] = int(level) + item['data'] = { + 'fans': int(fans), + 'attention': 
int(attention), + 'archive': int(archive), + 'article': int(article), + 'datetime': datetime.datetime.now() + } + item['c_fans'] = int(fans) + item['c_attention'] = int(attention) + item['c_archive'] = int(archive) + item['c_article'] = int(article) + yield Request( + "https://api.bilibili.com/x/space/upstat?mid={mid}".format( + mid=str(mid)), + meta={'item': item}, + method='GET', + callback=self.parse_view) + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) + + def parse_view(self, response): + j = json.loads(response.body) + archive_view = j['data']['archive']['view'] + article_view = j['data']['article']['view'] + item = response.meta['item'] + item['data']['archiveView'] = archive_view + item['data']['articleView'] = article_view + item['c_archive_view'] = int(archive_view) + item['c_article_view'] = int(article_view) + + yield item From 670c1f6563b48969ac9ed42a7cd63857a574698f Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 13:08:40 +0800 Subject: [PATCH 196/469] update settings --- biliob_spider/settings.py | 4 +- .../spiders/author_update_with_redis.1.py | 101 ++++++++++++++++++ 2 files changed, 103 insertions(+), 2 deletions(-) create mode 100644 biliob_spider/spiders/author_update_with_redis.1.py diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index b394f85..4934e6b 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -43,14 +43,14 @@ ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) -# CONCURRENT_REQUESTS = 32 +CONCURRENT_REQUESTS = 8 # Configure a delay for requests for the same website (default: 0) # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs # DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: -# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +CONCURRENT_REQUESTS_PER_DOMAIN = 8 # CONCURRENT_REQUESTS_PER_IP = 64 # Disable cookies (enabled by default) diff --git a/biliob_spider/spiders/author_update_with_redis.1.py b/biliob_spider/spiders/author_update_with_redis.1.py new file mode 100644 index 0000000..5b8de55 --- /dev/null +++ b/biliob_spider/spiders/author_update_with_redis.1.py @@ -0,0 +1,101 @@ +# coding=utf-8 +import scrapy +from mail import mailer +from scrapy.http import Request +from biliob_spider.items import AuthorItem +import time +import json +import logging +from pymongo import MongoClient +import datetime +from db import settings +from db import redis_connect_string +from scrapy_redis.spiders import RedisSpider +import redis + + +class BiliobSpider(RedisSpider): + name = "BiliobSpider" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.BiliobPipeline': 300 + }, + 'DOWNLOAD_DELAY': 10 + } + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def parse(self, response): + try: + j = json.loads(response.body) + name = j['data']['card']['name'] + mid = j['data']['card']['mid'] + + # 刷新redis数据缓存 + 
self.redis_connection.delete("author_detail::{}".format(mid)) + + sex = j['data']['card']['sex'] + face = j['data']['card']['face'] + fans = j['data']['card']['fans'] + attention = j['data']['card']['attention'] + level = j['data']['card']['level_info']['current_level'] + official = j['data']['card']['Official']['title'] + archive = j['data']['archive_count'] + article = j['data']['article_count'] + face = j['data']['card']['face'] + item = AuthorItem() + item['mid'] = int(mid) + item['name'] = name + item['face'] = face + item['official'] = official + item['sex'] = sex + item['level'] = int(level) + item['data'] = { + 'fans': int(fans), + 'attention': int(attention), + 'archive': int(archive), + 'article': int(article), + 'datetime': datetime.datetime.now() + } + item['c_fans'] = int(fans) + item['c_attention'] = int(attention) + item['c_archive'] = int(archive) + item['c_article'] = int(article) + yield Request( + "https://api.bilibili.com/x/space/upstat?mid={mid}".format( + mid=str(mid)), + meta={'item': item}, + method='GET', + callback=self.parse_view) + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) + + def parse_view(self, response): + j = json.loads(response.body) + archive_view = j['data']['archive']['view'] + article_view = j['data']['article']['view'] + item = response.meta['item'] + item['data']['archiveView'] = archive_view + item['data']['articleView'] = article_view + item['c_archive_view'] = int(archive_view) + item['c_article_view'] = int(article_view) + + yield item From a7f2e7e3712c075975ad6b04f2a855ec5fdce35c Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 13:08:40 +0800 Subject: [PATCH 197/469] update settings --- biliob_spider/settings.py | 4 +- .../spiders/author_update_with_redis.1.py | 101 ++++++++++++++++++ 2 files changed, 103 insertions(+), 2 deletions(-) create mode 100644 biliob_spider/spiders/author_update_with_redis.1.py diff --git a/biliob_spider/settings.py b/biliob_spider/settings.py index b394f85..4934e6b 100644 --- a/biliob_spider/settings.py +++ b/biliob_spider/settings.py @@ -43,14 +43,14 @@ ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) -# CONCURRENT_REQUESTS = 32 +CONCURRENT_REQUESTS = 8 # Configure a delay for requests for the same website (default: 0) # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs # DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: -# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +CONCURRENT_REQUESTS_PER_DOMAIN = 8 # CONCURRENT_REQUESTS_PER_IP = 64 # Disable cookies (enabled by default) diff --git a/biliob_spider/spiders/author_update_with_redis.1.py b/biliob_spider/spiders/author_update_with_redis.1.py new file mode 100644 index 0000000..5b8de55 --- /dev/null +++ b/biliob_spider/spiders/author_update_with_redis.1.py @@ -0,0 +1,101 @@ +# coding=utf-8 +import scrapy +from mail import mailer +from scrapy.http import Request +from biliob_spider.items import AuthorItem +import time +import json +import logging +from pymongo import MongoClient +import datetime +from db import settings +from db import redis_connect_string +from scrapy_redis.spiders import RedisSpider +import redis + + +class BiliobSpider(RedisSpider): + name = "BiliobSpider" + allowed_domains = ["bilibili.com"] + 
start_urls = [] + custom_settings = { + 'ITEM_PIPELINES': { + 'biliob_spider.pipelines.BiliobPipeline': 300 + }, + 'DOWNLOAD_DELAY': 10 + } + + def __init__(self): + # 链接mongoDB + self.client = MongoClient(settings['MINGO_HOST'], 27017) + # 数据库登录需要帐号密码 + self.client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['author'] # 获得collection的句柄 + self.redis_connection = redis.from_url(redis_connect_string) + + def parse(self, response): + try: + j = json.loads(response.body) + name = j['data']['card']['name'] + mid = j['data']['card']['mid'] + + # 刷新redis数据缓存 + self.redis_connection.delete("author_detail::{}".format(mid)) + + sex = j['data']['card']['sex'] + face = j['data']['card']['face'] + fans = j['data']['card']['fans'] + attention = j['data']['card']['attention'] + level = j['data']['card']['level_info']['current_level'] + official = j['data']['card']['Official']['title'] + archive = j['data']['archive_count'] + article = j['data']['article_count'] + face = j['data']['card']['face'] + item = AuthorItem() + item['mid'] = int(mid) + item['name'] = name + item['face'] = face + item['official'] = official + item['sex'] = sex + item['level'] = int(level) + item['data'] = { + 'fans': int(fans), + 'attention': int(attention), + 'archive': int(archive), + 'article': int(article), + 'datetime': datetime.datetime.now() + } + item['c_fans'] = int(fans) + item['c_attention'] = int(attention) + item['c_archive'] = int(archive) + item['c_article'] = int(article) + yield Request( + "https://api.bilibili.com/x/space/upstat?mid={mid}".format( + mid=str(mid)), + meta={'item': item}, + method='GET', + callback=self.parse_view) + except Exception as error: + # 出现错误时打印错误日志 + mailer.send( + to=["604264970@qq.com"], + subject="BiliobSpiderError", + body="{}\n{}\n{}".format(item, response.url, error), + ) + logging.error("视频爬虫在解析时发生错误") + logging.error(response.url) + logging.error(error) + + def parse_view(self, response): + j = json.loads(response.body) + archive_view = j['data']['archive']['view'] + article_view = j['data']['article']['view'] + item = response.meta['item'] + item['data']['archiveView'] = archive_view + item['data']['articleView'] = article_view + item['c_archive_view'] = int(archive_view) + item['c_article_view'] = int(article_view) + + yield item From f1e802a1c60b62a8ed68a7244d2edcbc503ab497 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 13:09:03 +0800 Subject: [PATCH 198/469] create biliob spider --- .../spiders/{author_update_with_redis.1.py => BiliobSpider.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename biliob_spider/spiders/{author_update_with_redis.1.py => BiliobSpider.py} (100%) diff --git a/biliob_spider/spiders/author_update_with_redis.1.py b/biliob_spider/spiders/BiliobSpider.py similarity index 100% rename from biliob_spider/spiders/author_update_with_redis.1.py rename to biliob_spider/spiders/BiliobSpider.py From 263963284eb7e177abb67f8b7048e8719a147746 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 13:09:03 +0800 Subject: [PATCH 199/469] create biliob spider --- .../spiders/{author_update_with_redis.1.py => BiliobSpider.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename biliob_spider/spiders/{author_update_with_redis.1.py => BiliobSpider.py} (100%) diff --git a/biliob_spider/spiders/author_update_with_redis.1.py b/biliob_spider/spiders/BiliobSpider.py similarity index 100% rename from biliob_spider/spiders/author_update_with_redis.1.py 
rename to biliob_spider/spiders/BiliobSpider.py From 3663a70ddfdd9b5bb18f716f58aaa9243b4f28b0 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 13:09:03 +0800 Subject: [PATCH 200/469] create biliob spider --- .../spiders/{author_update_with_redis.1.py => BiliobSpider.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename biliob_spider/spiders/{author_update_with_redis.1.py => BiliobSpider.py} (100%) diff --git a/biliob_spider/spiders/author_update_with_redis.1.py b/biliob_spider/spiders/BiliobSpider.py similarity index 100% rename from biliob_spider/spiders/author_update_with_redis.1.py rename to biliob_spider/spiders/BiliobSpider.py From 7604dcb0ddf297932de0d0708b248d9a3fa4812e Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 15:43:32 +0800 Subject: [PATCH 201/469] feature: divide danmaku spider from others --- biliob_spider/items.py | 9 -- biliob_spider/pipelines.py | 30 ----- biliob_spider/spiders/BiliobSpider.py | 28 ++-- .../spiders/danmaku_aggregate_spider.py | 125 ------------------ danmaku_spider/danmaku_spider/__init__.py | 0 danmaku_spider/danmaku_spider/filter.py | 6 + danmaku_spider/danmaku_spider/items.py | 17 +++ danmaku_spider/danmaku_spider/middlewares.py | 103 +++++++++++++++ danmaku_spider/danmaku_spider/pipelines.py | 44 ++++++ danmaku_spider/danmaku_spider/settings.py | 106 +++++++++++++++ .../danmaku_spider/spiders/__init__.py | 4 + .../spiders/danmaku_aggregate_spider.py | 123 +++++++++++++++++ danmaku_spider/scrapy.cfg | 11 ++ 13 files changed, 430 insertions(+), 176 deletions(-) delete mode 100644 biliob_spider/spiders/danmaku_aggregate_spider.py create mode 100644 danmaku_spider/danmaku_spider/__init__.py create mode 100644 danmaku_spider/danmaku_spider/filter.py create mode 100644 danmaku_spider/danmaku_spider/items.py create mode 100644 danmaku_spider/danmaku_spider/middlewares.py create mode 100644 danmaku_spider/danmaku_spider/pipelines.py create mode 100644 danmaku_spider/danmaku_spider/settings.py create mode 100644 danmaku_spider/danmaku_spider/spiders/__init__.py create mode 100644 danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py create mode 100644 danmaku_spider/scrapy.cfg diff --git a/biliob_spider/items.py b/biliob_spider/items.py index c6b221d..490dfe6 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -131,12 +131,3 @@ class VideoWatcherItem(scrapy.Item): mid = scrapy.Field() aid = scrapy.Field() channels = scrapy.Field() - - -class DanmakuAggregateItem(scrapy.Item): - aid = scrapy.Field() - p_name = scrapy.Field() - page_number = scrapy.Field() - word_frequency = scrapy.Field() - danmaku_density = scrapy.Field() - duration = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index d2bd012..fbb99fd 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -13,36 +13,6 @@ from db import redis_connect_string -class DanmakuAggregatePipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.redis_connection = redis.from_url(redis_connect_string) - - def process_item(self, item, spider): - self.coll = self.db['video'] - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'danmaku_aggregate.{}'.format(item['page_number']): { - 'duration': item['duration'], - 'p_name': item['p_name'], - 'danmaku_density': 
item['danmaku_density'], - 'word_frequency': item['word_frequency'] - }, - 'danmaku_aggregate.updatetime':datetime.datetime.now() - } - }, True) - # 刷新redis数据缓存 - self.redis_connection.delete( - "video_detail::{}".format(item['aid'])) - - class StrongPipeline(object): def __init__(self): # 链接mongoDB diff --git a/biliob_spider/spiders/BiliobSpider.py b/biliob_spider/spiders/BiliobSpider.py index 5b8de55..234f7f8 100644 --- a/biliob_spider/spiders/BiliobSpider.py +++ b/biliob_spider/spiders/BiliobSpider.py @@ -1,31 +1,34 @@ # coding=utf-8 -import scrapy -from mail import mailer -from scrapy.http import Request -from biliob_spider.items import AuthorItem -import time +import datetime import json import logging +import time + +import redis +import scrapy +from memory_profiler import profile from pymongo import MongoClient -import datetime -from db import settings -from db import redis_connect_string +from scrapy.http import Request from scrapy_redis.spiders import RedisSpider -import redis + +from biliob_spider.items import AuthorItem +from db import redis_connect_string, settings +from mail import mailer -class BiliobSpider(RedisSpider): +class BiliobSpider(scrapy.spiders.Spider): name = "BiliobSpider" allowed_domains = ["bilibili.com"] - start_urls = [] + start_urls = ['www.bilibili.com'] custom_settings = { 'ITEM_PIPELINES': { 'biliob_spider.pipelines.BiliobPipeline': 300 }, 'DOWNLOAD_DELAY': 10 } - + @profile def __init__(self): + # 链接mongoDB self.client = MongoClient(settings['MINGO_HOST'], 27017) # 数据库登录需要帐号密码 @@ -35,6 +38,7 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 self.redis_connection = redis.from_url(redis_connect_string) + @profile def parse(self, response): try: j = json.loads(response.body) diff --git a/biliob_spider/spiders/danmaku_aggregate_spider.py b/biliob_spider/spiders/danmaku_aggregate_spider.py deleted file mode 100644 index 4c1fdde..0000000 --- a/biliob_spider/spiders/danmaku_aggregate_spider.py +++ /dev/null @@ -1,125 +0,0 @@ -# coding=utf-8 -import scrapy -from mail import mailer -from scrapy.http import Request -from biliob_spider.items import DanmakuAggregateItem -from datetime import datetime -import time -import json -import logging -from pymongo import MongoClient -from db import settings -from util import sub_channel_2_channel -from scrapy_redis.spiders import RedisSpider -from db import redis_connect_string -import jieba -import jieba.analyse -import re - -jieba.load_userdict('./biliob_analyzer/dict.txt') - - -def q_to_b(q_str): - """全角转半角""" - b_str = "" - for uchar in q_str: - inside_code = ord(uchar) - if inside_code == 12288: # 全角空格直接转换 - inside_code = 32 - elif 65374 >= inside_code >= 65281: # 全角字符(除空格)根据关系转化 - inside_code -= 65248 - b_str += chr(inside_code) - return b_str - - -class DanmakuAggregateSpider(RedisSpider): - name = "DanmakuAggregate" - allowed_domains = ["bilibili.com"] - start_urls = [] - custom_settings = { - 'ITEM_PIPELINES': { - 'biliob_spider.pipelines.DanmakuAggregatePipeline': 300, - }, - 'DOWNLOAD_DELAY': 1 - } - CID_API = "https://api.bilibili.com/x/web-interface/view?aid={aid}" - DANMAKU_API = "https://api.bilibili.com/x/v1/dm/list.so?oid={oid}" - PATTERN = r"[0-9a-zA-Z\u4e00-\u9fa5\u30a1-\u30f6\u3041-\u3093\uFF00-\uFFFF\u4e00-\u9fa5]+" - - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 
获得collection的句柄 - - def parse(self, response): - try: - j = json.loads(response.body) - if j['code'] == -403: - aid = response.url[50:] - print('https://api.bilibili.com/x/article/archives?ids={}'.format(aid)) - yield Request('https://api.bilibili.com/x/article/archives?ids={}'.format(aid), - callback=self.getCidPlanB, meta={'aid': aid}) - else: - aid = j['data']['aid'] - pages = j['data']['pages'] - for each_page in pages: - duration = each_page['duration'] - p_name = each_page['part'] - page_number = each_page['page'] - cid = each_page['cid'] - yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, - meta={'duration': duration, - 'p_name': p_name, - 'page_number': page_number, - 'aid': aid}) - except Exception as error: - # 出现错误时打印错误日志 - if response['code'] == -404: - return - mailer.send( - to=["604264970@qq.com"], - subject="BiliobSpiderError", - body="{}\n{}".format(response.url, error), - ) - logging.error("视频爬虫在解析时发生错误") - logging.error(response.url) - logging.error(error) - - def getCidPlanB(self, response): - aid = response.meta['aid'] - cid = json.loads(response.body)['data'][aid]['cid'] - duration = json.loads(response.body)['data'][aid]['duration'] - yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, meta={'duration': duration, 'p_name': '', 'page_number': 1, 'aid': int(aid)}) - - def parseDanmaku(self, response): - duration = response.meta['duration'] - danmaku_text = q_to_b( - " ".join(response.xpath("d/text()").extract()).upper()) - # 自实现太low,使用自带关键字 - word_frequency = dict(jieba.analyse.extract_tags(danmaku_text, topK=50, withWeight=True, allowPOS=( - 'ns', 'n', 'vn', 'v', 'nr', 'un', 'x', 'j', 'i', 'l', 'nz', 'eng', 'o'))) - # 计算弹幕密度 - danmaku_attr = list(map(lambda x: x.split( - ","), response.xpath("d/@p").extract())) - tick = duration / 50 - danmaku_density = {} - danmaku_density = [0 for i in range(50)] - for each_attr in danmaku_attr: - t = float(each_attr[0]) - if t > duration: - continue - index = int(t // tick) - danmaku_density[index] += 1 - item = DanmakuAggregateItem() - - item['aid'] = response.meta['aid'] - item['duration'] = duration - item['word_frequency'] = word_frequency - item['p_name'] = response.meta['p_name'] - item['danmaku_density'] = danmaku_density - item['page_number'] = response.meta['page_number'] - yield item diff --git a/danmaku_spider/danmaku_spider/__init__.py b/danmaku_spider/danmaku_spider/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/danmaku_spider/danmaku_spider/filter.py b/danmaku_spider/danmaku_spider/filter.py new file mode 100644 index 0000000..a2b9f15 --- /dev/null +++ b/danmaku_spider/danmaku_spider/filter.py @@ -0,0 +1,6 @@ +from scrapy.dupefilters import RFPDupeFilter + + +class CloseDupefilter(RFPDupeFilter): + def request_seen(self, request): + return False diff --git a/danmaku_spider/danmaku_spider/items.py b/danmaku_spider/danmaku_spider/items.py new file mode 100644 index 0000000..ffafac0 --- /dev/null +++ b/danmaku_spider/danmaku_spider/items.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class DanmakuAggregateItem(scrapy.Item): + aid = scrapy.Field() + p_name = scrapy.Field() + page_number = scrapy.Field() + word_frequency = scrapy.Field() + danmaku_density = scrapy.Field() + duration = scrapy.Field() diff --git a/danmaku_spider/danmaku_spider/middlewares.py b/danmaku_spider/danmaku_spider/middlewares.py 
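# [Editor's note — illustrative sketch, not part of these patches]
# CloseDupefilter above reports every request as unseen, so pushing the same aid into the
# queue re-runs the aggregation instead of being dropped by Scrapy's duplicate filter; it
# is wired in via DUPEFILTER_CLASS in the settings.py added further down. A quick check of
# the behaviour, assuming the danmaku_spider package is importable:
from scrapy.http import Request
from danmaku_spider.filter import CloseDupefilter

dupefilter = CloseDupefilter()
request = Request("https://api.bilibili.com/x/web-interface/view?aid=170001")
assert dupefilter.request_seen(request) is False  # never marked as seen
assert dupefilter.request_seen(request) is False  # not even when repeated
# [End of editor's note]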
new file mode 100644 index 0000000..5a9acae --- /dev/null +++ b/danmaku_spider/danmaku_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class DanmakuSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class DanmakuSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/danmaku_spider/danmaku_spider/pipelines.py b/danmaku_spider/danmaku_spider/pipelines.py new file mode 100644 index 0000000..8b07b46 --- /dev/null +++ b/danmaku_spider/danmaku_spider/pipelines.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html + +import datetime +import os +import sys + +import redis +from pymongo import MongoClient + +env_dist = os.environ + + +class DanmakuSpiderPipeline(object): + def __init__(self): + self.client = MongoClient(env_dist['BILIOB_MONGO_SERVER'], 27017) + self.client.admin.authenticate(env_dist['BILIOB_MONGO_USER'], + env_dist['BILIOB_MONGO_PASSWD']) + self.db = self.client['biliob'] + self.redis_connection = redis.from_url( + env_dist['BILIOB_REDIS_CONNECTION_STRING']) + + def process_item(self, item, spider): + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'danmaku_aggregate.{}'.format(item['page_number']): { + 'duration': item['duration'], + 'p_name': item['p_name'], + 'danmaku_density': item['danmaku_density'], + 'word_frequency': item['word_frequency'] + }, + 'danmaku_aggregate.updatetime': datetime.datetime.now() + } + }, True) + # 刷新redis数据缓存 + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) diff --git a/danmaku_spider/danmaku_spider/settings.py b/danmaku_spider/danmaku_spider/settings.py new file mode 100644 index 0000000..ad81ead --- /dev/null +++ b/danmaku_spider/danmaku_spider/settings.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for danmaku_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +import os +env_dist = os.environ + +DUPEFILTER_CLASS = 'danmaku_spider.filter.CloseDupefilter' + +SCHEDULER_PERSIST = True +SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue' + +REDIS_URL = env_dist['BILIOB_REDIS_CONNECTION_STRING'] + +LOG_FILE = "danmaku_spider.log" +LOG_LEVEL = "DEBUG" + +BOT_NAME = 'danmaku_spider' + +SPIDER_MODULES = ['danmaku_spider.spiders'] +NEWSPIDER_MODULE = 'danmaku_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36' + + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +CONCURRENT_REQUESTS = 8 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +CONCURRENT_REQUESTS_PER_DOMAIN = 8 +# CONCURRENT_REQUESTS_PER_IP = 64 + +# Disable cookies (enabled by default) +COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +# } + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# 'BilibiliRankListSpider.middlewares.BilibiliranklistspiderSpiderMiddleware': 543, +# } + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# 'BilibiliRankListSpider.middlewares.BilibiliranklistspiderDownloaderMiddleware': 543, +# } + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +# } + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +# ITEM_PIPELINES = { + +# } + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' +DOWNLOAD_FAIL_ON_DATALOSS = True +RETRY_ENABLED = True diff --git a/danmaku_spider/danmaku_spider/spiders/__init__.py 
b/danmaku_spider/danmaku_spider/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/danmaku_spider/danmaku_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py b/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py new file mode 100644 index 0000000..eeaf85d --- /dev/null +++ b/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py @@ -0,0 +1,123 @@ +# coding=utf-8 +import json +import logging +import os +import re +from datetime import datetime + +import jieba +import jieba.analyse +import scrapy +from pymongo import MongoClient +from scrapy.http import Request +from scrapy_redis.spiders import RedisSpider + +from danmaku_spider.items import DanmakuAggregateItem + +env_dist = os.environ + + +class DanmakuAggregateSpider(RedisSpider): + + name = "DanmakuAggregate" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'DOWNLOAD_DELAY': 1 + } + CID_API = "https://api.bilibili.com/x/web-interface/view?aid={aid}" + DANMAKU_API = "https://api.bilibili.com/x/v1/dm/list.so?oid={oid}" + + def __init__(self): + jieba.load_userdict('../biliob_analyzer/dict.txt') + self.client = MongoClient(env_dist['BILIOB_MONGO_SERVER'], 27017) + self.client.admin.authenticate(env_dist['BILIOB_MONGO_USER'], + env_dist['BILIOB_MONGO_PASSWD']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def q_to_b(self, q_str): + """全角转半角""" + b_str = "" + for uchar in q_str: + inside_code = ord(uchar) + if inside_code == 12288: # 全角空格直接转换 + inside_code = 32 + elif 65374 >= inside_code >= 65281: # 全角字符(除空格)根据关系转化 + inside_code -= 65248 + b_str += chr(inside_code) + return b_str + + def parse(self, response): + try: + j = json.loads(response.body) + if j['code'] == -403: + aid = response.url[50:] + print('https://api.bilibili.com/x/article/archives?ids={}'.format(aid)) + yield Request('https://api.bilibili.com/x/article/archives?ids={}'.format(aid), + callback=self.getCidPlanB, meta={'aid': aid}) + else: + aid = j['data']['aid'] + pages = j['data']['pages'] + for each_page in pages: + duration = each_page['duration'] + p_name = each_page['part'] + page_number = each_page['page'] + cid = each_page['cid'] + yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, + meta={'duration': duration, + 'p_name': p_name, + 'page_number': page_number, + 'aid': aid}) + except Exception as error: + # 出现错误时存入出错集合 + self.db['error'].insert_one( + {'aid': int(aid), 'url': response.url, 'error': error}) + + def getCidPlanB(self, response): + try: + aid = response.meta['aid'] + cid = json.loads(response.body)['data'][aid]['cid'] + duration = json.loads(response.body)['data'][aid]['duration'] + yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, meta={'duration': duration, 'p_name': '', 'page_number': 1, 'aid': int(aid)}) + except Exception as error: + # 出现错误时存入出错集合 + self.db['error'].insert_one( + {'aid': int(aid), 'url': response.url, 'error': error}) + + def parseDanmaku(self, response): + try: + duration = response.meta['duration'] + + # 全角转半角,转大写 + danmaku_text = self.q_to_b( + " ".join(response.xpath("d/text()").extract()).upper()) + + # 自实现太low,使用自带关键字 + word_frequency = dict(jieba.analyse.extract_tags(danmaku_text, topK=50, withWeight=True, 
allowPOS=( + 'ns', 'n', 'vn', 'v', 'nr', 'un', 'x', 'j', 'i', 'l', 'nz', 'eng', 'o'))) + # 计算弹幕密度 + danmaku_attr = list(map(lambda x: x.split( + ","), response.xpath("d/@p").extract())) + tick = duration / 50 + danmaku_density = {} + danmaku_density = [0 for i in range(50)] + for each_attr in danmaku_attr: + t = float(each_attr[0]) + if t > duration: + continue + index = int(t // tick) + danmaku_density[index] += 1 + item = DanmakuAggregateItem() + + item['aid'] = response.meta['aid'] + item['duration'] = duration + item['word_frequency'] = word_frequency + item['p_name'] = response.meta['p_name'] + item['danmaku_density'] = danmaku_density + item['page_number'] = response.meta['page_number'] + yield item + except Exception as error: + # 出现错误时存入出错集合 + self.db['error'].insert_one( + {'aid': int(response.meta['aid']), 'url': response.url, 'error': error}) diff --git a/danmaku_spider/scrapy.cfg b/danmaku_spider/scrapy.cfg new file mode 100644 index 0000000..218c209 --- /dev/null +++ b/danmaku_spider/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = danmaku_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = danmaku_spider From fa6a9dc3efb0da276feb39056cb3086b02a146a1 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 15:43:32 +0800 Subject: [PATCH 202/469] feature: divide danmaku spider from others --- biliob_spider/items.py | 9 -- biliob_spider/pipelines.py | 30 ----- biliob_spider/spiders/BiliobSpider.py | 28 ++-- .../spiders/danmaku_aggregate_spider.py | 125 ------------------ danmaku_spider/danmaku_spider/__init__.py | 0 danmaku_spider/danmaku_spider/filter.py | 6 + danmaku_spider/danmaku_spider/items.py | 17 +++ danmaku_spider/danmaku_spider/middlewares.py | 103 +++++++++++++++ danmaku_spider/danmaku_spider/pipelines.py | 44 ++++++ danmaku_spider/danmaku_spider/settings.py | 106 +++++++++++++++ .../danmaku_spider/spiders/__init__.py | 4 + .../spiders/danmaku_aggregate_spider.py | 123 +++++++++++++++++ danmaku_spider/scrapy.cfg | 11 ++ 13 files changed, 430 insertions(+), 176 deletions(-) delete mode 100644 biliob_spider/spiders/danmaku_aggregate_spider.py create mode 100644 danmaku_spider/danmaku_spider/__init__.py create mode 100644 danmaku_spider/danmaku_spider/filter.py create mode 100644 danmaku_spider/danmaku_spider/items.py create mode 100644 danmaku_spider/danmaku_spider/middlewares.py create mode 100644 danmaku_spider/danmaku_spider/pipelines.py create mode 100644 danmaku_spider/danmaku_spider/settings.py create mode 100644 danmaku_spider/danmaku_spider/spiders/__init__.py create mode 100644 danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py create mode 100644 danmaku_spider/scrapy.cfg diff --git a/biliob_spider/items.py b/biliob_spider/items.py index c6b221d..490dfe6 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -131,12 +131,3 @@ class VideoWatcherItem(scrapy.Item): mid = scrapy.Field() aid = scrapy.Field() channels = scrapy.Field() - - -class DanmakuAggregateItem(scrapy.Item): - aid = scrapy.Field() - p_name = scrapy.Field() - page_number = scrapy.Field() - word_frequency = scrapy.Field() - danmaku_density = scrapy.Field() - duration = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index d2bd012..fbb99fd 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -13,36 +13,6 @@ from db import 
redis_connect_string -class DanmakuAggregatePipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.redis_connection = redis.from_url(redis_connect_string) - - def process_item(self, item, spider): - self.coll = self.db['video'] - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'danmaku_aggregate.{}'.format(item['page_number']): { - 'duration': item['duration'], - 'p_name': item['p_name'], - 'danmaku_density': item['danmaku_density'], - 'word_frequency': item['word_frequency'] - }, - 'danmaku_aggregate.updatetime':datetime.datetime.now() - } - }, True) - # 刷新redis数据缓存 - self.redis_connection.delete( - "video_detail::{}".format(item['aid'])) - - class StrongPipeline(object): def __init__(self): # 链接mongoDB diff --git a/biliob_spider/spiders/BiliobSpider.py b/biliob_spider/spiders/BiliobSpider.py index 5b8de55..234f7f8 100644 --- a/biliob_spider/spiders/BiliobSpider.py +++ b/biliob_spider/spiders/BiliobSpider.py @@ -1,31 +1,34 @@ # coding=utf-8 -import scrapy -from mail import mailer -from scrapy.http import Request -from biliob_spider.items import AuthorItem -import time +import datetime import json import logging +import time + +import redis +import scrapy +from memory_profiler import profile from pymongo import MongoClient -import datetime -from db import settings -from db import redis_connect_string +from scrapy.http import Request from scrapy_redis.spiders import RedisSpider -import redis + +from biliob_spider.items import AuthorItem +from db import redis_connect_string, settings +from mail import mailer -class BiliobSpider(RedisSpider): +class BiliobSpider(scrapy.spiders.Spider): name = "BiliobSpider" allowed_domains = ["bilibili.com"] - start_urls = [] + start_urls = ['www.bilibili.com'] custom_settings = { 'ITEM_PIPELINES': { 'biliob_spider.pipelines.BiliobPipeline': 300 }, 'DOWNLOAD_DELAY': 10 } - + @profile def __init__(self): + # 链接mongoDB self.client = MongoClient(settings['MINGO_HOST'], 27017) # 数据库登录需要帐号密码 @@ -35,6 +38,7 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 self.redis_connection = redis.from_url(redis_connect_string) + @profile def parse(self, response): try: j = json.loads(response.body) diff --git a/biliob_spider/spiders/danmaku_aggregate_spider.py b/biliob_spider/spiders/danmaku_aggregate_spider.py deleted file mode 100644 index 4c1fdde..0000000 --- a/biliob_spider/spiders/danmaku_aggregate_spider.py +++ /dev/null @@ -1,125 +0,0 @@ -# coding=utf-8 -import scrapy -from mail import mailer -from scrapy.http import Request -from biliob_spider.items import DanmakuAggregateItem -from datetime import datetime -import time -import json -import logging -from pymongo import MongoClient -from db import settings -from util import sub_channel_2_channel -from scrapy_redis.spiders import RedisSpider -from db import redis_connect_string -import jieba -import jieba.analyse -import re - -jieba.load_userdict('./biliob_analyzer/dict.txt') - - -def q_to_b(q_str): - """全角转半角""" - b_str = "" - for uchar in q_str: - inside_code = ord(uchar) - if inside_code == 12288: # 全角空格直接转换 - inside_code = 32 - elif 65374 >= inside_code >= 65281: # 全角字符(除空格)根据关系转化 - inside_code -= 65248 - b_str += chr(inside_code) - return b_str - - -class DanmakuAggregateSpider(RedisSpider): - name = "DanmakuAggregate" - allowed_domains = ["bilibili.com"] - start_urls 
= [] - custom_settings = { - 'ITEM_PIPELINES': { - 'biliob_spider.pipelines.DanmakuAggregatePipeline': 300, - }, - 'DOWNLOAD_DELAY': 1 - } - CID_API = "https://api.bilibili.com/x/web-interface/view?aid={aid}" - DANMAKU_API = "https://api.bilibili.com/x/v1/dm/list.so?oid={oid}" - PATTERN = r"[0-9a-zA-Z\u4e00-\u9fa5\u30a1-\u30f6\u3041-\u3093\uFF00-\uFFFF\u4e00-\u9fa5]+" - - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def parse(self, response): - try: - j = json.loads(response.body) - if j['code'] == -403: - aid = response.url[50:] - print('https://api.bilibili.com/x/article/archives?ids={}'.format(aid)) - yield Request('https://api.bilibili.com/x/article/archives?ids={}'.format(aid), - callback=self.getCidPlanB, meta={'aid': aid}) - else: - aid = j['data']['aid'] - pages = j['data']['pages'] - for each_page in pages: - duration = each_page['duration'] - p_name = each_page['part'] - page_number = each_page['page'] - cid = each_page['cid'] - yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, - meta={'duration': duration, - 'p_name': p_name, - 'page_number': page_number, - 'aid': aid}) - except Exception as error: - # 出现错误时打印错误日志 - if response['code'] == -404: - return - mailer.send( - to=["604264970@qq.com"], - subject="BiliobSpiderError", - body="{}\n{}".format(response.url, error), - ) - logging.error("视频爬虫在解析时发生错误") - logging.error(response.url) - logging.error(error) - - def getCidPlanB(self, response): - aid = response.meta['aid'] - cid = json.loads(response.body)['data'][aid]['cid'] - duration = json.loads(response.body)['data'][aid]['duration'] - yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, meta={'duration': duration, 'p_name': '', 'page_number': 1, 'aid': int(aid)}) - - def parseDanmaku(self, response): - duration = response.meta['duration'] - danmaku_text = q_to_b( - " ".join(response.xpath("d/text()").extract()).upper()) - # 自实现太low,使用自带关键字 - word_frequency = dict(jieba.analyse.extract_tags(danmaku_text, topK=50, withWeight=True, allowPOS=( - 'ns', 'n', 'vn', 'v', 'nr', 'un', 'x', 'j', 'i', 'l', 'nz', 'eng', 'o'))) - # 计算弹幕密度 - danmaku_attr = list(map(lambda x: x.split( - ","), response.xpath("d/@p").extract())) - tick = duration / 50 - danmaku_density = {} - danmaku_density = [0 for i in range(50)] - for each_attr in danmaku_attr: - t = float(each_attr[0]) - if t > duration: - continue - index = int(t // tick) - danmaku_density[index] += 1 - item = DanmakuAggregateItem() - - item['aid'] = response.meta['aid'] - item['duration'] = duration - item['word_frequency'] = word_frequency - item['p_name'] = response.meta['p_name'] - item['danmaku_density'] = danmaku_density - item['page_number'] = response.meta['page_number'] - yield item diff --git a/danmaku_spider/danmaku_spider/__init__.py b/danmaku_spider/danmaku_spider/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/danmaku_spider/danmaku_spider/filter.py b/danmaku_spider/danmaku_spider/filter.py new file mode 100644 index 0000000..a2b9f15 --- /dev/null +++ b/danmaku_spider/danmaku_spider/filter.py @@ -0,0 +1,6 @@ +from scrapy.dupefilters import RFPDupeFilter + + +class CloseDupefilter(RFPDupeFilter): + def request_seen(self, request): + return False diff --git a/danmaku_spider/danmaku_spider/items.py 
b/danmaku_spider/danmaku_spider/items.py new file mode 100644 index 0000000..ffafac0 --- /dev/null +++ b/danmaku_spider/danmaku_spider/items.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class DanmakuAggregateItem(scrapy.Item): + aid = scrapy.Field() + p_name = scrapy.Field() + page_number = scrapy.Field() + word_frequency = scrapy.Field() + danmaku_density = scrapy.Field() + duration = scrapy.Field() diff --git a/danmaku_spider/danmaku_spider/middlewares.py b/danmaku_spider/danmaku_spider/middlewares.py new file mode 100644 index 0000000..5a9acae --- /dev/null +++ b/danmaku_spider/danmaku_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class DanmakuSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class DanmakuSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. 
+ + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/danmaku_spider/danmaku_spider/pipelines.py b/danmaku_spider/danmaku_spider/pipelines.py new file mode 100644 index 0000000..8b07b46 --- /dev/null +++ b/danmaku_spider/danmaku_spider/pipelines.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html + +import datetime +import os +import sys + +import redis +from pymongo import MongoClient + +env_dist = os.environ + + +class DanmakuSpiderPipeline(object): + def __init__(self): + self.client = MongoClient(env_dist['BILIOB_MONGO_SERVER'], 27017) + self.client.admin.authenticate(env_dist['BILIOB_MONGO_USER'], + env_dist['BILIOB_MONGO_PASSWD']) + self.db = self.client['biliob'] + self.redis_connection = redis.from_url( + env_dist['BILIOB_REDIS_CONNECTION_STRING']) + + def process_item(self, item, spider): + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'danmaku_aggregate.{}'.format(item['page_number']): { + 'duration': item['duration'], + 'p_name': item['p_name'], + 'danmaku_density': item['danmaku_density'], + 'word_frequency': item['word_frequency'] + }, + 'danmaku_aggregate.updatetime': datetime.datetime.now() + } + }, True) + # 刷新redis数据缓存 + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) diff --git a/danmaku_spider/danmaku_spider/settings.py b/danmaku_spider/danmaku_spider/settings.py new file mode 100644 index 0000000..ad81ead --- /dev/null +++ b/danmaku_spider/danmaku_spider/settings.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for danmaku_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +import os +env_dist = os.environ + +DUPEFILTER_CLASS = 'danmaku_spider.filter.CloseDupefilter' + +SCHEDULER_PERSIST = True +SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue' + +REDIS_URL = env_dist['BILIOB_REDIS_CONNECTION_STRING'] + +LOG_FILE = "danmaku_spider.log" +LOG_LEVEL = "DEBUG" + +BOT_NAME = 'danmaku_spider' + +SPIDER_MODULES = ['danmaku_spider.spiders'] +NEWSPIDER_MODULE = 'danmaku_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36' + + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +CONCURRENT_REQUESTS = 8 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +CONCURRENT_REQUESTS_PER_DOMAIN = 8 +# CONCURRENT_REQUESTS_PER_IP = 64 + +# Disable cookies (enabled by default) +COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +# } + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# 'BilibiliRankListSpider.middlewares.BilibiliranklistspiderSpiderMiddleware': 543, +# } + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# 'BilibiliRankListSpider.middlewares.BilibiliranklistspiderDownloaderMiddleware': 543, +# } + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +# } + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +# ITEM_PIPELINES = { + +# } + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' +DOWNLOAD_FAIL_ON_DATALOSS = True +RETRY_ENABLED = True diff --git a/danmaku_spider/danmaku_spider/spiders/__init__.py 
b/danmaku_spider/danmaku_spider/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/danmaku_spider/danmaku_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py b/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py new file mode 100644 index 0000000..eeaf85d --- /dev/null +++ b/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py @@ -0,0 +1,123 @@ +# coding=utf-8 +import json +import logging +import os +import re +from datetime import datetime + +import jieba +import jieba.analyse +import scrapy +from pymongo import MongoClient +from scrapy.http import Request +from scrapy_redis.spiders import RedisSpider + +from danmaku_spider.items import DanmakuAggregateItem + +env_dist = os.environ + + +class DanmakuAggregateSpider(RedisSpider): + + name = "DanmakuAggregate" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'DOWNLOAD_DELAY': 1 + } + CID_API = "https://api.bilibili.com/x/web-interface/view?aid={aid}" + DANMAKU_API = "https://api.bilibili.com/x/v1/dm/list.so?oid={oid}" + + def __init__(self): + jieba.load_userdict('../biliob_analyzer/dict.txt') + self.client = MongoClient(env_dist['BILIOB_MONGO_SERVER'], 27017) + self.client.admin.authenticate(env_dist['BILIOB_MONGO_USER'], + env_dist['BILIOB_MONGO_PASSWD']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def q_to_b(self, q_str): + """全角转半角""" + b_str = "" + for uchar in q_str: + inside_code = ord(uchar) + if inside_code == 12288: # 全角空格直接转换 + inside_code = 32 + elif 65374 >= inside_code >= 65281: # 全角字符(除空格)根据关系转化 + inside_code -= 65248 + b_str += chr(inside_code) + return b_str + + def parse(self, response): + try: + j = json.loads(response.body) + if j['code'] == -403: + aid = response.url[50:] + print('https://api.bilibili.com/x/article/archives?ids={}'.format(aid)) + yield Request('https://api.bilibili.com/x/article/archives?ids={}'.format(aid), + callback=self.getCidPlanB, meta={'aid': aid}) + else: + aid = j['data']['aid'] + pages = j['data']['pages'] + for each_page in pages: + duration = each_page['duration'] + p_name = each_page['part'] + page_number = each_page['page'] + cid = each_page['cid'] + yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, + meta={'duration': duration, + 'p_name': p_name, + 'page_number': page_number, + 'aid': aid}) + except Exception as error: + # 出现错误时存入出错集合 + self.db['error'].insert_one( + {'aid': int(aid), 'url': response.url, 'error': error}) + + def getCidPlanB(self, response): + try: + aid = response.meta['aid'] + cid = json.loads(response.body)['data'][aid]['cid'] + duration = json.loads(response.body)['data'][aid]['duration'] + yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, meta={'duration': duration, 'p_name': '', 'page_number': 1, 'aid': int(aid)}) + except Exception as error: + # 出现错误时存入出错集合 + self.db['error'].insert_one( + {'aid': int(aid), 'url': response.url, 'error': error}) + + def parseDanmaku(self, response): + try: + duration = response.meta['duration'] + + # 全角转半角,转大写 + danmaku_text = self.q_to_b( + " ".join(response.xpath("d/text()").extract()).upper()) + + # 自实现太low,使用自带关键字 + word_frequency = dict(jieba.analyse.extract_tags(danmaku_text, topK=50, withWeight=True, 
allowPOS=( + 'ns', 'n', 'vn', 'v', 'nr', 'un', 'x', 'j', 'i', 'l', 'nz', 'eng', 'o'))) + # 计算弹幕密度 + danmaku_attr = list(map(lambda x: x.split( + ","), response.xpath("d/@p").extract())) + tick = duration / 50 + danmaku_density = {} + danmaku_density = [0 for i in range(50)] + for each_attr in danmaku_attr: + t = float(each_attr[0]) + if t > duration: + continue + index = int(t // tick) + danmaku_density[index] += 1 + item = DanmakuAggregateItem() + + item['aid'] = response.meta['aid'] + item['duration'] = duration + item['word_frequency'] = word_frequency + item['p_name'] = response.meta['p_name'] + item['danmaku_density'] = danmaku_density + item['page_number'] = response.meta['page_number'] + yield item + except Exception as error: + # 出现错误时存入出错集合 + self.db['error'].insert_one( + {'aid': int(response.meta['aid']), 'url': response.url, 'error': error}) diff --git a/danmaku_spider/scrapy.cfg b/danmaku_spider/scrapy.cfg new file mode 100644 index 0000000..218c209 --- /dev/null +++ b/danmaku_spider/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = danmaku_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = danmaku_spider From c442f250a9b913ec8da8233e2b87bda205607ac5 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 15:43:32 +0800 Subject: [PATCH 203/469] feature: divide danmaku spider from others --- biliob_spider/items.py | 9 -- biliob_spider/pipelines.py | 30 ----- biliob_spider/spiders/BiliobSpider.py | 28 ++-- .../spiders/danmaku_aggregate_spider.py | 125 ------------------ danmaku_spider/danmaku_spider/__init__.py | 0 danmaku_spider/danmaku_spider/filter.py | 6 + danmaku_spider/danmaku_spider/items.py | 17 +++ danmaku_spider/danmaku_spider/middlewares.py | 103 +++++++++++++++ danmaku_spider/danmaku_spider/pipelines.py | 44 ++++++ danmaku_spider/danmaku_spider/settings.py | 106 +++++++++++++++ .../danmaku_spider/spiders/__init__.py | 4 + .../spiders/danmaku_aggregate_spider.py | 123 +++++++++++++++++ danmaku_spider/scrapy.cfg | 11 ++ 13 files changed, 430 insertions(+), 176 deletions(-) delete mode 100644 biliob_spider/spiders/danmaku_aggregate_spider.py create mode 100644 danmaku_spider/danmaku_spider/__init__.py create mode 100644 danmaku_spider/danmaku_spider/filter.py create mode 100644 danmaku_spider/danmaku_spider/items.py create mode 100644 danmaku_spider/danmaku_spider/middlewares.py create mode 100644 danmaku_spider/danmaku_spider/pipelines.py create mode 100644 danmaku_spider/danmaku_spider/settings.py create mode 100644 danmaku_spider/danmaku_spider/spiders/__init__.py create mode 100644 danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py create mode 100644 danmaku_spider/scrapy.cfg diff --git a/biliob_spider/items.py b/biliob_spider/items.py index c6b221d..490dfe6 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -131,12 +131,3 @@ class VideoWatcherItem(scrapy.Item): mid = scrapy.Field() aid = scrapy.Field() channels = scrapy.Field() - - -class DanmakuAggregateItem(scrapy.Item): - aid = scrapy.Field() - p_name = scrapy.Field() - page_number = scrapy.Field() - word_frequency = scrapy.Field() - danmaku_density = scrapy.Field() - duration = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index d2bd012..fbb99fd 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -13,36 +13,6 @@ from db import 
redis_connect_string -class DanmakuAggregatePipeline(object): - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.redis_connection = redis.from_url(redis_connect_string) - - def process_item(self, item, spider): - self.coll = self.db['video'] - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'danmaku_aggregate.{}'.format(item['page_number']): { - 'duration': item['duration'], - 'p_name': item['p_name'], - 'danmaku_density': item['danmaku_density'], - 'word_frequency': item['word_frequency'] - }, - 'danmaku_aggregate.updatetime':datetime.datetime.now() - } - }, True) - # 刷新redis数据缓存 - self.redis_connection.delete( - "video_detail::{}".format(item['aid'])) - - class StrongPipeline(object): def __init__(self): # 链接mongoDB diff --git a/biliob_spider/spiders/BiliobSpider.py b/biliob_spider/spiders/BiliobSpider.py index 5b8de55..234f7f8 100644 --- a/biliob_spider/spiders/BiliobSpider.py +++ b/biliob_spider/spiders/BiliobSpider.py @@ -1,31 +1,34 @@ # coding=utf-8 -import scrapy -from mail import mailer -from scrapy.http import Request -from biliob_spider.items import AuthorItem -import time +import datetime import json import logging +import time + +import redis +import scrapy +from memory_profiler import profile from pymongo import MongoClient -import datetime -from db import settings -from db import redis_connect_string +from scrapy.http import Request from scrapy_redis.spiders import RedisSpider -import redis + +from biliob_spider.items import AuthorItem +from db import redis_connect_string, settings +from mail import mailer -class BiliobSpider(RedisSpider): +class BiliobSpider(scrapy.spiders.Spider): name = "BiliobSpider" allowed_domains = ["bilibili.com"] - start_urls = [] + start_urls = ['www.bilibili.com'] custom_settings = { 'ITEM_PIPELINES': { 'biliob_spider.pipelines.BiliobPipeline': 300 }, 'DOWNLOAD_DELAY': 10 } - + @profile def __init__(self): + # 链接mongoDB self.client = MongoClient(settings['MINGO_HOST'], 27017) # 数据库登录需要帐号密码 @@ -35,6 +38,7 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 self.redis_connection = redis.from_url(redis_connect_string) + @profile def parse(self, response): try: j = json.loads(response.body) diff --git a/biliob_spider/spiders/danmaku_aggregate_spider.py b/biliob_spider/spiders/danmaku_aggregate_spider.py deleted file mode 100644 index 4c1fdde..0000000 --- a/biliob_spider/spiders/danmaku_aggregate_spider.py +++ /dev/null @@ -1,125 +0,0 @@ -# coding=utf-8 -import scrapy -from mail import mailer -from scrapy.http import Request -from biliob_spider.items import DanmakuAggregateItem -from datetime import datetime -import time -import json -import logging -from pymongo import MongoClient -from db import settings -from util import sub_channel_2_channel -from scrapy_redis.spiders import RedisSpider -from db import redis_connect_string -import jieba -import jieba.analyse -import re - -jieba.load_userdict('./biliob_analyzer/dict.txt') - - -def q_to_b(q_str): - """全角转半角""" - b_str = "" - for uchar in q_str: - inside_code = ord(uchar) - if inside_code == 12288: # 全角空格直接转换 - inside_code = 32 - elif 65374 >= inside_code >= 65281: # 全角字符(除空格)根据关系转化 - inside_code -= 65248 - b_str += chr(inside_code) - return b_str - - -class DanmakuAggregateSpider(RedisSpider): - name = "DanmakuAggregate" - allowed_domains = ["bilibili.com"] - start_urls 
= [] - custom_settings = { - 'ITEM_PIPELINES': { - 'biliob_spider.pipelines.DanmakuAggregatePipeline': 300, - }, - 'DOWNLOAD_DELAY': 1 - } - CID_API = "https://api.bilibili.com/x/web-interface/view?aid={aid}" - DANMAKU_API = "https://api.bilibili.com/x/v1/dm/list.so?oid={oid}" - PATTERN = r"[0-9a-zA-Z\u4e00-\u9fa5\u30a1-\u30f6\u3041-\u3093\uFF00-\uFFFF\u4e00-\u9fa5]+" - - def __init__(self): - # 链接mongoDB - self.client = MongoClient(settings['MINGO_HOST'], 27017) - # 数据库登录需要帐号密码 - self.client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - self.db = self.client['biliob'] # 获得数据库的句柄 - self.coll = self.db['video'] # 获得collection的句柄 - - def parse(self, response): - try: - j = json.loads(response.body) - if j['code'] == -403: - aid = response.url[50:] - print('https://api.bilibili.com/x/article/archives?ids={}'.format(aid)) - yield Request('https://api.bilibili.com/x/article/archives?ids={}'.format(aid), - callback=self.getCidPlanB, meta={'aid': aid}) - else: - aid = j['data']['aid'] - pages = j['data']['pages'] - for each_page in pages: - duration = each_page['duration'] - p_name = each_page['part'] - page_number = each_page['page'] - cid = each_page['cid'] - yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, - meta={'duration': duration, - 'p_name': p_name, - 'page_number': page_number, - 'aid': aid}) - except Exception as error: - # 出现错误时打印错误日志 - if response['code'] == -404: - return - mailer.send( - to=["604264970@qq.com"], - subject="BiliobSpiderError", - body="{}\n{}".format(response.url, error), - ) - logging.error("视频爬虫在解析时发生错误") - logging.error(response.url) - logging.error(error) - - def getCidPlanB(self, response): - aid = response.meta['aid'] - cid = json.loads(response.body)['data'][aid]['cid'] - duration = json.loads(response.body)['data'][aid]['duration'] - yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, meta={'duration': duration, 'p_name': '', 'page_number': 1, 'aid': int(aid)}) - - def parseDanmaku(self, response): - duration = response.meta['duration'] - danmaku_text = q_to_b( - " ".join(response.xpath("d/text()").extract()).upper()) - # 自实现太low,使用自带关键字 - word_frequency = dict(jieba.analyse.extract_tags(danmaku_text, topK=50, withWeight=True, allowPOS=( - 'ns', 'n', 'vn', 'v', 'nr', 'un', 'x', 'j', 'i', 'l', 'nz', 'eng', 'o'))) - # 计算弹幕密度 - danmaku_attr = list(map(lambda x: x.split( - ","), response.xpath("d/@p").extract())) - tick = duration / 50 - danmaku_density = {} - danmaku_density = [0 for i in range(50)] - for each_attr in danmaku_attr: - t = float(each_attr[0]) - if t > duration: - continue - index = int(t // tick) - danmaku_density[index] += 1 - item = DanmakuAggregateItem() - - item['aid'] = response.meta['aid'] - item['duration'] = duration - item['word_frequency'] = word_frequency - item['p_name'] = response.meta['p_name'] - item['danmaku_density'] = danmaku_density - item['page_number'] = response.meta['page_number'] - yield item diff --git a/danmaku_spider/danmaku_spider/__init__.py b/danmaku_spider/danmaku_spider/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/danmaku_spider/danmaku_spider/filter.py b/danmaku_spider/danmaku_spider/filter.py new file mode 100644 index 0000000..a2b9f15 --- /dev/null +++ b/danmaku_spider/danmaku_spider/filter.py @@ -0,0 +1,6 @@ +from scrapy.dupefilters import RFPDupeFilter + + +class CloseDupefilter(RFPDupeFilter): + def request_seen(self, request): + return False diff --git a/danmaku_spider/danmaku_spider/items.py 
b/danmaku_spider/danmaku_spider/items.py new file mode 100644 index 0000000..ffafac0 --- /dev/null +++ b/danmaku_spider/danmaku_spider/items.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class DanmakuAggregateItem(scrapy.Item): + aid = scrapy.Field() + p_name = scrapy.Field() + page_number = scrapy.Field() + word_frequency = scrapy.Field() + danmaku_density = scrapy.Field() + duration = scrapy.Field() diff --git a/danmaku_spider/danmaku_spider/middlewares.py b/danmaku_spider/danmaku_spider/middlewares.py new file mode 100644 index 0000000..5a9acae --- /dev/null +++ b/danmaku_spider/danmaku_spider/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class DanmakuSpiderSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class DanmakuSpiderDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. 
+ + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/danmaku_spider/danmaku_spider/pipelines.py b/danmaku_spider/danmaku_spider/pipelines.py new file mode 100644 index 0000000..8b07b46 --- /dev/null +++ b/danmaku_spider/danmaku_spider/pipelines.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html + +import datetime +import os +import sys + +import redis +from pymongo import MongoClient + +env_dist = os.environ + + +class DanmakuSpiderPipeline(object): + def __init__(self): + self.client = MongoClient(env_dist['BILIOB_MONGO_SERVER'], 27017) + self.client.admin.authenticate(env_dist['BILIOB_MONGO_USER'], + env_dist['BILIOB_MONGO_PASSWD']) + self.db = self.client['biliob'] + self.redis_connection = redis.from_url( + env_dist['BILIOB_REDIS_CONNECTION_STRING']) + + def process_item(self, item, spider): + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'danmaku_aggregate.{}'.format(item['page_number']): { + 'duration': item['duration'], + 'p_name': item['p_name'], + 'danmaku_density': item['danmaku_density'], + 'word_frequency': item['word_frequency'] + }, + 'danmaku_aggregate.updatetime': datetime.datetime.now() + } + }, True) + # 刷新redis数据缓存 + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) diff --git a/danmaku_spider/danmaku_spider/settings.py b/danmaku_spider/danmaku_spider/settings.py new file mode 100644 index 0000000..ad81ead --- /dev/null +++ b/danmaku_spider/danmaku_spider/settings.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for danmaku_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +import os +env_dist = os.environ + +DUPEFILTER_CLASS = 'danmaku_spider.filter.CloseDupefilter' + +SCHEDULER_PERSIST = True +SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue' + +REDIS_URL = env_dist['BILIOB_REDIS_CONNECTION_STRING'] + +LOG_FILE = "danmaku_spider.log" +LOG_LEVEL = "DEBUG" + +BOT_NAME = 'danmaku_spider' + +SPIDER_MODULES = ['danmaku_spider.spiders'] +NEWSPIDER_MODULE = 'danmaku_spider.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36' + + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +CONCURRENT_REQUESTS = 8 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +CONCURRENT_REQUESTS_PER_DOMAIN = 8 +# CONCURRENT_REQUESTS_PER_IP = 64 + +# Disable cookies (enabled by default) +COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +# } + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# 'BilibiliRankListSpider.middlewares.BilibiliranklistspiderSpiderMiddleware': 543, +# } + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# 'BilibiliRankListSpider.middlewares.BilibiliranklistspiderDownloaderMiddleware': 543, +# } + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +# } + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +# ITEM_PIPELINES = { + +# } + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' +DOWNLOAD_FAIL_ON_DATALOSS = True +RETRY_ENABLED = True diff --git a/danmaku_spider/danmaku_spider/spiders/__init__.py 
b/danmaku_spider/danmaku_spider/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/danmaku_spider/danmaku_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py b/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py new file mode 100644 index 0000000..eeaf85d --- /dev/null +++ b/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py @@ -0,0 +1,123 @@ +# coding=utf-8 +import json +import logging +import os +import re +from datetime import datetime + +import jieba +import jieba.analyse +import scrapy +from pymongo import MongoClient +from scrapy.http import Request +from scrapy_redis.spiders import RedisSpider + +from danmaku_spider.items import DanmakuAggregateItem + +env_dist = os.environ + + +class DanmakuAggregateSpider(RedisSpider): + + name = "DanmakuAggregate" + allowed_domains = ["bilibili.com"] + start_urls = [] + custom_settings = { + 'DOWNLOAD_DELAY': 1 + } + CID_API = "https://api.bilibili.com/x/web-interface/view?aid={aid}" + DANMAKU_API = "https://api.bilibili.com/x/v1/dm/list.so?oid={oid}" + + def __init__(self): + jieba.load_userdict('../biliob_analyzer/dict.txt') + self.client = MongoClient(env_dist['BILIOB_MONGO_SERVER'], 27017) + self.client.admin.authenticate(env_dist['BILIOB_MONGO_USER'], + env_dist['BILIOB_MONGO_PASSWD']) + self.db = self.client['biliob'] # 获得数据库的句柄 + self.coll = self.db['video'] # 获得collection的句柄 + + def q_to_b(self, q_str): + """全角转半角""" + b_str = "" + for uchar in q_str: + inside_code = ord(uchar) + if inside_code == 12288: # 全角空格直接转换 + inside_code = 32 + elif 65374 >= inside_code >= 65281: # 全角字符(除空格)根据关系转化 + inside_code -= 65248 + b_str += chr(inside_code) + return b_str + + def parse(self, response): + try: + j = json.loads(response.body) + if j['code'] == -403: + aid = response.url[50:] + print('https://api.bilibili.com/x/article/archives?ids={}'.format(aid)) + yield Request('https://api.bilibili.com/x/article/archives?ids={}'.format(aid), + callback=self.getCidPlanB, meta={'aid': aid}) + else: + aid = j['data']['aid'] + pages = j['data']['pages'] + for each_page in pages: + duration = each_page['duration'] + p_name = each_page['part'] + page_number = each_page['page'] + cid = each_page['cid'] + yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, + meta={'duration': duration, + 'p_name': p_name, + 'page_number': page_number, + 'aid': aid}) + except Exception as error: + # 出现错误时存入出错集合 + self.db['error'].insert_one( + {'aid': int(aid), 'url': response.url, 'error': error}) + + def getCidPlanB(self, response): + try: + aid = response.meta['aid'] + cid = json.loads(response.body)['data'][aid]['cid'] + duration = json.loads(response.body)['data'][aid]['duration'] + yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, meta={'duration': duration, 'p_name': '', 'page_number': 1, 'aid': int(aid)}) + except Exception as error: + # 出现错误时存入出错集合 + self.db['error'].insert_one( + {'aid': int(aid), 'url': response.url, 'error': error}) + + def parseDanmaku(self, response): + try: + duration = response.meta['duration'] + + # 全角转半角,转大写 + danmaku_text = self.q_to_b( + " ".join(response.xpath("d/text()").extract()).upper()) + + # 自实现太low,使用自带关键字 + word_frequency = dict(jieba.analyse.extract_tags(danmaku_text, topK=50, withWeight=True, 
allowPOS=( + 'ns', 'n', 'vn', 'v', 'nr', 'un', 'x', 'j', 'i', 'l', 'nz', 'eng', 'o'))) + # 计算弹幕密度 + danmaku_attr = list(map(lambda x: x.split( + ","), response.xpath("d/@p").extract())) + tick = duration / 50 + danmaku_density = {} + danmaku_density = [0 for i in range(50)] + for each_attr in danmaku_attr: + t = float(each_attr[0]) + if t > duration: + continue + index = int(t // tick) + danmaku_density[index] += 1 + item = DanmakuAggregateItem() + + item['aid'] = response.meta['aid'] + item['duration'] = duration + item['word_frequency'] = word_frequency + item['p_name'] = response.meta['p_name'] + item['danmaku_density'] = danmaku_density + item['page_number'] = response.meta['page_number'] + yield item + except Exception as error: + # 出现错误时存入出错集合 + self.db['error'].insert_one( + {'aid': int(response.meta['aid']), 'url': response.url, 'error': error}) diff --git a/danmaku_spider/scrapy.cfg b/danmaku_spider/scrapy.cfg new file mode 100644 index 0000000..218c209 --- /dev/null +++ b/danmaku_spider/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = danmaku_spider.settings + +[deploy] +#url = http://localhost:6800/ +project = danmaku_spider From f15714ab344a0a57799678d52f6c4025c5931f36 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 15:47:22 +0800 Subject: [PATCH 204/469] remove memory profiler --- biliob_spider/spiders/BiliobSpider.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/biliob_spider/spiders/BiliobSpider.py b/biliob_spider/spiders/BiliobSpider.py index 234f7f8..9ad61ad 100644 --- a/biliob_spider/spiders/BiliobSpider.py +++ b/biliob_spider/spiders/BiliobSpider.py @@ -6,7 +6,6 @@ import redis import scrapy -from memory_profiler import profile from pymongo import MongoClient from scrapy.http import Request from scrapy_redis.spiders import RedisSpider @@ -26,7 +25,6 @@ class BiliobSpider(scrapy.spiders.Spider): }, 'DOWNLOAD_DELAY': 10 } - @profile def __init__(self): # 链接mongoDB @@ -38,7 +36,6 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 self.redis_connection = redis.from_url(redis_connect_string) - @profile def parse(self, response): try: j = json.loads(response.body) From ddfd8c96dd0027c773e54a0d7163cd02490df44c Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 15:47:22 +0800 Subject: [PATCH 205/469] remove memory profiler --- biliob_spider/spiders/BiliobSpider.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/biliob_spider/spiders/BiliobSpider.py b/biliob_spider/spiders/BiliobSpider.py index 234f7f8..9ad61ad 100644 --- a/biliob_spider/spiders/BiliobSpider.py +++ b/biliob_spider/spiders/BiliobSpider.py @@ -6,7 +6,6 @@ import redis import scrapy -from memory_profiler import profile from pymongo import MongoClient from scrapy.http import Request from scrapy_redis.spiders import RedisSpider @@ -26,7 +25,6 @@ class BiliobSpider(scrapy.spiders.Spider): }, 'DOWNLOAD_DELAY': 10 } - @profile def __init__(self): # 链接mongoDB @@ -38,7 +36,6 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 self.redis_connection = redis.from_url(redis_connect_string) - @profile def parse(self, response): try: j = json.loads(response.body) From 9c5ec37ad83012ff90d10e20148252d276fa2425 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 15:47:22 +0800 Subject: [PATCH 206/469] remove memory profiler --- biliob_spider/spiders/BiliobSpider.py | 3 --- 1 file 
changed, 3 deletions(-) diff --git a/biliob_spider/spiders/BiliobSpider.py b/biliob_spider/spiders/BiliobSpider.py index 234f7f8..9ad61ad 100644 --- a/biliob_spider/spiders/BiliobSpider.py +++ b/biliob_spider/spiders/BiliobSpider.py @@ -6,7 +6,6 @@ import redis import scrapy -from memory_profiler import profile from pymongo import MongoClient from scrapy.http import Request from scrapy_redis.spiders import RedisSpider @@ -26,7 +25,6 @@ class BiliobSpider(scrapy.spiders.Spider): }, 'DOWNLOAD_DELAY': 10 } - @profile def __init__(self): # 链接mongoDB @@ -38,7 +36,6 @@ def __init__(self): self.coll = self.db['author'] # 获得collection的句柄 self.redis_connection = redis.from_url(redis_connect_string) - @profile def parse(self, response): try: j = json.loads(response.body) From 7b59066409a1079c4b37b34c649b04a25cbb6942 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 22:06:46 +0800 Subject: [PATCH 207/469] fix: add update time for danmaku spider --- danmaku_spider/danmaku_spider/pipelines.py | 2 +- ...nes.py.63c85e88f826a6055228b84175aa73a6.py | 44 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py diff --git a/danmaku_spider/danmaku_spider/pipelines.py b/danmaku_spider/danmaku_spider/pipelines.py index 8b07b46..ef8d4d2 100644 --- a/danmaku_spider/danmaku_spider/pipelines.py +++ b/danmaku_spider/danmaku_spider/pipelines.py @@ -36,7 +36,7 @@ def process_item(self, item, spider): 'danmaku_density': item['danmaku_density'], 'word_frequency': item['word_frequency'] }, - 'danmaku_aggregate.updatetime': datetime.datetime.now() + 'danmaku_aggregate': {'updatetime': datetime.datetime.now()} } }, True) # 刷新redis数据缓存 diff --git a/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py b/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py new file mode 100644 index 0000000..43d1f88 --- /dev/null +++ b/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html + +import datetime +import os +import sys + +import redis +from pymongo import MongoClient + +env_dist = os.environ + + +class DanmakuSpiderPipeline(object): + def __init__(self): + self.client = MongoClient(env_dist['BILIOB_MONGO_SERVER'], 27017) + self.client.admin.authenticate(env_dist['BILIOB_MONGO_USER'], + env_dist['BILIOB_MONGO_PASSWD']) + self.db = self.client['biliob'] + self.redis_connection = redis.from_url( + env_dist['BILIOB_REDIS_CONNECTION_STRING']) + + def process_item(self, item, spider): + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'danmaku_aggregate.{}'.format(item['page_number']): { + 'duration': item['duration'], + 'p_name': item['p_name'], + 'danmaku_density': item['danmaku_density'], + 'word_frequency': item['word_frequency'] + }, + 'danmaku_aggregate': {'updatetime':datetime.datetime.now()} + } + }, True) + # 刷新redis数据缓存 + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) From f6a9d491334cea3a20b86989d9d0a4778d528130 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 22:06:46 +0800 Subject: [PATCH 208/469] fix: add update time for danmaku spider --- danmaku_spider/danmaku_spider/pipelines.py | 2 +- 
...nes.py.63c85e88f826a6055228b84175aa73a6.py | 44 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py diff --git a/danmaku_spider/danmaku_spider/pipelines.py b/danmaku_spider/danmaku_spider/pipelines.py index 8b07b46..ef8d4d2 100644 --- a/danmaku_spider/danmaku_spider/pipelines.py +++ b/danmaku_spider/danmaku_spider/pipelines.py @@ -36,7 +36,7 @@ def process_item(self, item, spider): 'danmaku_density': item['danmaku_density'], 'word_frequency': item['word_frequency'] }, - 'danmaku_aggregate.updatetime': datetime.datetime.now() + 'danmaku_aggregate': {'updatetime': datetime.datetime.now()} } }, True) # 刷新redis数据缓存 diff --git a/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py b/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py new file mode 100644 index 0000000..43d1f88 --- /dev/null +++ b/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html + +import datetime +import os +import sys + +import redis +from pymongo import MongoClient + +env_dist = os.environ + + +class DanmakuSpiderPipeline(object): + def __init__(self): + self.client = MongoClient(env_dist['BILIOB_MONGO_SERVER'], 27017) + self.client.admin.authenticate(env_dist['BILIOB_MONGO_USER'], + env_dist['BILIOB_MONGO_PASSWD']) + self.db = self.client['biliob'] + self.redis_connection = redis.from_url( + env_dist['BILIOB_REDIS_CONNECTION_STRING']) + + def process_item(self, item, spider): + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'danmaku_aggregate.{}'.format(item['page_number']): { + 'duration': item['duration'], + 'p_name': item['p_name'], + 'danmaku_density': item['danmaku_density'], + 'word_frequency': item['word_frequency'] + }, + 'danmaku_aggregate': {'updatetime':datetime.datetime.now()} + } + }, True) + # 刷新redis数据缓存 + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) From fb969efe7c59b75e46b888d7125af75d2b8b324b Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 22:06:46 +0800 Subject: [PATCH 209/469] fix: add update time for danmaku spider --- danmaku_spider/danmaku_spider/pipelines.py | 2 +- ...nes.py.63c85e88f826a6055228b84175aa73a6.py | 44 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py diff --git a/danmaku_spider/danmaku_spider/pipelines.py b/danmaku_spider/danmaku_spider/pipelines.py index 8b07b46..ef8d4d2 100644 --- a/danmaku_spider/danmaku_spider/pipelines.py +++ b/danmaku_spider/danmaku_spider/pipelines.py @@ -36,7 +36,7 @@ def process_item(self, item, spider): 'danmaku_density': item['danmaku_density'], 'word_frequency': item['word_frequency'] }, - 'danmaku_aggregate.updatetime': datetime.datetime.now() + 'danmaku_aggregate': {'updatetime': datetime.datetime.now()} } }, True) # 刷新redis数据缓存 diff --git a/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py b/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py new file mode 100644 index 0000000..43d1f88 --- /dev/null +++ b/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py 
@@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html + +import datetime +import os +import sys + +import redis +from pymongo import MongoClient + +env_dist = os.environ + + +class DanmakuSpiderPipeline(object): + def __init__(self): + self.client = MongoClient(env_dist['BILIOB_MONGO_SERVER'], 27017) + self.client.admin.authenticate(env_dist['BILIOB_MONGO_USER'], + env_dist['BILIOB_MONGO_PASSWD']) + self.db = self.client['biliob'] + self.redis_connection = redis.from_url( + env_dist['BILIOB_REDIS_CONNECTION_STRING']) + + def process_item(self, item, spider): + self.coll = self.db['video'] + self.coll.update_one({ + 'aid': int(item['aid']) + }, { + '$set': { + 'danmaku_aggregate.{}'.format(item['page_number']): { + 'duration': item['duration'], + 'p_name': item['p_name'], + 'danmaku_density': item['danmaku_density'], + 'word_frequency': item['word_frequency'] + }, + 'danmaku_aggregate': {'updatetime':datetime.datetime.now()} + } + }, True) + # 刷新redis数据缓存 + self.redis_connection.delete( + "video_detail::{}".format(item['aid'])) From 2b77b9e403a5c46b4ab1ee3c40479b241c0c4604 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 23:30:28 +0800 Subject: [PATCH 210/469] fix: danmaku update time --- danmaku_spider/danmaku_spider/pipelines.py | 2 +- ...nes.py.63c85e88f826a6055228b84175aa73a6.py | 44 ------------------- 2 files changed, 1 insertion(+), 45 deletions(-) delete mode 100644 danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py diff --git a/danmaku_spider/danmaku_spider/pipelines.py b/danmaku_spider/danmaku_spider/pipelines.py index ef8d4d2..140e66c 100644 --- a/danmaku_spider/danmaku_spider/pipelines.py +++ b/danmaku_spider/danmaku_spider/pipelines.py @@ -36,7 +36,7 @@ def process_item(self, item, spider): 'danmaku_density': item['danmaku_density'], 'word_frequency': item['word_frequency'] }, - 'danmaku_aggregate': {'updatetime': datetime.datetime.now()} + 'danmaku_aggregate.updatetime': datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S') } }, True) # 刷新redis数据缓存 diff --git a/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py b/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py deleted file mode 100644 index 43d1f88..0000000 --- a/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py +++ /dev/null @@ -1,44 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html - -import datetime -import os -import sys - -import redis -from pymongo import MongoClient - -env_dist = os.environ - - -class DanmakuSpiderPipeline(object): - def __init__(self): - self.client = MongoClient(env_dist['BILIOB_MONGO_SERVER'], 27017) - self.client.admin.authenticate(env_dist['BILIOB_MONGO_USER'], - env_dist['BILIOB_MONGO_PASSWD']) - self.db = self.client['biliob'] - self.redis_connection = redis.from_url( - env_dist['BILIOB_REDIS_CONNECTION_STRING']) - - def process_item(self, item, spider): - self.coll = self.db['video'] - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'danmaku_aggregate.{}'.format(item['page_number']): { - 'duration': item['duration'], - 'p_name': item['p_name'], - 'danmaku_density': item['danmaku_density'], - 'word_frequency': 
item['word_frequency'] - }, - 'danmaku_aggregate': {'updatetime':datetime.datetime.now()} - } - }, True) - # 刷新redis数据缓存 - self.redis_connection.delete( - "video_detail::{}".format(item['aid'])) From 5994f9a48571b91eef12a5b83cd894bbdba8cf04 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 23:30:28 +0800 Subject: [PATCH 211/469] fix: danmaku update time --- danmaku_spider/danmaku_spider/pipelines.py | 2 +- ...nes.py.63c85e88f826a6055228b84175aa73a6.py | 44 ------------------- 2 files changed, 1 insertion(+), 45 deletions(-) delete mode 100644 danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py diff --git a/danmaku_spider/danmaku_spider/pipelines.py b/danmaku_spider/danmaku_spider/pipelines.py index ef8d4d2..140e66c 100644 --- a/danmaku_spider/danmaku_spider/pipelines.py +++ b/danmaku_spider/danmaku_spider/pipelines.py @@ -36,7 +36,7 @@ def process_item(self, item, spider): 'danmaku_density': item['danmaku_density'], 'word_frequency': item['word_frequency'] }, - 'danmaku_aggregate': {'updatetime': datetime.datetime.now()} + 'danmaku_aggregate.updatetime': datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S') } }, True) # 刷新redis数据缓存 diff --git a/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py b/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py deleted file mode 100644 index 43d1f88..0000000 --- a/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py +++ /dev/null @@ -1,44 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html - -import datetime -import os -import sys - -import redis -from pymongo import MongoClient - -env_dist = os.environ - - -class DanmakuSpiderPipeline(object): - def __init__(self): - self.client = MongoClient(env_dist['BILIOB_MONGO_SERVER'], 27017) - self.client.admin.authenticate(env_dist['BILIOB_MONGO_USER'], - env_dist['BILIOB_MONGO_PASSWD']) - self.db = self.client['biliob'] - self.redis_connection = redis.from_url( - env_dist['BILIOB_REDIS_CONNECTION_STRING']) - - def process_item(self, item, spider): - self.coll = self.db['video'] - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'danmaku_aggregate.{}'.format(item['page_number']): { - 'duration': item['duration'], - 'p_name': item['p_name'], - 'danmaku_density': item['danmaku_density'], - 'word_frequency': item['word_frequency'] - }, - 'danmaku_aggregate': {'updatetime':datetime.datetime.now()} - } - }, True) - # 刷新redis数据缓存 - self.redis_connection.delete( - "video_detail::{}".format(item['aid'])) From a7d2d5b9e4c16fd7e6e47f138f89523288bd1151 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 27 Feb 2019 23:30:28 +0800 Subject: [PATCH 212/469] fix: danmaku update time --- danmaku_spider/danmaku_spider/pipelines.py | 2 +- ...nes.py.63c85e88f826a6055228b84175aa73a6.py | 44 ------------------- 2 files changed, 1 insertion(+), 45 deletions(-) delete mode 100644 danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py diff --git a/danmaku_spider/danmaku_spider/pipelines.py b/danmaku_spider/danmaku_spider/pipelines.py index ef8d4d2..140e66c 100644 --- a/danmaku_spider/danmaku_spider/pipelines.py +++ b/danmaku_spider/danmaku_spider/pipelines.py @@ -36,7 +36,7 @@ def process_item(self, item, spider): 'danmaku_density': item['danmaku_density'], 'word_frequency': item['word_frequency'] }, - 
'danmaku_aggregate': {'updatetime': datetime.datetime.now()} + 'danmaku_aggregate.updatetime': datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S') } }, True) # 刷新redis数据缓存 diff --git a/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py b/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py deleted file mode 100644 index 43d1f88..0000000 --- a/danmaku_spider/danmaku_spider/pipelines.py.63c85e88f826a6055228b84175aa73a6.py +++ /dev/null @@ -1,44 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html - -import datetime -import os -import sys - -import redis -from pymongo import MongoClient - -env_dist = os.environ - - -class DanmakuSpiderPipeline(object): - def __init__(self): - self.client = MongoClient(env_dist['BILIOB_MONGO_SERVER'], 27017) - self.client.admin.authenticate(env_dist['BILIOB_MONGO_USER'], - env_dist['BILIOB_MONGO_PASSWD']) - self.db = self.client['biliob'] - self.redis_connection = redis.from_url( - env_dist['BILIOB_REDIS_CONNECTION_STRING']) - - def process_item(self, item, spider): - self.coll = self.db['video'] - self.coll.update_one({ - 'aid': int(item['aid']) - }, { - '$set': { - 'danmaku_aggregate.{}'.format(item['page_number']): { - 'duration': item['duration'], - 'p_name': item['p_name'], - 'danmaku_density': item['danmaku_density'], - 'word_frequency': item['word_frequency'] - }, - 'danmaku_aggregate': {'updatetime':datetime.datetime.now()} - } - }, True) - # 刷新redis数据缓存 - self.redis_connection.delete( - "video_detail::{}".format(item['aid'])) From 31faaf6923a29a75da1f9944cae4cdfd5be21460 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 3 Mar 2019 00:18:00 +0800 Subject: [PATCH 213/469] feature: add_keywords and rank update --- biliob_analyzer/add_keyword.py | 33 +++++ biliob_analyzer/author_rank.py | 22 +++- biliob_analyzer/dict.txt | 229 ++++++++++++++++++++++++++++++++- biliob_analyzer/video_rank.py | 18 ++- 4 files changed, 292 insertions(+), 10 deletions(-) diff --git a/biliob_analyzer/add_keyword.py b/biliob_analyzer/add_keyword.py index d18f359..b7d28ff 100644 --- a/biliob_analyzer/add_keyword.py +++ b/biliob_analyzer/add_keyword.py @@ -11,6 +11,7 @@ class AddKeyword(): def __init__(self): self.mongo_author = db['author'] self.mongo_video = db['video'] + self.mongo_word = db['search_word'] def get_video_kw_list(self, aid): # 关键字从name和official中提取 @@ -90,3 +91,35 @@ def add_all_video(self): for each_video in videos: aid = each_video['aid'] self.add_video_kw(aid) + + def refresh_all_author(self): + authors = self.mongo_author.find( + {}, {'_id': 0, 'mid': 1}) + for each_author in authors: + mid = each_author['mid'] + self.add_author_kw(mid) + + def refresh_all_video(self): + videos = self.mongo_video.find( + {}, {'_id': 0, 'aid': 1}) + for each_video in videos: + aid = each_video['aid'] + self.add_video_kw(aid) + + def add_omitted(self): + d = open('./biliob_analyzer/dict.txt', 'r', + encoding='utf8').read().split('\n') + for each in self.mongo_word.find(): + if 'aid' in each and each['aid'] not in d: + d.append(each['aid']) + elif 'mid' in each and each['mid'] not in d: + d.append(each['mid']) + pass + pass + o = open('./biliob_analyzer/dict.txt', + 'w', encoding='utf8', newline='') + for each in d: + o.write(each+'\n') + o.close() + self.refresh_all_video() + self.refresh_all_author() diff --git a/biliob_analyzer/author_rank.py 
b/biliob_analyzer/author_rank.py index 43f463d..323a5cd 100644 --- a/biliob_analyzer/author_rank.py +++ b/biliob_analyzer/author_rank.py @@ -8,44 +8,60 @@ logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') logger = logging.getLogger(__name__) + + +def format_p_rank(i, count): + return round(i/count * 100, 2) + + for each_key in ['cFans', 'cArchive_view', 'cArticle_view']: logger.info("开始计算作者{}排名".format(each_key)) i = 1 + count = coll.find().count() authors = coll.find({each_key: {'$exists': 1}}, {'mid': 1, 'rank': 1, each_key: 1}).batch_size( 300).sort(each_key, DESCENDING) if each_key == 'cFans': each_rank = 'fansRank' each_d_rank = 'dFansRank' + each_p_rank = 'pFansRank' elif each_key == 'cArchive_view': each_rank = 'archiveViewRank' each_d_rank = 'dArchiveViewRank' + each_p_rank = 'pArchiveViewRank' elif each_key == 'cArticle_view': each_rank = 'articleViewRank' each_d_rank = 'dArticleViewRank' + each_p_rank = 'pArticleViewRank' for each_author in authors: # 如果没有data 直接下一个 if each_key in each_author: + # 如果已经计算过rank if 'rank' in each_author: rank = each_author['rank'] if each_rank in each_author['rank']: rank[each_d_rank] = each_author['rank'][each_rank] - i else: - rank[each_d_rank] = -1 + rank[each_d_rank] = 0 rank[each_rank] = i + rank[each_p_rank] = format_p_rank(i, count) else: + # 初始化 rank = { each_rank: i, - each_d_rank: -1 + each_d_rank: 0, + each_p_rank: format_p_rank(i, count) } if each_author[each_key] == 0: if 'rank' in each_author: rank = each_author['rank'] rank[each_d_rank] = 0 rank[each_rank] = -1 + rank[each_p_rank] = -1 else: rank = { each_rank: -1, - each_d_rank: 0 + each_d_rank: 0, + each_p_rank: -1 } if each_key == 'cArticle_view': rank['updateTime'] = datetime.datetime.now() diff --git a/biliob_analyzer/dict.txt b/biliob_analyzer/dict.txt index 496402b..2eef132 100644 --- a/biliob_analyzer/dict.txt +++ b/biliob_analyzer/dict.txt @@ -1,2 +1,229 @@ 高能联盟 -lex \ No newline at end of file +lex +辉夜 +Jannchie +大小姐 +huiyeda +辉夜大小姐 +豆砸 +力元 +boardki +非著名 +哇哇哇 +杆菌无敌 +猪桥 +六道 +帕里 +lao +关口知宏 +懒人 +舞秋风 +林北是 +林北是国民 +null +斑鸠心平气和 +斑鸠心平气和every +day +千一的弟弟 +千 +【李云龙 +【李云龙】chus +【李云龙】出山 +czw +采紫葳的 +泛科文计划 +kirakira时尚酱 +哔斯卡金像奖 +8K8K +zhe +这是di +lang +6道 +凯 +凯wen +av42321016 +记录生活 +记录 +猪侨 +竖琴小 +鼠标买手火云 +uid +16877795 +av44999149 +diytale +2018bilibili新年mv《再来一杯》 +爆博士 +镇魂曲的夜空 +tlan +tlang +爱做饭的芋头SAMA +爱做饭的芋头 +av706 +【東方】bad +apple!! 
+pv【影絵】 +vivo +rongyaoshou +404NTFOUND +404NTFounD +锡兰cey +not +问 +来 +xun +Damn +面筋哥 +huitailangbango +gaig +拜年祭 +2019拜年祭 +sh +兽耳 +y'r'x'zhe +yi're +一人xing +一人行者 +早 +大胖和 +hongg +蒂姆嘟嘟 +baoh +baojia +阿兔 +孟xiao +孟晓 +meng'xiao'jie +指法芬芳 +10后 +av108 +av10872 +av810872 +baozouba +暴走吧 +水一 +蛋黄 +zhanghaoyizhuxio +账号已注销 +zuiqiangdanao +最强大脑 +quan'yu +zi +兹 +紫jia +哔哩哔哩活动娘 +徐大下咯 +徐大咯 +中国人口 +神奇的 +bao +爆bo's +d爆博士 +tebiet +特别特别s +特别特别帅的dings +特别特别帅的定时 +半脸的男人 +2017拜年祭 +日常00 +哔哩哔哩 +labi +散避awa +念诗之王 +玄 +豆砸ow +阿良良 +阿良良木 +a +tar +动态活动 +荒唐的 +荒唐的loser +恐恐 +z君 +feng huang yu +凤凰yuan +凤凰院 +欣小萌 +阿齐一 +阿神 +勾指起誓 +白上吹 +白上吹雪 +冠世一战 +辉夜daxiaojie +辉夜大小姐想让 +tan +tankaoniurou +tankaoniuroutan +炭烤牛肉碳 +lanpi +蓝皮jun +蓝皮君 +蓝皮 +奋 +奋婧 +奋婧仔 +环海航行 +幻海航行 +文曰xiao'qiang +龙哥 +华农兄弟 +jojo21 +小熊fli +曲之 +曲之海洋 +xi +xin'xiao'meng +tian_yi +欣小萌- +多拉的 +多拉的十九 +多拉的十九n +爱吃饭的 +爱吃饭的芋头 +爱做饭的 +爱做饭的芋头SA +爱做饭的芋头SAMa +fff团 +菜到 +zhi +知君 +liyu +liyuanjun +力元军 +力元j +lie +蓝毛 +小蓝毛鸭 +小藍毛 +我是c +我是 +我是菜头 +lf +菜来到 +菜来道 +万物30秒 +地下11楼 +地下11楼的 +地下11楼的森 +李大神 +10后找人 +lds +lds李大神 +源 +bingbing +饼饼饼呢 +so +sone钊哥 +芋 +改革春风 +liy +sha +山之 +小狼xiol +ya'yi +雅音g +雅音gong'yu +雅音宫羽 +黑板报 +av21714693 diff --git a/biliob_analyzer/video_rank.py b/biliob_analyzer/video_rank.py index c7c3622..c96f423 100644 --- a/biliob_analyzer/video_rank.py +++ b/biliob_analyzer/video_rank.py @@ -5,6 +5,10 @@ from pymongo import DESCENDING +def format_p_rank(i, count): + return round(i / count * 100, 2) + + def computeVideoRank(): coll = db['video'] # 获得collection的句柄 @@ -22,12 +26,10 @@ def computeVideoRank(): i = 1 videos = coll.find({each_key: {'$exists': 1}}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( 300).sort(each_key, DESCENDING) - if each_key == 'cView': - each_rank = 'cViewRank' - each_d_rank = 'dViewRank' - each_rank = each_key + 'Rank' each_d_rank = 'd' + each_key[1:] + 'Rank' + each_p_rank = 'p' + each_key[1:] + 'Rank' + count = coll.find().count() for each_video in videos: # 如果没有data 直接下一个 @@ -39,20 +41,24 @@ def computeVideoRank(): else: rank[each_d_rank] = -1 rank[each_rank] = i + rank[each_p_rank] = format_p_rank(i, count) else: rank = { each_rank: i, - each_d_rank: -1 + each_d_rank: -1, + each_p_rank: format_p_rank(i, count) } if each_video[each_key] == 0: if 'rank' in each_video: rank = each_video['rank'] rank[each_d_rank] = 0 rank[each_rank] = -1 + rank[each_p_rank] = -1 else: rank = { each_rank: -1, - each_d_rank: 0 + each_d_rank: 0, + each_p_rank: -1 } if each_key == keys[-1]: rank['updateTime'] = datetime.datetime.now() From 92a9d3a1a79fd71abcc3e2973636ec360c246e93 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 3 Mar 2019 00:18:00 +0800 Subject: [PATCH 214/469] feature: add_keywords and rank update --- biliob_analyzer/add_keyword.py | 33 +++++ biliob_analyzer/author_rank.py | 22 +++- biliob_analyzer/dict.txt | 229 ++++++++++++++++++++++++++++++++- biliob_analyzer/video_rank.py | 18 ++- 4 files changed, 292 insertions(+), 10 deletions(-) diff --git a/biliob_analyzer/add_keyword.py b/biliob_analyzer/add_keyword.py index d18f359..b7d28ff 100644 --- a/biliob_analyzer/add_keyword.py +++ b/biliob_analyzer/add_keyword.py @@ -11,6 +11,7 @@ class AddKeyword(): def __init__(self): self.mongo_author = db['author'] self.mongo_video = db['video'] + self.mongo_word = db['search_word'] def get_video_kw_list(self, aid): # 关键字从name和official中提取 @@ -90,3 +91,35 @@ def add_all_video(self): for each_video in videos: aid = each_video['aid'] self.add_video_kw(aid) + + def refresh_all_author(self): + authors = self.mongo_author.find( + {}, {'_id': 0, 'mid': 1}) + for each_author in authors: + 
mid = each_author['mid'] + self.add_author_kw(mid) + + def refresh_all_video(self): + videos = self.mongo_video.find( + {}, {'_id': 0, 'aid': 1}) + for each_video in videos: + aid = each_video['aid'] + self.add_video_kw(aid) + + def add_omitted(self): + d = open('./biliob_analyzer/dict.txt', 'r', + encoding='utf8').read().split('\n') + for each in self.mongo_word.find(): + if 'aid' in each and each['aid'] not in d: + d.append(each['aid']) + elif 'mid' in each and each['mid'] not in d: + d.append(each['mid']) + pass + pass + o = open('./biliob_analyzer/dict.txt', + 'w', encoding='utf8', newline='') + for each in d: + o.write(each+'\n') + o.close() + self.refresh_all_video() + self.refresh_all_author() diff --git a/biliob_analyzer/author_rank.py b/biliob_analyzer/author_rank.py index 43f463d..323a5cd 100644 --- a/biliob_analyzer/author_rank.py +++ b/biliob_analyzer/author_rank.py @@ -8,44 +8,60 @@ logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') logger = logging.getLogger(__name__) + + +def format_p_rank(i, count): + return round(i/count * 100, 2) + + for each_key in ['cFans', 'cArchive_view', 'cArticle_view']: logger.info("开始计算作者{}排名".format(each_key)) i = 1 + count = coll.find().count() authors = coll.find({each_key: {'$exists': 1}}, {'mid': 1, 'rank': 1, each_key: 1}).batch_size( 300).sort(each_key, DESCENDING) if each_key == 'cFans': each_rank = 'fansRank' each_d_rank = 'dFansRank' + each_p_rank = 'pFansRank' elif each_key == 'cArchive_view': each_rank = 'archiveViewRank' each_d_rank = 'dArchiveViewRank' + each_p_rank = 'pArchiveViewRank' elif each_key == 'cArticle_view': each_rank = 'articleViewRank' each_d_rank = 'dArticleViewRank' + each_p_rank = 'pArticleViewRank' for each_author in authors: # 如果没有data 直接下一个 if each_key in each_author: + # 如果已经计算过rank if 'rank' in each_author: rank = each_author['rank'] if each_rank in each_author['rank']: rank[each_d_rank] = each_author['rank'][each_rank] - i else: - rank[each_d_rank] = -1 + rank[each_d_rank] = 0 rank[each_rank] = i + rank[each_p_rank] = format_p_rank(i, count) else: + # 初始化 rank = { each_rank: i, - each_d_rank: -1 + each_d_rank: 0, + each_p_rank: format_p_rank(i, count) } if each_author[each_key] == 0: if 'rank' in each_author: rank = each_author['rank'] rank[each_d_rank] = 0 rank[each_rank] = -1 + rank[each_p_rank] = -1 else: rank = { each_rank: -1, - each_d_rank: 0 + each_d_rank: 0, + each_p_rank: -1 } if each_key == 'cArticle_view': rank['updateTime'] = datetime.datetime.now() diff --git a/biliob_analyzer/dict.txt b/biliob_analyzer/dict.txt index 496402b..2eef132 100644 --- a/biliob_analyzer/dict.txt +++ b/biliob_analyzer/dict.txt @@ -1,2 +1,229 @@ 高能联盟 -lex \ No newline at end of file +lex +辉夜 +Jannchie +大小姐 +huiyeda +辉夜大小姐 +豆砸 +力元 +boardki +非著名 +哇哇哇 +杆菌无敌 +猪桥 +六道 +帕里 +lao +关口知宏 +懒人 +舞秋风 +林北是 +林北是国民 +null +斑鸠心平气和 +斑鸠心平气和every +day +千一的弟弟 +千 +【李云龙 +【李云龙】chus +【李云龙】出山 +czw +采紫葳的 +泛科文计划 +kirakira时尚酱 +哔斯卡金像奖 +8K8K +zhe +这是di +lang +6道 +凯 +凯wen +av42321016 +记录生活 +记录 +猪侨 +竖琴小 +鼠标买手火云 +uid +16877795 +av44999149 +diytale +2018bilibili新年mv《再来一杯》 +爆博士 +镇魂曲的夜空 +tlan +tlang +爱做饭的芋头SAMA +爱做饭的芋头 +av706 +【東方】bad +apple!! 
+pv【影絵】 +vivo +rongyaoshou +404NTFOUND +404NTFounD +锡兰cey +not +问 +来 +xun +Damn +面筋哥 +huitailangbango +gaig +拜年祭 +2019拜年祭 +sh +兽耳 +y'r'x'zhe +yi're +一人xing +一人行者 +早 +大胖和 +hongg +蒂姆嘟嘟 +baoh +baojia +阿兔 +孟xiao +孟晓 +meng'xiao'jie +指法芬芳 +10后 +av108 +av10872 +av810872 +baozouba +暴走吧 +水一 +蛋黄 +zhanghaoyizhuxio +账号已注销 +zuiqiangdanao +最强大脑 +quan'yu +zi +兹 +紫jia +哔哩哔哩活动娘 +徐大下咯 +徐大咯 +中国人口 +神奇的 +bao +爆bo's +d爆博士 +tebiet +特别特别s +特别特别帅的dings +特别特别帅的定时 +半脸的男人 +2017拜年祭 +日常00 +哔哩哔哩 +labi +散避awa +念诗之王 +玄 +豆砸ow +阿良良 +阿良良木 +a +tar +动态活动 +荒唐的 +荒唐的loser +恐恐 +z君 +feng huang yu +凤凰yuan +凤凰院 +欣小萌 +阿齐一 +阿神 +勾指起誓 +白上吹 +白上吹雪 +冠世一战 +辉夜daxiaojie +辉夜大小姐想让 +tan +tankaoniurou +tankaoniuroutan +炭烤牛肉碳 +lanpi +蓝皮jun +蓝皮君 +蓝皮 +奋 +奋婧 +奋婧仔 +环海航行 +幻海航行 +文曰xiao'qiang +龙哥 +华农兄弟 +jojo21 +小熊fli +曲之 +曲之海洋 +xi +xin'xiao'meng +tian_yi +欣小萌- +多拉的 +多拉的十九 +多拉的十九n +爱吃饭的 +爱吃饭的芋头 +爱做饭的 +爱做饭的芋头SA +爱做饭的芋头SAMa +fff团 +菜到 +zhi +知君 +liyu +liyuanjun +力元军 +力元j +lie +蓝毛 +小蓝毛鸭 +小藍毛 +我是c +我是 +我是菜头 +lf +菜来到 +菜来道 +万物30秒 +地下11楼 +地下11楼的 +地下11楼的森 +李大神 +10后找人 +lds +lds李大神 +源 +bingbing +饼饼饼呢 +so +sone钊哥 +芋 +改革春风 +liy +sha +山之 +小狼xiol +ya'yi +雅音g +雅音gong'yu +雅音宫羽 +黑板报 +av21714693 diff --git a/biliob_analyzer/video_rank.py b/biliob_analyzer/video_rank.py index c7c3622..c96f423 100644 --- a/biliob_analyzer/video_rank.py +++ b/biliob_analyzer/video_rank.py @@ -5,6 +5,10 @@ from pymongo import DESCENDING +def format_p_rank(i, count): + return round(i / count * 100, 2) + + def computeVideoRank(): coll = db['video'] # 获得collection的句柄 @@ -22,12 +26,10 @@ def computeVideoRank(): i = 1 videos = coll.find({each_key: {'$exists': 1}}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( 300).sort(each_key, DESCENDING) - if each_key == 'cView': - each_rank = 'cViewRank' - each_d_rank = 'dViewRank' - each_rank = each_key + 'Rank' each_d_rank = 'd' + each_key[1:] + 'Rank' + each_p_rank = 'p' + each_key[1:] + 'Rank' + count = coll.find().count() for each_video in videos: # 如果没有data 直接下一个 @@ -39,20 +41,24 @@ def computeVideoRank(): else: rank[each_d_rank] = -1 rank[each_rank] = i + rank[each_p_rank] = format_p_rank(i, count) else: rank = { each_rank: i, - each_d_rank: -1 + each_d_rank: -1, + each_p_rank: format_p_rank(i, count) } if each_video[each_key] == 0: if 'rank' in each_video: rank = each_video['rank'] rank[each_d_rank] = 0 rank[each_rank] = -1 + rank[each_p_rank] = -1 else: rank = { each_rank: -1, - each_d_rank: 0 + each_d_rank: 0, + each_p_rank: -1 } if each_key == keys[-1]: rank['updateTime'] = datetime.datetime.now() From 7ab30863cfe3529d35ccd8f039eb89f09056dce0 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 3 Mar 2019 00:18:00 +0800 Subject: [PATCH 215/469] feature: add_keywords and rank update --- biliob_analyzer/add_keyword.py | 33 +++++ biliob_analyzer/author_rank.py | 22 +++- biliob_analyzer/dict.txt | 229 ++++++++++++++++++++++++++++++++- biliob_analyzer/video_rank.py | 18 ++- 4 files changed, 292 insertions(+), 10 deletions(-) diff --git a/biliob_analyzer/add_keyword.py b/biliob_analyzer/add_keyword.py index d18f359..b7d28ff 100644 --- a/biliob_analyzer/add_keyword.py +++ b/biliob_analyzer/add_keyword.py @@ -11,6 +11,7 @@ class AddKeyword(): def __init__(self): self.mongo_author = db['author'] self.mongo_video = db['video'] + self.mongo_word = db['search_word'] def get_video_kw_list(self, aid): # 关键字从name和official中提取 @@ -90,3 +91,35 @@ def add_all_video(self): for each_video in videos: aid = each_video['aid'] self.add_video_kw(aid) + + def refresh_all_author(self): + authors = self.mongo_author.find( + {}, {'_id': 0, 'mid': 1}) + for each_author in authors: + 
mid = each_author['mid'] + self.add_author_kw(mid) + + def refresh_all_video(self): + videos = self.mongo_video.find( + {}, {'_id': 0, 'aid': 1}) + for each_video in videos: + aid = each_video['aid'] + self.add_video_kw(aid) + + def add_omitted(self): + d = open('./biliob_analyzer/dict.txt', 'r', + encoding='utf8').read().split('\n') + for each in self.mongo_word.find(): + if 'aid' in each and each['aid'] not in d: + d.append(each['aid']) + elif 'mid' in each and each['mid'] not in d: + d.append(each['mid']) + pass + pass + o = open('./biliob_analyzer/dict.txt', + 'w', encoding='utf8', newline='') + for each in d: + o.write(each+'\n') + o.close() + self.refresh_all_video() + self.refresh_all_author() diff --git a/biliob_analyzer/author_rank.py b/biliob_analyzer/author_rank.py index 43f463d..323a5cd 100644 --- a/biliob_analyzer/author_rank.py +++ b/biliob_analyzer/author_rank.py @@ -8,44 +8,60 @@ logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s @ %(name)s: %(message)s') logger = logging.getLogger(__name__) + + +def format_p_rank(i, count): + return round(i/count * 100, 2) + + for each_key in ['cFans', 'cArchive_view', 'cArticle_view']: logger.info("开始计算作者{}排名".format(each_key)) i = 1 + count = coll.find().count() authors = coll.find({each_key: {'$exists': 1}}, {'mid': 1, 'rank': 1, each_key: 1}).batch_size( 300).sort(each_key, DESCENDING) if each_key == 'cFans': each_rank = 'fansRank' each_d_rank = 'dFansRank' + each_p_rank = 'pFansRank' elif each_key == 'cArchive_view': each_rank = 'archiveViewRank' each_d_rank = 'dArchiveViewRank' + each_p_rank = 'pArchiveViewRank' elif each_key == 'cArticle_view': each_rank = 'articleViewRank' each_d_rank = 'dArticleViewRank' + each_p_rank = 'pArticleViewRank' for each_author in authors: # 如果没有data 直接下一个 if each_key in each_author: + # 如果已经计算过rank if 'rank' in each_author: rank = each_author['rank'] if each_rank in each_author['rank']: rank[each_d_rank] = each_author['rank'][each_rank] - i else: - rank[each_d_rank] = -1 + rank[each_d_rank] = 0 rank[each_rank] = i + rank[each_p_rank] = format_p_rank(i, count) else: + # 初始化 rank = { each_rank: i, - each_d_rank: -1 + each_d_rank: 0, + each_p_rank: format_p_rank(i, count) } if each_author[each_key] == 0: if 'rank' in each_author: rank = each_author['rank'] rank[each_d_rank] = 0 rank[each_rank] = -1 + rank[each_p_rank] = -1 else: rank = { each_rank: -1, - each_d_rank: 0 + each_d_rank: 0, + each_p_rank: -1 } if each_key == 'cArticle_view': rank['updateTime'] = datetime.datetime.now() diff --git a/biliob_analyzer/dict.txt b/biliob_analyzer/dict.txt index 496402b..2eef132 100644 --- a/biliob_analyzer/dict.txt +++ b/biliob_analyzer/dict.txt @@ -1,2 +1,229 @@ 高能联盟 -lex \ No newline at end of file +lex +辉夜 +Jannchie +大小姐 +huiyeda +辉夜大小姐 +豆砸 +力元 +boardki +非著名 +哇哇哇 +杆菌无敌 +猪桥 +六道 +帕里 +lao +关口知宏 +懒人 +舞秋风 +林北是 +林北是国民 +null +斑鸠心平气和 +斑鸠心平气和every +day +千一的弟弟 +千 +【李云龙 +【李云龙】chus +【李云龙】出山 +czw +采紫葳的 +泛科文计划 +kirakira时尚酱 +哔斯卡金像奖 +8K8K +zhe +这是di +lang +6道 +凯 +凯wen +av42321016 +记录生活 +记录 +猪侨 +竖琴小 +鼠标买手火云 +uid +16877795 +av44999149 +diytale +2018bilibili新年mv《再来一杯》 +爆博士 +镇魂曲的夜空 +tlan +tlang +爱做饭的芋头SAMA +爱做饭的芋头 +av706 +【東方】bad +apple!! 
+pv【影絵】 +vivo +rongyaoshou +404NTFOUND +404NTFounD +锡兰cey +not +问 +来 +xun +Damn +面筋哥 +huitailangbango +gaig +拜年祭 +2019拜年祭 +sh +兽耳 +y'r'x'zhe +yi're +一人xing +一人行者 +早 +大胖和 +hongg +蒂姆嘟嘟 +baoh +baojia +阿兔 +孟xiao +孟晓 +meng'xiao'jie +指法芬芳 +10后 +av108 +av10872 +av810872 +baozouba +暴走吧 +水一 +蛋黄 +zhanghaoyizhuxio +账号已注销 +zuiqiangdanao +最强大脑 +quan'yu +zi +兹 +紫jia +哔哩哔哩活动娘 +徐大下咯 +徐大咯 +中国人口 +神奇的 +bao +爆bo's +d爆博士 +tebiet +特别特别s +特别特别帅的dings +特别特别帅的定时 +半脸的男人 +2017拜年祭 +日常00 +哔哩哔哩 +labi +散避awa +念诗之王 +玄 +豆砸ow +阿良良 +阿良良木 +a +tar +动态活动 +荒唐的 +荒唐的loser +恐恐 +z君 +feng huang yu +凤凰yuan +凤凰院 +欣小萌 +阿齐一 +阿神 +勾指起誓 +白上吹 +白上吹雪 +冠世一战 +辉夜daxiaojie +辉夜大小姐想让 +tan +tankaoniurou +tankaoniuroutan +炭烤牛肉碳 +lanpi +蓝皮jun +蓝皮君 +蓝皮 +奋 +奋婧 +奋婧仔 +环海航行 +幻海航行 +文曰xiao'qiang +龙哥 +华农兄弟 +jojo21 +小熊fli +曲之 +曲之海洋 +xi +xin'xiao'meng +tian_yi +欣小萌- +多拉的 +多拉的十九 +多拉的十九n +爱吃饭的 +爱吃饭的芋头 +爱做饭的 +爱做饭的芋头SA +爱做饭的芋头SAMa +fff团 +菜到 +zhi +知君 +liyu +liyuanjun +力元军 +力元j +lie +蓝毛 +小蓝毛鸭 +小藍毛 +我是c +我是 +我是菜头 +lf +菜来到 +菜来道 +万物30秒 +地下11楼 +地下11楼的 +地下11楼的森 +李大神 +10后找人 +lds +lds李大神 +源 +bingbing +饼饼饼呢 +so +sone钊哥 +芋 +改革春风 +liy +sha +山之 +小狼xiol +ya'yi +雅音g +雅音gong'yu +雅音宫羽 +黑板报 +av21714693 diff --git a/biliob_analyzer/video_rank.py b/biliob_analyzer/video_rank.py index c7c3622..c96f423 100644 --- a/biliob_analyzer/video_rank.py +++ b/biliob_analyzer/video_rank.py @@ -5,6 +5,10 @@ from pymongo import DESCENDING +def format_p_rank(i, count): + return round(i / count * 100, 2) + + def computeVideoRank(): coll = db['video'] # 获得collection的句柄 @@ -22,12 +26,10 @@ def computeVideoRank(): i = 1 videos = coll.find({each_key: {'$exists': 1}}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( 300).sort(each_key, DESCENDING) - if each_key == 'cView': - each_rank = 'cViewRank' - each_d_rank = 'dViewRank' - each_rank = each_key + 'Rank' each_d_rank = 'd' + each_key[1:] + 'Rank' + each_p_rank = 'p' + each_key[1:] + 'Rank' + count = coll.find().count() for each_video in videos: # 如果没有data 直接下一个 @@ -39,20 +41,24 @@ def computeVideoRank(): else: rank[each_d_rank] = -1 rank[each_rank] = i + rank[each_p_rank] = format_p_rank(i, count) else: rank = { each_rank: i, - each_d_rank: -1 + each_d_rank: -1, + each_p_rank: format_p_rank(i, count) } if each_video[each_key] == 0: if 'rank' in each_video: rank = each_video['rank'] rank[each_d_rank] = 0 rank[each_rank] = -1 + rank[each_p_rank] = -1 else: rank = { each_rank: -1, - each_d_rank: 0 + each_d_rank: 0, + each_p_rank: -1 } if each_key == keys[-1]: rank['updateTime'] = datetime.datetime.now() From 831fe833b32b522ccce6539e95bde6e3b60e68cf Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 5 Mar 2019 17:37:14 +0800 Subject: [PATCH 216/469] feature: add sleep --- biliob_analyzer/add_credit.py | 3 ++ biliob_analyzer/add_keyword.py | 22 +++++---- biliob_analyzer/del_data.py | 27 ++++++++++ biliob_analyzer/dict.txt | 90 ++++++++++++++++++++++++++++++++++ biliob_analyzer/video_rank.py | 12 +++-- run_add_kw.py | 5 +- run_analyzer.py | 10 ++-- 7 files changed, 150 insertions(+), 19 deletions(-) create mode 100644 biliob_analyzer/add_credit.py create mode 100644 biliob_analyzer/del_data.py diff --git a/biliob_analyzer/add_credit.py b/biliob_analyzer/add_credit.py new file mode 100644 index 0000000..9fb8d4a --- /dev/null +++ b/biliob_analyzer/add_credit.py @@ -0,0 +1,3 @@ +from db import db +u = db['user'] +u.update_many({}, {'$inc': {'credit': 50}}) diff --git a/biliob_analyzer/add_keyword.py b/biliob_analyzer/add_keyword.py index b7d28ff..66565b8 100644 --- a/biliob_analyzer/add_keyword.py +++ b/biliob_analyzer/add_keyword.py @@ -1,17 +1,16 @@ from 
pymongo import ReturnDocument import jieba from db import db +from time import sleep # 载入字典 -jieba.load_userdict('./biliob_analyzer/dict.txt') - - -class AddKeyword(): +class KeywordAdder(): def __init__(self): self.mongo_author = db['author'] self.mongo_video = db['video'] self.mongo_word = db['search_word'] + jieba.load_userdict('./biliob_analyzer/dict.txt') def get_video_kw_list(self, aid): # 关键字从name和official中提取 @@ -39,6 +38,7 @@ def get_video_kw_list(self, aid): return list(set(seg_list)) def add_to_video(self, aid, seg_list): + sleep(0.01) self.mongo_video.update_one({'aid': aid}, {'$set': { 'keyword': seg_list }}) @@ -75,35 +75,38 @@ def add_author_kw(self, mid): return True def add_to_author(self, mid, seg_list): + sleep(0.01) self.mongo_author.update_one( {'mid': mid}, {'$set': {'keyword': seg_list}}) def add_all_author(self): authors = self.mongo_author.find( - {'keyword': {'$exists': False}}, {'_id': 0, 'mid': 1}) + {'keyword': {'$exists': False}}, {'_id': 0, 'mid': 1}).batch_size(200) for each_author in authors: mid = each_author['mid'] self.add_author_kw(mid) def add_all_video(self): videos = self.mongo_video.find( - {'keyword': {'$exists': False}}, {'_id': 0, 'aid': 1}) + {'keyword': {'$exists': False}}, {'_id': 0, 'aid': 1}).batch_size(200) for each_video in videos: aid = each_video['aid'] self.add_video_kw(aid) def refresh_all_author(self): authors = self.mongo_author.find( - {}, {'_id': 0, 'mid': 1}) + {}, {'_id': 0, 'mid': 1}).batch_size(500) for each_author in authors: mid = each_author['mid'] + print("[mid]"+str(mid)) self.add_author_kw(mid) def refresh_all_video(self): videos = self.mongo_video.find( - {}, {'_id': 0, 'aid': 1}) + {}, {'_id': 0, 'aid': 1}).batch_size(500) for each_video in videos: aid = each_video['aid'] + print("[aid]"+str(aid)) self.add_video_kw(aid) def add_omitted(self): @@ -121,5 +124,6 @@ def add_omitted(self): for each in d: o.write(each+'\n') o.close() - self.refresh_all_video() + jieba.load_userdict('./biliob_analyzer/dict.txt') self.refresh_all_author() + self.refresh_all_video() diff --git a/biliob_analyzer/del_data.py b/biliob_analyzer/del_data.py new file mode 100644 index 0000000..50ebb87 --- /dev/null +++ b/biliob_analyzer/del_data.py @@ -0,0 +1,27 @@ +from db import db +import datetime +author_coll = db['author'] +authors = author_coll.find({'data.3000': {'$exists': True}}) +for each_author in authors: + mid = each_author['mid'] + data = sorted(each_author['data'], + key=lambda x: x['datetime'], reverse=True) + c_data = data[0] + c_date = data[0]['datetime'].strftime('%Y-%m-%d') + f_data = [c_data] + for each_data in data: + delta_day = (datetime.datetime.now() - + each_data['datetime']).days + if delta_day > 7: + n_date = each_data['datetime'].strftime('%Y-%m-%d') + # 如果不是同一天 + if n_date != c_date: + f_data.append(each_data) + c_date = n_date + pass + pass + else: + f_data.append(each_data) + author_coll.update_one({'mid': mid}, {'$set': {'data': f_data}}) + pass +pass diff --git a/biliob_analyzer/dict.txt b/biliob_analyzer/dict.txt index 2eef132..3cc3c91 100644 --- a/biliob_analyzer/dict.txt +++ b/biliob_analyzer/dict.txt @@ -227,3 +227,93 @@ ya'yi 雅音宫羽 黑板报 av21714693 + +想让我 +青色 +入世 +zhudax +菜莱 +菜莱道 +当然是橙橙 +当然是橙橙啦 +林尔 +zha +jsf +jsuf +jus +小阿 +大划 +老邪 +记录生活的收黄 +记录生活的蛋黄派出 +刘老师说 +刘哔 +紫银风 +大氿 +大氿歌 +10后zhao +10后找人dai +xuuxlaoplao +徐霞baobao +徐霞宝宝 +旭旭bao +旭旭姥姥 +地心me +地心meishan +地心meis +地心美少女 +绯绯 +冇丨 +冇冂 +北京第三区 +回到20 +我是谁 +fab +赚了 +网易云音乐 +网易云 +mangguo +芒果冰 +bilibilidanmuwa +小帅 +小帅喵 +街森 +可桑德 +桑德 +bilibili番剧 +菜来一个 +菜来d +yanyi +颜yi +颜艺aijiang 
+ailedyo +yygy +yyg +楼下万能 +开心嘴 +百合吧 +摇 +摇曳 +花花与 +花花与三猫 +摩尔庄园 +暴 +渗透之 +徐璐的 +洛米尔博科夫 +max +青色小柚子 +抖音超火《出山》改编版《入世》戏腔高能 +小渔打野 +电影最 + +neko喵了个咪33 +neko喵了个咪 +信 +滕乐 +数学超人 +数学 +猪猪老师 + + + diff --git a/biliob_analyzer/video_rank.py b/biliob_analyzer/video_rank.py index c96f423..948e5ff 100644 --- a/biliob_analyzer/video_rank.py +++ b/biliob_analyzer/video_rank.py @@ -3,6 +3,7 @@ import datetime import logging from pymongo import DESCENDING +from time import sleep def format_p_rank(i, count): @@ -25,13 +26,15 @@ def computeVideoRank(): logger.info("开始计算视频{}排名".format(each_key)) i = 1 videos = coll.find({each_key: {'$exists': 1}}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( - 300).sort(each_key, DESCENDING) + 200).sort(each_key, DESCENDING) each_rank = each_key + 'Rank' each_d_rank = 'd' + each_key[1:] + 'Rank' each_p_rank = 'p' + each_key[1:] + 'Rank' - count = coll.find().count() + count = coll.count_documents({each_key: {'$exists': 1}}) for each_video in videos: + logger.info("[aid]{}".format(each_video['aid'])) + sleep(0.01) # 如果没有data 直接下一个 if each_key in each_video: if 'rank' in each_video: @@ -45,7 +48,7 @@ def computeVideoRank(): else: rank = { each_rank: i, - each_d_rank: -1, + each_d_rank: 0, each_p_rank: format_p_rank(i, count) } if each_video[each_key] == 0: @@ -70,3 +73,6 @@ def computeVideoRank(): i += 1 logger.info("完成计算视频数据排名") + + +computeVideoRank() diff --git a/run_add_kw.py b/run_add_kw.py index a1a131a..3ea62ec 100644 --- a/run_add_kw.py +++ b/run_add_kw.py @@ -1,3 +1,2 @@ -from biliob_analyzer.add_keyword import AddKeyword -AddKeyword().add_all_author() -AddKeyword().add_all_video() +from biliob_analyzer.add_keyword import KeywordAdder +KeywordAdder().add_omitted() diff --git a/run_analyzer.py b/run_analyzer.py index 9c920e0..04fe2f7 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -3,10 +3,12 @@ import biliob_analyzer.author_rate_caculate import biliob_analyzer.author_fans_watcher import biliob_analyzer.author_rank -# import biliob_analyzer.video_rank -# from biliob_analyzer.add_keyword import AddKeyword -# AddKeyword().add_all_author() -# AddKeyword().add_all_video() +import biliob_analyzer.video_rank +from biliob_analyzer.add_keyword import KeywordAdder +kwAdder = KeywordAdder() +kwAdder.add_all_author() +kwAdder.add_all_video() +kwAdder.add_omitted() author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 3cd1a8474c286302cc015e8d86aefbc263a9730a Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 5 Mar 2019 17:37:14 +0800 Subject: [PATCH 217/469] feature: add sleep --- biliob_analyzer/add_credit.py | 3 ++ biliob_analyzer/add_keyword.py | 22 +++++---- biliob_analyzer/del_data.py | 27 ++++++++++ biliob_analyzer/dict.txt | 90 ++++++++++++++++++++++++++++++++++ biliob_analyzer/video_rank.py | 12 +++-- run_add_kw.py | 5 +- run_analyzer.py | 10 ++-- 7 files changed, 150 insertions(+), 19 deletions(-) create mode 100644 biliob_analyzer/add_credit.py create mode 100644 biliob_analyzer/del_data.py diff --git a/biliob_analyzer/add_credit.py b/biliob_analyzer/add_credit.py new file mode 100644 index 0000000..9fb8d4a --- /dev/null +++ b/biliob_analyzer/add_credit.py @@ -0,0 +1,3 @@ +from db import db +u = db['user'] +u.update_many({}, {'$inc': {'credit': 50}}) diff --git a/biliob_analyzer/add_keyword.py b/biliob_analyzer/add_keyword.py index b7d28ff..66565b8 100644 --- a/biliob_analyzer/add_keyword.py +++ b/biliob_analyzer/add_keyword.py @@ -1,17 +1,16 @@ from pymongo import ReturnDocument import jieba from db import db +from time import sleep # 载入字典 
-jieba.load_userdict('./biliob_analyzer/dict.txt') - - -class AddKeyword(): +class KeywordAdder(): def __init__(self): self.mongo_author = db['author'] self.mongo_video = db['video'] self.mongo_word = db['search_word'] + jieba.load_userdict('./biliob_analyzer/dict.txt') def get_video_kw_list(self, aid): # 关键字从name和official中提取 @@ -39,6 +38,7 @@ def get_video_kw_list(self, aid): return list(set(seg_list)) def add_to_video(self, aid, seg_list): + sleep(0.01) self.mongo_video.update_one({'aid': aid}, {'$set': { 'keyword': seg_list }}) @@ -75,35 +75,38 @@ def add_author_kw(self, mid): return True def add_to_author(self, mid, seg_list): + sleep(0.01) self.mongo_author.update_one( {'mid': mid}, {'$set': {'keyword': seg_list}}) def add_all_author(self): authors = self.mongo_author.find( - {'keyword': {'$exists': False}}, {'_id': 0, 'mid': 1}) + {'keyword': {'$exists': False}}, {'_id': 0, 'mid': 1}).batch_size(200) for each_author in authors: mid = each_author['mid'] self.add_author_kw(mid) def add_all_video(self): videos = self.mongo_video.find( - {'keyword': {'$exists': False}}, {'_id': 0, 'aid': 1}) + {'keyword': {'$exists': False}}, {'_id': 0, 'aid': 1}).batch_size(200) for each_video in videos: aid = each_video['aid'] self.add_video_kw(aid) def refresh_all_author(self): authors = self.mongo_author.find( - {}, {'_id': 0, 'mid': 1}) + {}, {'_id': 0, 'mid': 1}).batch_size(500) for each_author in authors: mid = each_author['mid'] + print("[mid]"+str(mid)) self.add_author_kw(mid) def refresh_all_video(self): videos = self.mongo_video.find( - {}, {'_id': 0, 'aid': 1}) + {}, {'_id': 0, 'aid': 1}).batch_size(500) for each_video in videos: aid = each_video['aid'] + print("[aid]"+str(aid)) self.add_video_kw(aid) def add_omitted(self): @@ -121,5 +124,6 @@ def add_omitted(self): for each in d: o.write(each+'\n') o.close() - self.refresh_all_video() + jieba.load_userdict('./biliob_analyzer/dict.txt') self.refresh_all_author() + self.refresh_all_video() diff --git a/biliob_analyzer/del_data.py b/biliob_analyzer/del_data.py new file mode 100644 index 0000000..50ebb87 --- /dev/null +++ b/biliob_analyzer/del_data.py @@ -0,0 +1,27 @@ +from db import db +import datetime +author_coll = db['author'] +authors = author_coll.find({'data.3000': {'$exists': True}}) +for each_author in authors: + mid = each_author['mid'] + data = sorted(each_author['data'], + key=lambda x: x['datetime'], reverse=True) + c_data = data[0] + c_date = data[0]['datetime'].strftime('%Y-%m-%d') + f_data = [c_data] + for each_data in data: + delta_day = (datetime.datetime.now() - + each_data['datetime']).days + if delta_day > 7: + n_date = each_data['datetime'].strftime('%Y-%m-%d') + # 如果不是同一天 + if n_date != c_date: + f_data.append(each_data) + c_date = n_date + pass + pass + else: + f_data.append(each_data) + author_coll.update_one({'mid': mid}, {'$set': {'data': f_data}}) + pass +pass diff --git a/biliob_analyzer/dict.txt b/biliob_analyzer/dict.txt index 2eef132..3cc3c91 100644 --- a/biliob_analyzer/dict.txt +++ b/biliob_analyzer/dict.txt @@ -227,3 +227,93 @@ ya'yi 雅音宫羽 黑板报 av21714693 + +想让我 +青色 +入世 +zhudax +菜莱 +菜莱道 +当然是橙橙 +当然是橙橙啦 +林尔 +zha +jsf +jsuf +jus +小阿 +大划 +老邪 +记录生活的收黄 +记录生活的蛋黄派出 +刘老师说 +刘哔 +紫银风 +大氿 +大氿歌 +10后zhao +10后找人dai +xuuxlaoplao +徐霞baobao +徐霞宝宝 +旭旭bao +旭旭姥姥 +地心me +地心meishan +地心meis +地心美少女 +绯绯 +冇丨 +冇冂 +北京第三区 +回到20 +我是谁 +fab +赚了 +网易云音乐 +网易云 +mangguo +芒果冰 +bilibilidanmuwa +小帅 +小帅喵 +街森 +可桑德 +桑德 +bilibili番剧 +菜来一个 +菜来d +yanyi +颜yi +颜艺aijiang +ailedyo +yygy +yyg +楼下万能 +开心嘴 +百合吧 +摇 +摇曳 +花花与 +花花与三猫 +摩尔庄园 +暴 +渗透之 +徐璐的 +洛米尔博科夫 +max +青色小柚子 
+抖音超火《出山》改编版《入世》戏腔高能 +小渔打野 +电影最 + +neko喵了个咪33 +neko喵了个咪 +信 +滕乐 +数学超人 +数学 +猪猪老师 + + + diff --git a/biliob_analyzer/video_rank.py b/biliob_analyzer/video_rank.py index c96f423..948e5ff 100644 --- a/biliob_analyzer/video_rank.py +++ b/biliob_analyzer/video_rank.py @@ -3,6 +3,7 @@ import datetime import logging from pymongo import DESCENDING +from time import sleep def format_p_rank(i, count): @@ -25,13 +26,15 @@ def computeVideoRank(): logger.info("开始计算视频{}排名".format(each_key)) i = 1 videos = coll.find({each_key: {'$exists': 1}}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( - 300).sort(each_key, DESCENDING) + 200).sort(each_key, DESCENDING) each_rank = each_key + 'Rank' each_d_rank = 'd' + each_key[1:] + 'Rank' each_p_rank = 'p' + each_key[1:] + 'Rank' - count = coll.find().count() + count = coll.count_documents({each_key: {'$exists': 1}}) for each_video in videos: + logger.info("[aid]{}".format(each_video['aid'])) + sleep(0.01) # 如果没有data 直接下一个 if each_key in each_video: if 'rank' in each_video: @@ -45,7 +48,7 @@ def computeVideoRank(): else: rank = { each_rank: i, - each_d_rank: -1, + each_d_rank: 0, each_p_rank: format_p_rank(i, count) } if each_video[each_key] == 0: @@ -70,3 +73,6 @@ def computeVideoRank(): i += 1 logger.info("完成计算视频数据排名") + + +computeVideoRank() diff --git a/run_add_kw.py b/run_add_kw.py index a1a131a..3ea62ec 100644 --- a/run_add_kw.py +++ b/run_add_kw.py @@ -1,3 +1,2 @@ -from biliob_analyzer.add_keyword import AddKeyword -AddKeyword().add_all_author() -AddKeyword().add_all_video() +from biliob_analyzer.add_keyword import KeywordAdder +KeywordAdder().add_omitted() diff --git a/run_analyzer.py b/run_analyzer.py index 9c920e0..04fe2f7 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -3,10 +3,12 @@ import biliob_analyzer.author_rate_caculate import biliob_analyzer.author_fans_watcher import biliob_analyzer.author_rank -# import biliob_analyzer.video_rank -# from biliob_analyzer.add_keyword import AddKeyword -# AddKeyword().add_all_author() -# AddKeyword().add_all_video() +import biliob_analyzer.video_rank +from biliob_analyzer.add_keyword import KeywordAdder +kwAdder = KeywordAdder() +kwAdder.add_all_author() +kwAdder.add_all_video() +kwAdder.add_omitted() author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From ac0c32e94d85c53fffa5b33ace3e8b8f270fae2d Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 5 Mar 2019 17:37:14 +0800 Subject: [PATCH 218/469] feature: add sleep --- biliob_analyzer/add_credit.py | 3 ++ biliob_analyzer/add_keyword.py | 22 +++++---- biliob_analyzer/del_data.py | 27 ++++++++++ biliob_analyzer/dict.txt | 90 ++++++++++++++++++++++++++++++++++ biliob_analyzer/video_rank.py | 12 +++-- run_add_kw.py | 5 +- run_analyzer.py | 10 ++-- 7 files changed, 150 insertions(+), 19 deletions(-) create mode 100644 biliob_analyzer/add_credit.py create mode 100644 biliob_analyzer/del_data.py diff --git a/biliob_analyzer/add_credit.py b/biliob_analyzer/add_credit.py new file mode 100644 index 0000000..9fb8d4a --- /dev/null +++ b/biliob_analyzer/add_credit.py @@ -0,0 +1,3 @@ +from db import db +u = db['user'] +u.update_many({}, {'$inc': {'credit': 50}}) diff --git a/biliob_analyzer/add_keyword.py b/biliob_analyzer/add_keyword.py index b7d28ff..66565b8 100644 --- a/biliob_analyzer/add_keyword.py +++ b/biliob_analyzer/add_keyword.py @@ -1,17 +1,16 @@ from pymongo import ReturnDocument import jieba from db import db +from time import sleep # 载入字典 -jieba.load_userdict('./biliob_analyzer/dict.txt') - - -class AddKeyword(): +class KeywordAdder(): def 
__init__(self): self.mongo_author = db['author'] self.mongo_video = db['video'] self.mongo_word = db['search_word'] + jieba.load_userdict('./biliob_analyzer/dict.txt') def get_video_kw_list(self, aid): # 关键字从name和official中提取 @@ -39,6 +38,7 @@ def get_video_kw_list(self, aid): return list(set(seg_list)) def add_to_video(self, aid, seg_list): + sleep(0.01) self.mongo_video.update_one({'aid': aid}, {'$set': { 'keyword': seg_list }}) @@ -75,35 +75,38 @@ def add_author_kw(self, mid): return True def add_to_author(self, mid, seg_list): + sleep(0.01) self.mongo_author.update_one( {'mid': mid}, {'$set': {'keyword': seg_list}}) def add_all_author(self): authors = self.mongo_author.find( - {'keyword': {'$exists': False}}, {'_id': 0, 'mid': 1}) + {'keyword': {'$exists': False}}, {'_id': 0, 'mid': 1}).batch_size(200) for each_author in authors: mid = each_author['mid'] self.add_author_kw(mid) def add_all_video(self): videos = self.mongo_video.find( - {'keyword': {'$exists': False}}, {'_id': 0, 'aid': 1}) + {'keyword': {'$exists': False}}, {'_id': 0, 'aid': 1}).batch_size(200) for each_video in videos: aid = each_video['aid'] self.add_video_kw(aid) def refresh_all_author(self): authors = self.mongo_author.find( - {}, {'_id': 0, 'mid': 1}) + {}, {'_id': 0, 'mid': 1}).batch_size(500) for each_author in authors: mid = each_author['mid'] + print("[mid]"+str(mid)) self.add_author_kw(mid) def refresh_all_video(self): videos = self.mongo_video.find( - {}, {'_id': 0, 'aid': 1}) + {}, {'_id': 0, 'aid': 1}).batch_size(500) for each_video in videos: aid = each_video['aid'] + print("[aid]"+str(aid)) self.add_video_kw(aid) def add_omitted(self): @@ -121,5 +124,6 @@ def add_omitted(self): for each in d: o.write(each+'\n') o.close() - self.refresh_all_video() + jieba.load_userdict('./biliob_analyzer/dict.txt') self.refresh_all_author() + self.refresh_all_video() diff --git a/biliob_analyzer/del_data.py b/biliob_analyzer/del_data.py new file mode 100644 index 0000000..50ebb87 --- /dev/null +++ b/biliob_analyzer/del_data.py @@ -0,0 +1,27 @@ +from db import db +import datetime +author_coll = db['author'] +authors = author_coll.find({'data.3000': {'$exists': True}}) +for each_author in authors: + mid = each_author['mid'] + data = sorted(each_author['data'], + key=lambda x: x['datetime'], reverse=True) + c_data = data[0] + c_date = data[0]['datetime'].strftime('%Y-%m-%d') + f_data = [c_data] + for each_data in data: + delta_day = (datetime.datetime.now() - + each_data['datetime']).days + if delta_day > 7: + n_date = each_data['datetime'].strftime('%Y-%m-%d') + # 如果不是同一天 + if n_date != c_date: + f_data.append(each_data) + c_date = n_date + pass + pass + else: + f_data.append(each_data) + author_coll.update_one({'mid': mid}, {'$set': {'data': f_data}}) + pass +pass diff --git a/biliob_analyzer/dict.txt b/biliob_analyzer/dict.txt index 2eef132..3cc3c91 100644 --- a/biliob_analyzer/dict.txt +++ b/biliob_analyzer/dict.txt @@ -227,3 +227,93 @@ ya'yi 雅音宫羽 黑板报 av21714693 + +想让我 +青色 +入世 +zhudax +菜莱 +菜莱道 +当然是橙橙 +当然是橙橙啦 +林尔 +zha +jsf +jsuf +jus +小阿 +大划 +老邪 +记录生活的收黄 +记录生活的蛋黄派出 +刘老师说 +刘哔 +紫银风 +大氿 +大氿歌 +10后zhao +10后找人dai +xuuxlaoplao +徐霞baobao +徐霞宝宝 +旭旭bao +旭旭姥姥 +地心me +地心meishan +地心meis +地心美少女 +绯绯 +冇丨 +冇冂 +北京第三区 +回到20 +我是谁 +fab +赚了 +网易云音乐 +网易云 +mangguo +芒果冰 +bilibilidanmuwa +小帅 +小帅喵 +街森 +可桑德 +桑德 +bilibili番剧 +菜来一个 +菜来d +yanyi +颜yi +颜艺aijiang +ailedyo +yygy +yyg +楼下万能 +开心嘴 +百合吧 +摇 +摇曳 +花花与 +花花与三猫 +摩尔庄园 +暴 +渗透之 +徐璐的 +洛米尔博科夫 +max +青色小柚子 +抖音超火《出山》改编版《入世》戏腔高能 +小渔打野 +电影最 + +neko喵了个咪33 +neko喵了个咪 +信 +滕乐 +数学超人 +数学 +猪猪老师 + + + diff --git 
a/biliob_analyzer/video_rank.py b/biliob_analyzer/video_rank.py index c96f423..948e5ff 100644 --- a/biliob_analyzer/video_rank.py +++ b/biliob_analyzer/video_rank.py @@ -3,6 +3,7 @@ import datetime import logging from pymongo import DESCENDING +from time import sleep def format_p_rank(i, count): @@ -25,13 +26,15 @@ def computeVideoRank(): logger.info("开始计算视频{}排名".format(each_key)) i = 1 videos = coll.find({each_key: {'$exists': 1}}, {'aid': 1, 'rank': 1, each_key: 1}).batch_size( - 300).sort(each_key, DESCENDING) + 200).sort(each_key, DESCENDING) each_rank = each_key + 'Rank' each_d_rank = 'd' + each_key[1:] + 'Rank' each_p_rank = 'p' + each_key[1:] + 'Rank' - count = coll.find().count() + count = coll.count_documents({each_key: {'$exists': 1}}) for each_video in videos: + logger.info("[aid]{}".format(each_video['aid'])) + sleep(0.01) # 如果没有data 直接下一个 if each_key in each_video: if 'rank' in each_video: @@ -45,7 +48,7 @@ def computeVideoRank(): else: rank = { each_rank: i, - each_d_rank: -1, + each_d_rank: 0, each_p_rank: format_p_rank(i, count) } if each_video[each_key] == 0: @@ -70,3 +73,6 @@ def computeVideoRank(): i += 1 logger.info("完成计算视频数据排名") + + +computeVideoRank() diff --git a/run_add_kw.py b/run_add_kw.py index a1a131a..3ea62ec 100644 --- a/run_add_kw.py +++ b/run_add_kw.py @@ -1,3 +1,2 @@ -from biliob_analyzer.add_keyword import AddKeyword -AddKeyword().add_all_author() -AddKeyword().add_all_video() +from biliob_analyzer.add_keyword import KeywordAdder +KeywordAdder().add_omitted() diff --git a/run_analyzer.py b/run_analyzer.py index 9c920e0..04fe2f7 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -3,10 +3,12 @@ import biliob_analyzer.author_rate_caculate import biliob_analyzer.author_fans_watcher import biliob_analyzer.author_rank -# import biliob_analyzer.video_rank -# from biliob_analyzer.add_keyword import AddKeyword -# AddKeyword().add_all_author() -# AddKeyword().add_all_video() +import biliob_analyzer.video_rank +from biliob_analyzer.add_keyword import KeywordAdder +kwAdder = KeywordAdder() +kwAdder.add_all_author() +kwAdder.add_all_video() +kwAdder.add_omitted() author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 095a7fd27b6df7b234dabd35f61d826c38625ac0 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 6 Mar 2019 17:13:56 +0800 Subject: [PATCH 219/469] update --- biliob_analyzer/add_keyword.py | 30 +++- biliob_analyzer/dict.txt | 278 +++++++++++++++++++++++++++++++++ run_analyzer.py | 2 +- 3 files changed, 306 insertions(+), 4 deletions(-) diff --git a/biliob_analyzer/add_keyword.py b/biliob_analyzer/add_keyword.py index 66565b8..481a62c 100644 --- a/biliob_analyzer/add_keyword.py +++ b/biliob_analyzer/add_keyword.py @@ -4,6 +4,8 @@ from time import sleep # 载入字典 + + class KeywordAdder(): def __init__(self): @@ -81,14 +83,33 @@ def add_to_author(self, mid, seg_list): def add_all_author(self): authors = self.mongo_author.find( - {'keyword': {'$exists': False}}, {'_id': 0, 'mid': 1}).batch_size(200) + { + '$or': [ + { + 'keyword': [] + }, { + 'keyword': { + '$exists': False + } + } + ] + }, {'_id': 0, 'mid': 1}).batch_size(200) for each_author in authors: mid = each_author['mid'] self.add_author_kw(mid) def add_all_video(self): - videos = self.mongo_video.find( - {'keyword': {'$exists': False}}, {'_id': 0, 'aid': 1}).batch_size(200) + videos = self.mongo_video.find({ + '$or': [ + { + 'keyword': [] + }, { + 'keyword': { + '$exists': False + } + } + ] + }, {'_id': 0, 'aid': 1}).batch_size(200) for each_video in videos: aid = each_video['aid'] 
self.add_video_kw(aid) @@ -110,6 +131,8 @@ def refresh_all_video(self): self.add_video_kw(aid) def add_omitted(self): + if self.mongo_word.count_documents({}) < 100: + return d = open('./biliob_analyzer/dict.txt', 'r', encoding='utf8').read().split('\n') for each in self.mongo_word.find(): @@ -124,6 +147,7 @@ def add_omitted(self): for each in d: o.write(each+'\n') o.close() + self.mongo_word.delete_many({}) jieba.load_userdict('./biliob_analyzer/dict.txt') self.refresh_all_author() self.refresh_all_video() diff --git a/biliob_analyzer/dict.txt b/biliob_analyzer/dict.txt index 3cc3c91..b0257d1 100644 --- a/biliob_analyzer/dict.txt +++ b/biliob_analyzer/dict.txt @@ -317,3 +317,281 @@ neko喵了个咪 + +孟晓洁 +孟晓jie +孟晓洁je +原谅哥w +fu'li +jusf +lit +litt +litter +key +key猩 +key猩诚 +k猩诚 +猩诚 +泛式 +政宗君de情敌 +ascince +av43012394 +森宇酱 +森宇酱参上 +酱参上 +屿酱参上 +允星 +交 +荒野大 +还有一天 +hai'y +还有yi +小清水 +uid19653 +诗 +lexbur +诗二 +小清水ya +小清水亜mei +小清水亜美 +xiao'qing +chus +深海色待遇 +深海s +深海色带鱼 +采紫葳的凌霄子 +还有一天就fangji +还有yiti +深海色daiyu +怠惰的一x +秋名山上的 +龙大人bu +wangs +王思妇 +深海色daiyuu +深海色dai +浅笑anran +maodian +一咸juju +冰攻厂 +冰攻厂be +圣枪hee +圣枪hehe +k希 +k希桑 +徐大虾 +铁锅炖 +白 +井溪辰 +秦少 +潇湘醉月 +美食分析师 +美食分析师曹博文 +舞秋风直播 +u257851644 +小米周 +泠鸢 +hua non +华农x +a画 +大蓝 +he't +和田g +和田guang'si +和田光si +和田光司 +陈亚楠cyn +Nathan +NathanRich +NathanRich火锅d +NathanRich火锅大王 +六wei +六尾猫yu +六尾猫yui +av4228 +av42283575 +共青团zh +玩啥游戏 +力 +蕉酱qu +李四 +李四ganhao +李四赶海 +、四赶海 +四赶海 +摇曳百合 +yaoy +摇曳baihe +大果粒 +MordonF +Mordon +Mordo +Mord +Mor +oe +oel +b99 +b000 +b00063 +布莱德 +布莱德神盾 +狸 +川乡 +川乡小妹儿 +王尼 +王尼玛 +yi_xiang +yi_xiang忆 +yi_xiang忆巷 +rngvsg2 +这是 +这是di lang +这是地lang +极客湾 +http://bilicount.probe.earth/?883968http://bilicount.probe.earth/?883968 +ilicount.probe.earth/?883968http://bilicount.probe.earth/?883968 +猫神 +猫神辣椒酱 +星海璃幻 +政宗 +星海璃 +隐紫大人 +老大老三 +圣心fckle +政宗君de +手冲f +圣心f +力yu +力元jun +士织狂三 +小神爱 +uid:283281141 +' +ascin +可樱 +可桜 +你的keying +你的可樱 +煎蛋 +煎蛋的 +煎蛋的厨房 +十y +第一日 +懒蛇 +懒蛇- +懒蛇-巴 +uid:30483859 +前方高燃 +中国boy +av38262830 +uid:178664678 +呆萌chongw +追剧xiaohuan +追剧xiaohuanx +bbleae +yobachi +漫推 +宇哥你裤子 +宇哥你裤子掉了 +chen +种壶 +种壶野 +种壶野朗 +bai'sh +白上chui'x +白上吹雪offi +白上吹雪off +liyua +力yuan +力元、 +你的ke +你的可樱yijing +你的可樱已经 +你的可樱已经、 +你的可樱已 +你的可樱已经上线 +yude +马zua +马zhuang +马壮shi +马壮实 +马壮实h +不zheng'j +不正经mei'shu +不正经美食u +不正经美食upzhu +不正经美食up主 +全靠y +全靠一双sh +anrly +anrly暗语 +中国通史 +中二の +中二のome +中二 +anggie +不ba'man +不霸蛮喽 +不霸蛮lou +无gx +无毒放心ve +幽ww +撸过g +撸过一下 +请叫我 +守望at春语 +春语 +幽灵子 +犬山 +rng +uid:273279238 +不是尴尬 +不尴尬 +不失尴尬 +av44955 +av44955917 +av4498 +av44982599 +av345678 +水蛭 +zhe shi di lang +这是抵抗 +这里是 +夏日夜望 +福利老粗 +边白 +边白贝 +uid:36965114 +犬山y +犬山yuji +犬山yu +犬山玉ji +paojiao +跑焦 +跑焦熊 +相机镜头批发价90元一斤 +很多还能用 +垃圾王开 +av45343677 +小清 +fan' ju +伟修 +玩啥游戏官方 +采紫葳 +好吃的shenghuoj +好吃的生活jun +yi mi b +一米八 +多功能的气候 +多功能的qi h +多功能的qi l +多功能的麒 +多功能的麒h +多功能的麒桦 +f k +放开那个xi hong +放开那个西红柿 +胡子gai +胡子盖 +太猪 +太 diff --git a/run_analyzer.py b/run_analyzer.py index 04fe2f7..ecbef74 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -8,7 +8,7 @@ kwAdder = KeywordAdder() kwAdder.add_all_author() kwAdder.add_all_video() -kwAdder.add_omitted() +# kwAdder.add_omitted() author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 5ea7cf2531a46f4af371cf22c493ffe5f67a15e5 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 6 Mar 2019 17:13:56 +0800 Subject: [PATCH 220/469] update --- biliob_analyzer/add_keyword.py | 30 +++- biliob_analyzer/dict.txt | 278 +++++++++++++++++++++++++++++++++ run_analyzer.py | 2 +- 3 files changed, 306 insertions(+), 4 deletions(-) diff --git a/biliob_analyzer/add_keyword.py b/biliob_analyzer/add_keyword.py index 
66565b8..481a62c 100644 --- a/biliob_analyzer/add_keyword.py +++ b/biliob_analyzer/add_keyword.py @@ -4,6 +4,8 @@ from time import sleep # 载入字典 + + class KeywordAdder(): def __init__(self): @@ -81,14 +83,33 @@ def add_to_author(self, mid, seg_list): def add_all_author(self): authors = self.mongo_author.find( - {'keyword': {'$exists': False}}, {'_id': 0, 'mid': 1}).batch_size(200) + { + '$or': [ + { + 'keyword': [] + }, { + 'keyword': { + '$exists': False + } + } + ] + }, {'_id': 0, 'mid': 1}).batch_size(200) for each_author in authors: mid = each_author['mid'] self.add_author_kw(mid) def add_all_video(self): - videos = self.mongo_video.find( - {'keyword': {'$exists': False}}, {'_id': 0, 'aid': 1}).batch_size(200) + videos = self.mongo_video.find({ + '$or': [ + { + 'keyword': [] + }, { + 'keyword': { + '$exists': False + } + } + ] + }, {'_id': 0, 'aid': 1}).batch_size(200) for each_video in videos: aid = each_video['aid'] self.add_video_kw(aid) @@ -110,6 +131,8 @@ def refresh_all_video(self): self.add_video_kw(aid) def add_omitted(self): + if self.mongo_word.count_documents({}) < 100: + return d = open('./biliob_analyzer/dict.txt', 'r', encoding='utf8').read().split('\n') for each in self.mongo_word.find(): @@ -124,6 +147,7 @@ def add_omitted(self): for each in d: o.write(each+'\n') o.close() + self.mongo_word.delete_many({}) jieba.load_userdict('./biliob_analyzer/dict.txt') self.refresh_all_author() self.refresh_all_video() diff --git a/biliob_analyzer/dict.txt b/biliob_analyzer/dict.txt index 3cc3c91..b0257d1 100644 --- a/biliob_analyzer/dict.txt +++ b/biliob_analyzer/dict.txt @@ -317,3 +317,281 @@ neko喵了个咪 + +孟晓洁 +孟晓jie +孟晓洁je +原谅哥w +fu'li +jusf +lit +litt +litter +key +key猩 +key猩诚 +k猩诚 +猩诚 +泛式 +政宗君de情敌 +ascince +av43012394 +森宇酱 +森宇酱参上 +酱参上 +屿酱参上 +允星 +交 +荒野大 +还有一天 +hai'y +还有yi +小清水 +uid19653 +诗 +lexbur +诗二 +小清水ya +小清水亜mei +小清水亜美 +xiao'qing +chus +深海色待遇 +深海s +深海色带鱼 +采紫葳的凌霄子 +还有一天就fangji +还有yiti +深海色daiyu +怠惰的一x +秋名山上的 +龙大人bu +wangs +王思妇 +深海色daiyuu +深海色dai +浅笑anran +maodian +一咸juju +冰攻厂 +冰攻厂be +圣枪hee +圣枪hehe +k希 +k希桑 +徐大虾 +铁锅炖 +白 +井溪辰 +秦少 +潇湘醉月 +美食分析师 +美食分析师曹博文 +舞秋风直播 +u257851644 +小米周 +泠鸢 +hua non +华农x +a画 +大蓝 +he't +和田g +和田guang'si +和田光si +和田光司 +陈亚楠cyn +Nathan +NathanRich +NathanRich火锅d +NathanRich火锅大王 +六wei +六尾猫yu +六尾猫yui +av4228 +av42283575 +共青团zh +玩啥游戏 +力 +蕉酱qu +李四 +李四ganhao +李四赶海 +、四赶海 +四赶海 +摇曳百合 +yaoy +摇曳baihe +大果粒 +MordonF +Mordon +Mordo +Mord +Mor +oe +oel +b99 +b000 +b00063 +布莱德 +布莱德神盾 +狸 +川乡 +川乡小妹儿 +王尼 +王尼玛 +yi_xiang +yi_xiang忆 +yi_xiang忆巷 +rngvsg2 +这是 +这是di lang +这是地lang +极客湾 +http://bilicount.probe.earth/?883968http://bilicount.probe.earth/?883968 +ilicount.probe.earth/?883968http://bilicount.probe.earth/?883968 +猫神 +猫神辣椒酱 +星海璃幻 +政宗 +星海璃 +隐紫大人 +老大老三 +圣心fckle +政宗君de +手冲f +圣心f +力yu +力元jun +士织狂三 +小神爱 +uid:283281141 +' +ascin +可樱 +可桜 +你的keying +你的可樱 +煎蛋 +煎蛋的 +煎蛋的厨房 +十y +第一日 +懒蛇 +懒蛇- +懒蛇-巴 +uid:30483859 +前方高燃 +中国boy +av38262830 +uid:178664678 +呆萌chongw +追剧xiaohuan +追剧xiaohuanx +bbleae +yobachi +漫推 +宇哥你裤子 +宇哥你裤子掉了 +chen +种壶 +种壶野 +种壶野朗 +bai'sh +白上chui'x +白上吹雪offi +白上吹雪off +liyua +力yuan +力元、 +你的ke +你的可樱yijing +你的可樱已经 +你的可樱已经、 +你的可樱已 +你的可樱已经上线 +yude +马zua +马zhuang +马壮shi +马壮实 +马壮实h +不zheng'j +不正经mei'shu +不正经美食u +不正经美食upzhu +不正经美食up主 +全靠y +全靠一双sh +anrly +anrly暗语 +中国通史 +中二の +中二のome +中二 +anggie +不ba'man +不霸蛮喽 +不霸蛮lou +无gx +无毒放心ve +幽ww +撸过g +撸过一下 +请叫我 +守望at春语 +春语 +幽灵子 +犬山 +rng +uid:273279238 +不是尴尬 +不尴尬 +不失尴尬 +av44955 +av44955917 +av4498 +av44982599 +av345678 +水蛭 +zhe shi di lang +这是抵抗 +这里是 +夏日夜望 +福利老粗 +边白 +边白贝 +uid:36965114 +犬山y +犬山yuji +犬山yu +犬山玉ji +paojiao +跑焦 +跑焦熊 +相机镜头批发价90元一斤 +很多还能用 
+垃圾王开 +av45343677 +小清 +fan' ju +伟修 +玩啥游戏官方 +采紫葳 +好吃的shenghuoj +好吃的生活jun +yi mi b +一米八 +多功能的气候 +多功能的qi h +多功能的qi l +多功能的麒 +多功能的麒h +多功能的麒桦 +f k +放开那个xi hong +放开那个西红柿 +胡子gai +胡子盖 +太猪 +太 diff --git a/run_analyzer.py b/run_analyzer.py index 04fe2f7..ecbef74 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -8,7 +8,7 @@ kwAdder = KeywordAdder() kwAdder.add_all_author() kwAdder.add_all_video() -kwAdder.add_omitted() +# kwAdder.add_omitted() author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 8e785ac1864e55fb17729dadc715542429395073 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 6 Mar 2019 17:13:56 +0800 Subject: [PATCH 221/469] update --- biliob_analyzer/add_keyword.py | 30 +++- biliob_analyzer/dict.txt | 278 +++++++++++++++++++++++++++++++++ run_analyzer.py | 2 +- 3 files changed, 306 insertions(+), 4 deletions(-) diff --git a/biliob_analyzer/add_keyword.py b/biliob_analyzer/add_keyword.py index 66565b8..481a62c 100644 --- a/biliob_analyzer/add_keyword.py +++ b/biliob_analyzer/add_keyword.py @@ -4,6 +4,8 @@ from time import sleep # 载入字典 + + class KeywordAdder(): def __init__(self): @@ -81,14 +83,33 @@ def add_to_author(self, mid, seg_list): def add_all_author(self): authors = self.mongo_author.find( - {'keyword': {'$exists': False}}, {'_id': 0, 'mid': 1}).batch_size(200) + { + '$or': [ + { + 'keyword': [] + }, { + 'keyword': { + '$exists': False + } + } + ] + }, {'_id': 0, 'mid': 1}).batch_size(200) for each_author in authors: mid = each_author['mid'] self.add_author_kw(mid) def add_all_video(self): - videos = self.mongo_video.find( - {'keyword': {'$exists': False}}, {'_id': 0, 'aid': 1}).batch_size(200) + videos = self.mongo_video.find({ + '$or': [ + { + 'keyword': [] + }, { + 'keyword': { + '$exists': False + } + } + ] + }, {'_id': 0, 'aid': 1}).batch_size(200) for each_video in videos: aid = each_video['aid'] self.add_video_kw(aid) @@ -110,6 +131,8 @@ def refresh_all_video(self): self.add_video_kw(aid) def add_omitted(self): + if self.mongo_word.count_documents({}) < 100: + return d = open('./biliob_analyzer/dict.txt', 'r', encoding='utf8').read().split('\n') for each in self.mongo_word.find(): @@ -124,6 +147,7 @@ def add_omitted(self): for each in d: o.write(each+'\n') o.close() + self.mongo_word.delete_many({}) jieba.load_userdict('./biliob_analyzer/dict.txt') self.refresh_all_author() self.refresh_all_video() diff --git a/biliob_analyzer/dict.txt b/biliob_analyzer/dict.txt index 3cc3c91..b0257d1 100644 --- a/biliob_analyzer/dict.txt +++ b/biliob_analyzer/dict.txt @@ -317,3 +317,281 @@ neko喵了个咪 + +孟晓洁 +孟晓jie +孟晓洁je +原谅哥w +fu'li +jusf +lit +litt +litter +key +key猩 +key猩诚 +k猩诚 +猩诚 +泛式 +政宗君de情敌 +ascince +av43012394 +森宇酱 +森宇酱参上 +酱参上 +屿酱参上 +允星 +交 +荒野大 +还有一天 +hai'y +还有yi +小清水 +uid19653 +诗 +lexbur +诗二 +小清水ya +小清水亜mei +小清水亜美 +xiao'qing +chus +深海色待遇 +深海s +深海色带鱼 +采紫葳的凌霄子 +还有一天就fangji +还有yiti +深海色daiyu +怠惰的一x +秋名山上的 +龙大人bu +wangs +王思妇 +深海色daiyuu +深海色dai +浅笑anran +maodian +一咸juju +冰攻厂 +冰攻厂be +圣枪hee +圣枪hehe +k希 +k希桑 +徐大虾 +铁锅炖 +白 +井溪辰 +秦少 +潇湘醉月 +美食分析师 +美食分析师曹博文 +舞秋风直播 +u257851644 +小米周 +泠鸢 +hua non +华农x +a画 +大蓝 +he't +和田g +和田guang'si +和田光si +和田光司 +陈亚楠cyn +Nathan +NathanRich +NathanRich火锅d +NathanRich火锅大王 +六wei +六尾猫yu +六尾猫yui +av4228 +av42283575 +共青团zh +玩啥游戏 +力 +蕉酱qu +李四 +李四ganhao +李四赶海 +、四赶海 +四赶海 +摇曳百合 +yaoy +摇曳baihe +大果粒 +MordonF +Mordon +Mordo +Mord +Mor +oe +oel +b99 +b000 +b00063 +布莱德 +布莱德神盾 +狸 +川乡 +川乡小妹儿 +王尼 +王尼玛 +yi_xiang +yi_xiang忆 +yi_xiang忆巷 +rngvsg2 +这是 +这是di lang +这是地lang +极客湾 +http://bilicount.probe.earth/?883968http://bilicount.probe.earth/?883968 
+ilicount.probe.earth/?883968http://bilicount.probe.earth/?883968 +猫神 +猫神辣椒酱 +星海璃幻 +政宗 +星海璃 +隐紫大人 +老大老三 +圣心fckle +政宗君de +手冲f +圣心f +力yu +力元jun +士织狂三 +小神爱 +uid:283281141 +' +ascin +可樱 +可桜 +你的keying +你的可樱 +煎蛋 +煎蛋的 +煎蛋的厨房 +十y +第一日 +懒蛇 +懒蛇- +懒蛇-巴 +uid:30483859 +前方高燃 +中国boy +av38262830 +uid:178664678 +呆萌chongw +追剧xiaohuan +追剧xiaohuanx +bbleae +yobachi +漫推 +宇哥你裤子 +宇哥你裤子掉了 +chen +种壶 +种壶野 +种壶野朗 +bai'sh +白上chui'x +白上吹雪offi +白上吹雪off +liyua +力yuan +力元、 +你的ke +你的可樱yijing +你的可樱已经 +你的可樱已经、 +你的可樱已 +你的可樱已经上线 +yude +马zua +马zhuang +马壮shi +马壮实 +马壮实h +不zheng'j +不正经mei'shu +不正经美食u +不正经美食upzhu +不正经美食up主 +全靠y +全靠一双sh +anrly +anrly暗语 +中国通史 +中二の +中二のome +中二 +anggie +不ba'man +不霸蛮喽 +不霸蛮lou +无gx +无毒放心ve +幽ww +撸过g +撸过一下 +请叫我 +守望at春语 +春语 +幽灵子 +犬山 +rng +uid:273279238 +不是尴尬 +不尴尬 +不失尴尬 +av44955 +av44955917 +av4498 +av44982599 +av345678 +水蛭 +zhe shi di lang +这是抵抗 +这里是 +夏日夜望 +福利老粗 +边白 +边白贝 +uid:36965114 +犬山y +犬山yuji +犬山yu +犬山玉ji +paojiao +跑焦 +跑焦熊 +相机镜头批发价90元一斤 +很多还能用 +垃圾王开 +av45343677 +小清 +fan' ju +伟修 +玩啥游戏官方 +采紫葳 +好吃的shenghuoj +好吃的生活jun +yi mi b +一米八 +多功能的气候 +多功能的qi h +多功能的qi l +多功能的麒 +多功能的麒h +多功能的麒桦 +f k +放开那个xi hong +放开那个西红柿 +胡子gai +胡子盖 +太猪 +太 diff --git a/run_analyzer.py b/run_analyzer.py index 04fe2f7..ecbef74 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -8,7 +8,7 @@ kwAdder = KeywordAdder() kwAdder.add_all_author() kwAdder.add_all_video() -kwAdder.add_omitted() +# kwAdder.add_omitted() author_analyzer = AuthorAnalyzer() video_analyzer = VideoAnalyzer() From 453a48c240b763b25924d82963a33b171fc8cd57 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 8 Mar 2019 23:17:20 +0800 Subject: [PATCH 222/469] refactor: fans variation --- biliob_analyzer/author_fans_watcher.py | 302 +++++++++++++------------ 1 file changed, 155 insertions(+), 147 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index 6adcdf4..32e1439 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -2,150 +2,158 @@ from db import db import datetime from enum import Enum -coll = db['author'] # 获得collection的句柄 -event = db['event'] # 获得collection的句柄 -video = db['video'] # 获得collection的句柄 - -# 对于上升趋势的UP主,较上日增加多少倍,才算是巨量涨粉 -WTF_INCREASE = 10 - -# 对于上升趋势的UP主,较上日增加多少倍,才算是超量涨粉 -AMAZING_INCREASE = 5 - -# 对于上升趋势的UP主,较上日增加多少倍,才算是大量涨粉 -MAGNIFICATION_INCREASE = 3 - -# 对于下降趋势的UP主,较上日减少多少倍,才算是大量掉粉 -MAGNIFICATION_DECREASE = 2 - -# 对于上升趋势的UP主,较上日增加多少倍,才算是超量涨粉 -AMAZING_DECREASE = 5 - -# 对于下降趋势的UP主,较上日减少多少倍,才算是巨量掉粉 -WTF_DECREASE = 8 - -# 粉丝增加多少,才算大量涨粉 -FANS_INCREASE_THRESHOLD = 8000 -# 粉丝减少多少,算作大量掉粉 -FANS_DECREASE_THRESHOLD = -3000 -# 多少粉丝以上才关注掉粉 -WATCH_DECREASE = 1000 - - -class Event(Enum): - increase_1 = 'I级增长' - increase_2 = 'II级猛增' - increase_3 = 'III级激增' - sudden_fall = 'SF级骤减' - decrease_1 = 'I级减少' - decrease_2 = 'II级锐减' - decrease_3 = 'III级暴减' - - -last_datetime = datetime.datetime(2000, 1, 1) -print('开始捕捉事件') -if event.count() != 0: - last_datetime = next(event.find().sort([('datetime', - -1)]).limit(1))['datetime'] - -for each_author in coll.find().batch_size(8): - if 'fansRate' in each_author and len(each_author['fansRate']) > 1: - index = 1 - - def print_data(each_author): - return '{name},速率:{rate},时间:{datetime}'.format( - name=each_author['name'], - rate=each_author['fansRate'][c_index]['rate'], - datetime=each_author['fansRate'][c_index]['datetime']) - - def insert_event(event_type): - videos = video.find({'mid': each_author['mid']}) - temp_video = {} - cause = {'type': 'video'} - for each_v in videos: - # 相差一日之内 - if 
(each_author['fansRate'][c_index]['datetime'] - - each_v['datetime']).days <= 1: - temp_video['aid'] = each_v['aid'] - temp_video['title'] = each_v['title'] - temp_video['cView'] = each_v['data'][0]['view'] - if 'cView' not in temp_video or 'aid' not in cause or temp_video[ - 'cView'] > cause['cView']: - cause['aid'] = temp_video['aid'] - cause['title'] = temp_video['title'] - cause['cView'] = temp_video['cView'] - - event.insert_one({ - 'type': - event_type.value, - 'mid': - each_author['mid'], - 'author': - each_author['name'], - 'rate': - each_author['fansRate'][c_index]['rate'], - 'datetime': - each_author['fansRate'][c_index]['datetime'], - 'cause': - cause - }) - - while index < len(each_author['fansRate']): - c_datetime = each_author['fansRate'][index]['datetime'] - if c_datetime <= last_datetime: - break - # 涨粉超高 - c_index = index - 1 - if each_author['fansRate'][c_index][ - 'rate'] > FANS_INCREASE_THRESHOLD and each_author[ - 'fansRate'][c_index]['rate'] > each_author['fansRate'][ - index]['rate'] * WTF_INCREASE: - insert_event(Event.increase_3) - print(Event.increase_3.value + print_data(each_author)) - - elif each_author['fansRate'][c_index][ - 'rate'] > FANS_INCREASE_THRESHOLD and each_author[ - 'fansRate'][c_index]['rate'] > each_author['fansRate'][ - index]['rate'] * AMAZING_INCREASE: - insert_event(Event.increase_2) - print(Event.increase_2.value + print_data(each_author)) - - elif each_author['fansRate'][c_index][ - 'rate'] > FANS_INCREASE_THRESHOLD and each_author[ - 'fansRate'][c_index]['rate'] > each_author['fansRate'][ - index]['rate'] * MAGNIFICATION_INCREASE: - insert_event(Event.increase_1) - print(Event.increase_1.value + print_data(each_author)) - - # 突然出现大量的掉粉 - if each_author['fansRate'][c_index][ - 'rate'] < FANS_DECREASE_THRESHOLD and each_author[ - 'fansRate'][index]['rate'] > WATCH_DECREASE: - insert_event(Event.sudden_fall) - print(Event.sudden_fall.value + print_data(each_author)) - # 一掉再掉 - elif each_author['fansRate'][c_index][ - 'rate'] < FANS_DECREASE_THRESHOLD and abs( - each_author['fansRate'][c_index]['rate'] - ) > abs( - each_author['fansRate'][index]['rate']) * WTF_DECREASE: - insert_event(Event.decrease_3) - print(Event.decrease_3.value + print_data(each_author)) - # 一掉再掉 - elif each_author['fansRate'][c_index][ - 'rate'] < FANS_DECREASE_THRESHOLD and abs( - each_author['fansRate'][c_index]['rate']) > abs( - each_author['fansRate'][index] - ['rate']) * AMAZING_DECREASE: - insert_event(Event.decrease_2) - print(Event.decrease_2.value + print_data(each_author)) - # 一掉再掉 - elif each_author['fansRate'][c_index][ - 'rate'] < FANS_DECREASE_THRESHOLD and abs( - each_author['fansRate'][c_index]['rate']) > abs( - each_author['fansRate'][index] - ['rate']) * MAGNIFICATION_DECREASE: - insert_event(Event.decrease_1) - print(Event.decrease_1.value + print_data(each_author)) - - index += 1 +from scipy.interpolate import interp1d + +author_coll = db['author'] # 获得collection的句柄 +video_coll = db['video'] # 获得collection的句柄 +fans_variation_coll = db['fans_variation'] # 获得collection的句柄 + + +def dateRange(beginDate, endDate): + dates = [] + date = beginDate + while date <= endDate: + dates.append(date.date()) + date += datetime.timedelta(1) + return dates + + +class FansWatcher(object): + def insert_event(self, delta_rate, d_daily, author, info, date): + print('变化率:{}% \n单日涨幅:{} \nUP主:{} \n信息:{}\n日期:{}\n\n'.format( + delta_rate, d_daily, author['name'], info, date)) + + out_data = { + 'variation': int(d_daily), + 'mid': author['mid'], + 'author': author['name'], + 
'face': author['face'], + 'deltaRate': delta_rate, + 'datetime': date.strftime("%Y-%m-%d"), + 'info': info, + } + + videos = video_coll.find({'mid': author['mid']}) + temp_video = {} + cause = {'type': 'video'} + for each_v in videos: + # 相差一日之内 + if (date - each_v['datetime']).days <= 2: + temp_video['aid'] = each_v['aid'] + temp_video['title'] = each_v['title'] + temp_video['pic'] = each_v['pic'] + temp_video['cView'] = each_v['data'][0]['view'] + temp_video['channel'] = each_v['channel'] + temp_video['subChannel'] = each_v['subChannel'] + if 'cView' not in temp_video or 'aid' not in cause or temp_video[ + 'cView'] > cause['cView']: + cause['aid'] = temp_video['aid'] + cause['title'] = temp_video['title'] + cause['pic'] = temp_video['pic'] + cause['cView'] = temp_video['cView'] + cause['channel'] = temp_video['channel'] + cause['subChannel'] = temp_video['subChannel'] + + if cause != {'type': 'video'}: + out_data['cause'] = cause + fans_variation_coll.insert(out_data) + + def judge(self, author): + ''' + 一共有这样几种可能: + 1、 大量涨粉 日涨粉数超过上周平均的10倍 + 2、 史诗级涨粉 日涨粉数超过上周平均的50倍 + 3、 传说级涨粉 日涨粉数超过上周平均的100倍 + 4、 急转直下 上升轨道中的UP主突然掉粉 + 5、 大量掉粉 每日掉粉数突破5K + 6、 雪崩级掉粉 每日掉粉数突破2W + 7、 末日级掉粉 每日掉粉数突破5W + 8、 新星爆发 日涨粉超过粉丝总数的20% + ''' + + data = sorted(author['data'], key=lambda x: x['datetime']) + start_date = data[0]['datetime'].timestamp() + end_date = data[-1]['datetime'].timestamp() + x = [] + y = [] + for each in data: + x.append(each['datetime'].timestamp()) + y.append(each['fans']) + # 线性插值 + interrupted_fans = interp1d(x, y, kind='linear') + temp_date = datetime.datetime.fromtimestamp(start_date) + c_date = datetime.datetime( + temp_date.year, temp_date.month, temp_date.day).timestamp() + 86400 * 3 + while (c_date <= end_date): + date = datetime.datetime.fromtimestamp(c_date) + daily_array = interrupted_fans([c_date - 86400, c_date]) + + p_daily_array = interrupted_fans( + [c_date - 86400 * 2, c_date - 86400]) + # 24小时前涨粉数 + pd_daily = p_daily_array[1] - p_daily_array[0] + + # 每日涨粉数 + d_daily = daily_array[1] - daily_array[0] + if (d_daily >= 3000 or d_daily <= -2000): + + delta_rate = round(d_daily / pd_daily * 100, 2) + if (d_daily >= daily_array[1] * 0.20): + self.insert_event(round(d_daily/daily_array[1]*100, 2), d_daily, + author, '新星爆发', date) + + if (d_daily <= 0 and pd_daily >= 0): + self.insert_event('-', d_daily, + author, '急转直下', date) + c_date += 86400 + continue + + if (d_daily <= -50000): + # 每日掉粉数突破5K + self.insert_event(delta_rate, d_daily, + author, '末日级掉粉', date) + pass + elif (d_daily <= -20000): + # 每日掉粉数突破2W + self.insert_event(delta_rate, d_daily, + author, '雪崩级掉粉', date) + pass + elif (d_daily <= -5000): + # 每日掉粉数突破5W + self.insert_event(delta_rate, d_daily, + author, '大量掉粉', date) + pass + + if (c_date >= start_date + 86400 * 8): + weekly_array = interrupted_fans([ + c_date - 86400 * 8, c_date - 86400]) + # 上月平均涨粉数 + weekly_mean = (weekly_array[1] - weekly_array[0]) / 7 + # 上周平均涨粉数 + delta_rate = round(d_daily / weekly_mean * 100, 2) + if delta_rate >= 10000: + # 日涨粉数超过上日的100倍 + self.insert_event(delta_rate, d_daily, + author, '传说级涨粉', date) + pass + elif delta_rate >= 5000: + # 日涨粉数超过上日的50倍 + self.insert_event(delta_rate, d_daily, + author, '史诗级涨粉', date) + pass + elif delta_rate >= 1000: + # 日涨粉数超过上日的10倍 + self.insert_event(delta_rate, d_daily, + author, '大量涨粉', date) + pass + + c_date += 86400 + pass + + def watchAllAuthor(self): + for each_author in author_coll.find({'data':{'$exists':True}}).batch_size(40): + self.judge(each_author) + pass + + +FansWatcher().watchAllAuthor() From 
b4e86f651b43e00bdbba5a56e754e61ab495ebc5 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 8 Mar 2019 23:17:20 +0800 Subject: [PATCH 223/469] refactor: fans variation --- biliob_analyzer/author_fans_watcher.py | 302 +++++++++++++------------ 1 file changed, 155 insertions(+), 147 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index 6adcdf4..32e1439 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -2,150 +2,158 @@ from db import db import datetime from enum import Enum -coll = db['author'] # 获得collection的句柄 -event = db['event'] # 获得collection的句柄 -video = db['video'] # 获得collection的句柄 - -# 对于上升趋势的UP主,较上日增加多少倍,才算是巨量涨粉 -WTF_INCREASE = 10 - -# 对于上升趋势的UP主,较上日增加多少倍,才算是超量涨粉 -AMAZING_INCREASE = 5 - -# 对于上升趋势的UP主,较上日增加多少倍,才算是大量涨粉 -MAGNIFICATION_INCREASE = 3 - -# 对于下降趋势的UP主,较上日减少多少倍,才算是大量掉粉 -MAGNIFICATION_DECREASE = 2 - -# 对于上升趋势的UP主,较上日增加多少倍,才算是超量涨粉 -AMAZING_DECREASE = 5 - -# 对于下降趋势的UP主,较上日减少多少倍,才算是巨量掉粉 -WTF_DECREASE = 8 - -# 粉丝增加多少,才算大量涨粉 -FANS_INCREASE_THRESHOLD = 8000 -# 粉丝减少多少,算作大量掉粉 -FANS_DECREASE_THRESHOLD = -3000 -# 多少粉丝以上才关注掉粉 -WATCH_DECREASE = 1000 - - -class Event(Enum): - increase_1 = 'I级增长' - increase_2 = 'II级猛增' - increase_3 = 'III级激增' - sudden_fall = 'SF级骤减' - decrease_1 = 'I级减少' - decrease_2 = 'II级锐减' - decrease_3 = 'III级暴减' - - -last_datetime = datetime.datetime(2000, 1, 1) -print('开始捕捉事件') -if event.count() != 0: - last_datetime = next(event.find().sort([('datetime', - -1)]).limit(1))['datetime'] - -for each_author in coll.find().batch_size(8): - if 'fansRate' in each_author and len(each_author['fansRate']) > 1: - index = 1 - - def print_data(each_author): - return '{name},速率:{rate},时间:{datetime}'.format( - name=each_author['name'], - rate=each_author['fansRate'][c_index]['rate'], - datetime=each_author['fansRate'][c_index]['datetime']) - - def insert_event(event_type): - videos = video.find({'mid': each_author['mid']}) - temp_video = {} - cause = {'type': 'video'} - for each_v in videos: - # 相差一日之内 - if (each_author['fansRate'][c_index]['datetime'] - - each_v['datetime']).days <= 1: - temp_video['aid'] = each_v['aid'] - temp_video['title'] = each_v['title'] - temp_video['cView'] = each_v['data'][0]['view'] - if 'cView' not in temp_video or 'aid' not in cause or temp_video[ - 'cView'] > cause['cView']: - cause['aid'] = temp_video['aid'] - cause['title'] = temp_video['title'] - cause['cView'] = temp_video['cView'] - - event.insert_one({ - 'type': - event_type.value, - 'mid': - each_author['mid'], - 'author': - each_author['name'], - 'rate': - each_author['fansRate'][c_index]['rate'], - 'datetime': - each_author['fansRate'][c_index]['datetime'], - 'cause': - cause - }) - - while index < len(each_author['fansRate']): - c_datetime = each_author['fansRate'][index]['datetime'] - if c_datetime <= last_datetime: - break - # 涨粉超高 - c_index = index - 1 - if each_author['fansRate'][c_index][ - 'rate'] > FANS_INCREASE_THRESHOLD and each_author[ - 'fansRate'][c_index]['rate'] > each_author['fansRate'][ - index]['rate'] * WTF_INCREASE: - insert_event(Event.increase_3) - print(Event.increase_3.value + print_data(each_author)) - - elif each_author['fansRate'][c_index][ - 'rate'] > FANS_INCREASE_THRESHOLD and each_author[ - 'fansRate'][c_index]['rate'] > each_author['fansRate'][ - index]['rate'] * AMAZING_INCREASE: - insert_event(Event.increase_2) - print(Event.increase_2.value + print_data(each_author)) - - elif each_author['fansRate'][c_index][ - 'rate'] > FANS_INCREASE_THRESHOLD and 
each_author[ - 'fansRate'][c_index]['rate'] > each_author['fansRate'][ - index]['rate'] * MAGNIFICATION_INCREASE: - insert_event(Event.increase_1) - print(Event.increase_1.value + print_data(each_author)) - - # 突然出现大量的掉粉 - if each_author['fansRate'][c_index][ - 'rate'] < FANS_DECREASE_THRESHOLD and each_author[ - 'fansRate'][index]['rate'] > WATCH_DECREASE: - insert_event(Event.sudden_fall) - print(Event.sudden_fall.value + print_data(each_author)) - # 一掉再掉 - elif each_author['fansRate'][c_index][ - 'rate'] < FANS_DECREASE_THRESHOLD and abs( - each_author['fansRate'][c_index]['rate'] - ) > abs( - each_author['fansRate'][index]['rate']) * WTF_DECREASE: - insert_event(Event.decrease_3) - print(Event.decrease_3.value + print_data(each_author)) - # 一掉再掉 - elif each_author['fansRate'][c_index][ - 'rate'] < FANS_DECREASE_THRESHOLD and abs( - each_author['fansRate'][c_index]['rate']) > abs( - each_author['fansRate'][index] - ['rate']) * AMAZING_DECREASE: - insert_event(Event.decrease_2) - print(Event.decrease_2.value + print_data(each_author)) - # 一掉再掉 - elif each_author['fansRate'][c_index][ - 'rate'] < FANS_DECREASE_THRESHOLD and abs( - each_author['fansRate'][c_index]['rate']) > abs( - each_author['fansRate'][index] - ['rate']) * MAGNIFICATION_DECREASE: - insert_event(Event.decrease_1) - print(Event.decrease_1.value + print_data(each_author)) - - index += 1 +from scipy.interpolate import interp1d + +author_coll = db['author'] # 获得collection的句柄 +video_coll = db['video'] # 获得collection的句柄 +fans_variation_coll = db['fans_variation'] # 获得collection的句柄 + + +def dateRange(beginDate, endDate): + dates = [] + date = beginDate + while date <= endDate: + dates.append(date.date()) + date += datetime.timedelta(1) + return dates + + +class FansWatcher(object): + def insert_event(self, delta_rate, d_daily, author, info, date): + print('变化率:{}% \n单日涨幅:{} \nUP主:{} \n信息:{}\n日期:{}\n\n'.format( + delta_rate, d_daily, author['name'], info, date)) + + out_data = { + 'variation': int(d_daily), + 'mid': author['mid'], + 'author': author['name'], + 'face': author['face'], + 'deltaRate': delta_rate, + 'datetime': date.strftime("%Y-%m-%d"), + 'info': info, + } + + videos = video_coll.find({'mid': author['mid']}) + temp_video = {} + cause = {'type': 'video'} + for each_v in videos: + # 相差一日之内 + if (date - each_v['datetime']).days <= 2: + temp_video['aid'] = each_v['aid'] + temp_video['title'] = each_v['title'] + temp_video['pic'] = each_v['pic'] + temp_video['cView'] = each_v['data'][0]['view'] + temp_video['channel'] = each_v['channel'] + temp_video['subChannel'] = each_v['subChannel'] + if 'cView' not in temp_video or 'aid' not in cause or temp_video[ + 'cView'] > cause['cView']: + cause['aid'] = temp_video['aid'] + cause['title'] = temp_video['title'] + cause['pic'] = temp_video['pic'] + cause['cView'] = temp_video['cView'] + cause['channel'] = temp_video['channel'] + cause['subChannel'] = temp_video['subChannel'] + + if cause != {'type': 'video'}: + out_data['cause'] = cause + fans_variation_coll.insert(out_data) + + def judge(self, author): + ''' + 一共有这样几种可能: + 1、 大量涨粉 日涨粉数超过上周平均的10倍 + 2、 史诗级涨粉 日涨粉数超过上周平均的50倍 + 3、 传说级涨粉 日涨粉数超过上周平均的100倍 + 4、 急转直下 上升轨道中的UP主突然掉粉 + 5、 大量掉粉 每日掉粉数突破5K + 6、 雪崩级掉粉 每日掉粉数突破2W + 7、 末日级掉粉 每日掉粉数突破5W + 8、 新星爆发 日涨粉超过粉丝总数的20% + ''' + + data = sorted(author['data'], key=lambda x: x['datetime']) + start_date = data[0]['datetime'].timestamp() + end_date = data[-1]['datetime'].timestamp() + x = [] + y = [] + for each in data: + x.append(each['datetime'].timestamp()) + y.append(each['fans']) + # 线性插值 + 
interrupted_fans = interp1d(x, y, kind='linear') + temp_date = datetime.datetime.fromtimestamp(start_date) + c_date = datetime.datetime( + temp_date.year, temp_date.month, temp_date.day).timestamp() + 86400 * 3 + while (c_date <= end_date): + date = datetime.datetime.fromtimestamp(c_date) + daily_array = interrupted_fans([c_date - 86400, c_date]) + + p_daily_array = interrupted_fans( + [c_date - 86400 * 2, c_date - 86400]) + # 24小时前涨粉数 + pd_daily = p_daily_array[1] - p_daily_array[0] + + # 每日涨粉数 + d_daily = daily_array[1] - daily_array[0] + if (d_daily >= 3000 or d_daily <= -2000): + + delta_rate = round(d_daily / pd_daily * 100, 2) + if (d_daily >= daily_array[1] * 0.20): + self.insert_event(round(d_daily/daily_array[1]*100, 2), d_daily, + author, '新星爆发', date) + + if (d_daily <= 0 and pd_daily >= 0): + self.insert_event('-', d_daily, + author, '急转直下', date) + c_date += 86400 + continue + + if (d_daily <= -50000): + # 每日掉粉数突破5K + self.insert_event(delta_rate, d_daily, + author, '末日级掉粉', date) + pass + elif (d_daily <= -20000): + # 每日掉粉数突破2W + self.insert_event(delta_rate, d_daily, + author, '雪崩级掉粉', date) + pass + elif (d_daily <= -5000): + # 每日掉粉数突破5W + self.insert_event(delta_rate, d_daily, + author, '大量掉粉', date) + pass + + if (c_date >= start_date + 86400 * 8): + weekly_array = interrupted_fans([ + c_date - 86400 * 8, c_date - 86400]) + # 上月平均涨粉数 + weekly_mean = (weekly_array[1] - weekly_array[0]) / 7 + # 上周平均涨粉数 + delta_rate = round(d_daily / weekly_mean * 100, 2) + if delta_rate >= 10000: + # 日涨粉数超过上日的100倍 + self.insert_event(delta_rate, d_daily, + author, '传说级涨粉', date) + pass + elif delta_rate >= 5000: + # 日涨粉数超过上日的50倍 + self.insert_event(delta_rate, d_daily, + author, '史诗级涨粉', date) + pass + elif delta_rate >= 1000: + # 日涨粉数超过上日的10倍 + self.insert_event(delta_rate, d_daily, + author, '大量涨粉', date) + pass + + c_date += 86400 + pass + + def watchAllAuthor(self): + for each_author in author_coll.find({'data':{'$exists':True}}).batch_size(40): + self.judge(each_author) + pass + + +FansWatcher().watchAllAuthor() From 6d1a893f202a565998a8109ea3f49fda496970ea Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 8 Mar 2019 23:17:20 +0800 Subject: [PATCH 224/469] refactor: fans variation --- biliob_analyzer/author_fans_watcher.py | 302 +++++++++++++------------ 1 file changed, 155 insertions(+), 147 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index 6adcdf4..32e1439 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -2,150 +2,158 @@ from db import db import datetime from enum import Enum -coll = db['author'] # 获得collection的句柄 -event = db['event'] # 获得collection的句柄 -video = db['video'] # 获得collection的句柄 - -# 对于上升趋势的UP主,较上日增加多少倍,才算是巨量涨粉 -WTF_INCREASE = 10 - -# 对于上升趋势的UP主,较上日增加多少倍,才算是超量涨粉 -AMAZING_INCREASE = 5 - -# 对于上升趋势的UP主,较上日增加多少倍,才算是大量涨粉 -MAGNIFICATION_INCREASE = 3 - -# 对于下降趋势的UP主,较上日减少多少倍,才算是大量掉粉 -MAGNIFICATION_DECREASE = 2 - -# 对于上升趋势的UP主,较上日增加多少倍,才算是超量涨粉 -AMAZING_DECREASE = 5 - -# 对于下降趋势的UP主,较上日减少多少倍,才算是巨量掉粉 -WTF_DECREASE = 8 - -# 粉丝增加多少,才算大量涨粉 -FANS_INCREASE_THRESHOLD = 8000 -# 粉丝减少多少,算作大量掉粉 -FANS_DECREASE_THRESHOLD = -3000 -# 多少粉丝以上才关注掉粉 -WATCH_DECREASE = 1000 - - -class Event(Enum): - increase_1 = 'I级增长' - increase_2 = 'II级猛增' - increase_3 = 'III级激增' - sudden_fall = 'SF级骤减' - decrease_1 = 'I级减少' - decrease_2 = 'II级锐减' - decrease_3 = 'III级暴减' - - -last_datetime = datetime.datetime(2000, 1, 1) -print('开始捕捉事件') -if event.count() != 0: - last_datetime = 
next(event.find().sort([('datetime', - -1)]).limit(1))['datetime'] - -for each_author in coll.find().batch_size(8): - if 'fansRate' in each_author and len(each_author['fansRate']) > 1: - index = 1 - - def print_data(each_author): - return '{name},速率:{rate},时间:{datetime}'.format( - name=each_author['name'], - rate=each_author['fansRate'][c_index]['rate'], - datetime=each_author['fansRate'][c_index]['datetime']) - - def insert_event(event_type): - videos = video.find({'mid': each_author['mid']}) - temp_video = {} - cause = {'type': 'video'} - for each_v in videos: - # 相差一日之内 - if (each_author['fansRate'][c_index]['datetime'] - - each_v['datetime']).days <= 1: - temp_video['aid'] = each_v['aid'] - temp_video['title'] = each_v['title'] - temp_video['cView'] = each_v['data'][0]['view'] - if 'cView' not in temp_video or 'aid' not in cause or temp_video[ - 'cView'] > cause['cView']: - cause['aid'] = temp_video['aid'] - cause['title'] = temp_video['title'] - cause['cView'] = temp_video['cView'] - - event.insert_one({ - 'type': - event_type.value, - 'mid': - each_author['mid'], - 'author': - each_author['name'], - 'rate': - each_author['fansRate'][c_index]['rate'], - 'datetime': - each_author['fansRate'][c_index]['datetime'], - 'cause': - cause - }) - - while index < len(each_author['fansRate']): - c_datetime = each_author['fansRate'][index]['datetime'] - if c_datetime <= last_datetime: - break - # 涨粉超高 - c_index = index - 1 - if each_author['fansRate'][c_index][ - 'rate'] > FANS_INCREASE_THRESHOLD and each_author[ - 'fansRate'][c_index]['rate'] > each_author['fansRate'][ - index]['rate'] * WTF_INCREASE: - insert_event(Event.increase_3) - print(Event.increase_3.value + print_data(each_author)) - - elif each_author['fansRate'][c_index][ - 'rate'] > FANS_INCREASE_THRESHOLD and each_author[ - 'fansRate'][c_index]['rate'] > each_author['fansRate'][ - index]['rate'] * AMAZING_INCREASE: - insert_event(Event.increase_2) - print(Event.increase_2.value + print_data(each_author)) - - elif each_author['fansRate'][c_index][ - 'rate'] > FANS_INCREASE_THRESHOLD and each_author[ - 'fansRate'][c_index]['rate'] > each_author['fansRate'][ - index]['rate'] * MAGNIFICATION_INCREASE: - insert_event(Event.increase_1) - print(Event.increase_1.value + print_data(each_author)) - - # 突然出现大量的掉粉 - if each_author['fansRate'][c_index][ - 'rate'] < FANS_DECREASE_THRESHOLD and each_author[ - 'fansRate'][index]['rate'] > WATCH_DECREASE: - insert_event(Event.sudden_fall) - print(Event.sudden_fall.value + print_data(each_author)) - # 一掉再掉 - elif each_author['fansRate'][c_index][ - 'rate'] < FANS_DECREASE_THRESHOLD and abs( - each_author['fansRate'][c_index]['rate'] - ) > abs( - each_author['fansRate'][index]['rate']) * WTF_DECREASE: - insert_event(Event.decrease_3) - print(Event.decrease_3.value + print_data(each_author)) - # 一掉再掉 - elif each_author['fansRate'][c_index][ - 'rate'] < FANS_DECREASE_THRESHOLD and abs( - each_author['fansRate'][c_index]['rate']) > abs( - each_author['fansRate'][index] - ['rate']) * AMAZING_DECREASE: - insert_event(Event.decrease_2) - print(Event.decrease_2.value + print_data(each_author)) - # 一掉再掉 - elif each_author['fansRate'][c_index][ - 'rate'] < FANS_DECREASE_THRESHOLD and abs( - each_author['fansRate'][c_index]['rate']) > abs( - each_author['fansRate'][index] - ['rate']) * MAGNIFICATION_DECREASE: - insert_event(Event.decrease_1) - print(Event.decrease_1.value + print_data(each_author)) - - index += 1 +from scipy.interpolate import interp1d + +author_coll = db['author'] # 获得collection的句柄 +video_coll 
= db['video'] # 获得collection的句柄 +fans_variation_coll = db['fans_variation'] # 获得collection的句柄 + + +def dateRange(beginDate, endDate): + dates = [] + date = beginDate + while date <= endDate: + dates.append(date.date()) + date += datetime.timedelta(1) + return dates + + +class FansWatcher(object): + def insert_event(self, delta_rate, d_daily, author, info, date): + print('变化率:{}% \n单日涨幅:{} \nUP主:{} \n信息:{}\n日期:{}\n\n'.format( + delta_rate, d_daily, author['name'], info, date)) + + out_data = { + 'variation': int(d_daily), + 'mid': author['mid'], + 'author': author['name'], + 'face': author['face'], + 'deltaRate': delta_rate, + 'datetime': date.strftime("%Y-%m-%d"), + 'info': info, + } + + videos = video_coll.find({'mid': author['mid']}) + temp_video = {} + cause = {'type': 'video'} + for each_v in videos: + # 相差一日之内 + if (date - each_v['datetime']).days <= 2: + temp_video['aid'] = each_v['aid'] + temp_video['title'] = each_v['title'] + temp_video['pic'] = each_v['pic'] + temp_video['cView'] = each_v['data'][0]['view'] + temp_video['channel'] = each_v['channel'] + temp_video['subChannel'] = each_v['subChannel'] + if 'cView' not in temp_video or 'aid' not in cause or temp_video[ + 'cView'] > cause['cView']: + cause['aid'] = temp_video['aid'] + cause['title'] = temp_video['title'] + cause['pic'] = temp_video['pic'] + cause['cView'] = temp_video['cView'] + cause['channel'] = temp_video['channel'] + cause['subChannel'] = temp_video['subChannel'] + + if cause != {'type': 'video'}: + out_data['cause'] = cause + fans_variation_coll.insert(out_data) + + def judge(self, author): + ''' + 一共有这样几种可能: + 1、 大量涨粉 日涨粉数超过上周平均的10倍 + 2、 史诗级涨粉 日涨粉数超过上周平均的50倍 + 3、 传说级涨粉 日涨粉数超过上周平均的100倍 + 4、 急转直下 上升轨道中的UP主突然掉粉 + 5、 大量掉粉 每日掉粉数突破5K + 6、 雪崩级掉粉 每日掉粉数突破2W + 7、 末日级掉粉 每日掉粉数突破5W + 8、 新星爆发 日涨粉超过粉丝总数的20% + ''' + + data = sorted(author['data'], key=lambda x: x['datetime']) + start_date = data[0]['datetime'].timestamp() + end_date = data[-1]['datetime'].timestamp() + x = [] + y = [] + for each in data: + x.append(each['datetime'].timestamp()) + y.append(each['fans']) + # 线性插值 + interrupted_fans = interp1d(x, y, kind='linear') + temp_date = datetime.datetime.fromtimestamp(start_date) + c_date = datetime.datetime( + temp_date.year, temp_date.month, temp_date.day).timestamp() + 86400 * 3 + while (c_date <= end_date): + date = datetime.datetime.fromtimestamp(c_date) + daily_array = interrupted_fans([c_date - 86400, c_date]) + + p_daily_array = interrupted_fans( + [c_date - 86400 * 2, c_date - 86400]) + # 24小时前涨粉数 + pd_daily = p_daily_array[1] - p_daily_array[0] + + # 每日涨粉数 + d_daily = daily_array[1] - daily_array[0] + if (d_daily >= 3000 or d_daily <= -2000): + + delta_rate = round(d_daily / pd_daily * 100, 2) + if (d_daily >= daily_array[1] * 0.20): + self.insert_event(round(d_daily/daily_array[1]*100, 2), d_daily, + author, '新星爆发', date) + + if (d_daily <= 0 and pd_daily >= 0): + self.insert_event('-', d_daily, + author, '急转直下', date) + c_date += 86400 + continue + + if (d_daily <= -50000): + # 每日掉粉数突破5K + self.insert_event(delta_rate, d_daily, + author, '末日级掉粉', date) + pass + elif (d_daily <= -20000): + # 每日掉粉数突破2W + self.insert_event(delta_rate, d_daily, + author, '雪崩级掉粉', date) + pass + elif (d_daily <= -5000): + # 每日掉粉数突破5W + self.insert_event(delta_rate, d_daily, + author, '大量掉粉', date) + pass + + if (c_date >= start_date + 86400 * 8): + weekly_array = interrupted_fans([ + c_date - 86400 * 8, c_date - 86400]) + # 上月平均涨粉数 + weekly_mean = (weekly_array[1] - weekly_array[0]) / 7 + # 上周平均涨粉数 + delta_rate = round(d_daily / 
weekly_mean * 100, 2) + if delta_rate >= 10000: + # 日涨粉数超过上日的100倍 + self.insert_event(delta_rate, d_daily, + author, '传说级涨粉', date) + pass + elif delta_rate >= 5000: + # 日涨粉数超过上日的50倍 + self.insert_event(delta_rate, d_daily, + author, '史诗级涨粉', date) + pass + elif delta_rate >= 1000: + # 日涨粉数超过上日的10倍 + self.insert_event(delta_rate, d_daily, + author, '大量涨粉', date) + pass + + c_date += 86400 + pass + + def watchAllAuthor(self): + for each_author in author_coll.find({'data':{'$exists':True}}).batch_size(40): + self.judge(each_author) + pass + + +FansWatcher().watchAllAuthor() From 7a36b2f55c78deaee57e7c1c00a857265f65de6f Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 10 Mar 2019 23:46:59 +0800 Subject: [PATCH 225/469] update fans watcher --- biliob_analyzer/author_fans_watcher.py | 36 ++++++++++++++++---------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index 32e1439..8c4073b 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -38,7 +38,7 @@ def insert_event(self, delta_rate, d_daily, author, info, date): cause = {'type': 'video'} for each_v in videos: # 相差一日之内 - if (date - each_v['datetime']).days <= 2: + if (date - each_v['datetime']).days >= -1 and (date - each_v['datetime']).days <= 7: temp_video['aid'] = each_v['aid'] temp_video['title'] = each_v['title'] temp_video['pic'] = each_v['pic'] @@ -56,14 +56,15 @@ def insert_event(self, delta_rate, d_daily, author, info, date): if cause != {'type': 'video'}: out_data['cause'] = cause - fans_variation_coll.insert(out_data) + fans_variation_coll.replace_one( + {'mid': out_data['mid'], 'datetime': out_data['datetime']}, out_data, upsert=True) - def judge(self, author): + def judge(self, author, c_date=None): ''' 一共有这样几种可能: - 1、 大量涨粉 日涨粉数超过上周平均的10倍 - 2、 史诗级涨粉 日涨粉数超过上周平均的50倍 - 3、 传说级涨粉 日涨粉数超过上周平均的100倍 + 1、 大量涨粉 日涨粉数超过上周平均的25倍 + 2、 史诗级涨粉 日涨粉数超过上周平均的50倍或单日涨粉超过10W + 3、 传说级涨粉 日涨粉数超过上周平均的100倍或单日涨粉超过20W 4、 急转直下 上升轨道中的UP主突然掉粉 5、 大量掉粉 每日掉粉数突破5K 6、 雪崩级掉粉 每日掉粉数突破2W @@ -79,15 +80,19 @@ def judge(self, author): for each in data: x.append(each['datetime'].timestamp()) y.append(each['fans']) + if len(x) <= 1: + return # 线性插值 interrupted_fans = interp1d(x, y, kind='linear') temp_date = datetime.datetime.fromtimestamp(start_date) - c_date = datetime.datetime( - temp_date.year, temp_date.month, temp_date.day).timestamp() + 86400 * 3 + if c_date == None: + c_date = datetime.datetime( + temp_date.year, temp_date.month, temp_date.day).timestamp() + 86400 * 3 + if c_date - 86400 * 2 <= start_date: + return while (c_date <= end_date): date = datetime.datetime.fromtimestamp(c_date) daily_array = interrupted_fans([c_date - 86400, c_date]) - p_daily_array = interrupted_fans( [c_date - 86400 * 2, c_date - 86400]) # 24小时前涨粉数 @@ -95,6 +100,7 @@ def judge(self, author): # 每日涨粉数 d_daily = daily_array[1] - daily_array[0] + if (d_daily >= 3000 or d_daily <= -2000): delta_rate = round(d_daily / pd_daily * 100, 2) @@ -131,18 +137,18 @@ def judge(self, author): weekly_mean = (weekly_array[1] - weekly_array[0]) / 7 # 上周平均涨粉数 delta_rate = round(d_daily / weekly_mean * 100, 2) - if delta_rate >= 10000: + if delta_rate >= 10000 or (d_daily >= 200000 and delta_rate > 0): # 日涨粉数超过上日的100倍 self.insert_event(delta_rate, d_daily, author, '传说级涨粉', date) pass - elif delta_rate >= 5000: + elif delta_rate >= 5000 or (d_daily >= 100000 and delta_rate > 0): # 日涨粉数超过上日的50倍 self.insert_event(delta_rate, d_daily, author, '史诗级涨粉', date) pass - elif 
delta_rate >= 1000: - # 日涨粉数超过上日的10倍 + elif delta_rate >= 2500: + # 日涨粉数超过上日的25倍 self.insert_event(delta_rate, d_daily, author, '大量涨粉', date) pass @@ -151,7 +157,9 @@ def judge(self, author): pass def watchAllAuthor(self): - for each_author in author_coll.find({'data':{'$exists':True}}).batch_size(40): + start_date = (datetime.datetime.now() - + datetime.timedelta(1)).timestamp() + for each_author in author_coll.find({'data': {'$exists': True}}).batch_size(40): self.judge(each_author) pass From 94c0893f3b01048ac928ef9ffc6d17ffd93080ff Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 10 Mar 2019 23:46:59 +0800 Subject: [PATCH 226/469] update fans watcher --- biliob_analyzer/author_fans_watcher.py | 36 ++++++++++++++++---------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index 32e1439..8c4073b 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -38,7 +38,7 @@ def insert_event(self, delta_rate, d_daily, author, info, date): cause = {'type': 'video'} for each_v in videos: # 相差一日之内 - if (date - each_v['datetime']).days <= 2: + if (date - each_v['datetime']).days >= -1 and (date - each_v['datetime']).days <= 7: temp_video['aid'] = each_v['aid'] temp_video['title'] = each_v['title'] temp_video['pic'] = each_v['pic'] @@ -56,14 +56,15 @@ def insert_event(self, delta_rate, d_daily, author, info, date): if cause != {'type': 'video'}: out_data['cause'] = cause - fans_variation_coll.insert(out_data) + fans_variation_coll.replace_one( + {'mid': out_data['mid'], 'datetime': out_data['datetime']}, out_data, upsert=True) - def judge(self, author): + def judge(self, author, c_date=None): ''' 一共有这样几种可能: - 1、 大量涨粉 日涨粉数超过上周平均的10倍 - 2、 史诗级涨粉 日涨粉数超过上周平均的50倍 - 3、 传说级涨粉 日涨粉数超过上周平均的100倍 + 1、 大量涨粉 日涨粉数超过上周平均的25倍 + 2、 史诗级涨粉 日涨粉数超过上周平均的50倍或单日涨粉超过10W + 3、 传说级涨粉 日涨粉数超过上周平均的100倍或单日涨粉超过20W 4、 急转直下 上升轨道中的UP主突然掉粉 5、 大量掉粉 每日掉粉数突破5K 6、 雪崩级掉粉 每日掉粉数突破2W @@ -79,15 +80,19 @@ def judge(self, author): for each in data: x.append(each['datetime'].timestamp()) y.append(each['fans']) + if len(x) <= 1: + return # 线性插值 interrupted_fans = interp1d(x, y, kind='linear') temp_date = datetime.datetime.fromtimestamp(start_date) - c_date = datetime.datetime( - temp_date.year, temp_date.month, temp_date.day).timestamp() + 86400 * 3 + if c_date == None: + c_date = datetime.datetime( + temp_date.year, temp_date.month, temp_date.day).timestamp() + 86400 * 3 + if c_date - 86400 * 2 <= start_date: + return while (c_date <= end_date): date = datetime.datetime.fromtimestamp(c_date) daily_array = interrupted_fans([c_date - 86400, c_date]) - p_daily_array = interrupted_fans( [c_date - 86400 * 2, c_date - 86400]) # 24小时前涨粉数 @@ -95,6 +100,7 @@ def judge(self, author): # 每日涨粉数 d_daily = daily_array[1] - daily_array[0] + if (d_daily >= 3000 or d_daily <= -2000): delta_rate = round(d_daily / pd_daily * 100, 2) @@ -131,18 +137,18 @@ def judge(self, author): weekly_mean = (weekly_array[1] - weekly_array[0]) / 7 # 上周平均涨粉数 delta_rate = round(d_daily / weekly_mean * 100, 2) - if delta_rate >= 10000: + if delta_rate >= 10000 or (d_daily >= 200000 and delta_rate > 0): # 日涨粉数超过上日的100倍 self.insert_event(delta_rate, d_daily, author, '传说级涨粉', date) pass - elif delta_rate >= 5000: + elif delta_rate >= 5000 or (d_daily >= 100000 and delta_rate > 0): # 日涨粉数超过上日的50倍 self.insert_event(delta_rate, d_daily, author, '史诗级涨粉', date) pass - elif delta_rate >= 1000: - # 日涨粉数超过上日的10倍 + elif delta_rate >= 2500: + # 日涨粉数超过上日的25倍 
self.insert_event(delta_rate, d_daily, author, '大量涨粉', date) pass @@ -151,7 +157,9 @@ def judge(self, author): pass def watchAllAuthor(self): - for each_author in author_coll.find({'data':{'$exists':True}}).batch_size(40): + start_date = (datetime.datetime.now() - + datetime.timedelta(1)).timestamp() + for each_author in author_coll.find({'data': {'$exists': True}}).batch_size(40): self.judge(each_author) pass From 70c803210e4af75cd0f65839f8ba732cdd39392c Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 10 Mar 2019 23:46:59 +0800 Subject: [PATCH 227/469] update fans watcher --- biliob_analyzer/author_fans_watcher.py | 36 ++++++++++++++++---------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index 32e1439..8c4073b 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -38,7 +38,7 @@ def insert_event(self, delta_rate, d_daily, author, info, date): cause = {'type': 'video'} for each_v in videos: # 相差一日之内 - if (date - each_v['datetime']).days <= 2: + if (date - each_v['datetime']).days >= -1 and (date - each_v['datetime']).days <= 7: temp_video['aid'] = each_v['aid'] temp_video['title'] = each_v['title'] temp_video['pic'] = each_v['pic'] @@ -56,14 +56,15 @@ def insert_event(self, delta_rate, d_daily, author, info, date): if cause != {'type': 'video'}: out_data['cause'] = cause - fans_variation_coll.insert(out_data) + fans_variation_coll.replace_one( + {'mid': out_data['mid'], 'datetime': out_data['datetime']}, out_data, upsert=True) - def judge(self, author): + def judge(self, author, c_date=None): ''' 一共有这样几种可能: - 1、 大量涨粉 日涨粉数超过上周平均的10倍 - 2、 史诗级涨粉 日涨粉数超过上周平均的50倍 - 3、 传说级涨粉 日涨粉数超过上周平均的100倍 + 1、 大量涨粉 日涨粉数超过上周平均的25倍 + 2、 史诗级涨粉 日涨粉数超过上周平均的50倍或单日涨粉超过10W + 3、 传说级涨粉 日涨粉数超过上周平均的100倍或单日涨粉超过20W 4、 急转直下 上升轨道中的UP主突然掉粉 5、 大量掉粉 每日掉粉数突破5K 6、 雪崩级掉粉 每日掉粉数突破2W @@ -79,15 +80,19 @@ def judge(self, author): for each in data: x.append(each['datetime'].timestamp()) y.append(each['fans']) + if len(x) <= 1: + return # 线性插值 interrupted_fans = interp1d(x, y, kind='linear') temp_date = datetime.datetime.fromtimestamp(start_date) - c_date = datetime.datetime( - temp_date.year, temp_date.month, temp_date.day).timestamp() + 86400 * 3 + if c_date == None: + c_date = datetime.datetime( + temp_date.year, temp_date.month, temp_date.day).timestamp() + 86400 * 3 + if c_date - 86400 * 2 <= start_date: + return while (c_date <= end_date): date = datetime.datetime.fromtimestamp(c_date) daily_array = interrupted_fans([c_date - 86400, c_date]) - p_daily_array = interrupted_fans( [c_date - 86400 * 2, c_date - 86400]) # 24小时前涨粉数 @@ -95,6 +100,7 @@ def judge(self, author): # 每日涨粉数 d_daily = daily_array[1] - daily_array[0] + if (d_daily >= 3000 or d_daily <= -2000): delta_rate = round(d_daily / pd_daily * 100, 2) @@ -131,18 +137,18 @@ def judge(self, author): weekly_mean = (weekly_array[1] - weekly_array[0]) / 7 # 上周平均涨粉数 delta_rate = round(d_daily / weekly_mean * 100, 2) - if delta_rate >= 10000: + if delta_rate >= 10000 or (d_daily >= 200000 and delta_rate > 0): # 日涨粉数超过上日的100倍 self.insert_event(delta_rate, d_daily, author, '传说级涨粉', date) pass - elif delta_rate >= 5000: + elif delta_rate >= 5000 or (d_daily >= 100000 and delta_rate > 0): # 日涨粉数超过上日的50倍 self.insert_event(delta_rate, d_daily, author, '史诗级涨粉', date) pass - elif delta_rate >= 1000: - # 日涨粉数超过上日的10倍 + elif delta_rate >= 2500: + # 日涨粉数超过上日的25倍 self.insert_event(delta_rate, d_daily, author, '大量涨粉', date) pass @@ -151,7 +157,9 @@ def 
judge(self, author): pass def watchAllAuthor(self): - for each_author in author_coll.find({'data':{'$exists':True}}).batch_size(40): + start_date = (datetime.datetime.now() - + datetime.timedelta(1)).timestamp() + for each_author in author_coll.find({'data': {'$exists': True}}).batch_size(40): self.judge(each_author) pass From 41edc3710bf3af32c1b4f3f5a2f92d5b467b2f65 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 11 Mar 2019 22:59:33 +0800 Subject: [PATCH 228/469] update --- biliob_analyzer/author_fans_watcher.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index 8c4073b..f4e5ce3 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -59,7 +59,7 @@ def insert_event(self, delta_rate, d_daily, author, info, date): fans_variation_coll.replace_one( {'mid': out_data['mid'], 'datetime': out_data['datetime']}, out_data, upsert=True) - def judge(self, author, c_date=None): + def judge(self, author): ''' 一共有这样几种可能: 1、 大量涨粉 日涨粉数超过上周平均的25倍 @@ -85,9 +85,8 @@ def judge(self, author, c_date=None): # 线性插值 interrupted_fans = interp1d(x, y, kind='linear') temp_date = datetime.datetime.fromtimestamp(start_date) - if c_date == None: - c_date = datetime.datetime( - temp_date.year, temp_date.month, temp_date.day).timestamp() + 86400 * 3 + c_date = datetime.datetime( + temp_date.year, temp_date.month, temp_date.day).timestamp() + 86400 * 3 if c_date - 86400 * 2 <= start_date: return while (c_date <= end_date): @@ -157,8 +156,6 @@ def judge(self, author, c_date=None): pass def watchAllAuthor(self): - start_date = (datetime.datetime.now() - - datetime.timedelta(1)).timestamp() for each_author in author_coll.find({'data': {'$exists': True}}).batch_size(40): self.judge(each_author) pass From 975fc1e59c280bef7979c98d77fbd134cdc903a8 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 11 Mar 2019 22:59:33 +0800 Subject: [PATCH 229/469] update --- biliob_analyzer/author_fans_watcher.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index 8c4073b..f4e5ce3 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -59,7 +59,7 @@ def insert_event(self, delta_rate, d_daily, author, info, date): fans_variation_coll.replace_one( {'mid': out_data['mid'], 'datetime': out_data['datetime']}, out_data, upsert=True) - def judge(self, author, c_date=None): + def judge(self, author): ''' 一共有这样几种可能: 1、 大量涨粉 日涨粉数超过上周平均的25倍 @@ -85,9 +85,8 @@ def judge(self, author, c_date=None): # 线性插值 interrupted_fans = interp1d(x, y, kind='linear') temp_date = datetime.datetime.fromtimestamp(start_date) - if c_date == None: - c_date = datetime.datetime( - temp_date.year, temp_date.month, temp_date.day).timestamp() + 86400 * 3 + c_date = datetime.datetime( + temp_date.year, temp_date.month, temp_date.day).timestamp() + 86400 * 3 if c_date - 86400 * 2 <= start_date: return while (c_date <= end_date): @@ -157,8 +156,6 @@ def judge(self, author, c_date=None): pass def watchAllAuthor(self): - start_date = (datetime.datetime.now() - - datetime.timedelta(1)).timestamp() for each_author in author_coll.find({'data': {'$exists': True}}).batch_size(40): self.judge(each_author) pass From 1bdf2e3a025f06ebc9a5b6a0012e9727c8a0f6fc Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 11 Mar 2019 22:59:33 +0800 Subject: [PATCH 230/469] update --- 
biliob_analyzer/author_fans_watcher.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index 8c4073b..f4e5ce3 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -59,7 +59,7 @@ def insert_event(self, delta_rate, d_daily, author, info, date): fans_variation_coll.replace_one( {'mid': out_data['mid'], 'datetime': out_data['datetime']}, out_data, upsert=True) - def judge(self, author, c_date=None): + def judge(self, author): ''' 一共有这样几种可能: 1、 大量涨粉 日涨粉数超过上周平均的25倍 @@ -85,9 +85,8 @@ def judge(self, author, c_date=None): # 线性插值 interrupted_fans = interp1d(x, y, kind='linear') temp_date = datetime.datetime.fromtimestamp(start_date) - if c_date == None: - c_date = datetime.datetime( - temp_date.year, temp_date.month, temp_date.day).timestamp() + 86400 * 3 + c_date = datetime.datetime( + temp_date.year, temp_date.month, temp_date.day).timestamp() + 86400 * 3 if c_date - 86400 * 2 <= start_date: return while (c_date <= end_date): @@ -157,8 +156,6 @@ def judge(self, author, c_date=None): pass def watchAllAuthor(self): - start_date = (datetime.datetime.now() - - datetime.timedelta(1)).timestamp() for each_author in author_coll.find({'data': {'$exists': True}}).batch_size(40): self.judge(each_author) pass From 73c7a964946923203cb5bf4f8b4b439ecb0f1c12 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 12 Mar 2019 20:39:17 +0800 Subject: [PATCH 231/469] fix bug --- biliob_analyzer/author_fans_watcher.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index f4e5ce3..b0184aa 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -94,6 +94,7 @@ def judge(self, author): daily_array = interrupted_fans([c_date - 86400, c_date]) p_daily_array = interrupted_fans( [c_date - 86400 * 2, c_date - 86400]) + # 24小时前涨粉数 pd_daily = p_daily_array[1] - p_daily_array[0] @@ -129,19 +130,19 @@ def judge(self, author): author, '大量掉粉', date) pass - if (c_date >= start_date + 86400 * 8): + if (c_date >= start_date + 86400 * 8 and delta_rate > 0): weekly_array = interrupted_fans([ c_date - 86400 * 8, c_date - 86400]) # 上月平均涨粉数 weekly_mean = (weekly_array[1] - weekly_array[0]) / 7 # 上周平均涨粉数 delta_rate = round(d_daily / weekly_mean * 100, 2) - if delta_rate >= 10000 or (d_daily >= 200000 and delta_rate > 0): + if delta_rate >= 10000 or d_daily >= 200000: # 日涨粉数超过上日的100倍 self.insert_event(delta_rate, d_daily, author, '传说级涨粉', date) pass - elif delta_rate >= 5000 or (d_daily >= 100000 and delta_rate > 0): + elif delta_rate >= 5000 or d_daily >= 100000: # 日涨粉数超过上日的50倍 self.insert_event(delta_rate, d_daily, author, '史诗级涨粉', date) From 5176d4b0a83d39b32f357a5d16c18eef532548ac Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 12 Mar 2019 20:39:17 +0800 Subject: [PATCH 232/469] fix bug --- biliob_analyzer/author_fans_watcher.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index f4e5ce3..b0184aa 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -94,6 +94,7 @@ def judge(self, author): daily_array = interrupted_fans([c_date - 86400, c_date]) p_daily_array = interrupted_fans( [c_date - 86400 * 2, c_date - 86400]) + # 24小时前涨粉数 pd_daily = p_daily_array[1] - p_daily_array[0] @@ -129,19 
+130,19 @@ def judge(self, author): author, '大量掉粉', date) pass - if (c_date >= start_date + 86400 * 8): + if (c_date >= start_date + 86400 * 8 and delta_rate > 0): weekly_array = interrupted_fans([ c_date - 86400 * 8, c_date - 86400]) # 上月平均涨粉数 weekly_mean = (weekly_array[1] - weekly_array[0]) / 7 # 上周平均涨粉数 delta_rate = round(d_daily / weekly_mean * 100, 2) - if delta_rate >= 10000 or (d_daily >= 200000 and delta_rate > 0): + if delta_rate >= 10000 or d_daily >= 200000: # 日涨粉数超过上日的100倍 self.insert_event(delta_rate, d_daily, author, '传说级涨粉', date) pass - elif delta_rate >= 5000 or (d_daily >= 100000 and delta_rate > 0): + elif delta_rate >= 5000 or d_daily >= 100000: # 日涨粉数超过上日的50倍 self.insert_event(delta_rate, d_daily, author, '史诗级涨粉', date) From f39cb8c5503b970c72366b266c0e4582cce60f4c Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 12 Mar 2019 20:39:17 +0800 Subject: [PATCH 233/469] fix bug --- biliob_analyzer/author_fans_watcher.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index f4e5ce3..b0184aa 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -94,6 +94,7 @@ def judge(self, author): daily_array = interrupted_fans([c_date - 86400, c_date]) p_daily_array = interrupted_fans( [c_date - 86400 * 2, c_date - 86400]) + # 24小时前涨粉数 pd_daily = p_daily_array[1] - p_daily_array[0] @@ -129,19 +130,19 @@ def judge(self, author): author, '大量掉粉', date) pass - if (c_date >= start_date + 86400 * 8): + if (c_date >= start_date + 86400 * 8 and delta_rate > 0): weekly_array = interrupted_fans([ c_date - 86400 * 8, c_date - 86400]) # 上月平均涨粉数 weekly_mean = (weekly_array[1] - weekly_array[0]) / 7 # 上周平均涨粉数 delta_rate = round(d_daily / weekly_mean * 100, 2) - if delta_rate >= 10000 or (d_daily >= 200000 and delta_rate > 0): + if delta_rate >= 10000 or d_daily >= 200000: # 日涨粉数超过上日的100倍 self.insert_event(delta_rate, d_daily, author, '传说级涨粉', date) pass - elif delta_rate >= 5000 or (d_daily >= 100000 and delta_rate > 0): + elif delta_rate >= 5000 or d_daily >= 100000: # 日涨粉数超过上日的50倍 self.insert_event(delta_rate, d_daily, author, '史诗级涨粉', date) From e2d45eb7ba4664ec8c405f93c97910f960541378 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 13 Mar 2019 20:14:58 +0800 Subject: [PATCH 234/469] fix bug --- biliob_analyzer/author_fans_watcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index b0184aa..437eba1 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -101,7 +101,7 @@ def judge(self, author): # 每日涨粉数 d_daily = daily_array[1] - daily_array[0] - if (d_daily >= 3000 or d_daily <= -2000): + if (d_daily >= 5000 or d_daily <= -2000): delta_rate = round(d_daily / pd_daily * 100, 2) if (d_daily >= daily_array[1] * 0.20): @@ -130,7 +130,7 @@ def judge(self, author): author, '大量掉粉', date) pass - if (c_date >= start_date + 86400 * 8 and delta_rate > 0): + if (c_date >= start_date + 86400 * 8 and d_daily > 0): weekly_array = interrupted_fans([ c_date - 86400 * 8, c_date - 86400]) # 上月平均涨粉数 From a6d19a182ab12ee9a32552d62c535ebbb7fb9b17 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 13 Mar 2019 20:14:58 +0800 Subject: [PATCH 235/469] fix bug --- biliob_analyzer/author_fans_watcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py 
b/biliob_analyzer/author_fans_watcher.py index b0184aa..437eba1 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -101,7 +101,7 @@ def judge(self, author): # 每日涨粉数 d_daily = daily_array[1] - daily_array[0] - if (d_daily >= 3000 or d_daily <= -2000): + if (d_daily >= 5000 or d_daily <= -2000): delta_rate = round(d_daily / pd_daily * 100, 2) if (d_daily >= daily_array[1] * 0.20): @@ -130,7 +130,7 @@ def judge(self, author): author, '大量掉粉', date) pass - if (c_date >= start_date + 86400 * 8 and delta_rate > 0): + if (c_date >= start_date + 86400 * 8 and d_daily > 0): weekly_array = interrupted_fans([ c_date - 86400 * 8, c_date - 86400]) # 上月平均涨粉数 From 32e1c442a175592a1a5caef4601da554f5339476 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 13 Mar 2019 20:14:58 +0800 Subject: [PATCH 236/469] fix bug --- biliob_analyzer/author_fans_watcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/biliob_analyzer/author_fans_watcher.py b/biliob_analyzer/author_fans_watcher.py index b0184aa..437eba1 100644 --- a/biliob_analyzer/author_fans_watcher.py +++ b/biliob_analyzer/author_fans_watcher.py @@ -101,7 +101,7 @@ def judge(self, author): # 每日涨粉数 d_daily = daily_array[1] - daily_array[0] - if (d_daily >= 3000 or d_daily <= -2000): + if (d_daily >= 5000 or d_daily <= -2000): delta_rate = round(d_daily / pd_daily * 100, 2) if (d_daily >= daily_array[1] * 0.20): @@ -130,7 +130,7 @@ def judge(self, author): author, '大量掉粉', date) pass - if (c_date >= start_date + 86400 * 8 and delta_rate > 0): + if (c_date >= start_date + 86400 * 8 and d_daily > 0): weekly_array = interrupted_fans([ c_date - 86400 * 8, c_date - 86400]) # 上月平均涨粉数 From a7e29caf960334952c8e622e62cd330e4f9f4db0 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 18 Mar 2019 16:16:35 +0800 Subject: [PATCH 237/469] feature: update top two current fans --- biliob_requests/author_update_currentFans.py | 23 ++++++++++++++++++++ biliob_spider/spiders/author_update.py | 1 - run_requests.py | 1 + 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 biliob_requests/author_update_currentFans.py create mode 100644 run_requests.py diff --git a/biliob_requests/author_update_currentFans.py b/biliob_requests/author_update_currentFans.py new file mode 100644 index 0000000..5c8bd46 --- /dev/null +++ b/biliob_requests/author_update_currentFans.py @@ -0,0 +1,23 @@ +import time +import datetime +from db import settings +from pymongo import MongoClient, DESCENDING +import requests + +client = MongoClient(settings['MINGO_HOST'], 27017) +client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + +db = client['biliob'] # 获得数据库的句柄 +coll = db['author'] # 获得collection的句柄 + +URL = 'https://api.bilibili.com/x/web-interface/card?mid={}' +while True: + docs = coll.find({}, {'mid': 1}).sort( + 'cFans', direction=DESCENDING).limit(2) + mids = map(lambda x: x['mid'], docs) + for mid in mids: + j = requests.get(URL.format(mid)).json() + fans = j['data']['card']['fans'] + coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + time.sleep(5) diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 399d341..6fe27c2 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -58,7 +58,6 @@ def parse(self, response): official = j['data']['card']['Official']['title'] archive = j['data']['archive_count'] article = j['data']['article_count'] - face = j['data']['card']['face'] item = AuthorItem() 
item['mid'] = int(mid) item['name'] = name diff --git a/run_requests.py b/run_requests.py new file mode 100644 index 0000000..621844b --- /dev/null +++ b/run_requests.py @@ -0,0 +1 @@ +import biliob_requests.author_update_currentFans From 0da7f98dbf6110dc1432d70fa1a66fee123df3a9 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 18 Mar 2019 16:16:35 +0800 Subject: [PATCH 238/469] feature: update top two current fans --- biliob_requests/author_update_currentFans.py | 23 ++++++++++++++++++++ biliob_spider/spiders/author_update.py | 1 - run_requests.py | 1 + 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 biliob_requests/author_update_currentFans.py create mode 100644 run_requests.py diff --git a/biliob_requests/author_update_currentFans.py b/biliob_requests/author_update_currentFans.py new file mode 100644 index 0000000..5c8bd46 --- /dev/null +++ b/biliob_requests/author_update_currentFans.py @@ -0,0 +1,23 @@ +import time +import datetime +from db import settings +from pymongo import MongoClient, DESCENDING +import requests + +client = MongoClient(settings['MINGO_HOST'], 27017) +client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + +db = client['biliob'] # 获得数据库的句柄 +coll = db['author'] # 获得collection的句柄 + +URL = 'https://api.bilibili.com/x/web-interface/card?mid={}' +while True: + docs = coll.find({}, {'mid': 1}).sort( + 'cFans', direction=DESCENDING).limit(2) + mids = map(lambda x: x['mid'], docs) + for mid in mids: + j = requests.get(URL.format(mid)).json() + fans = j['data']['card']['fans'] + coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + time.sleep(5) diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 399d341..6fe27c2 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -58,7 +58,6 @@ def parse(self, response): official = j['data']['card']['Official']['title'] archive = j['data']['archive_count'] article = j['data']['article_count'] - face = j['data']['card']['face'] item = AuthorItem() item['mid'] = int(mid) item['name'] = name diff --git a/run_requests.py b/run_requests.py new file mode 100644 index 0000000..621844b --- /dev/null +++ b/run_requests.py @@ -0,0 +1 @@ +import biliob_requests.author_update_currentFans From 51ce95d9e29d8c001d7366bf9b812068dd7cd4b6 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Mon, 18 Mar 2019 16:16:35 +0800 Subject: [PATCH 239/469] feature: update top two current fans --- biliob_requests/author_update_currentFans.py | 23 ++++++++++++++++++++ biliob_spider/spiders/author_update.py | 1 - run_requests.py | 1 + 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 biliob_requests/author_update_currentFans.py create mode 100644 run_requests.py diff --git a/biliob_requests/author_update_currentFans.py b/biliob_requests/author_update_currentFans.py new file mode 100644 index 0000000..5c8bd46 --- /dev/null +++ b/biliob_requests/author_update_currentFans.py @@ -0,0 +1,23 @@ +import time +import datetime +from db import settings +from pymongo import MongoClient, DESCENDING +import requests + +client = MongoClient(settings['MINGO_HOST'], 27017) +client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + +db = client['biliob'] # 获得数据库的句柄 +coll = db['author'] # 获得collection的句柄 + +URL = 'https://api.bilibili.com/x/web-interface/card?mid={}' +while True: + docs = coll.find({}, {'mid': 1}).sort( + 'cFans', direction=DESCENDING).limit(2) + mids = map(lambda x: x['mid'], docs) + for mid in mids: 
+ j = requests.get(URL.format(mid)).json() + fans = j['data']['card']['fans'] + coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + time.sleep(5) diff --git a/biliob_spider/spiders/author_update.py b/biliob_spider/spiders/author_update.py index 399d341..6fe27c2 100644 --- a/biliob_spider/spiders/author_update.py +++ b/biliob_spider/spiders/author_update.py @@ -58,7 +58,6 @@ def parse(self, response): official = j['data']['card']['Official']['title'] archive = j['data']['archive_count'] article = j['data']['article_count'] - face = j['data']['card']['face'] item = AuthorItem() item['mid'] = int(mid) item['name'] = name diff --git a/run_requests.py b/run_requests.py new file mode 100644 index 0000000..621844b --- /dev/null +++ b/run_requests.py @@ -0,0 +1 @@ +import biliob_requests.author_update_currentFans From 205a5669262e6914538777c1c17d89e860c0e823 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 20 Mar 2019 16:36:18 +0800 Subject: [PATCH 240/469] update requirements --- requirements.txt | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e7d7d97 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +scrapy_redis==0.6.8 +pymongo==3.7.1 +Scrapy==1.5.1 +requests==2.19.1 +scipy==1.1.0 +jieba==0.39 +schedule==0.5.0 +redis==3.1.0 +PyMySQL==0.8.0 From f80f212f5e816f5de31da6b8dc40de3fb4e7dd82 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 20 Mar 2019 16:36:18 +0800 Subject: [PATCH 241/469] update requirements --- requirements.txt | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e7d7d97 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +scrapy_redis==0.6.8 +pymongo==3.7.1 +Scrapy==1.5.1 +requests==2.19.1 +scipy==1.1.0 +jieba==0.39 +schedule==0.5.0 +redis==3.1.0 +PyMySQL==0.8.0 From cbda57652cddbed6979beb74aa53b99e10a4933d Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 20 Mar 2019 16:36:18 +0800 Subject: [PATCH 242/469] update requirements --- requirements.txt | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e7d7d97 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +scrapy_redis==0.6.8 +pymongo==3.7.1 +Scrapy==1.5.1 +requests==2.19.1 +scipy==1.1.0 +jieba==0.39 +schedule==0.5.0 +redis==3.1.0 +PyMySQL==0.8.0 From 526028ee92801c6a6d71aea98a72a6979d8f8b63 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 20 Mar 2019 16:36:53 +0800 Subject: [PATCH 243/469] add boot script --- run_spider_with_mq.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 run_spider_with_mq.sh diff --git a/run_spider_with_mq.sh b/run_spider_with_mq.sh new file mode 100644 index 0000000..b08431b --- /dev/null +++ b/run_spider_with_mq.sh @@ -0,0 +1,10 @@ +source ~/biliob-spider-env/bin/activate +ps -ef | grep authorRedis | grep -v grep | cut -c 9-15 | xargs kill -9 +ps -ef | grep videoRedis | grep -v grep | cut -c 9-15 | xargs kill -9 +ps -ef | grep DanmakuAggregate | grep -v grep | cut -c 9-15 | xargs kill -9 + +nohup scrapy crawl authorRedis 1>log.log 2>&1 & +nohup scrapy crawl videoRedis 1>log.log 2>&1 & + +cd danmaku_spider/ && nohup scrapy crawl DanmakuAggregate 1>log.log 2>&1 & +ps -ef |grep py \ No newline at end of file From 2cb094d3a5c4599c66acd1ce832016b83ee0458f Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 20 Mar 2019 16:36:53 
+0800 Subject: [PATCH 244/469] add boot script --- run_spider_with_mq.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 run_spider_with_mq.sh diff --git a/run_spider_with_mq.sh b/run_spider_with_mq.sh new file mode 100644 index 0000000..b08431b --- /dev/null +++ b/run_spider_with_mq.sh @@ -0,0 +1,10 @@ +source ~/biliob-spider-env/bin/activate +ps -ef | grep authorRedis | grep -v grep | cut -c 9-15 | xargs kill -9 +ps -ef | grep videoRedis | grep -v grep | cut -c 9-15 | xargs kill -9 +ps -ef | grep DanmakuAggregate | grep -v grep | cut -c 9-15 | xargs kill -9 + +nohup scrapy crawl authorRedis 1>log.log 2>&1 & +nohup scrapy crawl videoRedis 1>log.log 2>&1 & + +cd danmaku_spider/ && nohup scrapy crawl DanmakuAggregate 1>log.log 2>&1 & +ps -ef |grep py \ No newline at end of file From 6d5be26c6bc0f379ca532c9eae43f6e9fd516aaa Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 20 Mar 2019 16:36:53 +0800 Subject: [PATCH 245/469] add boot script --- run_spider_with_mq.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 run_spider_with_mq.sh diff --git a/run_spider_with_mq.sh b/run_spider_with_mq.sh new file mode 100644 index 0000000..b08431b --- /dev/null +++ b/run_spider_with_mq.sh @@ -0,0 +1,10 @@ +source ~/biliob-spider-env/bin/activate +ps -ef | grep authorRedis | grep -v grep | cut -c 9-15 | xargs kill -9 +ps -ef | grep videoRedis | grep -v grep | cut -c 9-15 | xargs kill -9 +ps -ef | grep DanmakuAggregate | grep -v grep | cut -c 9-15 | xargs kill -9 + +nohup scrapy crawl authorRedis 1>log.log 2>&1 & +nohup scrapy crawl videoRedis 1>log.log 2>&1 & + +cd danmaku_spider/ && nohup scrapy crawl DanmakuAggregate 1>log.log 2>&1 & +ps -ef |grep py \ No newline at end of file From 730e16d09849b7ef0cc361e18905c0c70bfbdf90 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 20 Mar 2019 16:37:54 +0800 Subject: [PATCH 246/469] feature: real time fans --- biliob_requests/author_update_currentFans.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/biliob_requests/author_update_currentFans.py b/biliob_requests/author_update_currentFans.py index 5c8bd46..b433e2d 100644 --- a/biliob_requests/author_update_currentFans.py +++ b/biliob_requests/author_update_currentFans.py @@ -17,7 +17,13 @@ 'cFans', direction=DESCENDING).limit(2) mids = map(lambda x: x['mid'], docs) for mid in mids: - j = requests.get(URL.format(mid)).json() - fans = j['data']['card']['fans'] - coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + try: + j = requests.get(URL.format(mid)).json() + pass + fans = j['data']['card']['fans'] + if fans == 0: + continue + coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + except Exception as e: + pass time.sleep(5) From 734dc07725ea6de1cb6c7192e49e82802ae40ab1 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 20 Mar 2019 16:37:54 +0800 Subject: [PATCH 247/469] feature: real time fans --- biliob_requests/author_update_currentFans.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/biliob_requests/author_update_currentFans.py b/biliob_requests/author_update_currentFans.py index 5c8bd46..b433e2d 100644 --- a/biliob_requests/author_update_currentFans.py +++ b/biliob_requests/author_update_currentFans.py @@ -17,7 +17,13 @@ 'cFans', direction=DESCENDING).limit(2) mids = map(lambda x: x['mid'], docs) for mid in mids: - j = requests.get(URL.format(mid)).json() - fans = j['data']['card']['fans'] - coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + try: + j = 
requests.get(URL.format(mid)).json() + pass + fans = j['data']['card']['fans'] + if fans == 0: + continue + coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + except Exception as e: + pass time.sleep(5) From 0ecfc0c9d89cb828a06f951e31bb4947614f0623 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 20 Mar 2019 16:37:54 +0800 Subject: [PATCH 248/469] feature: real time fans --- biliob_requests/author_update_currentFans.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/biliob_requests/author_update_currentFans.py b/biliob_requests/author_update_currentFans.py index 5c8bd46..b433e2d 100644 --- a/biliob_requests/author_update_currentFans.py +++ b/biliob_requests/author_update_currentFans.py @@ -17,7 +17,13 @@ 'cFans', direction=DESCENDING).limit(2) mids = map(lambda x: x['mid'], docs) for mid in mids: - j = requests.get(URL.format(mid)).json() - fans = j['data']['card']['fans'] - coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + try: + j = requests.get(URL.format(mid)).json() + pass + fans = j['data']['card']['fans'] + if fans == 0: + continue + coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + except Exception as e: + pass time.sleep(5) From da4a3d6e848a4c9230bbfc6f54092b95eb1df1c7 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 20 Mar 2019 16:38:21 +0800 Subject: [PATCH 249/469] script for make video --- get_data/aggregate_fans_rate.py | 62 +++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 get_data/aggregate_fans_rate.py diff --git a/get_data/aggregate_fans_rate.py b/get_data/aggregate_fans_rate.py new file mode 100644 index 0000000..15b34dd --- /dev/null +++ b/get_data/aggregate_fans_rate.py @@ -0,0 +1,62 @@ +from db import db +import datetime +from scipy.interpolate import interp1d + +start_date = datetime.datetime(2018, 11, 1) +end_date = datetime.datetime.now() +date_range = 7 * 24 * 60 * 60 +delta_date = 0.25 * 24 * 60 * 60 +date_format = '%Y-%m-%d %H:%M' +d = {} + +current_date = start_date.timestamp() +while (current_date < end_date.timestamp()): + c_date = datetime.datetime.fromtimestamp( + current_date).strftime(date_format) + d[c_date] = [] + current_date += delta_date + + +for each_author in db['author'].find({'cFans': {'$gt': 200000}}).batch_size(1): + + current_date = start_date.timestamp() + + data = sorted(each_author['data'], key=lambda x: x['datetime']) + x = list(map( + lambda each_data: each_data['datetime'].timestamp(), data)) + y = list(map(lambda each_data: each_data['fans'], data)) + + if len(x) <= 2: + continue + interrupted_fans = interp1d(x, y, kind='linear') + current_date = start_date.timestamp() + + while (current_date < min(end_date.timestamp(), x[-1])): + # 出界 + if (current_date - date_range) > x[0] and current_date < x[-1]: + fans_func = interrupted_fans( + [current_date - date_range, current_date]) + delta_fans = int(fans_func[1] - fans_func[0]) + pass + c_date = datetime.datetime.fromtimestamp( + current_date).strftime(date_format) + print('"{}","{}","{}"'.format( + each_author['name'], delta_fans, c_date)) + # d[c_date].append((delta_fans, each_author['name'])) + + d[c_date].append((each_author['name'], delta_fans)) + + if len(d[c_date]) >= 200: + d[c_date] = sorted( + d[c_date], key=lambda x: x[1], reverse=True)[:20] + current_date += delta_date + +d[c_date] = sorted( + d[c_date], key=lambda x: x[1], reverse=True)[:20] + +with open('D:/数据/B站/fans/190319.csv', 'w', encoding="utf-8-sig") as f: + f.writelines('date,name,value\n') + for each_date in d: + for 
each_data in d[each_date]: + f.writelines('"{}","{}","{}"\n'.format( + each_date, each_data[0], each_data[1])) From fcc76c6089b29f0a471a4a953c9f5f589b284600 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 20 Mar 2019 16:38:21 +0800 Subject: [PATCH 250/469] script for make video --- get_data/aggregate_fans_rate.py | 62 +++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 get_data/aggregate_fans_rate.py diff --git a/get_data/aggregate_fans_rate.py b/get_data/aggregate_fans_rate.py new file mode 100644 index 0000000..15b34dd --- /dev/null +++ b/get_data/aggregate_fans_rate.py @@ -0,0 +1,62 @@ +from db import db +import datetime +from scipy.interpolate import interp1d + +start_date = datetime.datetime(2018, 11, 1) +end_date = datetime.datetime.now() +date_range = 7 * 24 * 60 * 60 +delta_date = 0.25 * 24 * 60 * 60 +date_format = '%Y-%m-%d %H:%M' +d = {} + +current_date = start_date.timestamp() +while (current_date < end_date.timestamp()): + c_date = datetime.datetime.fromtimestamp( + current_date).strftime(date_format) + d[c_date] = [] + current_date += delta_date + + +for each_author in db['author'].find({'cFans': {'$gt': 200000}}).batch_size(1): + + current_date = start_date.timestamp() + + data = sorted(each_author['data'], key=lambda x: x['datetime']) + x = list(map( + lambda each_data: each_data['datetime'].timestamp(), data)) + y = list(map(lambda each_data: each_data['fans'], data)) + + if len(x) <= 2: + continue + interrupted_fans = interp1d(x, y, kind='linear') + current_date = start_date.timestamp() + + while (current_date < min(end_date.timestamp(), x[-1])): + # 出界 + if (current_date - date_range) > x[0] and current_date < x[-1]: + fans_func = interrupted_fans( + [current_date - date_range, current_date]) + delta_fans = int(fans_func[1] - fans_func[0]) + pass + c_date = datetime.datetime.fromtimestamp( + current_date).strftime(date_format) + print('"{}","{}","{}"'.format( + each_author['name'], delta_fans, c_date)) + # d[c_date].append((delta_fans, each_author['name'])) + + d[c_date].append((each_author['name'], delta_fans)) + + if len(d[c_date]) >= 200: + d[c_date] = sorted( + d[c_date], key=lambda x: x[1], reverse=True)[:20] + current_date += delta_date + +d[c_date] = sorted( + d[c_date], key=lambda x: x[1], reverse=True)[:20] + +with open('D:/数据/B站/fans/190319.csv', 'w', encoding="utf-8-sig") as f: + f.writelines('date,name,value\n') + for each_date in d: + for each_data in d[each_date]: + f.writelines('"{}","{}","{}"\n'.format( + each_date, each_data[0], each_data[1])) From d7333334c5ab3b352931945d520bbaf548dbd14c Mon Sep 17 00:00:00 2001 From: Jannchie Date: Wed, 20 Mar 2019 16:38:21 +0800 Subject: [PATCH 251/469] script for make video --- get_data/aggregate_fans_rate.py | 62 +++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 get_data/aggregate_fans_rate.py diff --git a/get_data/aggregate_fans_rate.py b/get_data/aggregate_fans_rate.py new file mode 100644 index 0000000..15b34dd --- /dev/null +++ b/get_data/aggregate_fans_rate.py @@ -0,0 +1,62 @@ +from db import db +import datetime +from scipy.interpolate import interp1d + +start_date = datetime.datetime(2018, 11, 1) +end_date = datetime.datetime.now() +date_range = 7 * 24 * 60 * 60 +delta_date = 0.25 * 24 * 60 * 60 +date_format = '%Y-%m-%d %H:%M' +d = {} + +current_date = start_date.timestamp() +while (current_date < end_date.timestamp()): + c_date = datetime.datetime.fromtimestamp( + current_date).strftime(date_format) + d[c_date] = [] + current_date += 
delta_date + + +for each_author in db['author'].find({'cFans': {'$gt': 200000}}).batch_size(1): + + current_date = start_date.timestamp() + + data = sorted(each_author['data'], key=lambda x: x['datetime']) + x = list(map( + lambda each_data: each_data['datetime'].timestamp(), data)) + y = list(map(lambda each_data: each_data['fans'], data)) + + if len(x) <= 2: + continue + interrupted_fans = interp1d(x, y, kind='linear') + current_date = start_date.timestamp() + + while (current_date < min(end_date.timestamp(), x[-1])): + # 出界 + if (current_date - date_range) > x[0] and current_date < x[-1]: + fans_func = interrupted_fans( + [current_date - date_range, current_date]) + delta_fans = int(fans_func[1] - fans_func[0]) + pass + c_date = datetime.datetime.fromtimestamp( + current_date).strftime(date_format) + print('"{}","{}","{}"'.format( + each_author['name'], delta_fans, c_date)) + # d[c_date].append((delta_fans, each_author['name'])) + + d[c_date].append((each_author['name'], delta_fans)) + + if len(d[c_date]) >= 200: + d[c_date] = sorted( + d[c_date], key=lambda x: x[1], reverse=True)[:20] + current_date += delta_date + +d[c_date] = sorted( + d[c_date], key=lambda x: x[1], reverse=True)[:20] + +with open('D:/数据/B站/fans/190319.csv', 'w', encoding="utf-8-sig") as f: + f.writelines('date,name,value\n') + for each_date in d: + for each_data in d[each_date]: + f.writelines('"{}","{}","{}"\n'.format( + each_date, each_data[0], each_data[1])) From 56b48509ba20adb761c14b12a10f062521b0b50e Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 21 Mar 2019 16:13:25 +0800 Subject: [PATCH 252/469] feature: add callback --- biliob_spider/items.py | 2 ++ biliob_spider/pipelines.py | 18 ++++++++++++++---- .../spiders/author_update_with_redis.py | 6 ++++++ .../spiders/video_spider_with_redis.py | 7 +++++++ danmaku_spider/danmaku_spider/items.py | 1 + danmaku_spider/danmaku_spider/pipelines.py | 15 +++++++++++---- .../spiders/danmaku_aggregate_spider.py | 14 ++++++++++---- 7 files changed, 51 insertions(+), 12 deletions(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 490dfe6..8c1e698 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -72,6 +72,7 @@ class VideoAndAuthorItem(scrapy.Item): class VideoItem(scrapy.Item): + object_id = scrapy.Field() channel = scrapy.Field() aid = scrapy.Field() datetime = scrapy.Field() @@ -91,6 +92,7 @@ class VideoItem(scrapy.Item): class AuthorItem(scrapy.Item): + object_id = scrapy.Field() mid = scrapy.Field() name = scrapy.Field() face = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index fbb99fd..c3bcf7a 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -11,6 +11,13 @@ import logging import redis from db import redis_connect_string +from bson import ObjectId + + +def sentCallBack(object_id, coll): + if object_id != None: + coll.update_one({'_id': ObjectId(object_id)}, { + '$set': {'isExecuted': True}}) class StrongPipeline(object): @@ -103,6 +110,7 @@ def __init__(self): self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['video'] # 获得collection的句柄 self.redis_connection = redis.from_url(redis_connect_string) + def process_item(self, item, spider): try: @@ -133,8 +141,9 @@ def process_item(self, item, spider): } } }, True) - self.redis_connection.delete( - "video_detail::{}".format(item['aid'])) + sentCallBack(item['object_id'], self.db['user_record']) + # self.redis_connection.delete( + # "video_detail::{}".format(item['aid'])) return item except Exception as error: 
# 出现错误时打印错误日志 @@ -309,8 +318,9 @@ def process_item(self, item, spider): } } }, True) - self.redis_connection.delete( - "author_detail::{}".format(item['mid'])) + sentCallBack(item['object_id'], self.db['user_record']) + # self.redis_connection.delete( + # "author_detail::{}".format(item['mid'])) return item except Exception as error: # 出现错误时打印错误日志 diff --git a/biliob_spider/spiders/author_update_with_redis.py b/biliob_spider/spiders/author_update_with_redis.py index d947134..f925f31 100644 --- a/biliob_spider/spiders/author_update_with_redis.py +++ b/biliob_spider/spiders/author_update_with_redis.py @@ -71,6 +71,12 @@ def parse(self, response): item['c_attention'] = int(attention) item['c_archive'] = int(archive) item['c_article'] = int(article) + + url_list = response.url.split('&') + if len(url_list) == 2: + item['object_id'] = url_list[1] + else: + item['object_id'] = None yield Request( "https://api.bilibili.com/x/space/upstat?mid={mid}".format( mid=str(mid)), diff --git a/biliob_spider/spiders/video_spider_with_redis.py b/biliob_spider/spiders/video_spider_with_redis.py index 4fe87fe..1cba172 100644 --- a/biliob_spider/spiders/video_spider_with_redis.py +++ b/biliob_spider/spiders/video_spider_with_redis.py @@ -35,6 +35,7 @@ def __init__(self): def parse(self, response): try: + r = json.loads(response.body) d = r["data"] keys = list(d.keys()) @@ -93,6 +94,12 @@ def parse(self, response): item['channel'] == '娱乐' else: item['channel'] = None + + url_list = response.url.split('&') + if len(url_list) == 2: + item['object_id'] = url_list[1] + else: + item['object_id'] = None yield item except Exception as error: diff --git a/danmaku_spider/danmaku_spider/items.py b/danmaku_spider/danmaku_spider/items.py index ffafac0..1a3ecea 100644 --- a/danmaku_spider/danmaku_spider/items.py +++ b/danmaku_spider/danmaku_spider/items.py @@ -15,3 +15,4 @@ class DanmakuAggregateItem(scrapy.Item): word_frequency = scrapy.Field() danmaku_density = scrapy.Field() duration = scrapy.Field() + object_id = scrapy.Field() diff --git a/danmaku_spider/danmaku_spider/pipelines.py b/danmaku_spider/danmaku_spider/pipelines.py index 140e66c..d5d0353 100644 --- a/danmaku_spider/danmaku_spider/pipelines.py +++ b/danmaku_spider/danmaku_spider/pipelines.py @@ -11,10 +11,16 @@ import redis from pymongo import MongoClient - +from bson import ObjectId env_dist = os.environ +def sentCallBack(object_id, coll): + if object_id != None: + coll.update_one({'_id': ObjectId(object_id)}, { + '$set': {'isExecuted': True}}) + + class DanmakuSpiderPipeline(object): def __init__(self): self.client = MongoClient(env_dist['BILIOB_MONGO_SERVER'], 27017) @@ -39,6 +45,7 @@ def process_item(self, item, spider): 'danmaku_aggregate.updatetime': datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S') } }, True) - # 刷新redis数据缓存 - self.redis_connection.delete( - "video_detail::{}".format(item['aid'])) + sentCallBack(item['object_id'], self.db['user_record']) + # # 刷新redis数据缓存 + # self.redis_connection.delete( + # "video_detail::{}".format(item['aid'])) diff --git a/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py b/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py index eeaf85d..abcdb17 100644 --- a/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py +++ b/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py @@ -51,11 +51,16 @@ def q_to_b(self, q_str): def parse(self, response): try: j = json.loads(response.body) + url_list = response.url.split('&') + if len(url_list) == 2: + 
object_id = url_list[1] + else: + object_id == None if j['code'] == -403: aid = response.url[50:] print('https://api.bilibili.com/x/article/archives?ids={}'.format(aid)) yield Request('https://api.bilibili.com/x/article/archives?ids={}'.format(aid), - callback=self.getCidPlanB, meta={'aid': aid}) + callback=self.getCidPlanB, meta={'aid': aid, 'object_id': object_id}) else: aid = j['data']['aid'] pages = j['data']['pages'] @@ -68,7 +73,7 @@ def parse(self, response): meta={'duration': duration, 'p_name': p_name, 'page_number': page_number, - 'aid': aid}) + 'aid': aid, 'object_id': object_id}) except Exception as error: # 出现错误时存入出错集合 self.db['error'].insert_one( @@ -77,9 +82,10 @@ def parse(self, response): def getCidPlanB(self, response): try: aid = response.meta['aid'] + object_id = response.meta['object_id'] cid = json.loads(response.body)['data'][aid]['cid'] duration = json.loads(response.body)['data'][aid]['duration'] - yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, meta={'duration': duration, 'p_name': '', 'page_number': 1, 'aid': int(aid)}) + yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, meta={'object_id': object_id, 'duration': duration, 'p_name': '', 'page_number': 1, 'aid': int(aid)}) except Exception as error: # 出现错误时存入出错集合 self.db['error'].insert_one( @@ -109,7 +115,7 @@ def parseDanmaku(self, response): index = int(t // tick) danmaku_density[index] += 1 item = DanmakuAggregateItem() - + item['object_id'] = response.meta['object_id'] item['aid'] = response.meta['aid'] item['duration'] = duration item['word_frequency'] = word_frequency From 0166edc43b8b4bc1e5bb607fc5e9b1861daa0744 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 21 Mar 2019 16:13:25 +0800 Subject: [PATCH 253/469] feature: add callback --- biliob_spider/items.py | 2 ++ biliob_spider/pipelines.py | 18 ++++++++++++++---- .../spiders/author_update_with_redis.py | 6 ++++++ .../spiders/video_spider_with_redis.py | 7 +++++++ danmaku_spider/danmaku_spider/items.py | 1 + danmaku_spider/danmaku_spider/pipelines.py | 15 +++++++++++---- .../spiders/danmaku_aggregate_spider.py | 14 ++++++++++---- 7 files changed, 51 insertions(+), 12 deletions(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 490dfe6..8c1e698 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -72,6 +72,7 @@ class VideoAndAuthorItem(scrapy.Item): class VideoItem(scrapy.Item): + object_id = scrapy.Field() channel = scrapy.Field() aid = scrapy.Field() datetime = scrapy.Field() @@ -91,6 +92,7 @@ class VideoItem(scrapy.Item): class AuthorItem(scrapy.Item): + object_id = scrapy.Field() mid = scrapy.Field() name = scrapy.Field() face = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index fbb99fd..c3bcf7a 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -11,6 +11,13 @@ import logging import redis from db import redis_connect_string +from bson import ObjectId + + +def sentCallBack(object_id, coll): + if object_id != None: + coll.update_one({'_id': ObjectId(object_id)}, { + '$set': {'isExecuted': True}}) class StrongPipeline(object): @@ -103,6 +110,7 @@ def __init__(self): self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['video'] # 获得collection的句柄 self.redis_connection = redis.from_url(redis_connect_string) + def process_item(self, item, spider): try: @@ -133,8 +141,9 @@ def process_item(self, item, spider): } } }, True) - self.redis_connection.delete( - "video_detail::{}".format(item['aid'])) + 
sentCallBack(item['object_id'], self.db['user_record']) + # self.redis_connection.delete( + # "video_detail::{}".format(item['aid'])) return item except Exception as error: # 出现错误时打印错误日志 @@ -309,8 +318,9 @@ def process_item(self, item, spider): } } }, True) - self.redis_connection.delete( - "author_detail::{}".format(item['mid'])) + sentCallBack(item['object_id'], self.db['user_record']) + # self.redis_connection.delete( + # "author_detail::{}".format(item['mid'])) return item except Exception as error: # 出现错误时打印错误日志 diff --git a/biliob_spider/spiders/author_update_with_redis.py b/biliob_spider/spiders/author_update_with_redis.py index d947134..f925f31 100644 --- a/biliob_spider/spiders/author_update_with_redis.py +++ b/biliob_spider/spiders/author_update_with_redis.py @@ -71,6 +71,12 @@ def parse(self, response): item['c_attention'] = int(attention) item['c_archive'] = int(archive) item['c_article'] = int(article) + + url_list = response.url.split('&') + if len(url_list) == 2: + item['object_id'] = url_list[1] + else: + item['object_id'] = None yield Request( "https://api.bilibili.com/x/space/upstat?mid={mid}".format( mid=str(mid)), diff --git a/biliob_spider/spiders/video_spider_with_redis.py b/biliob_spider/spiders/video_spider_with_redis.py index 4fe87fe..1cba172 100644 --- a/biliob_spider/spiders/video_spider_with_redis.py +++ b/biliob_spider/spiders/video_spider_with_redis.py @@ -35,6 +35,7 @@ def __init__(self): def parse(self, response): try: + r = json.loads(response.body) d = r["data"] keys = list(d.keys()) @@ -93,6 +94,12 @@ def parse(self, response): item['channel'] == '娱乐' else: item['channel'] = None + + url_list = response.url.split('&') + if len(url_list) == 2: + item['object_id'] = url_list[1] + else: + item['object_id'] = None yield item except Exception as error: diff --git a/danmaku_spider/danmaku_spider/items.py b/danmaku_spider/danmaku_spider/items.py index ffafac0..1a3ecea 100644 --- a/danmaku_spider/danmaku_spider/items.py +++ b/danmaku_spider/danmaku_spider/items.py @@ -15,3 +15,4 @@ class DanmakuAggregateItem(scrapy.Item): word_frequency = scrapy.Field() danmaku_density = scrapy.Field() duration = scrapy.Field() + object_id = scrapy.Field() diff --git a/danmaku_spider/danmaku_spider/pipelines.py b/danmaku_spider/danmaku_spider/pipelines.py index 140e66c..d5d0353 100644 --- a/danmaku_spider/danmaku_spider/pipelines.py +++ b/danmaku_spider/danmaku_spider/pipelines.py @@ -11,10 +11,16 @@ import redis from pymongo import MongoClient - +from bson import ObjectId env_dist = os.environ +def sentCallBack(object_id, coll): + if object_id != None: + coll.update_one({'_id': ObjectId(object_id)}, { + '$set': {'isExecuted': True}}) + + class DanmakuSpiderPipeline(object): def __init__(self): self.client = MongoClient(env_dist['BILIOB_MONGO_SERVER'], 27017) @@ -39,6 +45,7 @@ def process_item(self, item, spider): 'danmaku_aggregate.updatetime': datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S') } }, True) - # 刷新redis数据缓存 - self.redis_connection.delete( - "video_detail::{}".format(item['aid'])) + sentCallBack(item['object_id'], self.db['user_record']) + # # 刷新redis数据缓存 + # self.redis_connection.delete( + # "video_detail::{}".format(item['aid'])) diff --git a/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py b/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py index eeaf85d..abcdb17 100644 --- a/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py +++ 
b/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py @@ -51,11 +51,16 @@ def q_to_b(self, q_str): def parse(self, response): try: j = json.loads(response.body) + url_list = response.url.split('&') + if len(url_list) == 2: + object_id = url_list[1] + else: + object_id == None if j['code'] == -403: aid = response.url[50:] print('https://api.bilibili.com/x/article/archives?ids={}'.format(aid)) yield Request('https://api.bilibili.com/x/article/archives?ids={}'.format(aid), - callback=self.getCidPlanB, meta={'aid': aid}) + callback=self.getCidPlanB, meta={'aid': aid, 'object_id': object_id}) else: aid = j['data']['aid'] pages = j['data']['pages'] @@ -68,7 +73,7 @@ def parse(self, response): meta={'duration': duration, 'p_name': p_name, 'page_number': page_number, - 'aid': aid}) + 'aid': aid, 'object_id': object_id}) except Exception as error: # 出现错误时存入出错集合 self.db['error'].insert_one( @@ -77,9 +82,10 @@ def parse(self, response): def getCidPlanB(self, response): try: aid = response.meta['aid'] + object_id = response.meta['object_id'] cid = json.loads(response.body)['data'][aid]['cid'] duration = json.loads(response.body)['data'][aid]['duration'] - yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, meta={'duration': duration, 'p_name': '', 'page_number': 1, 'aid': int(aid)}) + yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, meta={'object_id': object_id, 'duration': duration, 'p_name': '', 'page_number': 1, 'aid': int(aid)}) except Exception as error: # 出现错误时存入出错集合 self.db['error'].insert_one( @@ -109,7 +115,7 @@ def parseDanmaku(self, response): index = int(t // tick) danmaku_density[index] += 1 item = DanmakuAggregateItem() - + item['object_id'] = response.meta['object_id'] item['aid'] = response.meta['aid'] item['duration'] = duration item['word_frequency'] = word_frequency From 5cd0f2fa8ee5ef4b3cdd054a2eed3ea0042fee24 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Thu, 21 Mar 2019 16:13:25 +0800 Subject: [PATCH 254/469] feature: add callback --- biliob_spider/items.py | 2 ++ biliob_spider/pipelines.py | 18 ++++++++++++++---- .../spiders/author_update_with_redis.py | 6 ++++++ .../spiders/video_spider_with_redis.py | 7 +++++++ danmaku_spider/danmaku_spider/items.py | 1 + danmaku_spider/danmaku_spider/pipelines.py | 15 +++++++++++---- .../spiders/danmaku_aggregate_spider.py | 14 ++++++++++---- 7 files changed, 51 insertions(+), 12 deletions(-) diff --git a/biliob_spider/items.py b/biliob_spider/items.py index 490dfe6..8c1e698 100644 --- a/biliob_spider/items.py +++ b/biliob_spider/items.py @@ -72,6 +72,7 @@ class VideoAndAuthorItem(scrapy.Item): class VideoItem(scrapy.Item): + object_id = scrapy.Field() channel = scrapy.Field() aid = scrapy.Field() datetime = scrapy.Field() @@ -91,6 +92,7 @@ class VideoItem(scrapy.Item): class AuthorItem(scrapy.Item): + object_id = scrapy.Field() mid = scrapy.Field() name = scrapy.Field() face = scrapy.Field() diff --git a/biliob_spider/pipelines.py b/biliob_spider/pipelines.py index fbb99fd..c3bcf7a 100644 --- a/biliob_spider/pipelines.py +++ b/biliob_spider/pipelines.py @@ -11,6 +11,13 @@ import logging import redis from db import redis_connect_string +from bson import ObjectId + + +def sentCallBack(object_id, coll): + if object_id != None: + coll.update_one({'_id': ObjectId(object_id)}, { + '$set': {'isExecuted': True}}) class StrongPipeline(object): @@ -103,6 +110,7 @@ def __init__(self): self.db = self.client['biliob'] # 获得数据库的句柄 self.coll = self.db['video'] # 获得collection的句柄 
self.redis_connection = redis.from_url(redis_connect_string) + def process_item(self, item, spider): try: @@ -133,8 +141,9 @@ def process_item(self, item, spider): } } }, True) - self.redis_connection.delete( - "video_detail::{}".format(item['aid'])) + sentCallBack(item['object_id'], self.db['user_record']) + # self.redis_connection.delete( + # "video_detail::{}".format(item['aid'])) return item except Exception as error: # 出现错误时打印错误日志 @@ -309,8 +318,9 @@ def process_item(self, item, spider): } } }, True) - self.redis_connection.delete( - "author_detail::{}".format(item['mid'])) + sentCallBack(item['object_id'], self.db['user_record']) + # self.redis_connection.delete( + # "author_detail::{}".format(item['mid'])) return item except Exception as error: # 出现错误时打印错误日志 diff --git a/biliob_spider/spiders/author_update_with_redis.py b/biliob_spider/spiders/author_update_with_redis.py index d947134..f925f31 100644 --- a/biliob_spider/spiders/author_update_with_redis.py +++ b/biliob_spider/spiders/author_update_with_redis.py @@ -71,6 +71,12 @@ def parse(self, response): item['c_attention'] = int(attention) item['c_archive'] = int(archive) item['c_article'] = int(article) + + url_list = response.url.split('&') + if len(url_list) == 2: + item['object_id'] = url_list[1] + else: + item['object_id'] = None yield Request( "https://api.bilibili.com/x/space/upstat?mid={mid}".format( mid=str(mid)), diff --git a/biliob_spider/spiders/video_spider_with_redis.py b/biliob_spider/spiders/video_spider_with_redis.py index 4fe87fe..1cba172 100644 --- a/biliob_spider/spiders/video_spider_with_redis.py +++ b/biliob_spider/spiders/video_spider_with_redis.py @@ -35,6 +35,7 @@ def __init__(self): def parse(self, response): try: + r = json.loads(response.body) d = r["data"] keys = list(d.keys()) @@ -93,6 +94,12 @@ def parse(self, response): item['channel'] == '娱乐' else: item['channel'] = None + + url_list = response.url.split('&') + if len(url_list) == 2: + item['object_id'] = url_list[1] + else: + item['object_id'] = None yield item except Exception as error: diff --git a/danmaku_spider/danmaku_spider/items.py b/danmaku_spider/danmaku_spider/items.py index ffafac0..1a3ecea 100644 --- a/danmaku_spider/danmaku_spider/items.py +++ b/danmaku_spider/danmaku_spider/items.py @@ -15,3 +15,4 @@ class DanmakuAggregateItem(scrapy.Item): word_frequency = scrapy.Field() danmaku_density = scrapy.Field() duration = scrapy.Field() + object_id = scrapy.Field() diff --git a/danmaku_spider/danmaku_spider/pipelines.py b/danmaku_spider/danmaku_spider/pipelines.py index 140e66c..d5d0353 100644 --- a/danmaku_spider/danmaku_spider/pipelines.py +++ b/danmaku_spider/danmaku_spider/pipelines.py @@ -11,10 +11,16 @@ import redis from pymongo import MongoClient - +from bson import ObjectId env_dist = os.environ +def sentCallBack(object_id, coll): + if object_id != None: + coll.update_one({'_id': ObjectId(object_id)}, { + '$set': {'isExecuted': True}}) + + class DanmakuSpiderPipeline(object): def __init__(self): self.client = MongoClient(env_dist['BILIOB_MONGO_SERVER'], 27017) @@ -39,6 +45,7 @@ def process_item(self, item, spider): 'danmaku_aggregate.updatetime': datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S') } }, True) - # 刷新redis数据缓存 - self.redis_connection.delete( - "video_detail::{}".format(item['aid'])) + sentCallBack(item['object_id'], self.db['user_record']) + # # 刷新redis数据缓存 + # self.redis_connection.delete( + # "video_detail::{}".format(item['aid'])) diff --git 
a/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py b/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py index eeaf85d..abcdb17 100644 --- a/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py +++ b/danmaku_spider/danmaku_spider/spiders/danmaku_aggregate_spider.py @@ -51,11 +51,16 @@ def q_to_b(self, q_str): def parse(self, response): try: j = json.loads(response.body) + url_list = response.url.split('&') + if len(url_list) == 2: + object_id = url_list[1] + else: + object_id == None if j['code'] == -403: aid = response.url[50:] print('https://api.bilibili.com/x/article/archives?ids={}'.format(aid)) yield Request('https://api.bilibili.com/x/article/archives?ids={}'.format(aid), - callback=self.getCidPlanB, meta={'aid': aid}) + callback=self.getCidPlanB, meta={'aid': aid, 'object_id': object_id}) else: aid = j['data']['aid'] pages = j['data']['pages'] @@ -68,7 +73,7 @@ def parse(self, response): meta={'duration': duration, 'p_name': p_name, 'page_number': page_number, - 'aid': aid}) + 'aid': aid, 'object_id': object_id}) except Exception as error: # 出现错误时存入出错集合 self.db['error'].insert_one( @@ -77,9 +82,10 @@ def parse(self, response): def getCidPlanB(self, response): try: aid = response.meta['aid'] + object_id = response.meta['object_id'] cid = json.loads(response.body)['data'][aid]['cid'] duration = json.loads(response.body)['data'][aid]['duration'] - yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, meta={'duration': duration, 'p_name': '', 'page_number': 1, 'aid': int(aid)}) + yield Request(self.DANMAKU_API.format(oid=cid), callback=self.parseDanmaku, meta={'object_id': object_id, 'duration': duration, 'p_name': '', 'page_number': 1, 'aid': int(aid)}) except Exception as error: # 出现错误时存入出错集合 self.db['error'].insert_one( @@ -109,7 +115,7 @@ def parseDanmaku(self, response): index = int(t // tick) danmaku_density[index] += 1 item = DanmakuAggregateItem() - + item['object_id'] = response.meta['object_id'] item['aid'] = response.meta['aid'] item['duration'] = duration item['word_frequency'] = word_frequency From 42b050cf5c6c183b53c03d03b4ca2f25b43e3ed5 Mon Sep 17 00:00:00 2001 From: jannchie Date: Tue, 9 Apr 2019 19:13:47 +0800 Subject: [PATCH 255/469] feature: author upadte current fans --- biliob_requests/author_update_currentFans.py | 52 +++++++++++--------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/biliob_requests/author_update_currentFans.py b/biliob_requests/author_update_currentFans.py index 5c8bd46..644c7e3 100644 --- a/biliob_requests/author_update_currentFans.py +++ b/biliob_requests/author_update_currentFans.py @@ -1,23 +1,29 @@ -import time -import datetime -from db import settings -from pymongo import MongoClient, DESCENDING -import requests - -client = MongoClient(settings['MINGO_HOST'], 27017) -client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - -db = client['biliob'] # 获得数据库的句柄 -coll = db['author'] # 获得collection的句柄 - -URL = 'https://api.bilibili.com/x/web-interface/card?mid={}' -while True: - docs = coll.find({}, {'mid': 1}).sort( - 'cFans', direction=DESCENDING).limit(2) - mids = map(lambda x: x['mid'], docs) - for mid in mids: - j = requests.get(URL.format(mid)).json() - fans = j['data']['card']['fans'] - coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) - time.sleep(5) +import time +import datetime +from db import settings +from pymongo import MongoClient, DESCENDING +import requests + +client = MongoClient(settings['MINGO_HOST'], 27017) 
+client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + +db = client['biliob'] # 获得数据库的句柄 +coll = db['author'] # 获得collection的句柄 + +URL = 'https://api.bilibili.com/x/web-interface/card?mid={}' +while True: + docs = coll.find({}, {'mid': 1}).sort( + 'cFans', direction=DESCENDING).limit(3) + mids = map(lambda x: x['mid'], docs) + for mid in mids: + try: + j = requests.get(URL.format(mid)).json() + pass + fans = j['data']['card']['fans'] + if fans == 0: + continue + coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + except Exception as e: + pass + time.sleep(3) From d16eefb9964b749e5f116971ec1d4d0d9704199e Mon Sep 17 00:00:00 2001 From: jannchie Date: Tue, 9 Apr 2019 19:13:47 +0800 Subject: [PATCH 256/469] feature: author upadte current fans --- biliob_requests/author_update_currentFans.py | 52 +++++++++++--------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/biliob_requests/author_update_currentFans.py b/biliob_requests/author_update_currentFans.py index 5c8bd46..644c7e3 100644 --- a/biliob_requests/author_update_currentFans.py +++ b/biliob_requests/author_update_currentFans.py @@ -1,23 +1,29 @@ -import time -import datetime -from db import settings -from pymongo import MongoClient, DESCENDING -import requests - -client = MongoClient(settings['MINGO_HOST'], 27017) -client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - -db = client['biliob'] # 获得数据库的句柄 -coll = db['author'] # 获得collection的句柄 - -URL = 'https://api.bilibili.com/x/web-interface/card?mid={}' -while True: - docs = coll.find({}, {'mid': 1}).sort( - 'cFans', direction=DESCENDING).limit(2) - mids = map(lambda x: x['mid'], docs) - for mid in mids: - j = requests.get(URL.format(mid)).json() - fans = j['data']['card']['fans'] - coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) - time.sleep(5) +import time +import datetime +from db import settings +from pymongo import MongoClient, DESCENDING +import requests + +client = MongoClient(settings['MINGO_HOST'], 27017) +client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + +db = client['biliob'] # 获得数据库的句柄 +coll = db['author'] # 获得collection的句柄 + +URL = 'https://api.bilibili.com/x/web-interface/card?mid={}' +while True: + docs = coll.find({}, {'mid': 1}).sort( + 'cFans', direction=DESCENDING).limit(3) + mids = map(lambda x: x['mid'], docs) + for mid in mids: + try: + j = requests.get(URL.format(mid)).json() + pass + fans = j['data']['card']['fans'] + if fans == 0: + continue + coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + except Exception as e: + pass + time.sleep(3) From a9dae64decfed8f5e139cb0c660bd86fb3ffd5d4 Mon Sep 17 00:00:00 2001 From: jannchie Date: Tue, 9 Apr 2019 19:13:47 +0800 Subject: [PATCH 257/469] feature: author upadte current fans --- biliob_requests/author_update_currentFans.py | 52 +++++++++++--------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/biliob_requests/author_update_currentFans.py b/biliob_requests/author_update_currentFans.py index 5c8bd46..644c7e3 100644 --- a/biliob_requests/author_update_currentFans.py +++ b/biliob_requests/author_update_currentFans.py @@ -1,23 +1,29 @@ -import time -import datetime -from db import settings -from pymongo import MongoClient, DESCENDING -import requests - -client = MongoClient(settings['MINGO_HOST'], 27017) -client.admin.authenticate(settings['MINGO_USER'], - settings['MONGO_PSW']) - -db = client['biliob'] # 获得数据库的句柄 -coll = db['author'] # 获得collection的句柄 - -URL = 
'https://api.bilibili.com/x/web-interface/card?mid={}' -while True: - docs = coll.find({}, {'mid': 1}).sort( - 'cFans', direction=DESCENDING).limit(2) - mids = map(lambda x: x['mid'], docs) - for mid in mids: - j = requests.get(URL.format(mid)).json() - fans = j['data']['card']['fans'] - coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) - time.sleep(5) +import time +import datetime +from db import settings +from pymongo import MongoClient, DESCENDING +import requests + +client = MongoClient(settings['MINGO_HOST'], 27017) +client.admin.authenticate(settings['MINGO_USER'], + settings['MONGO_PSW']) + +db = client['biliob'] # 获得数据库的句柄 +coll = db['author'] # 获得collection的句柄 + +URL = 'https://api.bilibili.com/x/web-interface/card?mid={}' +while True: + docs = coll.find({}, {'mid': 1}).sort( + 'cFans', direction=DESCENDING).limit(3) + mids = map(lambda x: x['mid'], docs) + for mid in mids: + try: + j = requests.get(URL.format(mid)).json() + pass + fans = j['data']['card']['fans'] + if fans == 0: + continue + coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + except Exception as e: + pass + time.sleep(3) From 761446d9b4e8a14c5ebdc8f9ffe1b00e9a0dde60 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 9 Apr 2019 19:35:35 +0800 Subject: [PATCH 258/469] feature: fans increasing rate for make video --- biliob_requests/get_user_coin.py | 18 ++++++++ get_data/aggregate_fans_rate.py | 78 +++++++++++++++++++++++--------- get_data/color.py | 1 + get_data/face.py | 1 + run.py | 3 +- 5 files changed, 77 insertions(+), 24 deletions(-) create mode 100644 biliob_requests/get_user_coin.py create mode 100644 get_data/color.py create mode 100644 get_data/face.py diff --git a/biliob_requests/get_user_coin.py b/biliob_requests/get_user_coin.py new file mode 100644 index 0000000..a570631 --- /dev/null +++ b/biliob_requests/get_user_coin.py @@ -0,0 +1,18 @@ +import requests +from db import db +import time +author_coll = db['author'] +URL = 'https://api.bilibili.com/x/space/acc/info?mid={mid}' +with open('D:/数据/B站/UP主硬币数.csv', 'w',encoding="utf-8-sig") as f: + for each_author in author_coll.find({}, {'mid': 1, 'name': 1}): + mid = each_author['mid'] + response = requests.get(URL.format(mid=mid)) + j = response.json() + + if 'code' in j and j['code'] != -404 and 'data' in j and 'coins' in j[ + 'data']: + print('"{}","{}"\n'.format(each_author['name'], + j['data']['coins'])) + f.write('"{}","{}"\n'.format(each_author['name'], + j['data']['coins'])) + time.sleep(0.5) \ No newline at end of file diff --git a/get_data/aggregate_fans_rate.py b/get_data/aggregate_fans_rate.py index 15b34dd..4447bdb 100644 --- a/get_data/aggregate_fans_rate.py +++ b/get_data/aggregate_fans_rate.py @@ -1,62 +1,96 @@ from db import db import datetime from scipy.interpolate import interp1d +from haishoku.haishoku import Haishoku + +from face import face +from color import color start_date = datetime.datetime(2018, 11, 1) end_date = datetime.datetime.now() -date_range = 7 * 24 * 60 * 60 +date_range = 30 * 24 * 60 * 60 delta_date = 0.25 * 24 * 60 * 60 date_format = '%Y-%m-%d %H:%M' d = {} current_date = start_date.timestamp() while (current_date < end_date.timestamp()): - c_date = datetime.datetime.fromtimestamp( - current_date).strftime(date_format) + c_date = datetime.datetime.fromtimestamp(current_date).strftime( + date_format) d[c_date] = [] current_date += delta_date - -for each_author in db['author'].find({'cFans': {'$gt': 200000}}).batch_size(1): +for each_author in db['author'].find({'cFans': {'$gt': 200000}}).batch_size(5): 
current_date = start_date.timestamp() data = sorted(each_author['data'], key=lambda x: x['datetime']) - x = list(map( - lambda each_data: each_data['datetime'].timestamp(), data)) + x = list(map(lambda each_data: each_data['datetime'].timestamp(), data)) y = list(map(lambda each_data: each_data['fans'], data)) if len(x) <= 2: continue interrupted_fans = interp1d(x, y, kind='linear') current_date = start_date.timestamp() - while (current_date < min(end_date.timestamp(), x[-1])): + begin_date = current_date - date_range + if begin_date <= x[0]: + begin_date = x[0] # 出界 - if (current_date - date_range) > x[0] and current_date < x[-1]: - fans_func = interrupted_fans( - [current_date - date_range, current_date]) + if begin_date >= x[0] and current_date < x[-1] and current_date > x[0]: + fans_func = interrupted_fans([begin_date, current_date]) delta_fans = int(fans_func[1] - fans_func[0]) pass - c_date = datetime.datetime.fromtimestamp( - current_date).strftime(date_format) - print('"{}","{}","{}"'.format( - each_author['name'], delta_fans, c_date)) + c_date = datetime.datetime.fromtimestamp(current_date).strftime( + date_format) + print('"{}","{}","{}"'.format(each_author['name'], delta_fans, + c_date)) # d[c_date].append((delta_fans, each_author['name'])) - d[c_date].append((each_author['name'], delta_fans)) + d[c_date].append((each_author['name'], delta_fans, + each_author['face'])) if len(d[c_date]) >= 200: d[c_date] = sorted( d[c_date], key=lambda x: x[1], reverse=True)[:20] current_date += delta_date +for c_date in d: + d[c_date] = sorted(d[c_date], key=lambda x: x[1], reverse=True)[:20] -d[c_date] = sorted( - d[c_date], key=lambda x: x[1], reverse=True)[:20] - -with open('D:/数据/B站/fans/190319.csv', 'w', encoding="utf-8-sig") as f: +with open('D:/数据/B站/fans/月结粉丝.csv', 'w', encoding="utf-8-sig") as f: f.writelines('date,name,value\n') for each_date in d: for each_data in d[each_date]: - f.writelines('"{}","{}","{}"\n'.format( - each_date, each_data[0], each_data[1])) + f.writelines('"{}","{}","{}"\n'.format(each_date, each_data[0], + each_data[1])) +authors = [] +for each_date in d: + for each_author in d[each_date]: + authors.append(each_author[0]) + if each_author[0] not in face: + face[each_author[0]] = each_author[2] +with open('./get_data/face.py', 'w', encoding="utf-8-sig") as f: + f.writelines('face = ' + str(face)) + +for each_author in face: + if each_author in color: + continue + if face[each_author][-3:] == 'gif': + color[each_author] = '#000000' + else: + color_list = Haishoku.getPalette(face[each_author]) + color_list = sorted( + color_list, key=lambda x: x[1][0] + x[1][1] + x[1][2]) + color[each_author] = 'rgb' + \ + str(color_list[int(len(color_list)/2)][1]) + +with open('./get_data/color.py', 'w', encoding="utf-8-sig") as f: + f.writelines('color = ' + str(color)) + +min_fans = 99999999 +for each_author in authors: + c_fans = db['author'].find_one({'name': each_author}, + {'cFans': True})['cFans'] + if c_fans <= min_fans: + min_fans = c_fans +print(min_fans) \ No newline at end of file diff --git a/get_data/color.py b/get_data/color.py new file mode 100644 index 0000000..2a8321c --- /dev/null +++ b/get_data/color.py @@ -0,0 +1 @@ +color = {'华农兄弟': 'rgb(241, 124, 23)', '信誓蛋蛋': 'rgb(132, 112, 117)', '痴鸡小队官方': 'rgb(224, 147, 35)', '中国BOY超级大猩猩': 'rgb(232, 105, 10)', '唯一音乐小魔王': 'rgb(122, 146, 147)', '-欣小萌-': 'rgb(184, 157, 139)', '英雄联盟': 'rgb(52, 116, 193)', '爱做饭的芋头SAMA': 'rgb(123, 120, 124)', '美食作家王刚R': 'rgb(137, 138, 126)', '盗月社食遇记': 'rgb(200, 140, 125)', '怕上火暴王老菊': 'rgb(212, 
82, 88)', '机智的党妹': 'rgb(158, 120, 105)', '靠脸吃饭的徐大王': 'rgb(135, 144, 179)', '长歌与小见见': 'rgb(205, 144, 127)', 'papi酱': 'rgb(140, 140, 140)', '我是郭杰瑞': 'rgb(72, 34, 183)', '倒悬的橘子': 'rgb(154, 126, 120)', 'LexBurner': 'rgb(149, 137, 135)', '狂人实验室': 'rgb(129, 130, 136)', '木鱼水心': 'rgb(128, 71, 179)', '允星河Yoseya': 'rgb(231, 147, 136)', '翔翔大作战': 'rgb(206, 150, 119)', '共青团中央': 'rgb(250, 29, 27)', '猎奇笔记本': 'rgb(237, 126, 40)', '吴织亚切大忽悠': 'rgb(207, 91, 81)', '哔哩哔哩英雄联盟赛事': 'rgb(110, 127, 203)', '赫萝老师': 'rgb(197, 130, 81)', '老番茄': 'rgb(250, 29, 27)', 'MordonFreeman': 'rgb(84, 100, 153)', '花花与三猫CatLive': 'rgb(144, 130, 109)', 'EdmundDZhang': 'rgb(171, 221, 72)', '浅澄月': 'rgb(161, 119, 108)', '徐大虾咯': 'rgb(211, 141, 57)', '王大境泽': 'rgb(127, 113, 112)', '哔哩哔哩线下活动': 'rgb(124, 64, 115)', '神秘店长A': 'rgb(114, 111, 114)', '楼上的老张': 'rgb(151, 151, 151)', '土味角虫': 'rgb(122, 125, 119)', '低调的帅爷': 'rgb(193, 112, 73)', '靖菌命': 'rgb(202, 135, 136)', '日本沙雕日常': 'rgb(138, 121, 110)', 'MrYang杨家成': 'rgb(145, 127, 122)', '黑镖客梦回': 'rgb(80, 79, 80)', '无聊的Do君': 'rgb(209, 119, 82)', '泛式': 'rgb(151, 116, 111)', '芒果冰OL': 'rgb(186, 156, 151)', '面筋哥-程书林': 'rgb(179, 111, 79)', '刘老师说电影': 'rgb(133, 128, 114)', '李子柒': 'rgb(128, 125, 118)', '10后找人带': 'rgb(250, 29, 27)', '丝血反杀闰土的猹': 'rgb(188, 121, 67)', '孤独的美食基': 'rgb(185, 122, 86)', '非我执笔': 'rgb(180, 114, 108)', '神奇的老皮': '#222222', '敖厂长': 'rgb(231, 187, 88)', '扎双马尾的丧尸': '#333333', '采紫葳的凌霄子': 'rgb(199, 155, 150)', '痒局长': 'rgb(207, 69, 119)', '夜刀神吉吉': 'rgb(209, 151, 115)', '路人A-': 'rgb(44, 160, 178)', '开心嘴炮': 'rgb(130, 111, 129)', '小高姐的魔法调料': 'rgb(145, 121, 110)', '科技美学': 'rgb(130, 118, 132)', '何必Hebee': 'rgb(211, 141, 68)', '优酱胖头鱼': 'rgb(181, 154, 125)', '手工耿': 'rgb(127, 128, 137)', '马壮实Hera': 'rgb(129, 139, 109)', '码哥与马也的日常': 'rgb(194, 154, 144)', '哔哩哔哩纪录片': 'rgb(113, 202, 226)', 'EmmaThePolaris': 'rgb(128, 122, 123)', '努力的Lorre': 'rgb(140, 116, 98)', '进击的冰糖': 'rgb(210, 121, 124)', '故事王StoryMan': 'rgb(132, 112, 26)', '饭帅fun': 'rgb(117, 103, 149)', '三木刃': 'rgb(105, 104, 131)', '点滴菌': 'rgb(17, 128, 243)', '多多poi丶': 'rgb(218, 108, 93)', '紧张的猫饼': 'rgb(92, 128, 156)', '哦呼w': 'rgb(92, 128, 156)', '杰里德Jared': 'rgb(196, 153, 134)', '哎哟阿尤': 'rgb(79, 24, 193)', '冷面Lim': 'rgb(149, 193, 182)', '留学的真相': 'rgb(198, 94, 157)', '辣目洋子': 'rgb(142, 122, 123)', '毒角SHOW': 'rgb(131, 129, 130)', '敬汉卿': 'rgb(142, 131, 124)', '蜡笔小勋是一对儿': 'rgb(190, 154, 154)', '王咩阿': 'rgb(208, 142, 127)', '你的可樱已上线': 'rgb(191, 110, 121)', 'bilibili电影': 'rgb(118, 201, 228)', 'Vinheteiro': 'rgb(151, 123, 105)', 'zy戏精学院院长': 'rgb(190, 148, 126)', 'ADC芒果': 'rgb(230, 176, 77)', '=咬人猫=': 'rgb(199, 155, 120)', '神秘学调查员': 'rgb(147, 142, 124)', '无聊的开箱': 'rgb(144, 114, 16)', '小可儿': 'rgb(114, 132, 190)', '茶理理理子': 'rgb(41, 114, 201)', '郝给力': 'rgb(41, 114, 201)', '燃茶哥哥在此': 'rgb(198, 144, 104)', 'Mafumafu_Channel': 'rgb(249, 101, 106)', '卡卡会发光guang': 'rgb(178, 133, 79)', '贤宝宝Baby': 'rgb(119, 110, 97)', '崩坏3第一偶像爱酱': 'rgb(206, 135, 74)', '渔人阿烽': 'rgb(120, 161, 192)', '十音Shiyin': 'rgb(97, 98, 110)', '还有一天就放假了': 'rgb(212, 141, 60)', '槐安遗梦': 'rgb(158, 131, 101)', '喵喵折App': 'rgb(254, 150, 150)', '四季萌芽': 'rgb(202, 130, 120)', 'A路人': 'rgb(181, 115, 79)', '吃鸡陪玩酱': 'rgb(212, 148, 122)', '哇哇哇妹': 'rgb(249, 108, 127)', '极客湾Geekerwan': 'rgb(117, 176, 195)', '哔哩哔哩会员购': 'rgb(232, 179, 193)', '视角姬': 'rgb(77, 180, 63)', '萧忆情Alex': 'rgb(216, 147, 141)', '狂风桑': 'rgb(193, 155, 126)', '老邪说电影': 'rgb(165, 179, 8)', '某幻君': 'rgb(182, 139, 139)', 'low君热剧': 'rgb(127, 133, 128)', '大连老湿王博文': 'rgb(226, 138, 127)', '野食小哥': 'rgb(136, 127, 121)', '洛天依': 'rgb(152, 141, 180)', 
'bilibili星访问': 'rgb(200, 200, 37)', '桃核叫我桃道长': 'rgb(133, 133, 133)', '水一大魔王': 'rgb(233, 69, 150)', '不正经老丝': 'rgb(184, 121, 79)', '白上吹雪Official': 'rgb(188, 168, 179)', '喝水少年孙十一': 'rgb(198, 145, 127)', '指法芬芳张大仙': 'rgb(182, 152, 150)', 'OELoop': 'rgb(148, 158, 192)', '山药视频': 'rgb(2, 129, 202)', '抽风Crazy': 'rgb(120, 114, 112)', '哔哩哔哩弹幕网': 'rgb(117, 200, 228)', '凉风Kaze': 'rgb(204, 157, 118)', '1818黄金眼': 'rgb(129, 130, 129)', '吃素的狮子': 'rgb(141, 135, 111)', '远古时代装机猿': 'rgb(127, 127, 127)', '曼食慢语': 'rgb(131, 131, 131)', '影视飓风': 'rgb(124, 124, 124)', '渗透之C君': 'rgb(251, 81, 90)', '花少北丶': 'rgb(111, 104, 112)', '声鱼片儿': 'rgb(230, 97, 30)', '暴走漫画': 'rgb(229, 152, 55)', '阿神的Bili官方頻道': 'rgb(178, 161, 142)', '夹性芝士': 'rgb(127, 127, 127)', 'AS极客': 'rgb(138, 126, 105)', '哔哩哔哩活动': 'rgb(189, 82, 110)', '丰兄来了': 'rgb(231, 153, 152)', 'scyrax': 'rgb(210, 133, 126)', '拉宏桑': 'rgb(55, 135, 212)', '郁郁_Yu': 'rgb(130, 140, 206)', '是你的霹雳': 'rgb(104, 119, 215)', '哔哩哔哩直播': 'rgb(20, 177, 234)', 'NathanRich火锅大王': 'rgb(142, 127, 125)', '潇湘公子寻': 'rgb(190, 168, 114)', '纳豆奶奶': 'rgb(174, 125, 79)', '逍遥散人': 'rgb(113, 135, 144)', '郭乐乐很努力': 'rgb(126, 120, 128)', '宝剑嫂': 'rgb(119, 111, 119)', 'colinzang': 'rgb(144, 119, 114)', '曹译文iris': 'rgb(127, 130, 127)', '歪果仁研究协会': 'rgb(201, 166, 91)', '音乐君': 'rgb(203, 107, 78)', '李永乐老师官方': 'rgb(71, 101, 183)', '库特菌': 'rgb(198, 149, 131)', '东尼ookii': 'rgb(182, 149, 125)', '力元君': 'rgb(219, 30, 81)', '哔哩哔哩大会员': 'rgb(251, 166, 190)', '周六野Zoey': 'rgb(159, 163, 178)', '老四赶海': 'rgb(227, 142, 113)', '纯黑-V-布里塔尼亚': 'rgb(187, 164, 159)', '阿漫啊阿漫': 'rgb(20, 102, 144)', '秋呗叔岳': 'rgb(181, 158, 143)', '靠谱电竞': 'rgb(195, 164, 38)', '百万剪辑狮': 'rgb(126, 126, 126)', '电影最TOP': 'rgb(193, 159, 60)', '刘哔电影': 'rgb(243, 92, 70)', '-LKs-': 'rgb(146, 138, 198)', '嘻咦啊看': 'rgb(228, 94, 73)', '凉下采桑': 'rgb(252, 252, 252)', '哔哩哔哩番剧': 'rgb(118, 201, 228)', '一只南音呀': 'rgb(124, 126, 123)', '蜡笔和小勋': 'rgb(190, 154, 154)', 'skyhahalife': 'rgb(150, 150, 150)', '芳斯塔芙': 'rgb(204, 181, 172)', '哔哩哔哩游戏中心': 'rgb(92, 145, 220)', '山海zoom': 'rgb(178, 121, 74)', '是当归哦': 'rgb(151, 136, 128)', '黑桐谷歌': 'rgb(220, 123, 111)', 'PDD在B站': 'rgb(138, 117, 109)', '伊丽莎白鼠': 'rgb(190, 148, 134)', '潮汕好男人': 'rgb(117, 142, 140)', '起小点是大腿': 'rgb(243, 158, 124)', '观察者网': 'rgb(230, 168, 172)', '纯黑-克劳狄乌斯': 'rgb(187, 164, 159)', '徐大sao': 'rgb(187, 186, 196)', 'STN工作室': 'rgb(187, 123, 9)', '皇族电子竞技俱乐部官方账号': 'rgb(144, 126, 104)', '牛叔万岁万岁万万岁': 'rgb(75, 100, 95)', 'TESTV官方频道': 'rgb(135, 131, 121)', '聚印象视频': 'rgb(127, 127, 127)', '暗猫の祝福': 'rgb(158, 200, 207)', '凤凰天使TSKS韩剧社官方账号': 'rgb(180, 147, 139)', '努巴尼守望先锋': 'rgb(142, 144, 147)', '少年Pi': 'rgb(152, 133, 119)', '老实憨厚的笑笑': 'rgb(193, 147, 108)'} \ No newline at end of file diff --git a/get_data/face.py b/get_data/face.py new file mode 100644 index 0000000..b312adb --- /dev/null +++ b/get_data/face.py @@ -0,0 +1 @@ +face = {'华农兄弟': 'http://i1.hdslb.com/bfs/face/bac504655c69ab937b0be4557e27535f794b0c66.jpg', '信誓蛋蛋': 'http://i2.hdslb.com/bfs/face/df0be0f1946581030cbaf34e3f66a996f0a1af4a.jpg', '痴鸡小队官方': 'http://i2.hdslb.com/bfs/face/25b9ca4626a41dbb249a2cf144b14200cb4c34e1.jpg', '中国BOY超级大猩猩': 'http://i1.hdslb.com/bfs/face/068939602dae190c86f6b36ca301281d7d8aa6d9.jpg', '唯一音乐小魔王': 'http://i1.hdslb.com/bfs/face/d17ddb244a783fa8179c362209080c48716beebf.jpg', '-欣小萌-': 'http://i1.hdslb.com/bfs/face/b5aee8b711fd655c70d705678b4e350ae255a1d0.jpg', '英雄联盟': 'http://i1.hdslb.com/bfs/face/04d579b1644ee8864e7aea01219dae4a94b469ce.jpg', '爱做饭的芋头SAMA': 'http://i1.hdslb.com/bfs/face/69c2df87253eabe27e8257dc827d186cceabc3f1.jpg', '美食作家王刚R': 
'http://i0.hdslb.com/bfs/face/1463fa4ea6bffd867dc257dca87248bb1d671cde.jpg', '盗月社食遇记': 'http://i1.hdslb.com/bfs/face/ab901fc0571698bb9f389798029c3fc4c0188311.jpg', '怕上火暴王老菊': 'http://i0.hdslb.com/bfs/face/2edf4a4f534869a63158d13a4b6b9676d75f1e0a.jpg', '机智的党妹': 'http://i0.hdslb.com/bfs/face/d83e93dc9101cc0c416ca09ad33a63bdf3d26c6c.jpg', '靠脸吃饭的徐大王': 'http://i1.hdslb.com/bfs/face/0e6e0f313a195e293d4ee6ae8ab86a3074abb315.jpg', '长歌与小见见': 'http://i2.hdslb.com/bfs/face/04bf6928fcdb0452dee9e1aed6a6ff1becd51bcd.jpg', 'papi酱': 'http://i0.hdslb.com/bfs/face/e45a7b248f496fad8b32d3b9cfa0335339331798.jpg', '我是郭杰瑞': 'http://i1.hdslb.com/bfs/face/6182455e4d61159121c223ddc7a3a381f2d4d056.jpg', '倒悬的橘子': 'http://i0.hdslb.com/bfs/face/440968b6694576e931ed4ec61d699029d82bcbaa.jpg', 'LexBurner': 'http://i1.hdslb.com/bfs/face/2996e22a24eed2d7767e452627a9130207defe6a.jpg', '狂人实验室': 'http://i1.hdslb.com/bfs/face/4e5e30311607340a607f52a8bfbc74b12d00558d.jpg', '木鱼水心': 'http://i1.hdslb.com/bfs/face/696df59d35c78430f1a0bdb6184558e7b7eb4a6e.jpg', '允星河Yoseya': 'http://i0.hdslb.com/bfs/face/6f0920f35ae664e937d455ba4e1ef1ac10fb80b5.jpg', '翔翔大作战': 'http://i0.hdslb.com/bfs/face/1ad13832c10e8018dc8e0e7671a6b8594ddab0c0.jpg', '共青团中央': 'http://i2.hdslb.com/bfs/face/52e16dad8aa29b6214bbfadf702e83eeac34ad9f.jpg', '猎奇笔记本': 'http://i2.hdslb.com/bfs/face/e3cb24355694fa887741871a9a1e22ae590e3769.jpg', '吴织亚切大忽悠': 'http://i1.hdslb.com/bfs/face/3b89afc4e25e534a8e4165952116aa9265ea201f.jpg', '哔哩哔哩英雄联盟赛事': 'http://i2.hdslb.com/bfs/face/f07c74fe2a020b33ab1035fea6d3338b6a6e6749.jpg', '赫萝老师': 'http://i0.hdslb.com/bfs/face/f3776594fc0ff076bdfcc8fc4921327239a7150e.jpg', '老番茄': 'http://i2.hdslb.com/bfs/face/bc5ca101313d4db223c395d64779e76eb3482d60.jpg', 'MordonFreeman': 'http://i2.hdslb.com/bfs/face/c076aac067d1d32c4bdd7b6aa5dc1930185bf91a.jpg', '花花与三猫CatLive': 'http://i1.hdslb.com/bfs/face/1871c834255ffea531f699164e70f0daebc7558b.jpg', 'EdmundDZhang': 'http://i2.hdslb.com/bfs/face/5d94b9727a49815716cd66fc7ba3840382025c56.jpg', '浅澄月': 'http://i2.hdslb.com/bfs/face/512fd36e0fb24746f04aa1073ea89c1a2f91f7e1.jpg', '徐大虾咯': 'http://i0.hdslb.com/bfs/face/f24ea395c75b7a73db57da2a2920f6e84c902082.jpg', '王大境泽': 'http://i0.hdslb.com/bfs/face/fd66b20e63fe0a3e2331892feb7ecc5ff871dcb5.jpg', '哔哩哔哩线下活动': 'http://i1.hdslb.com/bfs/face/10c354765e8a1b3fa47fba1594edc866145bd79b.jpg', '神秘店长A': 'http://i1.hdslb.com/bfs/face/cdb729f3b5cb6dc9a0cb7cc9b8c63aebf1bc5b6b.jpg', '楼上的老张': 'http://i0.hdslb.com/bfs/face/b159d3f9a0ba088dbfc845a849e84bb9f110e6f2.jpg', '土味角虫': 'http://i1.hdslb.com/bfs/face/21d2a655e44aa4844c7353d138b33581f7aaa94f.jpg', '低调的帅爷': 'http://i0.hdslb.com/bfs/face/e42e975697d7237d96992210cf801ae5e87af354.jpg', '靖菌命': 'http://i2.hdslb.com/bfs/face/6f6411b3d701ad213df75b8f9ad8910fc1ebe408.jpg', '日本沙雕日常': 'http://i2.hdslb.com/bfs/face/68aa0664390afa981cf78d8bda4042ec55d26170.jpg', 'MrYang杨家成': 'http://i0.hdslb.com/bfs/face/623fea846d8ba3b11e36d6dbc44baca08238a3d3.jpg', '黑镖客梦回': 'http://i1.hdslb.com/bfs/face/83c8bf808e662d02291a41ab5992541e2707a5d2.jpg', '无聊的Do君': 'http://i0.hdslb.com/bfs/face/d00ef6d8ceea8edee3cd61e5e87bff036189a5bb.jpg', '泛式': 'http://i2.hdslb.com/bfs/face/5f60d345059b82f0878984d9f9133f45b33b82be.jpg', '芒果冰OL': 'http://i0.hdslb.com/bfs/face/f40b734ef61f95f8adb3beca5b7b693db399c50e.jpg', '面筋哥-程书林': 'http://i1.hdslb.com/bfs/face/44fd943316e177fcf91ebde537bcff65f7a84515.jpg', '刘老师说电影': 'http://i1.hdslb.com/bfs/face/145226ff2d32d7c99b8ea3591ffec2f38fc9d3d3.jpg', '李子柒': 
'http://i1.hdslb.com/bfs/face/82d27965dae3b2fe9e52780c6309c7b37ad4cbf2.jpg', '10后找人带': 'http://i0.hdslb.com/bfs/face/b8b3badb8344b72f1f4746eac2817f8423aeec2b.jpg', '丝血反杀闰土的猹': 'http://i2.hdslb.com/bfs/face/9672e303acae98a22efb54d0319e60294db887c3.jpg', '孤独的美食基': 'http://i2.hdslb.com/bfs/face/df188387360d6ad90f9e36ac4aaea68ed9f3c9be.jpg', '非我执笔': 'http://i2.hdslb.com/bfs/face/4814cdf4293ba26839f895bd478efaf21bf299db.jpg', '神奇的老皮': 'http://i2.hdslb.com/bfs/face/e7c191e9be6764107415069b36f7d9564f149c86.gif', '敖厂长': 'http://i0.hdslb.com/bfs/face/156d5d3b3f4b66d940365b3b0e3a809e1fcc0d97.jpg', '扎双马尾的丧尸': 'http://i1.hdslb.com/bfs/face/5be61949369dd844cc459eab808da151d8c363d2.gif', '采紫葳的凌霄子': 'http://i0.hdslb.com/bfs/face/b3d6d9871475f15db85340e1ed12c93f2a8b81a9.jpg', '痒局长': 'http://i2.hdslb.com/bfs/face/bcdf640faa16ebaacea1d4c930baabaec9087a80.jpg', '夜刀神吉吉': 'http://i0.hdslb.com/bfs/face/04c6f83245fdfc39440c19c5e70f73a5351d0ada.jpg', '路人A-': 'http://i0.hdslb.com/bfs/face/0383674a5cf40d2163af1f5b80f8cd90a2d62e2c.jpg', '开心嘴炮': 'http://i1.hdslb.com/bfs/face/5cd3053f352bc597bac3d11b824e548108423cb2.jpg', '小高姐的魔法调料': 'http://i2.hdslb.com/bfs/face/aae000e04ce872b58e6bbfc1632cc5bc2203084c.jpg', '科技美学': 'http://i0.hdslb.com/bfs/face/f6f8dc53ddf3ba07c1f0dd3ad87fe92034198b81.jpg', '何必Hebee': 'http://i0.hdslb.com/bfs/face/08032c6289096a9e6869a5cd5c05280060b10532.jpg', '优酱胖头鱼': 'http://i0.hdslb.com/bfs/face/7a1f15e4fc51e16c32752e10e40e7fa3ffa81eb6.jpg', '手工耿': 'http://i2.hdslb.com/bfs/face/b8a75ae7d2a0e2af1d36ca9f1084d850eebb28e3.jpg', '马壮实Hera': 'http://i2.hdslb.com/bfs/face/92d75c347df9a50cdd691b6c62dafff93138be8e.jpg', '码哥与马也的日常': 'http://i0.hdslb.com/bfs/face/692b776a39b208aacfb6ef6a5ccfb6cfe2861bb6.jpg', '哔哩哔哩纪录片': 'http://i1.hdslb.com/bfs/face/33687c6c4707352cd25fac995cd416009830c917.jpg', 'EmmaThePolaris': 'http://i2.hdslb.com/bfs/face/5433666ef01f1e51e3f21bd3d509ed7bb68eff87.jpg', '努力的Lorre': 'http://i1.hdslb.com/bfs/face/c63ebeed7d49967e2348ef953b539f8de90c5140.jpg', '进击的冰糖': 'http://i0.hdslb.com/bfs/face/8294462b9c92d587c5982a5ec5008d808325056e.jpg', '故事王StoryMan': 'http://i2.hdslb.com/bfs/face/b55679bd383423cb02f0992e44f19a68c6f5fd1b.jpg', '饭帅fun': 'http://i1.hdslb.com/bfs/face/1f8c1c2665d8c1cbb14c4dbe5e09d06f0e78e314.jpg', '三木刃': 'http://i0.hdslb.com/bfs/face/785f79f302e6166079ad2fef933dcbfd435cca4b.jpg', '点滴菌': 'http://i2.hdslb.com/bfs/face/fd8c1c3a5a454c75eafe12464989e9d794179d29.jpg', '多多poi丶': 'http://i2.hdslb.com/bfs/face/3f55db249421c556084719cd9581036d67e93ed4.jpg', '紧张的猫饼': 'http://i2.hdslb.com/bfs/face/fc1b1cd176a854c4fd718694b9216469fa148f4b.jpg', '哦呼w': 'http://i0.hdslb.com/bfs/face/57ead5621801ec8a637bc47754e00e9ae6e62888.gif', '杰里德Jared': 'http://i2.hdslb.com/bfs/face/5138b1aea6da8a7e9962562208ed848a417e207b.jpg', '哎哟阿尤': 'http://i0.hdslb.com/bfs/face/39f892f717cd18ad5de3d61c1a99c3e50f6ab390.jpg', '冷面Lim': 'http://i0.hdslb.com/bfs/face/a52f915170f1f3bfbe8e1f59a5b512936669cade.jpg', '留学的真相': 'http://i1.hdslb.com/bfs/face/7af19e342cb560870733804a94e46a45a1d1e771.jpg', '辣目洋子': 'http://i0.hdslb.com/bfs/face/a2c5ca6dd5be6b89e94294e06f9e365a29b7943b.jpg', '毒角SHOW': 'http://i0.hdslb.com/bfs/face/24486911dc40a0faa23b025b4493f15b086c65cc.jpg', '敬汉卿': 'http://i1.hdslb.com/bfs/face/a5c6005a27da6afd52021dc07423f7b4a78a466c.jpg', '蜡笔小勋是一对儿': 'http://i0.hdslb.com/bfs/face/8a8812e0a9bb3adda90044ef48830584e1efe7a2.jpg', '王咩阿': 'http://i1.hdslb.com/bfs/face/dce221c5c508ca338eb0428a751e27b41ec7bfeb.jpg', '你的可樱已上线': 'http://i1.hdslb.com/bfs/face/d72b7fe43b556fb5872ab9bdad771b1476928796.jpg', 
'bilibili电影': 'http://i0.hdslb.com/bfs/face/60a9153609998b04301dc5b8ed44c41b537a2268.jpg', 'Vinheteiro': 'http://i2.hdslb.com/bfs/face/dff03339974b20517bbe26ce49ea9d2a39831023.jpg', 'zy戏精学院院长': 'http://i2.hdslb.com/bfs/face/9dc26bc30f200dd131b9479fc30c443264c39278.jpg', 'ADC芒果': 'http://i2.hdslb.com/bfs/face/cce7e724724581e1323d5bdd7e00796ee8aec8b8.jpg', '=咬人猫=': 'http://i1.hdslb.com/bfs/face/8fad84a4470f3d894d8f0dc95555ab8f2cb10a83.jpg', '神秘学调查员': 'http://i2.hdslb.com/bfs/face/b803792a721b61da30e46e7e93d7d818f4f0bc48.jpg', '无聊的开箱': 'http://i2.hdslb.com/bfs/face/9ecc839c3b28f9752bfae2834333ddc32930787f.jpg', '小可儿': 'http://i2.hdslb.com/bfs/face/4d7fbb6c47b097e297b958c8dc74287cd880fc4a.jpg', '茶理理理子': 'http://i0.hdslb.com/bfs/face/557d3f4b8cee7d413714f48f9ec671c4c44e1e6c.gif', '郝给力': 'http://i0.hdslb.com/bfs/face/94e057ae6738be1810e1747df306ac12e7c3aece.jpg', '燃茶哥哥在此': 'http://i1.hdslb.com/bfs/face/938eca5533223a6d30f0582eb3e5350049f9c1cf.jpg', 'Mafumafu_Channel': 'http://i0.hdslb.com/bfs/face/0cee96b40676ede1c7b956e636b240a4119da9f9.jpg', '卡卡会发光guang': 'http://i1.hdslb.com/bfs/face/c9dbb60c7d27a7bd9750d5c2de982c3d23d7dc82.jpg', '贤宝宝Baby': 'http://i1.hdslb.com/bfs/face/c91f1ff05b7da257278bb88d4959733ac12ab3b3.jpg', '崩坏3第一偶像爱酱': 'http://i2.hdslb.com/bfs/face/cb5facbc29275e6bdc4cfa9c20e47ecdb0fe3392.jpg', '渔人阿烽': 'http://i2.hdslb.com/bfs/face/efcf3f7f3acbd93247b04b69d705f67301bfd2f7.jpg', '十音Shiyin': 'http://i1.hdslb.com/bfs/face/71d8b38d96ec67a06ff87359a29418dd944c60b1.jpg', '还有一天就放假了': 'http://i1.hdslb.com/bfs/face/30e515fc2ff5435a5cfb31c289e0bcef860bad1c.jpg', '槐安遗梦': 'http://i1.hdslb.com/bfs/face/a7b82ad44b0194430bc9d8a6ebc94d418ec1b085.jpg', '喵喵折App': 'http://i1.hdslb.com/bfs/face/6582bf05f0a16716a68ddfce84342e239f7bcb68.jpg', '四季萌芽': 'http://i0.hdslb.com/bfs/face/2db05455674f3eeffdf9195f5ed51e7fa66bc763.jpg', 'A路人': 'http://i2.hdslb.com/bfs/face/c4022010115d00da6667a5cf799d5400067d7f66.jpg', '吃鸡陪玩酱': 'http://i0.hdslb.com/bfs/face/0d42041fabc7046f171f8de27d8972c76443a6af.jpg', '哇哇哇妹': 'http://i0.hdslb.com/bfs/face/c134be58f0a8a5bde4b96437c7e4e04968449a19.jpg', '极客湾Geekerwan': 'http://i0.hdslb.com/bfs/face/d0f7a7ee34a4a45c8390eb3a07e4d7f2d70bae91.jpg', '哔哩哔哩会员购': 'http://i2.hdslb.com/bfs/face/ba56c5cac0809c7f389f78b1e4dce4971ba07d52.jpg', '视角姬': 'http://i0.hdslb.com/bfs/face/851025ed4bc57ad6ae82a6314e72e249fd21d604.jpg', '萧忆情Alex': 'http://i1.hdslb.com/bfs/face/98ce5676de8391d3e3164c563866f32ceba9e18b.jpg', '狂风桑': 'http://i1.hdslb.com/bfs/face/5076846bb07c2d6ec442856f69214cae215301f3.jpg', '老邪说电影': 'http://i1.hdslb.com/bfs/face/0fbf6aad74852d4cd68f72d50de62d9f11f7e886.jpg', '某幻君': 'http://i1.hdslb.com/bfs/face/9ed5ebf1e3694d9cd2b4fcd1d353759ee83b3dfe.jpg', 'low君热剧': 'http://i1.hdslb.com/bfs/face/fabe17ef4b344336b872e3ded46a4a4b11140e7f.jpg', '大连老湿王博文': 'http://i0.hdslb.com/bfs/face/21fa2a84a8733cd91a940fad6af81147538ae968.jpg', '野食小哥': 'http://i1.hdslb.com/bfs/face/e0610a26bc510770d997385fb81b9c47157a53c4.jpg', '洛天依': 'http://i1.hdslb.com/bfs/face/67bd11fcd3be8fac5cef1a743a16f0a8cdf39463.jpg', 'bilibili星访问': 'http://i0.hdslb.com/bfs/face/f68925a967357060898da90fb3be14ebd5289879.jpg', '桃核叫我桃道长': 'http://i2.hdslb.com/bfs/face/16279e85ec9dc197f6d306abf84a7f8cd75a0b5e.jpg', '水一大魔王': 'http://i2.hdslb.com/bfs/face/ffdedf591d5ed5a5f584c61fec1add096a61d75a.jpg', '不正经老丝': 'http://i0.hdslb.com/bfs/face/126c00cc089a3c0f540da4525e6ba41452ad83fc.jpg', '白上吹雪Official': 'http://i0.hdslb.com/bfs/face/3b3f2cde975bf334e4f5948709e42b6569c0755d.jpg', '喝水少年孙十一': 
'http://i0.hdslb.com/bfs/face/a258cfccf39862242b61328e11dbb926a3b6dbe1.jpg', '指法芬芳张大仙': 'http://i2.hdslb.com/bfs/face/6426336744dd49a744eca32855808073988bd2ce.jpg', 'OELoop': 'http://i1.hdslb.com/bfs/face/6ed036381a0d0a86714816ac393dd9bf1b0bef21.jpg', '山药视频': 'http://i0.hdslb.com/bfs/face/357b015de3b9f4c04527d4fefb844460397ac8b0.jpg', '抽风Crazy': 'http://i2.hdslb.com/bfs/face/04d92c0e30315e56e1365dc9ac2d2cf32e2fe039.jpg', '哔哩哔哩弹幕网': 'http://i0.hdslb.com/bfs/face/8aa6bd8cf269021ffa7a03f7a903ca899c11b7fb.jpg', '凉风Kaze': 'http://i1.hdslb.com/bfs/face/e0cc906bb531195e9ee9f3b575effdd2b056eaea.jpg', '1818黄金眼': 'http://i1.hdslb.com/bfs/face/4cc8d48c9fd68d3e511851b876e04bb953cb095e.jpg', '吃素的狮子': 'http://i0.hdslb.com/bfs/face/249bfa1b3d3e0932f533bc5364964b132fe9c6c2.jpg', '远古时代装机猿': 'http://i2.hdslb.com/bfs/face/ec008e32064705c576f3ffd73d20288e441d945f.jpg', '曼食慢语': 'http://i2.hdslb.com/bfs/face/49dab862b8acaf9a80a921df04c9532f1c826ebc.jpg', '影视飓风': 'http://i0.hdslb.com/bfs/face/c1733474892caa45952b2c09a89323157df7129a.jpg', '渗透之C君': 'http://i2.hdslb.com/bfs/face/623ccce0ab28b721edb61dd64749d91de18fb384.jpg', '花少北丶': 'http://i1.hdslb.com/bfs/face/86ef6895a8f88c80f2885e7eb9ba7989db437b93.jpg', '声鱼片儿': 'http://i1.hdslb.com/bfs/face/e25a06ab0e6e83d8e839dfccec43cc14089c308c.jpg', '暴走漫画': 'http://i2.hdslb.com/bfs/face/c93ff5df2cd0c0c9384b139c86d1a56a9540c9a2.jpg', '阿神的Bili官方頻道': 'http://i1.hdslb.com/bfs/face/a0648687f1b1b7b385b9abdffedb1ba4de78bab0.jpg', '夹性芝士': 'http://i1.hdslb.com/bfs/face/c66dd2ef0573ac91a5a85f79b0c2188b02c98633.jpg', 'AS极客': 'http://i2.hdslb.com/bfs/face/34379e726c3ee063571aea987ff3a27c5b1a536e.jpg', '哔哩哔哩活动': 'http://i0.hdslb.com/bfs/face/45728a09aa3cd19f13247fce58aad38ac46a0f1d.jpg', '丰兄来了': 'http://i2.hdslb.com/bfs/face/369f64c5f24324cf85eaedfc6104325bd6e950cf.jpg', 'scyrax': 'http://i0.hdslb.com/bfs/face/efc37d5941f67087c8b84e99760ae47721f8c443.jpg', '拉宏桑': 'http://i1.hdslb.com/bfs/face/e74603c80efa529c76dce7ede07f986da0af8b0d.jpg', '郁郁_Yu': 'http://i0.hdslb.com/bfs/face/7816841e2e6fefffbb785bb77f1654cc91224338.jpg', '是你的霹雳': 'http://i1.hdslb.com/bfs/face/691bcd9a0f2a77087af2a0962179deab1b87756d.jpg', '哔哩哔哩直播': 'http://i0.hdslb.com/bfs/face/58ca83240e58c3e6ce1ebb9828ce84f6f1b72863.jpg', 'NathanRich火锅大王': 'http://i0.hdslb.com/bfs/face/f2d0fa88345e2f511ae518cc655da9ded11471a8.jpg', '潇湘公子寻': 'http://i2.hdslb.com/bfs/face/a004cfeec6feb65fa421c9a4a49738999ab032de.jpg', '纳豆奶奶': 'http://i0.hdslb.com/bfs/face/0a8638b34173708fcf979bd1166fbb7fdb1110a4.jpg', '逍遥散人': 'http://i2.hdslb.com/bfs/face/d0dad8800774a4903547b1326d1fd927df47b4e9.jpg', '郭乐乐很努力': 'http://i1.hdslb.com/bfs/face/0b609106d937936fb24f92ec42e6ee08e1306a4e.jpg', '宝剑嫂': 'http://i1.hdslb.com/bfs/face/b4be6527726b04049a066d5a073afd7c1a733ee6.jpg', 'colinzang': 'http://i2.hdslb.com/bfs/face/a6be4da0320912851bd5f31479178ec2e5af45c3.jpg', '曹译文iris': 'http://i0.hdslb.com/bfs/face/a21a7c95042c5187575d6768ea5ec9ee331fee3a.jpg', '歪果仁研究协会': 'http://i2.hdslb.com/bfs/face/0ea49d2ad88658689eb31746c09f716c09581e4f.jpg', '音乐君': 'http://i2.hdslb.com/bfs/face/20ec194172acfdbca47539af7779ae4890c753ec.jpg', '李永乐老师官方': 'http://i0.hdslb.com/bfs/face/b299b0e20dc4fbbd45e13ba86f330d890c0a0118.jpg', '库特菌': 'http://i0.hdslb.com/bfs/face/82a5d10cd93a90ec1b255b279578fe3801908831.jpg', '东尼ookii': 'http://i2.hdslb.com/bfs/face/c2c3ce926ec8d9edc361ca706c9dce9415c15114.jpg', '力元君': 'http://i1.hdslb.com/bfs/face/b831ab8e458d7763d397c41f7b1298fe20a0d096.jpg', '哔哩哔哩大会员': 'http://i1.hdslb.com/bfs/face/7266e09e212b2e5109465d877dfa9a58e4ba33d4.jpg', '周六野Zoey': 
'http://i1.hdslb.com/bfs/face/c95abd1564994c36dd481c21acf14a83aeb61ab8.jpg', '老四赶海': 'http://i0.hdslb.com/bfs/face/d6b4a211a323f1c8fa35f42b7caca6bcdb9e68c9.jpg', '纯黑-V-布里塔尼亚': 'http://i2.hdslb.com/bfs/face/e8ab7b02d6576f4141ea857734b68b9dd35a5730.jpg', '阿漫啊阿漫': 'http://i2.hdslb.com/bfs/face/ec38b52e36d4577fe865e43bdeef4375221f1250.jpg', '秋呗叔岳': 'http://i1.hdslb.com/bfs/face/71805f5a1971f2637059da49e46f5eafbe4223cb.jpg', '靠谱电竞': 'http://i2.hdslb.com/bfs/face/b37e8c05448cb24b15a4f29581c66fdff1062147.jpg', '百万剪辑狮': 'http://i1.hdslb.com/bfs/face/7974bacea323324b7175055e7a942a676f994132.jpg', '电影最TOP': 'http://i0.hdslb.com/bfs/face/6b2ade215ea603b495648875c925172a863d16d4.jpg', '刘哔电影': 'http://i0.hdslb.com/bfs/face/654ed89c763ca1f003cc2d0145e1ec444cc7f0e8.jpg', '-LKs-': 'http://i2.hdslb.com/bfs/face/6bc7a2778be455273c82e45bfca55cb0f70c820b.jpg', '嘻咦啊看': 'http://i2.hdslb.com/bfs/face/170e1009846dd3ad8cdcd4050527b9cf2ecf67ab.jpg', '凉下采桑': 'http://i0.hdslb.com/bfs/face/c21e4fb4d44e033d7d15283057aa2021e5d0bfb7.jpg', '哔哩哔哩番剧': 'http://i1.hdslb.com/bfs/face/60a9153609998b04301dc5b8ed44c41b537a2268.jpg', '一只南音呀': 'http://i2.hdslb.com/bfs/face/166e2039c60e0b6d8d24bf76bb3628f08804a431.jpg', '蜡笔和小勋': 'http://i2.hdslb.com/bfs/face/8a8812e0a9bb3adda90044ef48830584e1efe7a2.jpg', 'skyhahalife': 'http://i0.hdslb.com/bfs/face/2e5cada7e126a5c43c2ba5576c4dd1725b7075a0.jpg', '芳斯塔芙': 'http://i1.hdslb.com/bfs/face/b3e851174abfbe9622cf3d336f302531d9cdfc97.jpg', '哔哩哔哩游戏中心': 'http://i2.hdslb.com/bfs/face/509a6ca97e1599e55739b048c0770ce4a63531a5.jpg', '山海zoom': 'http://i0.hdslb.com/bfs/face/7289f4e7032cef7c6be32b07f8c559a46627d293.jpg', '是当归哦': 'http://i0.hdslb.com/bfs/face/201b100685362d362434bbbed11834a2a6753401.jpg', '黑桐谷歌': 'http://i0.hdslb.com/bfs/face/31706c82949b3ba4756a411825c3f16aeb14ad44.jpg', 'PDD在B站': 'http://i0.hdslb.com/bfs/face/10a4d2e4b20cf2c08c17604f7a192eb9d7d293f2.jpg', '伊丽莎白鼠': 'http://i2.hdslb.com/bfs/face/6c36ec15f6d7ddd9bdb558511521bd0256779e1c.jpg', '潮汕好男人': 'http://i0.hdslb.com/bfs/face/2a4c1b3c2d8f48a58c5f818287c90a6ea78d7907.jpg', '起小点是大腿': 'http://i0.hdslb.com/bfs/face/94b65197150887e1bc1671be6598d7fd2ea3132b.jpg', '观察者网': 'http://i2.hdslb.com/bfs/face/790c525e30faa01b855e2d4f1126d7e8a7632af0.jpg', '纯黑-克劳狄乌斯': 'http://i0.hdslb.com/bfs/face/e8ab7b02d6576f4141ea857734b68b9dd35a5730.jpg', '徐大sao': 'http://i2.hdslb.com/bfs/face/17323892be2e17b50acadb89b2e385b493f5fb4d.jpg', 'STN工作室': 'http://i1.hdslb.com/bfs/face/c43e6cab13c9a0303cf8476cfd405cff61195726.jpg', '皇族电子竞技俱乐部官方账号': 'http://i2.hdslb.com/bfs/face/ab85323352b2af52e7a5477cd963d5da98fdab4e.jpg', '牛叔万岁万岁万万岁': 'http://i2.hdslb.com/bfs/face/3c9d436659037cc17fe0d32975ef7f17710ba820.jpg', 'TESTV官方频道': 'http://i1.hdslb.com/bfs/face/34ccaf9461c67482e3164675c0036e94df18b7a7.jpg', '聚印象视频': 'http://i0.hdslb.com/bfs/face/98e2202d8e2725fe3a4b7c3adafeb91764584bbb.jpg', '暗猫の祝福': 'http://i0.hdslb.com/bfs/face/c286d67fbc3abe9f47c115fdae797e9b140156b9.jpg', '凤凰天使TSKS韩剧社官方账号': 'http://i2.hdslb.com/bfs/face/97cdf2282bb2c9d4e8ae5184356097844d1ba4d7.jpg', '努巴尼守望先锋': 'http://i2.hdslb.com/bfs/face/358e0b207a9c6012778e4ad1b21356c91536e790.jpg', '少年Pi': 'http://i0.hdslb.com/bfs/face/d851f48a579778b06249bf3debaa62d353694e91.jpg', '老实憨厚的笑笑': 'http://i1.hdslb.com/bfs/face/048853815037dd95d4193dd6a3d561db428a927f.jpg'} \ No newline at end of file diff --git a/run.py b/run.py index 0897e3b..765afc1 100644 --- a/run.py +++ b/run.py @@ -81,10 +81,9 @@ def run_threaded(job_func): schedule.every().week.do(run_threaded, video_spider_all) 
 schedule.every().week.do(run_threaded, videoRank)
 schedule.every().hour.do(run_threaded, site)
-schedule.every(15).minutes.do(run_threaded, online)
+# schedule.every(15).minutes.do(run_threaded, online)
 schedule.every(10).minutes.do(run_threaded, strong)
-
 print('开始运行计划任务..')
 while True:
     schedule.run_pending()

From 6d34ef3d5913153d3e6ded079ab40350900e5c47 Mon Sep 17 00:00:00 2001
From: Jannchie
Date: Tue, 9 Apr 2019 19:35:35 +0800
Subject: [PATCH 259/469] feature: fans increasing rate for make video

---
 biliob_requests/get_user_coin.py | 18 ++++
 get_data/aggregate_fans_rate.py | 78 +++++++++++++++++++++++---------
 get_data/color.py | 1 +
 get_data/face.py | 1 +
 run.py | 3 +-
 5 files changed, 77 insertions(+), 24 deletions(-)
 create mode 100644 biliob_requests/get_user_coin.py
 create mode 100644 get_data/color.py
 create mode 100644 get_data/face.py

diff --git a/biliob_requests/get_user_coin.py b/biliob_requests/get_user_coin.py
new file mode 100644
index 0000000..a570631
--- /dev/null
+++ b/biliob_requests/get_user_coin.py
@@ -0,0 +1,18 @@
+import requests
+from db import db
+import time
+author_coll = db['author']
+URL = 'https://api.bilibili.com/x/space/acc/info?mid={mid}'
+with open('D:/数据/B站/UP主硬币数.csv', 'w',encoding="utf-8-sig") as f:
+    for each_author in author_coll.find({}, {'mid': 1, 'name': 1}):
+        mid = each_author['mid']
+        response = requests.get(URL.format(mid=mid))
+        j = response.json()
+
+        if 'code' in j and j['code'] != -404 and 'data' in j and 'coins' in j[
+                'data']:
+            print('"{}","{}"\n'.format(each_author['name'],
+                                       j['data']['coins']))
+            f.write('"{}","{}"\n'.format(each_author['name'],
+                                         j['data']['coins']))
+        time.sleep(0.5)
\ No newline at end of file
diff --git a/get_data/aggregate_fans_rate.py b/get_data/aggregate_fans_rate.py
index 15b34dd..4447bdb 100644
--- a/get_data/aggregate_fans_rate.py
+++ b/get_data/aggregate_fans_rate.py
@@ -1,62 +1,96 @@
 from db import db
 import datetime
 from scipy.interpolate import interp1d
+from haishoku.haishoku import Haishoku
+
+from face import face
+from color import color
 start_date = datetime.datetime(2018, 11, 1)
 end_date = datetime.datetime.now()
-date_range = 7 * 24 * 60 * 60
+date_range = 30 * 24 * 60 * 60
 delta_date = 0.25 * 24 * 60 * 60
 date_format = '%Y-%m-%d %H:%M'
 d = {}
 current_date = start_date.timestamp()
 while (current_date < end_date.timestamp()):
-    c_date = datetime.datetime.fromtimestamp(
-        current_date).strftime(date_format)
+    c_date = datetime.datetime.fromtimestamp(current_date).strftime(
+        date_format)
     d[c_date] = []
     current_date += delta_date
-
-for each_author in db['author'].find({'cFans': {'$gt': 200000}}).batch_size(1):
+for each_author in db['author'].find({'cFans': {'$gt': 200000}}).batch_size(5):
     current_date = start_date.timestamp()
     data = sorted(each_author['data'], key=lambda x: x['datetime'])
-    x = list(map(
-        lambda each_data: each_data['datetime'].timestamp(), data))
+    x = list(map(lambda each_data: each_data['datetime'].timestamp(), data))
     y = list(map(lambda each_data: each_data['fans'], data))
     if len(x) <= 2:
         continue
     interrupted_fans = interp1d(x, y, kind='linear')
     current_date = start_date.timestamp()
-
     while (current_date < min(end_date.timestamp(), x[-1])):
+        begin_date = current_date - date_range
+        if begin_date <= x[0]:
+            begin_date = x[0]
         # 出界
-        if (current_date - date_range) > x[0] and current_date < x[-1]:
-            fans_func = interrupted_fans(
-                [current_date - date_range, current_date])
+        if begin_date >= x[0] and current_date < x[-1] and current_date > x[0]:
+            fans_func = 
interrupted_fans([begin_date, current_date]) delta_fans = int(fans_func[1] - fans_func[0]) pass - c_date = datetime.datetime.fromtimestamp( - current_date).strftime(date_format) - print('"{}","{}","{}"'.format( - each_author['name'], delta_fans, c_date)) + c_date = datetime.datetime.fromtimestamp(current_date).strftime( + date_format) + print('"{}","{}","{}"'.format(each_author['name'], delta_fans, + c_date)) # d[c_date].append((delta_fans, each_author['name'])) - d[c_date].append((each_author['name'], delta_fans)) + d[c_date].append((each_author['name'], delta_fans, + each_author['face'])) if len(d[c_date]) >= 200: d[c_date] = sorted( d[c_date], key=lambda x: x[1], reverse=True)[:20] current_date += delta_date +for c_date in d: + d[c_date] = sorted(d[c_date], key=lambda x: x[1], reverse=True)[:20] -d[c_date] = sorted( - d[c_date], key=lambda x: x[1], reverse=True)[:20] - -with open('D:/数据/B站/fans/190319.csv', 'w', encoding="utf-8-sig") as f: +with open('D:/数据/B站/fans/月结粉丝.csv', 'w', encoding="utf-8-sig") as f: f.writelines('date,name,value\n') for each_date in d: for each_data in d[each_date]: - f.writelines('"{}","{}","{}"\n'.format( - each_date, each_data[0], each_data[1])) + f.writelines('"{}","{}","{}"\n'.format(each_date, each_data[0], + each_data[1])) +authors = [] +for each_date in d: + for each_author in d[each_date]: + authors.append(each_author[0]) + if each_author[0] not in face: + face[each_author[0]] = each_author[2] +with open('./get_data/face.py', 'w', encoding="utf-8-sig") as f: + f.writelines('face = ' + str(face)) + +for each_author in face: + if each_author in color: + continue + if face[each_author][-3:] == 'gif': + color[each_author] = '#000000' + else: + color_list = Haishoku.getPalette(face[each_author]) + color_list = sorted( + color_list, key=lambda x: x[1][0] + x[1][1] + x[1][2]) + color[each_author] = 'rgb' + \ + str(color_list[int(len(color_list)/2)][1]) + +with open('./get_data/color.py', 'w', encoding="utf-8-sig") as f: + f.writelines('color = ' + str(color)) + +min_fans = 99999999 +for each_author in authors: + c_fans = db['author'].find_one({'name': each_author}, + {'cFans': True})['cFans'] + if c_fans <= min_fans: + min_fans = c_fans +print(min_fans) \ No newline at end of file diff --git a/get_data/color.py b/get_data/color.py new file mode 100644 index 0000000..2a8321c --- /dev/null +++ b/get_data/color.py @@ -0,0 +1 @@ +color = {'华农兄弟': 'rgb(241, 124, 23)', '信誓蛋蛋': 'rgb(132, 112, 117)', '痴鸡小队官方': 'rgb(224, 147, 35)', '中国BOY超级大猩猩': 'rgb(232, 105, 10)', '唯一音乐小魔王': 'rgb(122, 146, 147)', '-欣小萌-': 'rgb(184, 157, 139)', '英雄联盟': 'rgb(52, 116, 193)', '爱做饭的芋头SAMA': 'rgb(123, 120, 124)', '美食作家王刚R': 'rgb(137, 138, 126)', '盗月社食遇记': 'rgb(200, 140, 125)', '怕上火暴王老菊': 'rgb(212, 82, 88)', '机智的党妹': 'rgb(158, 120, 105)', '靠脸吃饭的徐大王': 'rgb(135, 144, 179)', '长歌与小见见': 'rgb(205, 144, 127)', 'papi酱': 'rgb(140, 140, 140)', '我是郭杰瑞': 'rgb(72, 34, 183)', '倒悬的橘子': 'rgb(154, 126, 120)', 'LexBurner': 'rgb(149, 137, 135)', '狂人实验室': 'rgb(129, 130, 136)', '木鱼水心': 'rgb(128, 71, 179)', '允星河Yoseya': 'rgb(231, 147, 136)', '翔翔大作战': 'rgb(206, 150, 119)', '共青团中央': 'rgb(250, 29, 27)', '猎奇笔记本': 'rgb(237, 126, 40)', '吴织亚切大忽悠': 'rgb(207, 91, 81)', '哔哩哔哩英雄联盟赛事': 'rgb(110, 127, 203)', '赫萝老师': 'rgb(197, 130, 81)', '老番茄': 'rgb(250, 29, 27)', 'MordonFreeman': 'rgb(84, 100, 153)', '花花与三猫CatLive': 'rgb(144, 130, 109)', 'EdmundDZhang': 'rgb(171, 221, 72)', '浅澄月': 'rgb(161, 119, 108)', '徐大虾咯': 'rgb(211, 141, 57)', '王大境泽': 'rgb(127, 113, 112)', '哔哩哔哩线下活动': 'rgb(124, 64, 115)', '神秘店长A': 'rgb(114, 111, 114)', 
'楼上的老张': 'rgb(151, 151, 151)', '土味角虫': 'rgb(122, 125, 119)', '低调的帅爷': 'rgb(193, 112, 73)', '靖菌命': 'rgb(202, 135, 136)', '日本沙雕日常': 'rgb(138, 121, 110)', 'MrYang杨家成': 'rgb(145, 127, 122)', '黑镖客梦回': 'rgb(80, 79, 80)', '无聊的Do君': 'rgb(209, 119, 82)', '泛式': 'rgb(151, 116, 111)', '芒果冰OL': 'rgb(186, 156, 151)', '面筋哥-程书林': 'rgb(179, 111, 79)', '刘老师说电影': 'rgb(133, 128, 114)', '李子柒': 'rgb(128, 125, 118)', '10后找人带': 'rgb(250, 29, 27)', '丝血反杀闰土的猹': 'rgb(188, 121, 67)', '孤独的美食基': 'rgb(185, 122, 86)', '非我执笔': 'rgb(180, 114, 108)', '神奇的老皮': '#222222', '敖厂长': 'rgb(231, 187, 88)', '扎双马尾的丧尸': '#333333', '采紫葳的凌霄子': 'rgb(199, 155, 150)', '痒局长': 'rgb(207, 69, 119)', '夜刀神吉吉': 'rgb(209, 151, 115)', '路人A-': 'rgb(44, 160, 178)', '开心嘴炮': 'rgb(130, 111, 129)', '小高姐的魔法调料': 'rgb(145, 121, 110)', '科技美学': 'rgb(130, 118, 132)', '何必Hebee': 'rgb(211, 141, 68)', '优酱胖头鱼': 'rgb(181, 154, 125)', '手工耿': 'rgb(127, 128, 137)', '马壮实Hera': 'rgb(129, 139, 109)', '码哥与马也的日常': 'rgb(194, 154, 144)', '哔哩哔哩纪录片': 'rgb(113, 202, 226)', 'EmmaThePolaris': 'rgb(128, 122, 123)', '努力的Lorre': 'rgb(140, 116, 98)', '进击的冰糖': 'rgb(210, 121, 124)', '故事王StoryMan': 'rgb(132, 112, 26)', '饭帅fun': 'rgb(117, 103, 149)', '三木刃': 'rgb(105, 104, 131)', '点滴菌': 'rgb(17, 128, 243)', '多多poi丶': 'rgb(218, 108, 93)', '紧张的猫饼': 'rgb(92, 128, 156)', '哦呼w': 'rgb(92, 128, 156)', '杰里德Jared': 'rgb(196, 153, 134)', '哎哟阿尤': 'rgb(79, 24, 193)', '冷面Lim': 'rgb(149, 193, 182)', '留学的真相': 'rgb(198, 94, 157)', '辣目洋子': 'rgb(142, 122, 123)', '毒角SHOW': 'rgb(131, 129, 130)', '敬汉卿': 'rgb(142, 131, 124)', '蜡笔小勋是一对儿': 'rgb(190, 154, 154)', '王咩阿': 'rgb(208, 142, 127)', '你的可樱已上线': 'rgb(191, 110, 121)', 'bilibili电影': 'rgb(118, 201, 228)', 'Vinheteiro': 'rgb(151, 123, 105)', 'zy戏精学院院长': 'rgb(190, 148, 126)', 'ADC芒果': 'rgb(230, 176, 77)', '=咬人猫=': 'rgb(199, 155, 120)', '神秘学调查员': 'rgb(147, 142, 124)', '无聊的开箱': 'rgb(144, 114, 16)', '小可儿': 'rgb(114, 132, 190)', '茶理理理子': 'rgb(41, 114, 201)', '郝给力': 'rgb(41, 114, 201)', '燃茶哥哥在此': 'rgb(198, 144, 104)', 'Mafumafu_Channel': 'rgb(249, 101, 106)', '卡卡会发光guang': 'rgb(178, 133, 79)', '贤宝宝Baby': 'rgb(119, 110, 97)', '崩坏3第一偶像爱酱': 'rgb(206, 135, 74)', '渔人阿烽': 'rgb(120, 161, 192)', '十音Shiyin': 'rgb(97, 98, 110)', '还有一天就放假了': 'rgb(212, 141, 60)', '槐安遗梦': 'rgb(158, 131, 101)', '喵喵折App': 'rgb(254, 150, 150)', '四季萌芽': 'rgb(202, 130, 120)', 'A路人': 'rgb(181, 115, 79)', '吃鸡陪玩酱': 'rgb(212, 148, 122)', '哇哇哇妹': 'rgb(249, 108, 127)', '极客湾Geekerwan': 'rgb(117, 176, 195)', '哔哩哔哩会员购': 'rgb(232, 179, 193)', '视角姬': 'rgb(77, 180, 63)', '萧忆情Alex': 'rgb(216, 147, 141)', '狂风桑': 'rgb(193, 155, 126)', '老邪说电影': 'rgb(165, 179, 8)', '某幻君': 'rgb(182, 139, 139)', 'low君热剧': 'rgb(127, 133, 128)', '大连老湿王博文': 'rgb(226, 138, 127)', '野食小哥': 'rgb(136, 127, 121)', '洛天依': 'rgb(152, 141, 180)', 'bilibili星访问': 'rgb(200, 200, 37)', '桃核叫我桃道长': 'rgb(133, 133, 133)', '水一大魔王': 'rgb(233, 69, 150)', '不正经老丝': 'rgb(184, 121, 79)', '白上吹雪Official': 'rgb(188, 168, 179)', '喝水少年孙十一': 'rgb(198, 145, 127)', '指法芬芳张大仙': 'rgb(182, 152, 150)', 'OELoop': 'rgb(148, 158, 192)', '山药视频': 'rgb(2, 129, 202)', '抽风Crazy': 'rgb(120, 114, 112)', '哔哩哔哩弹幕网': 'rgb(117, 200, 228)', '凉风Kaze': 'rgb(204, 157, 118)', '1818黄金眼': 'rgb(129, 130, 129)', '吃素的狮子': 'rgb(141, 135, 111)', '远古时代装机猿': 'rgb(127, 127, 127)', '曼食慢语': 'rgb(131, 131, 131)', '影视飓风': 'rgb(124, 124, 124)', '渗透之C君': 'rgb(251, 81, 90)', '花少北丶': 'rgb(111, 104, 112)', '声鱼片儿': 'rgb(230, 97, 30)', '暴走漫画': 'rgb(229, 152, 55)', '阿神的Bili官方頻道': 'rgb(178, 161, 142)', '夹性芝士': 'rgb(127, 127, 127)', 'AS极客': 'rgb(138, 126, 105)', '哔哩哔哩活动': 'rgb(189, 82, 110)', '丰兄来了': 'rgb(231, 153, 
152)', 'scyrax': 'rgb(210, 133, 126)', '拉宏桑': 'rgb(55, 135, 212)', '郁郁_Yu': 'rgb(130, 140, 206)', '是你的霹雳': 'rgb(104, 119, 215)', '哔哩哔哩直播': 'rgb(20, 177, 234)', 'NathanRich火锅大王': 'rgb(142, 127, 125)', '潇湘公子寻': 'rgb(190, 168, 114)', '纳豆奶奶': 'rgb(174, 125, 79)', '逍遥散人': 'rgb(113, 135, 144)', '郭乐乐很努力': 'rgb(126, 120, 128)', '宝剑嫂': 'rgb(119, 111, 119)', 'colinzang': 'rgb(144, 119, 114)', '曹译文iris': 'rgb(127, 130, 127)', '歪果仁研究协会': 'rgb(201, 166, 91)', '音乐君': 'rgb(203, 107, 78)', '李永乐老师官方': 'rgb(71, 101, 183)', '库特菌': 'rgb(198, 149, 131)', '东尼ookii': 'rgb(182, 149, 125)', '力元君': 'rgb(219, 30, 81)', '哔哩哔哩大会员': 'rgb(251, 166, 190)', '周六野Zoey': 'rgb(159, 163, 178)', '老四赶海': 'rgb(227, 142, 113)', '纯黑-V-布里塔尼亚': 'rgb(187, 164, 159)', '阿漫啊阿漫': 'rgb(20, 102, 144)', '秋呗叔岳': 'rgb(181, 158, 143)', '靠谱电竞': 'rgb(195, 164, 38)', '百万剪辑狮': 'rgb(126, 126, 126)', '电影最TOP': 'rgb(193, 159, 60)', '刘哔电影': 'rgb(243, 92, 70)', '-LKs-': 'rgb(146, 138, 198)', '嘻咦啊看': 'rgb(228, 94, 73)', '凉下采桑': 'rgb(252, 252, 252)', '哔哩哔哩番剧': 'rgb(118, 201, 228)', '一只南音呀': 'rgb(124, 126, 123)', '蜡笔和小勋': 'rgb(190, 154, 154)', 'skyhahalife': 'rgb(150, 150, 150)', '芳斯塔芙': 'rgb(204, 181, 172)', '哔哩哔哩游戏中心': 'rgb(92, 145, 220)', '山海zoom': 'rgb(178, 121, 74)', '是当归哦': 'rgb(151, 136, 128)', '黑桐谷歌': 'rgb(220, 123, 111)', 'PDD在B站': 'rgb(138, 117, 109)', '伊丽莎白鼠': 'rgb(190, 148, 134)', '潮汕好男人': 'rgb(117, 142, 140)', '起小点是大腿': 'rgb(243, 158, 124)', '观察者网': 'rgb(230, 168, 172)', '纯黑-克劳狄乌斯': 'rgb(187, 164, 159)', '徐大sao': 'rgb(187, 186, 196)', 'STN工作室': 'rgb(187, 123, 9)', '皇族电子竞技俱乐部官方账号': 'rgb(144, 126, 104)', '牛叔万岁万岁万万岁': 'rgb(75, 100, 95)', 'TESTV官方频道': 'rgb(135, 131, 121)', '聚印象视频': 'rgb(127, 127, 127)', '暗猫の祝福': 'rgb(158, 200, 207)', '凤凰天使TSKS韩剧社官方账号': 'rgb(180, 147, 139)', '努巴尼守望先锋': 'rgb(142, 144, 147)', '少年Pi': 'rgb(152, 133, 119)', '老实憨厚的笑笑': 'rgb(193, 147, 108)'} \ No newline at end of file diff --git a/get_data/face.py b/get_data/face.py new file mode 100644 index 0000000..b312adb --- /dev/null +++ b/get_data/face.py @@ -0,0 +1 @@ +face = {'华农兄弟': 'http://i1.hdslb.com/bfs/face/bac504655c69ab937b0be4557e27535f794b0c66.jpg', '信誓蛋蛋': 'http://i2.hdslb.com/bfs/face/df0be0f1946581030cbaf34e3f66a996f0a1af4a.jpg', '痴鸡小队官方': 'http://i2.hdslb.com/bfs/face/25b9ca4626a41dbb249a2cf144b14200cb4c34e1.jpg', '中国BOY超级大猩猩': 'http://i1.hdslb.com/bfs/face/068939602dae190c86f6b36ca301281d7d8aa6d9.jpg', '唯一音乐小魔王': 'http://i1.hdslb.com/bfs/face/d17ddb244a783fa8179c362209080c48716beebf.jpg', '-欣小萌-': 'http://i1.hdslb.com/bfs/face/b5aee8b711fd655c70d705678b4e350ae255a1d0.jpg', '英雄联盟': 'http://i1.hdslb.com/bfs/face/04d579b1644ee8864e7aea01219dae4a94b469ce.jpg', '爱做饭的芋头SAMA': 'http://i1.hdslb.com/bfs/face/69c2df87253eabe27e8257dc827d186cceabc3f1.jpg', '美食作家王刚R': 'http://i0.hdslb.com/bfs/face/1463fa4ea6bffd867dc257dca87248bb1d671cde.jpg', '盗月社食遇记': 'http://i1.hdslb.com/bfs/face/ab901fc0571698bb9f389798029c3fc4c0188311.jpg', '怕上火暴王老菊': 'http://i0.hdslb.com/bfs/face/2edf4a4f534869a63158d13a4b6b9676d75f1e0a.jpg', '机智的党妹': 'http://i0.hdslb.com/bfs/face/d83e93dc9101cc0c416ca09ad33a63bdf3d26c6c.jpg', '靠脸吃饭的徐大王': 'http://i1.hdslb.com/bfs/face/0e6e0f313a195e293d4ee6ae8ab86a3074abb315.jpg', '长歌与小见见': 'http://i2.hdslb.com/bfs/face/04bf6928fcdb0452dee9e1aed6a6ff1becd51bcd.jpg', 'papi酱': 'http://i0.hdslb.com/bfs/face/e45a7b248f496fad8b32d3b9cfa0335339331798.jpg', '我是郭杰瑞': 'http://i1.hdslb.com/bfs/face/6182455e4d61159121c223ddc7a3a381f2d4d056.jpg', '倒悬的橘子': 'http://i0.hdslb.com/bfs/face/440968b6694576e931ed4ec61d699029d82bcbaa.jpg', 'LexBurner': 
'http://i1.hdslb.com/bfs/face/2996e22a24eed2d7767e452627a9130207defe6a.jpg', '狂人实验室': 'http://i1.hdslb.com/bfs/face/4e5e30311607340a607f52a8bfbc74b12d00558d.jpg', '木鱼水心': 'http://i1.hdslb.com/bfs/face/696df59d35c78430f1a0bdb6184558e7b7eb4a6e.jpg', '允星河Yoseya': 'http://i0.hdslb.com/bfs/face/6f0920f35ae664e937d455ba4e1ef1ac10fb80b5.jpg', '翔翔大作战': 'http://i0.hdslb.com/bfs/face/1ad13832c10e8018dc8e0e7671a6b8594ddab0c0.jpg', '共青团中央': 'http://i2.hdslb.com/bfs/face/52e16dad8aa29b6214bbfadf702e83eeac34ad9f.jpg', '猎奇笔记本': 'http://i2.hdslb.com/bfs/face/e3cb24355694fa887741871a9a1e22ae590e3769.jpg', '吴织亚切大忽悠': 'http://i1.hdslb.com/bfs/face/3b89afc4e25e534a8e4165952116aa9265ea201f.jpg', '哔哩哔哩英雄联盟赛事': 'http://i2.hdslb.com/bfs/face/f07c74fe2a020b33ab1035fea6d3338b6a6e6749.jpg', '赫萝老师': 'http://i0.hdslb.com/bfs/face/f3776594fc0ff076bdfcc8fc4921327239a7150e.jpg', '老番茄': 'http://i2.hdslb.com/bfs/face/bc5ca101313d4db223c395d64779e76eb3482d60.jpg', 'MordonFreeman': 'http://i2.hdslb.com/bfs/face/c076aac067d1d32c4bdd7b6aa5dc1930185bf91a.jpg', '花花与三猫CatLive': 'http://i1.hdslb.com/bfs/face/1871c834255ffea531f699164e70f0daebc7558b.jpg', 'EdmundDZhang': 'http://i2.hdslb.com/bfs/face/5d94b9727a49815716cd66fc7ba3840382025c56.jpg', '浅澄月': 'http://i2.hdslb.com/bfs/face/512fd36e0fb24746f04aa1073ea89c1a2f91f7e1.jpg', '徐大虾咯': 'http://i0.hdslb.com/bfs/face/f24ea395c75b7a73db57da2a2920f6e84c902082.jpg', '王大境泽': 'http://i0.hdslb.com/bfs/face/fd66b20e63fe0a3e2331892feb7ecc5ff871dcb5.jpg', '哔哩哔哩线下活动': 'http://i1.hdslb.com/bfs/face/10c354765e8a1b3fa47fba1594edc866145bd79b.jpg', '神秘店长A': 'http://i1.hdslb.com/bfs/face/cdb729f3b5cb6dc9a0cb7cc9b8c63aebf1bc5b6b.jpg', '楼上的老张': 'http://i0.hdslb.com/bfs/face/b159d3f9a0ba088dbfc845a849e84bb9f110e6f2.jpg', '土味角虫': 'http://i1.hdslb.com/bfs/face/21d2a655e44aa4844c7353d138b33581f7aaa94f.jpg', '低调的帅爷': 'http://i0.hdslb.com/bfs/face/e42e975697d7237d96992210cf801ae5e87af354.jpg', '靖菌命': 'http://i2.hdslb.com/bfs/face/6f6411b3d701ad213df75b8f9ad8910fc1ebe408.jpg', '日本沙雕日常': 'http://i2.hdslb.com/bfs/face/68aa0664390afa981cf78d8bda4042ec55d26170.jpg', 'MrYang杨家成': 'http://i0.hdslb.com/bfs/face/623fea846d8ba3b11e36d6dbc44baca08238a3d3.jpg', '黑镖客梦回': 'http://i1.hdslb.com/bfs/face/83c8bf808e662d02291a41ab5992541e2707a5d2.jpg', '无聊的Do君': 'http://i0.hdslb.com/bfs/face/d00ef6d8ceea8edee3cd61e5e87bff036189a5bb.jpg', '泛式': 'http://i2.hdslb.com/bfs/face/5f60d345059b82f0878984d9f9133f45b33b82be.jpg', '芒果冰OL': 'http://i0.hdslb.com/bfs/face/f40b734ef61f95f8adb3beca5b7b693db399c50e.jpg', '面筋哥-程书林': 'http://i1.hdslb.com/bfs/face/44fd943316e177fcf91ebde537bcff65f7a84515.jpg', '刘老师说电影': 'http://i1.hdslb.com/bfs/face/145226ff2d32d7c99b8ea3591ffec2f38fc9d3d3.jpg', '李子柒': 'http://i1.hdslb.com/bfs/face/82d27965dae3b2fe9e52780c6309c7b37ad4cbf2.jpg', '10后找人带': 'http://i0.hdslb.com/bfs/face/b8b3badb8344b72f1f4746eac2817f8423aeec2b.jpg', '丝血反杀闰土的猹': 'http://i2.hdslb.com/bfs/face/9672e303acae98a22efb54d0319e60294db887c3.jpg', '孤独的美食基': 'http://i2.hdslb.com/bfs/face/df188387360d6ad90f9e36ac4aaea68ed9f3c9be.jpg', '非我执笔': 'http://i2.hdslb.com/bfs/face/4814cdf4293ba26839f895bd478efaf21bf299db.jpg', '神奇的老皮': 'http://i2.hdslb.com/bfs/face/e7c191e9be6764107415069b36f7d9564f149c86.gif', '敖厂长': 'http://i0.hdslb.com/bfs/face/156d5d3b3f4b66d940365b3b0e3a809e1fcc0d97.jpg', '扎双马尾的丧尸': 'http://i1.hdslb.com/bfs/face/5be61949369dd844cc459eab808da151d8c363d2.gif', '采紫葳的凌霄子': 'http://i0.hdslb.com/bfs/face/b3d6d9871475f15db85340e1ed12c93f2a8b81a9.jpg', '痒局长': 'http://i2.hdslb.com/bfs/face/bcdf640faa16ebaacea1d4c930baabaec9087a80.jpg', 
'夜刀神吉吉': 'http://i0.hdslb.com/bfs/face/04c6f83245fdfc39440c19c5e70f73a5351d0ada.jpg', '路人A-': 'http://i0.hdslb.com/bfs/face/0383674a5cf40d2163af1f5b80f8cd90a2d62e2c.jpg', '开心嘴炮': 'http://i1.hdslb.com/bfs/face/5cd3053f352bc597bac3d11b824e548108423cb2.jpg', '小高姐的魔法调料': 'http://i2.hdslb.com/bfs/face/aae000e04ce872b58e6bbfc1632cc5bc2203084c.jpg', '科技美学': 'http://i0.hdslb.com/bfs/face/f6f8dc53ddf3ba07c1f0dd3ad87fe92034198b81.jpg', '何必Hebee': 'http://i0.hdslb.com/bfs/face/08032c6289096a9e6869a5cd5c05280060b10532.jpg', '优酱胖头鱼': 'http://i0.hdslb.com/bfs/face/7a1f15e4fc51e16c32752e10e40e7fa3ffa81eb6.jpg', '手工耿': 'http://i2.hdslb.com/bfs/face/b8a75ae7d2a0e2af1d36ca9f1084d850eebb28e3.jpg', '马壮实Hera': 'http://i2.hdslb.com/bfs/face/92d75c347df9a50cdd691b6c62dafff93138be8e.jpg', '码哥与马也的日常': 'http://i0.hdslb.com/bfs/face/692b776a39b208aacfb6ef6a5ccfb6cfe2861bb6.jpg', '哔哩哔哩纪录片': 'http://i1.hdslb.com/bfs/face/33687c6c4707352cd25fac995cd416009830c917.jpg', 'EmmaThePolaris': 'http://i2.hdslb.com/bfs/face/5433666ef01f1e51e3f21bd3d509ed7bb68eff87.jpg', '努力的Lorre': 'http://i1.hdslb.com/bfs/face/c63ebeed7d49967e2348ef953b539f8de90c5140.jpg', '进击的冰糖': 'http://i0.hdslb.com/bfs/face/8294462b9c92d587c5982a5ec5008d808325056e.jpg', '故事王StoryMan': 'http://i2.hdslb.com/bfs/face/b55679bd383423cb02f0992e44f19a68c6f5fd1b.jpg', '饭帅fun': 'http://i1.hdslb.com/bfs/face/1f8c1c2665d8c1cbb14c4dbe5e09d06f0e78e314.jpg', '三木刃': 'http://i0.hdslb.com/bfs/face/785f79f302e6166079ad2fef933dcbfd435cca4b.jpg', '点滴菌': 'http://i2.hdslb.com/bfs/face/fd8c1c3a5a454c75eafe12464989e9d794179d29.jpg', '多多poi丶': 'http://i2.hdslb.com/bfs/face/3f55db249421c556084719cd9581036d67e93ed4.jpg', '紧张的猫饼': 'http://i2.hdslb.com/bfs/face/fc1b1cd176a854c4fd718694b9216469fa148f4b.jpg', '哦呼w': 'http://i0.hdslb.com/bfs/face/57ead5621801ec8a637bc47754e00e9ae6e62888.gif', '杰里德Jared': 'http://i2.hdslb.com/bfs/face/5138b1aea6da8a7e9962562208ed848a417e207b.jpg', '哎哟阿尤': 'http://i0.hdslb.com/bfs/face/39f892f717cd18ad5de3d61c1a99c3e50f6ab390.jpg', '冷面Lim': 'http://i0.hdslb.com/bfs/face/a52f915170f1f3bfbe8e1f59a5b512936669cade.jpg', '留学的真相': 'http://i1.hdslb.com/bfs/face/7af19e342cb560870733804a94e46a45a1d1e771.jpg', '辣目洋子': 'http://i0.hdslb.com/bfs/face/a2c5ca6dd5be6b89e94294e06f9e365a29b7943b.jpg', '毒角SHOW': 'http://i0.hdslb.com/bfs/face/24486911dc40a0faa23b025b4493f15b086c65cc.jpg', '敬汉卿': 'http://i1.hdslb.com/bfs/face/a5c6005a27da6afd52021dc07423f7b4a78a466c.jpg', '蜡笔小勋是一对儿': 'http://i0.hdslb.com/bfs/face/8a8812e0a9bb3adda90044ef48830584e1efe7a2.jpg', '王咩阿': 'http://i1.hdslb.com/bfs/face/dce221c5c508ca338eb0428a751e27b41ec7bfeb.jpg', '你的可樱已上线': 'http://i1.hdslb.com/bfs/face/d72b7fe43b556fb5872ab9bdad771b1476928796.jpg', 'bilibili电影': 'http://i0.hdslb.com/bfs/face/60a9153609998b04301dc5b8ed44c41b537a2268.jpg', 'Vinheteiro': 'http://i2.hdslb.com/bfs/face/dff03339974b20517bbe26ce49ea9d2a39831023.jpg', 'zy戏精学院院长': 'http://i2.hdslb.com/bfs/face/9dc26bc30f200dd131b9479fc30c443264c39278.jpg', 'ADC芒果': 'http://i2.hdslb.com/bfs/face/cce7e724724581e1323d5bdd7e00796ee8aec8b8.jpg', '=咬人猫=': 'http://i1.hdslb.com/bfs/face/8fad84a4470f3d894d8f0dc95555ab8f2cb10a83.jpg', '神秘学调查员': 'http://i2.hdslb.com/bfs/face/b803792a721b61da30e46e7e93d7d818f4f0bc48.jpg', '无聊的开箱': 'http://i2.hdslb.com/bfs/face/9ecc839c3b28f9752bfae2834333ddc32930787f.jpg', '小可儿': 'http://i2.hdslb.com/bfs/face/4d7fbb6c47b097e297b958c8dc74287cd880fc4a.jpg', '茶理理理子': 'http://i0.hdslb.com/bfs/face/557d3f4b8cee7d413714f48f9ec671c4c44e1e6c.gif', '郝给力': 
'http://i0.hdslb.com/bfs/face/94e057ae6738be1810e1747df306ac12e7c3aece.jpg', '燃茶哥哥在此': 'http://i1.hdslb.com/bfs/face/938eca5533223a6d30f0582eb3e5350049f9c1cf.jpg', 'Mafumafu_Channel': 'http://i0.hdslb.com/bfs/face/0cee96b40676ede1c7b956e636b240a4119da9f9.jpg', '卡卡会发光guang': 'http://i1.hdslb.com/bfs/face/c9dbb60c7d27a7bd9750d5c2de982c3d23d7dc82.jpg', '贤宝宝Baby': 'http://i1.hdslb.com/bfs/face/c91f1ff05b7da257278bb88d4959733ac12ab3b3.jpg', '崩坏3第一偶像爱酱': 'http://i2.hdslb.com/bfs/face/cb5facbc29275e6bdc4cfa9c20e47ecdb0fe3392.jpg', '渔人阿烽': 'http://i2.hdslb.com/bfs/face/efcf3f7f3acbd93247b04b69d705f67301bfd2f7.jpg', '十音Shiyin': 'http://i1.hdslb.com/bfs/face/71d8b38d96ec67a06ff87359a29418dd944c60b1.jpg', '还有一天就放假了': 'http://i1.hdslb.com/bfs/face/30e515fc2ff5435a5cfb31c289e0bcef860bad1c.jpg', '槐安遗梦': 'http://i1.hdslb.com/bfs/face/a7b82ad44b0194430bc9d8a6ebc94d418ec1b085.jpg', '喵喵折App': 'http://i1.hdslb.com/bfs/face/6582bf05f0a16716a68ddfce84342e239f7bcb68.jpg', '四季萌芽': 'http://i0.hdslb.com/bfs/face/2db05455674f3eeffdf9195f5ed51e7fa66bc763.jpg', 'A路人': 'http://i2.hdslb.com/bfs/face/c4022010115d00da6667a5cf799d5400067d7f66.jpg', '吃鸡陪玩酱': 'http://i0.hdslb.com/bfs/face/0d42041fabc7046f171f8de27d8972c76443a6af.jpg', '哇哇哇妹': 'http://i0.hdslb.com/bfs/face/c134be58f0a8a5bde4b96437c7e4e04968449a19.jpg', '极客湾Geekerwan': 'http://i0.hdslb.com/bfs/face/d0f7a7ee34a4a45c8390eb3a07e4d7f2d70bae91.jpg', '哔哩哔哩会员购': 'http://i2.hdslb.com/bfs/face/ba56c5cac0809c7f389f78b1e4dce4971ba07d52.jpg', '视角姬': 'http://i0.hdslb.com/bfs/face/851025ed4bc57ad6ae82a6314e72e249fd21d604.jpg', '萧忆情Alex': 'http://i1.hdslb.com/bfs/face/98ce5676de8391d3e3164c563866f32ceba9e18b.jpg', '狂风桑': 'http://i1.hdslb.com/bfs/face/5076846bb07c2d6ec442856f69214cae215301f3.jpg', '老邪说电影': 'http://i1.hdslb.com/bfs/face/0fbf6aad74852d4cd68f72d50de62d9f11f7e886.jpg', '某幻君': 'http://i1.hdslb.com/bfs/face/9ed5ebf1e3694d9cd2b4fcd1d353759ee83b3dfe.jpg', 'low君热剧': 'http://i1.hdslb.com/bfs/face/fabe17ef4b344336b872e3ded46a4a4b11140e7f.jpg', '大连老湿王博文': 'http://i0.hdslb.com/bfs/face/21fa2a84a8733cd91a940fad6af81147538ae968.jpg', '野食小哥': 'http://i1.hdslb.com/bfs/face/e0610a26bc510770d997385fb81b9c47157a53c4.jpg', '洛天依': 'http://i1.hdslb.com/bfs/face/67bd11fcd3be8fac5cef1a743a16f0a8cdf39463.jpg', 'bilibili星访问': 'http://i0.hdslb.com/bfs/face/f68925a967357060898da90fb3be14ebd5289879.jpg', '桃核叫我桃道长': 'http://i2.hdslb.com/bfs/face/16279e85ec9dc197f6d306abf84a7f8cd75a0b5e.jpg', '水一大魔王': 'http://i2.hdslb.com/bfs/face/ffdedf591d5ed5a5f584c61fec1add096a61d75a.jpg', '不正经老丝': 'http://i0.hdslb.com/bfs/face/126c00cc089a3c0f540da4525e6ba41452ad83fc.jpg', '白上吹雪Official': 'http://i0.hdslb.com/bfs/face/3b3f2cde975bf334e4f5948709e42b6569c0755d.jpg', '喝水少年孙十一': 'http://i0.hdslb.com/bfs/face/a258cfccf39862242b61328e11dbb926a3b6dbe1.jpg', '指法芬芳张大仙': 'http://i2.hdslb.com/bfs/face/6426336744dd49a744eca32855808073988bd2ce.jpg', 'OELoop': 'http://i1.hdslb.com/bfs/face/6ed036381a0d0a86714816ac393dd9bf1b0bef21.jpg', '山药视频': 'http://i0.hdslb.com/bfs/face/357b015de3b9f4c04527d4fefb844460397ac8b0.jpg', '抽风Crazy': 'http://i2.hdslb.com/bfs/face/04d92c0e30315e56e1365dc9ac2d2cf32e2fe039.jpg', '哔哩哔哩弹幕网': 'http://i0.hdslb.com/bfs/face/8aa6bd8cf269021ffa7a03f7a903ca899c11b7fb.jpg', '凉风Kaze': 'http://i1.hdslb.com/bfs/face/e0cc906bb531195e9ee9f3b575effdd2b056eaea.jpg', '1818黄金眼': 'http://i1.hdslb.com/bfs/face/4cc8d48c9fd68d3e511851b876e04bb953cb095e.jpg', '吃素的狮子': 'http://i0.hdslb.com/bfs/face/249bfa1b3d3e0932f533bc5364964b132fe9c6c2.jpg', '远古时代装机猿': 
'http://i2.hdslb.com/bfs/face/ec008e32064705c576f3ffd73d20288e441d945f.jpg', '曼食慢语': 'http://i2.hdslb.com/bfs/face/49dab862b8acaf9a80a921df04c9532f1c826ebc.jpg', '影视飓风': 'http://i0.hdslb.com/bfs/face/c1733474892caa45952b2c09a89323157df7129a.jpg', '渗透之C君': 'http://i2.hdslb.com/bfs/face/623ccce0ab28b721edb61dd64749d91de18fb384.jpg', '花少北丶': 'http://i1.hdslb.com/bfs/face/86ef6895a8f88c80f2885e7eb9ba7989db437b93.jpg', '声鱼片儿': 'http://i1.hdslb.com/bfs/face/e25a06ab0e6e83d8e839dfccec43cc14089c308c.jpg', '暴走漫画': 'http://i2.hdslb.com/bfs/face/c93ff5df2cd0c0c9384b139c86d1a56a9540c9a2.jpg', '阿神的Bili官方頻道': 'http://i1.hdslb.com/bfs/face/a0648687f1b1b7b385b9abdffedb1ba4de78bab0.jpg', '夹性芝士': 'http://i1.hdslb.com/bfs/face/c66dd2ef0573ac91a5a85f79b0c2188b02c98633.jpg', 'AS极客': 'http://i2.hdslb.com/bfs/face/34379e726c3ee063571aea987ff3a27c5b1a536e.jpg', '哔哩哔哩活动': 'http://i0.hdslb.com/bfs/face/45728a09aa3cd19f13247fce58aad38ac46a0f1d.jpg', '丰兄来了': 'http://i2.hdslb.com/bfs/face/369f64c5f24324cf85eaedfc6104325bd6e950cf.jpg', 'scyrax': 'http://i0.hdslb.com/bfs/face/efc37d5941f67087c8b84e99760ae47721f8c443.jpg', '拉宏桑': 'http://i1.hdslb.com/bfs/face/e74603c80efa529c76dce7ede07f986da0af8b0d.jpg', '郁郁_Yu': 'http://i0.hdslb.com/bfs/face/7816841e2e6fefffbb785bb77f1654cc91224338.jpg', '是你的霹雳': 'http://i1.hdslb.com/bfs/face/691bcd9a0f2a77087af2a0962179deab1b87756d.jpg', '哔哩哔哩直播': 'http://i0.hdslb.com/bfs/face/58ca83240e58c3e6ce1ebb9828ce84f6f1b72863.jpg', 'NathanRich火锅大王': 'http://i0.hdslb.com/bfs/face/f2d0fa88345e2f511ae518cc655da9ded11471a8.jpg', '潇湘公子寻': 'http://i2.hdslb.com/bfs/face/a004cfeec6feb65fa421c9a4a49738999ab032de.jpg', '纳豆奶奶': 'http://i0.hdslb.com/bfs/face/0a8638b34173708fcf979bd1166fbb7fdb1110a4.jpg', '逍遥散人': 'http://i2.hdslb.com/bfs/face/d0dad8800774a4903547b1326d1fd927df47b4e9.jpg', '郭乐乐很努力': 'http://i1.hdslb.com/bfs/face/0b609106d937936fb24f92ec42e6ee08e1306a4e.jpg', '宝剑嫂': 'http://i1.hdslb.com/bfs/face/b4be6527726b04049a066d5a073afd7c1a733ee6.jpg', 'colinzang': 'http://i2.hdslb.com/bfs/face/a6be4da0320912851bd5f31479178ec2e5af45c3.jpg', '曹译文iris': 'http://i0.hdslb.com/bfs/face/a21a7c95042c5187575d6768ea5ec9ee331fee3a.jpg', '歪果仁研究协会': 'http://i2.hdslb.com/bfs/face/0ea49d2ad88658689eb31746c09f716c09581e4f.jpg', '音乐君': 'http://i2.hdslb.com/bfs/face/20ec194172acfdbca47539af7779ae4890c753ec.jpg', '李永乐老师官方': 'http://i0.hdslb.com/bfs/face/b299b0e20dc4fbbd45e13ba86f330d890c0a0118.jpg', '库特菌': 'http://i0.hdslb.com/bfs/face/82a5d10cd93a90ec1b255b279578fe3801908831.jpg', '东尼ookii': 'http://i2.hdslb.com/bfs/face/c2c3ce926ec8d9edc361ca706c9dce9415c15114.jpg', '力元君': 'http://i1.hdslb.com/bfs/face/b831ab8e458d7763d397c41f7b1298fe20a0d096.jpg', '哔哩哔哩大会员': 'http://i1.hdslb.com/bfs/face/7266e09e212b2e5109465d877dfa9a58e4ba33d4.jpg', '周六野Zoey': 'http://i1.hdslb.com/bfs/face/c95abd1564994c36dd481c21acf14a83aeb61ab8.jpg', '老四赶海': 'http://i0.hdslb.com/bfs/face/d6b4a211a323f1c8fa35f42b7caca6bcdb9e68c9.jpg', '纯黑-V-布里塔尼亚': 'http://i2.hdslb.com/bfs/face/e8ab7b02d6576f4141ea857734b68b9dd35a5730.jpg', '阿漫啊阿漫': 'http://i2.hdslb.com/bfs/face/ec38b52e36d4577fe865e43bdeef4375221f1250.jpg', '秋呗叔岳': 'http://i1.hdslb.com/bfs/face/71805f5a1971f2637059da49e46f5eafbe4223cb.jpg', '靠谱电竞': 'http://i2.hdslb.com/bfs/face/b37e8c05448cb24b15a4f29581c66fdff1062147.jpg', '百万剪辑狮': 'http://i1.hdslb.com/bfs/face/7974bacea323324b7175055e7a942a676f994132.jpg', '电影最TOP': 'http://i0.hdslb.com/bfs/face/6b2ade215ea603b495648875c925172a863d16d4.jpg', '刘哔电影': 'http://i0.hdslb.com/bfs/face/654ed89c763ca1f003cc2d0145e1ec444cc7f0e8.jpg', '-LKs-': 
'http://i2.hdslb.com/bfs/face/6bc7a2778be455273c82e45bfca55cb0f70c820b.jpg', '嘻咦啊看': 'http://i2.hdslb.com/bfs/face/170e1009846dd3ad8cdcd4050527b9cf2ecf67ab.jpg', '凉下采桑': 'http://i0.hdslb.com/bfs/face/c21e4fb4d44e033d7d15283057aa2021e5d0bfb7.jpg', '哔哩哔哩番剧': 'http://i1.hdslb.com/bfs/face/60a9153609998b04301dc5b8ed44c41b537a2268.jpg', '一只南音呀': 'http://i2.hdslb.com/bfs/face/166e2039c60e0b6d8d24bf76bb3628f08804a431.jpg', '蜡笔和小勋': 'http://i2.hdslb.com/bfs/face/8a8812e0a9bb3adda90044ef48830584e1efe7a2.jpg', 'skyhahalife': 'http://i0.hdslb.com/bfs/face/2e5cada7e126a5c43c2ba5576c4dd1725b7075a0.jpg', '芳斯塔芙': 'http://i1.hdslb.com/bfs/face/b3e851174abfbe9622cf3d336f302531d9cdfc97.jpg', '哔哩哔哩游戏中心': 'http://i2.hdslb.com/bfs/face/509a6ca97e1599e55739b048c0770ce4a63531a5.jpg', '山海zoom': 'http://i0.hdslb.com/bfs/face/7289f4e7032cef7c6be32b07f8c559a46627d293.jpg', '是当归哦': 'http://i0.hdslb.com/bfs/face/201b100685362d362434bbbed11834a2a6753401.jpg', '黑桐谷歌': 'http://i0.hdslb.com/bfs/face/31706c82949b3ba4756a411825c3f16aeb14ad44.jpg', 'PDD在B站': 'http://i0.hdslb.com/bfs/face/10a4d2e4b20cf2c08c17604f7a192eb9d7d293f2.jpg', '伊丽莎白鼠': 'http://i2.hdslb.com/bfs/face/6c36ec15f6d7ddd9bdb558511521bd0256779e1c.jpg', '潮汕好男人': 'http://i0.hdslb.com/bfs/face/2a4c1b3c2d8f48a58c5f818287c90a6ea78d7907.jpg', '起小点是大腿': 'http://i0.hdslb.com/bfs/face/94b65197150887e1bc1671be6598d7fd2ea3132b.jpg', '观察者网': 'http://i2.hdslb.com/bfs/face/790c525e30faa01b855e2d4f1126d7e8a7632af0.jpg', '纯黑-克劳狄乌斯': 'http://i0.hdslb.com/bfs/face/e8ab7b02d6576f4141ea857734b68b9dd35a5730.jpg', '徐大sao': 'http://i2.hdslb.com/bfs/face/17323892be2e17b50acadb89b2e385b493f5fb4d.jpg', 'STN工作室': 'http://i1.hdslb.com/bfs/face/c43e6cab13c9a0303cf8476cfd405cff61195726.jpg', '皇族电子竞技俱乐部官方账号': 'http://i2.hdslb.com/bfs/face/ab85323352b2af52e7a5477cd963d5da98fdab4e.jpg', '牛叔万岁万岁万万岁': 'http://i2.hdslb.com/bfs/face/3c9d436659037cc17fe0d32975ef7f17710ba820.jpg', 'TESTV官方频道': 'http://i1.hdslb.com/bfs/face/34ccaf9461c67482e3164675c0036e94df18b7a7.jpg', '聚印象视频': 'http://i0.hdslb.com/bfs/face/98e2202d8e2725fe3a4b7c3adafeb91764584bbb.jpg', '暗猫の祝福': 'http://i0.hdslb.com/bfs/face/c286d67fbc3abe9f47c115fdae797e9b140156b9.jpg', '凤凰天使TSKS韩剧社官方账号': 'http://i2.hdslb.com/bfs/face/97cdf2282bb2c9d4e8ae5184356097844d1ba4d7.jpg', '努巴尼守望先锋': 'http://i2.hdslb.com/bfs/face/358e0b207a9c6012778e4ad1b21356c91536e790.jpg', '少年Pi': 'http://i0.hdslb.com/bfs/face/d851f48a579778b06249bf3debaa62d353694e91.jpg', '老实憨厚的笑笑': 'http://i1.hdslb.com/bfs/face/048853815037dd95d4193dd6a3d561db428a927f.jpg'} \ No newline at end of file diff --git a/run.py b/run.py index 0897e3b..765afc1 100644 --- a/run.py +++ b/run.py @@ -81,10 +81,9 @@ def run_threaded(job_func): schedule.every().week.do(run_threaded, video_spider_all) schedule.every().week.do(run_threaded, videoRank) schedule.every().hour.do(run_threaded, site) -schedule.every(15).minutes.do(run_threaded, online) +# schedule.every(15).minutes.do(run_threaded, online) schedule.every(10).minutes.do(run_threaded, strong) - print('开始运行计划任务..') while True: schedule.run_pending() From 73096204ecc9b73522a2de677b8e618283b88e78 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 9 Apr 2019 19:35:35 +0800 Subject: [PATCH 260/469] feature: fans increasing rate for make video --- biliob_requests/get_user_coin.py | 18 ++++++++ get_data/aggregate_fans_rate.py | 78 +++++++++++++++++++++++--------- get_data/color.py | 1 + get_data/face.py | 1 + run.py | 3 +- 5 files changed, 77 insertions(+), 24 deletions(-) create mode 100644 biliob_requests/get_user_coin.py create mode 100644 
get_data/color.py
 create mode 100644 get_data/face.py

diff --git a/biliob_requests/get_user_coin.py b/biliob_requests/get_user_coin.py
new file mode 100644
index 0000000..a570631
--- /dev/null
+++ b/biliob_requests/get_user_coin.py
@@ -0,0 +1,18 @@
+import requests
+from db import db
+import time
+author_coll = db['author']
+URL = 'https://api.bilibili.com/x/space/acc/info?mid={mid}'
+with open('D:/数据/B站/UP主硬币数.csv', 'w',encoding="utf-8-sig") as f:
+    for each_author in author_coll.find({}, {'mid': 1, 'name': 1}):
+        mid = each_author['mid']
+        response = requests.get(URL.format(mid=mid))
+        j = response.json()
+
+        if 'code' in j and j['code'] != -404 and 'data' in j and 'coins' in j[
+                'data']:
+            print('"{}","{}"\n'.format(each_author['name'],
+                                       j['data']['coins']))
+            f.write('"{}","{}"\n'.format(each_author['name'],
+                                         j['data']['coins']))
+        time.sleep(0.5)
\ No newline at end of file
diff --git a/get_data/aggregate_fans_rate.py b/get_data/aggregate_fans_rate.py
index 15b34dd..4447bdb 100644
--- a/get_data/aggregate_fans_rate.py
+++ b/get_data/aggregate_fans_rate.py
@@ -1,62 +1,96 @@
 from db import db
 import datetime
 from scipy.interpolate import interp1d
+from haishoku.haishoku import Haishoku
+
+from face import face
+from color import color
 start_date = datetime.datetime(2018, 11, 1)
 end_date = datetime.datetime.now()
-date_range = 7 * 24 * 60 * 60
+date_range = 30 * 24 * 60 * 60
 delta_date = 0.25 * 24 * 60 * 60
 date_format = '%Y-%m-%d %H:%M'
 d = {}
 current_date = start_date.timestamp()
 while (current_date < end_date.timestamp()):
-    c_date = datetime.datetime.fromtimestamp(
-        current_date).strftime(date_format)
+    c_date = datetime.datetime.fromtimestamp(current_date).strftime(
+        date_format)
     d[c_date] = []
     current_date += delta_date
-
-for each_author in db['author'].find({'cFans': {'$gt': 200000}}).batch_size(1):
+for each_author in db['author'].find({'cFans': {'$gt': 200000}}).batch_size(5):
     current_date = start_date.timestamp()
     data = sorted(each_author['data'], key=lambda x: x['datetime'])
-    x = list(map(
-        lambda each_data: each_data['datetime'].timestamp(), data))
+    x = list(map(lambda each_data: each_data['datetime'].timestamp(), data))
     y = list(map(lambda each_data: each_data['fans'], data))
     if len(x) <= 2:
         continue
     interrupted_fans = interp1d(x, y, kind='linear')
     current_date = start_date.timestamp()
-
     while (current_date < min(end_date.timestamp(), x[-1])):
+        begin_date = current_date - date_range
+        if begin_date <= x[0]:
+            begin_date = x[0]
         # 出界
-        if (current_date - date_range) > x[0] and current_date < x[-1]:
-            fans_func = interrupted_fans(
-                [current_date - date_range, current_date])
+        if begin_date >= x[0] and current_date < x[-1] and current_date > x[0]:
+            fans_func = interrupted_fans([begin_date, current_date])
             delta_fans = int(fans_func[1] - fans_func[0])
             pass
-            c_date = datetime.datetime.fromtimestamp(
-                current_date).strftime(date_format)
-            print('"{}","{}","{}"'.format(
-                each_author['name'], delta_fans, c_date))
+            c_date = datetime.datetime.fromtimestamp(current_date).strftime(
+                date_format)
+            print('"{}","{}","{}"'.format(each_author['name'], delta_fans,
+                                          c_date))
             # d[c_date].append((delta_fans, each_author['name']))
-            d[c_date].append((each_author['name'], delta_fans))
+            d[c_date].append((each_author['name'], delta_fans,
+                              each_author['face']))
             if len(d[c_date]) >= 200:
                 d[c_date] = sorted(
                     d[c_date], key=lambda x: x[1], reverse=True)[:20]
         current_date += delta_date
+for c_date in d:
+    d[c_date] = sorted(d[c_date], key=lambda x: x[1], reverse=True)[:20]
-d[c_date] = 
sorted( - d[c_date], key=lambda x: x[1], reverse=True)[:20] - -with open('D:/数据/B站/fans/190319.csv', 'w', encoding="utf-8-sig") as f: +with open('D:/数据/B站/fans/月结粉丝.csv', 'w', encoding="utf-8-sig") as f: f.writelines('date,name,value\n') for each_date in d: for each_data in d[each_date]: - f.writelines('"{}","{}","{}"\n'.format( - each_date, each_data[0], each_data[1])) + f.writelines('"{}","{}","{}"\n'.format(each_date, each_data[0], + each_data[1])) +authors = [] +for each_date in d: + for each_author in d[each_date]: + authors.append(each_author[0]) + if each_author[0] not in face: + face[each_author[0]] = each_author[2] +with open('./get_data/face.py', 'w', encoding="utf-8-sig") as f: + f.writelines('face = ' + str(face)) + +for each_author in face: + if each_author in color: + continue + if face[each_author][-3:] == 'gif': + color[each_author] = '#000000' + else: + color_list = Haishoku.getPalette(face[each_author]) + color_list = sorted( + color_list, key=lambda x: x[1][0] + x[1][1] + x[1][2]) + color[each_author] = 'rgb' + \ + str(color_list[int(len(color_list)/2)][1]) + +with open('./get_data/color.py', 'w', encoding="utf-8-sig") as f: + f.writelines('color = ' + str(color)) + +min_fans = 99999999 +for each_author in authors: + c_fans = db['author'].find_one({'name': each_author}, + {'cFans': True})['cFans'] + if c_fans <= min_fans: + min_fans = c_fans +print(min_fans) \ No newline at end of file diff --git a/get_data/color.py b/get_data/color.py new file mode 100644 index 0000000..2a8321c --- /dev/null +++ b/get_data/color.py @@ -0,0 +1 @@ +color = {'华农兄弟': 'rgb(241, 124, 23)', '信誓蛋蛋': 'rgb(132, 112, 117)', '痴鸡小队官方': 'rgb(224, 147, 35)', '中国BOY超级大猩猩': 'rgb(232, 105, 10)', '唯一音乐小魔王': 'rgb(122, 146, 147)', '-欣小萌-': 'rgb(184, 157, 139)', '英雄联盟': 'rgb(52, 116, 193)', '爱做饭的芋头SAMA': 'rgb(123, 120, 124)', '美食作家王刚R': 'rgb(137, 138, 126)', '盗月社食遇记': 'rgb(200, 140, 125)', '怕上火暴王老菊': 'rgb(212, 82, 88)', '机智的党妹': 'rgb(158, 120, 105)', '靠脸吃饭的徐大王': 'rgb(135, 144, 179)', '长歌与小见见': 'rgb(205, 144, 127)', 'papi酱': 'rgb(140, 140, 140)', '我是郭杰瑞': 'rgb(72, 34, 183)', '倒悬的橘子': 'rgb(154, 126, 120)', 'LexBurner': 'rgb(149, 137, 135)', '狂人实验室': 'rgb(129, 130, 136)', '木鱼水心': 'rgb(128, 71, 179)', '允星河Yoseya': 'rgb(231, 147, 136)', '翔翔大作战': 'rgb(206, 150, 119)', '共青团中央': 'rgb(250, 29, 27)', '猎奇笔记本': 'rgb(237, 126, 40)', '吴织亚切大忽悠': 'rgb(207, 91, 81)', '哔哩哔哩英雄联盟赛事': 'rgb(110, 127, 203)', '赫萝老师': 'rgb(197, 130, 81)', '老番茄': 'rgb(250, 29, 27)', 'MordonFreeman': 'rgb(84, 100, 153)', '花花与三猫CatLive': 'rgb(144, 130, 109)', 'EdmundDZhang': 'rgb(171, 221, 72)', '浅澄月': 'rgb(161, 119, 108)', '徐大虾咯': 'rgb(211, 141, 57)', '王大境泽': 'rgb(127, 113, 112)', '哔哩哔哩线下活动': 'rgb(124, 64, 115)', '神秘店长A': 'rgb(114, 111, 114)', '楼上的老张': 'rgb(151, 151, 151)', '土味角虫': 'rgb(122, 125, 119)', '低调的帅爷': 'rgb(193, 112, 73)', '靖菌命': 'rgb(202, 135, 136)', '日本沙雕日常': 'rgb(138, 121, 110)', 'MrYang杨家成': 'rgb(145, 127, 122)', '黑镖客梦回': 'rgb(80, 79, 80)', '无聊的Do君': 'rgb(209, 119, 82)', '泛式': 'rgb(151, 116, 111)', '芒果冰OL': 'rgb(186, 156, 151)', '面筋哥-程书林': 'rgb(179, 111, 79)', '刘老师说电影': 'rgb(133, 128, 114)', '李子柒': 'rgb(128, 125, 118)', '10后找人带': 'rgb(250, 29, 27)', '丝血反杀闰土的猹': 'rgb(188, 121, 67)', '孤独的美食基': 'rgb(185, 122, 86)', '非我执笔': 'rgb(180, 114, 108)', '神奇的老皮': '#222222', '敖厂长': 'rgb(231, 187, 88)', '扎双马尾的丧尸': '#333333', '采紫葳的凌霄子': 'rgb(199, 155, 150)', '痒局长': 'rgb(207, 69, 119)', '夜刀神吉吉': 'rgb(209, 151, 115)', '路人A-': 'rgb(44, 160, 178)', '开心嘴炮': 'rgb(130, 111, 129)', '小高姐的魔法调料': 'rgb(145, 121, 110)', '科技美学': 'rgb(130, 118, 132)', '何必Hebee': 
'rgb(211, 141, 68)', '优酱胖头鱼': 'rgb(181, 154, 125)', '手工耿': 'rgb(127, 128, 137)', '马壮实Hera': 'rgb(129, 139, 109)', '码哥与马也的日常': 'rgb(194, 154, 144)', '哔哩哔哩纪录片': 'rgb(113, 202, 226)', 'EmmaThePolaris': 'rgb(128, 122, 123)', '努力的Lorre': 'rgb(140, 116, 98)', '进击的冰糖': 'rgb(210, 121, 124)', '故事王StoryMan': 'rgb(132, 112, 26)', '饭帅fun': 'rgb(117, 103, 149)', '三木刃': 'rgb(105, 104, 131)', '点滴菌': 'rgb(17, 128, 243)', '多多poi丶': 'rgb(218, 108, 93)', '紧张的猫饼': 'rgb(92, 128, 156)', '哦呼w': 'rgb(92, 128, 156)', '杰里德Jared': 'rgb(196, 153, 134)', '哎哟阿尤': 'rgb(79, 24, 193)', '冷面Lim': 'rgb(149, 193, 182)', '留学的真相': 'rgb(198, 94, 157)', '辣目洋子': 'rgb(142, 122, 123)', '毒角SHOW': 'rgb(131, 129, 130)', '敬汉卿': 'rgb(142, 131, 124)', '蜡笔小勋是一对儿': 'rgb(190, 154, 154)', '王咩阿': 'rgb(208, 142, 127)', '你的可樱已上线': 'rgb(191, 110, 121)', 'bilibili电影': 'rgb(118, 201, 228)', 'Vinheteiro': 'rgb(151, 123, 105)', 'zy戏精学院院长': 'rgb(190, 148, 126)', 'ADC芒果': 'rgb(230, 176, 77)', '=咬人猫=': 'rgb(199, 155, 120)', '神秘学调查员': 'rgb(147, 142, 124)', '无聊的开箱': 'rgb(144, 114, 16)', '小可儿': 'rgb(114, 132, 190)', '茶理理理子': 'rgb(41, 114, 201)', '郝给力': 'rgb(41, 114, 201)', '燃茶哥哥在此': 'rgb(198, 144, 104)', 'Mafumafu_Channel': 'rgb(249, 101, 106)', '卡卡会发光guang': 'rgb(178, 133, 79)', '贤宝宝Baby': 'rgb(119, 110, 97)', '崩坏3第一偶像爱酱': 'rgb(206, 135, 74)', '渔人阿烽': 'rgb(120, 161, 192)', '十音Shiyin': 'rgb(97, 98, 110)', '还有一天就放假了': 'rgb(212, 141, 60)', '槐安遗梦': 'rgb(158, 131, 101)', '喵喵折App': 'rgb(254, 150, 150)', '四季萌芽': 'rgb(202, 130, 120)', 'A路人': 'rgb(181, 115, 79)', '吃鸡陪玩酱': 'rgb(212, 148, 122)', '哇哇哇妹': 'rgb(249, 108, 127)', '极客湾Geekerwan': 'rgb(117, 176, 195)', '哔哩哔哩会员购': 'rgb(232, 179, 193)', '视角姬': 'rgb(77, 180, 63)', '萧忆情Alex': 'rgb(216, 147, 141)', '狂风桑': 'rgb(193, 155, 126)', '老邪说电影': 'rgb(165, 179, 8)', '某幻君': 'rgb(182, 139, 139)', 'low君热剧': 'rgb(127, 133, 128)', '大连老湿王博文': 'rgb(226, 138, 127)', '野食小哥': 'rgb(136, 127, 121)', '洛天依': 'rgb(152, 141, 180)', 'bilibili星访问': 'rgb(200, 200, 37)', '桃核叫我桃道长': 'rgb(133, 133, 133)', '水一大魔王': 'rgb(233, 69, 150)', '不正经老丝': 'rgb(184, 121, 79)', '白上吹雪Official': 'rgb(188, 168, 179)', '喝水少年孙十一': 'rgb(198, 145, 127)', '指法芬芳张大仙': 'rgb(182, 152, 150)', 'OELoop': 'rgb(148, 158, 192)', '山药视频': 'rgb(2, 129, 202)', '抽风Crazy': 'rgb(120, 114, 112)', '哔哩哔哩弹幕网': 'rgb(117, 200, 228)', '凉风Kaze': 'rgb(204, 157, 118)', '1818黄金眼': 'rgb(129, 130, 129)', '吃素的狮子': 'rgb(141, 135, 111)', '远古时代装机猿': 'rgb(127, 127, 127)', '曼食慢语': 'rgb(131, 131, 131)', '影视飓风': 'rgb(124, 124, 124)', '渗透之C君': 'rgb(251, 81, 90)', '花少北丶': 'rgb(111, 104, 112)', '声鱼片儿': 'rgb(230, 97, 30)', '暴走漫画': 'rgb(229, 152, 55)', '阿神的Bili官方頻道': 'rgb(178, 161, 142)', '夹性芝士': 'rgb(127, 127, 127)', 'AS极客': 'rgb(138, 126, 105)', '哔哩哔哩活动': 'rgb(189, 82, 110)', '丰兄来了': 'rgb(231, 153, 152)', 'scyrax': 'rgb(210, 133, 126)', '拉宏桑': 'rgb(55, 135, 212)', '郁郁_Yu': 'rgb(130, 140, 206)', '是你的霹雳': 'rgb(104, 119, 215)', '哔哩哔哩直播': 'rgb(20, 177, 234)', 'NathanRich火锅大王': 'rgb(142, 127, 125)', '潇湘公子寻': 'rgb(190, 168, 114)', '纳豆奶奶': 'rgb(174, 125, 79)', '逍遥散人': 'rgb(113, 135, 144)', '郭乐乐很努力': 'rgb(126, 120, 128)', '宝剑嫂': 'rgb(119, 111, 119)', 'colinzang': 'rgb(144, 119, 114)', '曹译文iris': 'rgb(127, 130, 127)', '歪果仁研究协会': 'rgb(201, 166, 91)', '音乐君': 'rgb(203, 107, 78)', '李永乐老师官方': 'rgb(71, 101, 183)', '库特菌': 'rgb(198, 149, 131)', '东尼ookii': 'rgb(182, 149, 125)', '力元君': 'rgb(219, 30, 81)', '哔哩哔哩大会员': 'rgb(251, 166, 190)', '周六野Zoey': 'rgb(159, 163, 178)', '老四赶海': 'rgb(227, 142, 113)', '纯黑-V-布里塔尼亚': 'rgb(187, 164, 159)', '阿漫啊阿漫': 'rgb(20, 102, 144)', '秋呗叔岳': 'rgb(181, 158, 143)', '靠谱电竞': 'rgb(195, 164, 38)', 
'百万剪辑狮': 'rgb(126, 126, 126)', '电影最TOP': 'rgb(193, 159, 60)', '刘哔电影': 'rgb(243, 92, 70)', '-LKs-': 'rgb(146, 138, 198)', '嘻咦啊看': 'rgb(228, 94, 73)', '凉下采桑': 'rgb(252, 252, 252)', '哔哩哔哩番剧': 'rgb(118, 201, 228)', '一只南音呀': 'rgb(124, 126, 123)', '蜡笔和小勋': 'rgb(190, 154, 154)', 'skyhahalife': 'rgb(150, 150, 150)', '芳斯塔芙': 'rgb(204, 181, 172)', '哔哩哔哩游戏中心': 'rgb(92, 145, 220)', '山海zoom': 'rgb(178, 121, 74)', '是当归哦': 'rgb(151, 136, 128)', '黑桐谷歌': 'rgb(220, 123, 111)', 'PDD在B站': 'rgb(138, 117, 109)', '伊丽莎白鼠': 'rgb(190, 148, 134)', '潮汕好男人': 'rgb(117, 142, 140)', '起小点是大腿': 'rgb(243, 158, 124)', '观察者网': 'rgb(230, 168, 172)', '纯黑-克劳狄乌斯': 'rgb(187, 164, 159)', '徐大sao': 'rgb(187, 186, 196)', 'STN工作室': 'rgb(187, 123, 9)', '皇族电子竞技俱乐部官方账号': 'rgb(144, 126, 104)', '牛叔万岁万岁万万岁': 'rgb(75, 100, 95)', 'TESTV官方频道': 'rgb(135, 131, 121)', '聚印象视频': 'rgb(127, 127, 127)', '暗猫の祝福': 'rgb(158, 200, 207)', '凤凰天使TSKS韩剧社官方账号': 'rgb(180, 147, 139)', '努巴尼守望先锋': 'rgb(142, 144, 147)', '少年Pi': 'rgb(152, 133, 119)', '老实憨厚的笑笑': 'rgb(193, 147, 108)'} \ No newline at end of file diff --git a/get_data/face.py b/get_data/face.py new file mode 100644 index 0000000..b312adb --- /dev/null +++ b/get_data/face.py @@ -0,0 +1 @@ +face = {'华农兄弟': 'http://i1.hdslb.com/bfs/face/bac504655c69ab937b0be4557e27535f794b0c66.jpg', '信誓蛋蛋': 'http://i2.hdslb.com/bfs/face/df0be0f1946581030cbaf34e3f66a996f0a1af4a.jpg', '痴鸡小队官方': 'http://i2.hdslb.com/bfs/face/25b9ca4626a41dbb249a2cf144b14200cb4c34e1.jpg', '中国BOY超级大猩猩': 'http://i1.hdslb.com/bfs/face/068939602dae190c86f6b36ca301281d7d8aa6d9.jpg', '唯一音乐小魔王': 'http://i1.hdslb.com/bfs/face/d17ddb244a783fa8179c362209080c48716beebf.jpg', '-欣小萌-': 'http://i1.hdslb.com/bfs/face/b5aee8b711fd655c70d705678b4e350ae255a1d0.jpg', '英雄联盟': 'http://i1.hdslb.com/bfs/face/04d579b1644ee8864e7aea01219dae4a94b469ce.jpg', '爱做饭的芋头SAMA': 'http://i1.hdslb.com/bfs/face/69c2df87253eabe27e8257dc827d186cceabc3f1.jpg', '美食作家王刚R': 'http://i0.hdslb.com/bfs/face/1463fa4ea6bffd867dc257dca87248bb1d671cde.jpg', '盗月社食遇记': 'http://i1.hdslb.com/bfs/face/ab901fc0571698bb9f389798029c3fc4c0188311.jpg', '怕上火暴王老菊': 'http://i0.hdslb.com/bfs/face/2edf4a4f534869a63158d13a4b6b9676d75f1e0a.jpg', '机智的党妹': 'http://i0.hdslb.com/bfs/face/d83e93dc9101cc0c416ca09ad33a63bdf3d26c6c.jpg', '靠脸吃饭的徐大王': 'http://i1.hdslb.com/bfs/face/0e6e0f313a195e293d4ee6ae8ab86a3074abb315.jpg', '长歌与小见见': 'http://i2.hdslb.com/bfs/face/04bf6928fcdb0452dee9e1aed6a6ff1becd51bcd.jpg', 'papi酱': 'http://i0.hdslb.com/bfs/face/e45a7b248f496fad8b32d3b9cfa0335339331798.jpg', '我是郭杰瑞': 'http://i1.hdslb.com/bfs/face/6182455e4d61159121c223ddc7a3a381f2d4d056.jpg', '倒悬的橘子': 'http://i0.hdslb.com/bfs/face/440968b6694576e931ed4ec61d699029d82bcbaa.jpg', 'LexBurner': 'http://i1.hdslb.com/bfs/face/2996e22a24eed2d7767e452627a9130207defe6a.jpg', '狂人实验室': 'http://i1.hdslb.com/bfs/face/4e5e30311607340a607f52a8bfbc74b12d00558d.jpg', '木鱼水心': 'http://i1.hdslb.com/bfs/face/696df59d35c78430f1a0bdb6184558e7b7eb4a6e.jpg', '允星河Yoseya': 'http://i0.hdslb.com/bfs/face/6f0920f35ae664e937d455ba4e1ef1ac10fb80b5.jpg', '翔翔大作战': 'http://i0.hdslb.com/bfs/face/1ad13832c10e8018dc8e0e7671a6b8594ddab0c0.jpg', '共青团中央': 'http://i2.hdslb.com/bfs/face/52e16dad8aa29b6214bbfadf702e83eeac34ad9f.jpg', '猎奇笔记本': 'http://i2.hdslb.com/bfs/face/e3cb24355694fa887741871a9a1e22ae590e3769.jpg', '吴织亚切大忽悠': 'http://i1.hdslb.com/bfs/face/3b89afc4e25e534a8e4165952116aa9265ea201f.jpg', '哔哩哔哩英雄联盟赛事': 'http://i2.hdslb.com/bfs/face/f07c74fe2a020b33ab1035fea6d3338b6a6e6749.jpg', '赫萝老师': 
'http://i0.hdslb.com/bfs/face/f3776594fc0ff076bdfcc8fc4921327239a7150e.jpg', '老番茄': 'http://i2.hdslb.com/bfs/face/bc5ca101313d4db223c395d64779e76eb3482d60.jpg', 'MordonFreeman': 'http://i2.hdslb.com/bfs/face/c076aac067d1d32c4bdd7b6aa5dc1930185bf91a.jpg', '花花与三猫CatLive': 'http://i1.hdslb.com/bfs/face/1871c834255ffea531f699164e70f0daebc7558b.jpg', 'EdmundDZhang': 'http://i2.hdslb.com/bfs/face/5d94b9727a49815716cd66fc7ba3840382025c56.jpg', '浅澄月': 'http://i2.hdslb.com/bfs/face/512fd36e0fb24746f04aa1073ea89c1a2f91f7e1.jpg', '徐大虾咯': 'http://i0.hdslb.com/bfs/face/f24ea395c75b7a73db57da2a2920f6e84c902082.jpg', '王大境泽': 'http://i0.hdslb.com/bfs/face/fd66b20e63fe0a3e2331892feb7ecc5ff871dcb5.jpg', '哔哩哔哩线下活动': 'http://i1.hdslb.com/bfs/face/10c354765e8a1b3fa47fba1594edc866145bd79b.jpg', '神秘店长A': 'http://i1.hdslb.com/bfs/face/cdb729f3b5cb6dc9a0cb7cc9b8c63aebf1bc5b6b.jpg', '楼上的老张': 'http://i0.hdslb.com/bfs/face/b159d3f9a0ba088dbfc845a849e84bb9f110e6f2.jpg', '土味角虫': 'http://i1.hdslb.com/bfs/face/21d2a655e44aa4844c7353d138b33581f7aaa94f.jpg', '低调的帅爷': 'http://i0.hdslb.com/bfs/face/e42e975697d7237d96992210cf801ae5e87af354.jpg', '靖菌命': 'http://i2.hdslb.com/bfs/face/6f6411b3d701ad213df75b8f9ad8910fc1ebe408.jpg', '日本沙雕日常': 'http://i2.hdslb.com/bfs/face/68aa0664390afa981cf78d8bda4042ec55d26170.jpg', 'MrYang杨家成': 'http://i0.hdslb.com/bfs/face/623fea846d8ba3b11e36d6dbc44baca08238a3d3.jpg', '黑镖客梦回': 'http://i1.hdslb.com/bfs/face/83c8bf808e662d02291a41ab5992541e2707a5d2.jpg', '无聊的Do君': 'http://i0.hdslb.com/bfs/face/d00ef6d8ceea8edee3cd61e5e87bff036189a5bb.jpg', '泛式': 'http://i2.hdslb.com/bfs/face/5f60d345059b82f0878984d9f9133f45b33b82be.jpg', '芒果冰OL': 'http://i0.hdslb.com/bfs/face/f40b734ef61f95f8adb3beca5b7b693db399c50e.jpg', '面筋哥-程书林': 'http://i1.hdslb.com/bfs/face/44fd943316e177fcf91ebde537bcff65f7a84515.jpg', '刘老师说电影': 'http://i1.hdslb.com/bfs/face/145226ff2d32d7c99b8ea3591ffec2f38fc9d3d3.jpg', '李子柒': 'http://i1.hdslb.com/bfs/face/82d27965dae3b2fe9e52780c6309c7b37ad4cbf2.jpg', '10后找人带': 'http://i0.hdslb.com/bfs/face/b8b3badb8344b72f1f4746eac2817f8423aeec2b.jpg', '丝血反杀闰土的猹': 'http://i2.hdslb.com/bfs/face/9672e303acae98a22efb54d0319e60294db887c3.jpg', '孤独的美食基': 'http://i2.hdslb.com/bfs/face/df188387360d6ad90f9e36ac4aaea68ed9f3c9be.jpg', '非我执笔': 'http://i2.hdslb.com/bfs/face/4814cdf4293ba26839f895bd478efaf21bf299db.jpg', '神奇的老皮': 'http://i2.hdslb.com/bfs/face/e7c191e9be6764107415069b36f7d9564f149c86.gif', '敖厂长': 'http://i0.hdslb.com/bfs/face/156d5d3b3f4b66d940365b3b0e3a809e1fcc0d97.jpg', '扎双马尾的丧尸': 'http://i1.hdslb.com/bfs/face/5be61949369dd844cc459eab808da151d8c363d2.gif', '采紫葳的凌霄子': 'http://i0.hdslb.com/bfs/face/b3d6d9871475f15db85340e1ed12c93f2a8b81a9.jpg', '痒局长': 'http://i2.hdslb.com/bfs/face/bcdf640faa16ebaacea1d4c930baabaec9087a80.jpg', '夜刀神吉吉': 'http://i0.hdslb.com/bfs/face/04c6f83245fdfc39440c19c5e70f73a5351d0ada.jpg', '路人A-': 'http://i0.hdslb.com/bfs/face/0383674a5cf40d2163af1f5b80f8cd90a2d62e2c.jpg', '开心嘴炮': 'http://i1.hdslb.com/bfs/face/5cd3053f352bc597bac3d11b824e548108423cb2.jpg', '小高姐的魔法调料': 'http://i2.hdslb.com/bfs/face/aae000e04ce872b58e6bbfc1632cc5bc2203084c.jpg', '科技美学': 'http://i0.hdslb.com/bfs/face/f6f8dc53ddf3ba07c1f0dd3ad87fe92034198b81.jpg', '何必Hebee': 'http://i0.hdslb.com/bfs/face/08032c6289096a9e6869a5cd5c05280060b10532.jpg', '优酱胖头鱼': 'http://i0.hdslb.com/bfs/face/7a1f15e4fc51e16c32752e10e40e7fa3ffa81eb6.jpg', '手工耿': 'http://i2.hdslb.com/bfs/face/b8a75ae7d2a0e2af1d36ca9f1084d850eebb28e3.jpg', '马壮实Hera': 'http://i2.hdslb.com/bfs/face/92d75c347df9a50cdd691b6c62dafff93138be8e.jpg', '码哥与马也的日常': 
'http://i0.hdslb.com/bfs/face/692b776a39b208aacfb6ef6a5ccfb6cfe2861bb6.jpg', '哔哩哔哩纪录片': 'http://i1.hdslb.com/bfs/face/33687c6c4707352cd25fac995cd416009830c917.jpg', 'EmmaThePolaris': 'http://i2.hdslb.com/bfs/face/5433666ef01f1e51e3f21bd3d509ed7bb68eff87.jpg', '努力的Lorre': 'http://i1.hdslb.com/bfs/face/c63ebeed7d49967e2348ef953b539f8de90c5140.jpg', '进击的冰糖': 'http://i0.hdslb.com/bfs/face/8294462b9c92d587c5982a5ec5008d808325056e.jpg', '故事王StoryMan': 'http://i2.hdslb.com/bfs/face/b55679bd383423cb02f0992e44f19a68c6f5fd1b.jpg', '饭帅fun': 'http://i1.hdslb.com/bfs/face/1f8c1c2665d8c1cbb14c4dbe5e09d06f0e78e314.jpg', '三木刃': 'http://i0.hdslb.com/bfs/face/785f79f302e6166079ad2fef933dcbfd435cca4b.jpg', '点滴菌': 'http://i2.hdslb.com/bfs/face/fd8c1c3a5a454c75eafe12464989e9d794179d29.jpg', '多多poi丶': 'http://i2.hdslb.com/bfs/face/3f55db249421c556084719cd9581036d67e93ed4.jpg', '紧张的猫饼': 'http://i2.hdslb.com/bfs/face/fc1b1cd176a854c4fd718694b9216469fa148f4b.jpg', '哦呼w': 'http://i0.hdslb.com/bfs/face/57ead5621801ec8a637bc47754e00e9ae6e62888.gif', '杰里德Jared': 'http://i2.hdslb.com/bfs/face/5138b1aea6da8a7e9962562208ed848a417e207b.jpg', '哎哟阿尤': 'http://i0.hdslb.com/bfs/face/39f892f717cd18ad5de3d61c1a99c3e50f6ab390.jpg', '冷面Lim': 'http://i0.hdslb.com/bfs/face/a52f915170f1f3bfbe8e1f59a5b512936669cade.jpg', '留学的真相': 'http://i1.hdslb.com/bfs/face/7af19e342cb560870733804a94e46a45a1d1e771.jpg', '辣目洋子': 'http://i0.hdslb.com/bfs/face/a2c5ca6dd5be6b89e94294e06f9e365a29b7943b.jpg', '毒角SHOW': 'http://i0.hdslb.com/bfs/face/24486911dc40a0faa23b025b4493f15b086c65cc.jpg', '敬汉卿': 'http://i1.hdslb.com/bfs/face/a5c6005a27da6afd52021dc07423f7b4a78a466c.jpg', '蜡笔小勋是一对儿': 'http://i0.hdslb.com/bfs/face/8a8812e0a9bb3adda90044ef48830584e1efe7a2.jpg', '王咩阿': 'http://i1.hdslb.com/bfs/face/dce221c5c508ca338eb0428a751e27b41ec7bfeb.jpg', '你的可樱已上线': 'http://i1.hdslb.com/bfs/face/d72b7fe43b556fb5872ab9bdad771b1476928796.jpg', 'bilibili电影': 'http://i0.hdslb.com/bfs/face/60a9153609998b04301dc5b8ed44c41b537a2268.jpg', 'Vinheteiro': 'http://i2.hdslb.com/bfs/face/dff03339974b20517bbe26ce49ea9d2a39831023.jpg', 'zy戏精学院院长': 'http://i2.hdslb.com/bfs/face/9dc26bc30f200dd131b9479fc30c443264c39278.jpg', 'ADC芒果': 'http://i2.hdslb.com/bfs/face/cce7e724724581e1323d5bdd7e00796ee8aec8b8.jpg', '=咬人猫=': 'http://i1.hdslb.com/bfs/face/8fad84a4470f3d894d8f0dc95555ab8f2cb10a83.jpg', '神秘学调查员': 'http://i2.hdslb.com/bfs/face/b803792a721b61da30e46e7e93d7d818f4f0bc48.jpg', '无聊的开箱': 'http://i2.hdslb.com/bfs/face/9ecc839c3b28f9752bfae2834333ddc32930787f.jpg', '小可儿': 'http://i2.hdslb.com/bfs/face/4d7fbb6c47b097e297b958c8dc74287cd880fc4a.jpg', '茶理理理子': 'http://i0.hdslb.com/bfs/face/557d3f4b8cee7d413714f48f9ec671c4c44e1e6c.gif', '郝给力': 'http://i0.hdslb.com/bfs/face/94e057ae6738be1810e1747df306ac12e7c3aece.jpg', '燃茶哥哥在此': 'http://i1.hdslb.com/bfs/face/938eca5533223a6d30f0582eb3e5350049f9c1cf.jpg', 'Mafumafu_Channel': 'http://i0.hdslb.com/bfs/face/0cee96b40676ede1c7b956e636b240a4119da9f9.jpg', '卡卡会发光guang': 'http://i1.hdslb.com/bfs/face/c9dbb60c7d27a7bd9750d5c2de982c3d23d7dc82.jpg', '贤宝宝Baby': 'http://i1.hdslb.com/bfs/face/c91f1ff05b7da257278bb88d4959733ac12ab3b3.jpg', '崩坏3第一偶像爱酱': 'http://i2.hdslb.com/bfs/face/cb5facbc29275e6bdc4cfa9c20e47ecdb0fe3392.jpg', '渔人阿烽': 'http://i2.hdslb.com/bfs/face/efcf3f7f3acbd93247b04b69d705f67301bfd2f7.jpg', '十音Shiyin': 'http://i1.hdslb.com/bfs/face/71d8b38d96ec67a06ff87359a29418dd944c60b1.jpg', '还有一天就放假了': 'http://i1.hdslb.com/bfs/face/30e515fc2ff5435a5cfb31c289e0bcef860bad1c.jpg', '槐安遗梦': 
'http://i1.hdslb.com/bfs/face/a7b82ad44b0194430bc9d8a6ebc94d418ec1b085.jpg', '喵喵折App': 'http://i1.hdslb.com/bfs/face/6582bf05f0a16716a68ddfce84342e239f7bcb68.jpg', '四季萌芽': 'http://i0.hdslb.com/bfs/face/2db05455674f3eeffdf9195f5ed51e7fa66bc763.jpg', 'A路人': 'http://i2.hdslb.com/bfs/face/c4022010115d00da6667a5cf799d5400067d7f66.jpg', '吃鸡陪玩酱': 'http://i0.hdslb.com/bfs/face/0d42041fabc7046f171f8de27d8972c76443a6af.jpg', '哇哇哇妹': 'http://i0.hdslb.com/bfs/face/c134be58f0a8a5bde4b96437c7e4e04968449a19.jpg', '极客湾Geekerwan': 'http://i0.hdslb.com/bfs/face/d0f7a7ee34a4a45c8390eb3a07e4d7f2d70bae91.jpg', '哔哩哔哩会员购': 'http://i2.hdslb.com/bfs/face/ba56c5cac0809c7f389f78b1e4dce4971ba07d52.jpg', '视角姬': 'http://i0.hdslb.com/bfs/face/851025ed4bc57ad6ae82a6314e72e249fd21d604.jpg', '萧忆情Alex': 'http://i1.hdslb.com/bfs/face/98ce5676de8391d3e3164c563866f32ceba9e18b.jpg', '狂风桑': 'http://i1.hdslb.com/bfs/face/5076846bb07c2d6ec442856f69214cae215301f3.jpg', '老邪说电影': 'http://i1.hdslb.com/bfs/face/0fbf6aad74852d4cd68f72d50de62d9f11f7e886.jpg', '某幻君': 'http://i1.hdslb.com/bfs/face/9ed5ebf1e3694d9cd2b4fcd1d353759ee83b3dfe.jpg', 'low君热剧': 'http://i1.hdslb.com/bfs/face/fabe17ef4b344336b872e3ded46a4a4b11140e7f.jpg', '大连老湿王博文': 'http://i0.hdslb.com/bfs/face/21fa2a84a8733cd91a940fad6af81147538ae968.jpg', '野食小哥': 'http://i1.hdslb.com/bfs/face/e0610a26bc510770d997385fb81b9c47157a53c4.jpg', '洛天依': 'http://i1.hdslb.com/bfs/face/67bd11fcd3be8fac5cef1a743a16f0a8cdf39463.jpg', 'bilibili星访问': 'http://i0.hdslb.com/bfs/face/f68925a967357060898da90fb3be14ebd5289879.jpg', '桃核叫我桃道长': 'http://i2.hdslb.com/bfs/face/16279e85ec9dc197f6d306abf84a7f8cd75a0b5e.jpg', '水一大魔王': 'http://i2.hdslb.com/bfs/face/ffdedf591d5ed5a5f584c61fec1add096a61d75a.jpg', '不正经老丝': 'http://i0.hdslb.com/bfs/face/126c00cc089a3c0f540da4525e6ba41452ad83fc.jpg', '白上吹雪Official': 'http://i0.hdslb.com/bfs/face/3b3f2cde975bf334e4f5948709e42b6569c0755d.jpg', '喝水少年孙十一': 'http://i0.hdslb.com/bfs/face/a258cfccf39862242b61328e11dbb926a3b6dbe1.jpg', '指法芬芳张大仙': 'http://i2.hdslb.com/bfs/face/6426336744dd49a744eca32855808073988bd2ce.jpg', 'OELoop': 'http://i1.hdslb.com/bfs/face/6ed036381a0d0a86714816ac393dd9bf1b0bef21.jpg', '山药视频': 'http://i0.hdslb.com/bfs/face/357b015de3b9f4c04527d4fefb844460397ac8b0.jpg', '抽风Crazy': 'http://i2.hdslb.com/bfs/face/04d92c0e30315e56e1365dc9ac2d2cf32e2fe039.jpg', '哔哩哔哩弹幕网': 'http://i0.hdslb.com/bfs/face/8aa6bd8cf269021ffa7a03f7a903ca899c11b7fb.jpg', '凉风Kaze': 'http://i1.hdslb.com/bfs/face/e0cc906bb531195e9ee9f3b575effdd2b056eaea.jpg', '1818黄金眼': 'http://i1.hdslb.com/bfs/face/4cc8d48c9fd68d3e511851b876e04bb953cb095e.jpg', '吃素的狮子': 'http://i0.hdslb.com/bfs/face/249bfa1b3d3e0932f533bc5364964b132fe9c6c2.jpg', '远古时代装机猿': 'http://i2.hdslb.com/bfs/face/ec008e32064705c576f3ffd73d20288e441d945f.jpg', '曼食慢语': 'http://i2.hdslb.com/bfs/face/49dab862b8acaf9a80a921df04c9532f1c826ebc.jpg', '影视飓风': 'http://i0.hdslb.com/bfs/face/c1733474892caa45952b2c09a89323157df7129a.jpg', '渗透之C君': 'http://i2.hdslb.com/bfs/face/623ccce0ab28b721edb61dd64749d91de18fb384.jpg', '花少北丶': 'http://i1.hdslb.com/bfs/face/86ef6895a8f88c80f2885e7eb9ba7989db437b93.jpg', '声鱼片儿': 'http://i1.hdslb.com/bfs/face/e25a06ab0e6e83d8e839dfccec43cc14089c308c.jpg', '暴走漫画': 'http://i2.hdslb.com/bfs/face/c93ff5df2cd0c0c9384b139c86d1a56a9540c9a2.jpg', '阿神的Bili官方頻道': 'http://i1.hdslb.com/bfs/face/a0648687f1b1b7b385b9abdffedb1ba4de78bab0.jpg', '夹性芝士': 'http://i1.hdslb.com/bfs/face/c66dd2ef0573ac91a5a85f79b0c2188b02c98633.jpg', 'AS极客': 'http://i2.hdslb.com/bfs/face/34379e726c3ee063571aea987ff3a27c5b1a536e.jpg', 
'哔哩哔哩活动': 'http://i0.hdslb.com/bfs/face/45728a09aa3cd19f13247fce58aad38ac46a0f1d.jpg', '丰兄来了': 'http://i2.hdslb.com/bfs/face/369f64c5f24324cf85eaedfc6104325bd6e950cf.jpg', 'scyrax': 'http://i0.hdslb.com/bfs/face/efc37d5941f67087c8b84e99760ae47721f8c443.jpg', '拉宏桑': 'http://i1.hdslb.com/bfs/face/e74603c80efa529c76dce7ede07f986da0af8b0d.jpg', '郁郁_Yu': 'http://i0.hdslb.com/bfs/face/7816841e2e6fefffbb785bb77f1654cc91224338.jpg', '是你的霹雳': 'http://i1.hdslb.com/bfs/face/691bcd9a0f2a77087af2a0962179deab1b87756d.jpg', '哔哩哔哩直播': 'http://i0.hdslb.com/bfs/face/58ca83240e58c3e6ce1ebb9828ce84f6f1b72863.jpg', 'NathanRich火锅大王': 'http://i0.hdslb.com/bfs/face/f2d0fa88345e2f511ae518cc655da9ded11471a8.jpg', '潇湘公子寻': 'http://i2.hdslb.com/bfs/face/a004cfeec6feb65fa421c9a4a49738999ab032de.jpg', '纳豆奶奶': 'http://i0.hdslb.com/bfs/face/0a8638b34173708fcf979bd1166fbb7fdb1110a4.jpg', '逍遥散人': 'http://i2.hdslb.com/bfs/face/d0dad8800774a4903547b1326d1fd927df47b4e9.jpg', '郭乐乐很努力': 'http://i1.hdslb.com/bfs/face/0b609106d937936fb24f92ec42e6ee08e1306a4e.jpg', '宝剑嫂': 'http://i1.hdslb.com/bfs/face/b4be6527726b04049a066d5a073afd7c1a733ee6.jpg', 'colinzang': 'http://i2.hdslb.com/bfs/face/a6be4da0320912851bd5f31479178ec2e5af45c3.jpg', '曹译文iris': 'http://i0.hdslb.com/bfs/face/a21a7c95042c5187575d6768ea5ec9ee331fee3a.jpg', '歪果仁研究协会': 'http://i2.hdslb.com/bfs/face/0ea49d2ad88658689eb31746c09f716c09581e4f.jpg', '音乐君': 'http://i2.hdslb.com/bfs/face/20ec194172acfdbca47539af7779ae4890c753ec.jpg', '李永乐老师官方': 'http://i0.hdslb.com/bfs/face/b299b0e20dc4fbbd45e13ba86f330d890c0a0118.jpg', '库特菌': 'http://i0.hdslb.com/bfs/face/82a5d10cd93a90ec1b255b279578fe3801908831.jpg', '东尼ookii': 'http://i2.hdslb.com/bfs/face/c2c3ce926ec8d9edc361ca706c9dce9415c15114.jpg', '力元君': 'http://i1.hdslb.com/bfs/face/b831ab8e458d7763d397c41f7b1298fe20a0d096.jpg', '哔哩哔哩大会员': 'http://i1.hdslb.com/bfs/face/7266e09e212b2e5109465d877dfa9a58e4ba33d4.jpg', '周六野Zoey': 'http://i1.hdslb.com/bfs/face/c95abd1564994c36dd481c21acf14a83aeb61ab8.jpg', '老四赶海': 'http://i0.hdslb.com/bfs/face/d6b4a211a323f1c8fa35f42b7caca6bcdb9e68c9.jpg', '纯黑-V-布里塔尼亚': 'http://i2.hdslb.com/bfs/face/e8ab7b02d6576f4141ea857734b68b9dd35a5730.jpg', '阿漫啊阿漫': 'http://i2.hdslb.com/bfs/face/ec38b52e36d4577fe865e43bdeef4375221f1250.jpg', '秋呗叔岳': 'http://i1.hdslb.com/bfs/face/71805f5a1971f2637059da49e46f5eafbe4223cb.jpg', '靠谱电竞': 'http://i2.hdslb.com/bfs/face/b37e8c05448cb24b15a4f29581c66fdff1062147.jpg', '百万剪辑狮': 'http://i1.hdslb.com/bfs/face/7974bacea323324b7175055e7a942a676f994132.jpg', '电影最TOP': 'http://i0.hdslb.com/bfs/face/6b2ade215ea603b495648875c925172a863d16d4.jpg', '刘哔电影': 'http://i0.hdslb.com/bfs/face/654ed89c763ca1f003cc2d0145e1ec444cc7f0e8.jpg', '-LKs-': 'http://i2.hdslb.com/bfs/face/6bc7a2778be455273c82e45bfca55cb0f70c820b.jpg', '嘻咦啊看': 'http://i2.hdslb.com/bfs/face/170e1009846dd3ad8cdcd4050527b9cf2ecf67ab.jpg', '凉下采桑': 'http://i0.hdslb.com/bfs/face/c21e4fb4d44e033d7d15283057aa2021e5d0bfb7.jpg', '哔哩哔哩番剧': 'http://i1.hdslb.com/bfs/face/60a9153609998b04301dc5b8ed44c41b537a2268.jpg', '一只南音呀': 'http://i2.hdslb.com/bfs/face/166e2039c60e0b6d8d24bf76bb3628f08804a431.jpg', '蜡笔和小勋': 'http://i2.hdslb.com/bfs/face/8a8812e0a9bb3adda90044ef48830584e1efe7a2.jpg', 'skyhahalife': 'http://i0.hdslb.com/bfs/face/2e5cada7e126a5c43c2ba5576c4dd1725b7075a0.jpg', '芳斯塔芙': 'http://i1.hdslb.com/bfs/face/b3e851174abfbe9622cf3d336f302531d9cdfc97.jpg', '哔哩哔哩游戏中心': 'http://i2.hdslb.com/bfs/face/509a6ca97e1599e55739b048c0770ce4a63531a5.jpg', '山海zoom': 'http://i0.hdslb.com/bfs/face/7289f4e7032cef7c6be32b07f8c559a46627d293.jpg', 
'是当归哦': 'http://i0.hdslb.com/bfs/face/201b100685362d362434bbbed11834a2a6753401.jpg', '黑桐谷歌': 'http://i0.hdslb.com/bfs/face/31706c82949b3ba4756a411825c3f16aeb14ad44.jpg', 'PDD在B站': 'http://i0.hdslb.com/bfs/face/10a4d2e4b20cf2c08c17604f7a192eb9d7d293f2.jpg', '伊丽莎白鼠': 'http://i2.hdslb.com/bfs/face/6c36ec15f6d7ddd9bdb558511521bd0256779e1c.jpg', '潮汕好男人': 'http://i0.hdslb.com/bfs/face/2a4c1b3c2d8f48a58c5f818287c90a6ea78d7907.jpg', '起小点是大腿': 'http://i0.hdslb.com/bfs/face/94b65197150887e1bc1671be6598d7fd2ea3132b.jpg', '观察者网': 'http://i2.hdslb.com/bfs/face/790c525e30faa01b855e2d4f1126d7e8a7632af0.jpg', '纯黑-克劳狄乌斯': 'http://i0.hdslb.com/bfs/face/e8ab7b02d6576f4141ea857734b68b9dd35a5730.jpg', '徐大sao': 'http://i2.hdslb.com/bfs/face/17323892be2e17b50acadb89b2e385b493f5fb4d.jpg', 'STN工作室': 'http://i1.hdslb.com/bfs/face/c43e6cab13c9a0303cf8476cfd405cff61195726.jpg', '皇族电子竞技俱乐部官方账号': 'http://i2.hdslb.com/bfs/face/ab85323352b2af52e7a5477cd963d5da98fdab4e.jpg', '牛叔万岁万岁万万岁': 'http://i2.hdslb.com/bfs/face/3c9d436659037cc17fe0d32975ef7f17710ba820.jpg', 'TESTV官方频道': 'http://i1.hdslb.com/bfs/face/34ccaf9461c67482e3164675c0036e94df18b7a7.jpg', '聚印象视频': 'http://i0.hdslb.com/bfs/face/98e2202d8e2725fe3a4b7c3adafeb91764584bbb.jpg', '暗猫の祝福': 'http://i0.hdslb.com/bfs/face/c286d67fbc3abe9f47c115fdae797e9b140156b9.jpg', '凤凰天使TSKS韩剧社官方账号': 'http://i2.hdslb.com/bfs/face/97cdf2282bb2c9d4e8ae5184356097844d1ba4d7.jpg', '努巴尼守望先锋': 'http://i2.hdslb.com/bfs/face/358e0b207a9c6012778e4ad1b21356c91536e790.jpg', '少年Pi': 'http://i0.hdslb.com/bfs/face/d851f48a579778b06249bf3debaa62d353694e91.jpg', '老实憨厚的笑笑': 'http://i1.hdslb.com/bfs/face/048853815037dd95d4193dd6a3d561db428a927f.jpg'} \ No newline at end of file diff --git a/run.py b/run.py index 0897e3b..765afc1 100644 --- a/run.py +++ b/run.py @@ -81,10 +81,9 @@ def run_threaded(job_func): schedule.every().week.do(run_threaded, video_spider_all) schedule.every().week.do(run_threaded, videoRank) schedule.every().hour.do(run_threaded, site) -schedule.every(15).minutes.do(run_threaded, online) +# schedule.every(15).minutes.do(run_threaded, online) schedule.every(10).minutes.do(run_threaded, strong) - print('开始运行计划任务..') while True: schedule.run_pending() From f54f05b1eb74e12dd98e5041d92094c9e99439f4 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 9 Apr 2019 21:10:26 +0800 Subject: [PATCH 261/469] refactor: strong spider --- run.py | 50 +++++++++++++++++++++++++++++++++++++++---------- run_analyzer.py | 4 ++-- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/run.py b/run.py index 765afc1..e5530b6 100644 --- a/run.py +++ b/run.py @@ -6,12 +6,43 @@ from subprocess import Popen import logging import threading -from biliob_analyzer.video_rank import computeVideoRank - - -def videoRank(): - computeVideoRank() - +from db import redis_connection + +import requests +import redis +from lxml import etree +import json + +VIDEO_URL = "https://api.bilibili.com/x/article/archives?ids={aid}" +VIDEO_KEY = "videoRedis:start_urls" +AUTHOR_URL = "https://api.bilibili.com/x/web-interface/card?mid={mid}" +AUTHOR_KEY = "authorRedis:start_urls" +DANMAKU_FROM_AID_URL = "https://api.bilibili.com/x/web-interface/view?aid={aid}" +DANMAKU_KEY = "DanmakuAggregate:start_urls" + +def sendAuthorCrawlRequest(mid): + redis_connection.rpush(AUTHOR_KEY, AUTHOR_URL.format(mid=mid)) + + +def sendVideoCrawlRequest(aid): + redis_connection.rpush(VIDEO_KEY, VIDEO_URL.format(aid=aid)) + + +def crawlOnlineTopListData(): + ONLINE_URL = 'https://www.bilibili.com/video/online.html' + response = 
requests.get(ONLINE_URL) + data_text = etree.HTML(response.content.decode( + 'utf8')).xpath('//script/text()')[-2] + j = json.loads(data_text.lstrip('window.__INITIAL_STATE__=')[:-122]) + for each_video in j['onlineList']: + aid = each_video['aid'] + mid = each_video['owner']['mid'] + if mid not in [7584632, 928123]: + sendAuthorCrawlRequest(mid) + sendVideoCrawlRequest(aid) + print(aid) + print(mid) + pass def site(): Popen(["scrapy", "crawl", "site"]) @@ -50,7 +81,7 @@ def online(): def strong(): - Popen(['scrapy', 'crawl', 'strong']) + crawlOnlineTopListData() def data_analyze(): @@ -79,10 +110,9 @@ def run_threaded(job_func): schedule.every().day.at('22:00').do(run_threaded, video_watcher) schedule.every().day.at('21:00').do(run_threaded, bili_monthly_rank) schedule.every().week.do(run_threaded, video_spider_all) -schedule.every().week.do(run_threaded, videoRank) schedule.every().hour.do(run_threaded, site) -# schedule.every(15).minutes.do(run_threaded, online) -schedule.every(10).minutes.do(run_threaded, strong) +schedule.every(15).minutes.do(run_threaded, online) +schedule.every(1).minutes.do(run_threaded, strong) print('开始运行计划任务..') while True: diff --git a/run_analyzer.py b/run_analyzer.py index ecbef74..04a3255 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -2,8 +2,8 @@ from biliob_analyzer.video_analyzer import VideoAnalyzer import biliob_analyzer.author_rate_caculate import biliob_analyzer.author_fans_watcher -import biliob_analyzer.author_rank -import biliob_analyzer.video_rank +# import biliob_analyzer.author_rank +# import biliob_analyzer.video_rank from biliob_analyzer.add_keyword import KeywordAdder kwAdder = KeywordAdder() kwAdder.add_all_author() From b92bcfc0cc3e5a97081140d0bf9b0e3827300578 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 9 Apr 2019 21:10:26 +0800 Subject: [PATCH 262/469] refactor: strong spider --- .vscode/settings.json | 3 +++ run.py | 50 ++++++++++++++++++++++++++++++++++--------- run_analyzer.py | 4 ++-- 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index e69de29..de00549 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": "C:\\Users\\mvgos\\AppData\\Local\\Programs\\Python\\Python37\\python.exe" +} \ No newline at end of file diff --git a/run.py b/run.py index 765afc1..e5530b6 100644 --- a/run.py +++ b/run.py @@ -6,12 +6,43 @@ from subprocess import Popen import logging import threading -from biliob_analyzer.video_rank import computeVideoRank - - -def videoRank(): - computeVideoRank() - +from db import redis_connection + +import requests +import redis +from lxml import etree +import json + +VIDEO_URL = "https://api.bilibili.com/x/article/archives?ids={aid}" +VIDEO_KEY = "videoRedis:start_urls" +AUTHOR_URL = "https://api.bilibili.com/x/web-interface/card?mid={mid}" +AUTHOR_KEY = "authorRedis:start_urls" +DANMAKU_FROM_AID_URL = "https://api.bilibili.com/x/web-interface/view?aid={aid}" +DANMAKU_KEY = "DanmakuAggregate:start_urls" + +def sendAuthorCrawlRequest(mid): + redis_connection.rpush(AUTHOR_KEY, AUTHOR_URL.format(mid=mid)) + + +def sendVideoCrawlRequest(aid): + redis_connection.rpush(VIDEO_KEY, VIDEO_URL.format(aid=aid)) + + +def crawlOnlineTopListData(): + ONLINE_URL = 'https://www.bilibili.com/video/online.html' + response = requests.get(ONLINE_URL) + data_text = etree.HTML(response.content.decode( + 'utf8')).xpath('//script/text()')[-2] + j = json.loads(data_text.lstrip('window.__INITIAL_STATE__=')[:-122]) + for 
each_video in j['onlineList']: + aid = each_video['aid'] + mid = each_video['owner']['mid'] + if mid not in [7584632, 928123]: + sendAuthorCrawlRequest(mid) + sendVideoCrawlRequest(aid) + print(aid) + print(mid) + pass def site(): Popen(["scrapy", "crawl", "site"]) @@ -50,7 +81,7 @@ def online(): def strong(): - Popen(['scrapy', 'crawl', 'strong']) + crawlOnlineTopListData() def data_analyze(): @@ -79,10 +110,9 @@ def run_threaded(job_func): schedule.every().day.at('22:00').do(run_threaded, video_watcher) schedule.every().day.at('21:00').do(run_threaded, bili_monthly_rank) schedule.every().week.do(run_threaded, video_spider_all) -schedule.every().week.do(run_threaded, videoRank) schedule.every().hour.do(run_threaded, site) -# schedule.every(15).minutes.do(run_threaded, online) -schedule.every(10).minutes.do(run_threaded, strong) +schedule.every(15).minutes.do(run_threaded, online) +schedule.every(1).minutes.do(run_threaded, strong) print('开始运行计划任务..') while True: diff --git a/run_analyzer.py b/run_analyzer.py index ecbef74..04a3255 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -2,8 +2,8 @@ from biliob_analyzer.video_analyzer import VideoAnalyzer import biliob_analyzer.author_rate_caculate import biliob_analyzer.author_fans_watcher -import biliob_analyzer.author_rank -import biliob_analyzer.video_rank +# import biliob_analyzer.author_rank +# import biliob_analyzer.video_rank from biliob_analyzer.add_keyword import KeywordAdder kwAdder = KeywordAdder() kwAdder.add_all_author() From a3c4f8a43f2dfc7165e8e52101f05af3d247c9b6 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Tue, 9 Apr 2019 21:10:26 +0800 Subject: [PATCH 263/469] refactor: strong spider --- .vscode/settings.json | 3 +++ run.py | 50 ++++++++++++++++++++++++++++++++++--------- run_analyzer.py | 4 ++-- 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index e69de29..de00549 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": "C:\\Users\\mvgos\\AppData\\Local\\Programs\\Python\\Python37\\python.exe" +} \ No newline at end of file diff --git a/run.py b/run.py index 765afc1..e5530b6 100644 --- a/run.py +++ b/run.py @@ -6,12 +6,43 @@ from subprocess import Popen import logging import threading -from biliob_analyzer.video_rank import computeVideoRank - - -def videoRank(): - computeVideoRank() - +from db import redis_connection + +import requests +import redis +from lxml import etree +import json + +VIDEO_URL = "https://api.bilibili.com/x/article/archives?ids={aid}" +VIDEO_KEY = "videoRedis:start_urls" +AUTHOR_URL = "https://api.bilibili.com/x/web-interface/card?mid={mid}" +AUTHOR_KEY = "authorRedis:start_urls" +DANMAKU_FROM_AID_URL = "https://api.bilibili.com/x/web-interface/view?aid={aid}" +DANMAKU_KEY = "DanmakuAggregate:start_urls" + +def sendAuthorCrawlRequest(mid): + redis_connection.rpush(AUTHOR_KEY, AUTHOR_URL.format(mid=mid)) + + +def sendVideoCrawlRequest(aid): + redis_connection.rpush(VIDEO_KEY, VIDEO_URL.format(aid=aid)) + + +def crawlOnlineTopListData(): + ONLINE_URL = 'https://www.bilibili.com/video/online.html' + response = requests.get(ONLINE_URL) + data_text = etree.HTML(response.content.decode( + 'utf8')).xpath('//script/text()')[-2] + j = json.loads(data_text.lstrip('window.__INITIAL_STATE__=')[:-122]) + for each_video in j['onlineList']: + aid = each_video['aid'] + mid = each_video['owner']['mid'] + if mid not in [7584632, 928123]: + sendAuthorCrawlRequest(mid) + sendVideoCrawlRequest(aid) + print(aid) 
+ print(mid) + pass def site(): Popen(["scrapy", "crawl", "site"]) @@ -50,7 +81,7 @@ def online(): def strong(): - Popen(['scrapy', 'crawl', 'strong']) + crawlOnlineTopListData() def data_analyze(): @@ -79,10 +110,9 @@ def run_threaded(job_func): schedule.every().day.at('22:00').do(run_threaded, video_watcher) schedule.every().day.at('21:00').do(run_threaded, bili_monthly_rank) schedule.every().week.do(run_threaded, video_spider_all) -schedule.every().week.do(run_threaded, videoRank) schedule.every().hour.do(run_threaded, site) -# schedule.every(15).minutes.do(run_threaded, online) -schedule.every(10).minutes.do(run_threaded, strong) +schedule.every(15).minutes.do(run_threaded, online) +schedule.every(1).minutes.do(run_threaded, strong) print('开始运行计划任务..') while True: diff --git a/run_analyzer.py b/run_analyzer.py index ecbef74..04a3255 100644 --- a/run_analyzer.py +++ b/run_analyzer.py @@ -2,8 +2,8 @@ from biliob_analyzer.video_analyzer import VideoAnalyzer import biliob_analyzer.author_rate_caculate import biliob_analyzer.author_fans_watcher -import biliob_analyzer.author_rank -import biliob_analyzer.video_rank +# import biliob_analyzer.author_rank +# import biliob_analyzer.video_rank from biliob_analyzer.add_keyword import KeywordAdder kwAdder = KeywordAdder() kwAdder.add_all_author() From c976e260ebb3ec9947c802899df81110b5429644 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 13 Apr 2019 00:29:30 +0800 Subject: [PATCH 264/469] speed up --- biliob_spider/spiders/author_update_with_redis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biliob_spider/spiders/author_update_with_redis.py b/biliob_spider/spiders/author_update_with_redis.py index f925f31..6d6816b 100644 --- a/biliob_spider/spiders/author_update_with_redis.py +++ b/biliob_spider/spiders/author_update_with_redis.py @@ -22,7 +22,7 @@ class AuthorUpdateWithRedis(RedisSpider): 'ITEM_PIPELINES': { 'biliob_spider.pipelines.AuthorPipeline': 300 }, - 'DOWNLOAD_DELAY': 10 + 'DOWNLOAD_DELAY': 0.5 } def __init__(self): From 10ec478599a27385b53ef8af3980dc48a1874062 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 13 Apr 2019 00:29:30 +0800 Subject: [PATCH 265/469] speed up --- biliob_spider/spiders/author_update_with_redis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biliob_spider/spiders/author_update_with_redis.py b/biliob_spider/spiders/author_update_with_redis.py index f925f31..6d6816b 100644 --- a/biliob_spider/spiders/author_update_with_redis.py +++ b/biliob_spider/spiders/author_update_with_redis.py @@ -22,7 +22,7 @@ class AuthorUpdateWithRedis(RedisSpider): 'ITEM_PIPELINES': { 'biliob_spider.pipelines.AuthorPipeline': 300 }, - 'DOWNLOAD_DELAY': 10 + 'DOWNLOAD_DELAY': 0.5 } def __init__(self): From f797c4e1e82c079235503d1bad9f4b8fae6aaacb Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sat, 13 Apr 2019 00:29:30 +0800 Subject: [PATCH 266/469] speed up --- biliob_spider/spiders/author_update_with_redis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biliob_spider/spiders/author_update_with_redis.py b/biliob_spider/spiders/author_update_with_redis.py index f925f31..6d6816b 100644 --- a/biliob_spider/spiders/author_update_with_redis.py +++ b/biliob_spider/spiders/author_update_with_redis.py @@ -22,7 +22,7 @@ class AuthorUpdateWithRedis(RedisSpider): 'ITEM_PIPELINES': { 'biliob_spider.pipelines.AuthorPipeline': 300 }, - 'DOWNLOAD_DELAY': 10 + 'DOWNLOAD_DELAY': 0.5 } def __init__(self): From 5ebbcc254bfcf49137cb173f28e1471507e43b16 Mon Sep 17 00:00:00 2001 From: Jannchie 
Date: Sun, 14 Apr 2019 00:05:21 +0800 Subject: [PATCH 267/469] update real time spider --- biliob_requests/author_update_currentFans.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/biliob_requests/author_update_currentFans.py b/biliob_requests/author_update_currentFans.py index b433e2d..806e4a7 100644 --- a/biliob_requests/author_update_currentFans.py +++ b/biliob_requests/author_update_currentFans.py @@ -9,13 +9,15 @@ settings['MONGO_PSW']) db = client['biliob'] # 获得数据库的句柄 -coll = db['author'] # 获得collection的句柄 +author = db['author'] # 获得collection的句柄 +realtime_fans = db['realtime_fans'] URL = 'https://api.bilibili.com/x/web-interface/card?mid={}' while True: - docs = coll.find({}, {'mid': 1}).sort( + docs = author.find({}, {'mid': 1}).sort( 'cFans', direction=DESCENDING).limit(2) mids = map(lambda x: x['mid'], docs) + date = datetime.datetime.now() for mid in mids: try: j = requests.get(URL.format(mid)).json() @@ -23,7 +25,10 @@ fans = j['data']['card']['fans'] if fans == 0: continue - coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + author.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + realtime_fans.insert_one( + {'mid': mid, 'fans': fans, 'datetime': date}) except Exception as e: + print(e) pass time.sleep(5) From 8e784c4e3c47eaa7dd9522b82ab86c8752542594 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 14 Apr 2019 00:05:21 +0800 Subject: [PATCH 268/469] update real time spider --- biliob_requests/author_update_currentFans.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/biliob_requests/author_update_currentFans.py b/biliob_requests/author_update_currentFans.py index b433e2d..806e4a7 100644 --- a/biliob_requests/author_update_currentFans.py +++ b/biliob_requests/author_update_currentFans.py @@ -9,13 +9,15 @@ settings['MONGO_PSW']) db = client['biliob'] # 获得数据库的句柄 -coll = db['author'] # 获得collection的句柄 +author = db['author'] # 获得collection的句柄 +realtime_fans = db['realtime_fans'] URL = 'https://api.bilibili.com/x/web-interface/card?mid={}' while True: - docs = coll.find({}, {'mid': 1}).sort( + docs = author.find({}, {'mid': 1}).sort( 'cFans', direction=DESCENDING).limit(2) mids = map(lambda x: x['mid'], docs) + date = datetime.datetime.now() for mid in mids: try: j = requests.get(URL.format(mid)).json() @@ -23,7 +25,10 @@ fans = j['data']['card']['fans'] if fans == 0: continue - coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + author.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + realtime_fans.insert_one( + {'mid': mid, 'fans': fans, 'datetime': date}) except Exception as e: + print(e) pass time.sleep(5) From edb7c09956ebc4d1f7efb296dd3511539c76359c Mon Sep 17 00:00:00 2001 From: Jannchie Date: Sun, 14 Apr 2019 00:05:21 +0800 Subject: [PATCH 269/469] update real time spider --- biliob_requests/author_update_currentFans.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/biliob_requests/author_update_currentFans.py b/biliob_requests/author_update_currentFans.py index b433e2d..806e4a7 100644 --- a/biliob_requests/author_update_currentFans.py +++ b/biliob_requests/author_update_currentFans.py @@ -9,13 +9,15 @@ settings['MONGO_PSW']) db = client['biliob'] # 获得数据库的句柄 -coll = db['author'] # 获得collection的句柄 +author = db['author'] # 获得collection的句柄 +realtime_fans = db['realtime_fans'] URL = 'https://api.bilibili.com/x/web-interface/card?mid={}' while True: - docs = coll.find({}, {'mid': 1}).sort( + docs = author.find({}, {'mid': 1}).sort( 'cFans', 
direction=DESCENDING).limit(2) mids = map(lambda x: x['mid'], docs) + date = datetime.datetime.now() for mid in mids: try: j = requests.get(URL.format(mid)).json() @@ -23,7 +25,10 @@ fans = j['data']['card']['fans'] if fans == 0: continue - coll.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + author.update_one({'mid': mid}, {'$set': {'cFans': fans}}) + realtime_fans.insert_one( + {'mid': mid, 'fans': fans, 'datetime': date}) except Exception as e: + print(e) pass time.sleep(5) From 515ea88d05fc2c36e8b876c624c51636438405c4 Mon Sep 17 00:00:00 2001 From: Jannchie Date: Fri, 19 Apr 2019 15:33:47 +0800 Subject: [PATCH 270/469] update: search text dict --- biliob_analyzer/dict.txt | 9183 +++++++++++++++++++++++++++++++++++++- 1 file changed, 9179 insertions(+), 4 deletions(-) diff --git a/biliob_analyzer/dict.txt b/biliob_analyzer/dict.txt index b0257d1..7c6399a 100644 --- a/biliob_analyzer/dict.txt +++ b/biliob_analyzer/dict.txt @@ -505,10 +505,6 @@ liyua 力yuan 力元、 你的ke -你的可樱yijing -你的可樱已经 -你的可樱已经、 -你的可樱已 你的可樱已经上线 yude 马zua @@ -595,3 +591,9182 @@ f k 胡子盖 太猪 太 + +一只不会 +一只不会空气斩的皮卡 +uid:75 +ke'lai'mi +克莱米 +一只不会zhan +ke +克莱米a +克莱米阿 +yqa +av44735946 +av11530039 +阿儒还小 +播放量 +某科学 +zhongguo +石原里美 +霹雳无敌 +我就叫五 +我就叫五尺啊 +vinh +yjanjo +yjan +嗨氏 +张大仙 +双马尾 +chiho +吃货xiaowei +吃货小伟 +吃货小伟0 +吃货 +fu +hua +av45549428 +zhan +占用各位 +https://space.bilibili.com/395844576?spm_id_from=333.788.b_765f7570696e666f.2 +majaing +majiang +majiangjiu +马ian +马讲究 +我的烤面筋, +我的烤面筋,融化你的心 +小科尔 +吴织亚切 +郭杰瑞 +喵会长 +温柔junz +阿峰 +西棠1 +西棠18号jie +西棠18号街 +允星河 +吾空ba'jie +唔叽里 +南丰 +强行 +强行观测 +f'z +jian +hu +bonjourhu +will +chaosove +chaosoverwatch +av41809384 +oulis +av45406456 +ge +嗝bi +嗝壁老 +嗝壁老wang +嗝壁老汪 +lao'sao +老骚dou +嗝壁lao'wa +j'f'j'e +花花y +琉璃子 +ruriko +xiao +pang +小胖ja +ful +素描彩铅书记 +从猩猩到人类 +xu'xu'l +yu +toucc +嗝 +嗝壁 +嗝壁老wan +与山 +a44979891 +av44979891 +你的可樱已上线 +相妄 +746786? 
+xiang +相 +-eassy +-e +喜羊羊是一部怎样的动画?巅峰时期的喜羊羊与灰太狼有多强 +av42466533 +janc +/av19390801 +av19390801 +憋气 +由you +由由xiao +由由小子 +ile +声优都是 +声优都是大佬2 +声优都是怪物 +君寻玩 +敬汉 +av5109135 +av42295013 +贪草兔 +贪草 +星之 +辉夜大小姐想让我告白~天才们的恋爱头脑战~ +我是guoji +鸵 +小相公咨询 +xiaoxianggongzi +潇湘公子xun +陈卓 +快手 +新闻 +科幻he +lusheng +lushenn +鹿笙 +uid:318822410 +子轩 +cnm +w45 +le e yu +leeyu +l e e h +l e e h yun +leehyuny y +leehyun鱼 +秋叶援 +animenzzzz +fj +uhn +girl +nice +狙击手麦克 +初音未来 +潜艇 +潜艇伟 +贝宁 +潜艇伟伟 +ls +laocu +ilem +刀剑神域3 +普通的 +战斗吧 +战斗吧ge +he'lu +赫萝la +老奶奶 +奥德修斯号 +liyuan +liyuanju +landjung +九条九奈 +uid:316344611 +laos +申申很 +天牛奶 +阿远 +波音植树部 +波音 +oi +scp +oid +scp-07 +scp-071-x +scp-071-性魔 +old +稚嫩的魔法师 +uid:247294492 +ld +软软冰 +xn= +xn== +edmund +xn==x +xn==x'n +xn==x'n'c'y +xn==x'n'c +xn==虚拟次元 +xia +foxlin +软软 +xiao'x +xiao'xi +老e +h娘 +uid:5004701 +g'q +g'q't +hel +雪茄 +赫lou +赫luo +狐麟 +uid:218690305 +ui5004701 +uid218690305 +大逗 +大逗d +dpff +莉 +天聪君 +天连 +天连水碧 +一只yany +天连水碧水连天 +一只咸鱼 +一只咸鱼莫 +mendako +menda +mendako醬 +av45258931 +mendako醬official +bilibilil +mendako醬of +郭勒了很努力 +mendako醬offi +mendako醬offici +郭乐很努力 +bili +白熊one +予以 +白熊 +小p +小p就是我 +mea +飞麦菌 +安东尼奥 +无主之地3 +安東尼奧 +独立菌二 +kanna +大家好 +赤井 +大家好我叫何同学 +恒星 +yoyo +ye'dao +恒星、 +恒星、star +夜道 +笙誓 +yoyo鼠 +uid:29594981 +yingchu +影川流 +冰灵 +中国航天失败 +欣 +欣小 +兲嘂玊 +段老c +段老c爱玩游戏 +朝阳逆光 +av19791318 +扎 +扎双马尾的 +天使降临到 +hentia +天使降临到我身边 +h +entai +女神是 +三颗眼睛的 +三颗眼睛的闷訫 +三颗眼睛的闷訫丶 +幽芒灬风翼 +uid:221390876 +幽芒风翼 +一休大叔 +一休大叔真好玩 +雨上欧尼 +露依 +露依思 +露依思鹿 +女孩为何 +女孩为何穿 +1av0492 +av10492 +av10 +av2 +av7 +喵呜 +喵呜视 +kylin先生 +喵呜视频 +10周年 +宁子星人 +uid:32802153 +bilibil +b网站 +网站 +哔哩哔哩网站 +周小 +周小瑜 +渣画廊 +a'shen +b'r +bright +南望山小柯基 +改革春风吹满地 +一米八的 +uid:327816584 +信誉 +洛御桀 +东方栀子 +信誉蛋蛋 +作业 +serbiaball +uid:243418864 +散人 +我刘醒教你 +我刘醒教你吔屎 +小了 +美丽科技 +有之风 +黑哲君 +vzoo +黑哲 +神楽めあ +神楽めあoffic +神楽めあoffici +神楽めあofficio +もりながみ +两亿滚 +森永みいう +两滚 +森永みいうofficial +森永みうofficial +森永 +娇妹与 +娇妹与骚猪 +阿里巴巴 +officer +office +家里蹲 +啥玩意 +啥玩意都发 +啥玩意都发的 +啥玩意都发的dontwin +平平无奇 +我的平平无奇 +罗小新 +小缸与 +小缸与阿灿 +小缸与阿、 +小缸与阿 +变变变 +小鱼儿2468 +oldba +uid:36311613 +拥有封禁 +拥有封禁体质的 +月影y +拥有封禁体质的单色调 +月影yancie +让人欲罢不能 +让人欲罢不能的 +让人欲罢不能的胖次 +稚嫩的 +楼下的 +楼下的老张 +自喵星 +自喵星的鱼 +来自喵星的鱼 +ui34210345 +芒 +东方华 +东方华灯 +花少北 +nov +nova +novatang +d.b +d.ba +dogball +幽默童鞋 +小助手 +水木清扬 +uid:345819190 +uid:349991143 +拼死 +拼死攒够 +老七 +拼死攒够1000 +拼死攒够1000关注 +拼死攒够1000关 +精灵宝可梦 +迟到即正义 +老师好 +老师好我叫 +塔拉 +塔拉卖 +塔拉麦 +uid:305803856 +uid:586455 +方块学园 +月影 +月影yanc +东京飘落樱花泪 +酷炫的明明 +av40286290 +av4028 +建桥小学 +建桥 +建桥小学五年级 +小申ap +cmchsc +cm +音习社 +bilibi +南一qaq +是橙星佑 +是橙星佑哒 +橙星佑丶 +橙星佑丶og +橙星佑丶og喵 +uid:96846483 +钢之炼金术师 +钢之炼金术师fa +av30000000 +赫萝老师 +超果果m +波兰球 +liem +传送门 +seanxu +se +某某阳 +樱木世衣 +木世衣 +古都仙 +睡不醒的 +渡鸦12345 +谜之御坂x +吃呱唧 +吃呱唧的 +吃呱唧的猫呱唧 +逍遙散人 +玩你鸟 +逍 +0玩你鸟0 +万里马的 +万里马的伯乐 +玖依 +玖依7 +红色即是 +红色即是正义 +lin'du +vdger唯界 +折紙熊貓 +七七七 +七七七枸杞 +七七七杞 +尤里政委 +胡萝卜的 +胡萝卜的须 +凌刀夜心 +真王无敌 +西瓜jun +二六六 +颜盏 +dz +pac_ma +pac_mac +pac_mac_x +小黑猫 +jan +ba1 +kch +uid:86339189 +绩点不达 +绩点不达4不改名 +pi_ +pi_greco +みま +神楽 +九世 +九世公 +风间幽香 +宁采臣在唱歌 +宁采臣在 +辛夷神说 +瓶子君15 +个岸 +腾讯一个岸 +hr1 +hr11 +hr11apk +岛艾玛 +菌子笑笑 +徐老师 +徐老师来巡山 +超心塞的 +超心塞的十六 +路酱 +星辰 +老局长7890 +千灯引 +mkli +mkliang +cloudyirq +cloudyir +鲤鱼 +锡兰c e +鲤鱼ace +yun +doge +doge_man +浮空 +浮空amazing +喜欢近战的 +喜欢近战的魔法天 +喜欢近战的魔法天才 +鲤鱼a +鲤鱼are +籽1 +两仪滚的 +两仪滚的按摩 +星航传媒 +两仪滚的按摩技师 +两仪滚的按 +卻 +鲤 +辣骨 +o101 +红糖 +o心 +红糖糖 +红糖: +红糖糛 +静听夜落 +夕墨 +哎 +派san +好奇瞎 +好奇瞎翻译 +刘女 +jian'qi +刘- +刘老" +木子-天曰 +北條麻妃 +橱二 +橱二chu +橱二chu2 +利利的小细腿 +小心翼翼 +baishangchuixue +神乐 +yaoyao +雒杭桑 +神乐めあ +妖妖幺yue +妖妖幺月 +uid4017177 +犬山玉姬 +湊-阿库娅 +芳斯 +芳斯塔 +神楽officel +星海天夜 +滚动的天空 +跳舞的线 +小邵 +小邵の大大 +见齐 +野生动物 +氶一 +氶一大魔王 +拭旧忆 +天海 +天海星夜 +fu li fa +fu li f +fu shi +枪声音乐 +粽子_z +粽子_ +粽子_zombie +夏目友人帐 +朋友交易最光荣 
+粽子_zo +粽子_zomvie +jialiujia +睡过 +虎牙汉界 +虎牙 +神游八荒 +桔子鲑鱼 +痛苦 +痛苦pain +上海一夜 +http://www.bilibili.com/video/av20204014?share_medium=android +http://www.bilibili.com/video/av20204014?share_medium=andro +http://www.bilibili.com/video/av20204014? +http://www.bilibili.coav20204014 +av20204014 +吃糖鸭 +a残梦 +a残梦tt +残梦 +萌萌哒 +萌萌哒丶九 +萌萌哒丶九灬书 +柔柔冰冰冰柔 +硝子的 +硝子的助听器 +yuzi +yuziminecraft +g's'y'z +guan'shi +guan'shi'yi'zhan +guan'shiyizhan +冠shi'yi'zhan +tianm丶恶魔 +上官月逸 +用六 +用六硬币换来 +用六硬币换来的名字 +韩小言 +xevoer +ild +籽岷 +olaba +olaba1 +鹿野郎 +尼古拉斯坚果 +灯一啦 +山山 +世 +世界_ +世界_检 +cr +nalan +少年城 +十二月的毛毛 +xianbaob +贤宝宝 +灯灯 +灯灯登 +灯灯登蹬 +柴笔 +灯灯登蹬等 +神秘点 +神秘店长 +fls +fp +bilib +bilibik +sox神教 +sox神教大教主 +shinianmoluli +十年陌路灵归 +不吉波普 +高仿姬 +百舸莫争流 +2019中美合拍 +密子君 +大忽悠 +七悠qaq +某幻 +皇甫汐 +皇甫汐丶 +ale +我d +我d世界 +克利兰 +我d世界letter +我d世界letterrain +异世界魔王 +异界少女召唤术 +终点成 +终点成殇 +uid12216219 +淡青 +淡青色域 +初禾r +初禾ryio +pacmac +黑镖客 +牛逼牛逼牛逼 +允星河yos +社会我 +桥肯 +吾乃叶姐 +喜欢做梦的落叶 +喜欢做梦的落叶丶 +uid:49936103 +星 +yun'xing'h +hu'you +隐匿的山矿 +丰来了 +李魏儿 +十年zen +十年怎么走 +穷开挂 +rdc +rdcgame +rdcg +rdcgaming +羊辰c +羊辰 +孤丶 +孤立 +孤立sama +清浊伊 +重工 +重工特色人 +愉悦的hehe11 +老叶大 +老叶大shi +老叶大师 +老叶 +隔壁老 +隔壁老汪 +有两双 +有两双眼睛 +有两双眼睛在 +有两双眼睛在看你 +药 +solo +吴承恩与西游记 +橙子好累 +遍访名川 +中國 +中國boy +迪 +泡芙 +http://www.bilibili.com/video/av40006753?share_medium=android +av40006753 +武局 +武局启东 +大j +天綠 +天綠君 +陈文 +摄影师陈文 +摄影师陈文坚 +摄影师陈 +xiao c s +xiao cao +小草梅 +小草莓sh +小草莓是不是xuao +小草莓是不是小草桑 +小草桑 +摸着良心的 +摸着良心的说 +大英帝国球 +uid:34288358 +未志 +未志铭 +耳击bxx +耳击b +繁吹雨 +bili_54624048346 +樟脑丸 +脑丸 +章 +寧三九 +羊小星 +蓝棠儿 +uid:31596887 +冰羽 +冰羽虹 +500ml容量瓶 +linj +邻家的x +邻家的xixure +邻家的吸血鬼 +邻家的吸血鬼小妹 +邻家suo +邻家索菲 +av33166665 +平溪 +平溪先生 +摄影师做 +摄影师钟xiao +摄影师钟晓胖 +摄影师钟 +憨么子 +江ren +cz331 +cz3354 +tannin +tanninetyfour +tanninetyf +信誓旦旦 +噗啦 +噗啦woofy +辉夜大小姐想让我 +辉夜大小姐想让我告白 +熬 +番茄 +jabbc +jannch +老番茄 +imzooy +imzooye +飘 +萝莉控 +bao'feng +暴风yu'zhi'zhong +暴风雨之中 +暴风雨之中de'ning +暴风雨之中的宁静 +川先森 +雪听 +雪听幻 +三无m +见齐天 +幸运的 +豆子半仙 +隐形守护 +隐形守护者 +月东 +千叶 +次元饺子 +san'wu +月比 +月比天高 +月比天高1 +无情之殇 +凌兰雨璃 +傻瓜 +傻瓜卷发子 +世界bb +我是papi +黑喂狗 +黑喂 +吃货明 +不可名状之烷 +硬的要死聚丁烯 +回不去的 +回不去的被砍 +夜游子 +夜游子ne +吴织亚切大 +quan +犬pian +天降精灵 +uid:34798239 +吃里 +吃里趴外 +you'zai +悠哉日常 +悠哉日常的wang'qiu +悠哉日常的旺shu +悠哉日常的旺叔 +晏暮 +鱼籽 +晏暮酱 +鱼籽_ +鱼籽_sherroe +uid:57427709 +豆豆子 +小仙若 +酷睿 +五歌 +kann +康娜 +uid345819190 +杰里得 +杰里 +杰里德 +仙剑奇侠传 +jard +shi'wei +食wei +食味阿远 +怪异君 +郭杰 +齐 +jann +jannie +uid:1850091 +橘红聚合 +带北极星 +带北极星小姐姐去 +带北极星小姐姐去上海 +宸宸宸 +b's +zearatul +地球颜值担当 +双琴侠 +地球颜值担当总代表 +kan +k +陆婉 +陌上凌燮 +lemon_null +kanna_vtuber官方频道 +kanna_vtuber +威尔逊不是 +fesb +fesbettani +fes +lmintlcx +lcx +故剑情深 +uid:353912636 +嘛咔叭咔吼 +雲墨蘭 +uid2 +uid171675923 +r童趣 +百度r +百度red +跳舞的 +祖沫尘 +心语安 +心语安― +心语安―安 +帝国元首 +超级xiao +超级小 +祭焱or祭三火 +hk茶杯 +宁采臣 +所以陈文坚 +bili_7996465223 +vlog- +vlog-177 +我是天天天天 +我是天 +我是天天天天天天幻啦 +我是天天天天天幻啦 +hello +mute +mute——liu +mute— +mute—— +xin'yuan'jie'yi +新垣结衣 +逃避sui +逃避虽 +乌鸦子 +乌鸦子ct +uid:195664727 +弗林fang +弗林坊 +熙夜 +熙夜今天 +熙夜今天丧了没 +超格瓦拉 +uid:108540180 +奶昔 +朱日和 +合格的搬运工 +uranusian +潘大帅 +潘大帅在减肥 +怕上火 +怕上火爆wang'lao'ju +怕上火爆王老菊 +chou d +玖绫aya +电子像素 +zu'zhi'was +zu'zhi'ya'she +吴zhi +中国知网 +http://www.bilibili.com/video/av45651829?share_medium=android +http://www.bilibili.com/video/av45651829?share_ +http://www.bilibili.com/video/av45651829 +喵茴香 +极客geek +跃迁引擎启动 +fesbe +hellobeastie_ +百慕三石 +米丶 +米兰主 +米兰 +ena +有栖息地 +凑 +兰若 +tarsko +腾易游戏官方 +侵害 +侵害香蕉君 +欧利 +天然小 +天然小槑 +天然小槑菌 +阿神的 +璇咩 +花空烟水流丶 +uid:35729600 +你脑袋进咸鱼了吗 +k's'k'l'h +k's'k'l'j'd +快上课了觉得 +kaana +凑, +ka +灵月瑶 +圣翔 +路人我 +路人我姓 +路人我姓丙 +园田海泷 +园 +久保 +久保小鹿 +久保小鹿字幕组 +楠田帝国广宣部 +楠田 +一f +一翻 +love +llove +lolve +西木野 +西木野千恵 +shishi +失禁风 +小黑 +霍永 +井溪 +merjoex +粽子g y +粽子工艺 +黑川 +黑川爱chito +黑川爱吃肉 +柔柔冰 +第十人 +你脑子进咸鱼了吗 +祁七 +蓝棠 +科洛弗电子 
+肥鱼 +uid:95205039 +突破天际的 +突破天际的金闪闪 +突破天际 +诸葛琴魔 +bai'chi'ma +bi哩 +bi哩bi +bi哩bili +bi哩bi哩wang +bi哩bi哩汪 +穷li +吃jie +加藤惠 +加藤 +吃節操 +磕了假 +磕了假糖的白糖 +教主小 +教主小血 +快乐的 +快乐的大大 +快乐的大大翔 +老墨 +老墨咖啡馆 +uid:306862878 +勾指 +q君 +ma'li +maliao +二阶堂造极 +二阶堂 +造极 +winks +wink +陆 +芒果斌 +芒果bing +狐妖 +三叔 +养过 +yang +头部 +头部尚书 +av45714738 +【炮姐/amv】我永远都会守护在你的身边! +xiangxiang +冰绎‘’ +菊菊 +翔太 +快乐的大 +快乐的大翔 +u306862878 +麦乐送 +xiaokeer +xian +大祥哥来了 +下给你 +xiang'xiang +大祥哥 +无聊的 +邻座boy +月色真美 +比宇宙更遥远的 +比宇宙更遥远的地方 +比宇宙更远的地方 +某科学的 +某科学的超电磁炮 +某科学的超电磁炮、 +福乐xiao +戴荃_悟空 +戴荃 +无聊的霍比特人 +见其 +萝卜吃 +m正 +burning直播 +手残联盟 +ruodian +弱电ty +弱电统一nl +弱电统一nli +弱电统一nen +弱电统一能量 +xudax +xudaxi +xudaia +xudaxia +请容我想一想昵称 +请容我想一想 +钟饮 +钟饮- +李羽 +李珩羽 +uid:330720805 +苏小包 +闲不住的苏小包 +-eassys +eassys +我的方法 +我的方法cos +wodef +wodefan +沫云君 +av4118165 +xiami +xiamidelei +虾米的泪 +kevinz__ +kevinz +python +影视剧风· +影视剧风 +摄影师陈文键c +鱼人 +鱼人阿烽 +牛逼牛 +我条绝 +我条绝超可爱 +古模点 +辉夜月 +奥雷 +潇湘o +试吃估价十万元一只的鲜活白玉海参 +派三叔ipython3 +shi +林晓 +林晓蜜 +粉丝第一 +播放量最多 +简单的快乐 +简单的快乐love +ntr +ntr_b +ntr_ni +ntr_nightrav +ntr_nightraven +小徐的 +小徐的部落 +松鼠打不 +uid:315811683 +领 +老司机 +柳青瑶 +apricity +朱碧 +朱碧の利 +朱碧の利库 +朱碧一 +uid:4447943 +老苏头 +异界型 +https://www.biliob.com/author +he'luo'lao'sh +荒唐 +loser +uid:7066012 +av39300549 +bai'xiao'ye +白小夜Dream +bai'xiao'y +bakabyj +baka +baka板鸭酱 +英雄lian +英雄lianm +ob一下 +二次元l +二次元老肥宅 +史上最长 +史上最长报站 +奈克斯 +sdvx +一言 +一方通 +一方通行 +虚拟 +泡芙miao +泡芙🐱 +泡芙喵 +牛奶水 +牛奶水母 +哔哩哔哩番剧 +派三 +tmreix +mmmmmoko +liyiliy +liyiliyi6 +行星发动机 +帕累托最优 +帕累托最优ruc +av438515 +av43851531 +出山章鱼哥 +魔法 +抽疯鸡 +守望先锋 +守望先锋cg +av48738 +av4873874 +食肉佛陀 +c酱 +zhao +Anime +AnimeTama +阿葬 +米米 +萌动 +郭乐乐 +郭乐乐henluli +fu linfa +fu lin fang +拂lin fang +拂菻fang +小弓口 +阿掌 +浩 +小阿zhang +chao +chao'guo'g +chao'guo'gu +chao'guoguo +超guo'gu +吃货q +chai'qu +李ling +李令羽 +元气少女麻麻 +无才sh +泡芙jiang +pfj +泡芙jia +泡芙酱d +https://www.bilibili.com/video/av45954571/ +av45954571 +一大碗xuegao +一大碗雪糕 +肥宅雪糕 +y'l's'b's +hubaotime +huhaotime +huhao +女流66 +阿神的bili +阿神的bili官方频道 +50571- +av3061232 +av456930 +sof +sofr +sofrinio +av308040 +av534147 +syrax +sayrax +syr +syra +tao'y'l'g +桃源恋歌 +yi'nü'jie +乙女解剖 +白日鱼鱼 +kanon阿狸 +神乐mea +t'y +八重樱 +av42250009 +崩坏三 +崩坏三di'yi'ou'xiang'ai +最后一课 +雨o +雨l +雨li'ma'p +大忽 +wn +腐女 +腐女菌 +大不 +大不6 +一只桥 +喵星人抢不到 +无形切 +糖人家 +进化电子 +猫片 +真凤舞九天 +凤舞九天 +碧蓝 +我的 +雅达利寻剑 +fulinf +ai'y'y +我mai +我买下了 +av81 +暗猫的 +有栖m +有栖マナ +黎大熊d +黎大熊丶 +斑出山 +火影/宇智波斑】出山 +fu'lin'fang +fulinfang +理娱打挺 +理 +理娱打 +lc电竞 +老四gan'hai +老fan'q +占用 +http://www.bilibili.com/video/av45114446?share_medium=android +http://www.bilibili.com/video/av45114446?share_medium +http://www.bilibili.com/video/av45114446?share_mediu +http://www.bilibili.com/video/av45114446?s +http://www.bilibili.com/video/av45114446 +yzr +有人某人 +爱做饭de +av45024297 +葡萄与酱汁 +游戏bug +游戏bug集锦 +bak +忽悠 +no'r'ri's'j +norris +norrisj +hua'hua'yu'san'mao +花花与san'mao +花花与三猫catli +wuzhiyaqie +adcmangguo +ad +timecompass +shis +xiaoz +xiaozh +小猪g +小猪guap +小猪挂奇 +黑 +拂lin +拂菻 +拂菻fanh +我的三体 +世mo'ri +世末歌者 +赫萝ls +赫萝 +stn工作室 +可意ke +可意keyl +lks- +-lks- +德丽莎丶 +德丽莎丶观猩 +lu +鹿qi +鹿q +鹿岐 +鹿岐了echee +鹿岐l +鹿岐leechee +zhi'n +幽灵子辰 +虚拟次元 +仙女 +虚拟次元计划 +迪哥 +迪哥闯世界 +uid:27996286 +vv11 +死侍 +小球小 +pf +泡芙ji +泡芙酱 +泡芙酱daze +美食作家王刚 +美食作家王刚r +信小条 +kanna_ +vtuber官方频道 +花花 +今 +今小 +今小晴 +敖厂 +起小点 +起小点是大腿 +陆婉莹 +fulingfang +十yin +白给赖 +:345819190 +ui:345819190 +女流6 +女流 +女流录 +tiainwenj +天问酱 +大忽悠怎么了 +关于我转生 +关于我转生成 +关于我转生成史莱姆 +yingxiongli +doat +doat2 +占用ge'we +q'x'di +摄影师陈文坚持 +王yi +王怡苏 +视频大拍 +视频大拍档 +dgc +小希 +ai哟 +ai哟喂 +a哟喂 +meng'ai +萌爱m +dgt +点个头就好 +政 +泛 +可乐神 +陈tya +陈雅 +gaige +时之 +pap +路过-u +路过-ing +tuo'qi +椭qi +xiao'chu +赤九 +xi'si +浩克 +巫托邦 +loushangdelaozhang +吴 +link +link6n +link6nk +小猪姐姐 +小猪姐 
+小猪姐姐zzrel +一只伍叶 +罗某人 +新番《》 +新番《家有》 +新番《家 +新番《家有 +新番《家有喰种 +av4 +av452 +av45233629 +齐木楠雄の +齐木楠雄 +赵德汉: +赵德汉:我一个 +【声优】 +【声优揭秘】 +av3 +av39589 +av39589618 +av39589610 +av39589253 +罗某人kakkoii +卡敏与 +人故时代的 +时代的 +远古时代的 +m子 +virtual +舵北鱼 +feizh +高纯度菊粉 +一大碗雪橇 +树根longmen +av456960 +中华铄金娘 +戒猫协会 +叫我小佐伊 +泠音工作室 +泠鸢yousa +2019.3.9虚拟主播测试直播实录 +游戏实况(红娘大师) +泠鸢yo +忍不住 +忍不住指点一二 +我是你的 +我是你的大哥大 +liu'lao'shi +英雄联盟 +路过- +零九 +杆菌 +超鸥 +超鸥列兵 +超欧 +超欧列 +超鸥列 +小黑的 +短的 +d调 +机智的猫君 +叫我 +du'du +嘟du'du +甜辣味の猫娘 +甜辣味、 +甜辣味 +li'yuan'jun +力yuan'jun +塑料 +猫娘 +t'la +mafu +ti'a +tian +甜辣 +蔓 +cha +av34239224 +manguo +manguob +资深小 +ti'na'la +你的小美 +mord +mordon +mordonfree +mor +mordonfr +jinghan +jinghanqing +时空小 +游戏中心 +bilibili游戏中心 +bilibili游戏中心官方 +独立菌 +哔哩哔哩yo +哔哩哔哩游戏 +第五人格 +网易第五人格 +zhouzhoug +周周尬聊 +bilibiliyo +哔哩哔哩y +aishi +艾shi +艾什ahe +艾什ashen +couaku +湊阿库娅 +ユウヒ悠阳 +糖水莲心 +泡芙喵- +欣yi c +黑biao'ke +meng'hui +めあ +九筒ninetone +九筒 +张en'en +张恩恩 +blg +bilbili +会员购 +想做谐星 +想做谐星的大雄 +想当谐星的大雄 +某幻君 +夏実萌恵 +ena에나 +兰若_re +doffy大魔 +doff +极客船geekship +小丸子君 +littlez +littlezy +littl +little +纯白 +纯白p +白p +純白p +伊水 +伊水_ +伊水_u +伊水_uryan +灯塔 +灯塔生物 +hua'den'yan +波流 +资 +三叶草 +逻辑 +逻 +深 +神之· +神之 +神之逻辑 +天恶 +天恶之 +小牧 +j an +j an n chi +从零 +456930 +佳茗 +跑飞 +泡芙酱da +杨草原 +杨超元 +杨元 +杨草 +行销体 +新萧条 +信萧条 +度人 +王ni +听wo +听wo姐 +听wo姐说 +花栗鼠 +花栗鼠__ +魔宵さきゅ +刘木子maggie +陈皮诺pino +wansha +玩啥you +大胃王 +云韶国风 +某路过的 +某路过的红马尾 +uid.211793117 +uid11793117 +uid:211793117 +的十年 +av46015569 +痴鸡小队 +links +linksphotograph +av46067452 +美食基 +黑皮 +黑皮脑规律 +黑皮脑回 +黑皮脑回剧 +黑色脑回剧 +积木有点苏 +uid:27399037 +咬人猫 +doffy  +doffy大  +重庆  +31137138、 +guolele +guolelehnl +紫颜-x +紫颜- +紫颜-小仙子 +紫颜-小仙 +大莉誉 +大蓝誉 +力yua +uid:375504219 +虚拟宫宫official +南道 +周二自习室 +zhouerz +bili_5306578953 +https://www.bilibili.com/video/av36458832?from=search +chunhe +chunhei +yunzi +云zi +云紫 +云紫S +云紫SW +云紫Sa +云紫Sam +云紫Sama +ch +chu +chun +chunj +chunje +chunjei +春节 +春 +chunh +yunz +huo +huol +huolo +huolon +huolong +huolongx +huolongxi +huolongxia +huolongxiao +huolongxiaoc +huolongxiaoca +huolongxiaocao +fe +fen +fengs +fengsh +fengsha +fengshag +fengshage +风沙哥 +风沙g +风沙ge +29752368av +av29752368 +你可曾见过这些虐心游戏?iii +mouhuanjun +刘航de +hong'shu +呆呆 +爱睡觉 +ai +hong's +艺术!ji +艺术就是 +艺术!就是 +av36458832 +mouhuan +宇哥到处跑 +雨哥到处跑 +逆浪千秋 +xid +我是hanhanhanhan +我是涵涵涵涵啊 +我是涵涵涵啊 +我是涵涵涵 +兔叭咯 +哔哩哔哩youxizhon +av46169628 +嘟 +嘟都督 +忘卿 +uid:911757 +忘 +ui +uid: +uid:911757 +tie'g +火锅大王 +onyk +av46281123 +死亡笔记 +av1336126 +yu't +战du +毒老师的bb +毒老师的bbb +毒老师bbb +毒老师 +毒老 +lei'xian +隔壁老王 +佐伊 +u21662135 +uid21662135 +佐佐佐佐佐佐佐佐伊 +老男孩 +麦爹欲 +麦爹欲上 +麦爹欲上半藏 +麦爹 +uid:268104 +wnm +猪qiao +猪qi ai +av46092553 +连载动画 +饭局研究 +fanjuyanjiusuo +番剧研究所 +会做饭的 +会做饭的芋头 +落雪 +磊哥 +磊哥游戏 +猪骨莲藕汤 +av35520476 +芋头sama +瓶子君 +na +naru +naruex +nar +uid:317720034 +花少 +花少bei +柳青瑶ben'zun +av11348047 +青瑶 +"琵琶行" +yanj +岩浆苦力怕 +德井 +蜡 +快乐的台长 +快乐的台 +da'la'b +达拉崩 +达拉崩吧 +z新豪 +-xin +-欣 +-欣小 +chenfe +chenfeng +辰风tz +鹿岐lee +南北巡礼 +南北巡礼日记 +圆芳,你 +圆芳,你怎么看 +圆芳 +手速超快的初总 +hhmks +震惊!你二爷竟穿越一代! 
+鹰目大人yahoo +抽风cr +抽风cra +yingmu +鹰目 +av37249363 +haiwan +fatetianzhibeio +fate天之杯 +命运之夜——天之杯:恶兆之花 +命运之夜——天之杯:恶兆 +ss26703 +命运之夜 +某路过 +< + + + + + +喵头汪 +花粥 +精分实验 +兔er +兔er猫 +xianbao +贤宝宝b +贤宝宝bab +任天堂纽约 +领事之声 +key丶 +key丶浅陌 +av1049 +弥见 +弥见其新 +mr.li小哥 +黑凤梨 +黑凤 +周二自习 +圣剑少女 +真的李小新 +老邪说 +老邪看 +老邪看电影 +华农民 +mel +melos +melo是个hao'ha +melo是个好孩子 +you'guan'sahng +油管上 +av34794811 +av42807128 +ket +kukejun +kutejun +夏色祭 +芳si +芳丝塔芙 +fa m g +fang +芳斯ta f +白帽酱 +uid:32890488 +镇南w +千凝雪ptzpanzer +fei'zao'jun +肥皂菌 +paddy我可以 +bilibili薇尔 +庄不 +庄不纯 +vz +hai +海绵homer +w'y'f +qian'wo'de'yong'sh +欠我的用什么huan +欠我的用什么还 +hei'feng'l +av24929108 +xin'ha +山下智博 +逃跑吧 +顾 +djc +夜骑 +洗好厉害 +佛菻坊 +hl +l'x'z +l'x +l'xian +女王 +女王泡面 +舞秋风台 +土味  +土味兽 +四角切圆 +pimiany +皮棉爷 +zeye +小小小 +小小小猫 +小小小猫仔 +三水君 +mr三水君 +辰子晨c +he +月影Ya +月影Yancie +av42494734 +方块学 +洗红菠萝 +西红菠萝 +鸟爷 +冷鸟丶饲养员 +chaiqu +柴犬lao'wan'xi'z +柴犬老丸洗澡、 +一只nan +一只南 +一只南yin +一只南音啊 +一只南音 +https://www.bilibili.com/video/av46463543 +av46463543 +厂长扬言 +优酱是咸鱼 +小马在 +人民日报 +{{search_text}} +uid:404273567 +netomint +nekomint +nekomint猫 +nekomint猫蒲何 +nekomint猫薄荷 +能猫fei'mao +能猫非猫 +zim +超静定 +超静定拱 +一个视频让你彻底了解动画是怎么做出来的——anitama解新番特别篇第一期 +一个视频让你彻底了解动画是怎么做出来的 +bi'shi +波喵 +fulinfan +拂菻坊 +邦邦 +邦邦灰 +邦邦灰小 +邦邦灰小兔 +x调查 +shaung +shuang +明月短松 +明月短松扣舷 +明月短松扣舷独笑 +明月短松扣舷独 +渔农小 +清半 +清半夏 +清半夏lynn +周患者 +晴老板 +jackforever +井号 +井号5467 +bigfun +在b站为快手正名的我是如何成为up主的 +shuangsheng +kuro战 +kuro战双帕弥什 +2y +2yi +yi'yi +以逸待劳 +逸2待劳 +逸2yi待劳 +逸2以3待3劳 +逸2以3待3劳3 +gon +gong'qimg +gong'qing +2yiyidai +2y'y +2以逸待劳 +2待劳 +2yi待劳 +2逸3yi待劳 +2逸3以待劳 +2逸3以3待劳 +baish +bilb +bilbiili +bilibili弹幕网 +bilibili弹幕 +艾云的huangzua +艾云的黄钻 +lizi +火娃山岚 +火娃 +火岚 +剪刀手栗栗子 +ltyl +圆一 +圆一reaction +运动女孩研究zhon +运动女孩研究 +运动女孩研究中心 +徐大 +泠音 +泠音f +colin +脏 +zang +av45681549 +惊奇队长 +官博娘的 +官博娘的后花园 +pa'shagn'h +怕上火bao'wang'lao'j +pa'shagn +爬上火爆王老菊 +爬上火暴王老菊 +还有一天就 +瞬间爆炸 +瞬间爆炸型 +倒悬的 +mr。 +wzy +小m +空耳组 +呜喵王 +小m五米瓯网 +https://www.bilibili.com/video/av10814352?from=search +10814352?from=search +灵yi +灵依 +https://www.bilibili.com/video/av46339814 +av46339814 +凉风ka +林深bu +林深不知归处ye'w +林深不知归处也 +林深不知归处也wu'ni +林深不知归处也无你 +化学老shi +化学老施 +himehina +田中姬 +田中姬铃木雏 +大xia +大侠哥 +田中姬铃木雏offici +av44175308 +h萌 +text +dalao +dalao面条桑 +不支持啊 +かぐ +小豆蔻 +oelppo +mr,lemon +mr,lemo +mremo +mr.lemo +黎da +黎大熊 +飞机君 +yun'xing'he +lov +魔魔丸 +Vinheteiro +liang +liangfeng +liang'f +托儿索 +托儿索求关注 +uid:32782194 +uid: +uid:uid:32782194 +huiz +huizhagn +xing萌 +xin萌 +昕小萌 +windbe +windbell +windbellsa +windbellsakura +允星河Yoseya +王mie +王咩 +王咩a +克蕾雅 +陆婉莹godriku +acicfg社区 +acicfg +uid:358422 +uid:238031529 +eric +国电电 +国电电力 +国电武术馆 +ericph +ericphilips +三天两觉 +三天两觉是也 +慕蓉 +渔人阿 +蓝殇恒海 +b1ue +b1ueagle +老番茄中国 +未末川 +happyteo +和罗 +bai'shi'zhuang'jia'hce +bai'shizhuangjiahce +败式装甲车 +败式装甲 +fu'li'f +ping +pingzi +pingz +败 +小式式 +一之 +秋豪·1 +瓶子ju +ping'z +lei'si +蕾丝 +性ba'sa +性芭莎 +性芭莎拉 +uid:345496332 +老兵和大黄 +naixiao +suelen +suelence +3bro +3brown +3brown1blue +3blue +咸鱼嫌 +uid1792530 +cho +咸鱼嫌鱼 +咸鱼嫌鱼娴 +玩偶君 +瓶子君1 +玩啥 +小g +纳兰l'ch +纳兰li +纳兰流畅 +纳兰流觞 +国电w +wu mi +小呜喵王 +小mw +小mwu +小m呜喵 +小m呜喵w +小mwu miap w +claw +claws +川同学 +约谈 +h萌视频 +shuainiyilian +kuangrenshiyansh +av44431501 +绝境之刃 +av45456326 +乙女jie'p +de'co +cey +leec +wan'ou'jun +xy散人 +xing'ba's +xing'ba'sha'la +性ba'sha'la +性ba莎拉 +paofujian +paofujiang +王思 +南宫若馨c +住在狼穴的羊 +vtuber +韵律源点 +韵律 +韵律源点a r ca e a +韵律源点arcaea +柔柔冰bing'b +信誓蛋蛋 +outlandish +outlandish异样 +uid:872507 +wu du fan +【描改手书】【meaqua】卵とじ(附字幕) +si +coli +lu guo yi x +齐天大肾yu'xiao'sa +zi'yin'feng +不正经lao'si +无毒fang'xin'nai +哔哩哔哩国创 +晨上chu'yu +lab +la +碧落jiu'chong'jian +blog +10hou +信誓da +越 +展 +展展 +展展展 +展展展不开 +uid:689904 +葡萄与 +神楽mea +神楽めぁ 
+神楽めぁofficial +凑阿 +凑阿库ya +凑阿库娅 +凑阿库娅offical +凑阿库娅offici +凑阿库娅official +查理s +查理star +查理sta +查理stat +发条 +田中姬铃木雏of +田中姬铃木雏offi +吃遍天下 +大掌柜 +overid +10h +10ho +10后zh +特效小哥 +g s y +s b z l +av3064189 +sofro +智能姬 +哔哩哔哩智能姬 +xin +信sh +信shi +xing shi +xing shi dan dan +xingshidandan +xinshidandan +xin shi dan d an +月光x +月光下de +月光下dewe +白星 +白星绿茶 +白星绿茶仔 +划 +刘哗 +刘哗电影 +喵喵折 +jachine +jachinesiqi +jachine思齐 +jannchie思齐 +路人豆腐铺 +灵能百分百 +lia +兽娘动物园 +倒吊人 +赛博空间的倒吊人 +uid13318842 +liang'xia +jiao'lü +蕉绿 +蕉绿不jiao'l +蕉绿不焦虑 +地鼠 +jiao'lü'bu'jia +蕉绿bu'jiao'lü +凉 +郭、 +xian'bao'bao +下采桑 +癒月巧可official +解说dj +解说dj哥 +泰蕾莎 +凉下采桑 +雪胖胖 +雪胖胖家的xiao +雪胖胖家的xiao'yin'yin +欣xi +欣小me +ling'feng +聆风 +旦旦 +圣tu'sh +杨可爱 +杨可 +uku +流绪 +女装大佬 +黄老师 +才不是 +天 +桃 +聪 +若 +lao si +lao si gan +与酱汁 +大事件、 +大事件 +是大腿 +是大腿秀 +shi'da +wan'sh +av26408754 +比比目鱼 +pceva +pceva评测室 +紫叶相思 +jannchile +dj哥 +ccfst +av46863635 +a46863635 +这是他的uid:306414867 +uid:306414867 +ui306414867 +uid306414867 +我的yin +我的英雄学院 +范式 +范shi +fan式 +yose +允星河yo +xing'h +gan +ganhai +赶海的laosi +赶海的老四 +jannc +jannchi +老四ga +老四gan +guok +guo +果壳里的mo'mo +果壳里的墨墨 +凉风ka'z +著小生 +祺祺 +keyki520 +keyki52 +六花 +xi'ba'li +yi'ba'jin'zhan +yi'ba'jin'zhan'dou'bu'gei'liu'hua +一把近战都不给六花 +悟空八戒 +wu'kong +寒冰 +寒冰屮 +黑塔利亚 +av45080707 +逗逼少年 +逗逼v少年 +洛天依,言和 +原创《得过且过的勇者》 +洛天依,言和原创《鸽子》 +wi +欣xiao +欣小meng +牛小咖 +lo_ +lo_c +睡不醒的ka +睡不醒的咖啡思 +十六字零 +十六字令 +十六字令hu +黑衣无名 +无限病房-锲米尔 +https://www.bilibili.com/video/av46861707 +av46861707 +av45581245 +一个凡老师 +uid:55388028 +av46043936 +du'shi'xiao +独食小哥 +召 +呆大您 +假美食po +召yuan +呆大 +召远hao +呆大年 +呆d +虎烂大王 +本杰里mouyu +本杰里mouyun +av46923868 +av43628275 +heiyebaitian +uid:36030372 +dgv +du'shi +dgv麟 +不等同学 +av46607486 +a5224836 +av5224836 +icha +icha夏芒 +夏芒 +nicel +nicenlk +五五开六小龄童 +五五开六小龄童伍六七 +落寞的牧之 +海格hefg +海格heg +海格hege +海格 +伍迪嘿嘿嘿 +niceの +niceのn +伍迪-嘿嘿嘿 +兔群主 +凑a +niceのnlk +uid99971625 +凑a'kua'ya +凑阿库 +av46383500 +uid:19940729 +叶枫 +叶枫j +叶枫jun +湊-阿库娅officia +叶枫君c +kanana +tu +蔡徐坤 +willtv +av29426569 +av38622052 +wil +willt +爱网着 +最少六个字 +爱网 +爱网者 +av19270958 +a'v +av31050 +绯红之锟 +av31056646 +v19270958 +wa +watyfges +10bu le +10bu la +10布莱恩 +flashyami +zai'xia'xi +极客库 +av26175316 +寒影 +j君 +j君的 +j君的迷你屋 +锦堂sheng'hu'k +锦堂sheng'huo'ki'b'g +毕导 +yong'hen'ai'hen +毕导dao +毕导导 +永和 +永hen +清华小d +永恒ai'h'ne +清华小d哥 +永恒爱恨 +cabs +暖心凯 +老班长 +暖心凯同学 +老班长a +av2996595 +av29965954 +林林零七 +av466 +av4660403 +av46604036 +av44843496 +那是清华kai +那是清华开学的第一天 +every55 +zai'xia'xiao'su +every5 +ca +每天都在 +av46956979 +bu'm +每天都在努力的雅九晏 +bu'miao +不可描述 +av46136085 +av22239708 +鱼奋逗 +鱼奋逗moto +n4nke +gamesp +gamesprout +zai'xia'xia +烟雨平生 +av46253268 +刀塔自走棋 +兴趣使然的 +兴趣使然的王m +tangc +兴趣使然的王某人 +糖醋骚pai +烤糊的糖 +av46682 +av46682408 +av46788886 +av46945927 +av45456858 +ci qi on +词穷nan h +词穷男孩 +w i +米拉s +盐取 +盐取shiot +盐取shiotori +av46948378 +zx柚子 +苏打热可可 +小缘 +qi +千语 +kyuubeeowo +yu千 +za +在下小若叶 +阿刘 +masterliu +阿刘_ma +阿刘_masterliu +av45768693 +uid:5062996 +https://www.bilibili.com/video/av45768693 +【8848】rap:我不仅有8848我还要rap一曲 +https://www.bilibili.com/video/av45639685/ +av45639685 +45639685av +韦伯小王妃 +韦伯王菲 +韦伯王妃子 +韦伯王妃 +av44370173 +斯帕克s p +斯帕克spaak +140阿xi +140阿昕 +av45157932 +lfcj +lfc利物浦 +利物浦 +kover +kover様 +吾行风 +yanglaobo +杨老伯 +杨老伯_xxm +杨老伯_xxmie +wl +未来akira +av46153703 +不gugu +不咕咕的weila +不咕咕的未来 +不咕咕的未来akiraxian +不咕咕的未来akira先森 +马里奥开始啦 +yang'lao'bo +知味人生meishi +知味人生美食研习社 +知味人生 +10yen +ma'li'ao'kai'sghi +10ye +dabao +dabaojian +大保舰 +av40809905 +猫lai +猫来 +allfan +av46960826 +allf +bili_7007166550 +10yen'= +av44558664 +v44558664 +https://www.bilibili.com/video/av6162326 +av6162326 +av45396595 +https://www.bilibili.com/video/av47021993 +av47021993 +av46759749 
+rndxie +rndxi +rnd鞋cha +rnd鞋chao'liu +rnd鞋潮 +s'k'y'wang +skywang666】 +10hop +10后zhaorendaio +小巨人zhu +kuokuodan +阔阔d +阔阔蛋 +chuanto +川同学buch +川同学不穿tongxiu +av46898319 +切丝j +风爽stns +kuang +狂a +狂阿 +狂 +切丝菌 +爱解说的 +爱解说的k +爱解说 +somnolibrar +somnolibraria +tusea +av45516191 +r5-263 +r5-2600 +r5-2600和rx +xinxiaomeng +r5-2600和rx590 +中国bo +av8940052 +ri +日食记 +fuliyexiaol +傅里叶小笼包 +萌萌de +萌萌的萌dare +萌萌的萌大人 +圆肥bai +圆肥白 +小胜 +小胜解说 +小胜解说吖 +xiaobaic +小白侧破 +小白侧 +cxi +大米pi +机智的dangm +90后 +wonderimagine +电影zui +ji ke wan +xiaoying +小yin +王盖伊 +小隐soyyo +小隐 +test +omg +rnggrenjs +rng个人解说 +考研】 +考研 +ai归来 +/av46236510 +av46236510 +『看完脑醒』资本寒冬下大学毕业生出路骤减! +mi'mi +mi'mi'y +眯眯眼的f +眯眯眼的 +眯眯眼的frozen +眯眯眼的frozen弗洛城 +『看完脑醒』资本寒冬下大学毕业生出路骤减!‘ +虎爸mao'ma +虎爸猫妈 +虎爸猫妈tf +mickywo +mickywokstv +不是富婆 +不是富婆树 +dove +dov +fu'po +富婆 +东北da +东北大鹌鹑 +东北大鹌鹑工作室 +大鹌鹑工作室 +吴you +insdo +根大shuo +根大说ji +根大说机 +三个盾勇 +三个盾勇成名 +三个盾勇成名录 +三个盾勇成 +三个盾 +轨迹君丶 +wowgear +av43050582 +五毒 +五毒巫毒 +石榴 +渡边kezin +渡边kezinc +渡边 +渡边kez +uid:105519472 +:105519472 +wan zi jiu s +丸子就是 +丸子就是ke ai +丸子就是可爱 +av1049743 +xiao l +小霖 +小霖q +小霖qi +普通disc +普通disco +xi li +xi l +溪流 +溪流li +溪流liao liao +溪流寥寥 +xiliuliaoliao +普通辩论学 +xi liu liao liao +辩论 +辩论搞事情 +辩 +卢卡斯 +卢卡斯的 +树小罐 +http://www.bilibili.com/video/av44747447?share_medium=android +小木匠 +小木匠_jw +英雄辩的力与美 +鱼王澎澎 +av38597033 +av44987215 +欧阳狗蛋 +ou'y +vr小杰 +小杰 +某末影萨斯 +j喘 +jiao喘 +koool +av46431843 +kooolamarch +一闪 +av44422359 +blin一闪 +bling丶一闪 +敖厂长 +Date +Date_ +D_A +DATE_A +DATE_A_LIVE +uid:28759879 +shasi +scry +ch明明 +火性老王 +企鹅 +这是一只企鹅 +这是一只企鹅_ +一棵一珂 +一棵一珂_ +一珂 +めいど +めいどうp +メイドうp +av3280731 +av46960542 +av42684316 +reirei-エル +徵羽君 +av1259176 +av39877697 +the +the阿磊 +透明人間unvisible +爱吃肉的 +爱吃肉的三了 +爱吃肉的三了三 +爱吃肉的三了个三 +北员 +北员233 +琳工头 +http://www.bilibili.com/video/av46763730?share_medium=android +av46757336 +即影 +即影instamour +南门er +南门贰 +xiao'mu'j +小木匠—— +jojo +jojo_jw +shu'xiao +天海佑希 +肥皂解说 +肥皂 +spy林允儿 +张网红 +张网红_ +tm- +稽泽君 +稽 +大小眼 +大小眼秀篇 +大小眼独秀篇 +大会员 +【考研】考研英语词汇5500词视频讲解 +lex磊磊 +傻哥y +lex磊磊磊 +傻哥有话说 +万词 +怪客little +怪客littlechen +怪客little陳 +av45744068 +刃下狼血 +怪客 +视频大 +一花依世界 +12270792602671572405 +lan'kun +南卡卡 +南卡卡ka +南卡卡k +兮寂pi'p +youyinjun +mog +蘑菇shi +蘑菇视频 +mogu +蘑菇 +南me +南门yu +南门玉 +av42731269 +周六ye +周六野zo'e'y +zhoul +唐独秀来辣 +唐独秀 +唐鹿鸣 +bili_87043711684 +弥天少 +弥 +比别人 +园艺s +园艺莳家 +园艺 +z'y +真轶星 +默哀为谁 +火神 +火神烈云 +apex风男 +武装炼金 +wuzhuanglianjin +武裝煉金 +av466652 +av46665262 +珂朵莉 +珂朵莉加油 +a46665262 +木之咖啡 +木咖啡 +av46501981 +机智的胖 +机智的胖宇 +king西瓜瓜 +yoosh +大和藻虾021 +大和藻虾0212 +https://www.bilibili.com/video/av46703535 +无铲车间主任 +ed +uid:34351155 +辑 +enitor辑 +enitor-辑 +enti +entior +苹果突然上架新款ipad +mini和ai +entior- +entior-辑 +entior辑 +数码qu'yu +数码区 +讯息万变 +enitor +mini和air +yoo +av4280 +av42802299 +大蛇丸 +jing'han'q +craz +叫我黄叔好了 +守夜 +adg +ad钙 +一瓶ad钙 +dian一瓶ad钙 +丶一瓶ad钙 +zhifeijifeihen +纸飞机飞很远 +cao'xie +草鞋ji +草鞋街g +草鞋街gang'ba'z +草鞋街杠把子 +草鞋街扛把子 +起风了 +av4399 +av4399854 +av43998548 +云梦咩 +云梦咩_ +av17000 +av170001 +av45050497 +凉宫春日的 +凉宫春日的消失 +凉宫春日 +百万剪辑 +武士桑 +影凝 +小栽君 +不会打球的 +不会打球的迈克尔 +仙柚 +av1344 +仙柚儿er +av13441 +bili_27910020674 +离ya +厘丫qi +假美食 +厘丫泣 +假美食pozhu +厘 +jk鱼 +jk鱼太 +jk低鱼太 +jk低手鱼太 +av45 +av4583768 +av45837684 +av45837584 +啊吗zong +84君已经 +84君已经是条 +84君已经是条咸鱼了 +lisashe +lis +吃货xiao +sifan +思凡 +思凡mei +思凡没有 +思凡没有尾巴 +uid:2520135 +吃货ming +风暴84 +christina +christinaaaya +christinaaa呀 +ch'ri's'tina'a +christinaaa +问曰小强 +-欣小萌 +慕依 +ohemm +ou'n +欧内ji +欧内酱 +龙王 +龙王xshen +龙王x神力 +程序yua +程序媛 +我为什么 +我为什么学cs +christinaaay +败犬 +84君 +dang'er +x'h +喜欢大机机 +喰 +喰can +喰喰 +寒yin +寒影aki +寒影akina +av46955554 +av46126131 +江右之人 +小鹿乱撞 +av41959884 +av15413521 +100级 +100级大英雄的shang +100级大英雄的伤害 +白井黑子 +av470 +av4705 
+av47052837 +mu'yu +木鱼sh +木鱼s +100级d +liangfz +两分钟制作 +av30104349 +av46248434 +伍索迪奥 +寂哥嘴很笨 +uid:235701587 +uid235701587 +av13252230 +双面ou'xia +双面偶像 +囚枫 +snh4 +苏联 +苏联红军 +gnz4 +马利克 +马利克m +马利克mailk +milu +milu_uu +momoko8443 +anthonymiles +甘乐 +甘乐up +av18202765 +av45676304 +av46326045 +环华下 +只是个有meng'xiang +mi lu +zhou'xai +周小鱼na +messiahv +av39033914 +寒影ak +寒影a +在下yi'qi +在下一切 +在下一切不是神 +在下一切不是神、 +snh +gnz +7774837 +紫菜 +紫菜xixi +妈咪说 +妈咪说maki +快刀jiang +快刀酱儿 +狸猸 +凯文最游记 +小砍解说 +小砍丫 +蠢羊 +蠢羊sillysheep +youyouba +右右粑粑kevin +徐manqian +徐墁 +徐墁qian +徐墁倩 +子时当归 +玩命游 +玩命游戏 +https://www.bilibili.com/video/av43743103 +li'yuan'j +suta +小怪兽 +mr_小光头 +av47039248 +云吸毛茸茸 +av45546251 +a45546251 +!! +uid:https://zh.m.wikipedia.org/wiki/习近平5062996 +想暴富想e +想暴富想 +想暴富 +想暴富的 +想暴富的edward +av46572023 +av46885448 +风酱大魔王 +风酱大魔王a +风酱大魔 +universe +universe_文 +j- +j_ +j_un +j_un623 +bili_7232420179 +peach快装板 +peach快装 +带nao'fu +带脑 +带脑fu +带脑腐 +摸鱼放在 +设计小教室 +av23081941 +doyod +doyo +doyo大胖 +vickyspoo +vickysp +黑chi'b +黑翅 +水菌c +av45261215 +马lan +马老 +马老jun +马老均 +https://www.bilibili.com/video/av17119215 +av17119215 +生死狙击 +葫芦小小刚 +万粉up主为了忽悠粉丝,竟然连洛伦兹力都用上了!!!自制雅各布天梯 +生死狙击黑痴 +zhiminggeshit +生死狙击好处 +致命哥食堂 +生死狙击好 +致命哥 +睡觉觉 +iamyimii +av26368593 +奥地利的xia +第一次参加中国婚礼就吃到了我最喜欢的中国菜! +第一次参加中国婚礼就吃到了我最喜欢的中国菜 +野望征程 +铮铮日上 +铮铮日上kid +蓝晴婧 +av47025841 +sky灬素颜 +e'dong +鄂东l +鄂东老na +宅哥studio +宅哥 +胡看看 +胡看看瞎乐呵 +月下 +齿轮字 +齿轮字幕组 +齿轮字幕组_gearfansub +feng bu ang'' +红色口 +de di kuang +deng +疯狂e +疯狂的ji hua +疯狂的计划 +红色蜀国 +疯狂的计划tv +tests +testsg +tests官方s +tests官方频道 +约会大作战 +约会大 +麻花蓉 +xin shi jue +yin shi lu deng +yin qi +yin shi +xing qi +影视lu deng +麻花容荣蓉 +lai'me +蓝色 +丧命 +196jiu +wo'c +我才bu +我才bu'she'hui +我才不社会 +buling +bulingbulingzhan +bulingbuling张liang'liang +bulingbuling张两两 +我ca +我才 +我才不 +我才不she'hui +av45071133 +av22652193 +绿苗 +绿苗_lvmiao +虚拟睡眠 +爱夏 +baiscx +uid:175341092 +dandingdebang +淡定的ba +小熊master +av4235612 +av46636163 +zidea知点 +dandin +小白开箱 +av47024542 +av46679054 +小众美食 +jd246 +极客chun +https://www.bilibili.com/video/av46891186?spm_id_from=333.171.b_686f6d655f636f6d6d656e745f6c697374.2 +av46891186 +av46791884 +av46458471 +极客船 +ds +石楠h +石楠huaer +石楠 +kuma0109 +av44612998 +anm +anmemu +anmenu +没有更新 +没有更新没有罐 +我是熊哥 +av36384868 +哲别君 +卷毛weapon +卷毛 +铮铮 +铮铮ri +铮铮日上k'i'd +光头的gua +光头的瓜 +光头的瓜pi +光头的瓜皮 +av46677968 +av46669400 +av46909877 +罗莉安 +消失的罗莉安 +nf +伊恩 +伊恩ian +伊恩ian哥哥 +杀戮天使 +酷白 +《过春天》一段特立独行的青春往事,那些正在发生的时代记忆 +《过春天》 +酷lei'ge +酷了个 +酷lei +酷勒个白 +酷勒个白0 +ku +ku'lei'ge'v'a'u +ku'lei'ge'baui +yin's'j +小砍kankanakn +小砍砍砍 +小砍砍砍砍 +烦恼 +q烦a恼q +lfc +yu'ren +渔人a +渔人阿峰 +渔人阿fe +渔人阿feng +lfcli'wu +LIAN +用我的世界 +用我的世界格式打开 +用我的世界格式打开猫和老鼠 +av170 +柳叶哥哥 +http://www.bilibili.com/video/av30208085?share_medium=android +http://www.bilibili.com/video/av30208085?s +http://www.bilibili.com/video/av30208085 +av30208085 +av46835445 +【自制展示+抽奖】3d打印太阳钟(日晷gui?) 
+峰哥与糖果 +撩他 +聚印象 +峰哥与谈过 +撩他すき +han'xiao'mu +老王不是 +老王不是老黄 +yi'ma'ping'chuan'd +av46283677 +夹克mnnm +韩小m +uid:79574540 +六木视 +袄 +六木视觉 +av46648405 +无限光芒sx +无限光芒s +无限光芒 +wxgmssx +无限光芒ss +翎昱 +翎昱k +檀纸姬 +萝大 +萝大冒险家 +nian +鸟jie +茶御字 +nightroot +枪手视觉 +星云的guan'guan +星云的关关guan +星云的关关关老师 +米兰老司机 +米兰老六 +李哥的 +李哥的愤怒 +檀纸ji +conor +xilan +涩瑟se影si +hydc +辉夜大小姐声优 +、 +willy +willyv +燃烧冰 +燃烧冰滑板 +huo'g +自走棋 +北京燃烧冰 +北京燃烧冰滑板 +aoc +shui +av46891795 +拼多多 +【血亏 +异灵术 +异灵术老师 +铛e +铛儿 +铛儿mei +纯洁上校 +纯洁 +徐大xia +上校 +uid:26583216 +cva +cva_ +cva_s +cva_sy'lv'ia +cva_sylvia +cva_sylvia—— +cva_sylvia_ +cva_sylvia_cva +sylvia_cva +愿起xin +愿起心头 +幻m +幻mie +av46765777 +小帅哥 +小帅哥儿food +小帅哥儿fffood +av35593406 +女王降临 +崩坏3 +崩坏3第一偶像 +某科学d +某科学的ch +ep84340 +西本りみ +myo +myo西里尔 +肥羊村的shi +肥羊村的视频解说 +肥羊村 +温泉组 +av37811346 +温泉组qiandaimu +温泉组千代木 +西虹市首富要出第二部了?王多鱼妻离财散 +五分钟倒放《富首市虹西》 +【倒放】周星驰出演真·《新喜剧之王》 +化身软饭王让张柏芝养自己 +zhang'nao'wan +逆天的 +jianqi +见qi +开心ta +开心糖水pu +开心糖水铺 +紫菜xixixi +许杨玉琢最聪明 +chao'w'n +chao'w +chao'wan +潮玩e +潮玩ke +潮玩客 +ali +ali'en +alien +alienware +av32165798 +shui'yue +twot +wu'di +av45888881 +无敌翁da +无敌翁大头 +vikor +老邓 +老邓182 +君之榴芒 +fei'fei +人丑声甜土拨鼠 +在下骚王 +在下骚王~ +av45488118 +ar +ar-s +ar-sr-na +ar-sr +获取信息失败 +获取信息 +获取 +布拉布拉j +v21600814 +https://www.bilibili.com/video/av21600814 +15ye +arsrna +arsr +ars +lv。 +lv.ti +wu'dong +lv.天 +乌冬 +lv天 +ysh +影视区 +lvtian +lv天-001 +小鸡ca +小鸡chuan'z +小鸡cu'ang'z +小鸡创zuo +小鸡创作 +qq鬼额外 +qq归位 +av42918224 +beego +flutter +ji's +技术pan'zi +技术胖 +技术pan +一点也不 +一点也不中二的95 +一点也不中二的95214 +一点也不中二的9524 +熊猫妹子 +ar- +熊猫妹子在东京哟 +寂哥 +lousha +楼上的 +雁鸿 +雁鸿aimee +bili_76918356520 +v俊晖jan +俊晖jan +俊晖j +俊晖 +小xi +小xiang +小翔 +av44610475 +_杨大大黑 +coobeesb +小此 +lalisa +拉里萨 +sul +sues +企鹅大萌子 +av46946586 +张朝阳 +张朝阳在修仙 +壹戈说戏 +壹戈 +鲁卡利 +鲁卡利欧 +luka +【测试重要勿动】 +鲁卡利欧luka +av4709603 +av46490344 +g'b'b'l +高板八零 +高板八零桑 +av46807406 +詹姆士的厨房 +赤风 +赤风nk +av41349644 +【mv还原向】当天使们真正地跳起了新宝岛会发生什么 +【天使降临到我身边】全员的樱花风暴*亮度*反射 +av46639171 +uid:169369147 +amd2800 +amd2800元 +amd2800元装机 +av43895169 +yi'nü'n +muyushui +恋爱x +恋爱夏先生 +ADC +Aa +ACD +95388515 +听ba +听bai +听白xian +听白先生说 +askvincent +许捷许仙僧 +hei'ye'b +xingc +xing +星残 +星残yueying +星残月影520 +sshi +是李不是我 +是李不是 +红月ever +bili_76953247789 +bili_7695324778 +bili_76953247 +bili_76953 +神调 +av45337008 +aochangzha +aochangzhang +重庆没有 +重庆没有麻辣烫 +【敖厂长10】烂尾的游戏冒险(雅达利寻剑) +重庆没 +运营魔术师 +林林ling +林林零 +wll +tv +ba +zha lin +渣林 +零七 +喵来啦 +花花与三mao +gmh +gmh十三 +命运石之门0 +18分钟看完命运石之门 +老虾米 +裙底开光 +熊可夫的哥哥 +熊可夫的哥哥熊可 +熊可夫的哥哥熊可未 +刘老师shuo +bili_1655363729 +泠鸢yousa‘’ +泠鸢yous +tuoo +乘oo +乘火车sfi +乘火车飘 +你好啊出发吧 +xiang wang g mo +av47036447 +liz +lizhiwei +lizhiweierde +荔枝味儿的ss +荔枝味儿的shengs +荔枝味儿的shengshe +荔枝味儿的生生 +shengs +生生肥宅 +tz +av46294032 +apexyjj +uid:5223660 +积极的 +积极的慢工 +av43302806 +晗 +av40382728 +全新独立品牌 +红米redmi +note7 +发布会直播全程回顾 +999元起死磕性价比 +看喂出 +看喂出品 +小z +小z极致 +baoji +fclhdm +fclhdmz +乐歌给 +有mei +有咩 +小胡xi +小胡仙儿 +小胡 +lian +丧男 +丧男pala +丧男paladin +av46315866 +jackie +jackie不吃鱼 +https://www.biliob.com/ +骑车boy +av39077623 +jacike +一风是也 +一风 +bili_55427651473 +爱上情感j'y +爱上情感教育 +爱上情感 +http://www.bilibili.com/video/av42014883 +洛zhou +洛舟zhi +79句号 +方头人 +mangguobing +jingyit +jingyitiq +jingyi +jing +锦yi +美食台 +风火 +起司其实是 +起司其实是cheese +https://www.bilibili.com/video/av45705830 +av45705830 +洛朗 +洛朗君 +luan +eheh11 +luo'ang'j +云c +云ch +云巢 +云巢zhi +hui +云巢智慧 +卢克奶妈 +云巢z +楼上的老 +告别悲痛 +云巢智慧科技 +十爷 +厚大法考 +云巢智慧guang +云巢智慧官方 +euganie +https://www.bilibili.com/video/av46808684/ +av46808684 +情圣d +情圣都是da'bei'tou +情圣都是大背头 +偷看d +偷看粉丝hanxiao +偷看粉丝憨笑的pimo +偷看粉丝憨笑的皮某 +波e'e +波尔ka +波尔卡多 +av22821125 +【效率饥荒】当活了1000天一次会来多少只狗呢? 
+av25603342 +【效率饥荒】五百年桑田沧海 +秋名山shang'd +秋名山上的4866 +【l'y'k】 +【李云龙】 +rich_tang +我头上真的you +mao'xia +猫侠 +猫侠catman +捞 +老cu +2018nian +哈巴ai sui j +哈巴爱碎觉 +当你在 +当你在c s go zh +当你在c +46767459 +tk +偷看 +偷看粉丝憨笑的pimou +lear +塞尔达编年史 +塞尔达传说 +任天堂daluan +任天堂大乱斗 +rttwer +rtt +半支烟sa +半支烟sama +半支烟 +半支烟录 +凌霄 +niugulu +小豆子 +小豆子zyq +uid:302469604 +做一个vloger到底有多简单! +av47074051 +av46496217 +av44 +av443701 +av20705268 +av46919 +av46919850 +alallaa +alallaalalal +丘尔库夫斯基 +uid:382200983 +maoxi +猫系的兔子l +猫系的兔子lili +av47014550 +av45869049 +av45857005 +zhibailinxi +之白林夕的玻璃柜 +qin +夷陵w +情侣就真的相互了解吗? +白小寒丶 +av46823021 +小见见 +长歌与 +heluo +偷梗王克林特 +白色 +白色长夜 +偷梗王 +秋田役 +祈锁影 +adriann +adr +da'qi +大切的椰子 +二喵的饭 +宝剑sao +shala +shalabob +shaladbob +丶辰風 +av46301274 +dao +玩命解 +猪乔 +kawayii +kawayii_li +kawayii_limeow +朝花夕誓 +av47053898 +sledazgui +丘勒个丘 +sleadazhangg +dazhan +uid:54230527 +av45077909 +pexle +pexle_gmae +pexle_gma +pexle_game +pexlegame +pexlegamep +pexlegamepi +pexlegamepil +pexlegame皮可兽毛毛 +pexle_game皮可兽毛毛 +daokouer +d道口尔 +风星碎 +uid:233524745 +cg漫步印象 +360m +低头崛起 +低头崛起2014 +收藏夹没有 +收藏夹没有福利 +收藏夹没有福利的喵 +章余飞 +章余飞œ +章余 +伊世嗷 +琦德隆东镪 +av46959221 +蓅煋の寂寞 +av34336602 +会飞的芽子 +tara +renasus +ares s +aresssse +aressssera +a re s s s +a re s s s s er a +aresssera +aressser +brs +brs北 +萌萌哒小白 +萌萌哒小白白 +萌萌哒小白白qwq +uid:322384798 +那你 +xiaomaz +大白小宇挑战空气感旅拍,用简单道具拍情侣合影 +大白小宇 +绿帽君很n +绿帽君很努力呢 +san'ke +三颗 +30112 +547 +叫我a +叫我aden就好了 +av47057170 +bili_5514100406 +prof +binary +prof-binary +av46902544 +shuai'zha +帅炸的 +帅炸的小哥 +帅炸 +盘点委员 +av46566148 +alpha +alphaau +班里的h +alphaaurora- +班里的珠子 +班里的z +班里的 +alph +班里的zhu +av47017835 +av44708935 +aluren +a路 +字幕np +字幕npc +360之滑稽 +黑镖客梦回、 +飞机上 +飞机上的xiaodians +飞机上的小电视 +av47088947 +av40007 +av000 +av0002 +铮铮r +2019n +2019nian +2019年jijux +2019年jijuxingjia +2019年极具性价比的sh +2019极具性价比 +av43441238 +虚拟ciy +muta +mutachun +mutangc +木糖chun +木糖纯 +范帅 +泛帅 +bo'luo'sai'dong +菠萝赛东 +菠萝赛动 +hengn +ne'en'n'g +何能言 +何能言shi'g +何能言是个da'qi'shi +何能言是个da'zi'qi +何能言是个大zi'q +何能言是个大棋子 +mioa +av45834694 +miao_ +miao_喵 +【】 +【新游今日谈】 +慕雪之殇 +汐霖seli +汐霖 +汐霖selin +9527dhx +凡浩d +凡浩fan hao +凡浩fanhao +lin'meng +ling'me +柠檬len +柠檬lenmoe +kayexiaoqi +kaye小七ia +zao'qian +yu'za +玉藻前 +藻前 +uid;37671022 +uid:37671022 +uid:13 +uid:133547 +uid:13354765 +https://www.bilibili.com/video/av43420601 +海贼王唐吉诃德多弗朗明哥家族传,从成立到覆灭,唐吉诃德家族的那些年的经历,少主和家族成员的羁绊! +av43420601 +av47042544 +小七 +av46605317 +thinktot +thinktothings +百度语音技术永久免费的语音转字幕介绍 +thinktothing +环w +av46927229 +大数据技术wv +大数据技术分享 +土间被 +土间被活埋 +摇滚m +摇滚mzt +摇滚莫扎特 +av44389692 +小鱼d +小鱼丶jiam +小鱼丶加米 +路lan +路蓝 +eilene艾琳搬运 +entei08 +atc +atc录音 +年少呆萌的 +lhteam +592560.】. 
+李先森 +李先森t +小屁hai +仙气满满 +仙气满满的 +山海zo +eebu +ee不 +mo'yumo +摸鱼天猪m +狗君似 +狗君似条g +狗君似条狗 +av47138844 +gou'jun +狗君似tiao +nor +er'ma +二m +派大猩 +派大猩_ +此物天下j +av45788724 +不搞 +gong'kou +av42 +av4222 +av42222 +av42222679 +yuan'qi +元气-x +元气-xiao'yu +元气-小煜 +qing'xi'q +qing'qing +futurebas +futurebasketball +uturebasketball +耐高 +fut +future +futureba +futurebasketba +馋鱼的哈士奇 +柟白 +柟白blueblank +xi'gong +汐gon +汐宫 +汐宫lian +汐宫liang'zi +汐宫凉子 +av46499566 +小she +你比我 +你比我贵 +zhi'bu'zhi'de +值不值得买 +樱桃之士 +av46759891 +左ke +左坷新 +左坷 +左珂欣 +【灵能百分百】【励志】希望大家都能找到属于自己的超能力 +25_3gwy +av16 +av107 +蔡京珂 +郑州大学 +av45253184 +sh4z +kanan +kana +huz1 +huz1m +张恒1023 +ravens +ravensgo +ra +ravensgo—a +ravensgo—aiesoft +ravensgo-airsoft +白帝 +白帝帝帝帝 +av46248994 +dota +dota2_aaa +dotash +dota实验室 +https://www.bilibili.com/video/av46566033 +av46566033 +lfcliwupu +公共画室 +liuyuechao +liuyuechao2008 +go +公共huashi +电竞迟 +电竞迟小超 +十夜 +sherman +呵呵2233 +呵呵2233君 +av47045269 +zzong +z总来袭 +lkz +s'l +s'l'd't's'g +水里的碳酸钙 +花  +真f +我们自己 +真feng'wu'jiu +就“就 +av38321257 +花了 +碳酸钙 +开心又又 +开心又又666 +猪队友 +小y +1ng +1ng_meng xin +1ng_萌心 +mwng xin +萌心 +wi l l +meng x +mengguo +av47197808 +海豚or +海豚orz +海嗨嗨豚orz +haihaihai豚orz +海海海豚orz +海海海豚 +海豚 +海豚】 +海豚开箱 +xiayi +进击的香蕉alive +amaz +amazong +啊mazong +啊吗棕 +av30061394 +小号抽风 +小号cf +元老赛-利物浦传奇3-2米兰传奇 +杰拉德绝杀皮尔洛圆月弯刀 +wuzhiyaq +av449 +av44912084 +word +wordwordgu +wordwordgua +wordword瓜 +wordword +ztj0208 +jingqi +ailiaili_ +av45754176 +uid180659383 +uid:180659383 +av47045476 +wordwordg +萨沙 +萨 +vijaytsui +av46103040 +亿年 +亿年不作 +av46806299 +da'fa +大发music +av470048 +av47004866 +2009-2018年 +b榜年度男艺人 +十年来哪些男歌手拿到榜首,你猜到了吗? +圆润的苏先生 +听白先生s +一咸菊菊. +beso +哦呼 +qianxiao +老王不是l +老王 +投影 +投影消逝中 +土成 +axiyaxi +axi亚希 +axi +麦丫maiya +麦丫 +iamj +iamjem +ji'dian'qu'pai'pian +几点去拍片 +几点 +几点qu'pai'p +【几点去拍片 +【几点去拍片】 +hua'shao'be +花少be +品木去旅行 +品木 +哈栗卤鸭 +哈鸭 +卤鸭 +ganymedenil +ye'x +叶小兽不是受 +叶小兽 +喜欢拉面 +wii +av46910880 +av46614640 +p ri s +prison +prison-brea +prison-break +你好竹子v +riskyart +av47063140 +蜡烛jun +瑞泽 +av47027458 +小荣的美好 +小荣的美好生活 +极致哥 +四月是你的 +四月是 +魔王zhong'er'hao'shao +魔王中二好少年 +g'g +g'g'g +priso +prison-br +江南夜 +laofabqi +pri +空心菜zhe +空心菜折叠 +辉尔曼herm +辉尔曼 +夏色祭o f fi c +夏色祭officia +7dian'b +7点半delu'ge +7点半de鹭戈 +绎寒9479 +隔壁班的 +7点半de +uid5162357 +av45176087 +【pr】五分钟带你做一个好看的字幕效果 +《元气骑士》游侠本命武器真的强 +jing'han +【做up教程】up主一般用什么录屏软件 +牙膏yago +av46204854 +9102年 +minecr +minecraft +jx +卷心菜 +卷心菜_minecraft +av47225895 +大橙子 +大橙 +水冷tv +av46807691 +dajiuh +dajiuge +大jiu +大氿ge +开箱的 +开箱的小咸鱼 +小鱼干an +小鱼干anchovy +小鱼干ancho +av42475306 +山下第一帅 +余不帅 +飘霖colin +小土耳其 +伍言 +伍小言 +严肃吃货 +严肃吃货今天好吃吗 +pro'fe +professorm +只是有个 +只是有个m'n +只是有个梦想的qin +只是有个梦想的青瓜 +陈先生 +陈先生d +bili_8551593163 +陈先生desire +严肃吃货今天ha +pro +pro'fe's'so'r +professorme'wo +professormeo +av468 +av46854791 +av40587351 +虹桥e +虹桥二哥 +虹桥二哥哥 +还有一天就放假le +胡子盖shu'sh +胡子盖叔叔 +奇趣研究 +学长和他的 +学长和他的妹妹 +rua酱 +学长和 +rua酱i +wufu +五fu +五斧 +五斧jinkong +五斧jingkong +五斧尽kong +五斧尽空 +dayim +大yimao +大yi +大衣 +大衣猫 +大衣猫钢爪 +大衣猫の钢爪 +解说拒绝 +av47226913 +av22883255 +xiayutia +xiayutian +夏宇天 +av33015280 +绎 +绎寒 +av32619489 +laoxiami +linksp +linksphotogra +35708937‘ +FlameHazeのシャナ +兔比 +unique +un +ique +unique兔比 +普通的西瓜皮 +马虎 +马虎魔方 +x柚子 +uid:23705450 +热爱身后的 +热爱身后的洪米粒 +墓幽官喜 +勺子ku +勺子快chu +勺子快出肉 +恰恰哥 +恰恰哥qiatrick +av46182348 +勺子kuai'ch +锡兰ce +东尼o +东尼okk +gao'wan'liang +恋爱夏xian +sheng +刘q +tvtokyo +zhi guai jun +志怪君 +__三三 +三三 +庄周道长 +overi +胖锤 +胖锤减肥日记 +403898135 +01-01 +av42279875 +不会吐槽 +会吐槽 +隐会吐槽 +隐紫大人会吐槽 +骚东西 +骚东西又在 +av46646549 +排骨jia +chogn +崇明蛋ge +崇明蛋哥 +av44530387 +不是柯桑de +哲别jun +哲 +小胖hu niu +小胖果果 +小胖果果1357 +xiao pang hu +小胖果果1 +老法师 +老法师marcus +球鞋小新 +球鞋xiao zin +answer 
+answer824 +谈水tv +淡水tv +mo yang gomg de +磨洋工的eb +磨洋工的 +大欧 +ma li ao kai shi lr +大欧j +skywang +磨洋工de +鞋yin +鞋瘾 +y yan +杨lao bo +yan +comfish +heto +he'tong'xue +何同学 +崔k +崔 +崔寻 +崔手 +uid:32037172 +tatto +tatto121 +tattoo121 +za c +zackycao dan +teng yuan xian shen +藤原先生。 +藤原先生z +ni d w du +uiex +yiexn +onefun +zettar +wei jia +伟jia +无才s +daoxua +b0006 +boka +bolan +波澜ge +波澜哥 +铁刘海毓米君 +av46439294 +痕迹之星 +芦苇草的梦想 +av41550407 +x掉哈 +天台sh +天台sha +天台双雄 +天台双雄life +王zhiw +王知无 +锦灰视读 +uid:4236937 +政宗君deq'di +沙丁鱼 +沙-丁-鱼 +goodtake电视 +av47231840 +骚的 +一碗编辑部 +神奇a +du +神奇啊多 +盗月社 +av46917748 +huaxian +huaxiaqin +huaxiaq +华夏qin +华夏清yu +华夏清yuequ +华夏清乐 +av46611309 +boz +波子b +波子boo +波子booz +s渡渡淼 +s渡渡淼m +渡渡淼 +明日方舟 +ais +a愛睡觉 +a愛睡觉de呆呆獸 +愛睡觉de呆呆獸 +av47112161 +荟小hui +我的英文min +我的英文名字ji +我的英文名字叫d +我的英文名字叫dn +我的英文名字叫daniel +qi'tian'da'sh +不正经la +喔嚯嚯 +一次性kuai +一次性筷子 +我的yi +我的英文m +我的英文名字叫 +我的英文名字叫da +av45133275 +竹子 +days +days的 +days的wingy +av47001762 +冷鸟 +冷鸟dian +冷鸟· +lengleng +shuailen +帅冷冷 +帅冷冷quq +papijiangh +ren'tu +run't +xin'shi +花吃了那 +花吃了那巴德 +花吃了那巴德qnq +qian'qian +欠qian +欠欠 +清jiu欠欠 +isay +醉雪 +醉雪share +醉雪shareyoung +齐天大肾 +bi'li +碧落jiu +mucxu'k +不正经lao +齐天da'shen +齐天大肾yu'xia +齐天大肾yu'xiao's +yin'yue'ren'w +音乐人网 +无毒fang'xin +从零开始学习音乐制作(编曲) +奇思c +jieba +结 +中国风吉他 +一晚上的时间,如何给洛天依写一首4536的古风歌曲 +一晚上的时间,如何给洛天依写一首 +逃离北上广,一个乡间音乐人的故事 +格鲁特音乐 +av46357608 +神仙合唱 +格鲁特 +sdads +shens +不想练腿的 +不想练腿的古龙 +不想练腿的古龙同学 +zhig +玩家星球 +zhiguaijun +黄金瞳 +av46383337 +被子打屁 +大番茄 +zealer中国 +uid3000 +uid6000 +uid600000000 +uid60000 +uid10556 +uid105568 +uid10 +uid19655 +uid1 +uid685695125 +西里尔 +碧海潮生大闸蟹 +集才社 +uid:319469486 +赤壁产生大闸蟹 +bill +billlok666 +绥远玉麒麟 +绥远玉麒麟の日常 +不困小姐 +不困小姐在、 +不困小姐z +肤黑貌美暴躁馬 +暴躁馬 +暴躁馬、、 +明柏 +流绪微梦 +流绪微 +我叫 +圆 +吃萝卜 +吃萝卜的 +沙 +沙拉 +若谷 +xiao'a +xiao'a'z +xiao'a'zh +小a +小a'z +小阿z +gaki +一树树 +av47156227 +2分57秒,谁上谁都行,史上最弱智蝴蝶夫人,这可能是个假的boss +十夜sherman +dlear +pingqialear +平qiaolear +平翘bulear +平翘部分的lear +平翘不分的lear +油管gengfan +油管更番人蘑菇 +cymweilaihyfyg +我的宇宙 +_我的宇宙_ +xihuguixheng +西湖龟丞相 +西湖 +nicxie +nicxie_ +nicxie_vlog +nicxie_v +bili_88389792514 +lwcdmx +来了哦 +来了 +芦苇草 +比例比 +prof) +prof- +uid:169324440 +u169324440 +我在故宫修文物 +https://www.bilibili.com/bangumi/play/ep120576 +ok +举个栗子 +举个栗子o_o +我是棉被 +av7224263 +举个 +jiao'n +胶囊lv'g +胶囊旅馆 +https://www.bilibili.com/video/av45103688 +av45103688 +heyl +heyleohe +仙剑外传 +enough1234 +命运冠位指定 +av46742186 +1451983914519839 +游戏区 +uid38 +uid38911282 +igvs +igvsrng +https://www.bilibili.com/video/av47126553 +av47126553 +张鲍勃 +av38783065 +jie'mi +婕咪陪秋猫 +婕咪 +av44254465 +婕咪pei'qiu'm +【蔡康永】我鼓励大家成为冷淡的人 +