Skip to content

Commit

Permalink
refactor: 规范日志打印
Browse files Browse the repository at this point in the history
feat: B站指定视频ID爬取(bvid)
  • Loading branch information
NanmiCoder committed Dec 22, 2023
1 parent 273c9a3 commit aba9f14
Show file tree
Hide file tree
Showing 18 changed files with 147 additions and 133 deletions.
9 changes: 5 additions & 4 deletions config/base_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

# 评论关键词筛选(只会留下包含关键词的评论,为空不限制)
COMMENT_KEYWORDS = [
"我"
# "真棒"
# ........................
]

Expand All @@ -56,9 +56,10 @@
# 指定快手平台需要爬取的ID列表
KS_SPECIFIED_ID_LIST = []

# 指定B站平台需要爬取的视频ID列表
# 指定B站平台需要爬取的视频bvid列表
BILI_SPECIFIED_ID_LIST = [
"416252543",
"976148468"
"BV1d54y1g7db",
"BV1Sz4y1U77N",
"BV14Q4y1n7jz",
# ........................
]
2 changes: 1 addition & 1 deletion db.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ async def init_db(create_db: bool = False) -> None:
async def init():
await init_db(create_db=True)
await Tortoise.generate_schemas()
utils.logger.info("Init DB Success!")
utils.logger.info("[db.init] Init DB Success!")


if __name__ == '__main__':
Expand Down
26 changes: 16 additions & 10 deletions media_platform/bilibili/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# @Desc : bilibili 请求客户端
import asyncio
import json
from typing import Any, Callable, Dict, List, Optional, Tuple
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from urllib.parse import urlencode

import httpx
Expand Down Expand Up @@ -94,16 +94,16 @@ async def post(self, uri: str, data: dict) -> Dict:

async def pong(self) -> bool:
"""get a note to check if login state is ok"""
utils.logger.info("Begin pong bilibili...")
utils.logger.info("[BilibiliClient.pong] Begin pong bilibili...")
ping_flag = False
try:
check_login_uri = "/x/web-interface/nav"
response = await self.get(check_login_uri)
if response.get("isLogin"):
utils.logger.info("use cache login state get web interface successfull!")
utils.logger.info("[BilibiliClient.pong] Use cache login state get web interface successfull!")
ping_flag = True
except Exception as e:
utils.logger.error(f"Pong bilibili failed: {e}, and try to login again...")
utils.logger.error(f"[BilibiliClient.pong] Pong bilibili failed: {e}, and try to login again...")
ping_flag = False
return ping_flag

Expand Down Expand Up @@ -132,16 +132,22 @@ async def search_video_by_keyword(self, keyword: str, page: int = 1, page_size:
}
return await self.get(uri, post_data)

async def get_video_info(self, video_id: str) -> Dict:
async def get_video_info(self, aid: Union[int, None] = None, bvid: Union[str, None] = None) -> Dict:
"""
Bilibili web video detail api
:param video_id:
Bilibili web video detail api, aid 和 bvid 任选一个参数
:param aid: 稿件avid
:param bvid: 稿件bvid
:return:
"""
if not aid and not bvid:
raise ValueError("请提供 aid 或 bvid 中的至少一个参数")

uri = "/x/web-interface/view/detail"
params = {
"aid": video_id
}
params = dict()
if aid:
params.update({"aid": aid})
else:
params.update({"bvid": bvid})
return await self.get(uri, params, enable_params_sign=False)

async def get_video_comments(self,
Expand Down
43 changes: 25 additions & 18 deletions media_platform/bilibili/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import random
import time
from asyncio import Task
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple, Union

from playwright.async_api import (BrowserContext, BrowserType, Page,
async_playwright)
Expand Down Expand Up @@ -69,7 +69,7 @@ async def start(self):
if not await self.bili_client.pong():
login_obj = BilibiliLogin(
login_type=self.login_type,
login_phone="", # your phone number
login_phone="", # your phone number
browser_context=self.browser_context,
context_page=self.context_page,
cookie_str=config.COOKIES
Expand All @@ -94,10 +94,10 @@ async def search(self):
search bilibili video with keywords
:return:
"""
utils.logger.info("Begin search bilibli keywords")
utils.logger.info("[BilibiliCrawler.search] Begin search bilibli keywords")
bili_limit_count = 20 # bilibili limit page fixed value
for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"Current search keyword: {keyword}")
utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}")
page = 1
while page * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
video_id_list: List[str] = []
Expand All @@ -111,7 +111,7 @@ async def search(self):

semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_video_info_task(video_item.get("aid"), semaphore)
self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore)
for video_item in video_list
]
video_items = await asyncio.gather(*task_list)
Expand All @@ -129,7 +129,7 @@ async def batch_get_video_comments(self, video_id_list: List[str]):
:param video_id_list:
:return:
"""
utils.logger.info(f"[batch_get_video_comments] video ids:{video_id_list}")
utils.logger.info(f"[BilibiliCrawler.batch_get_video_comments] video ids:{video_id_list}")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = []
for video_id in video_id_list:
Expand All @@ -146,7 +146,7 @@ async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
"""
async with semaphore:
try:
utils.logger.info(f"[get_comments] begin get video_id: {video_id} comments ...")
utils.logger.info(f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
# Read keyword and quantity from config
keywords = config.COMMENT_KEYWORDS
max_comments = config.MAX_COMMENTS_PER_POST
Expand Down Expand Up @@ -174,9 +174,9 @@ async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
await bilibili.batch_update_bilibili_video_comments(video_id, filtered_comments)

except DataFetchError as ex:
utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}")
utils.logger.error(f"[BilibiliCrawler.get_comments] get video_id: {video_id} comment error: {ex}")
except Exception as e:
utils.logger.error(f"[get_comments] may be been blocked, err:{e}")
utils.logger.error(f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")

async def get_specified_videos(self):
"""
Expand All @@ -185,35 +185,42 @@ async def get_specified_videos(self):
"""
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_video_info_task(video_id=video_id, semaphore=semaphore) for video_id in config.BILI_SPECIFIED_ID_LIST
self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in
config.BILI_SPECIFIED_ID_LIST
]
video_details = await asyncio.gather(*task_list)
video_aids_list = []
for video_detail in video_details:
if video_detail is not None:
video_item_view: Dict = video_detail.get("View")
video_aid: str = video_item_view.get("aid")
if video_aid:
video_aids_list.append(video_aid)
await bilibili.update_bilibili_video(video_detail)
await self.batch_get_video_comments(config.BILI_SPECIFIED_ID_LIST)
await self.batch_get_video_comments(video_aids_list)

async def get_video_info_task(self, video_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
"""
Get video detail task
:param video_id:
:param aid:
:param bvid:
:param semaphore:
:return:
"""
async with semaphore:
try:
result = await self.bili_client.get_video_info(video_id)
result = await self.bili_client.get_video_info(aid=aid, bvid=bvid)
return result
except DataFetchError as ex:
utils.logger.error(f"Get video detail error: {ex}")
utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
return None
except KeyError as ex:
utils.logger.error(f"have not fund note detail video_id:{video_id}, err: {ex}")
utils.logger.error(f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}")
return None

async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:
"""Create xhs client"""
utils.logger.info("Begin create xiaohongshu API client ...")
utils.logger.info("[BilibiliCrawler.create_bilibili_client] Begin create xiaohongshu API client ...")
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
bilibili_client_obj = BilibiliClient(
proxies=httpx_proxy,
Expand Down Expand Up @@ -250,7 +257,7 @@ async def launch_browser(
headless: bool = True
) -> BrowserContext:
"""Launch browser and create browser context"""
utils.logger.info("Begin create browser context ...")
utils.logger.info("[BilibiliCrawler.launch_browser] Begin create browser context ...")
if config.SAVE_LOGIN_STATE:
# feat issue #14
# we will save login state to avoid login every time
Expand Down
16 changes: 8 additions & 8 deletions media_platform/bilibili/login.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,15 @@ def __init__(self,

async def begin(self):
"""Start login xiaohongshu"""
utils.logger.info("Begin login Bilibili ...")
utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...")
if self.login_type == "qrcode":
await self.login_by_qrcode()
elif self.login_type == "phone":
await self.login_by_mobile()
elif self.login_type == "cookie":
await self.login_by_cookies()
else:
raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookie ...")
raise ValueError("[BilibiliLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

@retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
async def check_login_state(self) -> bool:
Expand All @@ -59,7 +59,7 @@ async def check_login_state(self) -> bool:

async def login_by_qrcode(self):
"""login bilibili website and keep webdriver login state"""
utils.logger.info("Begin login bilibili by qrcode ...")
utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by qrcode ...")

# click login button
login_button_ele = self.context_page.locator(
Expand All @@ -74,29 +74,29 @@ async def login_by_qrcode(self):
selector=qrcode_img_selector
)
if not base64_qrcode_img:
utils.logger.info("login failed , have not found qrcode please check ....")
utils.logger.info("[BilibiliLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
sys.exit()

# show login qrcode
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)

utils.logger.info(f"Waiting for scan code login, remaining time is 20s")
utils.logger.info(f"[BilibiliLogin.login_by_qrcode] Waiting for scan code login, remaining time is 20s")
try:
await self.check_login_state()
except RetryError:
utils.logger.info("Login bilibili failed by qrcode login method ...")
utils.logger.info("[BilibiliLogin.login_by_qrcode] Login bilibili failed by qrcode login method ...")
sys.exit()

wait_redirect_seconds = 5
utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
utils.logger.info(f"[BilibiliLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds)

async def login_by_mobile(self):
pass

async def login_by_cookies(self):
utils.logger.info("Begin login bilibili by cookie ...")
utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by cookie ...")
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
await self.browser_context.add_cookies([{
'name': key,
Expand Down
20 changes: 10 additions & 10 deletions media_platform/douyin/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,12 @@ async def start(self) -> None:
# Get the information and comments of the specified post
await self.get_specified_awemes()

utils.logger.info("Douyin Crawler finished ...")
utils.logger.info("[DouYinCrawler.start] Douyin Crawler finished ...")

async def search(self) -> None:
utils.logger.info("Begin search douyin keywords")
utils.logger.info("[DouYinCrawler.search] Begin search douyin keywords")
for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"Current keyword: {keyword}")
utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
aweme_list: List[str] = []
dy_limit_count = 10
page = 0
Expand All @@ -89,7 +89,7 @@ async def search(self) -> None:
posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
offset=page * dy_limit_count)
except DataFetchError:
utils.logger.error(f"search douyin keyword: {keyword} failed")
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
break
page += 1
for post_item in posts_res.get("data"):
Expand All @@ -100,7 +100,7 @@ async def search(self) -> None:
continue
aweme_list.append(aweme_info.get("aweme_id", ""))
await douyin.update_douyin_aweme(aweme_item=aweme_info)
utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
await self.batch_get_note_comments(aweme_list)

async def get_specified_awemes(self):
Expand All @@ -121,10 +121,10 @@ async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) ->
try:
return await self.dy_client.get_video_by_id(aweme_id)
except DataFetchError as ex:
utils.logger.error(f"Get aweme detail error: {ex}")
utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
return None
except KeyError as ex:
utils.logger.error(f"have not fund note detail aweme_id:{aweme_id}, err: {ex}")
utils.logger.error(f"[DouYinCrawler.get_aweme_detail] have not fund note detail aweme_id:{aweme_id}, err: {ex}")
return None

async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
Expand All @@ -147,9 +147,9 @@ async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore, max_co
)
# 现在返回的 comments 已经是经过关键词筛选的
await douyin.batch_update_dy_aweme_comments(aweme_id, comments)
utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained and filtered ...")
utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
except DataFetchError as e:
utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")
utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")

@staticmethod
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
Expand Down Expand Up @@ -213,4 +213,4 @@ async def launch_browser(
async def close(self) -> None:
"""Close browser context"""
await self.browser_context.close()
utils.logger.info("Browser context closed ...")
utils.logger.info("[DouYinCrawler.close] Browser context closed ...")
Loading

0 comments on commit aba9f14

Please sign in to comment.