From aba9f14f50f9de02d7fec950ef96384ed6b3b433 Mon Sep 17 00:00:00 2001 From: Relakkes Date: Sat, 23 Dec 2023 01:04:08 +0800 Subject: [PATCH] =?UTF-8?q?refactor:=20=E8=A7=84=E8=8C=83=E6=97=A5?= =?UTF-8?q?=E5=BF=97=E6=89=93=E5=8D=B0=20feat:=20B=E7=AB=99=E6=8C=87?= =?UTF-8?q?=E5=AE=9A=E8=A7=86=E9=A2=91ID=E7=88=AC=E5=8F=96=EF=BC=88bvid?= =?UTF-8?q?=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/base_config.py | 9 ++++--- db.py | 2 +- media_platform/bilibili/client.py | 26 ++++++++++++------- media_platform/bilibili/core.py | 43 ++++++++++++++++++------------- media_platform/bilibili/login.py | 16 ++++++------ media_platform/douyin/core.py | 20 +++++++------- media_platform/douyin/login.py | 32 +++++++++++------------ media_platform/kuaishou/client.py | 4 +-- media_platform/kuaishou/core.py | 30 ++++++++++----------- media_platform/kuaishou/login.py | 16 ++++++------ media_platform/xhs/client.py | 8 +++--- media_platform/xhs/core.py | 24 ++++++++--------- media_platform/xhs/login.py | 28 ++++++++++---------- models/bilibili.py | 4 +-- models/douyin.py | 6 ++--- models/kuaishou.py | 6 ++--- models/xiaohongshu.py | 4 +-- proxy/proxy_ip_provider.py | 2 +- 18 files changed, 147 insertions(+), 133 deletions(-) diff --git a/config/base_config.py b/config/base_config.py index f7673eb..b99a979 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -34,7 +34,7 @@ # 评论关键词筛选(只会留下包含关键词的评论,为空不限制) COMMENT_KEYWORDS = [ - "我" + # "真棒" # ........................ ] @@ -56,9 +56,10 @@ # 指定快手平台需要爬取的ID列表 KS_SPECIFIED_ID_LIST = [] -# 指定B站平台需要爬取的视频ID列表 +# 指定B站平台需要爬取的视频bvid列表 BILI_SPECIFIED_ID_LIST = [ - "416252543", - "976148468" + "BV1d54y1g7db", + "BV1Sz4y1U77N", + "BV14Q4y1n7jz", # ........................ 
] diff --git a/db.py b/db.py index 2d0863d..d25e436 100644 --- a/db.py +++ b/db.py @@ -16,7 +16,7 @@ async def init_db(create_db: bool = False) -> None: async def init(): await init_db(create_db=True) await Tortoise.generate_schemas() - utils.logger.info("Init DB Success!") + utils.logger.info("[db.init] Init DB Success!") if __name__ == '__main__': diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py index 48cb058..372d26d 100644 --- a/media_platform/bilibili/client.py +++ b/media_platform/bilibili/client.py @@ -4,7 +4,7 @@ # @Desc : bilibili 请求客户端 import asyncio import json -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Union from urllib.parse import urlencode import httpx @@ -94,16 +94,16 @@ async def post(self, uri: str, data: dict) -> Dict: async def pong(self) -> bool: """get a note to check if login state is ok""" - utils.logger.info("Begin pong bilibili...") + utils.logger.info("[BilibiliClient.pong] Begin pong bilibili...") ping_flag = False try: check_login_uri = "/x/web-interface/nav" response = await self.get(check_login_uri) if response.get("isLogin"): - utils.logger.info("use cache login state get web interface successfull!") + utils.logger.info("[BilibiliClient.pong] Use cache login state get web interface successfull!") ping_flag = True except Exception as e: - utils.logger.error(f"Pong bilibili failed: {e}, and try to login again...") + utils.logger.error(f"[BilibiliClient.pong] Pong bilibili failed: {e}, and try to login again...") ping_flag = False return ping_flag @@ -132,16 +132,22 @@ async def search_video_by_keyword(self, keyword: str, page: int = 1, page_size: } return await self.get(uri, post_data) - async def get_video_info(self, video_id: str) -> Dict: + async def get_video_info(self, aid: Union[int, None] = None, bvid: Union[str, None] = None) -> Dict: """ - Bilibli web video detail api - :param video_id: + Bilibli web video 
detail api, aid 和 bvid任选一个参数 + :param aid: 稿件avid + :param bvid: 稿件bvid :return: """ + if not aid and not bvid: + raise ValueError("请提供 aid 或 bvid 中的至少一个参数") + uri = "/x/web-interface/view/detail" - params = { - "aid": video_id - } + params = dict() + if aid: + params.update({"aid": aid}) + else: + params.update({"bvid": bvid}) return await self.get(uri, params, enable_params_sign=False) async def get_video_comments(self, diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py index fbd0651..ccdc22b 100644 --- a/media_platform/bilibili/core.py +++ b/media_platform/bilibili/core.py @@ -8,7 +8,7 @@ import random import time from asyncio import Task -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from playwright.async_api import (BrowserContext, BrowserType, Page, async_playwright) @@ -69,7 +69,7 @@ async def start(self): if not await self.bili_client.pong(): login_obj = BilibiliLogin( login_type=self.login_type, - login_phone="", # your phone number + login_phone="", # your phone number browser_context=self.browser_context, context_page=self.context_page, cookie_str=config.COOKIES @@ -94,10 +94,10 @@ async def search(self): search bilibili video with keywords :return: """ - utils.logger.info("Begin search bilibli keywords") + utils.logger.info("[BilibiliCrawler.search] Begin search bilibli keywords") bili_limit_count = 20 # bilibili limit page fixed value for keyword in config.KEYWORDS.split(","): - utils.logger.info(f"Current search keyword: {keyword}") + utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}") page = 1 while page * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: video_id_list: List[str] = [] @@ -111,7 +111,7 @@ async def search(self): semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list = [ - self.get_video_info_task(video_item.get("aid"), semaphore) + self.get_video_info_task(aid=video_item.get("aid"), bvid="", 
semaphore=semaphore) for video_item in video_list ] video_items = await asyncio.gather(*task_list) @@ -129,7 +129,7 @@ async def batch_get_video_comments(self, video_id_list: List[str]): :param video_id_list: :return: """ - utils.logger.info(f"[batch_get_video_comments] video ids:{video_id_list}") + utils.logger.info(f"[BilibiliCrawler.batch_get_video_comments] video ids:{video_id_list}") semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list: List[Task] = [] for video_id in video_id_list: @@ -146,7 +146,7 @@ async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore): """ async with semaphore: try: - utils.logger.info(f"[get_comments] begin get video_id: {video_id} comments ...") + utils.logger.info(f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...") # Read keyword and quantity from config keywords = config.COMMENT_KEYWORDS max_comments = config.MAX_COMMENTS_PER_POST @@ -174,9 +174,9 @@ async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore): await bilibili.batch_update_bilibili_video_comments(video_id, filtered_comments) except DataFetchError as ex: - utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}") + utils.logger.error(f"[BilibiliCrawler.get_comments] get video_id: {video_id} comment error: {ex}") except Exception as e: - utils.logger.error(f"[get_comments] may be been blocked, err:{e}") + utils.logger.error(f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}") async def get_specified_videos(self): """ @@ -185,35 +185,42 @@ async def get_specified_videos(self): """ semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list = [ - self.get_video_info_task(video_id=video_id, semaphore=semaphore) for video_id in config.BILI_SPECIFIED_ID_LIST + self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in + config.BILI_SPECIFIED_ID_LIST ] video_details = await asyncio.gather(*task_list) + video_aids_list = [] for 
video_detail in video_details: if video_detail is not None: + video_item_view: Dict = video_detail.get("View") + video_aid: str = video_item_view.get("aid") + if video_aid: + video_aids_list.append(video_aid) await bilibili.update_bilibili_video(video_detail) - await self.batch_get_video_comments(config.BILI_SPECIFIED_ID_LIST) + await self.batch_get_video_comments(video_aids_list) - async def get_video_info_task(self, video_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]: + async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore) -> Optional[Dict]: """ Get video detail task - :param video_id: + :param aid: + :param bvid: :param semaphore: :return: """ async with semaphore: try: - result = await self.bili_client.get_video_info(video_id) + result = await self.bili_client.get_video_info(aid=aid, bvid=bvid) return result except DataFetchError as ex: - utils.logger.error(f"Get video detail error: {ex}") + utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}") return None except KeyError as ex: - utils.logger.error(f"have not fund note detail video_id:{video_id}, err: {ex}") + utils.logger.error(f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}") return None async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient: """Create xhs client""" - utils.logger.info("Begin create xiaohongshu API client ...") + utils.logger.info("[BilibiliCrawler.create_bilibili_client] Begin create xiaohongshu API client ...") cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) bilibili_client_obj = BilibiliClient( proxies=httpx_proxy, @@ -250,7 +257,7 @@ async def launch_browser( headless: bool = True ) -> BrowserContext: """Launch browser and create browser context""" - utils.logger.info("Begin create browser context ...") + utils.logger.info("[BilibiliCrawler.launch_browser] Begin create browser context ...") if 
config.SAVE_LOGIN_STATE: # feat issue #14 # we will save login state to avoid login every time diff --git a/media_platform/bilibili/login.py b/media_platform/bilibili/login.py index 9426c46..b58f218 100644 --- a/media_platform/bilibili/login.py +++ b/media_platform/bilibili/login.py @@ -34,7 +34,7 @@ def __init__(self, async def begin(self): """Start login xiaohongshu""" - utils.logger.info("Begin login Bilibili ...") + utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...") if self.login_type == "qrcode": await self.login_by_qrcode() elif self.login_type == "phone": @@ -42,7 +42,7 @@ async def begin(self): elif self.login_type == "cookie": await self.login_by_cookies() else: - raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookie ...") + raise ValueError("[BilibiliLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...") @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False)) async def check_login_state(self) -> bool: @@ -59,7 +59,7 @@ async def check_login_state(self) -> bool: async def login_by_qrcode(self): """login bilibili website and keep webdriver login state""" - utils.logger.info("Begin login bilibili by qrcode ...") + utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by qrcode ...") # click login button login_button_ele = self.context_page.locator( @@ -74,29 +74,29 @@ async def login_by_qrcode(self): selector=qrcode_img_selector ) if not base64_qrcode_img: - utils.logger.info("login failed , have not found qrcode please check ....") + utils.logger.info("[BilibiliLogin.login_by_qrcode] login failed , have not found qrcode please check ....") sys.exit() # show login qrcode partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img) asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode) - utils.logger.info(f"Waiting for scan code login, remaining time is 20s") + 
utils.logger.info(f"[BilibiliLogin.login_by_qrcode] Waiting for scan code login, remaining time is 20s") try: await self.check_login_state() except RetryError: - utils.logger.info("Login bilibili failed by qrcode login method ...") + utils.logger.info("[BilibiliLogin.login_by_qrcode] Login bilibili failed by qrcode login method ...") sys.exit() wait_redirect_seconds = 5 - utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...") + utils.logger.info(f"[BilibiliLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...") await asyncio.sleep(wait_redirect_seconds) async def login_by_mobile(self): pass async def login_by_cookies(self): - utils.logger.info("Begin login bilibili by cookie ...") + utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by cookie ...") for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items(): await self.browser_context.add_cookies([{ 'name': key, diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py index 4411780..2770bb9 100644 --- a/media_platform/douyin/core.py +++ b/media_platform/douyin/core.py @@ -75,12 +75,12 @@ async def start(self) -> None: # Get the information and comments of the specified post await self.get_specified_awemes() - utils.logger.info("Douyin Crawler finished ...") + utils.logger.info("[DouYinCrawler.start] Douyin Crawler finished ...") async def search(self) -> None: - utils.logger.info("Begin search douyin keywords") + utils.logger.info("[DouYinCrawler.search] Begin search douyin keywords") for keyword in config.KEYWORDS.split(","): - utils.logger.info(f"Current keyword: {keyword}") + utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}") aweme_list: List[str] = [] dy_limit_count = 10 page = 0 @@ -89,7 +89,7 @@ async def search(self) -> None: posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword, offset=page * dy_limit_count) except 
DataFetchError: - utils.logger.error(f"search douyin keyword: {keyword} failed") + utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed") break page += 1 for post_item in posts_res.get("data"): @@ -100,7 +100,7 @@ async def search(self) -> None: continue aweme_list.append(aweme_info.get("aweme_id", "")) await douyin.update_douyin_aweme(aweme_item=aweme_info) - utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}") + utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}") await self.batch_get_note_comments(aweme_list) async def get_specified_awemes(self): @@ -121,10 +121,10 @@ async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> try: return await self.dy_client.get_video_by_id(aweme_id) except DataFetchError as ex: - utils.logger.error(f"Get aweme detail error: {ex}") + utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}") return None except KeyError as ex: - utils.logger.error(f"have not fund note detail aweme_id:{aweme_id}, err: {ex}") + utils.logger.error(f"[DouYinCrawler.get_aweme_detail] have not fund note detail aweme_id:{aweme_id}, err: {ex}") return None async def batch_get_note_comments(self, aweme_list: List[str]) -> None: @@ -147,9 +147,9 @@ async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore, max_co ) # 现在返回的 comments 已经是经过关键词筛选的 await douyin.batch_update_dy_aweme_comments(aweme_id, comments) - utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained and filtered ...") + utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...") except DataFetchError as e: - utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}") + utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}") @staticmethod def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], 
Optional[Dict]]: @@ -213,4 +213,4 @@ async def launch_browser( async def close(self) -> None: """Close browser context""" await self.browser_context.close() - utils.logger.info("Browser context closed ...") + utils.logger.info("[DouYinCrawler.close] Browser context closed ...") diff --git a/media_platform/douyin/login.py b/media_platform/douyin/login.py index e789131..77f9b6e 100644 --- a/media_platform/douyin/login.py +++ b/media_platform/douyin/login.py @@ -47,7 +47,7 @@ async def begin(self): elif self.login_type == "cookie": await self.login_by_cookies() else: - raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookie ...") + raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...") # 如果页面重定向到滑动验证码页面,需要再次滑动滑块 await asyncio.sleep(6) @@ -56,16 +56,16 @@ async def begin(self): await self.check_page_display_slider(move_step=3, slider_level="hard") # check login state - utils.logger.info(f"login finished then check login state ...") + utils.logger.info(f"[DouYinLogin.begin] login finished then check login state ...") try: await self.check_login_state() except RetryError: - utils.logger.info("login failed please confirm ...") + utils.logger.info("[DouYinLogin.begin] login failed please confirm ...") sys.exit() # wait for redirect wait_redirect_seconds = 5 - utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...") + utils.logger.info(f"[DouYinLogin.begin] Login successful then wait for {wait_redirect_seconds} seconds redirect ...") await asyncio.sleep(wait_redirect_seconds) @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False)) @@ -84,21 +84,21 @@ async def popup_login_dialog(self): # check dialog box is auto popup and wait for 10 seconds await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 10) except Exception as e: - utils.logger.error(f"login dialog box does not pop 
up automatically, error: {e}") - utils.logger.info("login dialog box does not pop up automatically, we will manually click the login button") + utils.logger.error(f"[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}") + utils.logger.info("[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button") login_button_ele = self.context_page.locator("xpath=//p[text() = '登录']") await login_button_ele.click() await asyncio.sleep(0.5) async def login_by_qrcode(self): - utils.logger.info("Begin login douyin by qrcode...") + utils.logger.info("[DouYinLogin.login_by_qrcode] Begin login douyin by qrcode...") qrcode_img_selector = "xpath=//article[@class='web-login']//img" base64_qrcode_img = await utils.find_login_qrcode( self.context_page, selector=qrcode_img_selector ) if not base64_qrcode_img: - utils.logger.info("login qrcode not found please confirm ...") + utils.logger.info("[DouYinLogin.login_by_qrcode] login qrcode not found please confirm ...") sys.exit() # show login qrcode @@ -109,7 +109,7 @@ async def login_by_qrcode(self): await asyncio.sleep(2) async def login_by_mobile(self): - utils.logger.info("Begin login douyin by mobile ...") + utils.logger.info("[DouYinLogin.login_by_mobile] Begin login douyin by mobile ...") mobile_tap_ele = self.context_page.locator("xpath=//li[text() = '验证码登录']") await mobile_tap_ele.click() await self.context_page.wait_for_selector("xpath=//article[@class='web-login-mobile-code']") @@ -124,7 +124,7 @@ async def login_by_mobile(self): redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD) max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟 while max_get_sms_code_time > 0: - utils.logger.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...") + utils.logger.info(f"[DouYinLogin.login_by_mobile] get douyin sms code from redis remaining time {max_get_sms_code_time}s ...") await 
asyncio.sleep(1) sms_code_key = f"dy_{self.login_phone}" sms_code_value = redis_obj.get(sms_code_key) @@ -157,7 +157,7 @@ async def check_page_display_slider(self, move_step: int = 10, slider_level: str slider_verify_success = False while not slider_verify_success: if max_slider_try_times <= 0: - utils.logger.error("slider verify failed ...") + utils.logger.error("[DouYinLogin.check_page_display_slider] slider verify failed ...") sys.exit() try: await self.move_slider(back_selector, gap_selector, move_step, slider_level) @@ -166,20 +166,20 @@ async def check_page_display_slider(self, move_step: int = 10, slider_level: str # 如果滑块滑动慢了,或者验证失败了,会提示操作过慢,这里点一下刷新按钮 page_content = await self.context_page.content() if "操作过慢" in page_content or "提示重新操作" in page_content: - utils.logger.info("slider verify failed, retry ...") + utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify failed, retry ...") await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]") continue # 滑动成功后,等待滑块消失 await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000) # 如果滑块消失了,说明验证成功了,跳出循环,如果没有消失,说明验证失败了,上面这一行代码会抛出异常被捕获后继续循环滑动验证码 - utils.logger.info("slider verify success ...") + utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify success ...") slider_verify_success = True except Exception as e: - utils.logger.error(f"slider verify failed, error: {e}") + utils.logger.error(f"[DouYinLogin.check_page_display_slider] slider verify failed, error: {e}") await asyncio.sleep(1) max_slider_try_times -= 1 - utils.logger.info(f"remaining slider try times: {max_slider_try_times}") + utils.logger.info(f"[DouYinLogin.check_page_display_slider] remaining slider try times: {max_slider_try_times}") continue async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"): @@ -236,7 +236,7 @@ async def move_slider(self, back_selector: str, gap_selector: str, move_step: in 
await self.context_page.mouse.up() async def login_by_cookies(self): - utils.logger.info("Begin login douyin by cookie ...") + utils.logger.info("[DouYinLogin.login_by_cookies] Begin login douyin by cookie ...") for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items(): await self.browser_context.add_cookies([{ 'name': key, diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py index 76456b1..20b30fb 100644 --- a/media_platform/kuaishou/client.py +++ b/media_platform/kuaishou/client.py @@ -59,12 +59,12 @@ async def post(self, uri: str, data: dict) -> Dict: @staticmethod async def pong() -> bool: """get a note to check if login state is ok""" - utils.logger.info("Begin pong kuaishou...") + utils.logger.info("[KuaiShouClient.pong] Begin pong kuaishou...") ping_flag = False try: pass except Exception as e: - utils.logger.error(f"Pong kuaishou failed: {e}, and try to login again...") + utils.logger.error(f"[KuaiShouClient.pong] Pong kuaishou failed: {e}, and try to login again...") ping_flag = False return ping_flag diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py index b1ed310..d5cc928 100644 --- a/media_platform/kuaishou/core.py +++ b/media_platform/kuaishou/core.py @@ -81,13 +81,13 @@ async def start(self): else: pass - utils.logger.info("Kuaishou Crawler finished ...") + utils.logger.info("[KuaishouCrawler.start] Kuaishou Crawler finished ...") async def search(self): - utils.logger.info("Begin search kuaishou keywords") + utils.logger.info("[KuaishouCrawler.search] Begin search kuaishou keywords") ks_limit_count = 20 # kuaishou limit page fixed value for keyword in config.KEYWORDS.split(","): - utils.logger.info(f"Current search keyword: {keyword}") + utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}") page = 1 while page * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: video_id_list: List[str] = [] @@ -96,12 +96,12 @@ async def search(self): 
pcursor=str(page), ) if not videos_res: - utils.logger.error(f"search info by keyword:{keyword} not found data") + utils.logger.error(f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data") continue vision_search_photo: Dict = videos_res.get("visionSearchPhoto") if vision_search_photo.get("result") != 1: - utils.logger.error(f"search info by keyword:{keyword} not found data ") + utils.logger.error(f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data ") continue for video_detail in vision_search_photo.get("feeds"): @@ -129,13 +129,13 @@ async def get_video_info_task(self, video_id: str, semaphore: asyncio.Semaphore) async with semaphore: try: result = await self.ks_client.get_video_info(video_id) - utils.logger.info(f"Get video_id:{video_id} info result: {result} ...") + utils.logger.info(f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ...") return result.get("visionVideoDetail") except DataFetchError as ex: - utils.logger.error(f"Get video detail error: {ex}") + utils.logger.error(f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}") return None except KeyError as ex: - utils.logger.error(f"have not fund note detail video_id:{video_id}, err: {ex}") + utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund note detail video_id:{video_id}, err: {ex}") return None async def batch_get_video_comments(self, video_id_list: List[str]): @@ -144,7 +144,7 @@ async def batch_get_video_comments(self, video_id_list: List[str]): :param video_id_list: :return: """ - utils.logger.info(f"[batch_get_video_comments] video ids:{video_id_list}") + utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}") semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list: List[Task] = [] for video_id in video_id_list: @@ -163,16 +163,16 @@ async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore): """ async with semaphore: 
try: - utils.logger.info(f"[get_comments] bengin get video_id: {video_id} comments ...") + utils.logger.info(f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ...") await self.ks_client.get_video_all_comments( photo_id=video_id, crawl_interval=random.random(), callback=kuaishou.batch_update_ks_video_comments ) except DataFetchError as ex: - utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}") + utils.logger.error(f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}") except Exception as e: - utils.logger.error(f"[get_comments] may be been blocked, err:{e}") + utils.logger.error(f"[KuaishouCrawler.get_comments] may be been blocked, err:{e}") # use time.sleeep block main coroutine instead of asyncio.sleep and cacel running comment task # maybe kuaishou block our request, we will take a nap and update the cookie again current_running_tasks = comment_tasks_var.get() @@ -197,7 +197,7 @@ def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optio async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient: """Create xhs client""" - utils.logger.info("Begin create kuaishou API client ...") + utils.logger.info("[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ...") cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) xhs_client_obj = KuaiShouClient( proxies=httpx_proxy, @@ -221,7 +221,7 @@ async def launch_browser( headless: bool = True ) -> BrowserContext: """Launch browser and create browser context""" - utils.logger.info("Begin create browser context ...") + utils.logger.info("[KuaishouCrawler.launch_browser] Begin create browser context ...") if config.SAVE_LOGIN_STATE: user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % self.platform) # type: ignore @@ -245,4 +245,4 @@ async def launch_browser( async def close(self): """Close browser context""" await 
self.browser_context.close() - utils.logger.info("Browser context closed ...") + utils.logger.info("[KuaishouCrawler.close] Browser context closed ...") diff --git a/media_platform/kuaishou/login.py b/media_platform/kuaishou/login.py index a7508ca..001d7b0 100644 --- a/media_platform/kuaishou/login.py +++ b/media_platform/kuaishou/login.py @@ -29,7 +29,7 @@ def __init__(self, async def begin(self): """Start login xiaohongshu""" - utils.logger.info("Begin login kuaishou ...") + utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...") if self.login_type == "qrcode": await self.login_by_qrcode() elif self.login_type == "phone": @@ -37,7 +37,7 @@ async def begin(self): elif self.login_type == "cookie": await self.login_by_cookies() else: - raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookie ...") + raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...") @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False)) async def check_login_state(self) -> bool: @@ -55,7 +55,7 @@ async def check_login_state(self) -> bool: async def login_by_qrcode(self): """login kuaishou website and keep webdriver login state""" - utils.logger.info("Begin login kuaishou by qrcode ...") + utils.logger.info("[KuaishouLogin.login_by_qrcode] Begin login kuaishou by qrcode ...") # click login button login_button_ele = self.context_page.locator( @@ -70,7 +70,7 @@ async def login_by_qrcode(self): selector=qrcode_img_selector ) if not base64_qrcode_img: - utils.logger.info("login failed , have not found qrcode please check ....") + utils.logger.info("[KuaishouLogin.login_by_qrcode] login failed , have not found qrcode please check ....") sys.exit() @@ -78,22 +78,22 @@ async def login_by_qrcode(self): partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img) asyncio.get_running_loop().run_in_executor(executor=None, 
func=partial_show_qrcode) - utils.logger.info(f"waiting for scan code login, remaining time is 20s") + utils.logger.info(f"[KuaishouLogin.login_by_qrcode] waiting for scan code login, remaining time is 20s") try: await self.check_login_state() except RetryError: - utils.logger.info("Login kuaishou failed by qrcode login method ...") + utils.logger.info("[KuaishouLogin.login_by_qrcode] Login kuaishou failed by qrcode login method ...") sys.exit() wait_redirect_seconds = 5 - utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...") + utils.logger.info(f"[KuaishouLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...") await asyncio.sleep(wait_redirect_seconds) async def login_by_mobile(self): pass async def login_by_cookies(self): - utils.logger.info("Begin login kuaishou by cookie ...") + utils.logger.info("[KuaishouLogin.login_by_cookies] Begin login kuaishou by cookie ...") for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items(): await self.browser_context.add_cookies([{ 'name': key, diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index 1d96986..67d991b 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -83,14 +83,14 @@ async def post(self, uri: str, data: dict) -> Dict: async def pong(self) -> bool: """get a note to check if login state is ok""" - utils.logger.info("Begin to pong xhs...") + utils.logger.info("[XHSClient.pong] Begin to pong xhs...") ping_flag = False try: note_card: Dict = await self.get_note_by_keyword(keyword="小红书") if note_card.get("items"): ping_flag = True except Exception as e: - utils.logger.error(f"Ping xhs failed: {e}, and try to login again...") + utils.logger.error(f"[XHSClient.pong] Ping xhs failed: {e}, and try to login again...") ping_flag = False return ping_flag @@ -136,7 +136,7 @@ async def get_note_by_id(self, note_id: str) -> Dict: if res and res.get("items"): res_dict: 
Dict = res["items"][0]["note_card"] return res_dict - utils.logger.error(f"[xhs.client.get_note_by_id] get note empty and res:{res}") + utils.logger.error(f"[XHSClient.get_note_by_id] get note empty and res:{res}") return dict() async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict: @@ -195,7 +195,7 @@ async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, # Handle the absence of 'comments' key appropriately # For example, log an error message, break from the loop, etc. # This is just an example: - print(f"No 'comments' key found in response: {comments_res}") + utils.logger.info(f"[XHSClient.get_note_all_comments] No 'comments' key found in response: {comments_res}") break comments = comments_res["comments"] if not is_fetch_sub_comments: diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index a33c929..a734a62 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -87,14 +87,14 @@ async def start(self) -> None: else: pass - utils.logger.info("Xhs Crawler finished ...") + utils.logger.info("[XiaoHongShuCrawler.start] Xhs Crawler finished ...") async def search(self) -> None: """Search for notes and retrieve their comment information.""" - utils.logger.info("Begin search xiaohongshu keywords") + utils.logger.info("[XiaoHongShuCrawler.search] Begin search xiaohongshu keywords") xhs_limit_count = 20 # xhs limit page fixed value for keyword in config.KEYWORDS.split(","): - utils.logger.info(f"Current search keyword: {keyword}") + utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}") page = 1 while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: note_id_list: List[str] = [] @@ -102,7 +102,7 @@ async def search(self) -> None: keyword=keyword, page=page, ) - utils.logger.info(f"Search notes res:{notes_res}") + utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}") semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) 
task_list = [ self.get_note_detail(post_item.get("id"), semaphore) @@ -115,7 +115,7 @@ async def search(self) -> None: await xhs_model.update_xhs_note(note_detail) note_id_list.append(note_detail.get("note_id")) page += 1 - utils.logger.info(f"Note details: {note_details}") + utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}") await self.batch_get_note_comments(note_id_list) async def get_specified_notes(self): @@ -136,15 +136,15 @@ async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> O try: return await self.xhs_client.get_note_by_id(note_id) except DataFetchError as ex: - utils.logger.error(f"Get note detail error: {ex}") + utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] Get note detail error: {ex}") return None except KeyError as ex: - utils.logger.error(f"have not fund note detail note_id:{note_id}, err: {ex}") + utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}") return None async def batch_get_note_comments(self, note_list: List[str]): """Batch get note comments""" - utils.logger.info(f"Begin batch get note comments, note list: {note_list}") + utils.logger.info(f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}") semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list: List[Task] = [] for note_id in note_list: @@ -155,7 +155,7 @@ async def batch_get_note_comments(self, note_list: List[str]): async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore): """Get note comments with keyword filtering and quantity limitation""" async with semaphore: - utils.logger.info(f"Begin get note id comments {note_id}") + utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}") all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random()) # 从配置文件中读取关键词和数量限制 @@ -191,7 +191,7 @@ def 
format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optio async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XHSClient: """Create xhs client""" - utils.logger.info("Begin create xiaohongshu API client ...") + utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...") cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) xhs_client_obj = XHSClient( proxies=httpx_proxy, @@ -215,7 +215,7 @@ async def launch_browser( headless: bool = True ) -> BrowserContext: """Launch browser and create browser context""" - utils.logger.info("Begin create browser context ...") + utils.logger.info("[XiaoHongShuCrawler.launch_browser] Begin create browser context ...") if config.SAVE_LOGIN_STATE: # feat issue #14 # we will save login state to avoid login every time @@ -241,4 +241,4 @@ async def launch_browser( async def close(self): """Close browser context""" await self.browser_context.close() - utils.logger.info("Browser context closed ...") + utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...") diff --git a/media_platform/xhs/login.py b/media_platform/xhs/login.py index 6f656ad..cba1219 100644 --- a/media_platform/xhs/login.py +++ b/media_platform/xhs/login.py @@ -37,7 +37,7 @@ async def check_login_state(self, no_logged_in_session: str) -> bool: """ if "请通过验证" in await self.context_page.content(): - utils.logger.info("登录过程中出现验证码,请手动验证") + utils.logger.info("[XHSLogin.check_login_state] 登录过程中出现验证码,请手动验证") current_cookie = await self.browser_context.cookies() _, cookie_dict = utils.convert_cookies(current_cookie) @@ -48,7 +48,7 @@ async def check_login_state(self, no_logged_in_session: str) -> bool: async def begin(self): """Start login xiaohongshu""" - utils.logger.info("Begin login xiaohongshu ...") + utils.logger.info("[XHSLogin.begin] Begin login xiaohongshu ...") if self.login_type == "qrcode": await self.login_by_qrcode() elif self.login_type == 
"phone": @@ -56,11 +56,11 @@ async def begin(self): elif self.login_type == "cookie": await self.login_by_cookies() else: - raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookies ...") + raise ValueError("[XHSLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookies ...") async def login_by_mobile(self): """Login xiaohongshu by mobile""" - utils.logger.info("Begin login xiaohongshu by mobile ...") + utils.logger.info("[XHSLogin.login_by_mobile] Begin login xiaohongshu by mobile ...") await asyncio.sleep(1) try: # 小红书进入首页后,有可能不会自动弹出登录框,需要手动点击登录按钮 @@ -77,7 +77,7 @@ async def login_by_mobile(self): ) await element.click() except Exception as e: - utils.logger.info("have not found mobile button icon and keep going ...") + utils.logger.info("[XHSLogin.login_by_mobile] have not found mobile button icon and keep going ...") await asyncio.sleep(1) login_container_ele = await self.context_page.wait_for_selector("div.login-container") @@ -93,7 +93,7 @@ async def login_by_mobile(self): max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟 no_logged_in_session = "" while max_get_sms_code_time > 0: - utils.logger.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...") + utils.logger.info(f"[XHSLogin.login_by_mobile] get sms code from redis remaining time {max_get_sms_code_time}s ...") await asyncio.sleep(1) sms_code_key = f"xhs_{self.login_phone}" sms_code_value = redis_obj.get(sms_code_key) @@ -119,16 +119,16 @@ async def login_by_mobile(self): try: await self.check_login_state(no_logged_in_session) except RetryError: - utils.logger.info("Login xiaohongshu failed by mobile login method ...") + utils.logger.info("[XHSLogin.login_by_mobile] Login xiaohongshu failed by mobile login method ...") sys.exit() wait_redirect_seconds = 5 - utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...") + utils.logger.info(f"[XHSLogin.login_by_mobile] Login successful then wait 
for {wait_redirect_seconds} seconds redirect ...") await asyncio.sleep(wait_redirect_seconds) async def login_by_qrcode(self): """login xiaohongshu website and keep webdriver login state""" - utils.logger.info("Begin login xiaohongshu by qrcode ...") + utils.logger.info("[XHSLogin.login_by_qrcode] Begin login xiaohongshu by qrcode ...") # login_selector = "div.login-container > div.left > div.qrcode > img" qrcode_img_selector = "xpath=//img[@class='qrcode-img']" # find login qrcode @@ -137,7 +137,7 @@ async def login_by_qrcode(self): selector=qrcode_img_selector ) if not base64_qrcode_img: - utils.logger.info("login failed , have not found qrcode please check ....") + utils.logger.info("[XHSLogin.login_by_qrcode] login failed , have not found qrcode please check ....") # if this website does not automatically popup login dialog box, we will manual click login button await asyncio.sleep(0.5) login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button") @@ -161,20 +161,20 @@ async def login_by_qrcode(self): partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img) asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode) - utils.logger.info(f"waiting for scan code login, remaining time is 120s") + utils.logger.info(f"[XHSLogin.login_by_qrcode] waiting for scan code login, remaining time is 120s") try: await self.check_login_state(no_logged_in_session) except RetryError: - utils.logger.info("Login xiaohongshu failed by qrcode login method ...") + utils.logger.info("[XHSLogin.login_by_qrcode] Login xiaohongshu failed by qrcode login method ...") sys.exit() wait_redirect_seconds = 5 - utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...") + utils.logger.info(f"[XHSLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...") await asyncio.sleep(wait_redirect_seconds) async def login_by_cookies(self): 
"""login xiaohongshu website by cookies""" - utils.logger.info("Begin login xiaohongshu by cookie ...") + utils.logger.info("[XHSLogin.login_by_cookies] Begin login xiaohongshu by cookie ...") for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items(): if key != "web_session": # only set web_session cookie attr continue diff --git a/models/bilibili.py b/models/bilibili.py index 15cfafb..6ca3e71 100644 --- a/models/bilibili.py +++ b/models/bilibili.py @@ -85,7 +85,7 @@ async def update_bilibili_video(video_item: Dict): "video_url": f"https://www.bilibili.com/video/av{video_id}", "video_cover_url": video_item_view.get("pic", ""), } - utils.logger.info(f"bilibili video id:{video_id}, title:{local_db_item.get('title')}") + utils.logger.info(f"[models.bilibili.update_bilibili_video] bilibili video id:{video_id}, title:{local_db_item.get('title')}") if config.IS_SAVED_DATABASED: if not await BilibiliVideo.filter(video_id=video_id).exists(): local_db_item["add_ts"] = utils.get_current_timestamp() @@ -131,7 +131,7 @@ async def update_bilibili_video_comment(video_id: str, comment_item: Dict): "sub_comment_count": str(comment_item.get("rcount", 0)), "last_modify_ts": utils.get_current_timestamp(), } - utils.logger.info(f"Bilibili video comment: {comment_id}, content: {local_db_item.get('content')}") + utils.logger.info(f"[models.bilibili.update_bilibili_video_comment] Bilibili video comment: {comment_id}, content: {local_db_item.get('content')}") if config.IS_SAVED_DATABASED: if not await BilibiliComment.filter(comment_id=comment_id).exists(): local_db_item["add_ts"] = utils.get_current_timestamp() diff --git a/models/douyin.py b/models/douyin.py index fe94bd4..a65d7a1 100644 --- a/models/douyin.py +++ b/models/douyin.py @@ -88,7 +88,7 @@ async def update_douyin_aweme(aweme_item: Dict): "last_modify_ts": utils.get_current_timestamp(), "aweme_url": f"https://www.douyin.com/video/{aweme_id}" } - print(f"douyin aweme id:{aweme_id}, 
title:{local_db_item.get('title')}") + utils.logger.info(f"[models.douyin.update_douyin_aweme] douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}") if config.IS_SAVED_DATABASED: if not await DouyinAweme.filter(aweme_id=aweme_id).exists(): local_db_item["add_ts"] = utils.get_current_timestamp() @@ -123,7 +123,7 @@ async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]): async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict): comment_aweme_id = comment_item.get("aweme_id") if aweme_id != comment_aweme_id: - print(f"comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}") + utils.logger.error(f"[models.douyin.update_dy_aweme_comment] comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}") return user_info = comment_item.get("user", {}) comment_id = comment_item.get("cid") @@ -145,7 +145,7 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict): "sub_comment_count": str(comment_item.get("reply_comment_total", 0)), "last_modify_ts": utils.get_current_timestamp(), } - print(f"douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}") + utils.logger.info(f"[models.douyin.update_dy_aweme_comment] douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}") if config.IS_SAVED_DATABASED: if not await DouyinAwemeComment.filter(comment_id=comment_id).exists(): local_db_item["add_ts"] = utils.get_current_timestamp() diff --git a/models/kuaishou.py b/models/kuaishou.py index e2edefe..823d670 100644 --- a/models/kuaishou.py +++ b/models/kuaishou.py @@ -80,7 +80,7 @@ async def update_kuaishou_video(video_item: Dict): "video_cover_url": photo_info.get("coverUrl", ""), "video_play_url": photo_info.get("photoUrl", ""), } - print(f"Kuaishou video id:{video_id}, title:{local_db_item.get('title')}") + utils.logger.info(f"[models.kuaishou.update_kuaishou_video] Kuaishou video id:{video_id}, title:{local_db_item.get('title')}") if config.IS_SAVED_DATABASED: if not await 
KuaishouVideo.filter(video_id=video_id).exists(): local_db_item["add_ts"] = utils.get_current_timestamp() @@ -106,7 +106,7 @@ async def update_kuaishou_video(video_item: Dict): async def batch_update_ks_video_comments(video_id: str, comments: List[Dict]): - utils.logger.info(f"[batch_update_ks_video_comments] video_id:{video_id}, comments:{comments}") + utils.logger.info(f"[models.kuaishou.batch_update_ks_video_comments] video_id:{video_id}, comments:{comments}") if not comments: return for comment_item in comments: @@ -126,7 +126,7 @@ async def update_ks_video_comment(video_id: str, comment_item: Dict): "sub_comment_count": str(comment_item.get("subCommentCount", 0)), "last_modify_ts": utils.get_current_timestamp(), } - print(f"Kuaishou video comment: {comment_id}, content: {local_db_item.get('content')}") + utils.logger.info(f"[models.kuaishou.update_ks_video_comment] Kuaishou video comment: {comment_id}, content: {local_db_item.get('content')}") if config.IS_SAVED_DATABASED: if not await 
utils.logger.info(f"[models.xiaohongshu.update_xhs_note_comment] xhs note comment:{local_db_item}") if config.IS_SAVED_DATABASED: if not await XHSNoteComment.filter(comment_id=comment_id).first(): local_db_item["add_ts"] = utils.get_current_timestamp() diff --git a/proxy/proxy_ip_provider.py b/proxy/proxy_ip_provider.py index 77318b0..822afb9 100644 --- a/proxy/proxy_ip_provider.py +++ b/proxy/proxy_ip_provider.py @@ -115,7 +115,7 @@ async def get_proxies(self, num: int) -> List[IpInfoModel]: ip_infos = [] async with httpx.AsyncClient() as client: url = self.api_path + "/fetchips" + '?' + urlencode(self.params) - utils.logger.info(f"[JiSuHttpProxy] get ip proxy url:{url}") + utils.logger.info(f"[JiSuHttpProxy.get_proxies] get ip proxy url:{url}") response = await client.get(url, headers={ "User-Agent": "MediaCrawler https://github.com/NanmiCoder/MediaCrawler"}) res_dict: Dict = response.json()