From a1f554592ae98b0be77533dc4b088f22fe32cd47 Mon Sep 17 00:00:00 2001
From: puppylpg
Date: Sun, 10 Jan 2021 02:22:20 +0800
Subject: [PATCH] Use aiohttp-socks to request steam via SOCKS. (#66)

---
 CHANGELOG.md                       |  3 +++
 README.md                          | 16 +++++++++++++++-
 config/config.ini                  | 16 ++++++++--------
 requirements.txt                   |  3 +++
 src/crawl/history_price_crawler.py | 12 +++++++++++-
 src/crawl/item_crawler.py          |  9 ++++-----
 src/util/requester.py              | 14 +++++++-------
 src/util/timer.py                  |  4 ++--
 8 files changed, 53 insertions(+), 24 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e7da537..2d1be66 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -133,3 +133,6 @@
 * bugfix
     - Should divide the total count by 80, not the total page count...
+## 4.0.0 (2021-01-10)
+* Features
+    - Use aio to fetch data from steam
diff --git a/README.md b/README.md
index f3ccb16..acb3b46 100644
--- a/README.md
+++ b/README.md
@@ -42,6 +42,14 @@ If there is no data available, crawl from the website, then analyse data from lo
 ## Video tutorial
 The most straightforward way is to follow the video step by step: [oddish tutorial for complete beginners](https://www.bilibili.com/video/BV1ET4y1w7e1/)
+## v4.0.0 async crawling
+Since v4.0.0, data is fetched from steam with python aio (async IO), which is a huge speedup: a run now usually finishes within a minute.
+Accordingly, a few more aio-related dependencies have to be installed.
+
+**If you really cannot get it working, consider downloading a release older than v4.0.0. It is slightly simpler to use; the drawback is that it is very slow. Compared with v4.0.0, the difference is roughly one minute versus one hour.**
+
+Still, I recommend trying a v4.0.0+ release first, since the speedup is anything but trivial.
+
 ## wiki
 There are also two articles on how oddish was initially built and optimized, if you want to read them:
 - https://puppylpg.github.io/2019/12/02/python-crawler-buff/
@@ -94,11 +102,17 @@ steam_cookie = timezoneOffset=28800,0; steamMachineAuth76561198093333055=649A9B5
 If you don't care about the crawling process, just check the analysis results.
 ## Dependencies
-If you can only install packages manually, install the following:
+If you can only install packages manually via Anaconda, install the following:
 - python: 3.8.5
 - pandas: 1.1.0
 - numpy: 1.19.1
 - requests: 2.24.0
+- aiohttp: 3.7.3
+- aiofiles: 0.6.0
+- aiohttp-socks: 0.5.5 (install with pip)
+
+Since aiohttp-socks has not been ported to Anaconda, it can only be installed with pip. Click the Launch button under 'CMD.exe Prompt' on the Anaconda home screen;
+this opens a command line, where running `pip install aiohttp-socks` installs it.
 If you know pip, just install everything with:
 > pip install -r requirements.txt
diff --git a/config/config.ini b/config/config.ini
index ad2a96d..f5bd4a7 100644
--- a/config/config.ini
+++ b/config/config.ini
@@ -13,11 +13,11 @@ proxy = socks5://127.0.0.1:10808
 # Controls program behavior
 [BEHAVIOR]
 # WARNING: since crawling buff too much now puts your account in a cooldown for a while, make this larger!!!
-# After increasing the crawl interval, crawling gets very slow; use category_white_list/category_black_list below to narrow down the target items
-# Lower bound of the crawl interval: 4s
-frequency_interval_low=4
-# Upper bound of the crawl interval: 8s, i.e. crawl once every 4-8s
-frequency_interval_high=8
+# Only a little information is fetched from buff now, so a larger crawl interval barely affects the overall crawl speed.
+# Lower bound of the crawl interval: 10s
+frequency_interval_low=10
+# Upper bound of the crawl interval: 15s, i.e. crawl once every 10-15s
+frequency_interval_high=15
 # Interval in hours before re-crawling an already cached file
 url_cache_hour = 6
 # Crawl data regardless of the cache
@@ -35,9 +35,9 @@ steam_sell_tax = 0.15
 # Filtering behavior
 [FILTER]
 # Minimum price of items to crawl; cheaper items are ignored
-crawl_min_price_item = 150
+crawl_min_price_item = 100
 # Maximum price of items to crawl
-crawl_max_price_item = 160
+crawl_max_price_item = 300
 # Number sold in the last 7 days; below this threshold an item is considered unpopular and ignored
 min_sold_threshold = 70
 # Category whitelist. E.g. to crawl only AK and M4 (A1 & A4), set it to: ["weapon_ak47", "weapon_m4a1", "weapon_m4a1_silencer"]
@@ … @@ category_black_list = ["*sticker*", "*knife*", "csgo_type_tool", "csgo_type_spra
 # Result settings
 [RESULT]
 # Show at most this many entries for each suggestion
-top_n = 50
+top_n = 20
diff --git a/requirements.txt b/requirements.txt
index 4711ccb..af2c3df 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,6 @@
+aiofiles==0.6.0
+aiohttp==3.7.3
+aiohttp-socks==0.5.5
 certifi==2020.6.20
 cffi==1.14.3
 chardet==3.0.4
diff --git a/src/crawl/history_price_crawler.py b/src/crawl/history_price_crawler.py
index 43fb455..6d81032 100644
--- a/src/crawl/history_price_crawler.py
+++ b/src/crawl/history_price_crawler.py
@@ -4,6 +4,10 @@ from datetime import datetime
 import aiohttp
+# import aiosocks
+# from aiosocks.connector import ProxyConnector
+from aiohttp_socks import ProxyConnector
+from src.config.definitions import PROXY
 from src.config.urls import steam_price_history_url
 from src.util.logger import log
@@ -45,7 +49,13 @@ async def async_crawl_history_price(csgo_items):
     log.info('Total {} items to get history price.'.format(total_price_number))
 
     tasks = []
-    async with aiohttp.ClientSession(cookies=steam_cookies, headers=get_headers(), connector = aiohttp.TCPConnector(limit=5)) as session:
+
+    if PROXY:
+        # route all requests through the SOCKS proxy
+        connector = ProxyConnector.from_url(PROXY, limit=5)
+    else:
+        connector = aiohttp.TCPConnector(limit=5)
+    async with aiohttp.ClientSession(cookies=steam_cookies, headers=get_headers(), connector=connector) as session:
         for index, item in enumerate(csgo_items, start=1):
             try:
                 tasks.append(
diff --git a/src/crawl/item_crawler.py b/src/crawl/item_crawler.py
index d34e7cf..44294ab 100644
--- a/src/crawl/item_crawler.py
+++ b/src/crawl/item_crawler.py
@@ -56,12 +56,11 @@ def csgo_all_categories():
     return categories
 
 
-def enrich_item_with_price_history(csgo_items, crawl_steam_async = True):
+def enrich_item_with_price_history(csgo_items, crawl_steam_async=True):
     # crawl price for all items
-    if crawl_steam_async == True:
-        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
-        asyncio.run(
-            history_price_crawler.async_crawl_history_price(csgo_items))
+    if crawl_steam_async:
+        # asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+        asyncio.run(history_price_crawler.async_crawl_history_price(csgo_items))
     else:
         history_price_crawler.crawl_history_price(csgo_items)
     return csgo_items
diff --git a/src/util/requester.py b/src/util/requester.py
index 0d28c26..da67357 100644
--- a/src/util/requester.py
+++ b/src/util/requester.py
@@ -98,14 +98,9 @@ async def async_get_json_dict_raw(url, cookies, session: ClientSession, proxy =
     try:
-        if proxy and proxies != {}:
-            async with session.get(url, proxy=proxies["http"]) as resp:
-                return await resp.text()
-        # return requests.get(url, headers=get_headers(), cookies=cookies, timeout=5, proxies=proxies).text
         async with session.get(url) as resp:
             return await resp.text()
         # return requests.get(url, headers=get_headers(), cookies=cookies, timeout=5).text
-
     except Timeout:
         log.warn("Timeout for {}. Try again.".format(url))
     except Exception as e:
@@ -115,14 +110,19 @@ async def async_get_json_dict_raw(url, cookies, session: ClientSession, proxy =
     # Sleep asynchronously on the first error; on the second error, all tasks sleep.
     await timer.async_sleep_awhile()
     if times == 2:
+        log.error('aio http error happened 2 times, falling back to sync wait')
         timer.sleep_awhile()
     data = await async_get_json_dict_raw(url, cookies, session, proxy, times + 1)
     return data
 
 
 async def async_get_json_dict(url, cookies, session, proxy = False, times = 1):
-    if await asyncexist(url):
-        return json.loads(await asyncfetch(url))
+    # TODO: after part of the data is served from the cache, aiohttp_socks requests die with "RuntimeError: Session is closed"; requesting steam from scratch works fine
+    # TODO: the problem above also makes the final table all NaN
+    # TODO: to reproduce: run once with an empty cache, then randomly delete one entry from the cache and run again
+    # TODO: given the two problems above, fetching from the local cache is disabled for now; aio is fast anyway, so the cache is hardly needed
+    # if await asyncexist(url):
+    #     return json.loads(await asyncfetch(url))
 
     json_data = await async_get_json_dict_raw(url, cookies, session, proxy, times)
     if json_data is None:
diff --git a/src/util/timer.py b/src/util/timer.py
index 5d24942..e6a69f1 100644
--- a/src/util/timer.py
+++ b/src/util/timer.py
@@ -8,8 +8,8 @@
 # Use a long interval when crawling buff, a short one for steam
 def sleep_awhile(is_steam_request = 0):
-    low = max(FREQUENCY_INTERVAL_LOW, 2)
-    high = max(2, FREQUENCY_INTERVAL_HIGH)
+    low = max(FREQUENCY_INTERVAL_LOW, 10)
+    high = max(10, FREQUENCY_INTERVAL_HIGH)
     if is_steam_request == 1:
         interval = 1/(random.randint(5, 10))
     else:
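
The heart of this patch is the connector selection in `history_price_crawler.py`: aiohttp's built-in per-request `proxy=` argument only supports HTTP proxies, so SOCKS support has to come from `aiohttp_socks.ProxyConnector`, installed at the session level. The sketch below shows that pattern in isolation; it is a minimal standalone sketch, not project code, and the proxy URL, the `URLS` list, and the `fetch`/`main` helpers are illustrative placeholders.

```python
import asyncio

import aiohttp
from aiohttp_socks import ProxyConnector

# Hypothetical configuration; in oddish these values come from config.ini.
PROXY = 'socks5://127.0.0.1:10808'  # set to None to connect directly
URLS = ['https://steamcommunity.com/market/'] * 3  # illustrative targets


async def fetch(session: aiohttp.ClientSession, url: str) -> str:
    # Every task reuses the one shared session; the connector's
    # limit=5 caps how many requests are in flight at once.
    async with session.get(url) as resp:
        return await resp.text()


async def main():
    if PROXY:
        # Session-level SOCKS support: the connector tunnels every request.
        connector = ProxyConnector.from_url(PROXY, limit=5)
    else:
        connector = aiohttp.TCPConnector(limit=5)
    async with aiohttp.ClientSession(connector=connector) as session:
        pages = await asyncio.gather(*(fetch(session, url) for url in URLS))
        print([len(page) for page in pages])


if __name__ == '__main__':
    asyncio.run(main())
```

Because the connector is chosen once and baked into the shared `ClientSession`, the per-request `proxy=` branch removed from `requester.py` becomes unnecessary: the same `session.get(url)` call works whether or not a proxy is configured.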