Use aiohttp-socks to request steam via SOCKS proxies. (#66)
puppylpg authored Jan 9, 2021
1 parent db3880a commit a1f5545
Showing 8 changed files with 53 additions and 24 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -133,3 +133,6 @@
* bugfix
- Should divide the total quantity by 80, not the total page count by 80……

## 4.0.0 (2021-01-10)
* feature
  - Use aio to request data from steam
16 changes: 15 additions & 1 deletion README.md
@@ -42,6 +42,14 @@ If there is no data available, crawl from the website, then analyse data from lo
## Video tutorial
The most straightforward way is to follow the video step by step: [oddish tutorial for complete beginners](https://www.bilibili.com/video/BV1ET4y1w7e1/)

## v4.0.0 async crawling
Starting with v4.0.0, data is fetched from steam with Python aio (async io), which speeds things up enormously; the whole run basically finishes within a minute.
Accordingly, a few extra aio-related dependencies need to be installed.

**If you really cannot get it working, consider downloading a release before v4.0.0. It is slightly simpler to use, but the downside is that it is very slow. Compared with v4.0.0, the difference is roughly one minute versus one hour.**

Personally, I still recommend trying v4.0.0+ first, since the speed gain is far from trivial.
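
To give a sense of what the aio change means, here is a minimal sketch of concurrent requests with asyncio and aiohttp (the URL and helper names are placeholders, not the project's actual request code):

```python
import asyncio

import aiohttp

async def fetch(session, url):
    # each request yields control while waiting on the network,
    # so many requests can be in flight at once instead of one by one
    async with session.get(url) as resp:
        return await resp.text()

async def fetch_all(urls):
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, u) for u in urls))

# pages = asyncio.run(fetch_all(["https://steamcommunity.com/market/..."]))  # placeholder URL
```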

## wiki
There are also two articles on the early design and optimization ideas behind oddish, if you want to read them:
- https://puppylpg.github.io/2019/12/02/python-crawler-buff/
@@ -94,11 +102,17 @@ steam_cookie = timezoneOffset=28800,0; steamMachineAuth76561198093333055=649A9B5
If you do not care about the process, just look at the analysis results.

## Dependencies
If you only know how to install packages by hand, install the following dependencies:
If you only know how to install packages by hand with Anaconda, install the following dependencies:
- python: 3.8.5
- pandas: 1.1.0
- numpy: 1.19.1
- requests: 2.24.0
- aiohttp: 3.7.3
- aiofiles: 0.6.0
- aiohttp-socks: 0.5.5 (install with pip)

Since aiohttp-socks has not been ported to Anaconda, it can only be installed with pip. Click the Launch button under 'CMD.exe Prompt' on the Anaconda home screen;
this opens a command line, where running `pip install aiohttp-socks` installs it.

If you know pip, install everything directly with:
> pip install -r requirements.txt
16 changes: 8 additions & 8 deletions config/config.ini
@@ -13,11 +13,11 @@ proxy = socks5://127.0.0.1:10808
# Controls program behavior
[BEHAVIOR]
# Warning: since crawling buff too heavily now puts the account in a cooldown for a while, set this larger!!!
# With a larger crawl interval crawling gets very slow; use the category_white_list/category_black_list below to narrow down the target items
# Lower bound of the crawl interval: 4s
frequency_interval_low=4
# Upper bound of the crawl interval: 8s, i.e. crawl once every 4-8s
frequency_interval_high=8
# Very little information is fetched from buff now, so a larger crawl interval barely affects the overall crawl speed.
# Lower bound of the crawl interval: 10s
frequency_interval_low=10
# Upper bound of the crawl interval: 15s, i.e. crawl once every 10-15s
frequency_interval_high=15
# Interval in hours before re-crawling already cached files
url_cache_hour = 6
# Crawl data ignoring the cache
@@ -35,9 +35,9 @@ steam_sell_tax = 0.15
# Filtering behavior
[FILTER]
# Minimum price of items to crawl; items priced too low are ignored
crawl_min_price_item = 150
crawl_min_price_item = 100
# Maximum price of items to crawl
crawl_max_price_item = 160
crawl_max_price_item = 300
# 7-day sales history; items with fewer sales than this threshold are considered unpopular and skipped
min_sold_threshold = 70
# Category white list; e.g. to crawl only AK and M4 (A1 & A4), set it to: ["weapon_ak47", "weapon_m4a1", "weapon_m4a1_silencer"]
@@ -51,5 +51,5 @@ category_black_list = ["*sticker*", "*knife*", "csgo_type_tool", "csgo_type_spra
# Result settings
[RESULT]
# Show only this many entries per suggestion
top_n = 50
top_n = 20
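
If you want to check which values the program actually picks up, here is a small sketch of reading these options with Python's standard configparser (section and option names are taken from the diff above; the project's real config loader may differ):

```python
from configparser import ConfigParser

config = ConfigParser()
config.read('config/config.ini', encoding='utf-8')

# section/option names as they appear in the diff above
frequency_interval_low = config.getint('BEHAVIOR', 'frequency_interval_low')    # 10
frequency_interval_high = config.getint('BEHAVIOR', 'frequency_interval_high')  # 15
crawl_min_price_item = config.getfloat('FILTER', 'crawl_min_price_item')        # 100
top_n = config.getint('RESULT', 'top_n')                                        # 20
```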

3 changes: 3 additions & 0 deletions requirements.txt
@@ -1,3 +1,6 @@
aiofiles==0.6.0
aiohttp==3.7.3
aiohttp-socks==0.5.5
certifi==2020.6.20
cffi==1.14.3
chardet==3.0.4
12 changes: 11 additions & 1 deletion src/crawl/history_price_crawler.py
@@ -4,6 +4,10 @@
from datetime import datetime

import aiohttp
# import aiosocks
# from aiosocks.connector import ProxyConnector
from aiohttp_socks import ProxyConnector
from src.config.definitions import PROXY

from src.config.urls import steam_price_history_url
from src.util.logger import log
@@ -45,7 +49,13 @@ async def async_crawl_history_price(csgo_items):
log.info('Total {} items to get history price.'.format(total_price_number))

tasks = []
async with aiohttp.ClientSession(cookies=steam_cookies, headers=get_headers(), connector = aiohttp.TCPConnector(limit=5)) as session:

if PROXY:
# use socks
connector = ProxyConnector.from_url(PROXY, limit=5)
else:
connector = aiohttp.TCPConnector(limit=5)
async with aiohttp.ClientSession(cookies=steam_cookies, headers=get_headers(), connector=connector) as session:
for index, item in enumerate(csgo_items, start=1):
try:
tasks.append(
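In isolation, the connector selection introduced above looks roughly like this (the PROXY value here is an example in the same format as config.ini; the real value comes from src.config.definitions):

```python
import aiohttp
from aiohttp_socks import ProxyConnector

PROXY = 'socks5://127.0.0.1:10808'  # example value, same format as config/config.ini

def build_connector():
    # route steam requests through the SOCKS proxy when one is configured,
    # otherwise use a plain TCP connector with the same concurrency limit
    if PROXY:
        return ProxyConnector.from_url(PROXY, limit=5)
    return aiohttp.TCPConnector(limit=5)

# async with aiohttp.ClientSession(cookies=steam_cookies, headers=get_headers(),
#                                  connector=build_connector()) as session:
#     ...
```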
9 changes: 4 additions & 5 deletions src/crawl/item_crawler.py
@@ -56,12 +56,11 @@ def csgo_all_categories():
return categories


def enrich_item_with_price_history(csgo_items, crawl_steam_async = True):
def enrich_item_with_price_history(csgo_items, crawl_steam_async=True):
# crawl price for all items
if crawl_steam_async == True:
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.run(
history_price_crawler.async_crawl_history_price(csgo_items))
if crawl_steam_async:
# asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.run(history_price_crawler.async_crawl_history_price(csgo_items))
else:
history_price_crawler.crawl_history_price(csgo_items)
return csgo_items
14 changes: 7 additions & 7 deletions src/util/requester.py
@@ -98,14 +98,9 @@ async def async_get_json_dict_raw(url, cookies, session: ClientSession, proxy =


try:
if proxy and proxies != {}:
async with session.get(url, proxy=proxies["http"]) as resp:
return await resp.text()
# return requests.get(url, headers=get_headers(), cookies=cookies, timeout=5, proxies=proxies).text
async with session.get(url) as resp:
return await resp.text()
# return requests.get(url, headers=get_headers(), cookies=cookies, timeout=5).text

except Timeout:
log.warn("Timeout for {}. Try again.".format(url))
except Exception as e:
@@ -115,14 +110,19 @@
# On the first error, sleep asynchronously; on the second error, all tasks sleep.
await timer.async_sleep_awhile()
if times == 2:
log.error('aio http error happens 2 times. use sync wait')
timer.sleep_awhile()

data = await async_get_json_dict_raw(url, cookies, session, proxy, times + 1)
return data

async def async_get_json_dict(url, cookies, session, proxy = False, times = 1):
if await asyncexist(url):
return json.loads(await asyncfetch(url))
# TODO: after some data is fetched from the cache, aiohttp_socks requests die with: RuntimeError: Session is closed; requesting steam from the very start works fine
# TODO: the issue above also makes the final table all NaN
# TODO to reproduce: clear the cache and run once, then randomly delete one entry from the cache and run again
# TODO given the two issues above, fetching data from the local cache is disabled for now. aio is fast anyway, so the cache is not really needed
# if await asyncexist(url):
# return json.loads(await asyncfetch(url))
json_data = await async_get_json_dict_raw(url, cookies, session, proxy, times)

if json_data is None:
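Pulled out of the diff, the retry flow described by the comment above amounts to roughly this (a simplified sketch; the real code delegates the sleeps to src.util.timer and the caller parses the JSON):

```python
import asyncio
import random
import time

async def get_text_with_retry(session, url, times=1):
    try:
        async with session.get(url) as resp:
            return await resp.text()
    except Exception as e:
        print('{} for {}. Try again.'.format(e, url))
        # first failure: only this coroutine sleeps, other tasks keep running
        await asyncio.sleep(random.uniform(1, 3))
        if times == 2:
            # repeated failures: block the whole event loop so every task backs off
            time.sleep(random.uniform(5, 10))
        return await get_text_with_retry(session, url, times + 1)
```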
4 changes: 2 additions & 2 deletions src/util/timer.py
@@ -8,8 +8,8 @@

# Use a long interval when crawling buff and a short one for steam
def sleep_awhile(is_steam_request = 0):
low = max(FREQUENCY_INTERVAL_LOW, 2)
high = max(2, FREQUENCY_INTERVAL_HIGH)
low = max(FREQUENCY_INTERVAL_LOW, 10)
high = max(10, FREQUENCY_INTERVAL_HIGH)
if is_steam_request == 1:
interval = 1/(random.randint(5, 10))
else:
Expand Down
