From 9adf1920db0f47c2b2294bc4fd9dfa64b7da6070 Mon Sep 17 00:00:00 2001 From: puppylpg Date: Thu, 12 Nov 2020 23:46:46 +0800 Subject: [PATCH] Add some checks. (#39) --- CHANGELOG.md | 8 ++++++++ config/config.ini | 16 ++++++++-------- src/__main__.py | 3 ++- src/crawl/history_price_crawler.py | 5 +++-- src/crawl/item_crawler.py | 5 ++++- src/util/persist_util.py | 4 ++-- src/util/requester.py | 9 +++++++-- 7 files changed, 34 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 26acc89..cf3dc89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -101,3 +101,11 @@ * bugfix - 修复 Win 下缓存文件编码的问题 - 在缓存意外不合法时现在会重新抓取 + +# v3.8.3(2020-11-12) +* bugfix + - sticker类别变多了,默认黑白名单屏蔽sticker的时候使用通配符; + - 增加了一些校验,防止特殊情况数据不存在时报错: + + 写cache前校验爬取内容是否超时为None; + + 数据表为空校验。没爬到数据就不给出建议了; + + 获取steam历史价格返回时,增加'prices' key存在性校验; diff --git a/config/config.ini b/config/config.ini index ff70d1b..f2cc607 100644 --- a/config/config.ini +++ b/config/config.ini @@ -1,7 +1,7 @@ [BASIC] # 网页登录buff和steam后,把浏览器的cookie贴到这里,才能运行。推荐从Chrome中复制,下面是两条示例cookie -buff_cookie = _ga=GA1.2.162602080.1551374933; _ntes_nuid=8ce0cf6bdce55512e73f49cb8a49960e; mail_psc_fingerprint=d80ec72871726e9b192181fd1a3633d6; OUTFOX_SEARCH_USER_ID_NCOO=29659292.15961449; Device-Id=33u998YqmNWbhH5GbWUo; vjuids=369cb7d82.170e16a9519.0.3eb2c52902997; vjlast=1584329824.1584329824.30; _ntes_nnid=8ce0cf6bdce55512e73f49cb8a49960e,1584329823520; vinfo_n_f_l_n3=d81bf3a25989eb31.1.4.1561837557589.1576393349946.1585037711031; NTES_CMT_USER_INFO=305053074%7C%E6%9C%89%E6%80%81%E5%BA%A6%E7%BD%91%E5%8F%8B0ibHSi%7Chttp%3A%2F%2Fcms-bucket.nosdn.127.net%2F2018%2F08%2F13%2F078ea9f65d954410b62a52ac773875a1.jpeg%7Cfalse%7CeWQuNzU3YTdkZjAwZWNiNDJlOGJAMTYzLmNvbQ%3D%3D; nts_mail_user=shining63.com:-1:1; Locale-Supported=zh-Hans; game=csgo; _gid=GA1.2.408819780.1602387923; NTES_YD_SESS=2ad4oMUZrpNSwralkVQFGR_rtV_TZgYPUG_YgOJvmm9n.6Js.4TFkURWXExDgqyQtJ6h2Xb5ffQVQtqcw3wjTyevM11skcblbUdx4.PVDePscTo5TxGZkWFOFlDL1QAZtPU9d_0I0UegOlOjKEeBL8Eh8RAUnitaCpIfkdkkdS1pjSmSo4ivvdcaUmWcEdxlgDOgqYTMRsxdkXcseYgy0YQ4b8UJhNoWKeWEG_FUnB; S_INFO=1602387932|0|3&80##|; P_INFO=|1602387932|1|netease_buff|00&99|bej&1599745561&netease_buff#bej&null#10#0#0|&0|null|; remember_me=U1094050600|UrFjxmWDgUX4vfX9pphq8i2cTtkEz0d2; session=1-eV6CEf3tgQ9qCDS-6e2xQlEjJXSVuGqp3JNAfLwBVSGe2046524528; csrf_token=ImI2NTcyNThkYmU2Mzg2MjIxMzBhMTE3NzgyYWJiMGM5YzEwYWY5NmEi.EmQUig.oRWIRxnDlzWlMYX4ps9paRW2eWo -steam_cookie = steamMachineAuth76561198251761676=B89D7B0897180E54C9F2E93F8AAFA4583CAADE7D; timezoneOffset=28800,0; _ga=GA1.2.1902489943.1551205764; steamMachineAuth76561198874249759=E46DCE6095514E3D489CAF1E7CBC3F9F8CD3ACC6; browserid=1066728544083117486; recentlyVisitedAppHubs=271590%2C80%2C730; sessionid=afd31a4ca7e900c9125edf89; steamCountry=US%7C630911856130eec5e4910b6ffb8dbdde; _gid=GA1.2.2064465479.1602388630; steamLoginSecure=76561198251761676%7C%7CF29B04B445D4E8B30D2AD04AA974588C1FAB7EDE; steamRememberLogin=76561198251761676%7C%7Ca5e43585d1cd13db87c3d856d7676178; webTradeEligibility=%7B%22allowed%22%3A1%2C%22allowed_at_time%22%3A0%2C%22steamguard_required_days%22%3A15%2C%22new_device_cooldown_days%22%3A7%2C%22time_checked%22%3A16388657%7D; strInventoryLastContext=730_2; tsTradeOffersLastRead=1662594 +buff_cookie = _ga=GA1.2.162602080.1551374933; _ntes_nuid=8ce0cf6bdce55512e73f49cb8a49960e; mail_psc_fingerprint=d80ec72871726e9b192181fd1a3633d6; OUTFOX_SEARCH_USER_ID_NCOO=29659292.15961449; Device-Id=33u998YqmNWbhH5GbWUo; vjuids=369cb7d82.170e16a9519.0.3eb2c52902997; vjlast=1584329824.1584329824.30; _ntes_nnid=8ce0cf6bdce55512e73f49cb8a49960e,1584329823520; vinfo_n_f_l_n3=d81bf3a25989eb31.1.4.1561837557589.1576393349946.1585037711031; NTES_CMT_USER_INFO=305053074%7C%E6%9C%89%E6%80%81%E5%BA%A6%E7%BD%91%E5%8F%8B0ibHSi%7Chttp%3A%2F%2Fcms-bucket.nosdn.127.net%2F2018%2F08%2F13%2F078ea9f65d954410b62a52ac773875a1.jpeg%7Cfalse%7CeWQuNzU3YTdkZjAwZWNiNDJlOGJAMTYzLmNvbQ%3D%3D; nts_mail_user=ter@163.com:-1:1; __oc_uuid=ed078220-12cd-11eb-8ff5-199a4d2b4ac4; Locale-Supported=zh-Hans; game=csgo; _gid=GA1.2.648285736.1605190175; _gat_gtag_UA_109989484_1=1; NTES_YD_SESS=SN.CH9UV_zHPlqCiCLvgBOoLTrvc2fBGRyieqBhbqAP1HglxHydKU4DQmq5B7At06lgkSmM_II0j06AJnuMWYnpdtYe8PPxUJMsM4X5yH3jBY3xJdC_d59nM8A1bksgKL51SSXhh3Rbd4SeDy6ZIwse2MUjzElPeLdPKBaMoZafPdtNUF9E67TduT0krt3r6_s46hz3dnGE.y20NruVavQP3kETqQCAqK6iZ3b0Nc6tJw; S_INFO=1605190207|0|3&80##|2051; P_INFO=2051|1605190207|1|netease_buff|00&99|bej&1602387932&netease_buff#bej&null#10#0#0|&0|null|2051; remember_me=U1094050600|T3zeeLJIc6y9kVtTTAGV0mdqvIXDpeX0; session=1-WfP1TH9yGjtZniGRmbfFSezTOMS-ZeYguhJFzDIT5Fem2046524528; csrf_token=ImFjMWE4YTc4MDFkMTAyZjYyYWZhZWVhYzllZGFlNTJiZjc1NWE1MDEi.Eo7TwA.f39WinRhrzJgSTG4as2EjhD6za0 +steam_cookie = ActListPageSize=100; steamMachineAuth76561198251761676=B89D7B0897180E54C9F2E93F8AAFA4583CAADE7D; timezoneOffset=28800,0; _ga=GA1.2.1902489943.1551205764; steamMachineAuth76561198874249759=E46DCE6095514E3D489CAF1E7CBC3F9F8CD3ACC6; browserid=1066728544083117486; recentlyVisitedAppHubs=271590%2C80%2C730; Steam_Language=english; steamCountry=US%7C4705a9aaf22f908f9e4452081abd865a; sessionid=56b51232f9f3936a0ebbf88d; _gid=GA1.2.1847664544.1605190173; steamLoginSecure=76561198251761676%7C%7CE4B6E3BBDD5AF069692D8C8A56755ECBB34ECC68; steamRememberLogin=76561198251761676%7C%7Ca5e43585d1cd13db87c3d856d7676178; webTradeEligibility=%7B%22allowed%22%3A1%2C%22allowed_at_time%22%3A0%2C%22steamguard_required_days%22%3A15%2C%22new_device_cooldown_days%22%3A7%2C%22time_checked%22%3A1605190194%7D # 提供一个代理来访问Steam社区市场 proxy = socks5://127.0.0.1:10808 @@ -18,7 +18,7 @@ url_cache_hour = 6 # 无视缓存爬取数据 force_crawl = False # 一次请求的超时重试次数 -retry_times = 4 +retry_times = 3 # 基本参数设置 [COMMON] @@ -28,18 +28,18 @@ steam_sell_tax = 0.15 # 过滤行为 [FILTER] # 爬取物品的最低价格,价格过低则不考虑 -crawl_min_price_item = 100 +crawl_min_price_item = 150 # 爬取物品的最高价格 -crawl_max_price_item = 200 +crawl_max_price_item = 160 # 7天交易历史,少于该阈值则认为是冷门物品,不考虑 min_sold_threshold = 70 # 爬取类别白名单,如只想爬取AK和M4(A1 & A4),则设置为:["weapon_ak47", "weapon_m4a1", "weapon_m4a1_silencer"] # 具体类别参考`config/reference/category.md`,详见README -# 黑白名单均支持通配符匹配,如 weapon_knife* 等,更多用法请搜索 "Shell 通配符" +# 黑白名单均支持通配符匹配,如'weapon_knife*'等,更多用法请搜索 "Shell 通配符" category_white_list = [] # 爬取类别黑名单。如果黑名单白名单同时存在,白名单优先级更高 -# 默认的黑名单加了以下内容,排除掉乱七八糟的武器箱音乐盒印花探员之类的 -category_black_list = ["sticker", "csgo_type_tool", "csgo_type_spray", "csgo_type_collectible", "csgo_type_ticket", "csgo_tool_gifttag", "csgo_type_musickit", "csgo_type_weaponcase", "csgo_tool_weaponcase_keytag", "type_customplayer", "csgo_tool_patch"] +# 默认的黑名单加了以下内容,排除掉乱七八糟的武器箱音乐盒印花探员之类的,刀也排除掉了(不会真有人steam里卖刀吧:D) +category_black_list = ["*sticker*", "*knife*", "csgo_type_tool", "csgo_type_spray", "csgo_type_collectible", "csgo_type_ticket", "csgo_tool_gifttag", "csgo_type_musickit", "csgo_type_weaponcase", "csgo_tool_weaponcase_keytag", "type_customplayer", "csgo_tool_patch"] # 结果设置 [RESULT] diff --git a/src/__main__.py b/src/__main__.py index 05745ad..b2068ce 100644 --- a/src/__main__.py +++ b/src/__main__.py @@ -12,7 +12,8 @@ table = item_crawler.crawl() - if table is not None: + # table may be empty if no data is received due to timeout + if (table is not None) and (not table.empty): # suggestion suggestion.suggest(table) else: diff --git a/src/crawl/history_price_crawler.py b/src/crawl/history_price_crawler.py index 6a1624e..e9271a7 100644 --- a/src/crawl/history_price_crawler.py +++ b/src/crawl/history_price_crawler.py @@ -2,8 +2,8 @@ from datetime import datetime from src.config.urls import steam_price_history_url -from src.util.requester import get_json_dict, steam_cookies from src.util.logger import log +from src.util.requester import get_json_dict, steam_cookies def crawl_item_history_price(index, item, total_price_number): @@ -13,7 +13,8 @@ def crawl_item_history_price(index, item, total_price_number): log.info('GET steam history price {}/{} for ({}): {}'.format(index, total_price_number, item.name, steam_price_url)) steam_history_prices = get_json_dict(steam_price_url, steam_cookies, True) - if steam_history_prices is not None: + # key existence check + if (steam_history_prices is not None) and ('prices' in steam_history_prices): raw_price_history = steam_history_prices['prices'] if len(raw_price_history) > 0: days = min((datetime.today().date() - datetime.strptime(raw_price_history[0][0], '%b %d %Y %H: +0').date()).days, 7) diff --git a/src/crawl/item_crawler.py b/src/crawl/item_crawler.py index 64470dd..22fa97b 100644 --- a/src/crawl/item_crawler.py +++ b/src/crawl/item_crawler.py @@ -1,4 +1,5 @@ import re +# from tqdm import tqdm from src.config.definitions import CRAWL_MIN_PRICE_ITEM, CRAWL_MAX_PRICE_ITEM, BUFF_COOKIE, FORCE_CRAWL from src.config.urls import goods_section_root_url, goods_root_url, goods_section_page_url @@ -67,8 +68,10 @@ def crawl_website(): # crawl by categories and price section if len(raw_categories) != len(categories): - for category in categories: + total_category = len(categories) + for index, category in enumerate(categories, start=1): csgo_items.extend(crawl_goods_by_price_section(category)) + log.info('GET category {}/{} for ({}).'.format(index, total_category, category)) else: # crawl by price section without category csgo_items.extend(crawl_goods_by_price_section(None)) diff --git a/src/util/persist_util.py b/src/util/persist_util.py index 89f15b7..bd6ec94 100644 --- a/src/util/persist_util.py +++ b/src/util/persist_util.py @@ -13,6 +13,6 @@ def tabulate(csgo_items): def table_info(table): - log.info(table) - log.info(table.describe()) + # log.info(table) + log.info('Total Items Summary:\n{}'.format(table.describe())) log.info('\n') diff --git a/src/util/requester.py b/src/util/requester.py index 943dc18..114529f 100644 --- a/src/util/requester.py +++ b/src/util/requester.py @@ -60,5 +60,10 @@ def get_json_dict(url, cookies, proxy = False, times = 1): if exist(url): return json.loads(fetch(url)) json_data = get_json_dict_raw(url, cookies, proxy, times) - store(url,json_data) - return json.loads(json_data) + + if json_data is None: + return None + else: + # can not store None + store(url, json_data) + return json.loads(json_data)