Use max page_size when requesting (#59)
puppylpg authored Dec 19, 2020
1 parent 1213cb4 commit cbd27fe
Showing 3 changed files with 27 additions and 5 deletions.
CHANGELOG.md (4 additions, 0 deletions)

@@ -125,3 +125,7 @@
 * Features
     - The config supports pasting your own UA into `buff_user_agent`. If left blank, a random UA is chosen and that same UA is reused for the whole crawl, rather than randomizing a new UA on every request.
 
+## 3.9.2 (2020-12-19)
+* Features
+    - buff has a `page_size` parameter, and experiments show each page can return at most 80 items. The default is 20, so using 80 cuts the HTTP requests to a quarter of the original.
+
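For intuition, a minimal sketch of the arithmetic behind that changelog entry; the item count is a made-up number for illustration:

```python
import math

total_count = 10_000  # hypothetical number of items in one price section

requests_at_default = math.ceil(total_count / 20)  # 500 requests at page_size=20
requests_at_max = math.ceil(total_count / 80)      # 125 requests at page_size=80

# 125 / 500 == 0.25: crawling the same section now costs a quarter of the HTTP requests
print(requests_at_default, requests_at_max)
```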
src/config/urls.py (7 additions, 4 deletions)

@@ -33,8 +33,10 @@ def buff_price_history_url(item_id):
 
 def goods_section_root_url(category):
     """
-    buff is strange: only request with page number beyond actual upper bound,
+    buff HAS BUG: only by requesting with a page number beyond the actual upper bound
     can you get the true page count for this price section.
+    So sys.maxsize is used as the page number here, in order to get the total page and item counts.
     """
 
     base = BUFF_GOODS + 'game=csgo&page_num={}&sort_by=price.asc&min_price={}&max_price={}' \

@@ -45,9 +47,10 @@ def goods_section_root_url(category):
     return base
 
 
-def goods_section_page_url(category, page_num):
-    base = BUFF_GOODS + 'game=csgo&page_num={}&sort_by=price.desc&min_price={}&max_price={}' \
-        .format(page_num, CRAWL_MIN_PRICE_ITEM, CRAWL_MAX_PRICE_ITEM)
+def goods_section_page_url(category, page_num, page_size=20):
+    # buff supports a page_size parameter, but its maximum accepted value is 80
+    base = BUFF_GOODS + 'game=csgo&page_num={}&sort_by=price.desc&min_price={}&max_price={}&page_size={}' \
+        .format(page_num, CRAWL_MIN_PRICE_ITEM, CRAWL_MAX_PRICE_ITEM, page_size)
     if category is not None:
         base += '&category={}'.format(category)
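A usage sketch of the new signature. The endpoint prefix and price bounds are hypothetical stand-ins for the repo's real config values, and the trailing `return base` (folded out of the diff above) is assumed so the example is self-contained:

```python
# Hypothetical stand-ins for the real values in src/config; illustration only.
BUFF_GOODS = 'https://buff.163.com/api/market/goods?'
CRAWL_MIN_PRICE_ITEM, CRAWL_MAX_PRICE_ITEM = 50, 5000

def goods_section_page_url(category, page_num, page_size=20):
    # buff supports a page_size parameter, but its maximum accepted value is 80
    base = BUFF_GOODS + 'game=csgo&page_num={}&sort_by=price.desc&min_price={}&max_price={}&page_size={}' \
        .format(page_num, CRAWL_MIN_PRICE_ITEM, CRAWL_MAX_PRICE_ITEM, page_size)
    if category is not None:
        base += '&category={}'.format(category)
    return base  # assumed, matching goods_section_root_url above

# First page of the whole section, 80 items per page:
print(goods_section_page_url(None, 1, page_size=80))
# https://buff.163.com/api/market/goods?game=csgo&page_num=1&sort_by=price.desc&min_price=50&max_price=5000&page_size=80
```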
src/crawl/item_crawler.py (16 additions, 1 deletion)

@@ -1,4 +1,5 @@
 import re
+import math
 # from tqdm import tqdm
 
 from src.config.definitions import CRAWL_MIN_PRICE_ITEM, CRAWL_MAX_PRICE_ITEM, BUFF_COOKIE, FORCE_CRAWL

@@ -102,11 +103,25 @@ def crawl_goods_by_price_section(category=None):
 
     total_page = root_json['data']['total_page']
     total_count = root_json['data']['total_count']
 
+    # buff has a page_size parameter: 20 items per page by default, at most 80.
+    # Requesting 80 per page cuts the traffic to buff to a quarter; hard-coded for now, not configurable.
+    use_max_page_size = True
+    max_page_size = 80
+    default_page_size = 20
+
+    # the new page count once 80 items are requested per page
+    if use_max_page_size:
+        total_page = math.ceil(total_count / max_page_size)
+
     log.info('{} items across {} pages to crawl in total.'.format(total_count, total_page))
     # get each page
     for page_num in range(1, total_page + 1):
         log.info('Page {} / {}'.format(page_num, total_page))
-        page_url = goods_section_page_url(category, page_num)
+        page_url = goods_section_page_url(
+            category, page_num,
+            page_size=max_page_size if use_max_page_size else default_page_size
+        )
         page_json = get_json_dict(page_url, buff_cookies)
         if (page_json is not None) and ('data' in page_json) and ('items' in page_json['data']):
             # items on this page
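The recomputed page count divides the item count by the new page size. A quick self-check with an assumed item count confirms this matches rescaling buff's reported page count (which is based on 20 items per page):

```python
import math

total_count = 1000                              # assumed item count, illustration only
pages_at_default = math.ceil(total_count / 20)  # 50 pages, as buff would report for page_size=20
pages_at_max = math.ceil(total_count / 80)      # 13 pages once page_size=80 is used

# equivalent to rescaling the reported page count from 20 per page to 80 per page
assert pages_at_max == math.ceil(pages_at_default * 20 / 80)
```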
