Fix bug: score changes reorder the Redis sorted set, causing proxies to be tested repeatedly (#78)
* Use a mirror source for pip

* Set a timeout when fetching proxies

Some of the sites that publish free proxy IPs are blocked or otherwise unreachable, so a request could hang for several minutes before returning.

* Stop after page 5: proxy IPs on later pages are two or three days old and of very poor quality

* Fix bug: score changes reorder the Redis sorted set, causing proxies to be tested repeatedly

Fixes bug #73.
Redis sorted sets are ordered by score, so modifying a score mid-iteration shifts the remaining members: an index-based traversal then revisits some proxies and skips others. The tester now iterates with a scan cursor instead (see the cursor-scan sketch after this changelog).

* Ensure no entries with a score of 0 remain in Redis

The previous logic either decreased a proxy's score or deleted the proxy, which meant a proxy whose score had just dropped to 0 still lingered in Redis. The score is now decreased first and the result compared against PROXY_SCORE_MIN (see the decrease sketch after this changelog).

* Fix bug: setting LOG_DIR had no effect #62

* Remove unused imports

* Change the pip source

* Fix bug: the logic for fetching and checking proxy scores

* Remove the unusable iphai and xicidaili crawlers; fix the zhandaye bug

* Fix bug: the zhandaye catalog was fetched only once

* Fix bug: zhandaye's crawl returned no value, causing an error

* Restore iphai and xicidaili, adding an ignore attribute to both classes

* Add a MAX_PAGE variable

* Set the pip source

* Restore the Dockerfile
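
A minimal sketch of the cursor-based iteration this commit switches to, using redis-py's zscan. The key name and batch size below are illustrative placeholders, not the project's actual settings:

import redis

db = redis.StrictRedis(host='localhost', port=6379, decode_responses=True)
REDIS_KEY = 'proxies:universal'  # hypothetical key name
TEST_BATCH = 256                 # hypothetical scan count hint

cursor = 0
while True:
    # zscan returns (next_cursor, [(member, score), ...]); unlike slicing the
    # sorted set by rank, a scan cursor is not thrown off when scores change mid-run
    cursor, items = db.zscan(REDIS_KEY, cursor, count=TEST_BATCH)
    for member, _score in items:
        print('would test proxy', member)
    if cursor == 0:
        break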
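
And a minimal sketch of the reworked decrease logic: decrement first, then compare the new score against the minimum and remove the proxy if it has fallen to or below it. Again, the key name and threshold are placeholders rather than the project's exact values:

import redis

db = redis.StrictRedis(decode_responses=True)
REDIS_KEY = 'proxies:universal'  # hypothetical key name
PROXY_SCORE_MIN = 0              # placeholder for the project's setting

def decrease(proxy: str):
    # redis-py 3.x signature: zincrby(name, amount, value)
    db.zincrby(REDIS_KEY, -1, proxy)
    score = db.zscore(REDIS_KEY, proxy)
    if score is not None and score <= PROXY_SCORE_MIN:
        # previously a proxy sitting at exactly 0 could linger in the set
        db.zrem(REDIS_KEY, proxy)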
winturn authored Jul 13, 2020
1 parent def06b1 commit 737998f
Showing 10 changed files with 59 additions and 49 deletions.
3 changes: 2 additions & 1 deletion proxypool/crawlers/__init__.py
@@ -1,8 +1,8 @@
import pkgutil
from .base import BaseCrawler
from .public.zhandaye import ZhandayeDetailCrawler
import inspect


# load classes subclass of BaseCrawler
classes = []
for loader, name, is_pkg in pkgutil.walk_packages(__path__):
@@ -13,3 +13,4 @@
and not getattr(value, 'ignore', False):
classes.append(value)
__all__ = __ALL__ = classes

5 changes: 4 additions & 1 deletion proxypool/crawlers/base.py
@@ -1,6 +1,7 @@
from retrying import retry
import requests
from loguru import logger
from proxypool.setting import GET_TIMEOUT


class BaseCrawler(object):
@@ -9,8 +10,11 @@ class BaseCrawler(object):
@retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000)
def fetch(self, url, **kwargs):
try:
kwargs.setdefault('timeout', GET_TIMEOUT)
kwargs.setdefault('verify', False)
response = requests.get(url, **kwargs)
if response.status_code == 200:
response.encoding = 'utf-8'
return response.text
except requests.ConnectionError:
return
@@ -23,7 +27,6 @@ def crawl(self):
for url in self.urls:
logger.info(f'fetching {url}')
html = self.fetch(url)
print('html', html)
for proxy in self.parse(html):
logger.info(f'fetched proxy {proxy.string()} from {url}')
yield proxy
3 changes: 2 additions & 1 deletion proxypool/crawlers/public/iphai.py
@@ -10,7 +10,7 @@ class IPHaiCrawler(BaseCrawler):
iphai crawler, http://www.iphai.com/
"""
urls = [BASE_URL]

ignore = True

def parse(self, html):
"""
@@ -32,3 +32,4 @@ def parse(self, html):
crawler = IPHaiCrawler()
for proxy in crawler.crawl():
print(proxy)

3 changes: 2 additions & 1 deletion proxypool/crawlers/public/kuaidaili.py
@@ -5,13 +5,14 @@


BASE_URL = 'https://www.kuaidaili.com/free/inha/{page}/'
MAX_PAGE = 5


class KuaidailiCrawler(BaseCrawler):
"""
kuaidaili crawler, https://www.kuaidaili.com/
"""
urls = [BASE_URL.format(page=page) for page in range(1, 200)]
urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]

def parse(self, html):
"""
2 changes: 2 additions & 0 deletions proxypool/crawlers/public/xicidaili.py
@@ -11,6 +11,7 @@ class XicidailiCrawler(BaseCrawler):
xididaili crawler, https://www.xicidaili.com/
"""
urls = [BASE_URL]
ignore = True

headers = {
'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
@@ -48,3 +49,4 @@ def parse(self, html):
crawler = XicidailiCrawler()
for proxy in crawler.crawl():
print(proxy)

40 changes: 19 additions & 21 deletions proxypool/crawlers/public/zhandaye.py
@@ -2,6 +2,7 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
from loguru import logger
import re


BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html'
@@ -11,50 +12,47 @@ class ZhandayeCrawler(BaseCrawler):
"""
zhandaye crawler, https://www.zdaye.com/dayProxy/
"""
urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE)]

urls_catalog = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE)]
headers = {
'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}
urls = []

def crawl(self):
for url in self.urls:
self.crawl_catalog()
yield from super().crawl()

def crawl_catalog(self):
for url in self.urls_catalog:
logger.info(f'fetching {url}')
html = self.fetch(url, headers=self.headers)
self.parse(html)
self.parse_catalog(html)

def parse(self, html):
def parse_catalog(self, html):
"""
parse html file to get proxies
:return:
"""
doc = pq(html)
for item in doc('#J_posts_list .thread_item div div p a').items():
post = 'https://www.zdaye.com' + item.attr('href')
logger.info(f'get detail url: {post}')
ZhandayeDetailCrawler(post).crawl()


class ZhandayeDetailCrawler(BaseCrawler):
urls = []
ignore = True

def __init__(self, url):
self.urls.append(url)
super().__init__()
url = 'https://www.zdaye.com' + item.attr('href')
logger.info(f'get detail url: {url}')
self.urls.append(url)

def parse(self, html):
doc = pq(html)
trs = doc('.cont br').items()
for tr in trs:
line = tr[0].tail
host = line.split(':')[0]
port = line.split(':')[1][:4]
yield Proxy(host=host, port=port)

match = re.search(r'(\d+\.\d+\.\d+\.\d+):(\d+)', line)
if match:
host = match.group(1)
port = match.group(2)
yield Proxy(host=host, port=port)


if __name__ == '__main__':
crawler = ZhandayeCrawler()
for proxy in crawler.crawl():
print(proxy)

3 changes: 2 additions & 1 deletion proxypool/processors/getter.py
@@ -33,10 +33,11 @@ def run(self):
if self.is_full():
return
for crawler in self.crawlers:
logger.info(f'crawler {crawler} to get proxy')
for proxy in crawler.crawl():
self.redis.add(proxy)


if __name__ == '__main__':
getter = Getter()
getter.run()
getter.run()
17 changes: 9 additions & 8 deletions proxypool/processors/tester.py
@@ -74,14 +74,15 @@ def run(self):
logger.info('stating tester...')
count = self.redis.count()
logger.debug(f'{count} proxies to test')
for i in range(0, count, TEST_BATCH):
# start end end offset
start, end = i, min(i + TEST_BATCH, count)
logger.debug(f'testing proxies from {start} to {end} indices')
proxies = self.redis.batch(start, end)
tasks = [self.test(proxy) for proxy in proxies]
# run tasks using event loop
self.loop.run_until_complete(asyncio.wait(tasks))
cursor = 0
while True:
logger.debug(f'testing proxies use cursor {cursor}, count {TEST_BATCH}')
cursor, proxies = self.redis.batch(cursor, count=TEST_BATCH)
if proxies:
tasks = [self.test(proxy) for proxy in proxies]
self.loop.run_until_complete(asyncio.wait(tasks))
if not cursor:
break


if __name__ == '__main__':
6 changes: 4 additions & 2 deletions proxypool/setting.py
@@ -53,6 +53,7 @@
CYCLE_TESTER = env.int('CYCLE_TESTER', 20)
# definition of getter cycle, it will get proxy every CYCLE_GETTER second
CYCLE_GETTER = env.int('CYCLE_GETTER', 100)
GET_TIMEOUT = env.int('GET_TIMEOUT', 10)

# definition of tester
TEST_URL = env.str('TEST_URL', 'http://www.baidu.com')
@@ -75,5 +76,6 @@
ENABLE_GETTER = env.bool('ENABLE_GETTER', True)
ENABLE_SERVER = env.bool('ENABLE_SERVER', True)

logger.add(env.str('LOG_RUNTIME_FILE', 'runtime.log'), level='DEBUG', rotation='1 week', retention='20 days')
logger.add(env.str('LOG_ERROR_FILE', 'error.log'), level='ERROR', rotation='1 week')
logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), level='DEBUG', rotation='1 week', retention='20 days')
logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), level='ERROR', rotation='1 week')

26 changes: 13 additions & 13 deletions proxypool/storages/redis.py
@@ -67,17 +67,15 @@ def decrease(self, proxy: Proxy) -> int:
:param proxy: proxy
:return: new score
"""
score = self.db.zscore(REDIS_KEY, proxy.string())
# current score is larger than PROXY_SCORE_MIN
if score and score > PROXY_SCORE_MIN:
logger.info(f'{proxy.string()} current score {score}, decrease 1')
if IS_REDIS_VERSION_2:
return self.db.zincrby(REDIS_KEY, proxy.string(), -1)
return self.db.zincrby(REDIS_KEY, -1, proxy.string())
# otherwise delete proxy
if IS_REDIS_VERSION_2:
self.db.zincrby(REDIS_KEY, proxy.string(), -1)
else:
self.db.zincrby(REDIS_KEY, -1, proxy.string())
score = self.db.zscore(REDIS_KEY, proxy.string())
logger.info(f'{proxy.string()} score decrease 1, current {score}')
if score <= PROXY_SCORE_MIN:
logger.info(f'{proxy.string()} current score {score}, remove')
return self.db.zrem(REDIS_KEY, proxy.string())
self.db.zrem(REDIS_KEY, proxy.string())

def exists(self, proxy: Proxy) -> bool:
"""
@@ -112,17 +110,19 @@ def all(self) -> List[Proxy]:
"""
return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX))

def batch(self, start, end) -> List[Proxy]:
def batch(self, cursor, count) -> List[Proxy]:
"""
get batch of proxies
:param start: start index
:param end: end index
:param cursor: scan cursor
:param count: scan count
:return: list of proxies
"""
return convert_proxy_or_proxies(self.db.zrevrange(REDIS_KEY, start, end - 1))
cursor, proxies = self.db.zscan(REDIS_KEY, cursor, count=count)
return cursor, convert_proxy_or_proxies([i[0] for i in proxies])


if __name__ == '__main__':
conn = RedisClient()
result = conn.random()
print(result)
