Fix bug: score changes reorder the Redis sorted set, causing proxies to be tested repeatedly (#78)
* Use a mirror source for pip

* Set a timeout when fetching proxies

Some of the sites that publish free proxy IPs are blocked or otherwise unreachable, so a request could hang for several minutes before returning.

* Stop after page 5: proxy IPs on later pages are two or three days old and of very poor quality

* Fix bug: score changes reorder the Redis sorted set, causing proxies to be tested repeatedly

Fixes bug #73.
Redis sorted sets are ordered by score, so modifying a score mid-iteration shifts the remaining members: an index-based traversal then revisits some proxies and skips others. The tester now iterates with a scan cursor instead (see the cursor-scan sketch after this changelog).

* Ensure no entries with a score of 0 remain in Redis

The previous logic either decreased a proxy's score or deleted the proxy, which meant a proxy whose score had just dropped to 0 still lingered in Redis. The score is now decreased first and the result compared against PROXY_SCORE_MIN (see the decrease sketch after this changelog).

* Fix bug: setting LOG_DIR had no effect #62

* Remove unused imports

* Change the pip source

* Fix bug: the logic for fetching and checking proxy scores

* Remove the unusable iphai and xicidaili crawlers; fix the zhandaye bug

* Fix bug: the zhandaye catalog was fetched only once

* Fix bug: zhandaye's crawl returned no value, causing an error

* Restore iphai and xicidaili, adding an ignore attribute to both classes

* Add a MAX_PAGE variable

* Set the pip source

* Restore the Dockerfile
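
A minimal sketch of the cursor-based iteration this commit switches to, using redis-py's zscan. The key name and batch size below are illustrative placeholders, not the project's actual settings:

import redis

db = redis.StrictRedis(host='localhost', port=6379, decode_responses=True)
REDIS_KEY = 'proxies:universal'  # hypothetical key name
TEST_BATCH = 256                 # hypothetical scan count hint

cursor = 0
while True:
    # zscan returns (next_cursor, [(member, score), ...]); unlike slicing the
    # sorted set by rank, a scan cursor is not thrown off when scores change mid-run
    cursor, items = db.zscan(REDIS_KEY, cursor, count=TEST_BATCH)
    for member, _score in items:
        print('would test proxy', member)
    if cursor == 0:
        break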
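
And a minimal sketch of the reworked decrease logic: decrement first, then compare the new score against the minimum and remove the proxy if it has fallen to or below it. Again, the key name and threshold are placeholders rather than the project's exact values:

import redis

db = redis.StrictRedis(decode_responses=True)
REDIS_KEY = 'proxies:universal'  # hypothetical key name
PROXY_SCORE_MIN = 0              # placeholder for the project's setting

def decrease(proxy: str):
    # redis-py 3.x signature: zincrby(name, amount, value)
    db.zincrby(REDIS_KEY, -1, proxy)
    score = db.zscore(REDIS_KEY, proxy)
    if score is not None and score <= PROXY_SCORE_MIN:
        # previously a proxy sitting at exactly 0 could linger in the set
        db.zrem(REDIS_KEY, proxy)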
winturn authored Jul 13, 2020
1 parent def06b1 commit 737998f
Showing 10 changed files with 59 additions and 49 deletions.
3 changes: 2 additions & 1 deletion proxypool/crawlers/__init__.py
@@ -1,8 +1,8 @@
import pkgutil
from .base import BaseCrawler
from .public.zhandaye import ZhandayeDetailCrawler
import inspect


# load classes subclass of BaseCrawler
classes = []
for loader, name, is_pkg in pkgutil.walk_packages(__path__):
@@ -13,3 +13,4 @@
and not getattr(value, 'ignore', False):
classes.append(value)
__all__ = __ALL__ = classes

5 changes: 4 additions & 1 deletion proxypool/crawlers/base.py
@@ -1,6 +1,7 @@
from retrying import retry
import requests
from loguru import logger
from proxypool.setting import GET_TIMEOUT


class BaseCrawler(object):
@@ -9,8 +10,11 @@ class BaseCrawler(object):
@retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000)
def fetch(self, url, **kwargs):
try:
kwargs.setdefault('timeout', GET_TIMEOUT)
kwargs.setdefault('verify', False)
response = requests.get(url, **kwargs)
if response.status_code == 200:
response.encoding = 'utf-8'
return response.text
except requests.ConnectionError:
return
@@ -23,7 +27,6 @@ def crawl(self):
for url in self.urls:
logger.info(f'fetching {url}')
html = self.fetch(url)
print('html', html)
for proxy in self.parse(html):
logger.info(f'fetched proxy {proxy.string()} from {url}')
yield proxy
3 changes: 2 additions & 1 deletion proxypool/crawlers/public/iphai.py
@@ -10,7 +10,7 @@ class IPHaiCrawler(BaseCrawler):
iphai crawler, http://www.iphai.com/
"""
urls = [BASE_URL]

ignore = True

def parse(self, html):
"""
@@ -32,3 +32,4 @@ def parse(self, html):
crawler = IPHaiCrawler()
for proxy in crawler.crawl():
print(proxy)

3 changes: 2 additions & 1 deletion proxypool/crawlers/public/kuaidaili.py
@@ -5,13 +5,14 @@


BASE_URL = 'https://www.kuaidaili.com/free/inha/{page}/'
MAX_PAGE = 5


class KuaidailiCrawler(BaseCrawler):
"""
kuaidaili crawler, https://www.kuaidaili.com/
"""
urls = [BASE_URL.format(page=page) for page in range(1, 200)]
urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]

def parse(self, html):
"""
2 changes: 2 additions & 0 deletions proxypool/crawlers/public/xicidaili.py
@@ -11,6 +11,7 @@ class XicidailiCrawler(BaseCrawler):
xididaili crawler, https://www.xicidaili.com/
"""
urls = [BASE_URL]
ignore = True

headers = {
'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
@@ -48,3 +49,4 @@ def parse(self, html):
crawler = XicidailiCrawler()
for proxy in crawler.crawl():
print(proxy)

40 changes: 19 additions & 21 deletions proxypool/crawlers/public/zhandaye.py
@@ -2,6 +2,7 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
from loguru import logger
import re


BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html'
@@ -11,50 +12,47 @@ class ZhandayeCrawler(BaseCrawler):
"""
zhandaye crawler, https://www.zdaye.com/dayProxy/
"""
urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE)]

urls_catalog = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE)]
headers = {
'User-Agent': 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}
urls = []

def crawl(self):
for url in self.urls:
self.crawl_catalog()
yield from super().crawl()

def crawl_catalog(self):
for url in self.urls_catalog:
logger.info(f'fetching {url}')
html = self.fetch(url, headers=self.headers)
self.parse(html)
self.parse_catalog(html)

def parse(self, html):
def parse_catalog(self, html):
"""
parse html file to get proxies
:return:
"""
doc = pq(html)
for item in doc('#J_posts_list .thread_item div div p a').items():
post = 'https://www.zdaye.com' + item.attr('href')
logger.info(f'get detail url: {post}')
ZhandayeDetailCrawler(post).crawl()


class ZhandayeDetailCrawler(BaseCrawler):
urls = []
ignore = True

def __init__(self, url):
self.urls.append(url)
super().__init__()
url = 'https://www.zdaye.com' + item.attr('href')
logger.info(f'get detail url: {url}')
self.urls.append(url)

def parse(self, html):
doc = pq(html)
trs = doc('.cont br').items()
for tr in trs:
line = tr[0].tail
host = line.split(':')[0]
port = line.split(':')[1][:4]
yield Proxy(host=host, port=port)

match = re.search(r'(\d+\.\d+\.\d+\.\d+):(\d+)', line)
if match:
host = match.group(1)
port = match.group(2)
yield Proxy(host=host, port=port)


if __name__ == '__main__':
crawler = ZhandayeCrawler()
for proxy in crawler.crawl():
print(proxy)

3 changes: 2 additions & 1 deletion proxypool/processors/getter.py
@@ -33,10 +33,11 @@ def run(self):
if self.is_full():
return
for crawler in self.crawlers:
logger.info(f'crawler {crawler} to get proxy')
for proxy in crawler.crawl():
self.redis.add(proxy)


if __name__ == '__main__':
getter = Getter()
getter.run()
getter.run()
17 changes: 9 additions & 8 deletions proxypool/processors/tester.py
@@ -74,14 +74,15 @@ def run(self):
logger.info('stating tester...')
count = self.redis.count()
logger.debug(f'{count} proxies to test')
for i in range(0, count, TEST_BATCH):
# start end end offset
start, end = i, min(i + TEST_BATCH, count)
logger.debug(f'testing proxies from {start} to {end} indices')
proxies = self.redis.batch(start, end)
tasks = [self.test(proxy) for proxy in proxies]
# run tasks using event loop
self.loop.run_until_complete(asyncio.wait(tasks))
cursor = 0
while True:
logger.debug(f'testing proxies use cursor {cursor}, count {TEST_BATCH}')
cursor, proxies = self.redis.batch(cursor, count=TEST_BATCH)
if proxies:
tasks = [self.test(proxy) for proxy in proxies]
self.loop.run_until_complete(asyncio.wait(tasks))
if not cursor:
break


if __name__ == '__main__':
6 changes: 4 additions & 2 deletions proxypool/setting.py
@@ -53,6 +53,7 @@
CYCLE_TESTER = env.int('CYCLE_TESTER', 20)
# definition of getter cycle, it will get proxy every CYCLE_GETTER second
CYCLE_GETTER = env.int('CYCLE_GETTER', 100)
GET_TIMEOUT = env.int('GET_TIMEOUT', 10)

# definition of tester
TEST_URL = env.str('TEST_URL', 'http://www.baidu.com')
@@ -75,5 +76,6 @@
ENABLE_GETTER = env.bool('ENABLE_GETTER', True)
ENABLE_SERVER = env.bool('ENABLE_SERVER', True)

logger.add(env.str('LOG_RUNTIME_FILE', 'runtime.log'), level='DEBUG', rotation='1 week', retention='20 days')
logger.add(env.str('LOG_ERROR_FILE', 'error.log'), level='ERROR', rotation='1 week')
logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), level='DEBUG', rotation='1 week', retention='20 days')
logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), level='ERROR', rotation='1 week')

26 changes: 13 additions & 13 deletions proxypool/storages/redis.py
@@ -67,17 +67,15 @@ def decrease(self, proxy: Proxy) -> int:
:param proxy: proxy
:return: new score
"""
score = self.db.zscore(REDIS_KEY, proxy.string())
# current score is larger than PROXY_SCORE_MIN
if score and score > PROXY_SCORE_MIN:
logger.info(f'{proxy.string()} current score {score}, decrease 1')
if IS_REDIS_VERSION_2:
return self.db.zincrby(REDIS_KEY, proxy.string(), -1)
return self.db.zincrby(REDIS_KEY, -1, proxy.string())
# otherwise delete proxy
if IS_REDIS_VERSION_2:
self.db.zincrby(REDIS_KEY, proxy.string(), -1)
else:
self.db.zincrby(REDIS_KEY, -1, proxy.string())
score = self.db.zscore(REDIS_KEY, proxy.string())
logger.info(f'{proxy.string()} score decrease 1, current {score}')
if score <= PROXY_SCORE_MIN:
logger.info(f'{proxy.string()} current score {score}, remove')
return self.db.zrem(REDIS_KEY, proxy.string())
self.db.zrem(REDIS_KEY, proxy.string())

def exists(self, proxy: Proxy) -> bool:
"""
@@ -112,17 +110,19 @@ def all(self) -> List[Proxy]:
"""
return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX))

def batch(self, start, end) -> List[Proxy]:
def batch(self, cursor, count) -> List[Proxy]:
"""
get batch of proxies
:param start: start index
:param end: end index
:param cursor: scan cursor
:param count: scan count
:return: list of proxies
"""
return convert_proxy_or_proxies(self.db.zrevrange(REDIS_KEY, start, end - 1))
cursor, proxies = self.db.zscan(REDIS_KEY, cursor, count=count)
return cursor, convert_proxy_or_proxies([i[0] for i in proxies])


if __name__ == '__main__':
conn = RedisClient()
result = conn.random()
print(result)
