From cdeced60017975ebb43457a04ec27c3bd3fbdf45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B7=AF=E8=BF=87?= <243647162@qq.com> Date: Thu, 16 Feb 2017 20:59:08 +0800 Subject: [PATCH] =?UTF-8?q?=E6=95=B4=E7=90=86=E4=BB=A3=E7=A0=81=E9=80=BB?= =?UTF-8?q?=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- spider/HtmlDownloader.py | 71 +++++++++++++--------------------------- util/exception.py | 2 +- 2 files changed, 24 insertions(+), 49 deletions(-) diff --git a/spider/HtmlDownloader.py b/spider/HtmlDownloader.py index f6a37a6..74ccb93 100644 --- a/spider/HtmlDownloader.py +++ b/spider/HtmlDownloader.py @@ -12,61 +12,36 @@ class Html_Downloader(object): - @classmethod - def download(self, url): - count = 0 # 重试次数 - r = '' + @staticmethod + def download(url): try: r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT) r.encoding = chardet.detect(r.content)['encoding'] + if (not r.ok) or len(r.content) < 500: + raise ConnectionError + else: + return r.text + + except Exception: + count = 0 # 重试次数 + proxylist = sqlhelper.select(10) + if not proxylist: + return None + while count < config.RETRY_TIME: - if (not r.ok) or len(r.content) < 500: - proxylist = sqlhelper.select(10) + try: proxy = random.choice(proxylist) ip = proxy[0] port = proxy[1] proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)} - try: - r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies) - r.encoding = chardet.detect(r.content)['encoding'] - count += 1 - except Exception as e: - count += 1 - - else: - return r.text - - return None - - except Exception as e: - while count < config.RETRY_TIME: - if r == '' or (not r.ok) or len(r.content) < 500: - try: - proxylist = sqlhelper.select(10) - proxy = random.choice(proxylist) - ip = proxy[0] - port = proxy[1] - proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)} - try: - r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies) - r.encoding = chardet.detect(r.content)['encoding'] - count += 1 - except Exception as e: - count += 1 - - except Exception as e: - return None - - else: - return r.text - - return None - - - - - - - + r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies) + r.encoding = chardet.detect(r.content)['encoding'] + if (not r.ok) or len(r.content) < 500: + raise ConnectionError + else: + return r.text + except Exception: + count += 1 + return None diff --git a/util/exception.py b/util/exception.py index 80d10ff..c992ac7 100644 --- a/util/exception.py +++ b/util/exception.py @@ -4,7 +4,7 @@ class Test_URL_Fail(Exception): def __str__(self): - str = "访问%s失败,请检查网络连接" % config.TEST_URL + str = "访问%s失败,请检查网络连接" % config.TEST_IP return str