diff --git a/.gitignore b/.gitignore
index 524ed5d..a60d954 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,8 +28,7 @@ var/
 # Usually these files are written by a python script from a template
 # before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
-*.spec
-
+*.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
@@ -88,4 +87,7 @@ ENV/
 # Rope project settings
 .ropeproject
 .idea/
-*.db
\ No newline at end of file
+*.db
+*.swp
+test.py
+_tmp
diff --git a/README.md b/README.md
index 1e61936..2f524bb 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,7 @@
-# IPProxyPool
+# 修改
+添加了socks4/5支持,添加了多个源
+
+# IPProxyPool
 IPProxyPool代理池项目,提供代理ip。支持py2和py3两个版本。
 
 ### 我的新书[《Python爬虫开发与项目实战》](https://item.jd.com/12206762.html)出版了,喜欢的话可以看一下[样章](http://pan.baidu.com/s/1hrWEOYg)
@@ -142,7 +145,7 @@ GET /
 | Name | Type | Description |
 | ----| ---- | ---- |
 | types | int | 0: 高匿,1:匿名,2 透明 |
-| protocol | int | 0: http, 1 https, 2 http/https |
+| protocol | int | 0: http, 1: https, 2: http/https, 3: socks4, 4: socks5 |
 | count | int | 数量 |
 | country | str | 取值为 国内, 国外 |
 | area | str | 地区 |
@@ -163,7 +166,7 @@ GET /
 [["122.226.189.55", 138, 10], ["183.61.236.54", 3128, 10], ["61.132.241.109", 808, 10], ["183.61.236.53", 3128, 10], ["122.227.246.102", 808, 10]]
 ```
-以["122.226.189.55", 138, 10]为例,第一个元素是ip,第二个元素是port,第三个元素是分值score。 +以["122.226.189.55", 138, 10]为例,第一个元素是ip,第二个元素是port,第三个元素是分值protocol,第四个元素是分值score。 ``` import requests @@ -194,7 +197,7 @@ GET /delete | ip | str | 类似192.168.1.1 | | port | int | 类似 80 | | types | int | 0: 高匿,1:匿名,2 透明 | -| protocol | int | 0: http, 1 https, 2 http/https | +| protocol | int | 0: http, 1: https, 2: http/https, 3: socks4, 4: socks5 | | count | int | 数量 | | country | str | 取值为 国内, 国外 | | area | str | 地区 | @@ -264,6 +267,9 @@ UPDATE_TIME = 30 * 60 # 当有效的ip值小于MINNUM时 需要启动爬虫进行爬取 MINNUM = 50 +# 强制重新爬取时间,默认两天,0:不强制重新爬取 +FORCE_CRAWL_TIME = 60*60*24*2 + # socket超时 TIMEOUT = 5 diff --git a/config.py b/config.py index 389dc74..e823fa1 100644 --- a/config.py +++ b/config.py @@ -18,6 +18,33 @@ 'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]", 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''} }, + { + 'urls': ['https://www.socks-proxy.net'], + 'type': 'xpath', + 'pattern': ".//*[@id='proxylisttable']/tr[position()>1]", + 'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''} + }, + { + 'urls': ['http://www.proxylists.net/proxylists.xml'], + 'type': 'xpath', + 'pattern': ".//proxy[position()>1]", + 'position': {'ip': './ip', 'port': './port', 'type': '', 'protocol': ''} + }, + { + 'urls': ['http://31f.cn'+n for n in ['','/http-proxy/', + '/https-proxy/','/socks-proxy/']+ + ['/region/'+a+'/' for a in ['安徽','广东','江苏','北京','浙江','山东','上海','湖南', + '河南','辽宁','四川','湖北','福建','河北','吉林','江西', + '山西','重庆','陕西','内蒙古','天津','云南','西藏','广西']]+ + ['/city/'+a+'/' for a in ['淮南','北京','深圳','杭州','上海','广州','苏州', + '常德','南京','青岛','成都','武汉','南通','东莞', + '合肥','重庆','连云港','长春','天津','长沙','焦作', + '佛山','常州','济南','大连','西安','郑州','无锡','石家庄', + '镇江','嘉兴','徐州','芜湖','金华','朝阳','福州']]], + 'type': 'xpath', + 'pattern': ".//table[1]/tr[position()>1]", + 'position': {'ip': './td[2]', 'port': './td[3]', 'type': '', 'protocol': ''} + }, { 'urls': ['http://www.66ip.cn/areaindex_%s/%s.html' % (m, n) for m in range(1, 35) for n in range(1, 10)], 'type': 'xpath', @@ -94,7 +121,13 @@ 'moduleName': 'CnproxyPraser', 'pattern': r'(\d+\.\d+\.\d+\.\d+)(HTTP|SOCKS4)\s*', 'position': {'ip': 0, 'port': 1, 'type': -1, 'protocol': 2} - } + }, + { + 'urls': ['https://www.xroxy.com/proxyrss.xml'], + 'type': 'xpath', + 'pattern': ".//proxy[position()>1]", + 'position': {'ip': './ip', 'port': './port', 'type': '', 'protocol': ''} + }, ] ''' 数据库的配置 @@ -117,13 +150,15 @@ '贵州', '安徽', '重庆', '北京', '上海', '天津', '广西', '内蒙', '西藏', '新疆', '宁夏', '香港', '澳门'] QQWRY_PATH = os.path.dirname(__file__) + "/data/qqwry.dat" THREADNUM = 5 -API_PORT = 8000 +#API_PORT = 8000 +API_PORT = 8765 ''' 爬虫爬取和检测ip的设置条件 不需要检测ip是否已经存在,因为会定时清理 ''' UPDATE_TIME = 30 * 60 # 每半个小时检测一次是否有代理ip失效 -MINNUM = 50 # 当有效的ip值小于一个时 需要启动爬虫进行爬取 +MINNUM = 800 # 当有效的ip值小于一个时 需要启动爬虫进行爬取 +FORCE_CRAWL_TIME = 60*60*24*2 # 强制重新爬取时间,默认两天,0:不强制重新爬取 TIMEOUT = 5 # socket延时 ''' @@ -204,5 +239,5 @@ def get_header(): MAX_CHECK_PROCESS = 2 # CHECK_PROXY最大进程数 MAX_CHECK_CONCURRENT_PER_PROCESS = 30 # CHECK_PROXY时每个进程的最大并发 TASK_QUEUE_SIZE = 50 # 任务队列SIZE -MAX_DOWNLOAD_CONCURRENT = 3 # 从免费代理网站下载时的最大并发 -CHECK_WATI_TIME = 1#进程数达到上限时的等待时间 \ No newline at end of file +MAX_DOWNLOAD_CONCURRENT = 3 # 从免费代理网站下载时的最大并发 +CHECK_WATI_TIME = 1#进程数达到上限时的等待时间 diff --git a/db/SqlHelper.py b/db/SqlHelper.py index 8df7b1a..e7204a5 100644 --- a/db/SqlHelper.py +++ b/db/SqlHelper.py @@ -118,7 +118,7 @@ def select(self, count=None, conditions=None): else: conditions = [] - query = 
+        query = self.session.query(Proxy.ip, Proxy.port, Proxy.protocol, Proxy.score)
         if len(conditions) > 0 and count:
             for condition in conditions:
                 query = query.filter(condition)
diff --git a/spider/HtmlDownloader.py b/spider/HtmlDownloader.py
index 3e1de09..8220f25 100644
--- a/spider/HtmlDownloader.py
+++ b/spider/HtmlDownloader.py
@@ -24,7 +24,7 @@ def download(url):
         except Exception:
             count = 0  # 重试次数
-            proxylist = sqlhelper.select(10)
+            proxylist = sqlhelper.select(10,{'protocol':2})
             if not proxylist:
                 return None
diff --git a/spider/HtmlPraser.py b/spider/HtmlPraser.py
index 5f1e68c..cd1d3fd 100644
--- a/spider/HtmlPraser.py
+++ b/spider/HtmlPraser.py
@@ -49,7 +49,10 @@ def XpathPraser(self, response, parser):
         :return:
         '''
         proxylist = []
-        root = etree.HTML(response)
+        try:
+            root = etree.HTML(response)
+        except:
+            root = etree.HTML(bytes(bytearray(response, encoding='utf-8')))
         proxys = root.xpath(parser['pattern'])
         for proxy in proxys:
             try:
diff --git a/spider/ProxyCrawl.py b/spider/ProxyCrawl.py
index 221d01f..1c046bd 100644
--- a/spider/ProxyCrawl.py
+++ b/spider/ProxyCrawl.py
@@ -10,7 +10,7 @@
 from multiprocessing import Queue, Process, Value
 
 from api.apiServer import start_api_server
-from config import THREADNUM, parserList, UPDATE_TIME, MINNUM, MAX_CHECK_CONCURRENT_PER_PROCESS, MAX_DOWNLOAD_CONCURRENT
+from config import THREADNUM, parserList, UPDATE_TIME, MINNUM, MAX_CHECK_CONCURRENT_PER_PROCESS, MAX_DOWNLOAD_CONCURRENT, FORCE_CRAWL_TIME
 from db.DataStore import store_data, sqlhelper
 from spider.HtmlDownloader import Html_Downloader
 from spider.HtmlPraser import Html_Parser
@@ -34,6 +34,7 @@ def __init__(self, queue, db_proxy_num,myip):
         self.queue = queue
         self.db_proxy_num = db_proxy_num
         self.myip = myip
+        self.crawl_time=time.time()
 
     def run(self):
 
@@ -54,7 +55,8 @@ def run(self):
             self.db_proxy_num.value = len(self.proxies)
             str = 'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies)
 
-            if len(self.proxies) < MINNUM:
+            if len(self.proxies) < MINNUM or (FORCE_CRAWL_TIME and time.time()-self.crawl_time>FORCE_CRAWL_TIME):
+                self.crawl_time=time.time()
                 str += '\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...'
                 sys.stdout.write(str + "\r\n")
                 sys.stdout.flush()
diff --git a/validator/Validator.py b/validator/Validator.py
index 46bc444..b9fd226 100644
--- a/validator/Validator.py
+++ b/validator/Validator.py
@@ -26,10 +26,10 @@ def detect_from_db(myip, proxy, proxies_set):
         proxies_set.add(proxy_str)
 
     else:
-        if proxy[2] < 1:
+        if proxy[3] < 1:
             sqlhelper.delete({'ip': proxy[0], 'port': proxy[1]})
         else:
-            score = proxy[2]-1
+            score = proxy[3]-1
             sqlhelper.update({'ip': proxy[0], 'port': proxy[1]}, {'score': score})
             proxy_str = '%s:%s' % (proxy[0], proxy[1])
             proxies_set.add(proxy_str)
@@ -88,8 +87,7 @@ def detect_proxy(selfip, proxy, queue2=None):
     '''
     ip = proxy['ip']
     port = proxy['port']
-    proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
-    protocol, types, speed = getattr(sys.modules[__name__],config.CHECK_PROXY['function'])(selfip, proxies)#checkProxy(selfip, proxies)
+    protocol, types, speed = getattr(sys.modules[__name__],config.CHECK_PROXY['function'])(selfip, ip, port)#checkProxy(selfip, ip, port)
     if protocol >= 0:
         proxy['protocol'] = protocol
         proxy['types'] = types
@@ -101,7 +100,7 @@ def detect_proxy(selfip, proxy, queue2=None):
     return proxy
 
 
-def checkProxy(selfip, proxies):
+def checkProxy(selfip, ip, port):
     '''
     用来检测代理的类型,突然发现,免费网站写的信息不靠谱,还是要自己检测代理的类型
     :param
@@ -110,6 +109,7 @@ def checkProxy(selfip, proxies):
     protocol = -1
     types = -1
     speed = -1
+    proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
     http, http_types, http_speed = _checkHttpProxy(selfip, proxies)
     https, https_types, https_speed = _checkHttpProxy(selfip, proxies, False)
     if http and https:
@@ -125,9 +125,23 @@ def checkProxy(selfip, proxies):
         protocol = 1
         speed = https_speed
     else:
-        types = -1
-        protocol = -1
-        speed = -1
+        proxies = {"http": "socks5://%s:%s" % (ip, port), "https": "socks5://%s:%s" % (ip, port)}
+        socks5, socks5_types, socks5_speed = _checkHttpProxy(selfip, proxies)
+        if socks5:
+            types = socks5_types
+            protocol = 4
+            speed = socks5_speed
+        else:
+            proxies = {"http": "socks4://%s:%s" % (ip, port), "https": "socks4://%s:%s" % (ip, port)}
+            socks4, socks4_types, socks4_speed = _checkHttpProxy(selfip, proxies)
+            if socks4:
+                types = socks4_types
+                protocol = 3
+                speed = socks4_speed
+            else:
+                types = -1
+                protocol = -1
+                speed = -1
     return protocol, types, speed
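
A minimal usage sketch of the new SOCKS support, assuming the pool is running locally on the new default port 8765, that `protocol=4` selects SOCKS5 proxies as in the updated README table, that each response entry is `[ip, port, protocol, score]` (matching the extra `Proxy.protocol` column now returned by `SqlHelper.select`), and that `requests` has SOCKS support installed via `pip install requests[socks]`:

```python
import json
import requests

# Ask the local pool for five high-anonymity SOCKS5 proxies
# (types=0, protocol=4 per the updated parameter table; port 8765 per the new config.py).
r = requests.get('http://127.0.0.1:8765/?types=0&count=5&protocol=4')
proxies = json.loads(r.text)
# Each entry is assumed to look like ["122.226.189.55", 1080, 4, 10]:
# ip, port, protocol, score.
ip, port, protocol, score = proxies[0]

# Route a request through the proxy; socks5:// proxy URLs require PySocks
# (installed by pip install requests[socks]).
proxy_url = 'socks5://%s:%s' % (ip, port)
resp = requests.get('http://httpbin.org/ip',
                    proxies={'http': proxy_url, 'https': proxy_url},
                    timeout=10)
print(resp.text)
```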