diff --git a/.gitignore b/.gitignore
index 524ed5d..a60d954 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,8 +28,7 @@ var/
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
-*.spec
-
+*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
@@ -88,4 +87,7 @@ ENV/
# Rope project settings
.ropeproject
.idea/
-*.db
\ No newline at end of file
+*.db
+*.swp
+test.py
+_tmp
diff --git a/README.md b/README.md
index 1e61936..2f524bb 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,7 @@
-# IPProxyPool
+# Changes
+Added socks4/socks5 support and multiple additional proxy sources.
+
+# IPProxyPool
IPProxyPool代理池项目,提供代理ip。支持py2和py3两个版本。
### 我的新书[《Python爬虫开发与项目实战》](https://item.jd.com/12206762.html)出版了,喜欢的话可以看一下[样章](http://pan.baidu.com/s/1hrWEOYg)
@@ -142,7 +145,7 @@ GET /
| Name | Type | Description |
| ----| ---- | ---- |
| types | int | 0: 高匿,1:匿名,2 透明 |
-| protocol | int | 0: http, 1 https, 2 http/https |
+| protocol | int | 0: http, 1: https, 2: http/https, 3: socks4, 4: socks5 |
| count | int | 数量 |
| country | str | 取值为 国内, 国外 |
| area | str | 地区 |
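A quick, hedged example of asking the running API for the new socks types (protocol codes as in the table above; `127.0.0.1:8765` assumes the server runs locally on the `API_PORT` configured later in this change):

```
import requests

# protocol=4 -> socks5 per the table above; count limits the number of entries returned
r = requests.get('http://127.0.0.1:8765/?protocol=4&count=5')
print(r.json())
```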
@@ -163,7 +166,7 @@ GET /
[["122.226.189.55", 138, 10], ["183.61.236.54", 3128, 10], ["61.132.241.109", 808, 10], ["183.61.236.53", 3128, 10], ["122.227.246.102", 808, 10]]
```
-以["122.226.189.55", 138, 10]为例,第一个元素是ip,第二个元素是port,第三个元素是分值score。
+以["122.226.189.55", 138, 10]为例,第一个元素是ip,第二个元素是port,第三个元素是分值protocol,第四个元素是分值score。
```
import requests
@@ -194,7 +197,7 @@ GET /delete
| ip | str | 类似192.168.1.1 |
| port | int | 类似 80 |
| types | int | 0: 高匿,1:匿名,2 透明 |
-| protocol | int | 0: http, 1 https, 2 http/https |
+| protocol | int | 0: http, 1: https, 2: http/https, 3: socks4, 4: socks5 |
| count | int | 数量 |
| country | str | 取值为 国内, 国外 |
| area | str | 地区 |
@@ -264,6 +267,9 @@ UPDATE_TIME = 30 * 60
# 当有效的ip值小于MINNUM时 需要启动爬虫进行爬取
MINNUM = 50
+# Force a full re-crawl after this many seconds (default: two days); 0 disables forced re-crawls
+FORCE_CRAWL_TIME = 60*60*24*2
+
# socket超时
TIMEOUT = 5
diff --git a/config.py b/config.py
index 389dc74..e823fa1 100644
--- a/config.py
+++ b/config.py
@@ -18,6 +18,33 @@
'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]",
'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''}
},
+ {
+ 'urls': ['https://www.socks-proxy.net'],
+ 'type': 'xpath',
+ 'pattern': ".//*[@id='proxylisttable']/tr[position()>1]",
+ 'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
+ },
+ {
+ 'urls': ['http://www.proxylists.net/proxylists.xml'],
+ 'type': 'xpath',
+ 'pattern': ".//proxy[position()>1]",
+ 'position': {'ip': './ip', 'port': './port', 'type': '', 'protocol': ''}
+ },
+ {
+ 'urls': ['http://31f.cn'+n for n in ['','/http-proxy/',
+ '/https-proxy/','/socks-proxy/']+
+ ['/region/'+a+'/' for a in ['安徽','广东','江苏','北京','浙江','山东','上海','湖南',
+ '河南','辽宁','四川','湖北','福建','河北','吉林','江西',
+ '山西','重庆','陕西','内蒙古','天津','云南','西藏','广西']]+
+ ['/city/'+a+'/' for a in ['淮南','北京','深圳','杭州','上海','广州','苏州',
+ '常德','南京','青岛','成都','武汉','南通','东莞',
+ '合肥','重庆','连云港','长春','天津','长沙','焦作',
+ '佛山','常州','济南','大连','西安','郑州','无锡','石家庄',
+ '镇江','嘉兴','徐州','芜湖','金华','朝阳','福州']]],
+ 'type': 'xpath',
+ 'pattern': ".//table[1]/tr[position()>1]",
+ 'position': {'ip': './td[2]', 'port': './td[3]', 'type': '', 'protocol': ''}
+ },
{
'urls': ['http://www.66ip.cn/areaindex_%s/%s.html' % (m, n) for m in range(1, 35) for n in range(1, 10)],
'type': 'xpath',
@@ -94,7 +121,13 @@
'moduleName': 'CnproxyPraser',
'pattern': r'(\d+\.\d+\.\d+\.\d+) | (HTTP|SOCKS4)\s*',
'position': {'ip': 0, 'port': 1, 'type': -1, 'protocol': 2}
- }
+ },
+ {
+ 'urls': ['https://www.xroxy.com/proxyrss.xml'],
+ 'type': 'xpath',
+ 'pattern': ".//proxy[position()>1]",
+ 'position': {'ip': './ip', 'port': './port', 'type': '', 'protocol': ''}
+ },
]
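For context, each plain 'xpath'-type entry above (including the new socks-proxy.net, proxylists.net, 31f.cn and xroxy.com sources) is consumed roughly as sketched here; this is a simplified stand-in for the project's XpathPraser, not its exact code:

```
from lxml import etree

def extract(html, entry):
    # apply the row-level 'pattern', then the relative 'position' paths per field
    root = etree.HTML(html)
    for row in root.xpath(entry['pattern']):
        ip = row.xpath(entry['position']['ip'])[0].text.strip()
        port = row.xpath(entry['position']['port'])[0].text.strip()
        yield ip, port
```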
'''
数据库的配置
@@ -117,13 +150,15 @@
'贵州', '安徽', '重庆', '北京', '上海', '天津', '广西', '内蒙', '西藏', '新疆', '宁夏', '香港', '澳门']
QQWRY_PATH = os.path.dirname(__file__) + "/data/qqwry.dat"
THREADNUM = 5
-API_PORT = 8000
+#API_PORT = 8000
+API_PORT = 8765
'''
爬虫爬取和检测ip的设置条件
不需要检测ip是否已经存在,因为会定时清理
'''
UPDATE_TIME = 30 * 60 # 每半个小时检测一次是否有代理ip失效
-MINNUM = 50 # 当有效的ip值小于一个时 需要启动爬虫进行爬取
+MINNUM = 800  # start crawling as soon as the number of valid IPs drops below MINNUM
+FORCE_CRAWL_TIME = 60*60*24*2  # force a full re-crawl after this many seconds (default two days); 0 disables it
TIMEOUT = 5 # socket延时
'''
@@ -204,5 +239,5 @@ def get_header():
MAX_CHECK_PROCESS = 2 # CHECK_PROXY最大进程数
MAX_CHECK_CONCURRENT_PER_PROCESS = 30 # CHECK_PROXY时每个进程的最大并发
TASK_QUEUE_SIZE = 50 # 任务队列SIZE
-MAX_DOWNLOAD_CONCURRENT = 3 # 从免费代理网站下载时的最大并发
-CHECK_WATI_TIME = 1#进程数达到上限时的等待时间
\ No newline at end of file
+MAX_DOWNLOAD_CONCURRENT = 3 # 从免费代理网站下载时的最大并发
+CHECK_WATI_TIME = 1  # 进程数达到上限时的等待时间
diff --git a/db/SqlHelper.py b/db/SqlHelper.py
index 8df7b1a..e7204a5 100644
--- a/db/SqlHelper.py
+++ b/db/SqlHelper.py
@@ -118,7 +118,7 @@ def select(self, count=None, conditions=None):
else:
conditions = []
- query = self.session.query(Proxy.ip, Proxy.port, Proxy.score)
+ query = self.session.query(Proxy.ip, Proxy.port, Proxy.protocol, Proxy.score)
if len(conditions) > 0 and count:
for condition in conditions:
query = query.filter(condition)
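With this change select() returns 4-tuples instead of 3-tuples; a hedged usage sketch (the conditions dict mirrors the call added in HtmlDownloader below):

```
from db.DataStore import sqlhelper

# up to 10 rows shaped like (ip, port, protocol, score), restricted to http/https proxies
for ip, port, protocol, score in sqlhelper.select(10, {'protocol': 2}):
    print(ip, port, protocol, score)
```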
diff --git a/spider/HtmlDownloader.py b/spider/HtmlDownloader.py
index 3e1de09..8220f25 100644
--- a/spider/HtmlDownloader.py
+++ b/spider/HtmlDownloader.py
@@ -24,7 +24,7 @@ def download(url):
except Exception:
count = 0 # 重试次数
- proxylist = sqlhelper.select(10)
+ proxylist = sqlhelper.select(10, {'protocol': 2})  # retry only through http/https proxies (protocol 2) from the pool
if not proxylist:
return None
diff --git a/spider/HtmlPraser.py b/spider/HtmlPraser.py
index 5f1e68c..cd1d3fd 100644
--- a/spider/HtmlPraser.py
+++ b/spider/HtmlPraser.py
@@ -49,7 +49,10 @@ def XpathPraser(self, response, parser):
:return:
'''
proxylist = []
- root = etree.HTML(response)
+ try:
+ root = etree.HTML(response)
+ except:  # lxml rejects unicode strings that declare an encoding (e.g. the XML feeds); fall back to parsing utf-8 bytes
+ root = etree.HTML(bytes(bytearray(response, encoding='utf-8')))
proxys = root.xpath(parser['pattern'])
for proxy in proxys:
try:
diff --git a/spider/ProxyCrawl.py b/spider/ProxyCrawl.py
index 221d01f..1c046bd 100644
--- a/spider/ProxyCrawl.py
+++ b/spider/ProxyCrawl.py
@@ -10,7 +10,7 @@
from multiprocessing import Queue, Process, Value
from api.apiServer import start_api_server
-from config import THREADNUM, parserList, UPDATE_TIME, MINNUM, MAX_CHECK_CONCURRENT_PER_PROCESS, MAX_DOWNLOAD_CONCURRENT
+from config import THREADNUM, parserList, UPDATE_TIME, MINNUM, MAX_CHECK_CONCURRENT_PER_PROCESS, MAX_DOWNLOAD_CONCURRENT, FORCE_CRAWL_TIME
from db.DataStore import store_data, sqlhelper
from spider.HtmlDownloader import Html_Downloader
from spider.HtmlPraser import Html_Parser
@@ -34,6 +34,7 @@ def __init__(self, queue, db_proxy_num,myip):
self.queue = queue
self.db_proxy_num = db_proxy_num
self.myip = myip
+ self.crawl_time = time.time()  # timestamp of the last crawl, used for the forced re-crawl check
def run(self):
@@ -54,7 +55,8 @@ def run(self):
self.db_proxy_num.value = len(self.proxies)
str = 'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies)
- if len(self.proxies) < MINNUM:
+ if len(self.proxies) < MINNUM or (FORCE_CRAWL_TIME and time.time() - self.crawl_time > FORCE_CRAWL_TIME):
+ self.crawl_time = time.time()
str += '\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...'
sys.stdout.write(str + "\r\n")
sys.stdout.flush()
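In plain terms, a crawl is now triggered either when the pool drops below MINNUM or, if FORCE_CRAWL_TIME is non-zero, once that many seconds have passed since the last crawl; the same check in isolation (the helper name is hypothetical):

```
import time

from config import FORCE_CRAWL_TIME, MINNUM

def should_crawl(valid_count, last_crawl_ts):
    # small pool, or the forced re-crawl interval has elapsed (FORCE_CRAWL_TIME = 0 disables the latter)
    return valid_count < MINNUM or (FORCE_CRAWL_TIME and time.time() - last_crawl_ts > FORCE_CRAWL_TIME)
```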
diff --git a/validator/Validator.py b/validator/Validator.py
index 46bc444..b9fd226 100644
--- a/validator/Validator.py
+++ b/validator/Validator.py
@@ -26,10 +26,10 @@ def detect_from_db(myip, proxy, proxies_set):
proxies_set.add(proxy_str)
else:
- if proxy[2] < 1:
+ if proxy[3] < 1:  # score is index 3 now that select() also returns protocol at index 2
sqlhelper.delete({'ip': proxy[0], 'port': proxy[1]})
else:
- score = proxy[2]-1
+ score = proxy[3]-1
sqlhelper.update({'ip': proxy[0], 'port': proxy[1]}, {'score': score})
proxy_str = '%s:%s' % (proxy[0], proxy[1])
proxies_set.add(proxy_str)
@@ -88,8 +88,7 @@ def detect_proxy(selfip, proxy, queue2=None):
'''
ip = proxy['ip']
port = proxy['port']
- proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
- protocol, types, speed = getattr(sys.modules[__name__],config.CHECK_PROXY['function'])(selfip, proxies)#checkProxy(selfip, proxies)
+ protocol, types, speed = getattr(sys.modules[__name__],config.CHECK_PROXY['function'])(selfip, ip, port)#checkProxy(selfip, proxies)
if protocol >= 0:
proxy['protocol'] = protocol
proxy['types'] = types
@@ -101,7 +100,7 @@ def detect_proxy(selfip, proxy, queue2=None):
return proxy
-def checkProxy(selfip, proxies):
+def checkProxy(selfip, ip, port):
'''
用来检测代理的类型,突然发现,免费网站写的信息不靠谱,还是要自己检测代理的类型
:param
@@ -110,6 +109,7 @@ def checkProxy(selfip, proxies):
protocol = -1
types = -1
speed = -1
+ proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
http, http_types, http_speed = _checkHttpProxy(selfip, proxies)
https, https_types, https_speed = _checkHttpProxy(selfip, proxies, False)
if http and https:
@@ -125,9 +125,23 @@ def checkProxy(selfip, proxies):
protocol = 1
speed = https_speed
else:
- types = -1
- protocol = -1
- speed = -1
+ proxies = {"http": "socks5://%s:%s" % (ip, port), "https": "socks5://%s:%s" % (ip, port)}
+ socks5, socks5_types, socks5_speed = _checkHttpProxy(selfip, proxies)
+ if socks5:
+ types = socks5_types
+ protocol = 4
+ speed = socks5_speed
+ else:
+ proxies = {"http": "socks4://%s:%s" % (ip, port), "https": "socks4://%s:%s" % (ip, port)}
+ socks4, socks4_types, socks4_speed = _checkHttpProxy(selfip, proxies)
+ if socks4:
+ types = socks4_types
+ protocol = 3
+ speed = socks4_speed
+ else:
+ types = -1
+ protocol = -1
+ speed = -1
return protocol, types, speed
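One practical note on the socks fallback above: requests only understands socks4:// and socks5:// proxy URLs when PySocks is installed (the requests[socks] extra, requests >= 2.10). A minimal standalone check with placeholder values:

```
import requests  # socks schemes need PySocks: pip install requests[socks]

ip, port = '1.2.3.4', 1080   # placeholder proxy, not a real one
proxies = {'http': 'socks5://%s:%s' % (ip, port),
           'https': 'socks5://%s:%s' % (ip, port)}
print(requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5).text)
```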