Add socks4/socks5 support and several new proxy sources (both domestic and international) #149

Open
wants to merge 8 commits into master
8 changes: 5 additions & 3 deletions .gitignore
@@ -28,8 +28,7 @@ var/
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
@@ -88,4 +87,7 @@ ENV/
# Rope project settings
.ropeproject
.idea/
*.db
*.db
*.swp
test.py
_tmp
14 changes: 10 additions & 4 deletions README.md
@@ -1,4 +1,7 @@
# IPProxyPool
# Changes
Added socks4/socks5 support and several new proxy sources.

# IPProxyPool
IPProxyPool is a proxy pool project that provides proxy IPs. It supports both py2 and py3.
### My new book [《Python爬虫开发与项目实战》](https://item.jd.com/12206762.html) is out; if you like it, take a look at the [sample chapters](http://pan.baidu.com/s/1hrWEOYg)
<br/>
@@ -142,7 +145,7 @@ GET /
| Name | Type | Description |
| ----| ---- | ---- |
| types | int | 0: elite (high anonymity), 1: anonymous, 2: transparent |
| protocol | int | 0: http, 1 https, 2 http/https |
| protocol | int | 0: http, 1: https, 2: http/https, 3: socks4, 4: socks5 |
| count | int | number of proxies |
| country | str | 国内 (domestic) or 国外 (foreign) |
| area | str | region |
@@ -163,7 +166,7 @@ GET /
[["122.226.189.55", 138, 10], ["183.61.236.54", 3128, 10], ["61.132.241.109", 808, 10], ["183.61.236.53", 3128, 10], ["122.227.246.102", 808, 10]]
```
<br/>
Taking ["122.226.189.55", 138, 10] as an example: the first element is the ip, the second element is the port, and the third element is the score
Taking ["122.226.189.55", 138, 10] as an example: the first element is the ip, the second element is the port, the third element is the protocol, and the fourth element is the score

```
import requests
@@ -194,7 +197,7 @@ GET /delete
| ip | str | e.g. 192.168.1.1 |
| port | int | e.g. 80 |
| types | int | 0: elite (high anonymity), 1: anonymous, 2: transparent |
| protocol | int | 0: http, 1 https, 2 http/https |
| protocol | int | 0: http, 1: https, 2: http/https, 3: socks4, 4: socks5 |
| count | int | number of proxies |
| country | str | 国内 (domestic) or 国外 (foreign) |
| area | str | region |
@@ -264,6 +267,9 @@ UPDATE_TIME = 30 * 60
# When the number of valid IPs drops below MINNUM, the crawler needs to be started
MINNUM = 50

# Forced re-crawl interval, default two days; 0 means never force a re-crawl
FORCE_CRAWL_TIME = 60*60*24*2

# socket timeout
TIMEOUT = 5

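For context on the new protocol values, here is a minimal sketch of how a client could ask the pool for SOCKS5 proxies once this change lands. The host and port are illustrative (this PR changes API_PORT to 8765 in config.py), and the 4-element response shape follows the updated description above:

```python
import requests

# protocol=4 requests SOCKS5 proxies (see the protocol table above);
# count limits how many entries the API returns.
resp = requests.get('http://127.0.0.1:8765/', params={'protocol': 4, 'count': 5})
for ip, port, protocol, score in resp.json():
    print(ip, port, protocol, score)
```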
45 changes: 40 additions & 5 deletions config.py
@@ -18,6 +18,33 @@
'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]",
'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''}
},
{
'urls': ['https://www.socks-proxy.net'],
'type': 'xpath',
'pattern': ".//*[@id='proxylisttable']/tr[position()>1]",
'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
},
{
'urls': ['http://www.proxylists.net/proxylists.xml'],
'type': 'xpath',
'pattern': ".//proxy[position()>1]",
'position': {'ip': './ip', 'port': './port', 'type': '', 'protocol': ''}
},
{
'urls': ['http://31f.cn'+n for n in ['','/http-proxy/',
'/https-proxy/','/socks-proxy/']+
['/region/'+a+'/' for a in ['安徽','广东','江苏','北京','浙江','山东','上海','湖南',
'河南','辽宁','四川','湖北','福建','河北','吉林','江西',
'山西','重庆','陕西','内蒙古','天津','云南','西藏','广西']]+
['/city/'+a+'/' for a in ['淮南','北京','深圳','杭州','上海','广州','苏州',
'常德','南京','青岛','成都','武汉','南通','东莞',
'合肥','重庆','连云港','长春','天津','长沙','焦作',
'佛山','常州','济南','大连','西安','郑州','无锡','石家庄',
'镇江','嘉兴','徐州','芜湖','金华','朝阳','福州']]],
'type': 'xpath',
'pattern': ".//table[1]/tr[position()>1]",
'position': {'ip': './td[2]', 'port': './td[3]', 'type': '', 'protocol': ''}
},
{
'urls': ['http://www.66ip.cn/areaindex_%s/%s.html' % (m, n) for m in range(1, 35) for n in range(1, 10)],
'type': 'xpath',
@@ -94,7 +121,13 @@
'moduleName': 'CnproxyPraser',
'pattern': r'<tr><td>(\d+\.\d+\.\d+\.\d+)<SCRIPT type=text/javascript>document.write\(\"\:\"(.+)\)</SCRIPT></td><td>(HTTP|SOCKS4)\s*',
'position': {'ip': 0, 'port': 1, 'type': -1, 'protocol': 2}
}
},
{
'urls': ['https://www.xroxy.com/proxyrss.xml'],
'type': 'xpath',
'pattern': ".//proxy[position()>1]",
'position': {'ip': './ip', 'port': './port', 'type': '', 'protocol': ''}
},
]
'''
Database configuration
@@ -117,13 +150,15 @@
'贵州', '安徽', '重庆', '北京', '上海', '天津', '广西', '内蒙', '西藏', '新疆', '宁夏', '香港', '澳门']
QQWRY_PATH = os.path.dirname(__file__) + "/data/qqwry.dat"
THREADNUM = 5
API_PORT = 8000
#API_PORT = 8000
API_PORT = 8765
'''
Settings that control crawling and checking of proxy IPs
There is no need to check whether an IP already exists, because stale entries are cleaned up periodically
'''
UPDATE_TIME = 30 * 60 # check every half hour whether any proxy IPs have gone stale
MINNUM = 50 # when the number of valid IPs drops below this value, start the crawler
MINNUM = 800 # when the number of valid IPs drops below this value, start the crawler
FORCE_CRAWL_TIME = 60*60*24*2 # forced re-crawl interval, default two days; 0 means never force a re-crawl

TIMEOUT = 5 # socket timeout
'''
@@ -204,5 +239,5 @@ def get_header():
MAX_CHECK_PROCESS = 2 # maximum number of CHECK_PROXY processes
MAX_CHECK_CONCURRENT_PER_PROCESS = 30 # maximum concurrency per process during CHECK_PROXY
TASK_QUEUE_SIZE = 50 # task queue size
MAX_DOWNLOAD_CONCURRENT = 3 # maximum concurrency when downloading from free proxy sites
CHECK_WATI_TIME = 1 # wait time when the process count reaches its limit
MAX_DOWNLOAD_CONCURRENT = 3 # maximum concurrency when downloading from free proxy sites
CHECK_WATI_TIME = 1 # wait time when the process count reaches its limit
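As a quick illustration of how the new xpath-style parser entries above are consumed, here is a self-contained sketch that mirrors the existing parserList conventions; the sample HTML and the entry shown are made up for the sketch and are not part of the PR:

```python
from lxml import etree

# A made-up page shaped like the 31f.cn tables targeted by the new entry.
html = """
<table>
  <tr><th>#</th><th>IP</th><th>Port</th></tr>
  <tr><td>1</td><td>1.2.3.4</td><td>8080</td></tr>
</table>
"""

entry = {
    'pattern': ".//table[1]/tr[position()>1]",
    'position': {'ip': './td[2]', 'port': './td[3]', 'type': '', 'protocol': ''},
}

root = etree.HTML(html)
for row in root.xpath(entry['pattern']):
    ip = row.xpath(entry['position']['ip'])[0].text
    port = row.xpath(entry['position']['port'])[0].text
    print(ip, port)  # -> 1.2.3.4 8080
```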
2 changes: 1 addition & 1 deletion db/SqlHelper.py
@@ -118,7 +118,7 @@ def select(self, count=None, conditions=None):
else:
conditions = []

query = self.session.query(Proxy.ip, Proxy.port, Proxy.score)
query = self.session.query(Proxy.ip, Proxy.port, Proxy.protocol, Proxy.score)
if len(conditions) > 0 and count:
for condition in conditions:
query = query.filter(condition)
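Because select() now returns a fourth column, every caller that unpacked three values has to be adjusted; a short sketch of the new tuple shape, assuming the query above:

```python
# Before this PR select() yielded (ip, port, score); it now yields
# (ip, port, protocol, score), so the score moves from index 2 to index 3.
for ip, port, protocol, score in sqlhelper.select(10):
    print(ip, port, protocol, score)
```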
2 changes: 1 addition & 1 deletion spider/HtmlDownloader.py
@@ -24,7 +24,7 @@ def download(url):

except Exception:
count = 0 # retry count
proxylist = sqlhelper.select(10)
proxylist = sqlhelper.select(10,{'protocol':2})
if not proxylist:
return None

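The extra `{'protocol': 2}` condition keeps SOCKS-only proxies out of the download retry path, since the retry plugs the proxy into plain http/https proxy URLs; a hedged sketch of the equivalent filter and how the result would be used (names follow the surrounding code):

```python
# Only proxies that speak both http and https (protocol == 2) are safe to
# use with requests' standard proxy settings when retrying a download.
proxylist = sqlhelper.select(10, {'protocol': 2})
if proxylist:
    ip, port = proxylist[0][0], proxylist[0][1]
    proxies = {"http": "http://%s:%s" % (ip, port),
               "https": "http://%s:%s" % (ip, port)}
```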
5 changes: 4 additions & 1 deletion spider/HtmlPraser.py
@@ -49,7 +49,10 @@ def XpathPraser(self, response, parser):
:return:
'''
proxylist = []
root = etree.HTML(response)
try:
root = etree.HTML(response)
except:
root = etree.HTML(bytes(bytearray(response, encoding='utf-8')))
proxys = root.xpath(parser['pattern'])
for proxy in proxys:
try:
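Background on the try/except added above: lxml raises ValueError when asked to parse a unicode string that carries an explicit encoding declaration, which is what the new XML feeds (proxylists.net, xroxy.com) return; re-encoding to bytes sidesteps that. A small stand-alone sketch of the same workaround, with an illustrative sample document:

```python
from lxml import etree

xml_text = '<?xml version="1.0" encoding="utf-8"?><root><proxy><ip>1.2.3.4</ip><port>1080</port></proxy></root>'

try:
    root = etree.HTML(xml_text)
except ValueError:
    # lxml rejects str input with an encoding declaration; feed it bytes instead.
    root = etree.HTML(bytes(bytearray(xml_text, encoding='utf-8')))

print(root.xpath('.//proxy/ip')[0].text)  # -> 1.2.3.4
```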
6 changes: 4 additions & 2 deletions spider/ProxyCrawl.py
@@ -10,7 +10,7 @@
from multiprocessing import Queue, Process, Value

from api.apiServer import start_api_server
from config import THREADNUM, parserList, UPDATE_TIME, MINNUM, MAX_CHECK_CONCURRENT_PER_PROCESS, MAX_DOWNLOAD_CONCURRENT
from config import THREADNUM, parserList, UPDATE_TIME, MINNUM, MAX_CHECK_CONCURRENT_PER_PROCESS, MAX_DOWNLOAD_CONCURRENT, FORCE_CRAWL_TIME
from db.DataStore import store_data, sqlhelper
from spider.HtmlDownloader import Html_Downloader
from spider.HtmlPraser import Html_Parser
@@ -34,6 +34,7 @@ def __init__(self, queue, db_proxy_num,myip):
self.queue = queue
self.db_proxy_num = db_proxy_num
self.myip = myip
self.crawl_time=time.time()


def run(self):
@@ -54,7 +55,8 @@ def run(self):
self.db_proxy_num.value = len(self.proxies)
str = 'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies)

if len(self.proxies) < MINNUM:
if len(self.proxies) < MINNUM or (FORCE_CRAWL_TIME and time.time()-self.crawl_time>FORCE_CRAWL_TIME):
self.crawl_time=time.time()
str += '\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...'
sys.stdout.write(str + "\r\n")
sys.stdout.flush()
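Restating the new crawl trigger in isolation: the crawler now starts either when the pool shrinks below MINNUM or when FORCE_CRAWL_TIME seconds have passed since the last crawl. A minimal sketch under those assumptions (the helper name is invented for illustration; the constants mirror this PR's config.py):

```python
import time

MINNUM = 800
FORCE_CRAWL_TIME = 60 * 60 * 24 * 2  # two days; 0 disables forced re-crawls

def should_crawl(valid_proxy_count, last_crawl_time):
    # Crawl when the pool is too small, or when the forced interval has elapsed.
    too_few = valid_proxy_count < MINNUM
    too_old = bool(FORCE_CRAWL_TIME) and (time.time() - last_crawl_time > FORCE_CRAWL_TIME)
    return too_few or too_old
```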
30 changes: 22 additions & 8 deletions validator/Validator.py
@@ -26,10 +26,10 @@ def detect_from_db(myip, proxy, proxies_set):
proxies_set.add(proxy_str)

else:
if proxy[2] < 1:
if proxy[3] < 1:
sqlhelper.delete({'ip': proxy[0], 'port': proxy[1]})
else:
score = proxy[2]-1
score = proxy[3]-1
sqlhelper.update({'ip': proxy[0], 'port': proxy[1]}, {'score': score})
proxy_str = '%s:%s' % (proxy[0], proxy[1])
proxies_set.add(proxy_str)
@@ -88,8 +88,7 @@ def detect_proxy(selfip, proxy, queue2=None):
'''
ip = proxy['ip']
port = proxy['port']
proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
protocol, types, speed = getattr(sys.modules[__name__],config.CHECK_PROXY['function'])(selfip, proxies)#checkProxy(selfip, proxies)
protocol, types, speed = getattr(sys.modules[__name__],config.CHECK_PROXY['function'])(selfip, ip, port)#checkProxy(selfip, proxies)
if protocol >= 0:
proxy['protocol'] = protocol
proxy['types'] = types
Expand All @@ -101,7 +100,7 @@ def detect_proxy(selfip, proxy, queue2=None):
return proxy


def checkProxy(selfip, proxies):
def checkProxy(selfip, ip, port):
'''
Used to detect the proxy type. It turns out the information published by free proxy sites is unreliable, so we have to detect the proxy type ourselves
:param
@@ -110,6 +109,7 @@ def checkProxy(selfip, ip, port):
protocol = -1
types = -1
speed = -1
proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
http, http_types, http_speed = _checkHttpProxy(selfip, proxies)
https, https_types, https_speed = _checkHttpProxy(selfip, proxies, False)
if http and https:
Expand All @@ -125,9 +125,23 @@ def checkProxy(selfip, proxies):
protocol = 1
speed = https_speed
else:
types = -1
protocol = -1
speed = -1
proxies = {"http": "socks5://%s:%s" % (ip, port), "https": "socks5://%s:%s" % (ip, port)}
socks5, socks5_types, socks5_speed = _checkHttpProxy(selfip, proxies)
if socks5:
types = socks5_types
protocol = 4
speed = socks5_speed
else:
proxies = {"http": "socks4://%s:%s" % (ip, port), "https": "socks4://%s:%s" % (ip, port)}
socks4, socks4_types, socks4_speed = _checkHttpProxy(selfip, proxies)
if socks4:
types = socks4_types
protocol = 3
speed = socks4_speed
else:
types = -1
protocol = -1
speed = -1
return protocol, types, speed


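One practical note on the SOCKS fallback above: requests only understands `socks5://` and `socks4://` proxy URLs when it is installed with SOCKS support (`pip install requests[socks]`, which pulls in PySocks). A simplified, hedged sketch of the probe order used by checkProxy; the check URL and the collapse of the http/https cases into protocol 2 are illustrative simplifications, not the PR's exact logic:

```python
import requests

def probe(ip, port, timeout=5):
    # Try plain HTTP(S) first, then SOCKS5, then SOCKS4, mirroring the order above.
    # Returns the protocol code from the README table, or -1 if nothing answers.
    for scheme, protocol in (("http", 2), ("socks5", 4), ("socks4", 3)):
        proxies = {"http": "%s://%s:%s" % (scheme, ip, port),
                   "https": "%s://%s:%s" % (scheme, ip, port)}
        try:
            requests.get("http://httpbin.org/ip", proxies=proxies, timeout=timeout)
            return protocol
        except requests.RequestException:
            continue
    return -1
```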