Fix 503 error #32

Open · wants to merge 2 commits into master
Binary file modified .DS_Store
Binary file not shown.
4 changes: 3 additions & 1 deletion pdf/README.md
@@ -24,6 +24,8 @@ $ sudo yum install wkhtmltopdf # centos

### Run
```python
python get_proxy.py   # scrape candidate proxies into host.txt
python censor.py      # keep only the working ones, in proxies.txt
python crawler.py     # crawl and build the PDF via the proxy pool
```
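
The three steps form a pipeline: `get_proxy.py` scrapes candidate proxies into `host.txt`, `censor.py` keeps only the ones that respond and writes them to `proxies.txt`, and `crawler.py` draws from that pool on every request, so run them in this order.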

@@ -42,7 +44,7 @@ python crawler.py
### Changelog

* 2017-2-21: Fully refactored the code for extensibility; a crawler subclass only needs to implement the `parse_menu` and `parse_body` methods to get the HTML-to-PDF conversion logic (see the sketch after this list)

* 2017-11-14: Added a proxy pool and fixed the 503 errors
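
For reference, a minimal sketch of such a subclass, assuming a base class named `Crawler` imported from `crawler.py`; the base-class name, import path, and CSS selectors here are illustrative assumptions, and only the `parse_menu`/`parse_body` contract comes from the changelog entry above:

```python
from bs4 import BeautifulSoup

from crawler import Crawler  # assumed base-class name and module


class MyDocsCrawler(Crawler):
    def parse_menu(self, response):
        """Yield the URL of every page to include in the PDF."""
        soup = BeautifulSoup(response.text, "html.parser")
        for a in soup.select("ul.toc a"):  # assumed menu selector
            yield a.get("href")

    def parse_body(self, response):
        """Return one page's cleaned HTML, encoded for writing as bytes."""
        soup = BeautifulSoup(response.text, "html.parser")
        body = soup.find(class_="content")  # assumed content container
        return str(body).encode("utf-8")
```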

### Contact me

21 changes: 21 additions & 0 deletions pdf/censor.py
@@ -0,0 +1,21 @@
import requests

# Read candidate proxies (one "ip<TAB>port" per line) from host.txt and
# keep only the ones that can actually reach an HTTPS site within the
# timeout; the survivors are written to proxies.txt.
fpr = open('host.txt', 'r')
fpw = open('proxies.txt', 'w')

for line in fpr.readlines():
    ip = line.strip('\n').split('\t')
    pro = {'https': ip[0] + ':' + ip[1]}
    print(pro)
    try:
        response = requests.get('https://www.baidu.com', proxies=pro, timeout=2)
        print(response)
        fpw.write(line)  # the proxy responded; keep it
    except Exception as e:
        print(e)

fpr.close()
fpw.close()
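
Worth noting: the candidates are probed one at a time with a 2-second timeout each, using `https://www.baidu.com` as the reachability check, so validating the 100 entries in `host.txt` can take a few minutes.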
48 changes: 34 additions & 14 deletions pdf/crawler.py
@@ -3,13 +3,12 @@

import logging
import os
import random
import re
import time
from urllib import request

try:
    from urllib.parse import urlparse  # py3
except:
    from urlparse import urlparse  # py2
from urllib.parse import urlparse

import pdfkit
import requests
@@ -25,7 +24,6 @@
{content}
</body>
</html>

"""


@@ -51,7 +49,26 @@ def request(url, **kwargs):
        Make the network request and return the response object.
        :return: the requests response
        """
        response = requests.get(url, **kwargs)
        # Build the proxy pool from the file produced by censor.py.
        fp = open('proxies.txt', 'r')
        ips = fp.readlines()
        fp.close()
        proxys = list()
        for p in ips:
            ip = p.strip('\n').split('\t')
            pro = dict()
            pro['https'] = ip[0] + ':' + ip[1]
            proxys.append(pro)

        headers = {
            'user-agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'}
        # Keep trying random proxies until one returns HTTP 200.
        while True:
            try:
                response = requests.get(url, headers=headers, proxies=random.choice(proxys), timeout=2)
                if response.status_code != 200:
                    continue
                break
            except Exception as e:
                print(e)

        print("response:", response)
        return response
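
A note on the approach: every call rebuilds the pool from `proxies.txt` and keeps drawing random proxies until one returns HTTP 200, which is what works around the 503 rate-limiting this PR targets; the 2-second timeout keeps dead proxies from stalling the crawl. The loop has no retry cap, though, so if every proxy in the pool is dead it will spin forever.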

    def parse_menu(self, response):
@@ -70,6 +87,7 @@ def parse_body(self, response):

    def run(self):
        start = time.time()

        options = {
            'page-size': 'Letter',
            'margin-top': '0.75in',
@@ -93,7 +111,6 @@ def run(self):
            with open(f_name, 'wb') as f:
                f.write(html)
            htmls.append(f_name)

        pdfkit.from_file(htmls, self.name + ".pdf", options=options)
        for html in htmls:
            os.remove(html)
@@ -112,10 +129,12 @@ def parse_menu(self, response):
        :param response: the response object returned by the crawler
        :return: a generator of URLs
        """
        soup = BeautifulSoup(response.content, "html.parser")
        menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
        for li in menu_tag.find_all("li"):
            url = li.a.get("href")
        soup = BeautifulSoup(response.text, "html.parser")
        menu_tag = soup.find_all('ul', class_="uk-nav uk-nav-side")[1]
        # Walk the anchor tags directly rather than each <li>'s first child.
        for li in menu_tag.find_all("a"):
            url = li.get("href")
            if not url.startswith("http"):
                url = "".join([self.domain, url])  # expand to an absolute URL
            yield url
@@ -127,9 +146,10 @@ def parse_body(self, response):
        :return: the processed HTML text
        """
        try:
            soup = BeautifulSoup(response.content, 'html.parser')
            body = soup.find_all(class_="x-wiki-content")[0]
            soup = BeautifulSoup(response.text, 'html.parser')
            body = soup.find_all(class_="x-wiki-content x-main-content")[0]

            # Insert the title, centered
            title = soup.find('h4').get_text()
            center_tag = soup.new_tag("center")
@@ -158,6 +178,6 @@ def func(m):


if __name__ == '__main__':
start_url = "http://www.liaoxuefeng.com/wiki/0013739516305929606dd18361248578c67b8067c8c017b000"
start_url = "https://www.liaoxuefeng.com/wiki/0013739516305929606dd18361248578c67b8067c8c017b000"
crawler = LiaoxuefengPythonCrawler("廖雪峰Git", start_url)
crawler.run()
22 changes: 22 additions & 0 deletions pdf/get_proxy.py
@@ -0,0 +1,22 @@
from urllib import request
from bs4 import BeautifulSoup

# Scrape the first two pages of xicidaili.com's HTTPS proxy list and
# write one "ip<TAB>port" pair per line to host.txt.
fp = open('host.txt', 'w')
for i in range(1, 3):
    url = 'http://www.xicidaili.com/wn/' + str(i)
    opener = request.build_opener()
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')]
    request.install_opener(opener)
    response = request.urlopen(url)
    soup = BeautifulSoup(response.read(), 'html.parser')
    rows = soup.find_all(class_='odd')  # the table rows that hold proxy entries
    for elem in rows:
        data = elem.find_all('td')
        ip = data[1].string
        port = data[2].string
        fp.write(ip + '\t' + port + '\n')

fp.close()
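
One caveat: on xicidaili the listing rows appear to alternate between `class="odd"` and no class, so matching on `class_='odd'` picks up roughly every other row; across the two pages fetched that still yields the 100 candidates in `host.txt` below.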
100 changes: 100 additions & 0 deletions pdf/host.txt
@@ -0,0 +1,100 @@
58.62.86.245 9999
61.152.230.26 8080
171.104.132.28 9999
112.74.94.142 3128
139.224.24.26 8888
122.72.18.34 80
111.85.15.166 8080
122.72.18.61 80
119.90.63.3 3128
27.44.170.236 9999
116.52.224.81 9999
119.114.229.137 80
180.76.134.106 3128
116.211.88.90 3128
116.236.151.166 8080
183.135.251.12 30291
125.112.194.189 37301
183.95.22.121 53281
182.246.209.199 80
222.132.145.126 80
202.201.3.121 3128
222.78.116.105 26956
49.88.168.150 31329
139.208.198.170 8118
113.87.88.103 9797
183.135.249.107 42411
59.63.74.254 4362
49.85.13.107 42332
171.212.140.116 8118
183.51.191.234 9797
113.76.96.91 9797
123.185.129.93 8080
223.241.79.48 8010
39.88.13.3 53281
27.46.39.194 9797
115.203.196.36 22221
175.171.108.227 53281
218.29.111.106 9999
113.65.160.146 9797
120.76.55.49 8088
183.38.61.213 9999
59.60.168.219 29377
223.241.117.50 8010
115.221.117.220 22628
117.90.111.6 45614
114.234.80.152 29786
218.73.139.165 25933
182.34.50.184 42887
183.135.253.23 46668
182.88.126.216 9797
113.89.13.153 9999
1.194.162.92 39615
60.179.40.138 27739
175.147.66.112 8080
14.211.119.11 9797
113.121.251.55 31353
14.211.123.143 9797
114.239.222.194 44127
115.203.194.104 38774
115.230.62.194 31772
223.241.116.93 8010
59.38.61.207 9797
110.72.34.144 8123
59.40.68.44 8010
223.241.119.91 8010
125.126.172.225 23107
59.40.50.197 8010
113.121.170.239 25683
223.241.116.115 8010
27.44.162.195 9999
183.189.114.218 80
106.113.242.113 9999
110.72.26.118 8123
118.254.153.227 3128
218.20.54.92 9999
120.42.124.17 48795
223.223.203.30 8080
111.76.64.44 4392
183.148.87.90 21140
221.198.105.220 8118
223.241.78.169 8010
113.89.15.143 9999
182.42.45.36 808
14.211.122.146 9797
202.105.111.162 9000
120.43.230.31 47224
115.202.80.0 28202
119.136.199.74 808
59.40.50.231 8010
223.241.117.25 8010
180.115.11.207 38367
223.241.117.196 8010
115.215.51.218 808
14.29.84.50 8080
111.76.175.214 4323
27.44.159.41 9797
115.46.68.40 8123
171.39.41.249 8123
121.31.71.193 8123
60.160.185.213 7654
14 changes: 14 additions & 0 deletions pdf/proxies.txt
@@ -0,0 +1,14 @@
58.62.86.245 9999
122.72.18.34 80
111.85.15.166 8080
122.72.18.61 80
116.52.224.81 9999
116.236.151.166 8080
183.95.22.121 53281
202.201.3.121 3128
113.87.88.103 9797
27.46.39.194 9797
120.76.55.49 8088
182.88.126.216 9797
14.211.119.11 9797
14.211.123.143 9797