Fix 503 error #32

Open · wants to merge 2 commits into master
Binary file modified .DS_Store
Binary file not shown.
4 changes: 3 additions & 1 deletion pdf/README.md
@@ -24,6 +24,8 @@ $ sudo yum install wkhtmltopdf # centos

### Run
```python
python get_proxy.py   # scrape candidate proxies into host.txt
python censor.py      # keep only the working ones, in proxies.txt
python crawler.py     # crawl and build the PDF via the proxy pool
```
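
The three steps form a pipeline: `get_proxy.py` scrapes candidate proxies into `host.txt`, `censor.py` keeps only the ones that respond and writes them to `proxies.txt`, and `crawler.py` draws from that pool on every request, so run them in this order.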

@@ -42,7 +44,7 @@ python crawler.py
### Changelog

* 2017-2-21: Fully refactored the code for extensibility; a crawler subclass only needs to implement the `parse_menu` and `parse_body` methods to get the HTML-to-PDF conversion logic (see the sketch after this list)

* 2017-11-14: Added a proxy pool and fixed the 503 errors
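
For reference, a minimal sketch of such a subclass, assuming a base class named `Crawler` imported from `crawler.py`; the base-class name, import path, and CSS selectors here are illustrative assumptions, and only the `parse_menu`/`parse_body` contract comes from the changelog entry above:

```python
from bs4 import BeautifulSoup

from crawler import Crawler  # assumed base-class name and module


class MyDocsCrawler(Crawler):
    def parse_menu(self, response):
        """Yield the URL of every page to include in the PDF."""
        soup = BeautifulSoup(response.text, "html.parser")
        for a in soup.select("ul.toc a"):  # assumed menu selector
            yield a.get("href")

    def parse_body(self, response):
        """Return one page's cleaned HTML, encoded for writing as bytes."""
        soup = BeautifulSoup(response.text, "html.parser")
        body = soup.find(class_="content")  # assumed content container
        return str(body).encode("utf-8")
```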

### Contact me

21 changes: 21 additions & 0 deletions pdf/censor.py
@@ -0,0 +1,21 @@
import requests

# Read candidate proxies (one "ip<TAB>port" per line) from host.txt and
# keep only the ones that can actually reach an HTTPS site within the
# timeout; the survivors are written to proxies.txt.
fpr = open('host.txt', 'r')
fpw = open('proxies.txt', 'w')

for line in fpr.readlines():
    ip = line.strip('\n').split('\t')
    pro = {'https': ip[0] + ':' + ip[1]}
    print(pro)
    try:
        response = requests.get('https://www.baidu.com', proxies=pro, timeout=2)
        print(response)
        fpw.write(line)  # the proxy responded; keep it
    except Exception as e:
        print(e)

fpr.close()
fpw.close()
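
Worth noting: the candidates are probed one at a time with a 2-second timeout each, using `https://www.baidu.com` as the reachability check, so validating the 100 entries in `host.txt` can take a few minutes.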
48 changes: 34 additions & 14 deletions pdf/crawler.py
@@ -3,13 +3,12 @@

import logging
import os
import random
import re
import time
from urllib import request

try:
    from urllib.parse import urlparse  # py3
except:
    from urlparse import urlparse  # py2
from urllib.parse import urlparse

import pdfkit
import requests
@@ -25,7 +24,6 @@
{content}
</body>
</html>

"""


@@ -51,7 +49,26 @@ def request(url, **kwargs):
        Make the network request and return the response object.
        :return: the requests response
        """
        response = requests.get(url, **kwargs)
        # Build the proxy pool from the file produced by censor.py.
        fp = open('proxies.txt', 'r')
        ips = fp.readlines()
        fp.close()
        proxys = list()
        for p in ips:
            ip = p.strip('\n').split('\t')
            pro = dict()
            pro['https'] = ip[0] + ':' + ip[1]
            proxys.append(pro)

        headers = {
            'user-agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'}
        # Keep trying random proxies until one returns HTTP 200.
        while True:
            try:
                response = requests.get(url, headers=headers, proxies=random.choice(proxys), timeout=2)
                if response.status_code != 200:
                    continue
                break
            except Exception as e:
                print(e)

        print("response:", response)
        return response
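
A note on the approach: every call rebuilds the pool from `proxies.txt` and keeps drawing random proxies until one returns HTTP 200, which is what works around the 503 rate-limiting this PR targets; the 2-second timeout keeps dead proxies from stalling the crawl. The loop has no retry cap, though, so if every proxy in the pool is dead it will spin forever.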

    def parse_menu(self, response):
@@ -70,6 +87,7 @@ def parse_body(self, response):

    def run(self):
        start = time.time()

        options = {
            'page-size': 'Letter',
            'margin-top': '0.75in',
@@ -93,7 +111,6 @@ def run(self):
            with open(f_name, 'wb') as f:
                f.write(html)
            htmls.append(f_name)

        pdfkit.from_file(htmls, self.name + ".pdf", options=options)
        for html in htmls:
            os.remove(html)
@@ -112,10 +129,12 @@ def parse_menu(self, response):
        :param response: the response object returned by the crawler
        :return: a generator of URLs
        """
        soup = BeautifulSoup(response.content, "html.parser")
        menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
        for li in menu_tag.find_all("li"):
            url = li.a.get("href")
        soup = BeautifulSoup(response.text, "html.parser")
        menu_tag = soup.find_all('ul', class_="uk-nav uk-nav-side")[1]
        # Walk the anchor tags directly rather than each <li>'s first child.
        for li in menu_tag.find_all("a"):
            url = li.get("href")
            if not url.startswith("http"):
                url = "".join([self.domain, url])  # expand to an absolute URL
            yield url
@@ -127,9 +146,10 @@ def parse_body(self, response):
        :return: the processed HTML text
        """
        try:
            soup = BeautifulSoup(response.content, 'html.parser')
            body = soup.find_all(class_="x-wiki-content")[0]
            soup = BeautifulSoup(response.text, 'html.parser')
            body = soup.find_all(class_="x-wiki-content x-main-content")[0]

            # Insert the title, centered
            title = soup.find('h4').get_text()
            center_tag = soup.new_tag("center")
@@ -158,6 +178,6 @@ def func(m):


if __name__ == '__main__':
start_url = "http://www.liaoxuefeng.com/wiki/0013739516305929606dd18361248578c67b8067c8c017b000"
start_url = "https://www.liaoxuefeng.com/wiki/0013739516305929606dd18361248578c67b8067c8c017b000"
crawler = LiaoxuefengPythonCrawler("廖雪峰Git", start_url)
crawler.run()
22 changes: 22 additions & 0 deletions pdf/get_proxy.py
@@ -0,0 +1,22 @@
from urllib import request
from bs4 import BeautifulSoup

# Scrape the first two pages of xicidaili.com's HTTPS proxy list and
# write one "ip<TAB>port" pair per line to host.txt.
fp = open('host.txt', 'w')
for i in range(1, 3):
    url = 'http://www.xicidaili.com/wn/' + str(i)
    opener = request.build_opener()
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')]
    request.install_opener(opener)
    response = request.urlopen(url)
    soup = BeautifulSoup(response.read(), 'html.parser')
    rows = soup.find_all(class_='odd')  # the table rows that hold proxy entries
    for elem in rows:
        data = elem.find_all('td')
        ip = data[1].string
        port = data[2].string
        fp.write(ip + '\t' + port + '\n')

fp.close()
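
One caveat: on xicidaili the listing rows appear to alternate between `class="odd"` and no class, so matching on `class_='odd'` picks up roughly every other row; across the two pages fetched that still yields the 100 candidates in `host.txt` below.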
100 changes: 100 additions & 0 deletions pdf/host.txt
@@ -0,0 +1,100 @@
58.62.86.245 9999
61.152.230.26 8080
171.104.132.28 9999
112.74.94.142 3128
139.224.24.26 8888
122.72.18.34 80
111.85.15.166 8080
122.72.18.61 80
119.90.63.3 3128
27.44.170.236 9999
116.52.224.81 9999
119.114.229.137 80
180.76.134.106 3128
116.211.88.90 3128
116.236.151.166 8080
183.135.251.12 30291
125.112.194.189 37301
183.95.22.121 53281
182.246.209.199 80
222.132.145.126 80
202.201.3.121 3128
222.78.116.105 26956
49.88.168.150 31329
139.208.198.170 8118
113.87.88.103 9797
183.135.249.107 42411
59.63.74.254 4362
49.85.13.107 42332
171.212.140.116 8118
183.51.191.234 9797
113.76.96.91 9797
123.185.129.93 8080
223.241.79.48 8010
39.88.13.3 53281
27.46.39.194 9797
115.203.196.36 22221
175.171.108.227 53281
218.29.111.106 9999
113.65.160.146 9797
120.76.55.49 8088
183.38.61.213 9999
59.60.168.219 29377
223.241.117.50 8010
115.221.117.220 22628
117.90.111.6 45614
114.234.80.152 29786
218.73.139.165 25933
182.34.50.184 42887
183.135.253.23 46668
182.88.126.216 9797
113.89.13.153 9999
1.194.162.92 39615
60.179.40.138 27739
175.147.66.112 8080
14.211.119.11 9797
113.121.251.55 31353
14.211.123.143 9797
114.239.222.194 44127
115.203.194.104 38774
115.230.62.194 31772
223.241.116.93 8010
59.38.61.207 9797
110.72.34.144 8123
59.40.68.44 8010
223.241.119.91 8010
125.126.172.225 23107
59.40.50.197 8010
113.121.170.239 25683
223.241.116.115 8010
27.44.162.195 9999
183.189.114.218 80
106.113.242.113 9999
110.72.26.118 8123
118.254.153.227 3128
218.20.54.92 9999
120.42.124.17 48795
223.223.203.30 8080
111.76.64.44 4392
183.148.87.90 21140
221.198.105.220 8118
223.241.78.169 8010
113.89.15.143 9999
182.42.45.36 808
14.211.122.146 9797
202.105.111.162 9000
120.43.230.31 47224
115.202.80.0 28202
119.136.199.74 808
59.40.50.231 8010
223.241.117.25 8010
180.115.11.207 38367
223.241.117.196 8010
115.215.51.218 808
14.29.84.50 8080
111.76.175.214 4323
27.44.159.41 9797
115.46.68.40 8123
171.39.41.249 8123
121.31.71.193 8123
60.160.185.213 7654
14 changes: 14 additions & 0 deletions pdf/proxies.txt
@@ -0,0 +1,14 @@
58.62.86.245 9999
122.72.18.34 80
111.85.15.166 8080
122.72.18.61 80
116.52.224.81 9999
116.236.151.166 8080
183.95.22.121 53281
202.201.3.121 3128
113.87.88.103 9797
27.46.39.194 9797
120.76.55.49 8088
182.88.126.216 9797
14.211.119.11 9797
14.211.123.143 9797