# Provenance: recovered from git patch 4c50711dde75e33de813acccab1042d5f95487de
#   From:    Takayama <49364055+MGMCN@users.noreply.github.com>
#   Date:    Wed, 1 Mar 2023 10:55:52 +0900
#   Subject: [PATCH] Added geonodedaili.py to crawl proxies (#186)
#              * add geonodedaili.py
#              * add headers through crawl function
#   Path:    proxypool/crawlers/public/geonodedaili.py (new file)
"""Crawler for the public proxy list served by https://proxylist.geonode.com/.

The endpoint returns JSON; each page is fetched with browser-like headers
and every entry of the ``data`` array is turned into a ``Proxy``.
"""
import json
import time

from loguru import logger
from retrying import RetryError

from proxypool.crawlers.base import BaseCrawler
from proxypool.schemas.proxy import Proxy

BASE_URL = 'https://proxylist.geonode.com/api/proxy-list?limit=500&page={page}&sort_by=lastChecked&sort_type=desc'
MAX_PAGE = 18

# Browser-like request headers sent with every page request.
# NOTE: the original header set also carried a hard-coded ``if-none-match``
# ETag captured from a browser session. That turns each request into a
# conditional GET, so the server may answer ``304 Not Modified`` with an
# empty body and the page would be skipped. It is deliberately omitted.
HEADERS = {
    'authority': 'proxylist.geonode.com',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"',
    'accept': 'application/json, text/plain, */*',
    'sec-ch-ua-mobile': '?0',
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/99.0.4844.83 Safari/537.36'
    ),
    'sec-ch-ua-platform': '"macOS"',
    'origin': 'https://geonode.com',
    'sec-fetch-site': 'same-site',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://geonode.com/',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7',
}


class GeonodeCrawler(BaseCrawler):
    """
    Geonode crawler, https://proxylist.geonode.com/
    """
    # One URL per result page, pages 1..MAX_PAGE inclusive.
    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]

    def parse(self, html):
        """
        Parse one JSON response body and yield the proxies it lists.

        :param html: response text; despite the name this is a JSON document
            whose ``data`` key holds a list of objects with ``ip`` and
            ``port`` fields.
        :return: generator of ``Proxy`` objects. Nothing is yielded when the
            body is not valid JSON or has no usable ``data`` list; entries
            lacking ``ip``/``port`` are skipped.
        """
        try:
            result = json.loads(html)
        except json.JSONDecodeError as error:
            logger.error(f'geonode response is not valid JSON: {error}')
            return

        proxy_list = result.get('data') if isinstance(result, dict) else None
        if not proxy_list:
            # Missing, null or empty ``data`` (for example a page past the
            # end of the list, or an error payload): nothing to yield.
            return

        for proxy_item in proxy_list:
            try:
                host = proxy_item['ip']
                port = proxy_item['port']
            except (KeyError, TypeError):
                # One malformed entry must not abort the rest of the page.
                logger.warning(f'skipping malformed geonode entry: {proxy_item!r}')
                continue
            # Values are passed through exactly as the API delivers them.
            yield Proxy(host=host, port=port)

    def crawl(self):
        """
        Override of the crawl main method that adds request headers.

        Fetches every URL in ``self.urls`` with ``HEADERS`` and yields the
        proxies produced by ``self.process`` for each non-empty response.

        :return: generator of the items yielded by ``self.process``.
        """
        try:
            for url in self.urls:
                logger.info(f'fetching {url}')
                # ``fetch`` and ``process`` are inherited and not defined in
                # this module; ``fetch`` is presumably what raises
                # RetryError once its retries are exhausted.
                html = self.fetch(url, headers=HEADERS)
                if not html:
                    continue
                # Short pause between successful requests.
                time.sleep(.5)
                yield from self.process(html, url)
        except RetryError:
            logger.error(
                f'crawler {self} crawled proxy unsuccessfully, '
                'please check if target url is valid or network issue')


if __name__ == '__main__':
    crawler = GeonodeCrawler()
    for proxy in crawler.crawl():
        print(proxy)