-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun_cli.py
77 lines (63 loc) · 2.84 KB
/
run_cli.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os
os.environ['UA_PLATFORM'] = "android"
import logging
import requests
import urllib3
import urllib
from urllib.request import Request, urlopen
import requests_random_user_agent
from src.app_cli import run_spider
from src.dict_scraper.spiders import cambridge
from src.lib.json_to_apkg import generate_cloze
# Console logging for this script: records from DEBUG upward are emitted
# through a stream handler, formatted as "<time>:<level>:<message>".
formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')

# Stream handler (uses the handler's default stream).
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(stream_handler)

# Disabled alternative: also write ERROR-and-above records to a log file.
# file_handler = logging.FileHandler('run_cli.log')
# file_handler.setLevel(logging.ERROR)
# file_handler.setFormatter(formatter)
# logger.addHandler(file_handler)

# Disabled alternative: configure the root logger instead.
# logging.basicConfig(
#     level=logging.DEBUG,
#     format="%(asctime)s:%(levelname)s:%(message)s"
# )  # filename='test.log'
if __name__ == '__main__':
    # Manual smoke test: request one Cambridge Dictionary entry page with
    # urllib3 and log the HTTP status code. Nothing is parsed or stored.

    # Headers that present the client as Googlebot arriving from Google.
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'Referer': 'https://www.google.com'
    }

    word_url = "https://dictionary.cambridge.org/dictionary/english/sit"
    # word_url2 and gcurl are only referenced by the disabled experiments below.
    word_url2 = "https://dictionary.cambridge.org/dictionary/english/run"
    gcurl = "https://webcache.googleusercontent.com/search?q=cache:" + word_url

    # --- Disabled experiments: cloze generation / Google cache via requests ---
    # phrase = generate_cloze("an eye for an eye")
    # # response = requests.get(gcurl, headers=headers)
    # # print(response.content)
    # CONTAINER['url'] = gcurl

    # Live request: 1 s connect timeout, 2 s read timeout, up to 5 retries in
    # total of which at most 2 may be redirects.
    request_timeout = urllib3.Timeout(connect=1.0, read=2.0)
    retry_policy = urllib3.Retry(total=5, redirect=2)
    http = urllib3.PoolManager(timeout=request_timeout)
    response = http.request(
        'GET',
        word_url,
        headers=headers,
        retries=retry_policy,
    )
    logger.info(f"Status: {response.status}")

    # --- Disabled experiments: inspecting the urllib3 response ---
    # print(response.data)
    # print(response["headers"])

    # --- Disabled experiments: the same fetch through urllib.request ---
    # req = Request(word_url)
    # req.add_header('User-Agent', 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)')
    # content = urlopen(req).read()
    # print(content)
    # print(urllib.request.urlopen(word_url))

    # --- Disabled experiments: requests session / random user agent ---
    # s = requests.Session()
    # s.headers.update({'Referer': 'https://www.google.com'})
    # print(s.headers['User-Agent'], s.headers['Referer'])
    #
    # # Without a session
    # resp = requests.get('https://httpbin.org/user-agent')
    # print(resp.json()['user-agent'])
    # 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) '
    # 'Chrome/85.0.4183.140 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'

    # --- Disabled experiments: running the Scrapy spiders ---
    # print(run_spider(cambridge.MeaningsSpider, word_url, headers))
    # time.sleep(20)
    # print(run_spider(cambridge.MeaningsSpider, word_url2, headers))
    # run_spider(CambridgeSpider, gcurl, "com", "cbed-2-4", False)  # dt.now().strftime("%Y%m%d%H%M%S")
    # run_spider("https://dictionary.cambridge.org/dictionary/english/water", "com", dt.now().strftime("%Y%m%d%H%M%S"))