-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcc.py
50 lines (40 loc) · 1.27 KB
/
cc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import os
import time
import json
import requests
from downloader import Downloader
# Base URL of the Common Crawl index server.
CC_URL = "https://index.commoncrawl.org"
# URL of the collinfo.json document served under CC_URL.
LIST_URL = f"{CC_URL}/collinfo.json"
def get_collinfo():
    """Return the parsed contents of ``collinfo.json``.

    A ``collinfo.json`` file in the current working directory takes
    precedence; when it is absent the document is downloaded from
    ``LIST_URL``. The downloaded copy is not written to disk.

    Returns:
        The decoded JSON value.

    Raises:
        requests.exceptions.RequestException: If the download fails, times
            out, or the server answers with an HTTP error status.
        ValueError: If the local file or the response body is not valid JSON.
    """
    path = "collinfo.json"
    if os.path.exists(path):
        # Explicit encoding: JSON is UTF-8, regardless of the platform locale.
        with open(path, "r", encoding="utf-8") as file:
            return json.load(file)

    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(LIST_URL, timeout=30)
    # Fail loudly on 4xx/5xx instead of returning an error payload as data.
    response.raise_for_status()
    return response.json()
def get_pdf_links(downloader: "Downloader", cdx_api, url, max_retries=None):
    """Query a CDX index endpoint for PDF URLs captured under a domain.

    The request asks for JSON output, domain-level matching, a ``mime:pdf``
    filter and only the ``url`` field. Connection errors and non-200/404
    responses are retried after a 2 second pause.

    Args:
        downloader: Object whose ``fetch(cdx_api, params)`` returns a response
            exposing ``status_code`` and ``text``; ``shutdown_gateway()`` is
            called on it before aborting.
        cdx_api: URL of the index endpoint to query.
        url: Domain (or URL) to search for.
        max_retries: Maximum number of consecutive connection errors to
            retry. ``None`` (the default) retries indefinitely.

    Returns:
        A list with one parsed JSON value per non-blank response line, or an
        empty list when the endpoint answers 404.

    Raises:
        SystemExit: If ``max_retries`` consecutive connection errors have
            already been retried and another one occurs.
    """
    params = {
        "url": url,
        "output": "json",
        "matchType": "domain",
        "filter": "mime:pdf",
        "fl": "url",
    }

    # Loop instead of recursing so a long outage cannot hit the recursion limit.
    failures = 0
    while True:
        try:
            response = downloader.fetch(cdx_api, params)
        except requests.exceptions.ConnectionError:
            failures += 1
            if max_retries is not None and failures > max_retries:
                # NOTE(review): this abort path was previously an unreachable
                # duplicate ``except`` clause; it is now opt-in via max_retries.
                print("Connection aborted")
                downloader.shutdown_gateway()
                raise SystemExit(1)
            print("Sleeping before retry...")
            time.sleep(2)
            continue

        failures = 0  # only consecutive connection errors count
        if response.status_code == 404:
            return []
        if response.status_code == 200:
            break
        print("Sleeping before retry...")
        time.sleep(2)

    # One JSON document per line; skip blank lines so they cannot break parsing.
    return [
        json.loads(line)
        for line in response.text.splitlines()
        if line.strip()
    ]