forked from iclab/centinel-server
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlist_grabber.py
106 lines (84 loc) · 3.59 KB
/
list_grabber.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python
#
# Abbas Razaghpanah ([email protected])
# January 2015, Stony Brook University
#
# list_grabber.py: a script to hit a given URL and download
# all of the csv files linked on the index page.
# This can be run periodically to update URL lists for different
# countries, censorship products, etc. using a URL repository.
# The script supports HTTP basic authentication (the default) and,
# with --digest, HTTP digest authentication for the repository.
#
# A CSV file of the form XX.csv (two-character base name) is assumed
# to belong to country XX; it is saved as "country_list.csv" in the
# XX directory under the output directory (which is created if it
# doesn't exist). Everything else goes to "global".
import argparse
import os
import re
import requests
from os import path
from requests.auth import HTTPDigestAuth
from urlparse import urljoin
def parse_args(argv=None):
    """Parse and validate the command-line arguments.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads the arguments from ``sys.argv[1:]``.

    Returns:
        The argparse namespace with the attributes ``url``, ``user``,
        ``output`` and ``digest``.

    Exits through ``parser.error`` (usage message, exit status 2) when the
    output directory does not exist, when ``--digest`` is requested without
    ``--user``, or when ``--user`` is not of the form username:password.
    """
    parser = argparse.ArgumentParser()

    url_help = ('The URL for the index page. All of the file names from '
                'hyperlinks will be joined with this URL.')
    parser.add_argument('--url', '-U', help=url_help, required=True)

    user_help = ('Username and password for HTTP authentication. '
                 'It should be provided as username:password')
    parser.add_argument('--user', '-u', help=user_help, default=None)

    output_help = ('The output directory where the files are supposed to be '
                   'saved to. Defaults to current directory.')
    parser.add_argument('--output', '-o', help=output_help, default='.')

    digest_help = ('Enable HTTP digest authentication. If username and password '
                   'are provided, basic authentication is used as default.')
    parser.add_argument('--digest', '-d', help=digest_help, dest='digest',
                        action='store_true')
    parser.set_defaults(digest=False)

    args = parser.parse_args(argv)

    if not os.path.exists(args.output):
        parser.error("The output directory \"%s\" does not exist!" % args.output)

    # Digest authentication is meaningless without credentials. The check
    # must fire when --digest IS set; unauthenticated runs (no --user and
    # no --digest) are legitimate and must be allowed through.
    if args.user is None and args.digest:
        parser.error('Digest authentication has been enabled but no username and password '
                     'given.')

    # Reject malformed credentials here, with a usage message, rather than
    # letting the caller fail later while splitting on ':'.
    if args.user is not None and ':' not in args.user:
        parser.error('Credentials must be provided as username:password.')

    return args
def _split_credentials(userpass):
    """Split a "username:password" string at the first colon only."""
    # Only the first ':' separates the two fields, so a password that
    # itself contains colons is kept intact.
    username, separator, password = userpass.partition(':')
    if not separator:
        raise SystemExit('Credentials must be provided as username:password.')
    return username, password


def main():
    """Download the index page and save every CSV file it links to.

    Files whose base name is two characters long are treated as country
    lists and written to <output>/<XX>/country_list.csv; all other files
    are written to <output>/global/ under their own file name.
    A failed download of a single list is reported and skipped; a failed
    download of the index page raises.
    """
    args = parse_args()
    index_url = args.url

    auth = None
    if args.user is not None:
        username, password = _split_credentials(args.user)
        if args.digest:
            auth = HTTPDigestAuth(username, password)
        else:
            # A plain tuple makes requests use HTTP basic authentication.
            auth = (username, password)

    print("Downloading list index.")
    index = requests.get(index_url, auth=auth)
    index.raise_for_status()
    # Capture href values that end in .csv and contain no other dot or quote.
    links = re.findall(r'href="([^\'."]+\.csv)"', index.text)

    global_dir = os.path.join(args.output, "global")
    if not os.path.exists(global_dir):
        print("Creating \"global\" directory at %s." % global_dir)
        os.makedirs(global_dir)

    for csvfile in links:
        list_url = urljoin(index_url, csvfile)
        print("Downloading list \"%s\"." % list_url)
        try:
            response = requests.get(list_url, auth=auth)
            response.raise_for_status()
        except requests.exceptions.RequestException as exp:
            print("Error downloading file \"%s\": %s" % (list_url, exp))
            continue

        # The href comes from a remote page: keep only its last component
        # so an absolute or nested link cannot redirect the write outside
        # the output directory (or into a directory that does not exist).
        filename = os.path.basename(csvfile)
        destination = os.path.join(global_dir, filename)

        # find out if it is a country-specific list
        country = os.path.splitext(filename)[0].upper()
        if len(country) == 2:
            country_dir = os.path.join(args.output, country)
            if not os.path.exists(country_dir):
                print("Creating directory for country %s at %s." % (country, country_dir))
                os.makedirs(country_dir)
            destination = os.path.join(country_dir, "country_list.csv")

        # The payload is already encoded to bytes, so write in binary mode;
        # the context manager closes the file even if the write fails.
        with open(destination, 'wb') as output:
            output.write(response.text.encode('utf-8'))


if __name__ == "__main__":
    main()