-
Notifications
You must be signed in to change notification settings - Fork 12
/
builtwith.py
160 lines (119 loc) · 5.68 KB
/
builtwith.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import copy
import datetime
import re
import requests
# JSON endpoint of the BuiltWith API for every API version this module knows about.
ENDPOINTS_BY_API_VERSION = {
    1: 'http://api.builtwith.com/v1/api.json',
    2: 'http://api.builtwith.com/v2/api.json',
    7: 'http://api.builtwith.com/v7/api.json',
}

# Message template for unsupported-version errors; interpolated with the rejected version.
VERSION_EXCEPTION_TEMPLATE = 'Version %s'

# Keys of a technology dictionary whose values are timestamps that get converted to datetimes.
DATETIME_INFORMATION_NAMES = ['FirstDetected', 'LastDetected']
class UnsupportedApiVersion(NotImplementedError):
    """Error signalling that a requested BuiltWith API version has no known endpoint."""
def _convert_timestamp_to_utc_datetime(timestamp):
if not isinstance(timestamp, int):
timestamp = int(re.search(r'\d+', timestamp).group(0))
return datetime.datetime.utcfromtimestamp(timestamp / 1000)
class UrlTechnologiesSet(object):
    """
    The technologies reported for a single URL, indexed by technology name.
    """

    def __init__(self, technologies_list, last_full_builtwith_scan_date=None):
        """
        Build the name -> technology mapping from a list of technology dictionaries.

        Every dictionary is deep-copied, so the caller's data is never modified, and the copy's
        timestamp fields (see DATETIME_INFORMATION_NAMES) are replaced with datetime objects. If
        last_full_builtwith_scan_date (a datetime.date) is given, each copy additionally receives a
        boolean 'CurrentlyLive' entry.
        """
        by_name = self._technologies_by_name = {}
        for raw_technology in technologies_list:
            technology = copy.deepcopy(raw_technology)
            for field in DATETIME_INFORMATION_NAMES:
                technology[field] = _convert_timestamp_to_utc_datetime(raw_technology[field])

            # Per the BuiltWith team, the date of the last "FULL" scan is the best reference for
            # CurrentlyLive because the smaller "TOPSITE" list is not published. The trade-off is
            # that technologies on "TOPSITE" sites can be reported as detected during the last
            # scan even when they were not.
            if last_full_builtwith_scan_date:
                last_detected_on = technology['LastDetected'].date()
                technology['CurrentlyLive'] = last_full_builtwith_scan_date <= last_detected_on

            by_name[raw_technology['Name']] = technology

    def __iter__(self):
        """Iterate over the stored (copied and formatted) technology dictionaries."""
        technologies = self._technologies_by_name.values()
        return iter(technologies)

    def get_technology_info(self, technology_name):
        """Return the dictionary stored under technology_name, or None if there is none."""
        return self._technologies_by_name.get(technology_name)

    def list_technologies(self):
        """Return the names of all stored technologies."""
        return self._technologies_by_name.keys()
class BuiltWithDomainInfo(object):
    """
    Technologies found for a domain, grouped by a (domain, subdomain, path) URL key.
    """

    def __init__(self, api_response_json, last_full_builtwith_scan_date=None):
        """
        Keep the raw API response and build one UrlTechnologiesSet per entry of its 'Paths' list.

        last_full_builtwith_scan_date is handed unchanged to every UrlTechnologiesSet.
        """
        self.api_response_json = api_response_json
        self._technologies_by_url = {}
        for entry in api_response_json['Paths']:
            # 'SubDomain' is optional in a path entry; a missing one is keyed as None.
            key = self.__get_url_key(entry['Domain'], entry.get('SubDomain'), entry['Url'])
            technologies = UrlTechnologiesSet(
                entry['Technologies'],
                last_full_builtwith_scan_date=last_full_builtwith_scan_date)
            self._technologies_by_url[key] = technologies

    def __iter__(self):
        """Iterate over the UrlTechnologiesSet objects, one per known URL."""
        technology_sets = self._technologies_by_url.values()
        return iter(technology_sets)

    @staticmethod
    def __get_url_key(domain, subdomain, path):
        """Return the tuple used as the lookup key for a URL."""
        return (domain, subdomain, path)

    def available_urls(self):
        """Return the (domain, subdomain, path) keys of all known URLs."""
        return self._technologies_by_url.keys()

    def get_technologies_by_url(self, domain, subdomain, path):
        """Return the technologies stored for the given URL parts, or None if the URL is unknown."""
        key = self.__get_url_key(domain, subdomain, path)
        return self._technologies_by_url.get(key)
# Example usage:
# V1:
#
# >>> from builtwith import BuiltWith
# >>> bw = BuiltWith(YOUR_API_KEY)
# >>> bw.lookup(URL)
#
# V2:
#
# >>> from builtwith import BuiltWith
# >>> bw = BuiltWith(YOUR_API_KEY, api_version=2)
# >>> bw.lookup(URL)
#
# V7:
#
# >>> from builtwith import BuiltWith
# >>> bw = BuiltWith(YOUR_API_KEY, api_version=7)
# >>> bw.lookup(URL) # look up a single domain
# >>> bw.lookup([URL1, URL2, ..., URL16]) # or look up as many as 16 domains at once
class BuiltWith(object):
    """
    Client for the BuiltWith API; the supported versions are the keys of ENDPOINTS_BY_API_VERSION.
    """

    def __init__(self, key, api_version=1):
        """
        Initialize the client.

        :param key: BuiltWith API key, sent with every lookup.
        :param api_version: API version to use; defaults to `1`.
        :raises UnsupportedApiVersion: if no endpoint is known for `api_version`.
        """
        if api_version not in ENDPOINTS_BY_API_VERSION:
            # Wrap in a tuple so the message formats correctly whatever type was passed.
            raise UnsupportedApiVersion(VERSION_EXCEPTION_TEMPLATE % (api_version,))
        self.key = key
        self.api_version = api_version

    def _get_last_full_scan_date(self, timeout=None):
        """Query the update endpoint; return the last full scan date, or None if not reported."""
        response = requests.get(
            ENDPOINTS_BY_API_VERSION[self.api_version], params={'UPDATE': 1}, timeout=timeout)
        # A missing or empty 'FULL' field means "unknown" rather than an error.
        full_scan = response.json().get('FULL')
        if not full_scan:
            return None
        return datetime.datetime.strptime(full_scan, '%Y-%m-%d').date()

    def lookup(self, domain, get_last_full_query=True, timeout=None):
        """
        Lookup BuiltWith results for the given domain.

        If API version 2 or 7 is used and the get_last_full_query flag is enabled, the date of the
        last full BuiltWith scan is queried first and passed on to the returned objects. No such
        query is made when the flag is disabled.

        :param domain: domain to look up; with API version 7 a list or tuple of domains may be
            given, which is sent as a single comma-separated lookup.
        :param get_last_full_query: whether to fetch the last full scan date (versions 2 and 7).
        :param timeout: optional timeout forwarded to `requests.get`; the default of None keeps
            the requests library's own default behavior.
        :return: the decoded JSON response for version 1, a BuiltWithDomainInfo for version 2, or a
            list of BuiltWithDomainInfo objects (one per entry of 'Results') for version 7.
        """
        if self.api_version == 7 and isinstance(domain, (list, tuple)):
            domain = ','.join(domain)

        last_full_builtwith_scan_date = None
        # Only spend the extra round trip when the caller actually wants the scan date.
        if get_last_full_query and self.api_version in (2, 7):
            last_full_builtwith_scan_date = self._get_last_full_scan_date(timeout=timeout)

        response = requests.get(
            ENDPOINTS_BY_API_VERSION[self.api_version],
            params={'KEY': self.key, 'LOOKUP': domain},
            timeout=timeout)
        payload = response.json()

        if self.api_version == 1:
            return payload
        elif self.api_version == 2:
            return BuiltWithDomainInfo(payload, last_full_builtwith_scan_date)
        elif self.api_version == 7:
            return [BuiltWithDomainInfo(result['Result'], last_full_builtwith_scan_date)
                    for result in payload['Results']]