#!/usr/bin/env python3
import argparse
import re
import requests
import urllib3
from bs4 import BeautifulSoup
from Color_Console import ctext
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Usage: subscraper.py -u website.com -o output.txt
parser = argparse.ArgumentParser(description='Extract subdomains from JavaScript files.')
# Exactly one scan source is required: a single URL or a file of URLs.
source = parser.add_mutually_exclusive_group(required=True)
source.add_argument('-u', help='URL of the website to scan.')
source.add_argument('-f', help='File with a list of URLs to scan.')
# Exactly one output mode is required: write to a file or print verbosely.
output = parser.add_mutually_exclusive_group(required=True)
output.add_argument('-o', help='Output file (for results).')
output.add_argument('-v', help='Enables verbosity.', action="store_true")
args = parser.parse_args()
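# Example invocations (the file names here are illustrative):
#   ./subscraper.py -u example.com -v
#   ./subscraper.py -f urls.txt -o results.txt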
"""
We define subdomains enumerated and sites visited in order to compare both lists.
We can thus determine which subdomains have not yet been checked.
This 'domino effect' of subsequent requests yields much more subdomains than scanning only the front page.
Threading would be useful here for optimization purposes.
"""
SUBDOMAINS_ENUMERATED = []
SITES_VISITED = []
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}
"""
Find scripts function will initiate the sequence by identifying all script tags on a given page.
From there it enumerates a list, sorts it for duplicates and then passes the script content to find_subdomains function.
"""
def find_scripts(url):
    # If we already checked this site, skip it.
    if url in SITES_VISITED:
        return False
    # Otherwise, add it to the list of sites we have checked.
    SITES_VISITED.append(url)
    r = is_live(url)
    if not r:
        return False
    soup = BeautifulSoup(r.text, 'lxml')
    script_tags = soup.find_all('script')
    for script_tag in script_tags:
"""
Here we need to account for relative URLs and many other types of CDNs.
We sh account that files hosted on other websites can usually be omitted.
As such we will omit these in order to prevent us falling into a rabbit hole of requests.
There is a margin of error here but it's probably negligible in the bigger picture.
"""
        if is_src(script_tag.attrs):
            script_src = script_tag.attrs['src']
            # Protocol-relative URL: strip the leading slashes.
            if script_src.startswith("//"):
                parsed_url = script_src[2:]
            # Root-relative URL: prepend the site we are scanning.
            elif script_src.startswith("/"):
                parsed_url = url + script_src
            # Relative URL: prepend the site and a separator.
            elif "http" not in script_src:
                parsed_url = url + "/" + script_src
            # Absolute URL on another host: keep only the host.
            else:
                host_match = re.search(r"[a-zA-Z0-9-_.]+\.[a-zA-Z]{2,}", script_src)
                if not host_match:
                    continue
                parsed_url = host_match.group()
            try:
                find_subdomains(requests.get('http://' + parsed_url, verify=False, headers=HEADERS).text, url)
                src_match = re.search(r"[a-zA-Z0-9-_.]+\.[a-zA-Z]{2,}", script_src)
                if src_match and src_match.group() not in SUBDOMAINS_ENUMERATED:
                    SUBDOMAINS_ENUMERATED.append(src_match.group())
            except requests.RequestException:
                continue
        else:
            # Inline script tag: search its contents directly.
            find_subdomains(script_tag, url)

def is_src(tag):
    """
    Verify that the tag's attributes are a dictionary and that it contains a 'src' key.
    """
    return isinstance(tag, dict) and 'src' in tag

def is_live(url):
    """
    Wrap the request so we can capture errors. Request errors are very common
    during a scan, so we simply treat a failing site as not live.
    """
    try:
        return requests.get('http://' + str(url), verify=False, headers=HEADERS)
    except requests.RequestException:
        return False

def find_subdomains(script, url):
    """
    Once we have the javascript code, we must find all subdomains within it.
    We match it against a regex and then account for the various escaped forms
    one might expect to find, as illustrated below.
    """
    subdomain_matches = re.findall(r"[%\\]?[a-zA-Z0-9][a-zA-Z0-9-_.]*\." + re.escape(url), str(script))
    for subdomain in subdomain_matches:
        # If the subdomain is preceded by URL encoding, remove it.
        if "%" in subdomain:
            # Collapse double URL encoding first.
            while "%25" in subdomain:
                subdomain = subdomain.replace("%25", "%")
            parsed_subdomain = subdomain.split("%")[-1][2:]
        # If the subdomain is preceded by a \x escape sequence, remove it.
        elif "\\x" in subdomain:
            if args.v:
                ctext("[+] " + subdomain, "red")
            parsed_subdomain = subdomain.split("\\x")[-1][2:]
        # If the subdomain is preceded by a \u unicode sequence, remove it.
        elif "\\u" in subdomain:
            if args.v:
                ctext("[+] " + subdomain, "red")
            parsed_subdomain = subdomain.split("\\u")[-1][4:]
        # Otherwise proceed as normal.
        else:
            parsed_subdomain = subdomain
        if parsed_subdomain not in SUBDOMAINS_ENUMERATED:
            if args.v:
                ctext("[+] " + parsed_subdomain, "green")
            SUBDOMAINS_ENUMERATED.append(parsed_subdomain)
    # If we have enumerated subdomains that we have not yet visited, scan those too.
    if set(SUBDOMAINS_ENUMERATED) != set(SITES_VISITED):
        for site in SUBDOMAINS_ENUMERATED:
            find_scripts(site)

def ascii_banner():
    ctext(" `. ___", "red")
    ctext(" __,' __`. _..----....____", "red")
    ctext(" __...--.'``;. ,. ;``--..__ .' ,-._ _.-'", "red")
    ctext(" _..-''-------' `' `' `' O ``-''._ (,;') _,'", "red")
    ctext(",'________________ \`-._`-','", "red")
    ctext(" `._ ```````````------...___ '-.._'-:", "red")
    ctext(" ```--.._ ,. ````--...__\-.", "red")
    ctext(" `.--. `-` ____ | |`", "red")
    ctext(" `. `. ,'`````. ; ;`", "red")
    ctext(" `._`. __________ `. \'__/`", "red")
    ctext(" `-:._____/______/___/____`. \\ `", "red")
    ctext(" SUBSCRAPER | `._ `. \\", "red")
    ctext(" SUBSCRAPER `._________`-. `. `.___", "red")
    ctext(" SUBSCRAPER v1.0.0 `------'`", "red")
    ctext("\nSubdomains Found:\n")

def main():
    # Banner
    ascii_banner()
    # Read the URL list from the provided path
    if args.f:
        with open(args.f, "r") as url_list:
            for url in url_list:
                find_scripts(url.strip())
    # Otherwise read the single URL argument (argparse guarantees one of -u/-f is set)
    else:
        find_scripts(args.u)
    # Write the results if an output file was requested
    if args.o:
        with open(args.o, "w") as f:
            f.write("".join(subdomain + "\n" for subdomain in SUBDOMAINS_ENUMERATED))

if __name__ == '__main__':
    main()