Main.py
97 lines (75 loc) · 2.68 KB
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import re
def scrapeDomain(domain, maxDepth):
    #navigates to all pages within maxDepth of domain, staying within the domain
    #returns a list of all unique addresses found while navigating
    toVisit = set()
    toVisit.add(domain)
    depths = {}
    depths[domain] = 0
    visited = set()
    addresses = set()
    while len(toVisit) > 0:
        #visit the page
        page = toVisit.pop()
        print("visiting", page + ",", len(toVisit), "remaining")
        depth = depths[page]
        pageSource = getSoup(page)
        #log it as visited
        visited.add(page)
        #add the addresses
        for address in getAddresses(pageSource):
            addresses.add(address)
        #add the links to the queue to visit
        if depth < maxDepth:
            links = getLinks(pageSource)
            for link in links:
                try:
                    if link not in visited and link not in toVisit and inDomain(link, domain):
                        toVisit.add(link)
                        depths[link] = depth + 1
                        print("adding", link, "to queue at depth", depth + 1)
                except:
                    print("adding link", link, "failed")
    return list(addresses)
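# Example usage (hypothetical domain, commented out so the interactive prompt
# at the bottom of the file stays the only entry point):
#
#   sample = scrapeDomain('https://example.com/', 1)
#   print(len(sample), "candidate addresses found one level deep")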
def getSoup(url):
    #returns the BeautifulSoup for url
    #try using requests
    result = requests.get(url)
    soup = BeautifulSoup(result.content, 'html.parser')
    #test if it got a valid webpage by checking the number of links
    if len(soup.find_all('a')) > 1:
        return soup
    print("requests failed for", url, "using selenium instead")
    print(url.replace('/', '-'))
    output = open(url.replace('/', '').replace(':', '').replace('?', '').replace('*', '') + '.txt', "w")
    output.write(soup.prettify())
    output.close()
    #try using selenium
    driver = webdriver.Chrome()
    driver.get(url)
    html = driver.page_source
    driver.close()
    return BeautifulSoup(html, 'html.parser')
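# A minimal sketch of a headless variant of the Selenium fallback, assuming a
# Selenium release that accepts the options keyword on webdriver.Chrome and a
# chromedriver on the PATH. It is not called anywhere; it only shows how the
# fallback above could run without opening a visible browser window.
def getSoupHeadless(url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')          # run Chrome without a window
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    html = driver.page_source
    driver.quit()                               # quit also shuts down the driver process
    return BeautifulSoup(html, 'html.parser')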
def getLinks(soup):
    #returns the href values of all <a> tags in a soup
    #(may include None or relative links; filtering against the domain happens in scrapeDomain)
    ret = []
    for link in soup.find_all('a'):
        ret.append(link.get('href'))
    return ret
def inDomain(url, domain):
    #returns True if url starts with domain
    #print("inDomain:", url, domain)
    return url[0:len(domain)] == domain
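# inDomain is a plain string-prefix test, so for example (illustrative URLs):
#   inDomain('https://example.com/about', 'https://example.com')  -> True
#   inDomain('https://other.com/page', 'https://example.com')     -> False
# Relative links simply fail the test, and None hrefs raise a TypeError that is
# caught by the try/except in scrapeDomain.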
def getAddresses(soup):
    #takes as input a soup that represents the source of a page
    #returns a list of possible addresses
    ptrn = '([0-9]+ [a-zA-Z0-9].{,100} [0-9]{5})[^0-9]'
    return re.findall(ptrn, soup.prettify(), re.DOTALL)
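# The pattern looks for a street number, up to 100 characters of text, and a
# five-digit ZIP code. For example, on the made-up string
#   "Visit us at 123 Main Street, Springfield, IL 62704 today"
# re.findall with this pattern returns ['123 Main Street, Springfield, IL 62704'].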
#url = 'https://www.crunchbase.com/'
#getSoup(url)
url = input('starting domain?')
addresses = scrapeDomain(url, 3)
for address in addresses:
    print(address)