wcrawler.py
import collections
import requests
from bs4 import BeautifulSoup

def get_html_content(url):
    try:
        response = requests.get(url, timeout=10)  # timeout so one slow server cannot hang the crawl
        response.raise_for_status()  # raises an HTTPError for bad responses (4xx/5xx)
        return response.text
    except requests.RequestException as e:
        # Request failed (connection error, timeout, bad status, ...): skip this URL.
        # print(f"Failed to retrieve content from {url}: {str(e)}")
        return None

def get_links_on_page(base_url, html):
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a')  # finds all <a> tags
    urls = []
    for link in links:
        href = link.get('href')
        if href:
            # Resolve relative URLs (both '/path' and 'path' forms) against the page URL;
            # urljoin leaves already-absolute URLs unchanged.
            href = requests.compat.urljoin(base_url, href)
            urls.append(href)
    return urls

class WCrawler:
    def __init__(self, url):  # constructor method
        self.visited_urls = set()  # URLs already seen (enqueued or crawled)
        self.visited_urls.add(url)  # mark the seed URL so it is not re-queued if a page links back to it
        self.url_queue = collections.deque()  # queue of URLs waiting to be processed
        self.url_queue.appendleft(url)  # seed the deque with the starting URL

    def get_url(self, url):
        html = get_html_content(url)
        if html is None:
            # print(f"Skipping {url}, as no HTML content could be retrieved.")
            return
        links = get_links_on_page(url, html)
        for link in links:
            if link not in self.visited_urls:
                self.visited_urls.add(link)
                self.url_queue.appendleft(link)

    def execute(self):
        # appendleft() + pop() gives FIFO order, i.e. a breadth-first crawl
        while self.url_queue:
            current_url = self.url_queue.pop()
            self.get_url(current_url)
        return list(self.visited_urls)
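
# --- Optional sketch (not part of the original script) ------------------------
# WCrawler follows every link it discovers, including links to other hosts, so
# a run against a real site may never terminate. Below is a minimal sketch of a
# bounded variant; the `max_pages` cap and the same-host check are assumptions
# added here for illustration, not part of the original design.
from urllib.parse import urlparse

class BoundedWCrawler(WCrawler):
    def __init__(self, url, max_pages=100):
        super().__init__(url)
        self.domain = urlparse(url).netloc  # only fetch pages on the seed URL's host
        self.max_pages = max_pages          # hard cap on pages fetched

    def execute(self):
        pages_fetched = 0
        while self.url_queue and pages_fetched < self.max_pages:
            current_url = self.url_queue.pop()
            if urlparse(current_url).netloc == self.domain:
                self.get_url(current_url)
                pages_fetched += 1
        return list(self.visited_urls)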

if __name__ == '__main__':
    initial_url = 'http://books.toscrape.com'  # set the URL you want to start crawling from
    crawler = WCrawler(initial_url)
    visited = crawler.execute()
    print("Visited URLs:")
    for url in visited:
        print(url)
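
    # Hypothetical bounded run using the sketch above: crawl at most 50 pages
    # on the seed URL's host instead of following every discovered link.
    # crawler = BoundedWCrawler(initial_url, max_pages=50)
    # print(crawler.execute())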