-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscraper_v2.py
67 lines (57 loc) · 2.45 KB
/
scraper_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from collections import deque
from urllib.parse import urlparse

import requests
import validators
from bs4 import BeautifulSoup
def scrape(URL, timeout=10):
    """Download one page and return its paragraph text and outgoing links.

    Args:
        URL: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            a limit a single unresponsive host would stall the caller
            indefinitely.

    Returns:
        A two-item list ``[text_list, url_list]``. ``text_list`` holds the
        text of every ``<p>`` element in document order. ``url_list`` holds
        each distinct link taken from the ``<a href=...>`` elements, in
        first-seen order, limited to the ones ``validators.url`` accepts.

    Exceptions raised by ``requests.get`` are not caught here and propagate
    to the caller.
    """
    domain = urlparse(URL).netloc
    page = requests.get(URL, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    # One entry per <p> element.
    text_list = [paragraph.text for paragraph in soup.find_all('p')]

    url_list = []
    seen = set()
    for anchor in soup.find_all('a', href=True):
        url = anchor['href']
        if url.startswith('//'):
            # Protocol-relative link ("//host/path"): it already names its
            # own host, so only the scheme is missing.
            url = 'http:' + url
        elif url.startswith('/'):
            # Site-relative link ("/pagename"): attach it to this page's
            # domain. 'http' is used on purpose, in the hope of reaching
            # older sites for archiving or getting an https redirect.
            url = 'http://' + domain + url
        # De-duplicate on the final URL string, so the same address reached
        # through several different anchors is reported only once.
        if url in seen:
            continue
        seen.add(url)
        if validators.url(url):
            url_list.append(url)

    # [0] is the list of text, [1] the list of URLs.
    return [text_list, url_list]
def spider(starting_URL, is_dutch=None):
    """Crawl outward from ``starting_URL`` and return the archived pages.

    Pages are scraped in the order they were discovered. Links found on a
    page are only followed when the page passes the language check, so
    non-Dutch pages add no noise to the crawl.

    Args:
        starting_URL: First page to scrape.
        is_dutch: Optional callable that receives the first item returned by
            ``scrape`` for a page and returns a truthy value when the page
            should be kept. When ``None`` (the default) no language filter
            is applied and every scraped page is kept.

    Returns:
        A list of the URLs that were scraped and passed the language check,
        in the order they were processed.

    Exceptions raised by ``scrape`` are not caught here; they end the crawl.
    """
    print(starting_URL)
    pending = deque([starting_URL])
    # Every URL that has ever been queued. Checking against this set (rather
    # than only against the URLs still waiting) keeps pages that link to
    # each other from being scraped over and over.
    queued = {starting_URL}
    archive = []

    while pending:
        current = pending.popleft()
        scraped_text, scraped_urls = scrape(current)

        # This check could perhaps move into the evaluator. If the page is
        # not Dutch, nothing further is done with its scraped URLs, to
        # avoid possible noise.
        if is_dutch is not None and not is_dutch(scraped_text):
            continue

        for url in scraped_urls:
            if url not in queued:
                queued.add(url)
                pending.append(url)
        archive.append(current)

    return archive
# Manual test run: start a crawl from a single seed page.
# NOTE(review): these statements run at import time, so importing this
# module starts the crawl; consider an `if __name__ == "__main__":` guard.
URL = "https://nl.wikipedia.org/wiki/Nederland"
# Alternative seed page:
#URL = "https://nos.nl"
test_spider = spider(URL)
# Debug helper: scrape only the seed page and print its text and its links.
#results = scrape(URL)
#for lines in results[0]:
#    print(lines)
#for urls in results[1]:
#    print(urls)