process_link.py
import requests
import requests_toolbelt.adapters.appengine
from bs4 import BeautifulSoup

HOME_URL = "https://en.wikipedia.org"
MAX_LEVEL = 1    # max crawl depth
SEARCH_MODE = 1  # default search mode
#START_URL = "https://en.wikipedia.org/wiki/Unus_pro_omnibus,_omnes_pro_uno"

entryList = []  # accumulates one dictionary per hyperlink found across the crawl
def scrape(self_url, currentLevel):
    if currentLevel + 1 > MAX_LEVEL:  # stop recursing once the spider would exceed the maximum crawl depth
        return entryList
    currentLevel += 1
    #print("Currently scraping " + self_url)
    page = requests.get(self_url)
    # TODO throw exception here
    soup = BeautifulSoup(page.content, 'lxml')
    self_title = soup.title.contents[0]  # title of the page being scraped
    listTags = []  # list to store all hyperlinks found
    if SEARCH_MODE == 1:  # OPTION 1: search only the FIRST paragraph of each page for hyperlinks
        tag = soup.find('p')  # find the first paragraph
        table = tag.findParents('table')
        while table or 'class' in tag.attrs:  # skip paragraphs that sit inside a table or carry a class attribute
            tag = tag.findNext('p')
            table = tag.findParents('table')
        while (tag.find('a') is not None and 'Coordinates' in tag.find('a').contents) or (tag.get('class') is not None):
            # skip coordinate links and anything that is not a plain <p> tag
            tag = tag.findNext('p')
        listTags.extend(tag.findAll('a'))
    elif SEARCH_MODE == 2:  # OPTION 2: search only the introduction of each page for hyperlinks
        stop_at = soup.find('h2')  # the first <h2> element, i.e. the first subsection header
        class_extr = stop_at.find_all_previous('p')  # all paragraphs that appear before that header
        for paragraph in class_extr:
            listTags.extend(paragraph.findAll('a'))
    elif SEARCH_MODE == 3:  # OPTION 3: search the entire page for hyperlinks
        for paragraph in soup.findAll('p'):  # for each paragraph found
            listTags.extend(paragraph.findAll('a'))  # store all hyperlinks found

    # clean up the list of hyperlinks; retain only relevant links
    #listLinks = []  # stores the name and url of each hyperlink found
    listOfFilterKeywords = ['cite_note', 'File', 'wikimedia', 'Help', ':Verifiability', 'Wikipedia:', 'wiktionary.org']  # keywords marking links to filter out
    for tag in listTags:
        if not any(keyword in str(tag) for keyword in listOfFilterKeywords):  # skip tags containing any filter keyword
            if 'title' in tag.attrs and 'href' in tag.attrs:  # keep only tags that carry both a title and a link
                #listLinks.append((tag['title'], HOME_URL + tag['href']))  # appends a title-url pair to listLinks
                entry = {"self_title": self_title, "self_url": self_url, "ext_title": tag['title'], "ext_url": HOME_URL + tag['href'], "current_level": currentLevel}  # one record per hyperlink, noting the page it was found on and the crawl depth
                if entry not in entryList:  # filter out entries already present
                    entryList.append(entry)
                    scrape(entry["ext_url"], currentLevel)  # depth-first search via recursion
    return entryList

def proc_data(entryList, isMobileBrowser):
    # build a list of the unique URLs found
    urls = list(set([data['self_url'] for data in entryList]))       # de-duplicated source-page URLs
    urls.extend(list(set([data['ext_url'] for data in entryList])))  # plus the de-duplicated outgoing URLs
    #print(entryList)
    nodeList = []  # to store nodes
    for url in urls:
        for data in entryList:
            if url == data["self_url"]:
                entry = {"id": url, "label": data["self_title"], "level": data["current_level"] - 1}
                if entry not in nodeList:
                    nodeList.append(entry)
                break
            elif url == data["ext_url"]:  # search again from self_urls?
                entry = {"id": url, "label": data["ext_title"], "level": data["current_level"]}
                if entry not in nodeList:
                    nodeList.append(entry)
                break
    linkList = []  # to store links
    for data in entryList:
        if isMobileBrowser:
            strength = 0.6
        else:
            # strength formula: links found deeper in the crawl are drawn weaker
            strength = 0.8 - 0.4 * data["current_level"] / MAX_LEVEL
        if data["ext_url"] is not None:
            linkList.append({"target": data["self_url"], "source": data["ext_url"], "strength": strength})
    return nodeList, linkList
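
# Illustrative shape of proc_data's output (a sketch, not real crawl data): with
# MAX_LEVEL = 1 on a desktop browser, a single link from the start page would yield
# roughly
#   nodeList -> [{"id": <start url>, "label": <start title>, "level": 0},
#                {"id": <linked url>, "label": <linked title>, "level": 1}]
#   linkList -> [{"target": <start url>, "source": <linked url>, "strength": 0.4}]
# where strength = 0.8 - 0.4 * 1 / 1 = 0.4; the id/label/level and
# target/source/strength fields suggest these lists feed a node-link graph view.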

def generate_lists(self_url, max_level, isMobileBrowser, search_mode):
    requests_toolbelt.adapters.appengine.monkeypatch()  # patch requests for Google App Engine compatibility; comment this out to test on a development server
    # point HOME_URL at the same Wikipedia language edition as the start URL
    new_wikipedia_region = self_url.split("wikipedia.org", 1)[0]
    new_home_url = new_wikipedia_region + "wikipedia.org"
    global HOME_URL
    HOME_URL = new_home_url
    # set MAX_LEVEL
    global MAX_LEVEL
    MAX_LEVEL = int(max_level)
    # set SEARCH_MODE
    global SEARCH_MODE
    SEARCH_MODE = int(search_mode)
    # clear the list for each new request made
    del entryList[:]
    nodeList, linkList = proc_data(scrape(self_url, currentLevel=0), isMobileBrowser)
    return nodeList, linkList
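

# Minimal local usage sketch. Assumptions: this runs outside Google App Engine, so
# the monkeypatch() call in generate_lists is commented out first (as its own
# comment suggests), and the start URL below is just an example article, not one
# used elsewhere in this project.
if __name__ == "__main__":
    nodes, links = generate_lists(
        self_url="https://en.wikipedia.org/wiki/Web_crawler",  # example start page (assumed)
        max_level=1,             # crawl one level deep
        isMobileBrowser=False,   # use the depth-based strength formula
        search_mode=1,           # OPTION 1: first paragraph only
    )
    print("Found {} nodes and {} links".format(len(nodes), len(links)))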