Skip to content

Commit

Permalink
optimize pagerank.py
Browse files Browse the repository at this point in the history
  • Loading branch information
cl117 committed Aug 30, 2024
1 parent 383359a commit e457a8e
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 113 deletions.
4 changes: 0 additions & 4 deletions flask/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ def add_roles(parts_response, term_list):
parts_response {List} -- List containing all parts from the SPARQL query
term_list {List} -- List of terms from the SO-Ontologies
"""
print("parts_response: ", len(parts_response))
print("term_list: ", len(term_list))
for part in parts_response:
# Split the CSV of roles from sparql
role = part.get('role')
Expand Down Expand Up @@ -109,8 +107,6 @@ def create_parts_index(index_name):
'number_of_shards': 1
}
}
logger_.log("index_name: ", index_name) # empty
logger_.log("body: ", body) # empty
es.indices.create(index=index_name, body=body)

logger_.log('Index created', True)
Expand Down
149 changes: 41 additions & 108 deletions flask/pagerank.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from xml.etree import ElementTree
import numpy as np
import query
from logger import Logger
Expand All @@ -25,84 +24,34 @@
}
'''


class graph:
    """Pre-refactor URI link graph (superseded by ``Graph`` below).

    NOTE(review): no ``__init__`` is visible in this view; these methods
    assume ``self.size``, ``self.in_links``, ``self.number_out_links``,
    ``self.dangling_pages`` and the uri/index mappings are created by a
    constructor defined elsewhere — confirm before reuse.
    """

    # create uri to index mapping
    def init_mapping(self, adjacency_list):
        """Build index2uri / uri2index over every parent AND child URI."""
        uris = set()
        for parent in adjacency_list:
            uris.add(parent)
            for child in adjacency_list[parent]:
                uris.add(child)

        self.index2uri = list(uris)
        self.uri2index = {}

        for i in range(len(self.index2uri)):
            uri = self.index2uri[i]
            self.uri2index[uri] = i

        # assert mappings are correct (round-trip index -> uri -> index)
        for i in range(len(self.index2uri)):
            uri = self.index2uri[i]
            index = self.uri2index[uri]
            assert(index == i)


    def init_in_links(self, adjacency_list):
        """For every node index, collect the indices of nodes that link to it."""
        for j in range(self.size):
            self.in_links[j] = []

        for parent in adjacency_list:
            for child in adjacency_list[parent]:
                parent_idx = self.uri2index[parent]
                child_idx = self.uri2index[child]
                self.in_links[child_idx].append(parent_idx)


    def init_number_out_links(self, adjacency_list):
        """Record each node's out-degree (number of children)."""
        for j in range(self.size):
            self.number_out_links[j] = 0

        for parent in adjacency_list:
            parent_idx = self.uri2index[parent]
            number_children = len(adjacency_list[parent])
            self.number_out_links[parent_idx] = number_children


    def init_dangling_pages(self, adjacency_list):
        """Collect indices of nodes with zero out-links (dangling pages)."""
        for parent in adjacency_list:
            number_children = len(adjacency_list[parent])
            if number_children == 0:
                self.dangling_pages.add(self.uri2index[parent])


class Graph:
    """Directed URI link graph, indexed by integer for the pagerank iteration.

    Arguments:
        adjacency_list {dict} -- maps each URI to a set of child URIs.
            Every child must itself appear as a key (populate_uris builds
            the dict that way); otherwise uri2index lookup raises KeyError.
    """

    def __init__(self, adjacency_list):
        # Dict insertion order fixes the uri <-> index mapping.
        self.uri2index = {uri: idx for idx, uri in enumerate(adjacency_list)}
        self.index2uri = list(adjacency_list.keys())
        self.size = len(self.index2uri)

        self.in_links = {i: [] for i in range(self.size)}          # child idx -> [parent idx]
        self.number_out_links = {i: 0 for i in range(self.size)}   # parent idx -> out-degree
        self.dangling_pages = set()                                # indices with no out-links

        # Single pass: record out-degrees, reverse edges, and dangling pages.
        for parent, children in adjacency_list.items():
            parent_idx = self.uri2index[parent]
            if children:
                self.number_out_links[parent_idx] = len(children)
                for child in children:
                    self.in_links[self.uri2index[child]].append(parent_idx)
            else:
                self.dangling_pages.add(parent_idx)

    def get_dangling_contrib(self, p):
        """Probability mass of dangling pages in p, spread uniformly over all pages."""
        return sum([p[j] for j in self.dangling_pages]) / self.size

    def get_teleportation_contrib(self):
        """Uniform teleportation term 1/N."""
        return 1.0 / self.size

def populate_uris(uri_response):
    """Build an adjacency list keyed by every subject URI, each with no edges yet.

    Arguments:
        uri_response {List} -- SPARQL rows, each with a 'subject' key

    Returns:
        dict mapping each subject URI to an empty set of children
    """
    adjacency_list = {}
    for row in uri_response:
        adjacency_list[row['subject']] = set()
    return adjacency_list

# add edges
def populate_links(link_response, adjacency_list):
    """Add parent -> child edges to the adjacency list in place.

    Arguments:
        link_response {List} -- SPARQL rows with 'parent' and 'child' keys
        adjacency_list {dict} -- URI -> set of child URIs, mutated in place

    Raises:
        KeyError -- if a parent URI is missing from adjacency_list
    """
    # The previous try/except that only re-raised was a no-op; a missing
    # parent still propagates as KeyError.
    for link in link_response:
        adjacency_list[link['parent']].add(link['child'])


def pagerank(g, s=0.85, tolerance=0.001):
    """Compute the pagerank vector for graph g by power iteration.

    Arguments:
        g {Graph} -- graph exposing size, in_links, number_out_links and the
                     dangling/teleportation contribution helpers
        s {float} -- damping factor (probability of following a link)
        tolerance {float} -- stop when the L1 change between iterations
                             drops below this value

    Returns:
        1-D numpy array of length g.size summing to 1 (empty for an empty graph)
    """
    n = g.size
    if n == 0:
        logger_.log('no iterations: empty graph', True)
        return np.ones(0)

    p = np.ones(n) / n  # start from the uniform distribution
    iteration = 1
    delta = 2  # anything above tolerance so the loop runs at least once

    while delta > tolerance:
        v = np.zeros(n)
        dangling_contrib = g.get_dangling_contrib(p)
        teleportation_contrib = g.get_teleportation_contrib()

        for j in range(n):
            # builtin sum here: np.sum over a generator falls back to
            # deprecated object iteration instead of vectorizing
            in_link_contrib = sum(p[k] / g.number_out_links[k] for k in g.in_links[j])
            v[j] = s * (in_link_contrib + dangling_contrib) + (1 - s) * teleportation_contrib

        v /= np.sum(v)  # renormalize to a probability distribution
        delta = np.sum(np.abs(p - v))
        logger_.log(f'Iteration {iteration}: L1 norm delta is {delta}', True)

        p = v
        iteration += 1

    return p

def make_uri2rank(pr_vector, uri2index):
    """Map each URI to its pagerank score.

    Arguments:
        pr_vector -- indexable pagerank scores (numpy array or list)
        uri2index {dict} -- URI -> position in pr_vector

    Returns:
        dict mapping URI -> pagerank score
    """
    return {uri: pr_vector[idx] for uri, idx in uri2index.items()}

def update_pagerank():
    """Rebuild pagerank scores: query URIs and links, build the graph, iterate.

    Returns:
        dict mapping URI -> pagerank score
    """
    logger_.log('------------ Updating pagerank ------------', True)
    logger_.log('******** Query for uris ********', True)
    uri_response = query.query_sparql(uri_query)
    logger_.log('******** Query for uris complete ********', True)

    adjacency_list = populate_uris(uri_response)

    logger_.log('******** Query for links ********', True)
    link_response = query.query_sparql(link_query)
    logger_.log('******** Query for links complete ********', True)

    populate_links(link_response, adjacency_list)

    g = Graph(adjacency_list)

    logger_.log('******** Running pagerank ********', True)
    # pagerank now returns a plain 1-D numpy array, so no matrix squeeze is needed
    pr_vector = pagerank(g, tolerance=float(config_manager.load_config()['pagerank_tolerance']))
    logger_.log('******** Running pagerank complete ********', True)
    logger_.log('------------ Successfully updated pagerank ------------\n', True)

    return make_uri2rank(pr_vector, g.uri2index)

2 changes: 1 addition & 1 deletion flask/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def parse_sparql_query(sparql_query, is_count_query):

# Construct es_query
es_query = ' '.join(keywords).strip()
print("Hello es_query: ", es_query)
#print("Hello es_query: ", es_query)

return es_query, _from, criteria, offset, limit, sequence, flags

Expand Down

0 comments on commit e457a8e

Please sign in to comment.