Skip to content

Commit

Permalink
Merge pull request #1727 from laws-africa/ranks
Browse files Browse the repository at this point in the history
use p99 as pagerank pivot
  • Loading branch information
longhotsummer authored Feb 13, 2024
2 parents 0ca5138 + 324f83e commit e61e236
Showing 1 changed file with 26 additions and 5 deletions.
31 changes: 26 additions & 5 deletions peachjam/graph/ranker.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
from statistics import geometric_mean
import math

import igraph as ig
from elasticsearch import helpers
Expand Down Expand Up @@ -55,10 +55,10 @@ def publish_ranks(self):
# calculate the pivot as the geometric mean of the ranks
ranks = [x for x in self.ranks if x > 0.0]
if ranks:
pivot = geometric_mean(ranks)
log.info(
f"Updating pagerank pivot (geometric mean of non-zero ranks): {pivot}"
)
# analysis shows that the p99 is a good pivot
# see https://colab.research.google.com/drive/1KlYC7A9JeqS_uaLhL8yv_IrAmHHXNCdK
pivot = percentile(sorted(ranks), 0.99)
log.info(f"Updating pagerank pivot (p99 of non-zero ranks): {pivot}")
settings = pj_settings()
settings.pagerank_pivot_value = pivot
settings.save(update_fields=["pagerank_pivot_value"])
Expand Down Expand Up @@ -103,3 +103,24 @@ def bulk_update(self, works):
request_timeout=60 * 60 * 30,
)
log.info("Updated index")


def percentile(values, percent):
    """
    Return the percentile of a pre-sorted list of values.

    The result is linearly interpolated between the two nearest ranks when
    the requested position falls between two elements.

    @parameter values - a list of numeric values. It MUST already be sorted
        in ascending order; this function does not sort it.
    @parameter percent - a float from 0.0 to 1.0 (e.g. 0.99 for the p99).
    @return - the percentile of the values, or None if values is empty.
    @raises ValueError - if values is non-empty and percent is outside
        [0.0, 1.0].
    """
    if not values:
        return None
    # Reject out-of-range fractions explicitly: a negative percent would
    # otherwise produce negative indices that silently wrap around to the
    # end of the list, and a percent above 1.0 would surface as an opaque
    # IndexError. The negated form also rejects NaN.
    if not 0.0 <= percent <= 1.0:
        raise ValueError(f"percent must be between 0.0 and 1.0, got {percent!r}")

    # Fractional position of the requested percentile within the list.
    k = (len(values) - 1) * percent
    lower = math.floor(k)
    upper = math.ceil(k)
    if lower == upper:
        # The position lands exactly on an element; no interpolation needed.
        return values[lower]

    # Weight each neighbour by its closeness to the fractional position.
    below = values[lower] * (upper - k)
    above = values[upper] * (k - lower)
    return below + above

0 comments on commit e61e236

Please sign in to comment.