From 61b1b0cb6a1b37eef72d23792822ca332d0e5286 Mon Sep 17 00:00:00 2001
From: Mihai Blidaru
Date: Wed, 23 Nov 2022 10:50:33 +0100
Subject: [PATCH] Sort rating tuples before computing similarities to avoid
 unnecessary computation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The similarity pass that follows the accumulation loops reads only the
upper triangle of the accumulators and mirrors the result into the lower
one, so updating the (xj, xi) and (xi, xi) entries was wasted work.
Sorting each rating list by inner id guarantees xi < xj for every pair
visited, so all updates land in the upper triangle. For each processed
user or item, the number of accumulator updates drops from N² to
N(N-1)/2, roughly half. The O(N log N) sort may cost more than it saves
on small datasets, but on larger ones such as movielens-1m the gains are
very noticeable.
---
 surprise/similarities.pyx | 40 +++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/surprise/similarities.pyx b/surprise/similarities.pyx
index 35e0c14d..ade0c72e 100644
--- a/surprise/similarities.pyx
+++ b/surprise/similarities.pyx
@@ -61,13 +61,15 @@ def cosine(int n_x, yr, int min_support):
     # the similarity matrix
     cdef double [:, ::1] sim = np.zeros((n_x, n_x), np.double)
 
-    cdef int xi, xj, y
+    cdef int xi, xj, y, i
     cdef double ri, rj
     cdef int min_sprt = min_support
 
-    for y, y_ratings in yr.items():
-        for xi, ri in y_ratings:
-            for xj, rj in y_ratings:
+    sorted_yr = {y: sorted(y_ratings, key=lambda x: x[0]) for y, y_ratings in yr.items()}
+
+    for y, y_ratings in sorted_yr.items():
+        for i, (xi, ri) in enumerate(y_ratings):
+            for xj, rj in y_ratings[i + 1:]:
                 freq[xi, xj] += 1
                 prods[xi, xj] += ri * rj
                 sqi[xi, xj] += ri**2
@@ -128,13 +130,15 @@ def msd(int n_x, yr, int min_support):
     # the similarity matrix
     cdef double [:, ::1] sim = np.zeros((n_x, n_x), np.double)
 
-    cdef int xi, xj
+    cdef int xi, xj, i
     cdef double ri, rj
     cdef int min_sprt = min_support
 
-    for y, y_ratings in yr.items():
-        for xi, ri in y_ratings:
-            for xj, rj in y_ratings:
+    sorted_yr = {y: sorted(y_ratings, key=lambda x: x[0]) for y, y_ratings in yr.items()}
+
+    for y, y_ratings in sorted_yr.items():
+        for i, (xi, ri) in enumerate(y_ratings):
+            for xj, rj in y_ratings[i + 1:]:
                 sq_diff[xi, xj] += (ri - rj)**2
                 freq[xi, xj] += 1
 
@@ -200,13 +204,15 @@ def pearson(int n_x, yr, int min_support):
     # the similarity matrix
     cdef double [:, ::1] sim = np.zeros((n_x, n_x), np.double)
 
-    cdef int xi, xj, y, n
+    cdef int xi, xj, y, n, i
     cdef double ri, rj, num, denum
     cdef int min_sprt = min_support
 
-    for y, y_ratings in yr.items():
-        for xi, ri in y_ratings:
-            for xj, rj in y_ratings:
+    sorted_yr = {y: sorted(y_ratings, key=lambda x: x[0]) for y, y_ratings in yr.items()}
+
+    for y, y_ratings in sorted_yr.items():
+        for i, (xi, ri) in enumerate(y_ratings):
+            for xj, rj in y_ratings[i + 1:]:
                 prods[xi, xj] += ri * rj
                 freq[xi, xj] += 1
                 sqi[xi, xj] += ri**2
@@ -296,7 +302,7 @@ def pearson_baseline(
     # the similarity matrix
     cdef double [:, ::1] sim = np.zeros((n_x, n_x), np.double)
 
-    cdef int y, xi, xj
+    cdef int y, xi, xj, i
     cdef double ri, rj, diff_i, diff_j, partial_bias
     cdef int min_sprt = min_support
     cdef double global_mean_ = global_mean
@@ -305,10 +311,12 @@
     # is 1, so that's OK.
     min_sprt = max(2, min_sprt)
 
-    for y, y_ratings in yr.items():
+    sorted_yr = {y: sorted(y_ratings, key=lambda x: x[0]) for y, y_ratings in yr.items()}
+
+    for y, y_ratings in sorted_yr.items():
         partial_bias = global_mean_ + y_biases[y]
-        for xi, ri in y_ratings:
-            for xj, rj in y_ratings:
+        for i, (xi, ri) in enumerate(y_ratings):
+            for xj, rj in y_ratings[i + 1:]:
                 freq[xi, xj] += 1
                 diff_i = (ri - (partial_bias + x_biases[xi]))
                 diff_j = (rj - (partial_bias + x_biases[xj]))
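
For reference, here is a minimal pure-Python sketch of the same upper-triangle
pattern, using the msd() measure as the example. The function msd_triangle and
the toy yr dict below are illustrative only, not part of the library; the
finalization loop follows the one already present in similarities.pyx.

import numpy as np


def msd_triangle(n_x, yr, min_support):
    """Mean squared difference similarity, accumulating the upper triangle only."""
    sq_diff = np.zeros((n_x, n_x))
    freq = np.zeros((n_x, n_x), dtype=int)
    sim = np.zeros((n_x, n_x))

    for y_ratings in yr.values():
        # Sorting by inner id guarantees xi < xj in every pair below, so all
        # updates land consistently in the upper triangle of the accumulators.
        ratings = sorted(y_ratings, key=lambda x: x[0])
        for i, (xi, ri) in enumerate(ratings):
            for xj, rj in ratings[i + 1:]:  # each unordered pair visited once
                sq_diff[xi, xj] += (ri - rj) ** 2
                freq[xi, xj] += 1

    # Finalization (unchanged by the patch): it reads the upper triangle only
    # and mirrors the result, which is why the lower triangle was never needed.
    for xi in range(n_x):
        sim[xi, xi] = 1
        for xj in range(xi + 1, n_x):
            if freq[xi, xj] >= min_support:
                sim[xi, xj] = 1 / (sq_diff[xi, xj] / freq[xi, xj] + 1)
            sim[xj, xi] = sim[xi, xj]
    return sim


# Toy usage: yr maps each y (e.g. a user) to a list of (inner_id, rating) tuples.
yr = {0: [(1, 4.0), (0, 3.0), (2, 5.0)], 1: [(2, 2.0), (0, 4.0)]}
print(msd_triangle(n_x=3, yr=yr, min_support=1))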