diff --git a/doc/source/prediction_algorithms.rst b/doc/source/prediction_algorithms.rst
index f358fb17..1235ade6 100644
--- a/doc/source/prediction_algorithms.rst
+++ b/doc/source/prediction_algorithms.rst
@@ -130,6 +130,9 @@ argument is a dictionary with the following (all optional) keys:
   ``'False'``) for the similarity not to be zero. Simply put, if
   :math:`|I_{uv}| < \text{min_support}` then :math:`\text{sim}(u, v) = 0`. The
   same goes for items.
+- ``'common_ratings_only'``: Whether to take only common user/item ratings
+  into account, or to use the full rating vectors (only relevant for the
+  :func:`cosine <surprise.similarities.cosine>` similarity). Default is ``True``.
 - ``'shrinkage'``: Shrinkage parameter to apply (only relevant for
   :func:`pearson_baseline <surprise.similarities.pearson_baseline>`
   similarity). Default is 100.
diff --git a/surprise/prediction_algorithms/algo_base.py b/surprise/prediction_algorithms/algo_base.py
index 844cb44e..5cae77af 100644
--- a/surprise/prediction_algorithms/algo_base.py
+++ b/surprise/prediction_algorithms/algo_base.py
@@ -285,6 +285,10 @@ def compute_similarities(self):
                 bx, by = bi, bu

             args += [self.trainset.global_mean, bx, by, shrinkage]
+        elif name == 'cosine':
+            common_ratings_only = self.sim_options.get('common_ratings_only',
+                                                       True)
+            args += [common_ratings_only]

         try:
             print('Computing the {0} similarity matrix...'.format(name))
diff --git a/surprise/similarities.pyx b/surprise/similarities.pyx
index 2a808017..70adebd0 100644
--- a/surprise/similarities.pyx
+++ b/surprise/similarities.pyx
@@ -25,11 +25,10 @@ from six.moves import range
 from six import iteritems


-def cosine(n_x, yr, min_support):
+def cosine(n_x, yr, min_support, common_ratings_only=True):
     """Compute the cosine similarity between all pairs of users (or items).

-    Only **common** users (or items) are taken into account. The cosine
-    similarity is defined as:
+    The cosine similarity is defined as:

     .. math::
         \\text{cosine_sim}(u, v) = \\frac{
@@ -52,8 +51,20 @@ def cosine(n_x, yr, min_support):
     For details on cosine similarity, see on `Wikipedia
     <https://en.wikipedia.org/wiki/Cosine_similarity#Definition>`__.

+    Depending on the ``common_ratings_only`` field of ``sim_options``, either
+    only common users (or items) are taken into account, or the full rating
+    vectors are used (default: ``True``).
+
""" + if common_ratings_only: + return cosine_common_ratings_only(n_x, yr, min_support) + else: + return cosine_full_rating_vectors(n_x, yr, min_support) + + +def cosine_common_ratings_only(n_x, yr, min_support): + # sum (r_xy * r_x'y) for common ys cdef np.ndarray[np.double_t, ndim=2] prods # number of common ys @@ -80,8 +91,92 @@ def cosine(n_x, yr, min_support): for xj, rj in y_ratings: freq[xi, xj] += 1 prods[xi, xj] += ri * rj - sqi[xi, xj] += ri**2 - sqj[xi, xj] += rj**2 + sqi[xi, xj] += ri ** 2 + sqj[xi, xj] += rj ** 2 + + for xi in range(n_x): + sim[xi, xi] = 1 + for xj in range(xi + 1, n_x): + if freq[xi, xj] < min_sprt: + sim[xi, xj] = 0 + else: + denum = np.sqrt(sqi[xi, xj] * sqj[xi, xj]) + sim[xi, xj] = prods[xi, xj] / denum + + sim[xj, xi] = sim[xi, xj] + + return sim + + +def cosine_full_rating_vectors(n_x, yr, min_support): + + # sum (r_xy * r_x'y) for common ys + cdef np.ndarray[np.double_t, ndim=2] prods + # number of common ys + cdef np.ndarray[np.int_t, ndim=2] freq + # sum (r_xy ^ 2) for common ys + cdef np.ndarray[np.double_t, ndim=2] sqi + # sum (r_x'y ^ 2) for common ys + cdef np.ndarray[np.double_t, ndim=2] sqj + # the similarity matrix + cdef np.ndarray[np.double_t, ndim=2] sim + + cdef int xi, xj + cdef double ri, rj + cdef int min_sprt = min_support + + prods = np.zeros((n_x, n_x), np.double) + freq = np.zeros((n_x, n_x), np.int) + sqi = np.zeros((n_x, n_x), np.double) + sqj = np.zeros((n_x, n_x), np.double) + sim = np.zeros((n_x, n_x), np.double) + + for y, y_ratings in iteritems(yr): + + # yr_ratings data structure is sparse. But for cosine similarity it is + # necessary to obtain all pairs, substituting missing ratings for 0. + # Implementation: + # Iterate through the range of x-indexes, taking 0-rating for each + # index unless this index is actually present in the iter + sorted_y_ratings = sorted(y_ratings, key=lambda x: x[0]) + xi_iter = iter(sorted_y_ratings) + try: + xi_non_missing, ri_non_missing = next(xi_iter) + except StopIteration: + xi_non_missing = n_x + for xi_all in range(n_x): + if xi_all < xi_non_missing: + xi = xi_all + ri = 0 + else: + xi = xi_non_missing + ri = ri_non_missing + try: + xi_non_missing, ri_non_missing = next(xi_iter) + except StopIteration: + xi_non_missing = n_x + + xj_iter = iter(sorted_y_ratings) + try: + xj_non_missing, rj_non_missing = next(xj_iter) + except StopIteration: + xj_non_missing = n_x + for xj_all in range(n_x): + if xj_all < xj_non_missing: + xj = xj_all + rj = 0 + else: + xj = xj_non_missing + rj = rj_non_missing + try: + xj_non_missing, rj_non_missing = next(xj_iter) + except StopIteration: + xj_non_missing = n_x + + freq[xi, xj] += 1 + prods[xi, xj] += ri * rj + sqi[xi, xj] += ri ** 2 + sqj[xi, xj] += rj ** 2 for xi in range(n_x): sim[xi, xi] = 1 @@ -149,7 +244,7 @@ def msd(n_x, yr, min_support): for y, y_ratings in iteritems(yr): for xi, ri in y_ratings: for xj, rj in y_ratings: - sq_diff[xi, xj] += (ri - rj)**2 + sq_diff[xi, xj] += (ri - rj) ** 2 freq[xi, xj] += 1 for xi in range(n_x): @@ -232,8 +327,8 @@ def pearson(n_x, yr, min_support): for xj, rj in y_ratings: prods[xi, xj] += ri * rj freq[xi, xj] += 1 - sqi[xi, xj] += ri**2 - sqj[xi, xj] += rj**2 + sqi[xi, xj] += ri ** 2 + sqj[xi, xj] += rj ** 2 si[xi, xj] += ri sj[xi, xj] += rj @@ -341,8 +436,8 @@ def pearson_baseline(n_x, yr, min_support, global_mean, x_biases, y_biases, diff_i = (ri - (partial_bias + x_biases_[xi])) diff_j = (rj - (partial_bias + x_biases_[xj])) prods[xi, xj] += diff_i * diff_j - sq_diff_i[xi, xj] += diff_i**2 - sq_diff_j[xi, xj] 
-                sq_diff_j[xi, xj] += diff_j**2
+                sq_diff_i[xi, xj] += diff_i ** 2
+                sq_diff_j[xi, xj] += diff_j ** 2

     for xi in range(n_x):
         sim[xi, xi] = 1
diff --git a/tests/test_similarities.py b/tests/test_similarities.py
index da2fbae6..c1722e59 100644
--- a/tests/test_similarities.py
+++ b/tests/test_similarities.py
@@ -12,11 +12,11 @@
 n_x = 8

 yr_global = {
-    0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 1.5), (7, 3)],  # noqa
+    0: [(0, 3), (1, 3), (2, 3),         (5, 1), (6, 1.5), (7, 3)],  # noqa
     1: [(0, 4), (1, 4), (2, 4), ],  # noqa
     2: [        (2, 5), (3, 2), (4, 3) ],  # noqa
-    3: [(1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)],  # noqa
-    4: [(1, 5), (2, 1), (5, 2), (6, 2.5), (7, 2.5)],  # noqa
+    3: [        (1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)],  # noqa
+    4: [        (1, 5), (2, 1),         (5, 2), (6, 2.5), (7, 2.5)],  # noqa
 }

@@ -48,7 +48,51 @@ def test_cosine_sim():
     # cosine sim is necessarily 1
     assert sim[3, 4] == 1

-    # pairs of users (0, 3) have no common items
+    # pairs of users (0, 3) and (0, 4) have no common items
+    assert sim[0, 3] == 0
+    assert sim[0, 4] == 0
+
+    # check that floating-point ratings are handled correctly
+    dot_product56 = 1 * 1.5 + 3 * 3.5 + 2 * 2.5
+    assert sim[5, 6] == (dot_product56 /
+                         ((1 ** 2 + 3 ** 2 + 2 ** 2) *
+                          (1.5 ** 2 + 3.5 ** 2 + 2.5 ** 2)) ** 0.5
+                         )
+
+    # ensure min_support is taken into account. Only users 1 and 2 have at
+    # least 4 common ratings.
+    sim = sims.cosine(n_x, yr, min_support=4)
+    for i in range(n_x):
+        for j in range(i + 1, n_x):
+            if i != 1 and j != 2:
+                assert sim[i, j] == 0
+
+
+def test_cosine_full_vectors_sim():
+    """Tests for the cosine similarity over full rating vectors."""
+
+    yr = yr_global.copy()
+
+    # # shuffle every rating list, to ensure the order in which ratings are
+    # # processed does not matter (it's important because it used to be error
+    # # prone when we were using itertools.combinations)
+    # for _, ratings in yr.items():
+    #     random.shuffle(ratings)
+
+    sim = sims.cosine(n_x, yr, min_support=1, common_ratings_only=False)
+
+    # check symmetry and bounds (as ratings are > 0, cosine sim must be >= 0)
+    for xi in range(n_x):
+        assert sim[xi, xi] == 1
+        for xj in range(n_x):
+            assert sim[xi, xj] == sim[xj, xi]
+            assert 0 <= sim[xi, xj] <= 1
+
+    # users 0, 1 and 2 rate differently once non-common items are considered
+    assert sim[0, 1] < 1
+    assert sim[0, 2] < 1
+
+    # pairs of users (0, 3) and (0, 4) have no common items
     assert sim[0, 3] == 0
     assert sim[0, 4] == 0
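
Reviewer note, not part of the patch: a minimal sketch of what the new flag
changes, calling the patched cosine() directly on the fixture from
tests/test_similarities.py. End users would instead set the
'common_ratings_only' key in sim_options, as documented in the .rst hunk
above; the `sims` import alias simply mirrors the test module.

    from surprise import similarities as sims

    n_x = 8  # number of users
    # item -> list of (user, rating) pairs, as in tests/test_similarities.py
    yr = {
        0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 1.5), (7, 3)],
        1: [(0, 4), (1, 4), (2, 4)],
        2: [(2, 5), (3, 2), (4, 3)],
        3: [(1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)],
        4: [(1, 5), (2, 1), (5, 2), (6, 2.5), (7, 2.5)],
    }

    sim_common = sims.cosine(n_x, yr, min_support=1)  # default: old behaviour
    sim_full = sims.cosine(n_x, yr, min_support=1, common_ratings_only=False)

    # Users 0 and 1 agree on every item they both rated, so the
    # common-ratings cosine is exactly 1; over the full vectors (missing
    # ratings substituted with 0) the similarity drops below 1.
    assert sim_common[0, 1] == 1
    assert sim_full[0, 1] < 1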
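
Also not part of the patch: an independent NumPy cross-check of what
common_ratings_only=False is supposed to compute, namely a plain dense
cosine between full rating vectors with 0 standing in for missing ratings.
np.allclose is used rather than strict equality, since the dense computation
need not match the Cython accumulation order bit for bit.

    import numpy as np

    from surprise import similarities as sims

    # Dense 8-users x 5-items matrix equivalent to the yr_global fixture;
    # 0 means "no rating", which is exactly the substitution performed by
    # cosine_full_rating_vectors.
    R = np.array([
        [3.0, 4.0, 0.0, 0.0, 0.0],
        [3.0, 4.0, 0.0, 1.0, 5.0],
        [3.0, 4.0, 5.0, 4.0, 1.0],
        [0.0, 0.0, 2.0, 2.0, 0.0],
        [0.0, 0.0, 3.0, 3.0, 0.0],
        [1.0, 0.0, 0.0, 3.0, 2.0],
        [1.5, 0.0, 0.0, 3.5, 2.5],
        [3.0, 0.0, 0.0, 2.0, 2.5],
    ])
    n_x, n_y = R.shape

    # Rebuild the sparse item -> [(user, rating)] structure the function
    # expects, keeping only the non-missing entries.
    yr = {y: [(x, R[x, y]) for x in range(n_x) if R[x, y] != 0]
          for y in range(n_y)}

    # Reference result: cosine between rows, diagonal pinned to 1 as in the
    # patch (every user here has at least one rating, so no 0-norm rows).
    norms = np.linalg.norm(R, axis=1)
    expected = np.dot(R, R.T) / np.outer(norms, norms)
    np.fill_diagonal(expected, 1)

    sim = sims.cosine(n_x, yr, min_support=1, common_ratings_only=False)
    assert np.allclose(sim, expected)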