From 8a3532f57dc9804c00ec73abe93d7a843de0a84a Mon Sep 17 00:00:00 2001 From: martincousi Date: Mon, 26 Mar 2018 16:45:55 -0400 Subject: [PATCH 01/45] added asym_rmse and asym_mae --- surprise/accuracy.py | 75 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/surprise/accuracy.py b/surprise/accuracy.py index 1e9e4855..04ffca0f 100644 --- a/surprise/accuracy.py +++ b/surprise/accuracy.py @@ -88,6 +88,81 @@ def mae(predictions, verbose=True): return mae_ +def asym_rmse(predictions, weight=0.5, verbose=True): + """Compute Asymmetric RMSE (Root Mean Squared Error). + + .. math:: + \\text{Asymmetric RMSE} = \\sqrt{\\frac{1}{|\\hat{R}|} + \\sum_{\\hat{r}_{ui} \in \\hat{R}}(r_{ui} - \\hat{r}_{ui})^2 |\\omega + - 1_{r_{ui} - \\hat{r}_{ui} < 0}|}. + + Args: + predictions (:obj:`list` of :obj:`Prediction\ + `): + A list of predictions, as returned by the :meth:`test() + ` method. + weight (int): Weight used to characterize asymmetry. + verbose: If True, will print computed value. Default is ``True``. + + + Returns: + The Asymmetric Root Mean Squared Error of predictions. + + Raises: + ValueError: When ``predictions`` is empty. + """ + + if not predictions: + raise ValueError('Prediction list is empty.') + + res = np.array([float(true_r - est) + for (_, _, true_r, est, _) in predictions]) + asym_rmse_ = np.sqrt(np.mean(res**2 * np.abs(weight - + (res<0).astype(int)))) + + if verbose: + print('Asymmetric RMSE: {0:1.4f}'.format(asym_rmse_)) + + return asym_rmse_ + + +def asym_mae(predictions, weight=0.5, verbose=True): + """Compute Asymmetric MAE (Mean Absolute Error). + + .. math:: + \\text{Asymmetric MAE} = \\frac{1}{|\\hat{R}|} \\sum_{\\hat{r}_{ui} \in + \\hat{R}}|r_{ui} - \\hat{r}_{ui}| |\\omega - 1_{r_{ui} - \\hat{r}_{ui} + < 0}|. + + Args: + predictions (:obj:`list` of :obj:`Prediction\ + `): + A list of predictions, as returned by the :meth:`test() + ` method. + weight (int): Weight used to characterize asymmetry. + verbose: If True, will print computed value. Default is ``True``. + + + Returns: + The Asymmetric Mean Absolute Error of predictions. + + Raises: + ValueError: When ``predictions`` is empty. + """ + + if not predictions: + raise ValueError('Prediction list is empty.') + + res = np.array([float(true_r - est) + for (_, _, true_r, est, _) in predictions]) + asym_mae_ = np.mean(np.abs(res) * np.abs(weight - (res<0).astype(int))) + + if verbose: + print('Asymmetric MAE: {0:1.4f}'.format(asym_mae_)) + + return asym_mae_ + + def fcp(predictions, verbose=True): """Compute FCP (Fraction of Concordant Pairs). From 3f6b1d029d6fbada68bed45268675661da62f4dd Mon Sep 17 00:00:00 2001 From: martincousi Date: Tue, 27 Mar 2018 13:53:19 -0400 Subject: [PATCH 02/45] disable print in AlgoBase.compute_baselines() --- surprise/prediction_algorithms/algo_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/surprise/prediction_algorithms/algo_base.py b/surprise/prediction_algorithms/algo_base.py index 844cb44e..ce4a3e50 100644 --- a/surprise/prediction_algorithms/algo_base.py +++ b/surprise/prediction_algorithms/algo_base.py @@ -240,7 +240,7 @@ def compute_baselines(self): method_name = self.bsl_options.get('method', 'als') try: - print('Estimating biases using', method_name + '...') + # print('Estimating biases using', method_name + '...') self.bu, self.bi = method[method_name](self) return self.bu, self.bi except KeyError: From daab1bac3b19a02b84085ad280c89a7de3016dcf Mon Sep 17 00:00:00 2001 From: martincousi Date: Wed, 28 Mar 2018 14:34:35 -0400 Subject: [PATCH 03/45] Cancel printing of computation of similarities --- surprise/prediction_algorithms/algo_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/surprise/prediction_algorithms/algo_base.py b/surprise/prediction_algorithms/algo_base.py index ce4a3e50..3a80c4d4 100644 --- a/surprise/prediction_algorithms/algo_base.py +++ b/surprise/prediction_algorithms/algo_base.py @@ -287,9 +287,9 @@ def compute_similarities(self): args += [self.trainset.global_mean, bx, by, shrinkage] try: - print('Computing the {0} similarity matrix...'.format(name)) + # print('Computing the {0} similarity matrix...'.format(name)) sim = construction_func[name](*args) - print('Done computing similarity matrix.') + # print('Done computing similarity matrix.') return sim except KeyError: raise NameError('Wrong sim name ' + name + '. Allowed values ' + From 05ef072d85ba00dd821a57122c2c9a689b95f401 Mon Sep 17 00:00:00 2001 From: martincousi Date: Wed, 28 Mar 2018 16:30:31 -0400 Subject: [PATCH 04/45] Cancel printing of similiraty computation --- surprise/prediction_algorithms/algo_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/surprise/prediction_algorithms/algo_base.py b/surprise/prediction_algorithms/algo_base.py index ce4a3e50..3a80c4d4 100644 --- a/surprise/prediction_algorithms/algo_base.py +++ b/surprise/prediction_algorithms/algo_base.py @@ -287,9 +287,9 @@ def compute_similarities(self): args += [self.trainset.global_mean, bx, by, shrinkage] try: - print('Computing the {0} similarity matrix...'.format(name)) + # print('Computing the {0} similarity matrix...'.format(name)) sim = construction_func[name](*args) - print('Done computing similarity matrix.') + # print('Done computing similarity matrix.') return sim except KeyError: raise NameError('Wrong sim name ' + name + '. Allowed values ' + From 902246f32cccd077517f25a6c972f528dcfa023d Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 29 Mar 2018 14:37:26 -0400 Subject: [PATCH 05/45] add load_features_df() method --- .gitignore | 3 ++- surprise/dataset.py | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index bd32b905..24fd8009 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,5 @@ _site .coverage tags -settings.json \ No newline at end of file +settings.json +surprise/.DS_Store diff --git a/surprise/dataset.py b/surprise/dataset.py index 17638b6c..f5df2f6a 100644 --- a/surprise/dataset.py +++ b/surprise/dataset.py @@ -53,6 +53,8 @@ class Dataset: def __init__(self, reader): self.reader = reader + self.raw_user_features = None + self.raw_item_features = None @classmethod def load_builtin(cls, name='ml-100k'): @@ -165,6 +167,28 @@ def load_from_df(cls, df, reader): return DatasetAutoFolds(reader=reader, df=df) + def load_features_df(self, features_df, user_features=True): + """Load features from a pandas dataframe into a dataset. + + Use this if you want to add user or item features to a dataset. Only + certain prediction algorithms in the :mod:`prediction_algorithms` + package support this additional data. + + Args: + features_df(`Dataframe`): The dataframe containing the features. It + must have two columns or more, corresponding to the user or + item (raw) ids, and the features, in this order. + user_features(:obj:`bool`): Whether the features are for the users + or the items. Default is ``True``. + """ + + if user_features: + self.user_features_df = features_df + self.raw_user_features = features_df.values.tolist() + else: + self.item_features_df = features_df + self.raw_item_features = features_df.values.tolist() + def read_ratings(self, file_name): """Return a list of ratings (user, item, rating, timestamp) read from file_name""" From fb64e9897ab40c1c072838305c4e11215cfd5aa8 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 29 Mar 2018 16:07:15 -0400 Subject: [PATCH 06/45] modified construct_trainset() and load_features_df() --- surprise/dataset.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/surprise/dataset.py b/surprise/dataset.py index f5df2f6a..8953a105 100644 --- a/surprise/dataset.py +++ b/surprise/dataset.py @@ -53,8 +53,8 @@ class Dataset: def __init__(self, reader): self.reader = reader - self.raw_user_features = None - self.raw_item_features = None + self.user_features = None + self.item_features = None @classmethod def load_builtin(cls, name='ml-100k'): @@ -184,10 +184,16 @@ def load_features_df(self, features_df, user_features=True): if user_features: self.user_features_df = features_df - self.raw_user_features = features_df.values.tolist() + self.user_features = {urid: features for (urid, *features) in + features_df.itertuples(index=False)} + self.user_features_labels = features_df.columns.values.tolist()[1:] + self.user_features_nb = len(self.user_features_labels) else: self.item_features_df = features_df - self.raw_item_features = features_df.values.tolist() + self.item_features = {irid: features for (irid, *features) in + features_df.itertuples(index=False)} + self.item_features_labels = features_df.columns.values.tolist()[1:] + self.item_features_nb = len(self.item_features_labels) def read_ratings(self, file_name): """Return a list of ratings (user, item, rating, timestamp) read from @@ -231,6 +237,8 @@ def construct_trainset(self, raw_trainset): ur = defaultdict(list) ir = defaultdict(list) + u_features = defaultdict(list) + i_features = defaultdict(list) # user raw id, item raw id, translated rating, time stamp for urid, irid, r, timestamp in raw_trainset: @@ -240,12 +248,26 @@ def construct_trainset(self, raw_trainset): uid = current_u_index raw2inner_id_users[urid] = current_u_index current_u_index += 1 + if self.user_features is not None: + try: + u_features[uid] = self.user_features[urid] + except KeyError: + print('user ' + urid + ' does not exist in ' + + 'self.user_features') + raise try: iid = raw2inner_id_items[irid] except KeyError: iid = current_i_index raw2inner_id_items[irid] = current_i_index current_i_index += 1 + if self.item_features is not None: + try: + i_features[iid] = self.item_features[irid] + except KeyError: + print('item ' + irid + ' does not exist in ' + + 'self.item_features') + raise ur[uid].append((iid, r)) ir[iid].append((uid, r)) From 13f3a287ca7fc2287c68d1919d3aea2721d05a6e Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 29 Mar 2018 16:18:00 -0400 Subject: [PATCH 07/45] modified Trainset.__init__() --- surprise/dataset.py | 6 ++++++ surprise/trainset.py | 15 ++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/surprise/dataset.py b/surprise/dataset.py index 8953a105..0f386626 100644 --- a/surprise/dataset.py +++ b/surprise/dataset.py @@ -274,12 +274,18 @@ def construct_trainset(self, raw_trainset): n_users = len(ur) # number of users n_items = len(ir) # number of items + n_user_features = len(u_features) + n_item_features = len(i_features) n_ratings = len(raw_trainset) trainset = Trainset(ur, ir, + u_features, + i_features, n_users, n_items, + n_user_features, + n_item_features, n_ratings, self.reader.rating_scale, self.reader.offset, diff --git a/surprise/trainset.py b/surprise/trainset.py index ebb95204..885bc41c 100644 --- a/surprise/trainset.py +++ b/surprise/trainset.py @@ -33,21 +33,34 @@ class Trainset: ir(:obj:`defaultdict` of :obj:`list`): The items ratings. This is a dictionary containing lists of tuples of the form ``(user_inner_id, rating)``. The keys are item inner ids. + u_features(:obj:`defaultdict` of :obj:`list`): The user features. This + is a dictionary containing lists of features. The keys are user + inner ids. + i_features(:obj:`defaultdict` of :obj:`list`): The item features. This + is a dictionary containing lists of features. The keys are item + inner ids. n_users: Total number of users :math:`|U|`. n_items: Total number of items :math:`|I|`. + n_user_features: Total number of user features. + n_item_features: Total number of item features. n_ratings: Total number of ratings :math:`|R_{train}|`. rating_scale(tuple): The minimum and maximal rating of the rating scale. global_mean: The mean of all ratings :math:`\\mu`. """ - def __init__(self, ur, ir, n_users, n_items, n_ratings, rating_scale, + def __init__(self, ur, ir, u_features, i_features, n_users, n_items, + n_user_features, n_item_features, n_ratings, rating_scale, offset, raw2inner_id_users, raw2inner_id_items): self.ur = ur self.ir = ir + self.u_features = u_features + self.i_features = i_features self.n_users = n_users self.n_items = n_items + self.n_user_features = n_user_features + self.n_item_features = n_item_features self.n_ratings = n_ratings self.rating_scale = rating_scale self.offset = offset From 900c0c0cb02e15e29b107657f1b3b33b9f3416c9 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 29 Mar 2018 17:23:02 -0400 Subject: [PATCH 08/45] corrected bugs in print statement --- surprise/dataset.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/surprise/dataset.py b/surprise/dataset.py index 0f386626..e93b6d2b 100644 --- a/surprise/dataset.py +++ b/surprise/dataset.py @@ -195,6 +195,8 @@ def load_features_df(self, features_df, user_features=True): self.item_features_labels = features_df.columns.values.tolist()[1:] self.item_features_nb = len(self.item_features_labels) + return self + def read_ratings(self, file_name): """Return a list of ratings (user, item, rating, timestamp) read from file_name""" @@ -252,7 +254,7 @@ def construct_trainset(self, raw_trainset): try: u_features[uid] = self.user_features[urid] except KeyError: - print('user ' + urid + ' does not exist in ' + + print('user ' + str(urid) + ' does not exist in ' + 'self.user_features') raise try: @@ -265,7 +267,7 @@ def construct_trainset(self, raw_trainset): try: i_features[iid] = self.item_features[irid] except KeyError: - print('item ' + irid + ' does not exist in ' + + print('item ' + str(irid) + ' does not exist in ' + 'self.item_features') raise From 68ccfca4f3645c94bbc6e43fc7ea2b587cee2f24 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 29 Mar 2018 21:01:45 -0400 Subject: [PATCH 09/45] use user_features_nb to test if initialized --- surprise/dataset.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/surprise/dataset.py b/surprise/dataset.py index e93b6d2b..36b96e36 100644 --- a/surprise/dataset.py +++ b/surprise/dataset.py @@ -53,8 +53,8 @@ class Dataset: def __init__(self, reader): self.reader = reader - self.user_features = None - self.item_features = None + self.user_features_nb = 0 + self.item_features_nb = 0 @classmethod def load_builtin(cls, name='ml-100k'): @@ -250,7 +250,7 @@ def construct_trainset(self, raw_trainset): uid = current_u_index raw2inner_id_users[urid] = current_u_index current_u_index += 1 - if self.user_features is not None: + if self.user_features_nb > 0: try: u_features[uid] = self.user_features[urid] except KeyError: @@ -263,7 +263,7 @@ def construct_trainset(self, raw_trainset): iid = current_i_index raw2inner_id_items[irid] = current_i_index current_i_index += 1 - if self.item_features is not None: + if self.item_features_nb > 0: try: i_features[iid] = self.item_features[irid] except KeyError: @@ -276,8 +276,6 @@ def construct_trainset(self, raw_trainset): n_users = len(ur) # number of users n_items = len(ir) # number of items - n_user_features = len(u_features) - n_item_features = len(i_features) n_ratings = len(raw_trainset) trainset = Trainset(ur, @@ -286,8 +284,8 @@ def construct_trainset(self, raw_trainset): i_features, n_users, n_items, - n_user_features, - n_item_features, + self.user_features_nb, + self.item_features_nb, n_ratings, self.reader.rating_scale, self.reader.offset, From f7fa4d89eff119c05437089ab8102af41b63c06f Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 29 Mar 2018 21:51:13 -0400 Subject: [PATCH 10/45] revert back changes to accuracy.py --- surprise/accuracy.py | 75 -------------------------------------------- 1 file changed, 75 deletions(-) diff --git a/surprise/accuracy.py b/surprise/accuracy.py index 04ffca0f..1e9e4855 100644 --- a/surprise/accuracy.py +++ b/surprise/accuracy.py @@ -88,81 +88,6 @@ def mae(predictions, verbose=True): return mae_ -def asym_rmse(predictions, weight=0.5, verbose=True): - """Compute Asymmetric RMSE (Root Mean Squared Error). - - .. math:: - \\text{Asymmetric RMSE} = \\sqrt{\\frac{1}{|\\hat{R}|} - \\sum_{\\hat{r}_{ui} \in \\hat{R}}(r_{ui} - \\hat{r}_{ui})^2 |\\omega - - 1_{r_{ui} - \\hat{r}_{ui} < 0}|}. - - Args: - predictions (:obj:`list` of :obj:`Prediction\ - `): - A list of predictions, as returned by the :meth:`test() - ` method. - weight (int): Weight used to characterize asymmetry. - verbose: If True, will print computed value. Default is ``True``. - - - Returns: - The Asymmetric Root Mean Squared Error of predictions. - - Raises: - ValueError: When ``predictions`` is empty. - """ - - if not predictions: - raise ValueError('Prediction list is empty.') - - res = np.array([float(true_r - est) - for (_, _, true_r, est, _) in predictions]) - asym_rmse_ = np.sqrt(np.mean(res**2 * np.abs(weight - - (res<0).astype(int)))) - - if verbose: - print('Asymmetric RMSE: {0:1.4f}'.format(asym_rmse_)) - - return asym_rmse_ - - -def asym_mae(predictions, weight=0.5, verbose=True): - """Compute Asymmetric MAE (Mean Absolute Error). - - .. math:: - \\text{Asymmetric MAE} = \\frac{1}{|\\hat{R}|} \\sum_{\\hat{r}_{ui} \in - \\hat{R}}|r_{ui} - \\hat{r}_{ui}| |\\omega - 1_{r_{ui} - \\hat{r}_{ui} - < 0}|. - - Args: - predictions (:obj:`list` of :obj:`Prediction\ - `): - A list of predictions, as returned by the :meth:`test() - ` method. - weight (int): Weight used to characterize asymmetry. - verbose: If True, will print computed value. Default is ``True``. - - - Returns: - The Asymmetric Mean Absolute Error of predictions. - - Raises: - ValueError: When ``predictions`` is empty. - """ - - if not predictions: - raise ValueError('Prediction list is empty.') - - res = np.array([float(true_r - est) - for (_, _, true_r, est, _) in predictions]) - asym_mae_ = np.mean(np.abs(res) * np.abs(weight - (res<0).astype(int))) - - if verbose: - print('Asymmetric MAE: {0:1.4f}'.format(asym_mae_)) - - return asym_mae_ - - def fcp(predictions, verbose=True): """Compute FCP (Fraction of Concordant Pairs). From c6591ae125c7a9016c89099f36b36b619ebbefc5 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 29 Mar 2018 21:52:26 -0400 Subject: [PATCH 11/45] revert back changes to AlgoBase --- surprise/prediction_algorithms/algo_base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/surprise/prediction_algorithms/algo_base.py b/surprise/prediction_algorithms/algo_base.py index 3a80c4d4..844cb44e 100644 --- a/surprise/prediction_algorithms/algo_base.py +++ b/surprise/prediction_algorithms/algo_base.py @@ -240,7 +240,7 @@ def compute_baselines(self): method_name = self.bsl_options.get('method', 'als') try: - # print('Estimating biases using', method_name + '...') + print('Estimating biases using', method_name + '...') self.bu, self.bi = method[method_name](self) return self.bu, self.bi except KeyError: @@ -287,9 +287,9 @@ def compute_similarities(self): args += [self.trainset.global_mean, bx, by, shrinkage] try: - # print('Computing the {0} similarity matrix...'.format(name)) + print('Computing the {0} similarity matrix...'.format(name)) sim = construction_func[name](*args) - # print('Done computing similarity matrix.') + print('Done computing similarity matrix.') return sim except KeyError: raise NameError('Wrong sim name ' + name + '. Allowed values ' + From e31e857a7aa094326d36e79e24729a881f7ba31e Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 29 Mar 2018 21:55:52 -0400 Subject: [PATCH 12/45] Update .gitignore --- .gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitignore b/.gitignore index 24fd8009..bd9d038c 100644 --- a/.gitignore +++ b/.gitignore @@ -23,5 +23,3 @@ _site .coverage tags -settings.json -surprise/.DS_Store From 7d679630ba2f2329eb4ea4e9c7f1fba2a9720fbf Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 29 Mar 2018 21:56:19 -0400 Subject: [PATCH 13/45] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index bd9d038c..45019cb0 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ _site .coverage tags +settings.json From 73bea5018ee1137bba1f0792428d133e969a4c04 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 29 Mar 2018 22:17:41 -0400 Subject: [PATCH 14/45] fixed python 2 compatibility --- surprise/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/surprise/dataset.py b/surprise/dataset.py index 36b96e36..61170bb6 100644 --- a/surprise/dataset.py +++ b/surprise/dataset.py @@ -184,13 +184,13 @@ def load_features_df(self, features_df, user_features=True): if user_features: self.user_features_df = features_df - self.user_features = {urid: features for (urid, *features) in + self.user_features = {tup[0]: tup[1:] for tup in features_df.itertuples(index=False)} self.user_features_labels = features_df.columns.values.tolist()[1:] self.user_features_nb = len(self.user_features_labels) else: self.item_features_df = features_df - self.item_features = {irid: features for (irid, *features) in + self.item_features = {tup[0]: tup[1:] for tup in features_df.itertuples(index=False)} self.item_features_labels = features_df.columns.values.tolist()[1:] self.item_features_nb = len(self.item_features_labels) From 4063da8286d165f7bd0b47b1bcc4e9006ecc4c31 Mon Sep 17 00:00:00 2001 From: martincousi Date: Wed, 4 Apr 2018 09:51:43 -0400 Subject: [PATCH 15/45] construction of Lasso.fit() --- surprise/__init__.py | 4 +- surprise/dataset.py | 3 +- surprise/prediction_algorithms/__init__.py | 3 +- surprise/prediction_algorithms/features.py | 97 +++++++++++++++++++ .../matrix_factorization.pyx | 2 +- surprise/trainset.py | 24 +++++ 6 files changed, 129 insertions(+), 4 deletions(-) create mode 100644 surprise/prediction_algorithms/features.py diff --git a/surprise/__init__.py b/surprise/__init__.py index e87ca980..82de2460 100644 --- a/surprise/__init__.py +++ b/surprise/__init__.py @@ -12,6 +12,7 @@ from .prediction_algorithms import NMF from .prediction_algorithms import SlopeOne from .prediction_algorithms import CoClustering +from .prediction_algorithms import Lasso from .prediction_algorithms import PredictionImpossible from .prediction_algorithms import Prediction @@ -30,6 +31,7 @@ 'KNNWithMeans', 'KNNBaseline', 'SVD', 'SVDpp', 'NMF', 'SlopeOne', 'CoClustering', 'PredictionImpossible', 'Prediction', 'Dataset', 'Reader', 'Trainset', 'evaluate', 'print_perf', 'GridSearch', - 'dump', 'KNNWithZScore', 'get_dataset_dir', 'model_selection'] + 'dump', 'KNNWithZScore', 'get_dataset_dir', 'model_selection', + 'Lasso'] __version__ = get_distribution('scikit-surprise').version diff --git a/surprise/dataset.py b/surprise/dataset.py index 61170bb6..980f797d 100644 --- a/surprise/dataset.py +++ b/surprise/dataset.py @@ -239,6 +239,7 @@ def construct_trainset(self, raw_trainset): ur = defaultdict(list) ir = defaultdict(list) + u_features = defaultdict(list) i_features = defaultdict(list) @@ -297,7 +298,7 @@ def construct_trainset(self, raw_trainset): def construct_testset(self, raw_testset): return [(ruid, riid, r_ui_trans) - for (ruid, riid, r_ui_trans, _) in raw_testset] + for (ruid, riid, r_ui_trans, __) in raw_testset] class DatasetUserFolds(Dataset): diff --git a/surprise/prediction_algorithms/__init__.py b/surprise/prediction_algorithms/__init__.py index d5ce8288..61091131 100644 --- a/surprise/prediction_algorithms/__init__.py +++ b/surprise/prediction_algorithms/__init__.py @@ -32,6 +32,7 @@ from .matrix_factorization import NMF from .slope_one import SlopeOne from .co_clustering import CoClustering +from .features import Lasso from .predictions import PredictionImpossible from .predictions import Prediction @@ -39,4 +40,4 @@ __all__ = ['AlgoBase', 'NormalPredictor', 'BaselineOnly', 'KNNBasic', 'KNNBaseline', 'KNNWithMeans', 'SVD', 'SVDpp', 'NMF', 'SlopeOne', 'CoClustering', 'PredictionImpossible', 'Prediction', - 'KNNWithZScore'] + 'KNNWithZScore', 'Lasso'] diff --git a/surprise/prediction_algorithms/features.py b/surprise/prediction_algorithms/features.py new file mode 100644 index 00000000..b7d435c7 --- /dev/null +++ b/surprise/prediction_algorithms/features.py @@ -0,0 +1,97 @@ +""" +the :mod:`features` module includes some features-based algorithms. +""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import numpy as np +from six import iteritems +import heapq +from sklearn import linear_model + +from .predictions import PredictionImpossible +from .algo_base import AlgoBase + + +class Lasso(AlgoBase): + """A basic linear regression algorithm. + + The prediction :math:`\\hat{r}_{ui}` is set as: + + .. math:: + \hat{r}_{ui} = \\frac{ + \\sum\\limits_{v \in N^k_i(u)} \\text{sim}(u, v) \cdot r_{vi}} + {\\sum\\limits_{v \in N^k_i(u)} \\text{sim}(u, v)} + + or + + .. math:: + \hat{r}_{ui} = \\frac{ + \\sum\\limits_{j \in N^k_u(i)} \\text{sim}(i, j) \cdot r_{uj}} + {\\sum\\limits_{j \in N^k_u(j)} \\text{sim}(i, j)} + + depending on the ``user_based`` field of the ``sim_options`` parameter. + + Args: + k(int): The (max) number of neighbors to take into account for + aggregation (see :ref:`this note `). Default is + ``40``. + min_k(int): The minimum number of neighbors to take into account for + aggregation. If there are not enough neighbors, the prediction is + set the the global mean of all ratings. Default is ``1``. + sim_options(dict): A dictionary of options for the similarity + measure. See :ref:`similarity_measures_configuration` for accepted + options. + """ + + def __init__(self, **kwargs): + + AlgoBase.__init__(self, **kwargs) + + def fit(self, trainset): + + AlgoBase.fit(self, trainset) + self.lasso(trainset) + + return self + + def lasso(self, trainset): + + if (self.trainset.n_user_features == 0 or + self.trainset.n_item_features == 0): + raise ValueError('trainset does not contain user and/or item features.') + + n_ratings = self.trainset.n_ratings + n_uf = self.trainset.n_user_features + n_if = self.trainset.n_item_features + u_features = self.trainset.u_features + i_features = self.trainset.i_features + + X = np.empty((n_ratings, n_uf + n_if)) + y = np.empty((n_ratings,)) + for k, (uid, iid, rating) in enumerate(self.trainset.all_ratings()): + y[k] = rating + X[k, :n_uf] = u_features[uid] + X[k, n_uf:] = i_features[iid] + + reg = linear_model.Lasso(alpha=0.1) + reg.fit(X, y) + + # self.X = X + # self.y = y + self.coef = reg.coef_ + self.intercept = reg.intercept_ + + def estimate(self, u, i): + + if not (self.trainset.has_user_features(u) and + self.trainset.has_item_features(i)): + raise PredictionImpossible('User and/or item features ' + 'are unknown.') + + x = np.concatenate((self.trainset.u_features[u], + self.trainset.i_features[i])) + + est = self.intercept + np.dot(x, self.coef) + + return est diff --git a/surprise/prediction_algorithms/matrix_factorization.pyx b/surprise/prediction_algorithms/matrix_factorization.pyx index 0e898632..fe2f1e56 100644 --- a/surprise/prediction_algorithms/matrix_factorization.pyx +++ b/surprise/prediction_algorithms/matrix_factorization.pyx @@ -275,7 +275,7 @@ class SVD(AlgoBase): if known_user and known_item: est = np.dot(self.qi[i], self.pu[u]) else: - raise PredictionImpossible('User and item are unkown.') + raise PredictionImpossible('User and item are unknown.') return est diff --git a/surprise/trainset.py b/surprise/trainset.py index 885bc41c..4da7ddf1 100644 --- a/surprise/trainset.py +++ b/surprise/trainset.py @@ -100,6 +100,30 @@ def knows_item(self, iid): return iid in self.ir + def has_user_features(self, uid): + """Indicate if the user features are part of the trainset. + + Args: + uid(int): The (inner) user id. See :ref:`this + note`. + Returns: + ``True`` if user features are part of the trainset, else ``False``. + """ + + return uid in self.u_features + + def has_item_features(self, iid): + """Indicate if the item features are part of the trainset. + + Args: + iid(int): The (inner) item id. See :ref:`this + note`. + Returns: + ``True`` if item features are part of the trainset, else ``False``. + """ + + return iid in self.i_features + def to_inner_uid(self, ruid): """Convert a **user** raw id to an inner id. From 34dd04bfbfafbb9578993625866f4fb9ae8e87ab Mon Sep 17 00:00:00 2001 From: martincousi Date: Wed, 4 Apr 2018 10:26:44 -0400 Subject: [PATCH 16/45] modified predict and estimate methods --- surprise/prediction_algorithms/algo_base.py | 18 ++++++++++++------ surprise/prediction_algorithms/features.py | 17 ++++++++--------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/surprise/prediction_algorithms/algo_base.py b/surprise/prediction_algorithms/algo_base.py index 844cb44e..2f8cf950 100644 --- a/surprise/prediction_algorithms/algo_base.py +++ b/surprise/prediction_algorithms/algo_base.py @@ -9,6 +9,7 @@ class :class:`AlgoBase` from which every single prediction algorithm has to import warnings from six import get_unbound_function as guf +import numpy as np from .. import similarities as sims from .predictions import PredictionImpossible @@ -37,7 +38,7 @@ def __init__(self, **kwargs): self.skip_train = False if (guf(self.__class__.fit) is guf(AlgoBase.fit) and - guf(self.__class__.train) is not guf(AlgoBase.train)): + guf(self.__class__.train) is not guf(AlgoBase.train)): warnings.warn('It looks like this algorithm (' + str(self.__class__) + ') implements train() ' @@ -96,7 +97,8 @@ def fit(self, trainset): return self - def predict(self, uid, iid, r_ui=None, clip=True, verbose=False): + def predict(self, uid, iid, u_features=[], i_features=[], r_ui=None, + clip=True, verbose=False): """Compute the rating prediction for given user and item. The ``predict`` method converts raw ids to inner ids and then calls the @@ -108,6 +110,10 @@ def predict(self, uid, iid, r_ui=None, clip=True, verbose=False): Args: uid: (Raw) id of the user. See :ref:`this note`. iid: (Raw) id of the item. See :ref:`this note`. + u_features: List of user features in the same order as used in + the ``fit`` method. Optional, default is empty list. + i_features: List of item features in the same order as used in + the ``fit`` method. Optional, default is empty list. r_ui(float): The true rating :math:`r_{ui}`. Optional, default is ``None``. clip(bool): Whether to clip the estimation into the rating scale. @@ -143,7 +149,7 @@ def predict(self, uid, iid, r_ui=None, clip=True, verbose=False): details = {} try: - est = self.estimate(iuid, iiid) + est = self.estimate(iuid, iiid, u_features, i_features) # If the details dict was also returned if isinstance(est, tuple): @@ -240,7 +246,7 @@ def compute_baselines(self): method_name = self.bsl_options.get('method', 'als') try: - print('Estimating biases using', method_name + '...') + # print('Estimating biases using', method_name + '...') self.bu, self.bi = method[method_name](self) return self.bu, self.bi except KeyError: @@ -287,9 +293,9 @@ def compute_similarities(self): args += [self.trainset.global_mean, bx, by, shrinkage] try: - print('Computing the {0} similarity matrix...'.format(name)) + # print('Computing the {0} similarity matrix...'.format(name)) sim = construction_func[name](*args) - print('Done computing similarity matrix.') + # print('Done computing similarity matrix.') return sim except KeyError: raise NameError('Wrong sim name ' + name + '. Allowed values ' + diff --git a/surprise/prediction_algorithms/features.py b/surprise/prediction_algorithms/features.py index b7d435c7..774678af 100644 --- a/surprise/prediction_algorithms/features.py +++ b/surprise/prediction_algorithms/features.py @@ -59,7 +59,8 @@ def lasso(self, trainset): if (self.trainset.n_user_features == 0 or self.trainset.n_item_features == 0): - raise ValueError('trainset does not contain user and/or item features.') + raise ValueError('trainset does not contain user and/or item ' + 'features.') n_ratings = self.trainset.n_ratings n_uf = self.trainset.n_user_features @@ -82,16 +83,14 @@ def lasso(self, trainset): self.coef = reg.coef_ self.intercept = reg.intercept_ - def estimate(self, u, i): + def estimate(self, u, i, u_features, i_features): - if not (self.trainset.has_user_features(u) and - self.trainset.has_item_features(i)): - raise PredictionImpossible('User and/or item features ' - 'are unknown.') + features = np.concatenate([u_features, i_features]) - x = np.concatenate((self.trainset.u_features[u], - self.trainset.i_features[i])) + if len(features) != len(self.coef): + raise PredictionImpossible('User and/or item features ' + 'are incomplete.') - est = self.intercept + np.dot(x, self.coef) + est = self.intercept + np.dot(features, self.coef) return est From d275f8463fdf686f2e33ef22c42a5b8980bdce88 Mon Sep 17 00:00:00 2001 From: martincousi Date: Wed, 4 Apr 2018 16:35:18 -0400 Subject: [PATCH 17/45] include features in testset and prediction objects --- surprise/accuracy.py | 6 ++-- surprise/dataset.py | 32 +++++++++---------- surprise/evaluate.py | 2 ++ surprise/model_selection/split.py | 4 +-- surprise/prediction_algorithms/algo_base.py | 14 ++++---- .../prediction_algorithms/baseline_only.py | 2 +- surprise/prediction_algorithms/features.py | 18 ++++++++--- surprise/prediction_algorithms/knns.py | 8 ++--- surprise/prediction_algorithms/predictions.py | 13 +++++++- surprise/trainset.py | 24 ++++++++++---- 10 files changed, 78 insertions(+), 45 deletions(-) diff --git a/surprise/accuracy.py b/surprise/accuracy.py index 1e9e4855..d6e8a88f 100644 --- a/surprise/accuracy.py +++ b/surprise/accuracy.py @@ -45,7 +45,7 @@ def rmse(predictions, verbose=True): raise ValueError('Prediction list is empty.') mse = np.mean([float((true_r - est)**2) - for (_, _, true_r, est, _) in predictions]) + for (__, __, __, __, true_r, est, __) in predictions]) rmse_ = np.sqrt(mse) if verbose: @@ -80,7 +80,7 @@ def mae(predictions, verbose=True): raise ValueError('Prediction list is empty.') mae_ = np.mean([float(abs(true_r - est)) - for (_, _, true_r, est, _) in predictions]) + for (__, __, __, __, true_r, est, __) in predictions]) if verbose: print('MAE: {0:1.4f}'.format(mae_)) @@ -117,7 +117,7 @@ def fcp(predictions, verbose=True): nc_u = defaultdict(int) nd_u = defaultdict(int) - for u0, _, r0, est, _ in predictions: + for u0, __, __, __, r0, est, __ in predictions: predictions_u[u0].append((r0, est)) for u0, preds in iteritems(predictions_u): diff --git a/surprise/dataset.py b/surprise/dataset.py index 980f797d..00dd623f 100644 --- a/surprise/dataset.py +++ b/surprise/dataset.py @@ -55,6 +55,8 @@ def __init__(self, reader): self.reader = reader self.user_features_nb = 0 self.item_features_nb = 0 + self.user_features = {} + self.item_features = {} @classmethod def load_builtin(cls, name='ml-100k'): @@ -240,11 +242,11 @@ def construct_trainset(self, raw_trainset): ur = defaultdict(list) ir = defaultdict(list) - u_features = defaultdict(list) - i_features = defaultdict(list) + u_features = {} + i_features = {} # user raw id, item raw id, translated rating, time stamp - for urid, irid, r, timestamp in raw_trainset: + for urid, irid, r, __ in raw_trainset: try: uid = raw2inner_id_users[urid] except KeyError: @@ -252,12 +254,8 @@ def construct_trainset(self, raw_trainset): raw2inner_id_users[urid] = current_u_index current_u_index += 1 if self.user_features_nb > 0: - try: - u_features[uid] = self.user_features[urid] - except KeyError: - print('user ' + str(urid) + ' does not exist in ' + - 'self.user_features') - raise + u_features[uid] = self.user_features.get(urid, None) + try: iid = raw2inner_id_items[irid] except KeyError: @@ -265,12 +263,7 @@ def construct_trainset(self, raw_trainset): raw2inner_id_items[irid] = current_i_index current_i_index += 1 if self.item_features_nb > 0: - try: - i_features[iid] = self.item_features[irid] - except KeyError: - print('item ' + str(irid) + ' does not exist in ' + - 'self.item_features') - raise + i_features[iid] = self.item_features.get(irid, None) ur[uid].append((iid, r)) ir[iid].append((uid, r)) @@ -297,8 +290,13 @@ def construct_trainset(self, raw_trainset): def construct_testset(self, raw_testset): - return [(ruid, riid, r_ui_trans) - for (ruid, riid, r_ui_trans, __) in raw_testset] + testset = [] + for (ruid, riid, r_ui_trans, __) in raw_testset: + u_features = self.user_features.get(ruid, None) + i_features = self.item_features.get(riid, None) + testset.append((ruid, riid, u_features, i_features, r_ui_trans)) + + return testset class DatasetUserFolds(Dataset): diff --git a/surprise/evaluate.py b/surprise/evaluate.py index 19e80fa5..bb283356 100644 --- a/surprise/evaluate.py +++ b/surprise/evaluate.py @@ -301,6 +301,7 @@ class CaseInsensitiveDefaultDict(defaultdict): Used for the returned dict, so that users can use perf['RMSE'] or perf['rmse'] indifferently. """ + def __setitem__(self, key, value): super(CaseInsensitiveDefaultDict, self).__setitem__(key.lower(), value) @@ -333,4 +334,5 @@ def seed_and_eval(seed, *args): different processes.""" random.seed(seed) + return evaluate(*args, verbose=0) diff --git a/surprise/model_selection/split.py b/surprise/model_selection/split.py index 14697911..ee9dfc64 100644 --- a/surprise/model_selection/split.py +++ b/surprise/model_selection/split.py @@ -276,7 +276,7 @@ def split(self, data): self.test_size, self.train_size, len(data.raw_ratings)) rng = get_rng(self.random_state) - for _ in range(self.n_splits): + for __ in range(self.n_splits): if self.shuffle: permutation = rng.permutation(len(data.raw_ratings)) @@ -372,7 +372,7 @@ def split(self, data): Args: data(:obj:`Dataset`): The data containing - ratings that will be devided into trainsets and testsets. + ratings that will be divided into trainsets and testsets. Yields: tuple of (trainset, testset) diff --git a/surprise/prediction_algorithms/algo_base.py b/surprise/prediction_algorithms/algo_base.py index 2f8cf950..41692cae 100644 --- a/surprise/prediction_algorithms/algo_base.py +++ b/surprise/prediction_algorithms/algo_base.py @@ -9,7 +9,6 @@ class :class:`AlgoBase` from which every single prediction algorithm has to import warnings from six import get_unbound_function as guf -import numpy as np from .. import similarities as sims from .predictions import PredictionImpossible @@ -97,7 +96,7 @@ def fit(self, trainset): return self - def predict(self, uid, iid, u_features=[], i_features=[], r_ui=None, + def predict(self, uid, iid, u_features=None, i_features=None, r_ui=None, clip=True, verbose=False): """Compute the rating prediction for given user and item. @@ -111,9 +110,9 @@ def predict(self, uid, iid, u_features=[], i_features=[], r_ui=None, uid: (Raw) id of the user. See :ref:`this note`. iid: (Raw) id of the item. See :ref:`this note`. u_features: List of user features in the same order as used in - the ``fit`` method. Optional, default is empty list. + the ``fit`` method. Optional, default is ``None``. i_features: List of item features in the same order as used in - the ``fit`` method. Optional, default is empty list. + the ``fit`` method. Optional, default is ``None``. r_ui(float): The true rating :math:`r_{ui}`. Optional, default is ``None``. clip(bool): Whether to clip the estimation into the rating scale. @@ -172,7 +171,7 @@ def predict(self, uid, iid, u_features=[], i_features=[], r_ui=None, est = min(higher_bound, est) est = max(lower_bound, est) - pred = Prediction(uid, iid, r_ui, est, details) + pred = Prediction(uid, iid, u_features, i_features, r_ui, est, details) if verbose: print(pred) @@ -213,9 +212,12 @@ def test(self, testset, verbose=False): # The ratings are translated back to their original scale. predictions = [self.predict(uid, iid, + u_features, + i_features, r_ui_trans - self.trainset.offset, verbose=verbose) - for (uid, iid, r_ui_trans) in testset] + for (uid, iid, u_features, i_features, r_ui_trans) + in testset] return predictions def compute_baselines(self): diff --git a/surprise/prediction_algorithms/baseline_only.py b/surprise/prediction_algorithms/baseline_only.py index 85221114..b3590d57 100644 --- a/surprise/prediction_algorithms/baseline_only.py +++ b/surprise/prediction_algorithms/baseline_only.py @@ -35,7 +35,7 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, u_features, i_features): est = self.trainset.global_mean if self.trainset.knows_user(u): diff --git a/surprise/prediction_algorithms/features.py b/surprise/prediction_algorithms/features.py index 774678af..6133816b 100644 --- a/surprise/prediction_algorithms/features.py +++ b/surprise/prediction_algorithms/features.py @@ -72,8 +72,16 @@ def lasso(self, trainset): y = np.empty((n_ratings,)) for k, (uid, iid, rating) in enumerate(self.trainset.all_ratings()): y[k] = rating - X[k, :n_uf] = u_features[uid] - X[k, n_uf:] = i_features[iid] + try: + X[k, :n_uf] = u_features[uid] + except KeyError: + raise KeyError('No features for user ' + + str(self.trainset.to_raw_uid(uid))) + try: + X[k, n_uf:] = i_features[iid] + except KeyError: + raise KeyError('No features for item ' + + str(self.trainset.to_raw_iid(iid))) reg = linear_model.Lasso(alpha=0.1) reg.fit(X, y) @@ -87,9 +95,11 @@ def estimate(self, u, i, u_features, i_features): features = np.concatenate([u_features, i_features]) - if len(features) != len(self.coef): + if (u_features is None or + i_features is None or + len(features) != len(self.coef)): raise PredictionImpossible('User and/or item features ' - 'are incomplete.') + 'are missing.') est = self.intercept + np.dot(features, self.coef) diff --git a/surprise/prediction_algorithms/knns.py b/surprise/prediction_algorithms/knns.py index 069da4d3..410d5d17 100644 --- a/surprise/prediction_algorithms/knns.py +++ b/surprise/prediction_algorithms/knns.py @@ -96,7 +96,7 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, u_features, i_features): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') @@ -176,7 +176,7 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, u_features, i_features): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') @@ -271,7 +271,7 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, u_features, i_features): est = self.trainset.global_mean if self.trainset.knows_user(u): @@ -370,7 +370,7 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, u_features, i_features): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') diff --git a/surprise/prediction_algorithms/predictions.py b/surprise/prediction_algorithms/predictions.py index 76bc8ddc..9e971ee9 100644 --- a/surprise/prediction_algorithms/predictions.py +++ b/surprise/prediction_algorithms/predictions.py @@ -21,7 +21,8 @@ class PredictionImpossible(Exception): class Prediction(namedtuple('Prediction', - ['uid', 'iid', 'r_ui', 'est', 'details'])): + ['uid', 'iid', 'u_features', 'i_features', 'r_ui', + 'est', 'details'])): """A named tuple for storing the results of a prediction. It's wrapped in a class, but only for documentation and printing purposes. @@ -29,6 +30,8 @@ class Prediction(namedtuple('Prediction', Args: uid: The (raw) user id. See :ref:`this note`. iid: The (raw) item id. See :ref:`this note`. + u_features: The user features. + i_features: The item features. r_ui(float): The true rating :math:`r_{ui}`. est(float): The estimated rating :math:`\\hat{r}_{ui}`. details (dict): Stores additional details about the prediction that @@ -40,6 +43,14 @@ class Prediction(namedtuple('Prediction', def __str__(self): s = 'user: {uid:<10} '.format(uid=self.uid) s += 'item: {iid:<10} '.format(iid=self.iid) + if self.u_features is not None: + pass + else: + s += 'u_features = None ' + if self.i_features is not None: + pass + else: + s += 'i_features = None ' if self.r_ui is not None: s += 'r_ui = {r_ui:1.2f} '.format(r_ui=self.r_ui) else: diff --git a/surprise/trainset.py b/surprise/trainset.py index 4da7ddf1..fc1cebe7 100644 --- a/surprise/trainset.py +++ b/surprise/trainset.py @@ -237,8 +237,14 @@ def build_testset(self): cases where you want to to test your algorithm on the trainset. """ - return [(self.to_raw_uid(u), self.to_raw_iid(i), r) - for (u, i, r) in self.all_ratings()] + testset = [] + for (u, i, r) in self.all_ratings(): + u_features = self.u_features.get(u, None) + i_features = self.i_features.get(i, None) + testset.append((self.to_raw_uid(u), self.to_raw_iid(i), u_features, + i_features, r)) + + return testset def build_anti_testset(self, fill=None): """Return a list of ratings that can be used as a testset in the @@ -264,10 +270,14 @@ def build_anti_testset(self, fill=None): anti_testset = [] for u in self.all_users(): - user_items = set([j for (j, _) in self.ur[u]]) - anti_testset += [(self.to_raw_uid(u), self.to_raw_iid(i), fill) for - i in self.all_items() if - i not in user_items] + user_items = set([j for (j, __) in self.ur[u]]) + anti_testset += [(self.to_raw_uid(u), + self.to_raw_iid(i), + self.u_features.get(u, None), + self.i_features.get(i, None), + fill) + for i in self.all_items() + if i not in user_items] return anti_testset def all_users(self): @@ -292,7 +302,7 @@ def global_mean(self): It's only computed once.""" if self._global_mean is None: - self._global_mean = np.mean([r for (_, _, r) in + self._global_mean = np.mean([r for (__, __, r) in self.all_ratings()]) return self._global_mean From 14d1248229b74d4ca2e3181c6189dadb97cd62ba Mon Sep 17 00:00:00 2001 From: martincousi Date: Wed, 4 Apr 2018 17:53:21 -0400 Subject: [PATCH 18/45] update matrix factorization estimate method --- surprise/prediction_algorithms/matrix_factorization.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/surprise/prediction_algorithms/matrix_factorization.pyx b/surprise/prediction_algorithms/matrix_factorization.pyx index fe2f1e56..1a790e5f 100644 --- a/surprise/prediction_algorithms/matrix_factorization.pyx +++ b/surprise/prediction_algorithms/matrix_factorization.pyx @@ -253,7 +253,7 @@ class SVD(AlgoBase): self.pu = pu self.qi = qi - def estimate(self, u, i): + def estimate(self, u, i, u_features, i_features): # Should we cythonize this as well? known_user = self.trainset.knows_user(u) @@ -484,7 +484,7 @@ class SVDpp(AlgoBase): self.qi = qi self.yj = yj - def estimate(self, u, i): + def estimate(self, u, i, u_features, i_features): est = self.trainset.global_mean @@ -715,7 +715,7 @@ class NMF(AlgoBase): self.pu = pu self.qi = qi - def estimate(self, u, i): + def estimate(self, u, i, u_features, i_features): # Should we cythonize this as well? known_user = self.trainset.knows_user(u) @@ -737,6 +737,6 @@ class NMF(AlgoBase): if known_user and known_item: est = np.dot(self.qi[i], self.pu[u]) else: - raise PredictionImpossible('User and item are unkown.') + raise PredictionImpossible('User and item are unknown.') return est From a2b87c47b2613fde54532b8bc715df598e309820 Mon Sep 17 00:00:00 2001 From: martincousi Date: Wed, 4 Apr 2018 18:01:39 -0400 Subject: [PATCH 19/45] adapt estimate methods for all prediction algorithms --- surprise/prediction_algorithms/baseline_only.py | 2 +- surprise/prediction_algorithms/co_clustering.pyx | 2 +- surprise/prediction_algorithms/knns.py | 8 ++++---- surprise/prediction_algorithms/matrix_factorization.pyx | 6 +++--- surprise/prediction_algorithms/random_pred.py | 2 +- surprise/prediction_algorithms/slope_one.pyx | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/surprise/prediction_algorithms/baseline_only.py b/surprise/prediction_algorithms/baseline_only.py index b3590d57..e4d9778c 100644 --- a/surprise/prediction_algorithms/baseline_only.py +++ b/surprise/prediction_algorithms/baseline_only.py @@ -35,7 +35,7 @@ def fit(self, trainset): return self - def estimate(self, u, i, u_features, i_features): + def estimate(self, u, i, *__): est = self.trainset.global_mean if self.trainset.knows_user(u): diff --git a/surprise/prediction_algorithms/co_clustering.pyx b/surprise/prediction_algorithms/co_clustering.pyx index 408780fc..56dea0d9 100644 --- a/surprise/prediction_algorithms/co_clustering.pyx +++ b/surprise/prediction_algorithms/co_clustering.pyx @@ -236,7 +236,7 @@ class CoClustering(AlgoBase): return avg_cltr_u, avg_cltr_i, avg_cocltr - def estimate(self, u, i): + def estimate(self, u, i, *__): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): return self.trainset.global_mean diff --git a/surprise/prediction_algorithms/knns.py b/surprise/prediction_algorithms/knns.py index 410d5d17..4b71ccd5 100644 --- a/surprise/prediction_algorithms/knns.py +++ b/surprise/prediction_algorithms/knns.py @@ -96,7 +96,7 @@ def fit(self, trainset): return self - def estimate(self, u, i, u_features, i_features): + def estimate(self, u, i, *__): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') @@ -176,7 +176,7 @@ def fit(self, trainset): return self - def estimate(self, u, i, u_features, i_features): + def estimate(self, u, i, *__): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') @@ -271,7 +271,7 @@ def fit(self, trainset): return self - def estimate(self, u, i, u_features, i_features): + def estimate(self, u, i, *__): est = self.trainset.global_mean if self.trainset.knows_user(u): @@ -370,7 +370,7 @@ def fit(self, trainset): return self - def estimate(self, u, i, u_features, i_features): + def estimate(self, u, i, *__): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') diff --git a/surprise/prediction_algorithms/matrix_factorization.pyx b/surprise/prediction_algorithms/matrix_factorization.pyx index 1a790e5f..98a87e5e 100644 --- a/surprise/prediction_algorithms/matrix_factorization.pyx +++ b/surprise/prediction_algorithms/matrix_factorization.pyx @@ -253,7 +253,7 @@ class SVD(AlgoBase): self.pu = pu self.qi = qi - def estimate(self, u, i, u_features, i_features): + def estimate(self, u, i, *__): # Should we cythonize this as well? known_user = self.trainset.knows_user(u) @@ -484,7 +484,7 @@ class SVDpp(AlgoBase): self.qi = qi self.yj = yj - def estimate(self, u, i, u_features, i_features): + def estimate(self, u, i, *__): est = self.trainset.global_mean @@ -715,7 +715,7 @@ class NMF(AlgoBase): self.pu = pu self.qi = qi - def estimate(self, u, i, u_features, i_features): + def estimate(self, u, i, *__): # Should we cythonize this as well? known_user = self.trainset.knows_user(u) diff --git a/surprise/prediction_algorithms/random_pred.py b/surprise/prediction_algorithms/random_pred.py index 86c4dcf1..4196fd49 100644 --- a/surprise/prediction_algorithms/random_pred.py +++ b/surprise/prediction_algorithms/random_pred.py @@ -40,6 +40,6 @@ def fit(self, trainset): return self - def estimate(self, *_): + def estimate(self, *__): return np.random.normal(self.trainset.global_mean, self.sigma) diff --git a/surprise/prediction_algorithms/slope_one.pyx b/surprise/prediction_algorithms/slope_one.pyx index 8049a6cf..52320196 100644 --- a/surprise/prediction_algorithms/slope_one.pyx +++ b/surprise/prediction_algorithms/slope_one.pyx @@ -79,7 +79,7 @@ class SlopeOne(AlgoBase): return self - def estimate(self, u, i): + def estimate(self, u, i, *__): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') From 3c5f7e6beb23ea335e6517e7cb8d5834af3eb39e Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 09:41:19 -0400 Subject: [PATCH 20/45] add sklearn arguments to Lasso --- surprise/prediction_algorithms/features.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/surprise/prediction_algorithms/features.py b/surprise/prediction_algorithms/features.py index 6133816b..5c107902 100644 --- a/surprise/prediction_algorithms/features.py +++ b/surprise/prediction_algorithms/features.py @@ -44,9 +44,20 @@ class Lasso(AlgoBase): options. """ - def __init__(self, **kwargs): + def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + precompute=False, max_iter=1000, tol=0.0001, positive=False, + random_state=None, selection='cyclic', **kwargs): AlgoBase.__init__(self, **kwargs) + self.alpha = alpha + self.fit_intercept = fit_intercept + self.normalize = normalize + self.precompute = precompute + self.max_iter = max_iter + self.tol = tol + self.positive = positive + self.random_state = random_state + self.selection = selection def fit(self, trainset): @@ -83,7 +94,11 @@ def lasso(self, trainset): raise KeyError('No features for item ' + str(self.trainset.to_raw_iid(iid))) - reg = linear_model.Lasso(alpha=0.1) + reg = linear_model.Lasso( + alpha=self.alpha, fit_intercept=self.fit_intercept, + normalize=self.normalize, precompute=self.precompute, + max_iter=self.max_iter, tol=self.tol, positive=self.positive, + random_state=self.random_state, selection=self.selection) reg.fit(X, y) # self.X = X From 7b82e78a3cf2dfd094cd93a2717161571f8afcea Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 10:08:12 -0400 Subject: [PATCH 21/45] single underscore for dummy variable --- surprise/accuracy.py | 6 +++--- surprise/dataset.py | 4 ++-- surprise/model_selection/search.py | 2 ++ surprise/model_selection/split.py | 2 +- surprise/prediction_algorithms/baseline_only.py | 2 +- surprise/prediction_algorithms/co_clustering.pyx | 2 +- surprise/prediction_algorithms/features.py | 2 -- surprise/prediction_algorithms/knns.py | 8 ++++---- surprise/prediction_algorithms/matrix_factorization.pyx | 6 +++--- surprise/prediction_algorithms/random_pred.py | 2 +- surprise/prediction_algorithms/slope_one.pyx | 2 +- surprise/trainset.py | 4 ++-- 12 files changed, 21 insertions(+), 21 deletions(-) diff --git a/surprise/accuracy.py b/surprise/accuracy.py index d6e8a88f..0bfcd1af 100644 --- a/surprise/accuracy.py +++ b/surprise/accuracy.py @@ -45,7 +45,7 @@ def rmse(predictions, verbose=True): raise ValueError('Prediction list is empty.') mse = np.mean([float((true_r - est)**2) - for (__, __, __, __, true_r, est, __) in predictions]) + for (_, _, _, _, true_r, est, _) in predictions]) rmse_ = np.sqrt(mse) if verbose: @@ -80,7 +80,7 @@ def mae(predictions, verbose=True): raise ValueError('Prediction list is empty.') mae_ = np.mean([float(abs(true_r - est)) - for (__, __, __, __, true_r, est, __) in predictions]) + for (_, _, _, _, true_r, est, _) in predictions]) if verbose: print('MAE: {0:1.4f}'.format(mae_)) @@ -117,7 +117,7 @@ def fcp(predictions, verbose=True): nc_u = defaultdict(int) nd_u = defaultdict(int) - for u0, __, __, __, r0, est, __ in predictions: + for u0, _, _, _, r0, est, _ in predictions: predictions_u[u0].append((r0, est)) for u0, preds in iteritems(predictions_u): diff --git a/surprise/dataset.py b/surprise/dataset.py index 00dd623f..349e9875 100644 --- a/surprise/dataset.py +++ b/surprise/dataset.py @@ -246,7 +246,7 @@ def construct_trainset(self, raw_trainset): i_features = {} # user raw id, item raw id, translated rating, time stamp - for urid, irid, r, __ in raw_trainset: + for urid, irid, r, _ in raw_trainset: try: uid = raw2inner_id_users[urid] except KeyError: @@ -291,7 +291,7 @@ def construct_trainset(self, raw_trainset): def construct_testset(self, raw_testset): testset = [] - for (ruid, riid, r_ui_trans, __) in raw_testset: + for (ruid, riid, r_ui_trans, _) in raw_testset: u_features = self.user_features.get(ruid, None) i_features = self.item_features.get(riid, None) testset.append((ruid, riid, u_features, i_features, r_ui_trans)) diff --git a/surprise/model_selection/search.py b/surprise/model_selection/search.py index 9489c88b..afa2e2a6 100644 --- a/surprise/model_selection/search.py +++ b/surprise/model_selection/search.py @@ -294,6 +294,7 @@ class GridSearchCV(BaseSearchCV): into a pandas `DataFrame` (see :ref:`example `). """ + def __init__(self, algo_class, param_grid, measures=['rmse', 'mae'], cv=None, refit=False, return_train_measures=False, n_jobs=-1, pre_dispatch='2*n_jobs', joblib_verbose=0): @@ -410,6 +411,7 @@ class RandomizedSearchCV(BaseSearchCV): into a pandas `DataFrame` (see :ref:`example `). """ + def __init__(self, algo_class, param_distributions, n_iter=10, measures=['rmse', 'mae'], cv=None, refit=False, return_train_measures=False, n_jobs=-1, diff --git a/surprise/model_selection/split.py b/surprise/model_selection/split.py index ee9dfc64..5c656565 100644 --- a/surprise/model_selection/split.py +++ b/surprise/model_selection/split.py @@ -276,7 +276,7 @@ def split(self, data): self.test_size, self.train_size, len(data.raw_ratings)) rng = get_rng(self.random_state) - for __ in range(self.n_splits): + for _ in range(self.n_splits): if self.shuffle: permutation = rng.permutation(len(data.raw_ratings)) diff --git a/surprise/prediction_algorithms/baseline_only.py b/surprise/prediction_algorithms/baseline_only.py index e4d9778c..7ae10a22 100644 --- a/surprise/prediction_algorithms/baseline_only.py +++ b/surprise/prediction_algorithms/baseline_only.py @@ -35,7 +35,7 @@ def fit(self, trainset): return self - def estimate(self, u, i, *__): + def estimate(self, u, i, *_): est = self.trainset.global_mean if self.trainset.knows_user(u): diff --git a/surprise/prediction_algorithms/co_clustering.pyx b/surprise/prediction_algorithms/co_clustering.pyx index 56dea0d9..8fded133 100644 --- a/surprise/prediction_algorithms/co_clustering.pyx +++ b/surprise/prediction_algorithms/co_clustering.pyx @@ -236,7 +236,7 @@ class CoClustering(AlgoBase): return avg_cltr_u, avg_cltr_i, avg_cocltr - def estimate(self, u, i, *__): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): return self.trainset.global_mean diff --git a/surprise/prediction_algorithms/features.py b/surprise/prediction_algorithms/features.py index 5c107902..cc41e4df 100644 --- a/surprise/prediction_algorithms/features.py +++ b/surprise/prediction_algorithms/features.py @@ -5,8 +5,6 @@ from __future__ import (absolute_import, division, print_function, unicode_literals) import numpy as np -from six import iteritems -import heapq from sklearn import linear_model from .predictions import PredictionImpossible diff --git a/surprise/prediction_algorithms/knns.py b/surprise/prediction_algorithms/knns.py index 4b71ccd5..e5c5d8ad 100644 --- a/surprise/prediction_algorithms/knns.py +++ b/surprise/prediction_algorithms/knns.py @@ -96,7 +96,7 @@ def fit(self, trainset): return self - def estimate(self, u, i, *__): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') @@ -176,7 +176,7 @@ def fit(self, trainset): return self - def estimate(self, u, i, *__): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') @@ -271,7 +271,7 @@ def fit(self, trainset): return self - def estimate(self, u, i, *__): + def estimate(self, u, i, *_): est = self.trainset.global_mean if self.trainset.knows_user(u): @@ -370,7 +370,7 @@ def fit(self, trainset): return self - def estimate(self, u, i, *__): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') diff --git a/surprise/prediction_algorithms/matrix_factorization.pyx b/surprise/prediction_algorithms/matrix_factorization.pyx index 98a87e5e..7a3cede5 100644 --- a/surprise/prediction_algorithms/matrix_factorization.pyx +++ b/surprise/prediction_algorithms/matrix_factorization.pyx @@ -253,7 +253,7 @@ class SVD(AlgoBase): self.pu = pu self.qi = qi - def estimate(self, u, i, *__): + def estimate(self, u, i, *_): # Should we cythonize this as well? known_user = self.trainset.knows_user(u) @@ -484,7 +484,7 @@ class SVDpp(AlgoBase): self.qi = qi self.yj = yj - def estimate(self, u, i, *__): + def estimate(self, u, i, *_): est = self.trainset.global_mean @@ -715,7 +715,7 @@ class NMF(AlgoBase): self.pu = pu self.qi = qi - def estimate(self, u, i, *__): + def estimate(self, u, i, *_): # Should we cythonize this as well? known_user = self.trainset.knows_user(u) diff --git a/surprise/prediction_algorithms/random_pred.py b/surprise/prediction_algorithms/random_pred.py index 4196fd49..86c4dcf1 100644 --- a/surprise/prediction_algorithms/random_pred.py +++ b/surprise/prediction_algorithms/random_pred.py @@ -40,6 +40,6 @@ def fit(self, trainset): return self - def estimate(self, *__): + def estimate(self, *_): return np.random.normal(self.trainset.global_mean, self.sigma) diff --git a/surprise/prediction_algorithms/slope_one.pyx b/surprise/prediction_algorithms/slope_one.pyx index 52320196..f986e496 100644 --- a/surprise/prediction_algorithms/slope_one.pyx +++ b/surprise/prediction_algorithms/slope_one.pyx @@ -79,7 +79,7 @@ class SlopeOne(AlgoBase): return self - def estimate(self, u, i, *__): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') diff --git a/surprise/trainset.py b/surprise/trainset.py index fc1cebe7..6aec0744 100644 --- a/surprise/trainset.py +++ b/surprise/trainset.py @@ -270,7 +270,7 @@ def build_anti_testset(self, fill=None): anti_testset = [] for u in self.all_users(): - user_items = set([j for (j, __) in self.ur[u]]) + user_items = set([j for (j, _) in self.ur[u]]) anti_testset += [(self.to_raw_uid(u), self.to_raw_iid(i), self.u_features.get(u, None), @@ -302,7 +302,7 @@ def global_mean(self): It's only computed once.""" if self._global_mean is None: - self._global_mean = np.mean([r for (__, __, r) in + self._global_mean = np.mean([r for (_, _, r) in self.all_ratings()]) return self._global_mean From bf335c28e08c423380eb8e7ce8fca6fd36b1dcff Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 10:27:53 -0400 Subject: [PATCH 22/45] update documentation for Lasso and change filename --- surprise/prediction_algorithms/__init__.py | 2 +- .../{features.py => linear.py} | 40 +++++++------------ 2 files changed, 16 insertions(+), 26 deletions(-) rename surprise/prediction_algorithms/{features.py => linear.py} (68%) diff --git a/surprise/prediction_algorithms/__init__.py b/surprise/prediction_algorithms/__init__.py index 61091131..1a719a97 100644 --- a/surprise/prediction_algorithms/__init__.py +++ b/surprise/prediction_algorithms/__init__.py @@ -32,7 +32,7 @@ from .matrix_factorization import NMF from .slope_one import SlopeOne from .co_clustering import CoClustering -from .features import Lasso +from .linear import Lasso from .predictions import PredictionImpossible from .predictions import Prediction diff --git a/surprise/prediction_algorithms/features.py b/surprise/prediction_algorithms/linear.py similarity index 68% rename from surprise/prediction_algorithms/features.py rename to surprise/prediction_algorithms/linear.py index cc41e4df..5a56f91d 100644 --- a/surprise/prediction_algorithms/features.py +++ b/surprise/prediction_algorithms/linear.py @@ -1,5 +1,5 @@ """ -the :mod:`features` module includes some features-based algorithms. +the :mod:`linear` module includes linear features-based algorithms. """ from __future__ import (absolute_import, division, print_function, @@ -12,39 +12,29 @@ class Lasso(AlgoBase): - """A basic linear regression algorithm. + """A basic lasso algorithm with user-item interaction terms. The prediction :math:`\\hat{r}_{ui}` is set as: .. math:: - \hat{r}_{ui} = \\frac{ - \\sum\\limits_{v \in N^k_i(u)} \\text{sim}(u, v) \cdot r_{vi}} - {\\sum\\limits_{v \in N^k_i(u)} \\text{sim}(u, v)} + \hat{r}_{ui} = \alpha_0 + \alpha_1^\top y_u + \alpha_2^\top z_i + + \alpha_3^\top \text{vec}(y_u \otimes z_i) - or - - .. math:: - \hat{r}_{ui} = \\frac{ - \\sum\\limits_{j \in N^k_u(i)} \\text{sim}(i, j) \cdot r_{uj}} - {\\sum\\limits_{j \in N^k_u(j)} \\text{sim}(i, j)} - - depending on the ``user_based`` field of the ``sim_options`` parameter. + where :math:`\alpha_0 \in \mathbb{R}, \alpha_1 \in \mathbb{R}^o, \alpha_2 + \in \mathbb{R}^p` and :math:`\alpha_3 \in \mathbb{R}^{op}` are coefficient + vectors, and :math:`\otimes` represent the Kronecker product of two vectors + (i.e., all possible cross-product combinations). Args: - k(int): The (max) number of neighbors to take into account for - aggregation (see :ref:`this note `). Default is - ``40``. - min_k(int): The minimum number of neighbors to take into account for - aggregation. If there are not enough neighbors, the prediction is - set the the global mean of all ratings. Default is ``1``. - sim_options(dict): A dictionary of options for the similarity - measure. See :ref:`similarity_measures_configuration` for accepted - options. + add_interactions(bool): Whether to add user-item interaction terms. + Optional, default is True. + other args: See ``sklearn`` documentation for ``linear_model.Lasso``. """ - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, - precompute=False, max_iter=1000, tol=0.0001, positive=False, - random_state=None, selection='cyclic', **kwargs): + def __init__(self, add_interactions=True, alpha=1.0, fit_intercept=True, + normalize=False, precompute=False, max_iter=1000, tol=0.0001, + positive=False, random_state=None, selection='cyclic', + **kwargs): AlgoBase.__init__(self, **kwargs) self.alpha = alpha From e34a5f9d3c30bafce876528bb1fce845e21a5507 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 10:41:56 -0400 Subject: [PATCH 23/45] correct conflict with master --- surprise/prediction_algorithms/algo_base.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/surprise/prediction_algorithms/algo_base.py b/surprise/prediction_algorithms/algo_base.py index 41692cae..4fdcd8f0 100644 --- a/surprise/prediction_algorithms/algo_base.py +++ b/surprise/prediction_algorithms/algo_base.py @@ -248,7 +248,8 @@ def compute_baselines(self): method_name = self.bsl_options.get('method', 'als') try: - # print('Estimating biases using', method_name + '...') + if self.verbose: + print('Estimating biases using', method_name + '...') self.bu, self.bi = method[method_name](self) return self.bu, self.bi except KeyError: @@ -295,9 +296,11 @@ def compute_similarities(self): args += [self.trainset.global_mean, bx, by, shrinkage] try: - # print('Computing the {0} similarity matrix...'.format(name)) + if self.verbose: + print('Computing the {0} similarity matrix...'.format(name)) sim = construction_func[name](*args) - # print('Done computing similarity matrix.') + if self.verbose: + print('Done computing similarity matrix.') return sim except KeyError: raise NameError('Wrong sim name ' + name + '. Allowed values ' + From 4081244a92648fbc51ff0e8549a4b6d34b778916 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 11:20:46 -0400 Subject: [PATCH 24/45] add interaction terms in Lasso --- surprise/prediction_algorithms/linear.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/surprise/prediction_algorithms/linear.py b/surprise/prediction_algorithms/linear.py index 5a56f91d..282d53c9 100644 --- a/surprise/prediction_algorithms/linear.py +++ b/surprise/prediction_algorithms/linear.py @@ -82,6 +82,11 @@ def lasso(self, trainset): raise KeyError('No features for item ' + str(self.trainset.to_raw_iid(iid))) + if self.add_interactions: + temp = np.array([X[:, u] * X[:, i] for u in range(n_uf) + for i in range(n_uf, n_uf + n_if)]).T + X = np.concatenate([X, temp], axis=1) + reg = linear_model.Lasso( alpha=self.alpha, fit_intercept=self.fit_intercept, normalize=self.normalize, precompute=self.precompute, @@ -89,8 +94,8 @@ def lasso(self, trainset): random_state=self.random_state, selection=self.selection) reg.fit(X, y) - # self.X = X - # self.y = y + self.X = X + self.y = y self.coef = reg.coef_ self.intercept = reg.intercept_ From d3dd0dd2bf55420009549041e8eb81144469fbbf Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 11:43:11 -0400 Subject: [PATCH 25/45] add interaction terms to Lasso.estimate --- surprise/prediction_algorithms/linear.py | 42 ++++++++++++++++-------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/surprise/prediction_algorithms/linear.py b/surprise/prediction_algorithms/linear.py index 282d53c9..5d29b96f 100644 --- a/surprise/prediction_algorithms/linear.py +++ b/surprise/prediction_algorithms/linear.py @@ -74,18 +74,16 @@ def lasso(self, trainset): try: X[k, :n_uf] = u_features[uid] except KeyError: - raise KeyError('No features for user ' + - str(self.trainset.to_raw_uid(uid))) + raise ValueError('No features for user ' + + str(self.trainset.to_raw_uid(uid))) try: X[k, n_uf:] = i_features[iid] except KeyError: - raise KeyError('No features for item ' + - str(self.trainset.to_raw_iid(iid))) + raise ValueError('No features for item ' + + str(self.trainset.to_raw_iid(iid))) if self.add_interactions: - temp = np.array([X[:, u] * X[:, i] for u in range(n_uf) - for i in range(n_uf, n_uf + n_if)]).T - X = np.concatenate([X, temp], axis=1) + X = self.add_interactions(X) reg = linear_model.Lasso( alpha=self.alpha, fit_intercept=self.fit_intercept, @@ -99,16 +97,32 @@ def lasso(self, trainset): self.coef = reg.coef_ self.intercept = reg.intercept_ - def estimate(self, u, i, u_features, i_features): + def add_interactions(self, X): + + n_uf = self.trainset.n_user_features + n_if = self.trainset.n_item_features + + temp = np.array([X[:, u] * X[:, i] for u in range(n_uf) + for i in range(n_uf, n_uf + n_if)]).T + X = np.concatenate([X, temp], axis=1) - features = np.concatenate([u_features, i_features]) + return X + + def estimate(self, u, i, u_features, i_features): if (u_features is None or - i_features is None or - len(features) != len(self.coef)): - raise PredictionImpossible('User and/or item features ' - 'are missing.') + len(u_features) != self.trainset.n_user_features): + raise PredictionImpossible('User features are missing.') + + if (i_features is None or + len(i_features) != self.trainset.n_item_features): + raise PredictionImpossible('Item features are missing.') + + X = np.concatenate([u_features, i_features]) + + if self.add_interactions: + X = self.add_interactions(X) - est = self.intercept + np.dot(features, self.coef) + est = self.intercept + np.dot(X, self.coef) return est From 47ff4771b5b544be8bff9e88ee4ed560298e6e04 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 11:52:07 -0400 Subject: [PATCH 26/45] correct conflicts with master --- surprise/prediction_algorithms/baseline_only.py | 3 ++- surprise/prediction_algorithms/co_clustering.pyx | 2 +- surprise/prediction_algorithms/knns.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/surprise/prediction_algorithms/baseline_only.py b/surprise/prediction_algorithms/baseline_only.py index 7ae10a22..05886657 100644 --- a/surprise/prediction_algorithms/baseline_only.py +++ b/surprise/prediction_algorithms/baseline_only.py @@ -24,9 +24,10 @@ class BaselineOnly(AlgoBase): """ - def __init__(self, bsl_options={}): + def __init__(self, bsl_options={}, verbose=True): AlgoBase.__init__(self, bsl_options=bsl_options) + self.verbose = verbose def fit(self, trainset): diff --git a/surprise/prediction_algorithms/co_clustering.pyx b/surprise/prediction_algorithms/co_clustering.pyx index 8fded133..85837718 100644 --- a/surprise/prediction_algorithms/co_clustering.pyx +++ b/surprise/prediction_algorithms/co_clustering.pyx @@ -62,7 +62,7 @@ class CoClustering(AlgoBase): self.n_cltr_u = n_cltr_u self.n_cltr_i = n_cltr_i self.n_epochs = n_epochs - self.verbose=verbose + self.verbose = verbose self.random_state = random_state def fit(self, trainset): diff --git a/surprise/prediction_algorithms/knns.py b/surprise/prediction_algorithms/knns.py index e5c5d8ad..efd838c2 100644 --- a/surprise/prediction_algorithms/knns.py +++ b/surprise/prediction_algorithms/knns.py @@ -27,9 +27,10 @@ class SymmetricAlgo(AlgoBase): reversed. """ - def __init__(self, sim_options={}, **kwargs): + def __init__(self, sim_options={}, verbose=True, **kwargs): AlgoBase.__init__(self, sim_options=sim_options, **kwargs) + self.verbose = verbose def fit(self, trainset): From 127942414423c50ab5d18a5005b14952e731e6d2 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 12:04:46 -0400 Subject: [PATCH 27/45] correct verbose conflicts in knns --- surprise/prediction_algorithms/knns.py | 31 +++++++++++++++++--------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/surprise/prediction_algorithms/knns.py b/surprise/prediction_algorithms/knns.py index efd838c2..aa3fb7d8 100644 --- a/surprise/prediction_algorithms/knns.py +++ b/surprise/prediction_algorithms/knns.py @@ -82,11 +82,14 @@ class KNNBasic(SymmetricAlgo): sim_options(dict): A dictionary of options for the similarity measure. See :ref:`similarity_measures_configuration` for accepted options. + verbose(bool): Whether to print trace messages of bias estimation, + similarity, etc. Default is True. """ - def __init__(self, k=40, min_k=1, sim_options={}, **kwargs): + def __init__(self, k=40, min_k=1, sim_options={}, verbose=True, **kwargs): - SymmetricAlgo.__init__(self, sim_options=sim_options, **kwargs) + SymmetricAlgo.__init__(self, sim_options=sim_options, verbose=verbose, + **kwargs) self.k = k self.min_k = min_k @@ -157,11 +160,14 @@ class KNNWithMeans(SymmetricAlgo): sim_options(dict): A dictionary of options for the similarity measure. See :ref:`similarity_measures_configuration` for accepted options. + verbose(bool): Whether to print trace messages of bias estimation, + similarity, etc. Default is True. """ - def __init__(self, k=40, min_k=1, sim_options={}, **kwargs): + def __init__(self, k=40, min_k=1, sim_options={}, verbose=True, **kwargs): - SymmetricAlgo.__init__(self, sim_options=sim_options, **kwargs) + SymmetricAlgo.__init__(self, sim_options=sim_options, + verbose=verbose, **kwargs) self.k = k self.min_k = min_k @@ -248,17 +254,19 @@ class KNNBaseline(SymmetricAlgo): measure. See :ref:`similarity_measures_configuration` for accepted options. It is recommended to use the :func:`pearson_baseline ` similarity measure. - bsl_options(dict): A dictionary of options for the baseline estimates computation. See :ref:`baseline_estimates_configuration` for accepted options. - + verbose(bool): Whether to print trace messages of bias estimation, + similarity, etc. Default is True. """ - def __init__(self, k=40, min_k=1, sim_options={}, bsl_options={}): + def __init__(self, k=40, min_k=1, sim_options={}, bsl_options={}, + verbose=True, **kwargs): SymmetricAlgo.__init__(self, sim_options=sim_options, - bsl_options=bsl_options) + bsl_options=bsl_options, verbose=verbose, + **kwargs) self.k = k self.min_k = min_k @@ -343,11 +351,14 @@ class KNNWithZScore(SymmetricAlgo): sim_options(dict): A dictionary of options for the similarity measure. See :ref:`similarity_measures_configuration` for accepted options. + verbose(bool): Whether to print trace messages of bias estimation, + similarity, etc. Default is True. """ - def __init__(self, k=40, min_k=1, sim_options={}, **kwargs): + def __init__(self, k=40, min_k=1, sim_options={}, verbose=True, **kwargs): - SymmetricAlgo.__init__(self, sim_options=sim_options, **kwargs) + SymmetricAlgo.__init__(self, sim_options=sim_options, verbose=verbose, + **kwargs) self.k = k self.min_k = min_k From 62ccd84e75dbcbb8c3993692991134b2513f7a3f Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 12:15:45 -0400 Subject: [PATCH 28/45] add add_interactions to self in Lasso --- surprise/prediction_algorithms/linear.py | 1 + 1 file changed, 1 insertion(+) diff --git a/surprise/prediction_algorithms/linear.py b/surprise/prediction_algorithms/linear.py index 5d29b96f..562b9975 100644 --- a/surprise/prediction_algorithms/linear.py +++ b/surprise/prediction_algorithms/linear.py @@ -37,6 +37,7 @@ def __init__(self, add_interactions=True, alpha=1.0, fit_intercept=True, **kwargs): AlgoBase.__init__(self, **kwargs) + self.add_interactions = add_interactions self.alpha = alpha self.fit_intercept = fit_intercept self.normalize = normalize From 53b869711ce7208e5a79cfd15b7b826c536b7493 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 12:40:43 -0400 Subject: [PATCH 29/45] change add_interactions fn name --- surprise/prediction_algorithms/linear.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/surprise/prediction_algorithms/linear.py b/surprise/prediction_algorithms/linear.py index 562b9975..ce22d7eb 100644 --- a/surprise/prediction_algorithms/linear.py +++ b/surprise/prediction_algorithms/linear.py @@ -84,7 +84,7 @@ def lasso(self, trainset): str(self.trainset.to_raw_iid(iid))) if self.add_interactions: - X = self.add_interactions(X) + X = self.add_interactions_fn(X) reg = linear_model.Lasso( alpha=self.alpha, fit_intercept=self.fit_intercept, @@ -98,7 +98,7 @@ def lasso(self, trainset): self.coef = reg.coef_ self.intercept = reg.intercept_ - def add_interactions(self, X): + def add_interactions_fn(self, X): n_uf = self.trainset.n_user_features n_if = self.trainset.n_item_features @@ -122,7 +122,7 @@ def estimate(self, u, i, u_features, i_features): X = np.concatenate([u_features, i_features]) if self.add_interactions: - X = self.add_interactions(X) + X = self.add_interactions_fn(X) est = self.intercept + np.dot(X, self.coef) From 7e342989dd9999997d9c37bc3b6d1443889f1895 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 13:15:04 -0400 Subject: [PATCH 30/45] remove add_interactions_fn --- surprise/prediction_algorithms/linear.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/surprise/prediction_algorithms/linear.py b/surprise/prediction_algorithms/linear.py index ce22d7eb..2a11edd8 100644 --- a/surprise/prediction_algorithms/linear.py +++ b/surprise/prediction_algorithms/linear.py @@ -84,7 +84,9 @@ def lasso(self, trainset): str(self.trainset.to_raw_iid(iid))) if self.add_interactions: - X = self.add_interactions_fn(X) + temp = np.array([X[:, v] * X[:, j] for v in range(n_uf) + for j in range(n_uf, n_uf + n_if)]).T + X = np.concatenate([X, temp], axis=1) reg = linear_model.Lasso( alpha=self.alpha, fit_intercept=self.fit_intercept, @@ -98,31 +100,23 @@ def lasso(self, trainset): self.coef = reg.coef_ self.intercept = reg.intercept_ - def add_interactions_fn(self, X): + def estimate(self, u, i, u_features, i_features): n_uf = self.trainset.n_user_features n_if = self.trainset.n_item_features - temp = np.array([X[:, u] * X[:, i] for u in range(n_uf) - for i in range(n_uf, n_uf + n_if)]).T - X = np.concatenate([X, temp], axis=1) - - return X - - def estimate(self, u, i, u_features, i_features): - - if (u_features is None or - len(u_features) != self.trainset.n_user_features): + if u_features is None or len(u_features) != n_uf: raise PredictionImpossible('User features are missing.') - if (i_features is None or - len(i_features) != self.trainset.n_item_features): + if i_features is None or len(i_features) != n_if: raise PredictionImpossible('Item features are missing.') X = np.concatenate([u_features, i_features]) if self.add_interactions: - X = self.add_interactions_fn(X) + temp = np.array([X[v] * X[i] for v in range(n_uf) + for j in range(n_uf, n_uf + n_if)]) + X = np.concatenate([X, temp]) est = self.intercept + np.dot(X, self.coef) From 39c2601ac71c42facfe4717de746922661174db8 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 13:40:12 -0400 Subject: [PATCH 31/45] correct bad index --- surprise/prediction_algorithms/linear.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/surprise/prediction_algorithms/linear.py b/surprise/prediction_algorithms/linear.py index 2a11edd8..c3649802 100644 --- a/surprise/prediction_algorithms/linear.py +++ b/surprise/prediction_algorithms/linear.py @@ -114,7 +114,7 @@ def estimate(self, u, i, u_features, i_features): X = np.concatenate([u_features, i_features]) if self.add_interactions: - temp = np.array([X[v] * X[i] for v in range(n_uf) + temp = np.array([X[v] * X[j] for v in range(n_uf) for j in range(n_uf, n_uf + n_if)]) X = np.concatenate([X, temp]) From 4f3c3a86e462f30aede6ffd029a94b0025ad53cc Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 15:39:39 -0400 Subject: [PATCH 32/45] pep8 and description --- surprise/prediction_algorithms/linear.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/surprise/prediction_algorithms/linear.py b/surprise/prediction_algorithms/linear.py index c3649802..d72c4679 100644 --- a/surprise/prediction_algorithms/linear.py +++ b/surprise/prediction_algorithms/linear.py @@ -17,11 +17,11 @@ class Lasso(AlgoBase): The prediction :math:`\\hat{r}_{ui}` is set as: .. math:: - \hat{r}_{ui} = \alpha_0 + \alpha_1^\top y_u + \alpha_2^\top z_i + - \alpha_3^\top \text{vec}(y_u \otimes z_i) + \hat{r}_{ui} = \alpha_1 + \alpha_2^\top y_u + \alpha_3^\top z_i + + \alpha_4^\top \text{vec}(y_u \otimes z_i) - where :math:`\alpha_0 \in \mathbb{R}, \alpha_1 \in \mathbb{R}^o, \alpha_2 - \in \mathbb{R}^p` and :math:`\alpha_3 \in \mathbb{R}^{op}` are coefficient + where :math:`\alpha_1 \in \mathbb{R}, \alpha_2 \in \mathbb{R}^o, \alpha_3 + \in \mathbb{R}^p` and :math:`\alpha_4 \in \mathbb{R}^{op}` are coefficient vectors, and :math:`\otimes` represent the Kronecker product of two vectors (i.e., all possible cross-product combinations). @@ -85,7 +85,7 @@ def lasso(self, trainset): if self.add_interactions: temp = np.array([X[:, v] * X[:, j] for v in range(n_uf) - for j in range(n_uf, n_uf + n_if)]).T + for j in range(n_uf, n_uf + n_if)]).T X = np.concatenate([X, temp], axis=1) reg = linear_model.Lasso( @@ -115,7 +115,7 @@ def estimate(self, u, i, u_features, i_features): if self.add_interactions: temp = np.array([X[v] * X[j] for v in range(n_uf) - for j in range(n_uf, n_uf + n_if)]) + for j in range(n_uf, n_uf + n_if)]) X = np.concatenate([X, temp]) est = self.intercept + np.dot(X, self.coef) From aab90a51304a5b2fdcb9a0802dcb442e91e99694 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 16:29:45 -0400 Subject: [PATCH 33/45] add feature labels --- surprise/dataset.py | 2 ++ surprise/prediction_algorithms/linear.py | 7 +++++++ surprise/trainset.py | 7 +++++-- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/surprise/dataset.py b/surprise/dataset.py index 349e9875..d5d74cb6 100644 --- a/surprise/dataset.py +++ b/surprise/dataset.py @@ -280,6 +280,8 @@ def construct_trainset(self, raw_trainset): n_items, self.user_features_nb, self.item_features_nb, + self.user_features_labels, + self.item_features_labels, n_ratings, self.reader.rating_scale, self.reader.offset, diff --git a/surprise/prediction_algorithms/linear.py b/surprise/prediction_algorithms/linear.py index d72c4679..81697f26 100644 --- a/surprise/prediction_algorithms/linear.py +++ b/surprise/prediction_algorithms/linear.py @@ -67,6 +67,8 @@ def lasso(self, trainset): n_if = self.trainset.n_item_features u_features = self.trainset.u_features i_features = self.trainset.i_features + uf_labels = self.trainset.user_features_labels + if_labels = self.trainset.item_features_labels X = np.empty((n_ratings, n_uf + n_if)) y = np.empty((n_ratings,)) @@ -83,10 +85,14 @@ def lasso(self, trainset): raise ValueError('No features for item ' + str(self.trainset.to_raw_iid(iid))) + coef_labels = uf_labels + if_labels if self.add_interactions: temp = np.array([X[:, v] * X[:, j] for v in range(n_uf) for j in range(n_uf, n_uf + n_if)]).T X = np.concatenate([X, temp], axis=1) + temp = [coef_labels[v] + '*' + coef_labels[j] for v in range(n_uf) + for j in range(n_uf, n_uf + n_if)] + coef_labels += temp reg = linear_model.Lasso( alpha=self.alpha, fit_intercept=self.fit_intercept, @@ -98,6 +104,7 @@ def lasso(self, trainset): self.X = X self.y = y self.coef = reg.coef_ + self.coef_labels = coef_labels self.intercept = reg.intercept_ def estimate(self, u, i, u_features, i_features): diff --git a/surprise/trainset.py b/surprise/trainset.py index 6aec0744..c7d091f6 100644 --- a/surprise/trainset.py +++ b/surprise/trainset.py @@ -50,8 +50,9 @@ class Trainset: """ def __init__(self, ur, ir, u_features, i_features, n_users, n_items, - n_user_features, n_item_features, n_ratings, rating_scale, - offset, raw2inner_id_users, raw2inner_id_items): + n_user_features, n_item_features, user_features_labels, + item_features_labels, n_ratings, rating_scale, offset, + raw2inner_id_users, raw2inner_id_items): self.ur = ur self.ir = ir @@ -61,6 +62,8 @@ def __init__(self, ur, ir, u_features, i_features, n_users, n_items, self.n_items = n_items self.n_user_features = n_user_features self.n_item_features = n_item_features + self.user_features_labels = user_features_labels + self.item_features_labels = item_features_labels self.n_ratings = n_ratings self.rating_scale = rating_scale self.offset = offset From bfb2b8dd41afcd1db3982e2e42bfccbb41250bcf Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 16:35:36 -0400 Subject: [PATCH 34/45] resolve conflicts --- surprise/prediction_algorithms/knns.py | 1 + 1 file changed, 1 insertion(+) diff --git a/surprise/prediction_algorithms/knns.py b/surprise/prediction_algorithms/knns.py index aa3fb7d8..301a8168 100644 --- a/surprise/prediction_algorithms/knns.py +++ b/surprise/prediction_algorithms/knns.py @@ -259,6 +259,7 @@ class KNNBaseline(SymmetricAlgo): accepted options. verbose(bool): Whether to print trace messages of bias estimation, similarity, etc. Default is True. + """ def __init__(self, k=40, min_k=1, sim_options={}, bsl_options={}, From f9255e1cb1c350049aabb21fd095619768d23fb8 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 17:10:54 -0400 Subject: [PATCH 35/45] remove Lasso --- surprise/__init__.py | 4 +- surprise/prediction_algorithms/__init__.py | 3 +- surprise/prediction_algorithms/linear.py | 130 --------------------- 3 files changed, 2 insertions(+), 135 deletions(-) delete mode 100644 surprise/prediction_algorithms/linear.py diff --git a/surprise/__init__.py b/surprise/__init__.py index 82de2460..e87ca980 100644 --- a/surprise/__init__.py +++ b/surprise/__init__.py @@ -12,7 +12,6 @@ from .prediction_algorithms import NMF from .prediction_algorithms import SlopeOne from .prediction_algorithms import CoClustering -from .prediction_algorithms import Lasso from .prediction_algorithms import PredictionImpossible from .prediction_algorithms import Prediction @@ -31,7 +30,6 @@ 'KNNWithMeans', 'KNNBaseline', 'SVD', 'SVDpp', 'NMF', 'SlopeOne', 'CoClustering', 'PredictionImpossible', 'Prediction', 'Dataset', 'Reader', 'Trainset', 'evaluate', 'print_perf', 'GridSearch', - 'dump', 'KNNWithZScore', 'get_dataset_dir', 'model_selection', - 'Lasso'] + 'dump', 'KNNWithZScore', 'get_dataset_dir', 'model_selection'] __version__ = get_distribution('scikit-surprise').version diff --git a/surprise/prediction_algorithms/__init__.py b/surprise/prediction_algorithms/__init__.py index 1a719a97..d5ce8288 100644 --- a/surprise/prediction_algorithms/__init__.py +++ b/surprise/prediction_algorithms/__init__.py @@ -32,7 +32,6 @@ from .matrix_factorization import NMF from .slope_one import SlopeOne from .co_clustering import CoClustering -from .linear import Lasso from .predictions import PredictionImpossible from .predictions import Prediction @@ -40,4 +39,4 @@ __all__ = ['AlgoBase', 'NormalPredictor', 'BaselineOnly', 'KNNBasic', 'KNNBaseline', 'KNNWithMeans', 'SVD', 'SVDpp', 'NMF', 'SlopeOne', 'CoClustering', 'PredictionImpossible', 'Prediction', - 'KNNWithZScore', 'Lasso'] + 'KNNWithZScore'] diff --git a/surprise/prediction_algorithms/linear.py b/surprise/prediction_algorithms/linear.py deleted file mode 100644 index 81697f26..00000000 --- a/surprise/prediction_algorithms/linear.py +++ /dev/null @@ -1,130 +0,0 @@ -""" -the :mod:`linear` module includes linear features-based algorithms. -""" - -from __future__ import (absolute_import, division, print_function, - unicode_literals) -import numpy as np -from sklearn import linear_model - -from .predictions import PredictionImpossible -from .algo_base import AlgoBase - - -class Lasso(AlgoBase): - """A basic lasso algorithm with user-item interaction terms. - - The prediction :math:`\\hat{r}_{ui}` is set as: - - .. math:: - \hat{r}_{ui} = \alpha_1 + \alpha_2^\top y_u + \alpha_3^\top z_i + - \alpha_4^\top \text{vec}(y_u \otimes z_i) - - where :math:`\alpha_1 \in \mathbb{R}, \alpha_2 \in \mathbb{R}^o, \alpha_3 - \in \mathbb{R}^p` and :math:`\alpha_4 \in \mathbb{R}^{op}` are coefficient - vectors, and :math:`\otimes` represent the Kronecker product of two vectors - (i.e., all possible cross-product combinations). - - Args: - add_interactions(bool): Whether to add user-item interaction terms. - Optional, default is True. - other args: See ``sklearn`` documentation for ``linear_model.Lasso``. - """ - - def __init__(self, add_interactions=True, alpha=1.0, fit_intercept=True, - normalize=False, precompute=False, max_iter=1000, tol=0.0001, - positive=False, random_state=None, selection='cyclic', - **kwargs): - - AlgoBase.__init__(self, **kwargs) - self.add_interactions = add_interactions - self.alpha = alpha - self.fit_intercept = fit_intercept - self.normalize = normalize - self.precompute = precompute - self.max_iter = max_iter - self.tol = tol - self.positive = positive - self.random_state = random_state - self.selection = selection - - def fit(self, trainset): - - AlgoBase.fit(self, trainset) - self.lasso(trainset) - - return self - - def lasso(self, trainset): - - if (self.trainset.n_user_features == 0 or - self.trainset.n_item_features == 0): - raise ValueError('trainset does not contain user and/or item ' - 'features.') - - n_ratings = self.trainset.n_ratings - n_uf = self.trainset.n_user_features - n_if = self.trainset.n_item_features - u_features = self.trainset.u_features - i_features = self.trainset.i_features - uf_labels = self.trainset.user_features_labels - if_labels = self.trainset.item_features_labels - - X = np.empty((n_ratings, n_uf + n_if)) - y = np.empty((n_ratings,)) - for k, (uid, iid, rating) in enumerate(self.trainset.all_ratings()): - y[k] = rating - try: - X[k, :n_uf] = u_features[uid] - except KeyError: - raise ValueError('No features for user ' + - str(self.trainset.to_raw_uid(uid))) - try: - X[k, n_uf:] = i_features[iid] - except KeyError: - raise ValueError('No features for item ' + - str(self.trainset.to_raw_iid(iid))) - - coef_labels = uf_labels + if_labels - if self.add_interactions: - temp = np.array([X[:, v] * X[:, j] for v in range(n_uf) - for j in range(n_uf, n_uf + n_if)]).T - X = np.concatenate([X, temp], axis=1) - temp = [coef_labels[v] + '*' + coef_labels[j] for v in range(n_uf) - for j in range(n_uf, n_uf + n_if)] - coef_labels += temp - - reg = linear_model.Lasso( - alpha=self.alpha, fit_intercept=self.fit_intercept, - normalize=self.normalize, precompute=self.precompute, - max_iter=self.max_iter, tol=self.tol, positive=self.positive, - random_state=self.random_state, selection=self.selection) - reg.fit(X, y) - - self.X = X - self.y = y - self.coef = reg.coef_ - self.coef_labels = coef_labels - self.intercept = reg.intercept_ - - def estimate(self, u, i, u_features, i_features): - - n_uf = self.trainset.n_user_features - n_if = self.trainset.n_item_features - - if u_features is None or len(u_features) != n_uf: - raise PredictionImpossible('User features are missing.') - - if i_features is None or len(i_features) != n_if: - raise PredictionImpossible('Item features are missing.') - - X = np.concatenate([u_features, i_features]) - - if self.add_interactions: - temp = np.array([X[v] * X[j] for v in range(n_uf) - for j in range(n_uf, n_uf + n_if)]) - X = np.concatenate([X, temp]) - - est = self.intercept + np.dot(X, self.coef) - - return est From ed7180dde1f53b2d376105a65d55f9d8bc6415f4 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 17:24:23 -0400 Subject: [PATCH 36/45] Revert "Features dataset" --- .gitignore | 2 +- surprise/accuracy.py | 81 ++++++++++++++++++- surprise/dataset.py | 59 +------------- surprise/evaluate.py | 2 - surprise/model_selection/search.py | 2 - surprise/model_selection/split.py | 2 +- surprise/prediction_algorithms/algo_base.py | 27 ++----- .../prediction_algorithms/baseline_only.py | 5 +- .../prediction_algorithms/co_clustering.pyx | 4 +- surprise/prediction_algorithms/knns.py | 41 ++++------ .../matrix_factorization.pyx | 10 +-- surprise/prediction_algorithms/predictions.py | 13 +-- surprise/prediction_algorithms/slope_one.pyx | 2 +- surprise/trainset.py | 64 ++------------- 14 files changed, 123 insertions(+), 191 deletions(-) diff --git a/.gitignore b/.gitignore index 45019cb0..bd32b905 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,4 @@ _site .coverage tags -settings.json +settings.json \ No newline at end of file diff --git a/surprise/accuracy.py b/surprise/accuracy.py index 0bfcd1af..04ffca0f 100644 --- a/surprise/accuracy.py +++ b/surprise/accuracy.py @@ -45,7 +45,7 @@ def rmse(predictions, verbose=True): raise ValueError('Prediction list is empty.') mse = np.mean([float((true_r - est)**2) - for (_, _, _, _, true_r, est, _) in predictions]) + for (_, _, true_r, est, _) in predictions]) rmse_ = np.sqrt(mse) if verbose: @@ -80,7 +80,7 @@ def mae(predictions, verbose=True): raise ValueError('Prediction list is empty.') mae_ = np.mean([float(abs(true_r - est)) - for (_, _, _, _, true_r, est, _) in predictions]) + for (_, _, true_r, est, _) in predictions]) if verbose: print('MAE: {0:1.4f}'.format(mae_)) @@ -88,6 +88,81 @@ def mae(predictions, verbose=True): return mae_ +def asym_rmse(predictions, weight=0.5, verbose=True): + """Compute Asymmetric RMSE (Root Mean Squared Error). + + .. math:: + \\text{Asymmetric RMSE} = \\sqrt{\\frac{1}{|\\hat{R}|} + \\sum_{\\hat{r}_{ui} \in \\hat{R}}(r_{ui} - \\hat{r}_{ui})^2 |\\omega + - 1_{r_{ui} - \\hat{r}_{ui} < 0}|}. + + Args: + predictions (:obj:`list` of :obj:`Prediction\ + `): + A list of predictions, as returned by the :meth:`test() + ` method. + weight (int): Weight used to characterize asymmetry. + verbose: If True, will print computed value. Default is ``True``. + + + Returns: + The Asymmetric Root Mean Squared Error of predictions. + + Raises: + ValueError: When ``predictions`` is empty. + """ + + if not predictions: + raise ValueError('Prediction list is empty.') + + res = np.array([float(true_r - est) + for (_, _, true_r, est, _) in predictions]) + asym_rmse_ = np.sqrt(np.mean(res**2 * np.abs(weight - + (res<0).astype(int)))) + + if verbose: + print('Asymmetric RMSE: {0:1.4f}'.format(asym_rmse_)) + + return asym_rmse_ + + +def asym_mae(predictions, weight=0.5, verbose=True): + """Compute Asymmetric MAE (Mean Absolute Error). + + .. math:: + \\text{Asymmetric MAE} = \\frac{1}{|\\hat{R}|} \\sum_{\\hat{r}_{ui} \in + \\hat{R}}|r_{ui} - \\hat{r}_{ui}| |\\omega - 1_{r_{ui} - \\hat{r}_{ui} + < 0}|. + + Args: + predictions (:obj:`list` of :obj:`Prediction\ + `): + A list of predictions, as returned by the :meth:`test() + ` method. + weight (int): Weight used to characterize asymmetry. + verbose: If True, will print computed value. Default is ``True``. + + + Returns: + The Asymmetric Mean Absolute Error of predictions. + + Raises: + ValueError: When ``predictions`` is empty. + """ + + if not predictions: + raise ValueError('Prediction list is empty.') + + res = np.array([float(true_r - est) + for (_, _, true_r, est, _) in predictions]) + asym_mae_ = np.mean(np.abs(res) * np.abs(weight - (res<0).astype(int))) + + if verbose: + print('Asymmetric MAE: {0:1.4f}'.format(asym_mae_)) + + return asym_mae_ + + def fcp(predictions, verbose=True): """Compute FCP (Fraction of Concordant Pairs). @@ -117,7 +192,7 @@ def fcp(predictions, verbose=True): nc_u = defaultdict(int) nd_u = defaultdict(int) - for u0, _, _, _, r0, est, _ in predictions: + for u0, _, r0, est, _ in predictions: predictions_u[u0].append((r0, est)) for u0, preds in iteritems(predictions_u): diff --git a/surprise/dataset.py b/surprise/dataset.py index d5d74cb6..17638b6c 100644 --- a/surprise/dataset.py +++ b/surprise/dataset.py @@ -53,10 +53,6 @@ class Dataset: def __init__(self, reader): self.reader = reader - self.user_features_nb = 0 - self.item_features_nb = 0 - self.user_features = {} - self.item_features = {} @classmethod def load_builtin(cls, name='ml-100k'): @@ -169,36 +165,6 @@ def load_from_df(cls, df, reader): return DatasetAutoFolds(reader=reader, df=df) - def load_features_df(self, features_df, user_features=True): - """Load features from a pandas dataframe into a dataset. - - Use this if you want to add user or item features to a dataset. Only - certain prediction algorithms in the :mod:`prediction_algorithms` - package support this additional data. - - Args: - features_df(`Dataframe`): The dataframe containing the features. It - must have two columns or more, corresponding to the user or - item (raw) ids, and the features, in this order. - user_features(:obj:`bool`): Whether the features are for the users - or the items. Default is ``True``. - """ - - if user_features: - self.user_features_df = features_df - self.user_features = {tup[0]: tup[1:] for tup in - features_df.itertuples(index=False)} - self.user_features_labels = features_df.columns.values.tolist()[1:] - self.user_features_nb = len(self.user_features_labels) - else: - self.item_features_df = features_df - self.item_features = {tup[0]: tup[1:] for tup in - features_df.itertuples(index=False)} - self.item_features_labels = features_df.columns.values.tolist()[1:] - self.item_features_nb = len(self.item_features_labels) - - return self - def read_ratings(self, file_name): """Return a list of ratings (user, item, rating, timestamp) read from file_name""" @@ -242,28 +208,20 @@ def construct_trainset(self, raw_trainset): ur = defaultdict(list) ir = defaultdict(list) - u_features = {} - i_features = {} - # user raw id, item raw id, translated rating, time stamp - for urid, irid, r, _ in raw_trainset: + for urid, irid, r, timestamp in raw_trainset: try: uid = raw2inner_id_users[urid] except KeyError: uid = current_u_index raw2inner_id_users[urid] = current_u_index current_u_index += 1 - if self.user_features_nb > 0: - u_features[uid] = self.user_features.get(urid, None) - try: iid = raw2inner_id_items[irid] except KeyError: iid = current_i_index raw2inner_id_items[irid] = current_i_index current_i_index += 1 - if self.item_features_nb > 0: - i_features[iid] = self.item_features.get(irid, None) ur[uid].append((iid, r)) ir[iid].append((uid, r)) @@ -274,14 +232,8 @@ def construct_trainset(self, raw_trainset): trainset = Trainset(ur, ir, - u_features, - i_features, n_users, n_items, - self.user_features_nb, - self.item_features_nb, - self.user_features_labels, - self.item_features_labels, n_ratings, self.reader.rating_scale, self.reader.offset, @@ -292,13 +244,8 @@ def construct_trainset(self, raw_trainset): def construct_testset(self, raw_testset): - testset = [] - for (ruid, riid, r_ui_trans, _) in raw_testset: - u_features = self.user_features.get(ruid, None) - i_features = self.item_features.get(riid, None) - testset.append((ruid, riid, u_features, i_features, r_ui_trans)) - - return testset + return [(ruid, riid, r_ui_trans) + for (ruid, riid, r_ui_trans, _) in raw_testset] class DatasetUserFolds(Dataset): diff --git a/surprise/evaluate.py b/surprise/evaluate.py index bb283356..19e80fa5 100644 --- a/surprise/evaluate.py +++ b/surprise/evaluate.py @@ -301,7 +301,6 @@ class CaseInsensitiveDefaultDict(defaultdict): Used for the returned dict, so that users can use perf['RMSE'] or perf['rmse'] indifferently. """ - def __setitem__(self, key, value): super(CaseInsensitiveDefaultDict, self).__setitem__(key.lower(), value) @@ -334,5 +333,4 @@ def seed_and_eval(seed, *args): different processes.""" random.seed(seed) - return evaluate(*args, verbose=0) diff --git a/surprise/model_selection/search.py b/surprise/model_selection/search.py index afa2e2a6..9489c88b 100644 --- a/surprise/model_selection/search.py +++ b/surprise/model_selection/search.py @@ -294,7 +294,6 @@ class GridSearchCV(BaseSearchCV): into a pandas `DataFrame` (see :ref:`example `). """ - def __init__(self, algo_class, param_grid, measures=['rmse', 'mae'], cv=None, refit=False, return_train_measures=False, n_jobs=-1, pre_dispatch='2*n_jobs', joblib_verbose=0): @@ -411,7 +410,6 @@ class RandomizedSearchCV(BaseSearchCV): into a pandas `DataFrame` (see :ref:`example `). """ - def __init__(self, algo_class, param_distributions, n_iter=10, measures=['rmse', 'mae'], cv=None, refit=False, return_train_measures=False, n_jobs=-1, diff --git a/surprise/model_selection/split.py b/surprise/model_selection/split.py index 5c656565..14697911 100644 --- a/surprise/model_selection/split.py +++ b/surprise/model_selection/split.py @@ -372,7 +372,7 @@ def split(self, data): Args: data(:obj:`Dataset`): The data containing - ratings that will be divided into trainsets and testsets. + ratings that will be devided into trainsets and testsets. Yields: tuple of (trainset, testset) diff --git a/surprise/prediction_algorithms/algo_base.py b/surprise/prediction_algorithms/algo_base.py index 4fdcd8f0..3a80c4d4 100644 --- a/surprise/prediction_algorithms/algo_base.py +++ b/surprise/prediction_algorithms/algo_base.py @@ -37,7 +37,7 @@ def __init__(self, **kwargs): self.skip_train = False if (guf(self.__class__.fit) is guf(AlgoBase.fit) and - guf(self.__class__.train) is not guf(AlgoBase.train)): + guf(self.__class__.train) is not guf(AlgoBase.train)): warnings.warn('It looks like this algorithm (' + str(self.__class__) + ') implements train() ' @@ -96,8 +96,7 @@ def fit(self, trainset): return self - def predict(self, uid, iid, u_features=None, i_features=None, r_ui=None, - clip=True, verbose=False): + def predict(self, uid, iid, r_ui=None, clip=True, verbose=False): """Compute the rating prediction for given user and item. The ``predict`` method converts raw ids to inner ids and then calls the @@ -109,10 +108,6 @@ def predict(self, uid, iid, u_features=None, i_features=None, r_ui=None, Args: uid: (Raw) id of the user. See :ref:`this note`. iid: (Raw) id of the item. See :ref:`this note`. - u_features: List of user features in the same order as used in - the ``fit`` method. Optional, default is ``None``. - i_features: List of item features in the same order as used in - the ``fit`` method. Optional, default is ``None``. r_ui(float): The true rating :math:`r_{ui}`. Optional, default is ``None``. clip(bool): Whether to clip the estimation into the rating scale. @@ -148,7 +143,7 @@ def predict(self, uid, iid, u_features=None, i_features=None, r_ui=None, details = {} try: - est = self.estimate(iuid, iiid, u_features, i_features) + est = self.estimate(iuid, iiid) # If the details dict was also returned if isinstance(est, tuple): @@ -171,7 +166,7 @@ def predict(self, uid, iid, u_features=None, i_features=None, r_ui=None, est = min(higher_bound, est) est = max(lower_bound, est) - pred = Prediction(uid, iid, u_features, i_features, r_ui, est, details) + pred = Prediction(uid, iid, r_ui, est, details) if verbose: print(pred) @@ -212,12 +207,9 @@ def test(self, testset, verbose=False): # The ratings are translated back to their original scale. predictions = [self.predict(uid, iid, - u_features, - i_features, r_ui_trans - self.trainset.offset, verbose=verbose) - for (uid, iid, u_features, i_features, r_ui_trans) - in testset] + for (uid, iid, r_ui_trans) in testset] return predictions def compute_baselines(self): @@ -248,8 +240,7 @@ def compute_baselines(self): method_name = self.bsl_options.get('method', 'als') try: - if self.verbose: - print('Estimating biases using', method_name + '...') + # print('Estimating biases using', method_name + '...') self.bu, self.bi = method[method_name](self) return self.bu, self.bi except KeyError: @@ -296,11 +287,9 @@ def compute_similarities(self): args += [self.trainset.global_mean, bx, by, shrinkage] try: - if self.verbose: - print('Computing the {0} similarity matrix...'.format(name)) + # print('Computing the {0} similarity matrix...'.format(name)) sim = construction_func[name](*args) - if self.verbose: - print('Done computing similarity matrix.') + # print('Done computing similarity matrix.') return sim except KeyError: raise NameError('Wrong sim name ' + name + '. Allowed values ' + diff --git a/surprise/prediction_algorithms/baseline_only.py b/surprise/prediction_algorithms/baseline_only.py index 05886657..85221114 100644 --- a/surprise/prediction_algorithms/baseline_only.py +++ b/surprise/prediction_algorithms/baseline_only.py @@ -24,10 +24,9 @@ class BaselineOnly(AlgoBase): """ - def __init__(self, bsl_options={}, verbose=True): + def __init__(self, bsl_options={}): AlgoBase.__init__(self, bsl_options=bsl_options) - self.verbose = verbose def fit(self, trainset): @@ -36,7 +35,7 @@ def fit(self, trainset): return self - def estimate(self, u, i, *_): + def estimate(self, u, i): est = self.trainset.global_mean if self.trainset.knows_user(u): diff --git a/surprise/prediction_algorithms/co_clustering.pyx b/surprise/prediction_algorithms/co_clustering.pyx index 85837718..408780fc 100644 --- a/surprise/prediction_algorithms/co_clustering.pyx +++ b/surprise/prediction_algorithms/co_clustering.pyx @@ -62,7 +62,7 @@ class CoClustering(AlgoBase): self.n_cltr_u = n_cltr_u self.n_cltr_i = n_cltr_i self.n_epochs = n_epochs - self.verbose = verbose + self.verbose=verbose self.random_state = random_state def fit(self, trainset): @@ -236,7 +236,7 @@ class CoClustering(AlgoBase): return avg_cltr_u, avg_cltr_i, avg_cocltr - def estimate(self, u, i, *_): + def estimate(self, u, i): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): return self.trainset.global_mean diff --git a/surprise/prediction_algorithms/knns.py b/surprise/prediction_algorithms/knns.py index 301a8168..069da4d3 100644 --- a/surprise/prediction_algorithms/knns.py +++ b/surprise/prediction_algorithms/knns.py @@ -27,10 +27,9 @@ class SymmetricAlgo(AlgoBase): reversed. """ - def __init__(self, sim_options={}, verbose=True, **kwargs): + def __init__(self, sim_options={}, **kwargs): AlgoBase.__init__(self, sim_options=sim_options, **kwargs) - self.verbose = verbose def fit(self, trainset): @@ -82,14 +81,11 @@ class KNNBasic(SymmetricAlgo): sim_options(dict): A dictionary of options for the similarity measure. See :ref:`similarity_measures_configuration` for accepted options. - verbose(bool): Whether to print trace messages of bias estimation, - similarity, etc. Default is True. """ - def __init__(self, k=40, min_k=1, sim_options={}, verbose=True, **kwargs): + def __init__(self, k=40, min_k=1, sim_options={}, **kwargs): - SymmetricAlgo.__init__(self, sim_options=sim_options, verbose=verbose, - **kwargs) + SymmetricAlgo.__init__(self, sim_options=sim_options, **kwargs) self.k = k self.min_k = min_k @@ -100,7 +96,7 @@ def fit(self, trainset): return self - def estimate(self, u, i, *_): + def estimate(self, u, i): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') @@ -160,14 +156,11 @@ class KNNWithMeans(SymmetricAlgo): sim_options(dict): A dictionary of options for the similarity measure. See :ref:`similarity_measures_configuration` for accepted options. - verbose(bool): Whether to print trace messages of bias estimation, - similarity, etc. Default is True. """ - def __init__(self, k=40, min_k=1, sim_options={}, verbose=True, **kwargs): + def __init__(self, k=40, min_k=1, sim_options={}, **kwargs): - SymmetricAlgo.__init__(self, sim_options=sim_options, - verbose=verbose, **kwargs) + SymmetricAlgo.__init__(self, sim_options=sim_options, **kwargs) self.k = k self.min_k = min_k @@ -183,7 +176,7 @@ def fit(self, trainset): return self - def estimate(self, u, i, *_): + def estimate(self, u, i): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') @@ -254,20 +247,17 @@ class KNNBaseline(SymmetricAlgo): measure. See :ref:`similarity_measures_configuration` for accepted options. It is recommended to use the :func:`pearson_baseline ` similarity measure. + bsl_options(dict): A dictionary of options for the baseline estimates computation. See :ref:`baseline_estimates_configuration` for accepted options. - verbose(bool): Whether to print trace messages of bias estimation, - similarity, etc. Default is True. """ - def __init__(self, k=40, min_k=1, sim_options={}, bsl_options={}, - verbose=True, **kwargs): + def __init__(self, k=40, min_k=1, sim_options={}, bsl_options={}): SymmetricAlgo.__init__(self, sim_options=sim_options, - bsl_options=bsl_options, verbose=verbose, - **kwargs) + bsl_options=bsl_options) self.k = k self.min_k = min_k @@ -281,7 +271,7 @@ def fit(self, trainset): return self - def estimate(self, u, i, *_): + def estimate(self, u, i): est = self.trainset.global_mean if self.trainset.knows_user(u): @@ -352,14 +342,11 @@ class KNNWithZScore(SymmetricAlgo): sim_options(dict): A dictionary of options for the similarity measure. See :ref:`similarity_measures_configuration` for accepted options. - verbose(bool): Whether to print trace messages of bias estimation, - similarity, etc. Default is True. """ - def __init__(self, k=40, min_k=1, sim_options={}, verbose=True, **kwargs): + def __init__(self, k=40, min_k=1, sim_options={}, **kwargs): - SymmetricAlgo.__init__(self, sim_options=sim_options, verbose=verbose, - **kwargs) + SymmetricAlgo.__init__(self, sim_options=sim_options, **kwargs) self.k = k self.min_k = min_k @@ -383,7 +370,7 @@ def fit(self, trainset): return self - def estimate(self, u, i, *_): + def estimate(self, u, i): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') diff --git a/surprise/prediction_algorithms/matrix_factorization.pyx b/surprise/prediction_algorithms/matrix_factorization.pyx index 7a3cede5..0e898632 100644 --- a/surprise/prediction_algorithms/matrix_factorization.pyx +++ b/surprise/prediction_algorithms/matrix_factorization.pyx @@ -253,7 +253,7 @@ class SVD(AlgoBase): self.pu = pu self.qi = qi - def estimate(self, u, i, *_): + def estimate(self, u, i): # Should we cythonize this as well? known_user = self.trainset.knows_user(u) @@ -275,7 +275,7 @@ class SVD(AlgoBase): if known_user and known_item: est = np.dot(self.qi[i], self.pu[u]) else: - raise PredictionImpossible('User and item are unknown.') + raise PredictionImpossible('User and item are unkown.') return est @@ -484,7 +484,7 @@ class SVDpp(AlgoBase): self.qi = qi self.yj = yj - def estimate(self, u, i, *_): + def estimate(self, u, i): est = self.trainset.global_mean @@ -715,7 +715,7 @@ class NMF(AlgoBase): self.pu = pu self.qi = qi - def estimate(self, u, i, *_): + def estimate(self, u, i): # Should we cythonize this as well? known_user = self.trainset.knows_user(u) @@ -737,6 +737,6 @@ class NMF(AlgoBase): if known_user and known_item: est = np.dot(self.qi[i], self.pu[u]) else: - raise PredictionImpossible('User and item are unknown.') + raise PredictionImpossible('User and item are unkown.') return est diff --git a/surprise/prediction_algorithms/predictions.py b/surprise/prediction_algorithms/predictions.py index 9e971ee9..76bc8ddc 100644 --- a/surprise/prediction_algorithms/predictions.py +++ b/surprise/prediction_algorithms/predictions.py @@ -21,8 +21,7 @@ class PredictionImpossible(Exception): class Prediction(namedtuple('Prediction', - ['uid', 'iid', 'u_features', 'i_features', 'r_ui', - 'est', 'details'])): + ['uid', 'iid', 'r_ui', 'est', 'details'])): """A named tuple for storing the results of a prediction. It's wrapped in a class, but only for documentation and printing purposes. @@ -30,8 +29,6 @@ class Prediction(namedtuple('Prediction', Args: uid: The (raw) user id. See :ref:`this note`. iid: The (raw) item id. See :ref:`this note`. - u_features: The user features. - i_features: The item features. r_ui(float): The true rating :math:`r_{ui}`. est(float): The estimated rating :math:`\\hat{r}_{ui}`. details (dict): Stores additional details about the prediction that @@ -43,14 +40,6 @@ class Prediction(namedtuple('Prediction', def __str__(self): s = 'user: {uid:<10} '.format(uid=self.uid) s += 'item: {iid:<10} '.format(iid=self.iid) - if self.u_features is not None: - pass - else: - s += 'u_features = None ' - if self.i_features is not None: - pass - else: - s += 'i_features = None ' if self.r_ui is not None: s += 'r_ui = {r_ui:1.2f} '.format(r_ui=self.r_ui) else: diff --git a/surprise/prediction_algorithms/slope_one.pyx b/surprise/prediction_algorithms/slope_one.pyx index f986e496..8049a6cf 100644 --- a/surprise/prediction_algorithms/slope_one.pyx +++ b/surprise/prediction_algorithms/slope_one.pyx @@ -79,7 +79,7 @@ class SlopeOne(AlgoBase): return self - def estimate(self, u, i, *_): + def estimate(self, u, i): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') diff --git a/surprise/trainset.py b/surprise/trainset.py index c7d091f6..ebb95204 100644 --- a/surprise/trainset.py +++ b/surprise/trainset.py @@ -33,37 +33,21 @@ class Trainset: ir(:obj:`defaultdict` of :obj:`list`): The items ratings. This is a dictionary containing lists of tuples of the form ``(user_inner_id, rating)``. The keys are item inner ids. - u_features(:obj:`defaultdict` of :obj:`list`): The user features. This - is a dictionary containing lists of features. The keys are user - inner ids. - i_features(:obj:`defaultdict` of :obj:`list`): The item features. This - is a dictionary containing lists of features. The keys are item - inner ids. n_users: Total number of users :math:`|U|`. n_items: Total number of items :math:`|I|`. - n_user_features: Total number of user features. - n_item_features: Total number of item features. n_ratings: Total number of ratings :math:`|R_{train}|`. rating_scale(tuple): The minimum and maximal rating of the rating scale. global_mean: The mean of all ratings :math:`\\mu`. """ - def __init__(self, ur, ir, u_features, i_features, n_users, n_items, - n_user_features, n_item_features, user_features_labels, - item_features_labels, n_ratings, rating_scale, offset, - raw2inner_id_users, raw2inner_id_items): + def __init__(self, ur, ir, n_users, n_items, n_ratings, rating_scale, + offset, raw2inner_id_users, raw2inner_id_items): self.ur = ur self.ir = ir - self.u_features = u_features - self.i_features = i_features self.n_users = n_users self.n_items = n_items - self.n_user_features = n_user_features - self.n_item_features = n_item_features - self.user_features_labels = user_features_labels - self.item_features_labels = item_features_labels self.n_ratings = n_ratings self.rating_scale = rating_scale self.offset = offset @@ -103,30 +87,6 @@ def knows_item(self, iid): return iid in self.ir - def has_user_features(self, uid): - """Indicate if the user features are part of the trainset. - - Args: - uid(int): The (inner) user id. See :ref:`this - note`. - Returns: - ``True`` if user features are part of the trainset, else ``False``. - """ - - return uid in self.u_features - - def has_item_features(self, iid): - """Indicate if the item features are part of the trainset. - - Args: - iid(int): The (inner) item id. See :ref:`this - note`. - Returns: - ``True`` if item features are part of the trainset, else ``False``. - """ - - return iid in self.i_features - def to_inner_uid(self, ruid): """Convert a **user** raw id to an inner id. @@ -240,14 +200,8 @@ def build_testset(self): cases where you want to to test your algorithm on the trainset. """ - testset = [] - for (u, i, r) in self.all_ratings(): - u_features = self.u_features.get(u, None) - i_features = self.i_features.get(i, None) - testset.append((self.to_raw_uid(u), self.to_raw_iid(i), u_features, - i_features, r)) - - return testset + return [(self.to_raw_uid(u), self.to_raw_iid(i), r) + for (u, i, r) in self.all_ratings()] def build_anti_testset(self, fill=None): """Return a list of ratings that can be used as a testset in the @@ -274,13 +228,9 @@ def build_anti_testset(self, fill=None): anti_testset = [] for u in self.all_users(): user_items = set([j for (j, _) in self.ur[u]]) - anti_testset += [(self.to_raw_uid(u), - self.to_raw_iid(i), - self.u_features.get(u, None), - self.i_features.get(i, None), - fill) - for i in self.all_items() - if i not in user_items] + anti_testset += [(self.to_raw_uid(u), self.to_raw_iid(i), fill) for + i in self.all_items() if + i not in user_items] return anti_testset def all_users(self): From 32082ce95fb4e5975c7a2c1b3572bbb436783136 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 17:34:48 -0400 Subject: [PATCH 37/45] initialize features_labels --- surprise/dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/surprise/dataset.py b/surprise/dataset.py index d5d74cb6..da8a5ba4 100644 --- a/surprise/dataset.py +++ b/surprise/dataset.py @@ -57,6 +57,8 @@ def __init__(self, reader): self.item_features_nb = 0 self.user_features = {} self.item_features = {} + self.user_features_labels = [] + self.item_features_labels = [] @classmethod def load_builtin(cls, name='ml-100k'): From e3de2084b6598f0801222c581f0a5e70048ae3c4 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 17:37:27 -0400 Subject: [PATCH 38/45] Revert "Revert "Features dataset"" --- .gitignore | 2 +- surprise/accuracy.py | 81 +------------------ surprise/dataset.py | 59 +++++++++++++- surprise/evaluate.py | 2 + surprise/model_selection/search.py | 2 + surprise/model_selection/split.py | 2 +- surprise/prediction_algorithms/algo_base.py | 27 +++++-- .../prediction_algorithms/baseline_only.py | 5 +- .../prediction_algorithms/co_clustering.pyx | 4 +- surprise/prediction_algorithms/knns.py | 41 ++++++---- .../matrix_factorization.pyx | 10 +-- surprise/prediction_algorithms/predictions.py | 13 ++- surprise/prediction_algorithms/slope_one.pyx | 2 +- surprise/trainset.py | 64 +++++++++++++-- 14 files changed, 191 insertions(+), 123 deletions(-) diff --git a/.gitignore b/.gitignore index bd32b905..45019cb0 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,4 @@ _site .coverage tags -settings.json \ No newline at end of file +settings.json diff --git a/surprise/accuracy.py b/surprise/accuracy.py index 04ffca0f..0bfcd1af 100644 --- a/surprise/accuracy.py +++ b/surprise/accuracy.py @@ -45,7 +45,7 @@ def rmse(predictions, verbose=True): raise ValueError('Prediction list is empty.') mse = np.mean([float((true_r - est)**2) - for (_, _, true_r, est, _) in predictions]) + for (_, _, _, _, true_r, est, _) in predictions]) rmse_ = np.sqrt(mse) if verbose: @@ -80,7 +80,7 @@ def mae(predictions, verbose=True): raise ValueError('Prediction list is empty.') mae_ = np.mean([float(abs(true_r - est)) - for (_, _, true_r, est, _) in predictions]) + for (_, _, _, _, true_r, est, _) in predictions]) if verbose: print('MAE: {0:1.4f}'.format(mae_)) @@ -88,81 +88,6 @@ def mae(predictions, verbose=True): return mae_ -def asym_rmse(predictions, weight=0.5, verbose=True): - """Compute Asymmetric RMSE (Root Mean Squared Error). - - .. math:: - \\text{Asymmetric RMSE} = \\sqrt{\\frac{1}{|\\hat{R}|} - \\sum_{\\hat{r}_{ui} \in \\hat{R}}(r_{ui} - \\hat{r}_{ui})^2 |\\omega - - 1_{r_{ui} - \\hat{r}_{ui} < 0}|}. - - Args: - predictions (:obj:`list` of :obj:`Prediction\ - `): - A list of predictions, as returned by the :meth:`test() - ` method. - weight (int): Weight used to characterize asymmetry. - verbose: If True, will print computed value. Default is ``True``. - - - Returns: - The Asymmetric Root Mean Squared Error of predictions. - - Raises: - ValueError: When ``predictions`` is empty. - """ - - if not predictions: - raise ValueError('Prediction list is empty.') - - res = np.array([float(true_r - est) - for (_, _, true_r, est, _) in predictions]) - asym_rmse_ = np.sqrt(np.mean(res**2 * np.abs(weight - - (res<0).astype(int)))) - - if verbose: - print('Asymmetric RMSE: {0:1.4f}'.format(asym_rmse_)) - - return asym_rmse_ - - -def asym_mae(predictions, weight=0.5, verbose=True): - """Compute Asymmetric MAE (Mean Absolute Error). - - .. math:: - \\text{Asymmetric MAE} = \\frac{1}{|\\hat{R}|} \\sum_{\\hat{r}_{ui} \in - \\hat{R}}|r_{ui} - \\hat{r}_{ui}| |\\omega - 1_{r_{ui} - \\hat{r}_{ui} - < 0}|. - - Args: - predictions (:obj:`list` of :obj:`Prediction\ - `): - A list of predictions, as returned by the :meth:`test() - ` method. - weight (int): Weight used to characterize asymmetry. - verbose: If True, will print computed value. Default is ``True``. - - - Returns: - The Asymmetric Mean Absolute Error of predictions. - - Raises: - ValueError: When ``predictions`` is empty. - """ - - if not predictions: - raise ValueError('Prediction list is empty.') - - res = np.array([float(true_r - est) - for (_, _, true_r, est, _) in predictions]) - asym_mae_ = np.mean(np.abs(res) * np.abs(weight - (res<0).astype(int))) - - if verbose: - print('Asymmetric MAE: {0:1.4f}'.format(asym_mae_)) - - return asym_mae_ - - def fcp(predictions, verbose=True): """Compute FCP (Fraction of Concordant Pairs). @@ -192,7 +117,7 @@ def fcp(predictions, verbose=True): nc_u = defaultdict(int) nd_u = defaultdict(int) - for u0, _, r0, est, _ in predictions: + for u0, _, _, _, r0, est, _ in predictions: predictions_u[u0].append((r0, est)) for u0, preds in iteritems(predictions_u): diff --git a/surprise/dataset.py b/surprise/dataset.py index 17638b6c..d5d74cb6 100644 --- a/surprise/dataset.py +++ b/surprise/dataset.py @@ -53,6 +53,10 @@ class Dataset: def __init__(self, reader): self.reader = reader + self.user_features_nb = 0 + self.item_features_nb = 0 + self.user_features = {} + self.item_features = {} @classmethod def load_builtin(cls, name='ml-100k'): @@ -165,6 +169,36 @@ def load_from_df(cls, df, reader): return DatasetAutoFolds(reader=reader, df=df) + def load_features_df(self, features_df, user_features=True): + """Load features from a pandas dataframe into a dataset. + + Use this if you want to add user or item features to a dataset. Only + certain prediction algorithms in the :mod:`prediction_algorithms` + package support this additional data. + + Args: + features_df(`Dataframe`): The dataframe containing the features. It + must have two columns or more, corresponding to the user or + item (raw) ids, and the features, in this order. + user_features(:obj:`bool`): Whether the features are for the users + or the items. Default is ``True``. + """ + + if user_features: + self.user_features_df = features_df + self.user_features = {tup[0]: tup[1:] for tup in + features_df.itertuples(index=False)} + self.user_features_labels = features_df.columns.values.tolist()[1:] + self.user_features_nb = len(self.user_features_labels) + else: + self.item_features_df = features_df + self.item_features = {tup[0]: tup[1:] for tup in + features_df.itertuples(index=False)} + self.item_features_labels = features_df.columns.values.tolist()[1:] + self.item_features_nb = len(self.item_features_labels) + + return self + def read_ratings(self, file_name): """Return a list of ratings (user, item, rating, timestamp) read from file_name""" @@ -208,20 +242,28 @@ def construct_trainset(self, raw_trainset): ur = defaultdict(list) ir = defaultdict(list) + u_features = {} + i_features = {} + # user raw id, item raw id, translated rating, time stamp - for urid, irid, r, timestamp in raw_trainset: + for urid, irid, r, _ in raw_trainset: try: uid = raw2inner_id_users[urid] except KeyError: uid = current_u_index raw2inner_id_users[urid] = current_u_index current_u_index += 1 + if self.user_features_nb > 0: + u_features[uid] = self.user_features.get(urid, None) + try: iid = raw2inner_id_items[irid] except KeyError: iid = current_i_index raw2inner_id_items[irid] = current_i_index current_i_index += 1 + if self.item_features_nb > 0: + i_features[iid] = self.item_features.get(irid, None) ur[uid].append((iid, r)) ir[iid].append((uid, r)) @@ -232,8 +274,14 @@ def construct_trainset(self, raw_trainset): trainset = Trainset(ur, ir, + u_features, + i_features, n_users, n_items, + self.user_features_nb, + self.item_features_nb, + self.user_features_labels, + self.item_features_labels, n_ratings, self.reader.rating_scale, self.reader.offset, @@ -244,8 +292,13 @@ def construct_trainset(self, raw_trainset): def construct_testset(self, raw_testset): - return [(ruid, riid, r_ui_trans) - for (ruid, riid, r_ui_trans, _) in raw_testset] + testset = [] + for (ruid, riid, r_ui_trans, _) in raw_testset: + u_features = self.user_features.get(ruid, None) + i_features = self.item_features.get(riid, None) + testset.append((ruid, riid, u_features, i_features, r_ui_trans)) + + return testset class DatasetUserFolds(Dataset): diff --git a/surprise/evaluate.py b/surprise/evaluate.py index 19e80fa5..bb283356 100644 --- a/surprise/evaluate.py +++ b/surprise/evaluate.py @@ -301,6 +301,7 @@ class CaseInsensitiveDefaultDict(defaultdict): Used for the returned dict, so that users can use perf['RMSE'] or perf['rmse'] indifferently. """ + def __setitem__(self, key, value): super(CaseInsensitiveDefaultDict, self).__setitem__(key.lower(), value) @@ -333,4 +334,5 @@ def seed_and_eval(seed, *args): different processes.""" random.seed(seed) + return evaluate(*args, verbose=0) diff --git a/surprise/model_selection/search.py b/surprise/model_selection/search.py index 9489c88b..afa2e2a6 100644 --- a/surprise/model_selection/search.py +++ b/surprise/model_selection/search.py @@ -294,6 +294,7 @@ class GridSearchCV(BaseSearchCV): into a pandas `DataFrame` (see :ref:`example `). """ + def __init__(self, algo_class, param_grid, measures=['rmse', 'mae'], cv=None, refit=False, return_train_measures=False, n_jobs=-1, pre_dispatch='2*n_jobs', joblib_verbose=0): @@ -410,6 +411,7 @@ class RandomizedSearchCV(BaseSearchCV): into a pandas `DataFrame` (see :ref:`example `). """ + def __init__(self, algo_class, param_distributions, n_iter=10, measures=['rmse', 'mae'], cv=None, refit=False, return_train_measures=False, n_jobs=-1, diff --git a/surprise/model_selection/split.py b/surprise/model_selection/split.py index 14697911..5c656565 100644 --- a/surprise/model_selection/split.py +++ b/surprise/model_selection/split.py @@ -372,7 +372,7 @@ def split(self, data): Args: data(:obj:`Dataset`): The data containing - ratings that will be devided into trainsets and testsets. + ratings that will be divided into trainsets and testsets. Yields: tuple of (trainset, testset) diff --git a/surprise/prediction_algorithms/algo_base.py b/surprise/prediction_algorithms/algo_base.py index 3a80c4d4..4fdcd8f0 100644 --- a/surprise/prediction_algorithms/algo_base.py +++ b/surprise/prediction_algorithms/algo_base.py @@ -37,7 +37,7 @@ def __init__(self, **kwargs): self.skip_train = False if (guf(self.__class__.fit) is guf(AlgoBase.fit) and - guf(self.__class__.train) is not guf(AlgoBase.train)): + guf(self.__class__.train) is not guf(AlgoBase.train)): warnings.warn('It looks like this algorithm (' + str(self.__class__) + ') implements train() ' @@ -96,7 +96,8 @@ def fit(self, trainset): return self - def predict(self, uid, iid, r_ui=None, clip=True, verbose=False): + def predict(self, uid, iid, u_features=None, i_features=None, r_ui=None, + clip=True, verbose=False): """Compute the rating prediction for given user and item. The ``predict`` method converts raw ids to inner ids and then calls the @@ -108,6 +109,10 @@ def predict(self, uid, iid, r_ui=None, clip=True, verbose=False): Args: uid: (Raw) id of the user. See :ref:`this note`. iid: (Raw) id of the item. See :ref:`this note`. + u_features: List of user features in the same order as used in + the ``fit`` method. Optional, default is ``None``. + i_features: List of item features in the same order as used in + the ``fit`` method. Optional, default is ``None``. r_ui(float): The true rating :math:`r_{ui}`. Optional, default is ``None``. clip(bool): Whether to clip the estimation into the rating scale. @@ -143,7 +148,7 @@ def predict(self, uid, iid, r_ui=None, clip=True, verbose=False): details = {} try: - est = self.estimate(iuid, iiid) + est = self.estimate(iuid, iiid, u_features, i_features) # If the details dict was also returned if isinstance(est, tuple): @@ -166,7 +171,7 @@ def predict(self, uid, iid, r_ui=None, clip=True, verbose=False): est = min(higher_bound, est) est = max(lower_bound, est) - pred = Prediction(uid, iid, r_ui, est, details) + pred = Prediction(uid, iid, u_features, i_features, r_ui, est, details) if verbose: print(pred) @@ -207,9 +212,12 @@ def test(self, testset, verbose=False): # The ratings are translated back to their original scale. predictions = [self.predict(uid, iid, + u_features, + i_features, r_ui_trans - self.trainset.offset, verbose=verbose) - for (uid, iid, r_ui_trans) in testset] + for (uid, iid, u_features, i_features, r_ui_trans) + in testset] return predictions def compute_baselines(self): @@ -240,7 +248,8 @@ def compute_baselines(self): method_name = self.bsl_options.get('method', 'als') try: - # print('Estimating biases using', method_name + '...') + if self.verbose: + print('Estimating biases using', method_name + '...') self.bu, self.bi = method[method_name](self) return self.bu, self.bi except KeyError: @@ -287,9 +296,11 @@ def compute_similarities(self): args += [self.trainset.global_mean, bx, by, shrinkage] try: - # print('Computing the {0} similarity matrix...'.format(name)) + if self.verbose: + print('Computing the {0} similarity matrix...'.format(name)) sim = construction_func[name](*args) - # print('Done computing similarity matrix.') + if self.verbose: + print('Done computing similarity matrix.') return sim except KeyError: raise NameError('Wrong sim name ' + name + '. Allowed values ' + diff --git a/surprise/prediction_algorithms/baseline_only.py b/surprise/prediction_algorithms/baseline_only.py index 85221114..05886657 100644 --- a/surprise/prediction_algorithms/baseline_only.py +++ b/surprise/prediction_algorithms/baseline_only.py @@ -24,9 +24,10 @@ class BaselineOnly(AlgoBase): """ - def __init__(self, bsl_options={}): + def __init__(self, bsl_options={}, verbose=True): AlgoBase.__init__(self, bsl_options=bsl_options) + self.verbose = verbose def fit(self, trainset): @@ -35,7 +36,7 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): est = self.trainset.global_mean if self.trainset.knows_user(u): diff --git a/surprise/prediction_algorithms/co_clustering.pyx b/surprise/prediction_algorithms/co_clustering.pyx index 408780fc..85837718 100644 --- a/surprise/prediction_algorithms/co_clustering.pyx +++ b/surprise/prediction_algorithms/co_clustering.pyx @@ -62,7 +62,7 @@ class CoClustering(AlgoBase): self.n_cltr_u = n_cltr_u self.n_cltr_i = n_cltr_i self.n_epochs = n_epochs - self.verbose=verbose + self.verbose = verbose self.random_state = random_state def fit(self, trainset): @@ -236,7 +236,7 @@ class CoClustering(AlgoBase): return avg_cltr_u, avg_cltr_i, avg_cocltr - def estimate(self, u, i): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): return self.trainset.global_mean diff --git a/surprise/prediction_algorithms/knns.py b/surprise/prediction_algorithms/knns.py index 069da4d3..301a8168 100644 --- a/surprise/prediction_algorithms/knns.py +++ b/surprise/prediction_algorithms/knns.py @@ -27,9 +27,10 @@ class SymmetricAlgo(AlgoBase): reversed. """ - def __init__(self, sim_options={}, **kwargs): + def __init__(self, sim_options={}, verbose=True, **kwargs): AlgoBase.__init__(self, sim_options=sim_options, **kwargs) + self.verbose = verbose def fit(self, trainset): @@ -81,11 +82,14 @@ class KNNBasic(SymmetricAlgo): sim_options(dict): A dictionary of options for the similarity measure. See :ref:`similarity_measures_configuration` for accepted options. + verbose(bool): Whether to print trace messages of bias estimation, + similarity, etc. Default is True. """ - def __init__(self, k=40, min_k=1, sim_options={}, **kwargs): + def __init__(self, k=40, min_k=1, sim_options={}, verbose=True, **kwargs): - SymmetricAlgo.__init__(self, sim_options=sim_options, **kwargs) + SymmetricAlgo.__init__(self, sim_options=sim_options, verbose=verbose, + **kwargs) self.k = k self.min_k = min_k @@ -96,7 +100,7 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') @@ -156,11 +160,14 @@ class KNNWithMeans(SymmetricAlgo): sim_options(dict): A dictionary of options for the similarity measure. See :ref:`similarity_measures_configuration` for accepted options. + verbose(bool): Whether to print trace messages of bias estimation, + similarity, etc. Default is True. """ - def __init__(self, k=40, min_k=1, sim_options={}, **kwargs): + def __init__(self, k=40, min_k=1, sim_options={}, verbose=True, **kwargs): - SymmetricAlgo.__init__(self, sim_options=sim_options, **kwargs) + SymmetricAlgo.__init__(self, sim_options=sim_options, + verbose=verbose, **kwargs) self.k = k self.min_k = min_k @@ -176,7 +183,7 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') @@ -247,17 +254,20 @@ class KNNBaseline(SymmetricAlgo): measure. See :ref:`similarity_measures_configuration` for accepted options. It is recommended to use the :func:`pearson_baseline ` similarity measure. - bsl_options(dict): A dictionary of options for the baseline estimates computation. See :ref:`baseline_estimates_configuration` for accepted options. + verbose(bool): Whether to print trace messages of bias estimation, + similarity, etc. Default is True. """ - def __init__(self, k=40, min_k=1, sim_options={}, bsl_options={}): + def __init__(self, k=40, min_k=1, sim_options={}, bsl_options={}, + verbose=True, **kwargs): SymmetricAlgo.__init__(self, sim_options=sim_options, - bsl_options=bsl_options) + bsl_options=bsl_options, verbose=verbose, + **kwargs) self.k = k self.min_k = min_k @@ -271,7 +281,7 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): est = self.trainset.global_mean if self.trainset.knows_user(u): @@ -342,11 +352,14 @@ class KNNWithZScore(SymmetricAlgo): sim_options(dict): A dictionary of options for the similarity measure. See :ref:`similarity_measures_configuration` for accepted options. + verbose(bool): Whether to print trace messages of bias estimation, + similarity, etc. Default is True. """ - def __init__(self, k=40, min_k=1, sim_options={}, **kwargs): + def __init__(self, k=40, min_k=1, sim_options={}, verbose=True, **kwargs): - SymmetricAlgo.__init__(self, sim_options=sim_options, **kwargs) + SymmetricAlgo.__init__(self, sim_options=sim_options, verbose=verbose, + **kwargs) self.k = k self.min_k = min_k @@ -370,7 +383,7 @@ def fit(self, trainset): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') diff --git a/surprise/prediction_algorithms/matrix_factorization.pyx b/surprise/prediction_algorithms/matrix_factorization.pyx index 0e898632..7a3cede5 100644 --- a/surprise/prediction_algorithms/matrix_factorization.pyx +++ b/surprise/prediction_algorithms/matrix_factorization.pyx @@ -253,7 +253,7 @@ class SVD(AlgoBase): self.pu = pu self.qi = qi - def estimate(self, u, i): + def estimate(self, u, i, *_): # Should we cythonize this as well? known_user = self.trainset.knows_user(u) @@ -275,7 +275,7 @@ class SVD(AlgoBase): if known_user and known_item: est = np.dot(self.qi[i], self.pu[u]) else: - raise PredictionImpossible('User and item are unkown.') + raise PredictionImpossible('User and item are unknown.') return est @@ -484,7 +484,7 @@ class SVDpp(AlgoBase): self.qi = qi self.yj = yj - def estimate(self, u, i): + def estimate(self, u, i, *_): est = self.trainset.global_mean @@ -715,7 +715,7 @@ class NMF(AlgoBase): self.pu = pu self.qi = qi - def estimate(self, u, i): + def estimate(self, u, i, *_): # Should we cythonize this as well? known_user = self.trainset.knows_user(u) @@ -737,6 +737,6 @@ class NMF(AlgoBase): if known_user and known_item: est = np.dot(self.qi[i], self.pu[u]) else: - raise PredictionImpossible('User and item are unkown.') + raise PredictionImpossible('User and item are unknown.') return est diff --git a/surprise/prediction_algorithms/predictions.py b/surprise/prediction_algorithms/predictions.py index 76bc8ddc..9e971ee9 100644 --- a/surprise/prediction_algorithms/predictions.py +++ b/surprise/prediction_algorithms/predictions.py @@ -21,7 +21,8 @@ class PredictionImpossible(Exception): class Prediction(namedtuple('Prediction', - ['uid', 'iid', 'r_ui', 'est', 'details'])): + ['uid', 'iid', 'u_features', 'i_features', 'r_ui', + 'est', 'details'])): """A named tuple for storing the results of a prediction. It's wrapped in a class, but only for documentation and printing purposes. @@ -29,6 +30,8 @@ class Prediction(namedtuple('Prediction', Args: uid: The (raw) user id. See :ref:`this note`. iid: The (raw) item id. See :ref:`this note`. + u_features: The user features. + i_features: The item features. r_ui(float): The true rating :math:`r_{ui}`. est(float): The estimated rating :math:`\\hat{r}_{ui}`. details (dict): Stores additional details about the prediction that @@ -40,6 +43,14 @@ class Prediction(namedtuple('Prediction', def __str__(self): s = 'user: {uid:<10} '.format(uid=self.uid) s += 'item: {iid:<10} '.format(iid=self.iid) + if self.u_features is not None: + pass + else: + s += 'u_features = None ' + if self.i_features is not None: + pass + else: + s += 'i_features = None ' if self.r_ui is not None: s += 'r_ui = {r_ui:1.2f} '.format(r_ui=self.r_ui) else: diff --git a/surprise/prediction_algorithms/slope_one.pyx b/surprise/prediction_algorithms/slope_one.pyx index 8049a6cf..f986e496 100644 --- a/surprise/prediction_algorithms/slope_one.pyx +++ b/surprise/prediction_algorithms/slope_one.pyx @@ -79,7 +79,7 @@ class SlopeOne(AlgoBase): return self - def estimate(self, u, i): + def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): raise PredictionImpossible('User and/or item is unkown.') diff --git a/surprise/trainset.py b/surprise/trainset.py index ebb95204..c7d091f6 100644 --- a/surprise/trainset.py +++ b/surprise/trainset.py @@ -33,21 +33,37 @@ class Trainset: ir(:obj:`defaultdict` of :obj:`list`): The items ratings. This is a dictionary containing lists of tuples of the form ``(user_inner_id, rating)``. The keys are item inner ids. + u_features(:obj:`defaultdict` of :obj:`list`): The user features. This + is a dictionary containing lists of features. The keys are user + inner ids. + i_features(:obj:`defaultdict` of :obj:`list`): The item features. This + is a dictionary containing lists of features. The keys are item + inner ids. n_users: Total number of users :math:`|U|`. n_items: Total number of items :math:`|I|`. + n_user_features: Total number of user features. + n_item_features: Total number of item features. n_ratings: Total number of ratings :math:`|R_{train}|`. rating_scale(tuple): The minimum and maximal rating of the rating scale. global_mean: The mean of all ratings :math:`\\mu`. """ - def __init__(self, ur, ir, n_users, n_items, n_ratings, rating_scale, - offset, raw2inner_id_users, raw2inner_id_items): + def __init__(self, ur, ir, u_features, i_features, n_users, n_items, + n_user_features, n_item_features, user_features_labels, + item_features_labels, n_ratings, rating_scale, offset, + raw2inner_id_users, raw2inner_id_items): self.ur = ur self.ir = ir + self.u_features = u_features + self.i_features = i_features self.n_users = n_users self.n_items = n_items + self.n_user_features = n_user_features + self.n_item_features = n_item_features + self.user_features_labels = user_features_labels + self.item_features_labels = item_features_labels self.n_ratings = n_ratings self.rating_scale = rating_scale self.offset = offset @@ -87,6 +103,30 @@ def knows_item(self, iid): return iid in self.ir + def has_user_features(self, uid): + """Indicate if the user features are part of the trainset. + + Args: + uid(int): The (inner) user id. See :ref:`this + note`. + Returns: + ``True`` if user features are part of the trainset, else ``False``. + """ + + return uid in self.u_features + + def has_item_features(self, iid): + """Indicate if the item features are part of the trainset. + + Args: + iid(int): The (inner) item id. See :ref:`this + note`. + Returns: + ``True`` if item features are part of the trainset, else ``False``. + """ + + return iid in self.i_features + def to_inner_uid(self, ruid): """Convert a **user** raw id to an inner id. @@ -200,8 +240,14 @@ def build_testset(self): cases where you want to to test your algorithm on the trainset. """ - return [(self.to_raw_uid(u), self.to_raw_iid(i), r) - for (u, i, r) in self.all_ratings()] + testset = [] + for (u, i, r) in self.all_ratings(): + u_features = self.u_features.get(u, None) + i_features = self.i_features.get(i, None) + testset.append((self.to_raw_uid(u), self.to_raw_iid(i), u_features, + i_features, r)) + + return testset def build_anti_testset(self, fill=None): """Return a list of ratings that can be used as a testset in the @@ -228,9 +274,13 @@ def build_anti_testset(self, fill=None): anti_testset = [] for u in self.all_users(): user_items = set([j for (j, _) in self.ur[u]]) - anti_testset += [(self.to_raw_uid(u), self.to_raw_iid(i), fill) for - i in self.all_items() if - i not in user_items] + anti_testset += [(self.to_raw_uid(u), + self.to_raw_iid(i), + self.u_features.get(u, None), + self.i_features.get(i, None), + fill) + for i in self.all_items() + if i not in user_items] return anti_testset def all_users(self): From 4fabe2916f37725aca044a4ce6d60e8909e79ee9 Mon Sep 17 00:00:00 2001 From: martincousi Date: Thu, 5 Apr 2018 17:45:29 -0400 Subject: [PATCH 39/45] add lasso --- surprise/__init__.py | 4 +- surprise/prediction_algorithms/__init__.py | 3 +- surprise/prediction_algorithms/linear.py | 130 +++++++++++++++++++++ 3 files changed, 135 insertions(+), 2 deletions(-) create mode 100644 surprise/prediction_algorithms/linear.py diff --git a/surprise/__init__.py b/surprise/__init__.py index e87ca980..82de2460 100644 --- a/surprise/__init__.py +++ b/surprise/__init__.py @@ -12,6 +12,7 @@ from .prediction_algorithms import NMF from .prediction_algorithms import SlopeOne from .prediction_algorithms import CoClustering +from .prediction_algorithms import Lasso from .prediction_algorithms import PredictionImpossible from .prediction_algorithms import Prediction @@ -30,6 +31,7 @@ 'KNNWithMeans', 'KNNBaseline', 'SVD', 'SVDpp', 'NMF', 'SlopeOne', 'CoClustering', 'PredictionImpossible', 'Prediction', 'Dataset', 'Reader', 'Trainset', 'evaluate', 'print_perf', 'GridSearch', - 'dump', 'KNNWithZScore', 'get_dataset_dir', 'model_selection'] + 'dump', 'KNNWithZScore', 'get_dataset_dir', 'model_selection', + 'Lasso'] __version__ = get_distribution('scikit-surprise').version diff --git a/surprise/prediction_algorithms/__init__.py b/surprise/prediction_algorithms/__init__.py index d5ce8288..1a719a97 100644 --- a/surprise/prediction_algorithms/__init__.py +++ b/surprise/prediction_algorithms/__init__.py @@ -32,6 +32,7 @@ from .matrix_factorization import NMF from .slope_one import SlopeOne from .co_clustering import CoClustering +from .linear import Lasso from .predictions import PredictionImpossible from .predictions import Prediction @@ -39,4 +40,4 @@ __all__ = ['AlgoBase', 'NormalPredictor', 'BaselineOnly', 'KNNBasic', 'KNNBaseline', 'KNNWithMeans', 'SVD', 'SVDpp', 'NMF', 'SlopeOne', 'CoClustering', 'PredictionImpossible', 'Prediction', - 'KNNWithZScore'] + 'KNNWithZScore', 'Lasso'] diff --git a/surprise/prediction_algorithms/linear.py b/surprise/prediction_algorithms/linear.py new file mode 100644 index 00000000..81697f26 --- /dev/null +++ b/surprise/prediction_algorithms/linear.py @@ -0,0 +1,130 @@ +""" +the :mod:`linear` module includes linear features-based algorithms. +""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import numpy as np +from sklearn import linear_model + +from .predictions import PredictionImpossible +from .algo_base import AlgoBase + + +class Lasso(AlgoBase): + """A basic lasso algorithm with user-item interaction terms. + + The prediction :math:`\\hat{r}_{ui}` is set as: + + .. math:: + \hat{r}_{ui} = \alpha_1 + \alpha_2^\top y_u + \alpha_3^\top z_i + + \alpha_4^\top \text{vec}(y_u \otimes z_i) + + where :math:`\alpha_1 \in \mathbb{R}, \alpha_2 \in \mathbb{R}^o, \alpha_3 + \in \mathbb{R}^p` and :math:`\alpha_4 \in \mathbb{R}^{op}` are coefficient + vectors, and :math:`\otimes` represent the Kronecker product of two vectors + (i.e., all possible cross-product combinations). + + Args: + add_interactions(bool): Whether to add user-item interaction terms. + Optional, default is True. + other args: See ``sklearn`` documentation for ``linear_model.Lasso``. + """ + + def __init__(self, add_interactions=True, alpha=1.0, fit_intercept=True, + normalize=False, precompute=False, max_iter=1000, tol=0.0001, + positive=False, random_state=None, selection='cyclic', + **kwargs): + + AlgoBase.__init__(self, **kwargs) + self.add_interactions = add_interactions + self.alpha = alpha + self.fit_intercept = fit_intercept + self.normalize = normalize + self.precompute = precompute + self.max_iter = max_iter + self.tol = tol + self.positive = positive + self.random_state = random_state + self.selection = selection + + def fit(self, trainset): + + AlgoBase.fit(self, trainset) + self.lasso(trainset) + + return self + + def lasso(self, trainset): + + if (self.trainset.n_user_features == 0 or + self.trainset.n_item_features == 0): + raise ValueError('trainset does not contain user and/or item ' + 'features.') + + n_ratings = self.trainset.n_ratings + n_uf = self.trainset.n_user_features + n_if = self.trainset.n_item_features + u_features = self.trainset.u_features + i_features = self.trainset.i_features + uf_labels = self.trainset.user_features_labels + if_labels = self.trainset.item_features_labels + + X = np.empty((n_ratings, n_uf + n_if)) + y = np.empty((n_ratings,)) + for k, (uid, iid, rating) in enumerate(self.trainset.all_ratings()): + y[k] = rating + try: + X[k, :n_uf] = u_features[uid] + except KeyError: + raise ValueError('No features for user ' + + str(self.trainset.to_raw_uid(uid))) + try: + X[k, n_uf:] = i_features[iid] + except KeyError: + raise ValueError('No features for item ' + + str(self.trainset.to_raw_iid(iid))) + + coef_labels = uf_labels + if_labels + if self.add_interactions: + temp = np.array([X[:, v] * X[:, j] for v in range(n_uf) + for j in range(n_uf, n_uf + n_if)]).T + X = np.concatenate([X, temp], axis=1) + temp = [coef_labels[v] + '*' + coef_labels[j] for v in range(n_uf) + for j in range(n_uf, n_uf + n_if)] + coef_labels += temp + + reg = linear_model.Lasso( + alpha=self.alpha, fit_intercept=self.fit_intercept, + normalize=self.normalize, precompute=self.precompute, + max_iter=self.max_iter, tol=self.tol, positive=self.positive, + random_state=self.random_state, selection=self.selection) + reg.fit(X, y) + + self.X = X + self.y = y + self.coef = reg.coef_ + self.coef_labels = coef_labels + self.intercept = reg.intercept_ + + def estimate(self, u, i, u_features, i_features): + + n_uf = self.trainset.n_user_features + n_if = self.trainset.n_item_features + + if u_features is None or len(u_features) != n_uf: + raise PredictionImpossible('User features are missing.') + + if i_features is None or len(i_features) != n_if: + raise PredictionImpossible('Item features are missing.') + + X = np.concatenate([u_features, i_features]) + + if self.add_interactions: + temp = np.array([X[v] * X[j] for v in range(n_uf) + for j in range(n_uf, n_uf + n_if)]) + X = np.concatenate([X, temp]) + + est = self.intercept + np.dot(X, self.coef) + + return est From 7427b2287a80d20aa5c7162ca0b446a1b7f10700 Mon Sep 17 00:00:00 2001 From: martincousi Date: Mon, 9 Apr 2018 10:59:16 -0400 Subject: [PATCH 40/45] Delete linear.py --- surprise/prediction_algorithms/linear.py | 130 ----------------------- 1 file changed, 130 deletions(-) delete mode 100644 surprise/prediction_algorithms/linear.py diff --git a/surprise/prediction_algorithms/linear.py b/surprise/prediction_algorithms/linear.py deleted file mode 100644 index 81697f26..00000000 --- a/surprise/prediction_algorithms/linear.py +++ /dev/null @@ -1,130 +0,0 @@ -""" -the :mod:`linear` module includes linear features-based algorithms. -""" - -from __future__ import (absolute_import, division, print_function, - unicode_literals) -import numpy as np -from sklearn import linear_model - -from .predictions import PredictionImpossible -from .algo_base import AlgoBase - - -class Lasso(AlgoBase): - """A basic lasso algorithm with user-item interaction terms. - - The prediction :math:`\\hat{r}_{ui}` is set as: - - .. math:: - \hat{r}_{ui} = \alpha_1 + \alpha_2^\top y_u + \alpha_3^\top z_i + - \alpha_4^\top \text{vec}(y_u \otimes z_i) - - where :math:`\alpha_1 \in \mathbb{R}, \alpha_2 \in \mathbb{R}^o, \alpha_3 - \in \mathbb{R}^p` and :math:`\alpha_4 \in \mathbb{R}^{op}` are coefficient - vectors, and :math:`\otimes` represent the Kronecker product of two vectors - (i.e., all possible cross-product combinations). - - Args: - add_interactions(bool): Whether to add user-item interaction terms. - Optional, default is True. - other args: See ``sklearn`` documentation for ``linear_model.Lasso``. - """ - - def __init__(self, add_interactions=True, alpha=1.0, fit_intercept=True, - normalize=False, precompute=False, max_iter=1000, tol=0.0001, - positive=False, random_state=None, selection='cyclic', - **kwargs): - - AlgoBase.__init__(self, **kwargs) - self.add_interactions = add_interactions - self.alpha = alpha - self.fit_intercept = fit_intercept - self.normalize = normalize - self.precompute = precompute - self.max_iter = max_iter - self.tol = tol - self.positive = positive - self.random_state = random_state - self.selection = selection - - def fit(self, trainset): - - AlgoBase.fit(self, trainset) - self.lasso(trainset) - - return self - - def lasso(self, trainset): - - if (self.trainset.n_user_features == 0 or - self.trainset.n_item_features == 0): - raise ValueError('trainset does not contain user and/or item ' - 'features.') - - n_ratings = self.trainset.n_ratings - n_uf = self.trainset.n_user_features - n_if = self.trainset.n_item_features - u_features = self.trainset.u_features - i_features = self.trainset.i_features - uf_labels = self.trainset.user_features_labels - if_labels = self.trainset.item_features_labels - - X = np.empty((n_ratings, n_uf + n_if)) - y = np.empty((n_ratings,)) - for k, (uid, iid, rating) in enumerate(self.trainset.all_ratings()): - y[k] = rating - try: - X[k, :n_uf] = u_features[uid] - except KeyError: - raise ValueError('No features for user ' + - str(self.trainset.to_raw_uid(uid))) - try: - X[k, n_uf:] = i_features[iid] - except KeyError: - raise ValueError('No features for item ' + - str(self.trainset.to_raw_iid(iid))) - - coef_labels = uf_labels + if_labels - if self.add_interactions: - temp = np.array([X[:, v] * X[:, j] for v in range(n_uf) - for j in range(n_uf, n_uf + n_if)]).T - X = np.concatenate([X, temp], axis=1) - temp = [coef_labels[v] + '*' + coef_labels[j] for v in range(n_uf) - for j in range(n_uf, n_uf + n_if)] - coef_labels += temp - - reg = linear_model.Lasso( - alpha=self.alpha, fit_intercept=self.fit_intercept, - normalize=self.normalize, precompute=self.precompute, - max_iter=self.max_iter, tol=self.tol, positive=self.positive, - random_state=self.random_state, selection=self.selection) - reg.fit(X, y) - - self.X = X - self.y = y - self.coef = reg.coef_ - self.coef_labels = coef_labels - self.intercept = reg.intercept_ - - def estimate(self, u, i, u_features, i_features): - - n_uf = self.trainset.n_user_features - n_if = self.trainset.n_item_features - - if u_features is None or len(u_features) != n_uf: - raise PredictionImpossible('User features are missing.') - - if i_features is None or len(i_features) != n_if: - raise PredictionImpossible('Item features are missing.') - - X = np.concatenate([u_features, i_features]) - - if self.add_interactions: - temp = np.array([X[v] * X[j] for v in range(n_uf) - for j in range(n_uf, n_uf + n_if)]) - X = np.concatenate([X, temp]) - - est = self.intercept + np.dot(X, self.coef) - - return est From 773bd24c3cc6f255d9eeece19ca6da67f6801b29 Mon Sep 17 00:00:00 2001 From: martincousi Date: Mon, 9 Apr 2018 10:59:46 -0400 Subject: [PATCH 41/45] Update __init__.py --- surprise/prediction_algorithms/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/surprise/prediction_algorithms/__init__.py b/surprise/prediction_algorithms/__init__.py index 1a719a97..d5ce8288 100644 --- a/surprise/prediction_algorithms/__init__.py +++ b/surprise/prediction_algorithms/__init__.py @@ -32,7 +32,6 @@ from .matrix_factorization import NMF from .slope_one import SlopeOne from .co_clustering import CoClustering -from .linear import Lasso from .predictions import PredictionImpossible from .predictions import Prediction @@ -40,4 +39,4 @@ __all__ = ['AlgoBase', 'NormalPredictor', 'BaselineOnly', 'KNNBasic', 'KNNBaseline', 'KNNWithMeans', 'SVD', 'SVDpp', 'NMF', 'SlopeOne', 'CoClustering', 'PredictionImpossible', 'Prediction', - 'KNNWithZScore', 'Lasso'] + 'KNNWithZScore'] From bb0012c620336c6d9fcc957081d96275019e2053 Mon Sep 17 00:00:00 2001 From: martincousi Date: Mon, 9 Apr 2018 11:00:20 -0400 Subject: [PATCH 42/45] Update __init__.py --- surprise/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/surprise/__init__.py b/surprise/__init__.py index 82de2460..e87ca980 100644 --- a/surprise/__init__.py +++ b/surprise/__init__.py @@ -12,7 +12,6 @@ from .prediction_algorithms import NMF from .prediction_algorithms import SlopeOne from .prediction_algorithms import CoClustering -from .prediction_algorithms import Lasso from .prediction_algorithms import PredictionImpossible from .prediction_algorithms import Prediction @@ -31,7 +30,6 @@ 'KNNWithMeans', 'KNNBaseline', 'SVD', 'SVDpp', 'NMF', 'SlopeOne', 'CoClustering', 'PredictionImpossible', 'Prediction', 'Dataset', 'Reader', 'Trainset', 'evaluate', 'print_perf', 'GridSearch', - 'dump', 'KNNWithZScore', 'get_dataset_dir', 'model_selection', - 'Lasso'] + 'dump', 'KNNWithZScore', 'get_dataset_dir', 'model_selection'] __version__ = get_distribution('scikit-surprise').version From c76a51a4afa3f9d23479c20f03106817f2ad135c Mon Sep 17 00:00:00 2001 From: martincousi Date: Fri, 13 Apr 2018 09:13:11 -0400 Subject: [PATCH 43/45] remove features from Prediction object + typos --- surprise/prediction_algorithms/algo_base.py | 2 +- surprise/prediction_algorithms/knns.py | 7 +++---- surprise/prediction_algorithms/predictions.py | 13 +------------ 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/surprise/prediction_algorithms/algo_base.py b/surprise/prediction_algorithms/algo_base.py index 4fdcd8f0..5d4c5e02 100644 --- a/surprise/prediction_algorithms/algo_base.py +++ b/surprise/prediction_algorithms/algo_base.py @@ -171,7 +171,7 @@ def predict(self, uid, iid, u_features=None, i_features=None, r_ui=None, est = min(higher_bound, est) est = max(lower_bound, est) - pred = Prediction(uid, iid, u_features, i_features, r_ui, est, details) + pred = Prediction(uid, iid, r_ui, est, details) if verbose: print(pred) diff --git a/surprise/prediction_algorithms/knns.py b/surprise/prediction_algorithms/knns.py index 301a8168..245a83dc 100644 --- a/surprise/prediction_algorithms/knns.py +++ b/surprise/prediction_algorithms/knns.py @@ -103,7 +103,7 @@ def fit(self, trainset): def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): - raise PredictionImpossible('User and/or item is unkown.') + raise PredictionImpossible('User and/or item is unknown.') x, y = self.switch(u, i) @@ -186,7 +186,7 @@ def fit(self, trainset): def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): - raise PredictionImpossible('User and/or item is unkown.') + raise PredictionImpossible('User and/or item is unknown.') x, y = self.switch(u, i) @@ -259,7 +259,6 @@ class KNNBaseline(SymmetricAlgo): accepted options. verbose(bool): Whether to print trace messages of bias estimation, similarity, etc. Default is True. - """ def __init__(self, k=40, min_k=1, sim_options={}, bsl_options={}, @@ -386,7 +385,7 @@ def fit(self, trainset): def estimate(self, u, i, *_): if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)): - raise PredictionImpossible('User and/or item is unkown.') + raise PredictionImpossible('User and/or item is unknown.') x, y = self.switch(u, i) diff --git a/surprise/prediction_algorithms/predictions.py b/surprise/prediction_algorithms/predictions.py index 9e971ee9..76bc8ddc 100644 --- a/surprise/prediction_algorithms/predictions.py +++ b/surprise/prediction_algorithms/predictions.py @@ -21,8 +21,7 @@ class PredictionImpossible(Exception): class Prediction(namedtuple('Prediction', - ['uid', 'iid', 'u_features', 'i_features', 'r_ui', - 'est', 'details'])): + ['uid', 'iid', 'r_ui', 'est', 'details'])): """A named tuple for storing the results of a prediction. It's wrapped in a class, but only for documentation and printing purposes. @@ -30,8 +29,6 @@ class Prediction(namedtuple('Prediction', Args: uid: The (raw) user id. See :ref:`this note`. iid: The (raw) item id. See :ref:`this note`. - u_features: The user features. - i_features: The item features. r_ui(float): The true rating :math:`r_{ui}`. est(float): The estimated rating :math:`\\hat{r}_{ui}`. details (dict): Stores additional details about the prediction that @@ -43,14 +40,6 @@ class Prediction(namedtuple('Prediction', def __str__(self): s = 'user: {uid:<10} '.format(uid=self.uid) s += 'item: {iid:<10} '.format(iid=self.iid) - if self.u_features is not None: - pass - else: - s += 'u_features = None ' - if self.i_features is not None: - pass - else: - s += 'i_features = None ' if self.r_ui is not None: s += 'r_ui = {r_ui:1.2f} '.format(r_ui=self.r_ui) else: From c34a817d1ee71689d688cf7f787e06d062de5f78 Mon Sep 17 00:00:00 2001 From: martincousi Date: Fri, 13 Apr 2018 09:40:44 -0400 Subject: [PATCH 44/45] correct accuracy --- surprise/accuracy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/surprise/accuracy.py b/surprise/accuracy.py index 0bfcd1af..1e9e4855 100644 --- a/surprise/accuracy.py +++ b/surprise/accuracy.py @@ -45,7 +45,7 @@ def rmse(predictions, verbose=True): raise ValueError('Prediction list is empty.') mse = np.mean([float((true_r - est)**2) - for (_, _, _, _, true_r, est, _) in predictions]) + for (_, _, true_r, est, _) in predictions]) rmse_ = np.sqrt(mse) if verbose: @@ -80,7 +80,7 @@ def mae(predictions, verbose=True): raise ValueError('Prediction list is empty.') mae_ = np.mean([float(abs(true_r - est)) - for (_, _, _, _, true_r, est, _) in predictions]) + for (_, _, true_r, est, _) in predictions]) if verbose: print('MAE: {0:1.4f}'.format(mae_)) @@ -117,7 +117,7 @@ def fcp(predictions, verbose=True): nc_u = defaultdict(int) nd_u = defaultdict(int) - for u0, _, _, _, r0, est, _ in predictions: + for u0, _, r0, est, _ in predictions: predictions_u[u0].append((r0, est)) for u0, preds in iteritems(predictions_u): From fec4d4f6500d1640cd09910d3f4ee28acd1372af Mon Sep 17 00:00:00 2001 From: martincousi Date: Fri, 13 Apr 2018 11:01:32 -0400 Subject: [PATCH 45/45] Correct tests --- tests/test_dataset.py | 22 +++++++++++----------- tests/test_split.py | 2 +- tests/test_train2fit.py | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 69311404..2526d20c 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -145,12 +145,12 @@ def test_trainset_testset(): for i in range(4): assert trainset.to_inner_uid('user' + str(i)) == i with pytest.raises(ValueError): - trainset.to_inner_uid('unkown_user') + trainset.to_inner_uid('unknown_user') for i in range(2): assert trainset.to_inner_iid('item' + str(i)) == i with pytest.raises(ValueError): - trainset.to_inner_iid('unkown_item') + trainset.to_inner_iid('unknown_item') # test inner2raw assert trainset._inner2raw_id_users is None @@ -167,19 +167,19 @@ def test_trainset_testset(): algo.fit(trainset) testset = trainset.build_testset() algo.test(testset) # ensure an algorithm can manage the data - assert ('user0', 'item0', 4) in testset - assert ('user3', 'item1', 5) in testset - assert ('user3', 'item1', 0) not in testset + assert ('user0', 'item0', None, None, 4) in testset + assert ('user3', 'item1', None, None, 5) in testset + assert ('user3', 'item1', None, None, 0) not in testset # Test the build_anti_testset() method algo = BaselineOnly() algo.fit(trainset) testset = trainset.build_anti_testset() algo.test(testset) # ensure an algorithm can manage the data - assert ('user0', 'item0', trainset.global_mean) not in testset - assert ('user3', 'item1', trainset.global_mean) not in testset - assert ('user0', 'item1', trainset.global_mean) in testset - assert ('user3', 'item0', trainset.global_mean) in testset + assert ('user0', 'item0', None, None, trainset.global_mean) not in testset + assert ('user3', 'item1', None, None, trainset.global_mean) not in testset + assert ('user0', 'item1', None, None, trainset.global_mean) in testset + assert ('user3', 'item0', None, None, trainset.global_mean) in testset def test_load_form_df(): @@ -238,11 +238,11 @@ def test_build_anti_testset(): # fill with some specific value for fillvalue in (0, 42., -1): anti = trainset.build_anti_testset(fill=fillvalue) - for (u, i, r) in anti: + for (u, i, u_f, i_f, r) in anti: assert r == fillvalue # fill with global_mean anti = trainset.build_anti_testset(fill=None) - for (u, i, r) in anti: + for (u, i, u_f, i_f, r) in anti: assert r == trainset.global_mean expect = trainset.n_users * trainset.n_items assert trainset.n_ratings + len(anti) == expect diff --git a/tests/test_split.py b/tests/test_split.py index 0c12cb53..d55eb5ad 100644 --- a/tests/test_split.py +++ b/tests/test_split.py @@ -299,7 +299,7 @@ def test_LeaveOneOut(): # Make sure only one rating per user is present in the testset loo = LeaveOneOut() for _, testset in loo.split(data): - cnt = Counter([uid for (uid, _, _) in testset]) + cnt = Counter([uid for (uid, _, _, _, _) in testset]) assert all(val == 1 for val in itervalues(cnt)) # test the min_n_ratings parameter diff --git a/tests/test_train2fit.py b/tests/test_train2fit.py index ab0634e4..b2993031 100644 --- a/tests/test_train2fit.py +++ b/tests/test_train2fit.py @@ -35,7 +35,7 @@ def fit(self, trainset): self.bu, self.bi = 1, 1 self.cnt += 1 - def estimate(self, u, i): + def estimate(self, u, i, *_): return self.est algo = CustomAlgoFit() @@ -91,7 +91,7 @@ def train(self, trainset): self.bu, self.bi = 1, 1 self.cnt += 1 - def estimate(self, u, i): + def estimate(self, u, i, *_): return self.est with pytest.warns(UserWarning):