-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRankingTree.py
137 lines (112 loc) · 4.35 KB
/
RankingTree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
Implementation of pairwise ranking using scikit-learn LinearSVC
Reference:
"Large Margin Rank Boundaries for Ordinal Regression", R. Herbrich,
T. Graepel, K. Obermayer 1999
"Learning to rank from medical imaging data." Pedregosa, Fabian, et al.,
Machine Learning in Medical Imaging 2012.
Authors: Fabian Pedregosa <[email protected]>
Alexandre Gramfort <[email protected]>
See also https://github.com/fabianp/pysofia for a more efficient implementation
of RankSVM using stochastic gradient descent methdos.
"""
import itertools
import numpy as np
import pdb
from sklearn import svm, linear_model, cross_validation
from sklearn.ensemble import RandomForestClassifier
def transform_pairwise(X, y):
"""Transforms data into pairs with balanced labels for ranking
Transforms a n-class ranking problem into a two-class classification
problem. Subclasses implementing particular strategies for choosing
pairs should override this method.
In this method, all pairs are choosen, except for those that have the
same target value. The output is an array of balanced classes, i.e.
there are the same number of -1 as +1
Parameters
----------
X : array, shape (n_samples, n_features)
The data
y : array, shape (n_samples,) or (n_samples, 2)
Target labels. If it's a 2D array, the second column represents
the grouping of samples, i.e., samples with different groups will
not be considered.
Returns
-------
X_trans : array, shape (k, n_feaures)
Data as pairs
y_trans : array, shape (k,)
Output class labels, where classes have values {-1, +1}
"""
X_new = []
y_new = []
y = np.asarray(y)
if y.ndim == 1:
y = np.c_[y, np.ones(y.shape[0])]
comb = itertools.combinations(range(X.shape[0]), 2)
for k, (i, j) in enumerate(comb):
if y[i, 1] != y[j, 1]:
# skip if same target or different group
continue
try:
X_new.append(X[i] - X[j])
except Exception:
pdb.set_trace()
y_new.append(np.sign(y[i, 0] - y[j, 0]))
# output balanced classes
if y_new[-1] != (-1) ** k:
y_new[-1] = - y_new[-1]
X_new[-1] = - X_new[-1]
return np.asarray(X_new), np.asarray(y_new).ravel()
class RankTree(RandomForestClassifier):
"""Performs pairwise ranking with an underlying LinearSVC model
Input should be a n-class ranking problem, this object will convert it
into a two-class classification problem, a setting known as
`pairwise ranking`.
See object :ref:`svm.LinearSVC` for a full description of parameters.
"""
def fit(self, X, y):
"""
Fit a pairwise ranking model.
Parameters
----------
X : array, shape (n_samples, n_features)
y : array, shape (n_samples,) or (n_samples, 2)
Returns
-------
self
"""
X_trans, y_trans = transform_pairwise(X, y)
super(RankTree, self).fit(X_trans, y_trans)
return self
#def decision_function(self, X):
# return np.dot(X, self.coef_.ravel())
#def predict(self, X):
# """
# Predict an ordering on X. For a list of n samples, this method
# returns a list from 0 to n-1 with the relative order of the rows of X.
# The item is given such that items ranked on top have are
# predicted a higher ordering (i.e. 0 means is the last item
# and n_samples would be the item ranked on top).
# Parameters
# ----------
# X : array, shape (n_samples, n_features)
# Returns
# -------
# ord : array, shape (n_samples,)
# Returns a list of integers representing the relative order of
# the rows in X.
# """
# if hasattr(self, 'coef_'):
# return np.argsort(np.dot(X, self.coef_.ravel()))
# else:
# raise ValueError("Must call fit() prior to predict()")
def predict(self, X, y):
X_trans, y_trans = transform_pairwise(X, y)
return super(RankTree, self).predict(X_trans)
def score(self, X, y):
"""
Because we transformed into a pairwise problem, chance level is at 0.5
"""
X_trans, y_trans = transform_pairwise(X, y)
return np.mean(super(RankTree, self).predict(X_trans) == y_trans)