-
Notifications
You must be signed in to change notification settings - Fork 229
[WIP] New API proposal #85
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
3e5fbc3
0cbf1ae
300dada
8615634
a478baa
3744bec
214d991
4f4ce8b
ac00b8b
33561ab
7f40c56
402f397
47a9372
41dc123
5f63f24
fb0d118
df8a340
e3e7e0c
5a9c2e5
cf94740
52f4516
079bb13
da7c8e7
8192d11
2d0f1ca
6c59a1a
b70163a
a12eb9a
b1f6c23
b0ec33b
64f5762
2cf78dd
11a8ff1
a768cbf
335d8f4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,8 +6,9 @@ | |
import warnings | ||
from six.moves import xrange | ||
from scipy.sparse import coo_matrix | ||
from sklearn.utils import check_array | ||
|
||
__all__ = ['Constraints'] | ||
__all__ = ['Constraints', 'ConstrainedDataset'] | ||
|
||
|
||
class Constraints(object): | ||
|
@@ -18,17 +19,6 @@ def __init__(self, partial_labels): | |
self.known_label_idx, = np.where(partial_labels >= 0) | ||
self.known_labels = partial_labels[self.known_label_idx] | ||
|
||
def adjacency_matrix(self, num_constraints, random_state=np.random):
    """Build a signed, symmetric sparse matrix from sampled pairs.

    Pairs are drawn with ``self.positive_negative_pairs``. Entries for the
    first pair group ``(a, b)`` are +1 and entries for the second pair group
    ``(c, d)`` are -1. The result is the sum of that matrix and its
    transpose, with shape ``(self.num_points, self.num_points)``.

    Parameters
    ----------
    num_constraints : forwarded unchanged to ``positive_negative_pairs``
    random_state : forwarded unchanged to ``positive_negative_pairs``
    """
    first_a, first_b, second_c, second_d = self.positive_negative_pairs(
        num_constraints, random_state=random_state)
    rows = np.concatenate((first_a, second_c))
    cols = np.concatenate((first_b, second_d))
    # +1 for every entry, then flip the tail (the second pair group) to -1
    signs = np.ones_like(rows, dtype=int)
    signs[len(first_a):] = -1
    n_points = self.num_points
    graph = coo_matrix((signs, (rows, cols)), shape=(n_points, n_points))
    # symmetrize
    return graph + graph.T
|
||
def positive_negative_pairs(self, num_constraints, same_length=False, | ||
random_state=np.random): | ||
a, b = self._pairs(num_constraints, same_label=True, | ||
|
@@ -100,3 +90,88 @@ def random_subset(all_labels, num_preserved=np.inf, random_state=np.random): | |
partial_labels = np.array(all_labels, copy=True) | ||
partial_labels[idx] = -1 | ||
return Constraints(partial_labels) | ||
|
||
|
||
class ConstrainedDataset(object):
    """A data matrix ``X`` together with an array ``c`` of row indices into it.

    ``toarray()`` materialises ``X[c]``. Indexing the dataset indexes ``c``
    only and shares the same ``X``.

    Attributes set by ``__init__``: ``X``, ``c`` and ``shape``, where
    ``shape`` is ``(number of entries in c, number of columns of X)``.
    """

    def __init__(self, X, c):
        """Validate ``X`` and ``c`` and check that ``c`` indexes into ``X``.

        Raises
        ------
        IndexError
            If an entry of ``c`` is outside ``[-X.shape[0], X.shape[0] - 1]``.
        """
        # we convert the data to a suitable format
        # NOTE(review): ``np.sctypes`` and ``warn_on_dtype`` only exist in
        # older NumPy / scikit-learn releases -- confirm the pinned versions.
        self.X = check_array(X, accept_sparse=True, dtype=None,
                             warn_on_dtype=True)
        # we add 'int' at the beginning to tell it is the default format we
        # want in case of conversion
        self.c = check_array(c, dtype=['int'] + np.sctypes['int']
                             + np.sctypes['uint'],
                             ensure_2d=False, ensure_min_samples=False,
                             ensure_min_features=False, warn_on_dtype=True)
        self._check_index(self.X.shape[0], self.c)
        self.shape = (len(c) if hasattr(c, '__len__') else 0, self.X.shape[1])

    def __getitem__(self, item):
        """Return a new dataset over the same ``X`` with ``c[item]``."""
        return ConstrainedDataset(self.X, self.c[item])

    def __len__(self):
        """Return the number of entries, i.e. ``shape[0]``."""
        # ``len()`` requires an integer; returning the whole ``shape`` tuple
        # made ``len(dataset)`` raise TypeError.
        return self.shape[0]

    def __str__(self):
        return str(self.toarray())

    def __repr__(self):
        return repr(self.toarray())

    def toarray(self):
        """Return ``X`` indexed by ``c``."""
        return self.X[self.c]

    @staticmethod
    def _check_index(length, indices):
        """Raise IndexError if any index cannot address a sequence of ``length``.

        Valid indices lie in ``[-length, length - 1]``. An empty ``indices``
        array is accepted, since there is nothing to check.
        """
        # TODO(review): also check for duplicates and warn, as suggested in
        # code review. Removing them silently could cause problems later when
        # constraint labels are used.
        indices = np.asarray(indices)
        if indices.size == 0:
            # np.max / np.min raise ValueError on empty input
            return
        max_index = np.max(indices)
        min_index = np.min(indices)
        pb_index = None
        if max_index >= length:
            pb_index = max_index
        elif min_index < -length:
            # negative indices count from the end, so -length is the lowest
            # valid one
            pb_index = min_index
        if pb_index is not None:
            raise IndexError("ConstrainedDataset cannot be created: the length of "
                             "the dataset is {}, so index {} is out of range."
                             .format(length, pb_index))

    @staticmethod
    def pairs_from_labels(y):
        # TODO: to be implemented
        raise NotImplementedError

    @staticmethod
    def triplets_from_labels(y):
        # TODO: to be implemented
        raise NotImplementedError
|
||
|
||
def unwrap_pairs(constrained_dataset, y):
    """Split a labelled pair dataset into its data and four index arrays.

    Parameters
    ----------
    constrained_dataset : object with attributes ``X`` and ``c``
        ``c`` is indexed as a 2-D array whose columns 0 and 1 hold the two
        members of each pair.
    y : array-like, shape (n_constraints,) or (n_constraints, 1)
        One label per pair. Only the first column is used when ``y`` is 2-D.

    Returns
    -------
    X : ``constrained_dataset.X``
    [a, b, c, d] : list of arrays
        ``(a[i], b[i])`` are the pairs labelled 0 and ``(c[i], d[i])`` the
        pairs labelled 1.
    """
    labels = np.asarray(y)
    if labels.ndim > 1:
        # column-vector labels: keep the first column, as before
        labels = labels[:, 0]
    pairs = constrained_dataset.c
    # compute each mask once instead of once per extracted column
    zero_pairs = pairs[labels == 0]
    one_pairs = pairs[labels == 1]
    a, b = zero_pairs[:, 0], zero_pairs[:, 1]
    c, d = one_pairs[:, 0], one_pairs[:, 1]
    return constrained_dataset.X, [a, b, c, d]
|
||
def wrap_pairs(X, constraints):
    """Pack four index sequences into a ``ConstrainedDataset`` plus labels.

    Parameters
    ----------
    X : data passed unchanged to ``ConstrainedDataset``
    constraints : sequence whose first four items are index sequences
        ``(a, b, c, d)``. ``(a[i], b[i])`` become the rows labelled 0 and
        ``(c[i], d[i])`` the rows labelled 1.

    Returns
    -------
    constrained_dataset : ConstrainedDataset
        Built from ``X`` and the stacked two-column array of pairs.
    y : ndarray, shape (len(a) + len(c), 1)
        Zeros for the ``(a, b)`` rows followed by ones for the ``(c, d)`` rows.
    """
    a, b, c, d = (np.array(constraints[i]) for i in range(4))
    # Code review suggested np.column_stack, or np.block on NumPy 1.13+, as
    # more compact spellings of this stacking.
    zero_rows = np.concatenate((a[:, None], b[:, None]), axis=1)
    one_rows = np.concatenate((c[:, None], d[:, None]), axis=1)
    pair_indices = np.concatenate((zero_rows, one_rows), axis=0)
    y = np.concatenate((np.zeros((len(a), 1)), np.ones((len(c), 1))), axis=0)
    return ConstrainedDataset(X, pair_indices), y
|
||
def unwrap_to_graph(constrained_dataset, y):
    """Turn a labelled pair dataset into ``X`` and a signed sparse matrix.

    Pairs are split with ``unwrap_pairs``. Entries for the first pair group
    are +1 and entries for the second pair group are -1. The matrix is
    square, with one row per row of ``constrained_dataset.X``, and is
    returned as the sum of itself and its transpose.

    Returns
    -------
    X : ``constrained_dataset.X``
    adjacency : sparse matrix
    """
    _, (first_a, first_b, second_c, second_d) = unwrap_pairs(
        constrained_dataset, y)
    points = constrained_dataset.X
    n_points = points.shape[0]
    rows = np.concatenate((first_a, second_c))
    cols = np.concatenate((first_b, second_d))
    # +1 for every entry, then flip the tail (the second pair group) to -1
    signs = np.ones_like(rows, dtype=int)
    signs[len(first_a):] = -1
    graph = coo_matrix((signs, (rows, cols)), shape=(n_points, n_points))
    return points, graph + graph.T
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,10 +12,10 @@ | |
import numpy as np | ||
from sklearn.utils.validation import check_array | ||
|
||
from .base_metric import BaseMetricLearner | ||
from .base_metric import SupervisedMetricLearner | ||
|
||
|
||
class Covariance(BaseMetricLearner): | ||
class Covariance(SupervisedMetricLearner): | ||
Review comment: I understand why this was chosen, but this particular base class made me stop and consider for a moment. We may eventually want a base class for unsupervised methods as well.
Reply: I agree, this simple covariance method should not be a `SupervisedMetricLearner` (as it is completely unsupervised). Whether we will really need an unsupervised class in the long run is unclear, but maybe the best option for now is to create an `UnsupervisedMetricLearner` class whose `fit` takes only `X`.
Reply: Thanks for the review! I agree; I did not notice, but `Covariance` is indeed unsupervised, so I will change this in a following PR. |
||
def __init__(self): | ||
pass | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,12 +19,12 @@ | |
from sklearn.metrics import pairwise_distances | ||
from sklearn.utils.validation import check_array, check_X_y | ||
|
||
from .base_metric import BaseMetricLearner | ||
from .constraints import Constraints | ||
from .base_metric import PairsMetricLearner, SupervisedMetricLearner | ||
from .constraints import Constraints, unwrap_pairs, wrap_pairs | ||
from ._util import vector_norm | ||
|
||
|
||
class ITML(BaseMetricLearner): | ||
class ITML(PairsMetricLearner): | ||
"""Information Theoretic Metric Learning (ITML)""" | ||
def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, | ||
A0=None, verbose=False): | ||
|
@@ -73,19 +73,19 @@ def _process_inputs(self, X, constraints, bounds): | |
self.A_ = check_array(self.A0) | ||
return a,b,c,d | ||
|
||
def fit(self, X, constraints, bounds=None): | ||
def fit(self, constrained_dataset, y, bounds=None): | ||
"""Learn the ITML model. | ||
|
||
Parameters | ||
---------- | ||
X : (n x d) data matrix | ||
each row corresponds to a single instance | ||
constraints : 4-tuple of arrays | ||
(a,b,c,d) indices into X, with (a,b) specifying positive and (c,d) | ||
negative pairs | ||
constrained_dataset : ConstrainedDataset | ||
with constraints being an array of shape [n_constraints, 2] | ||
y : array-like, shape (n x 1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, indeed, will change this |
||
labels of the constraints | ||
bounds : list (pos,neg) pairs, optional | ||
bounds on similarity, s.t. d(X[a],X[b]) < pos and d(X[c],X[d]) > neg | ||
""" | ||
X, constraints = unwrap_pairs(constrained_dataset, y) | ||
a,b,c,d = self._process_inputs(X, constraints, bounds) | ||
gamma = self.gamma | ||
num_pos = len(a) | ||
|
@@ -140,7 +140,7 @@ def metric(self): | |
return self.A_ | ||
|
||
|
||
class ITML_Supervised(ITML): | ||
class ITML_Supervised(ITML, SupervisedMetricLearner): | ||
"""Information Theoretic Metric Learning (ITML)""" | ||
def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, | ||
num_labeled=np.inf, num_constraints=None, bounds=None, A0=None, | ||
|
@@ -195,4 +195,5 @@ def fit(self, X, y, random_state=np.random): | |
random_state=random_state) | ||
pos_neg = c.positive_negative_pairs(num_constraints, | ||
random_state=random_state) | ||
return ITML.fit(self, X, pos_neg, bounds=self.bounds) | ||
constrained_dataset, y = wrap_pairs(X, pos_neg) | ||
return ITML.fit(self, constrained_dataset, y, bounds=self.bounds) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,11 +13,11 @@ | |
from six.moves import xrange | ||
from sklearn.utils.validation import check_array, check_X_y | ||
|
||
from .base_metric import BaseMetricLearner | ||
from .constraints import Constraints | ||
from .base_metric import SupervisedMetricLearner, QuadrupletsMetricLearner | ||
from .constraints import Constraints, ConstrainedDataset | ||
|
||
|
||
class LSML(BaseMetricLearner): | ||
class LSML(QuadrupletsMetricLearner): | ||
def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False): | ||
"""Initialize LSML. | ||
|
||
|
@@ -57,18 +57,23 @@ def _prepare_inputs(self, X, constraints, weights): | |
def metric(self): | ||
return self.M_ | ||
|
||
def fit(self, X, constraints, weights=None): | ||
def fit(self, constrained_dataset, y=None, weights=None): | ||
"""Learn the LSML model. | ||
|
||
Parameters | ||
---------- | ||
X : (n x d) data matrix | ||
each row corresponds to a single instance | ||
constraints : 4-tuple of arrays | ||
(a,b,c,d) indices into X, such that d(X[a],X[b]) < d(X[c],X[d]) | ||
constrained_dataset : ConstrainedDataset | ||
with constraints being an array of shape [n_constraints, 4]. It | ||
should be the concatenation of 4 column vectors a, b, c and d, | ||
such that: ``d(X[a[i]],X[b[i]]) < d(X[c[i]],X[d[i]])`` for every | ||
constraint index ``i``. | ||
y : object | ||
Not used, for scikit-learn compatibility | ||
weights : (m,) array of floats, optional | ||
scale factor for each constraint | ||
""" | ||
X = constrained_dataset.X | ||
constraints = [constrained_dataset.c[:, i].ravel() for i in range(4)] | ||
self._prepare_inputs(X, constraints, weights) | ||
step_sizes = np.logspace(-10, 0, 10) | ||
# Keep track of the best step size and the loss at that step. | ||
|
@@ -131,7 +136,7 @@ def _gradient(self, metric): | |
return dMetric | ||
|
||
|
||
class LSML_Supervised(LSML): | ||
class LSML_Supervised(LSML, SupervisedMetricLearner): | ||
def __init__(self, tol=1e-3, max_iter=1000, prior=None, num_labeled=np.inf, | ||
num_constraints=None, weights=None, verbose=False): | ||
"""Initialize the learner. | ||
|
@@ -181,4 +186,7 @@ def fit(self, X, y, random_state=np.random): | |
random_state=random_state) | ||
pairs = c.positive_negative_pairs(num_constraints, same_length=True, | ||
random_state=random_state) | ||
return LSML.fit(self, X, pairs, weights=self.weights) | ||
constrained_dataset = ConstrainedDataset(X, np.hstack([pairs[i][:, None] | ||
for i in | ||
range(4)])) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Great, thanks ! |
||
return LSML.fit(self, constrained_dataset, weights=self.weights) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,13 +22,14 @@ | |
from sklearn.metrics import pairwise_distances | ||
from sklearn.utils.validation import check_array, check_X_y | ||
|
||
from .base_metric import BaseMetricLearner | ||
from .constraints import Constraints | ||
from .base_metric import PairsMetricLearner, SupervisedMetricLearner | ||
from .constraints import Constraints, ConstrainedDataset, unwrap_pairs, \ | ||
wrap_pairs | ||
Review comment: Style nitpick: use parentheses to break up long import lists:
    from .constraints import (
        Constraints, ConstrainedDataset, unwrap_pairs, wrap_pairs)
Reply: Thanks! |
||
from ._util import vector_norm | ||
|
||
|
||
|
||
class MMC(BaseMetricLearner): | ||
class MMC(PairsMetricLearner): | ||
"""Mahalanobis Metric for Clustering (MMC)""" | ||
def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, | ||
A0=None, diagonal=False, diagonal_c=1.0, verbose=False): | ||
|
@@ -58,17 +59,17 @@ def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, | |
self.diagonal_c = diagonal_c | ||
self.verbose = verbose | ||
|
||
def fit(self, X, constraints): | ||
def fit(self, constrained_dataset, y): | ||
"""Learn the MMC model. | ||
|
||
Parameters | ||
---------- | ||
X : (n x d) data matrix | ||
each row corresponds to a single instance | ||
constraints : 4-tuple of arrays | ||
(a,b,c,d) indices into X, with (a,b) specifying similar and (c,d) | ||
dissimilar pairs | ||
constrained_dataset : ConstrainedDataset | ||
with constraints being an array of shape [n_constraints, 2] | ||
y : array-like, shape (n x 1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same as above There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. will do |
||
labels of the constraints | ||
""" | ||
X, constraints = unwrap_pairs(constrained_dataset, y) | ||
constraints = self._process_inputs(X, constraints) | ||
if self.diagonal: | ||
return self._fit_diag(X, constraints) | ||
|
@@ -380,7 +381,7 @@ def transformer(self): | |
return V.T * np.sqrt(np.maximum(0, w[:,None])) | ||
|
||
|
||
class MMC_Supervised(MMC): | ||
class MMC_Supervised(MMC, SupervisedMetricLearner): | ||
"""Mahalanobis Metric for Clustering (MMC)""" | ||
def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-6, | ||
num_labeled=np.inf, num_constraints=None, | ||
|
@@ -437,4 +438,5 @@ def fit(self, X, y, random_state=np.random): | |
random_state=random_state) | ||
pos_neg = c.positive_negative_pairs(num_constraints, | ||
random_state=random_state) | ||
return MMC.fit(self, X, pos_neg) | ||
constrained_dataset, y = wrap_pairs(X, pos_neg) | ||
return MMC.fit(self, constrained_dataset, y) |
Review comment: Maybe think of a more concise naming convention instead of `constrained_dataset`, as it is going to be used all over the place. `X_constrained`, perhaps?
Reply: I agree, `X_constrained` seems good.