-
Notifications
You must be signed in to change notification settings - Fork 229
[WIP] New API proposal #85
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 15 commits
3e5fbc3
0cbf1ae
300dada
8615634
a478baa
3744bec
214d991
4f4ce8b
ac00b8b
33561ab
7f40c56
402f397
47a9372
41dc123
5f63f24
fb0d118
df8a340
e3e7e0c
5a9c2e5
cf94740
52f4516
079bb13
da7c8e7
8192d11
2d0f1ca
6c59a1a
b70163a
a12eb9a
b1f6c23
b0ec33b
64f5762
2cf78dd
11a8ff1
a768cbf
335d8f4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,43 @@ | ||
from numpy.linalg import inv, cholesky | ||
from sklearn.base import BaseEstimator, TransformerMixin | ||
from numpy.linalg import cholesky | ||
from sklearn.base import BaseEstimator | ||
from sklearn.utils.validation import check_array | ||
|
||
|
||
class TransformerMixin(object): | ||
"""Mixin class for all transformers in metric-learn. Same as the one in | ||
scikit-learn, but the documentation is changed: this Transformer is | ||
allowed to take as y a non array-like input""" | ||
|
||
def fit_transform(self, X, y=None, **fit_params): | ||
"""Fit to data, then transform it. | ||
|
||
Fits transformer to X and y with optional parameters fit_params | ||
and returns a transformed version of X. | ||
|
||
Parameters | ||
---------- | ||
X : numpy array of shape [n_samples, n_features] | ||
Training set. | ||
|
||
y : numpy array of shape [n_samples] or 4-tuple of arrays | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. with the new API There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, indeed I forgot to update the 4-tuple part y : None or numpy array of shape [n_samples] or [n_constraints]
Target values, or constraints labels However now I am not sure it is a good idea: maybe we should make this class WeaklySupervisedMixin(object):
def fit_transform(self, X_constrained, y=None, **fit_params):
"""
[...]
y : None, or numpy array of shape [n_constraints]
Constraints labels.
"""
class SupervisedMixin(TransformerMixin): |
||
Target values, or constraints (a, b, c, d) indices into X, with | ||
(a, b) specifying similar and (c, d) dissimilar pairs. | ||
|
||
Returns | ||
------- | ||
X_new : numpy array of shape [n_samples, n_features_new] | ||
Transformed array. | ||
|
||
""" | ||
# non-optimized default implementation; override when a better | ||
# method is possible for a given clustering algorithm | ||
if y is None: | ||
# fit method of arity 1 (unsupervised transformation) | ||
return self.fit(X, **fit_params).transform(X) | ||
else: | ||
# fit method of arity 2 (supervised transformation) | ||
return self.fit(X, y, **fit_params).transform(X) | ||
|
||
class BaseMetricLearner(BaseEstimator, TransformerMixin): | ||
def __init__(self): | ||
raise NotImplementedError('BaseMetricLearner should not be instantiated') | ||
|
@@ -49,3 +84,45 @@ def transform(self, X=None): | |
X = check_array(X, accept_sparse=True) | ||
L = self.transformer() | ||
return X.dot(L.T) | ||
|
||
|
||
class SupervisedMixin(object): | ||
|
||
def fit(self, X, y): | ||
return NotImplementedError | ||
|
||
|
||
class UnsupervisedMixin(object): | ||
|
||
def fit(self, X, y=None): | ||
return NotImplementedError | ||
|
||
|
||
class WeaklySupervisedMixin(object): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
|
||
def fit(self, X, constraints, **kwargs): | ||
return self._fit(X, constraints, **kwargs) | ||
|
||
|
||
class PairsMixin(WeaklySupervisedMixin): | ||
|
||
def __init__(self): | ||
raise NotImplementedError('PairsMixin should not be instantiated') | ||
# TODO: introduce specific scoring functions etc | ||
|
||
|
||
class TripletsMixin(WeaklySupervisedMixin): | ||
|
||
def __init__(self): | ||
raise NotImplementedError('TripletsMixin should not be ' | ||
'instantiated') | ||
# TODO: introduce specific scoring functions etc | ||
|
||
|
||
class QuadrupletsMixin(UnsupervisedMixin): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Quadruplets imply weak supervision. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree |
||
|
||
def __init__(self): | ||
raise NotImplementedError('QuadrupletsMixin should not be ' | ||
'instantiated') | ||
# TODO: introduce specific scoring functions etc | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,8 +6,9 @@ | |
import warnings | ||
from six.moves import xrange | ||
from scipy.sparse import coo_matrix | ||
from sklearn.utils import check_array | ||
|
||
__all__ = ['Constraints'] | ||
__all__ = ['Constraints', 'ConstrainedDataset'] | ||
|
||
|
||
class Constraints(object): | ||
|
@@ -18,17 +19,6 @@ def __init__(self, partial_labels): | |
self.known_label_idx, = np.where(partial_labels >= 0) | ||
self.known_labels = partial_labels[self.known_label_idx] | ||
|
||
def adjacency_matrix(self, num_constraints, random_state=np.random): | ||
a, b, c, d = self.positive_negative_pairs(num_constraints, | ||
random_state=random_state) | ||
row = np.concatenate((a, c)) | ||
col = np.concatenate((b, d)) | ||
data = np.ones_like(row, dtype=int) | ||
data[len(a):] = -1 | ||
adj = coo_matrix((data, (row, col)), shape=(self.num_points,)*2) | ||
# symmetrize | ||
return adj + adj.T | ||
|
||
def positive_negative_pairs(self, num_constraints, same_length=False, | ||
random_state=np.random): | ||
a, b = self._pairs(num_constraints, same_label=True, | ||
|
@@ -100,3 +90,88 @@ def random_subset(all_labels, num_preserved=np.inf, random_state=np.random): | |
partial_labels = np.array(all_labels, copy=True) | ||
partial_labels[idx] = -1 | ||
return Constraints(partial_labels) | ||
|
||
|
||
class ConstrainedDataset(object): | ||
|
||
def __init__(self, X, c): | ||
# we convert the data to a suitable format | ||
self.X = check_array(X, accept_sparse=True, dtype=None, warn_on_dtype=True) | ||
self.c = check_array(c, dtype=['int'] + np.sctypes['int'] | ||
+ np.sctypes['uint'], | ||
# we add 'int' at the beginning to tell it is the | ||
# default format we want in case of conversion | ||
ensure_2d=False, ensure_min_samples=False, | ||
ensure_min_features=False, warn_on_dtype=True) | ||
self._check_index(self.X.shape[0], self.c) | ||
self.shape = (len(c) if hasattr(c, '__len__') else 0, self.X.shape[1]) | ||
|
||
def __getitem__(self, item): | ||
return ConstrainedDataset(self.X, self.c[item]) | ||
|
||
def __len__(self): | ||
return self.shape | ||
|
||
def __str__(self): | ||
return self.toarray().__str__() | ||
|
||
def __repr__(self): | ||
return self.toarray().__repr__() | ||
|
||
def toarray(self): | ||
return self.X[self.c] | ||
|
||
@staticmethod | ||
def _check_index(length, indices): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe this could also check for potential duplicates? could simply show a warning when this is the case. (one could also remove them but this might create problems later when constraint labels are used) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree, I will implement it in a next commit |
||
max_index = np.max(indices) | ||
min_index = np.min(indices) | ||
pb_index = None | ||
if max_index >= length: | ||
pb_index = max_index | ||
elif min_index > length + 1: | ||
pb_index = min_index | ||
if pb_index is not None: | ||
raise IndexError("ConstrainedDataset cannot be created: the length of " | ||
"the dataset is {}, so index {} is out of range." | ||
.format(length, pb_index)) | ||
|
||
@staticmethod | ||
def pairs_from_labels(y): | ||
# TODO: to be implemented | ||
raise NotImplementedError | ||
|
||
@staticmethod | ||
def triplets_from_labels(y): | ||
# TODO: to be implemented | ||
raise NotImplementedError | ||
|
||
|
||
def unwrap_pairs(X_constrained, y): | ||
a = X_constrained.c[(y == 0)[:, 0]][:, 0] | ||
b = X_constrained.c[(y == 0)[:, 0]][:, 1] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems redundant. How about: y_zero = (y == 0).ravel()
a, b = X_constrained[y_zero].T
c, d = X_constrained[~y_zero].T There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, will do |
||
c = X_constrained.c[(y == 1)[:, 0]][:, 0] | ||
d = X_constrained.c[(y == 1)[:, 0]][:, 1] | ||
X = X_constrained.X | ||
return X, [a, b, c, d] | ||
|
||
def wrap_pairs(X, constraints): | ||
a = np.array(constraints[0]) | ||
b = np.array(constraints[1]) | ||
c = np.array(constraints[2]) | ||
d = np.array(constraints[3]) | ||
constraints = np.vstack([np.hstack([a[:, None], b[:, None]]), | ||
np.hstack([c[:, None], d[:, None]])]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. a, b, c, d = constraints
constraints = np.vstack((np.column_stack((a, b)), np.column_stack((c, d))))
# or if we have numpy 1.13+
constraints = np.block([[a, b], [c, d]]) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks, will do |
||
y = np.vstack([np.zeros((len(a), 1)), np.ones((len(c), 1))]) | ||
X_constrained = ConstrainedDataset(X, constraints) | ||
return X_constrained, y | ||
|
||
def unwrap_to_graph(X_constrained, y): | ||
|
||
X, [a, b, c, d] = unwrap_pairs(X_constrained, y) | ||
row = np.concatenate((a, c)) | ||
col = np.concatenate((b, d)) | ||
data = np.ones_like(row, dtype=int) | ||
data[len(a):] = -1 | ||
adj = coo_matrix((data, (row, col)), shape=(X_constrained.X.shape[0],) | ||
* 2) | ||
return X_constrained.X, adj + adj.T |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,17 +12,17 @@ | |
import numpy as np | ||
from sklearn.utils.validation import check_array | ||
|
||
from .base_metric import BaseMetricLearner | ||
from .base_metric import BaseMetricLearner, UnsupervisedMixin | ||
|
||
|
||
class Covariance(BaseMetricLearner): | ||
class _Covariance(BaseMetricLearner): | ||
def __init__(self): | ||
pass | ||
|
||
def metric(self): | ||
return self.M_ | ||
|
||
def fit(self, X, y=None): | ||
def _fit(self, X, y=None): | ||
""" | ||
X : data matrix, (n x d) | ||
y : unused | ||
|
@@ -34,3 +34,8 @@ def fit(self, X, y=None): | |
else: | ||
self.M_ = np.linalg.inv(self.M_) | ||
return self | ||
|
||
class Covariance(_Covariance, UnsupervisedMixin): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If nothing else will use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In general, I don't think it's worth separating out the algorithm implementation unless/until we have >1 class that needs it. |
||
|
||
def fit(self, X, y=None): | ||
return self._fit(X, y) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does any class other than
BaseMetricLearner
use this mixin? If not, I'd just inline it intoBaseMetricLearner
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done