[WIP] New API proposal #85

Status: Closed. Wants to merge 35 commits.

Changes shown are from 8 of the 35 commits.

Commits:
3e5fbc3  Add new class structure (Feb 26, 2018)
0cbf1ae  Put back TransformerMixin in BaseEstimator to inherit Transformer beh… (Feb 26, 2018)
300dada  add ConstrainedDataset object (Feb 27, 2018)
8615634  simplify constraints to always keep a view on X (Feb 28, 2018)
a478baa  add check for input formats (Mar 2, 2018)
3744bec  add basic testing to ConstrainedDataset (Mar 2, 2018)
214d991  correct asterisk bug (Mar 2, 2018)
4f4ce8b  begin work to dissociate classes (Mar 5, 2018)
ac00b8b  update MMC with constrained_dataset (Mar 5, 2018)
33561ab  Fixes according to review https://github.com/metric-learn/metric-lear… (Mar 6, 2018)
7f40c56  make mixins rather than classes hierarchy for inheriting special methods (Mar 6, 2018)
402f397  Merge branch 'new_api' into feat/class_dissociation (Mar 6, 2018)
47a9372  Make changes according to review https://github.com/metric-learn/metr… (Mar 13, 2018)
41dc123  Finalize class dissociation into mixins (Mar 6, 2018)
5f63f24  Merge branch 'feat/class_dissociation' into new_api (Mar 19, 2018)
fb0d118  separate valid and invalid input testing (Mar 20, 2018)
df8a340  correct too long line syntax (Mar 20, 2018)
e3e7e0c  clarify definition of variables in tests (Mar 20, 2018)
5a9c2e5  simplify unwrap pairs and make it more robust to y dimension (Mar 20, 2018)
cf94740  fix bug due to bad refactoring of c_shape (Mar 20, 2018)
52f4516  simplify wrap pairs (Mar 20, 2018)
079bb13  make QuadrupletsMixin inherit from WeaklySupervisedMixin (Mar 21, 2018)
da7c8e7  add NotImplementedError for abstract mixins (Mar 21, 2018)
8192d11  put TransformerMixin inline (Mar 21, 2018)
2d0f1ca  put random state at top of file (Mar 21, 2018)
6c59a1a  add transform, predict, decision_function, and scoring for weakly sup… (Mar 6, 2018)
b70163a  Add tests (Mar 19, 2018)
a12eb9a  Add documentation (Mar 23, 2018)
b1f6c23  fix typo or/of (Mar 30, 2018)
b0ec33b  Add tests for sparse matrices, dataframes and lists (Apr 12, 2018)
64f5762  Fix Transformer interface (cf. review https://github.com/metric-learn… (Apr 12, 2018)
2cf78dd  Do not separate classes if not needed (cf. https://github.com/metric-… (Apr 12, 2018)
11a8ff1  Fix ascii invisible character (Apr 12, 2018)
a768cbf  Fix test attribute error and numerical problems with new dataset (Apr 12, 2018)
335d8f4  Fix unittest hierarchy of classes (Apr 12, 2018)
34 changes: 34 additions & 0 deletions metric_learn/base_metric.py
@@ -49,3 +49,37 @@ def transform(self, X=None):
X = check_array(X, accept_sparse=True)
L = self.transformer()
return X.dot(L.T)


class SupervisedMetricLearner(BaseMetricLearner):

def fit(self, X, y):
raise NotImplementedError


class WeaklySupervisedMetricLearner(BaseMetricLearner):

def fit(self, constrained_dataset, y):
[Member] maybe think of a more concise naming convention instead of constrained_dataset as it is gonna be used all over the place. X_constrained perhaps?

[Member Author] I agree, X_constrained seems good

raise NotImplementedError


class PairsMetricLearner(WeaklySupervisedMetricLearner):

def __init__(self):
raise NotImplementedError('PairsMetricLearner should not be instantiated')
# TODO: introduce specific scoring functions etc


class TripletsMetricLearner(WeaklySupervisedMetricLearner):

def __init__(self):
raise NotImplementedError('TripletsMetricLearner should not be '
'instantiated')
# TODO: introduce specific scoring functions etc

class QuadrupletsMetricLearner(WeaklySupervisedMetricLearner):

def __init__(self):
raise NotImplementedError('QuadrupletsMetricLearner should not be '
'instantiated')
# TODO: introduce specific scoring functions etc
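
To make the intended extension pattern concrete, here is a minimal sketch (not part of this diff) of how a concrete pairs-based learner would plug into the proposed hierarchy; MyPairsLearner and its identity transformer are hypothetical:

import numpy as np

class MyPairsLearner(PairsMetricLearner):

  def __init__(self):
    pass  # override the abstract __init__, which raises NotImplementedError

  def fit(self, constrained_dataset, y):
    # a real learner would estimate a metric from the labeled pairs; here
    # we just store an identity transformer for illustration
    self.L_ = np.eye(constrained_dataset.X.shape[1])
    return self

  def transformer(self):
    return self.L_  # used by BaseMetricLearner.transform via X.dot(L.T)
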
99 changes: 87 additions & 12 deletions metric_learn/constraints.py
@@ -6,8 +6,9 @@
import warnings
from six.moves import xrange
from scipy.sparse import coo_matrix
from sklearn.utils import check_array

__all__ = ['Constraints']
__all__ = ['Constraints', 'ConstrainedDataset']


class Constraints(object):
@@ -18,17 +19,6 @@ def __init__(self, partial_labels):
self.known_label_idx, = np.where(partial_labels >= 0)
self.known_labels = partial_labels[self.known_label_idx]

def adjacency_matrix(self, num_constraints, random_state=np.random):
a, b, c, d = self.positive_negative_pairs(num_constraints,
random_state=random_state)
row = np.concatenate((a, c))
col = np.concatenate((b, d))
data = np.ones_like(row, dtype=int)
data[len(a):] = -1
adj = coo_matrix((data, (row, col)), shape=(self.num_points,)*2)
# symmetrize
return adj + adj.T

def positive_negative_pairs(self, num_constraints, same_length=False,
random_state=np.random):
a, b = self._pairs(num_constraints, same_label=True,
@@ -100,3 +90,88 @@ def random_subset(all_labels, num_preserved=np.inf, random_state=np.random):
partial_labels = np.array(all_labels, copy=True)
partial_labels[idx] = -1
return Constraints(partial_labels)


class ConstrainedDataset(object):

def __init__(self, X, c):
# we convert the data to a suitable format
self.X = check_array(X, accept_sparse=True, dtype=None, warn_on_dtype=True)
self.c = check_array(c, dtype=['int'] + np.sctypes['int']
+ np.sctypes['uint'],
# we add 'int' at the beginning to tell it is the
# default format we want in case of conversion
ensure_2d=False, ensure_min_samples=False,
ensure_min_features=False, warn_on_dtype=True)
self._check_index(self.X.shape[0], self.c)
self.shape = (len(c) if hasattr(c, '__len__') else 0, self.X.shape[1])

def __getitem__(self, item):
return ConstrainedDataset(self.X, self.c[item])

def __len__(self):
return self.shape[0]  # len() must return an int: the number of constraints

def __str__(self):
return self.toarray().__str__()

def __repr__(self):
return self.toarray().__repr__()

def toarray(self):
return self.X[self.c]

@staticmethod
def _check_index(length, indices):
[Member] maybe this could also check for potential duplicates? could simply show a warning when this is the case. (one could also remove them but this might create problems later when constraint labels are used)

[Member Author (@wdevazelhes, Mar 5, 2018)] I agree, I will implement it in a next commit

max_index = np.max(indices)
min_index = np.min(indices)
pb_index = None
if max_index >= length:
pb_index = max_index
elif min_index < -length:  # out of range even for negative indexing
pb_index = min_index
if pb_index is not None:
raise IndexError("ConstrainedDataset cannot be created: the length of "
"the dataset is {}, so index {} is out of range."
.format(length, pb_index))

@staticmethod
def pairs_from_labels(y):
# TODO: to be implemented
raise NotImplementedError

@staticmethod
def triplets_from_labels(y):
# TODO: to be implemented
raise NotImplementedError


def unwrap_pairs(constrained_dataset, y):
a = constrained_dataset.c[(y == 0)[:, 0]][:, 0]
b = constrained_dataset.c[(y == 0)[:, 0]][:, 1]
c = constrained_dataset.c[(y == 1)[:, 0]][:, 0]
d = constrained_dataset.c[(y == 1)[:, 0]][:, 1]
X = constrained_dataset.X
return X, [a, b, c, d]

def wrap_pairs(X, constraints):
a = np.array(constraints[0])
b = np.array(constraints[1])
c = np.array(constraints[2])
d = np.array(constraints[3])
constraints = np.vstack([np.hstack([a[:, None], b[:, None]]),
np.hstack([c[:, None], d[:, None]])])
[Contributor]

a, b, c, d = constraints
constraints = np.vstack((np.column_stack((a, b)), np.column_stack((c, d))))
# or if we have numpy 1.13+
constraints = np.block([[a, b], [c, d]])

[Member Author] thanks, will do

y = np.vstack([np.zeros((len(a), 1)), np.ones((len(c), 1))])
constrained_dataset = ConstrainedDataset(X, constraints)
return constrained_dataset, y

def unwrap_to_graph(constrained_dataset, y):

X, [a, b, c, d] = unwrap_pairs(constrained_dataset, y)
row = np.concatenate((a, c))
col = np.concatenate((b, d))
data = np.ones_like(row, dtype=int)
data[len(a):] = -1
adj = coo_matrix((data, (row, col)), shape=(constrained_dataset.X.shape[0],)
* 2)
return constrained_dataset.X, adj + adj.T
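
As an illustration of the new data flow (a sketch, assuming these helpers are importable from metric_learn.constraints), wrap_pairs and unwrap_pairs round-trip between raw (a, b, c, d) index arrays and a ConstrainedDataset:

import numpy as np
from metric_learn.constraints import wrap_pairs, unwrap_pairs

X = np.random.randn(10, 3)  # 10 points in 3 dimensions
a, b = [0, 1], [2, 3]       # similar pairs (X[a[i]] close to X[b[i]])
c, d = [4, 5], [6, 7]       # dissimilar pairs

X_constrained, y = wrap_pairs(X, [a, b, c, d])
print(X_constrained.shape)            # (4, 3): 4 constraints, 3 features
print(X_constrained.toarray().shape)  # (4, 2, 3): each constraint is a pair
X_back, (a2, b2, c2, d2) = unwrap_pairs(X_constrained, y)
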
4 changes: 2 additions & 2 deletions metric_learn/covariance.py
@@ -12,10 +12,10 @@
import numpy as np
from sklearn.utils.validation import check_array

from .base_metric import BaseMetricLearner
from .base_metric import SupervisedMetricLearner


class Covariance(BaseMetricLearner):
class Covariance(SupervisedMetricLearner):
[Contributor] I understand why this was chosen, but this particular base class made me stop and consider a moment. We may eventually want a base class for unsupervised methods as well.

[Member] I agree, this simple covariance method should not be a SupervisedMetricLearner (as it is completely unsupervised). Whether we will really need an unsupervised class in the long run is unclear, but maybe the best for now is to create an UnsupervisedMetricLearner class which takes only X in fit.

[Member Author] Thanks for the review! I agree, I did not notice, but indeed Covariance is unsupervised, so I will change this in a following PR.

def __init__(self):
pass

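For reference, a minimal sketch of the UnsupervisedMetricLearner class suggested in the thread above (hypothetical, not part of this PR), following the pattern of the other base classes in base_metric.py:

class UnsupervisedMetricLearner(BaseMetricLearner):

  def fit(self, X):
    raise NotImplementedError
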
23 changes: 12 additions & 11 deletions metric_learn/itml.py
@@ -19,12 +19,12 @@
from sklearn.metrics import pairwise_distances
from sklearn.utils.validation import check_array, check_X_y

from .base_metric import BaseMetricLearner
from .constraints import Constraints
from .base_metric import PairsMetricLearner, SupervisedMetricLearner
from .constraints import Constraints, unwrap_pairs, wrap_pairs
from ._util import vector_norm


class ITML(BaseMetricLearner):
class ITML(PairsMetricLearner):
"""Information Theoretic Metric Learning (ITML)"""
def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3,
A0=None, verbose=False):
@@ -73,19 +73,19 @@ def _process_inputs(self, X, constraints, bounds):
self.A_ = check_array(self.A0)
return a,b,c,d

def fit(self, X, constraints, bounds=None):
def fit(self, constrained_dataset, y, bounds=None):
"""Learn the ITML model.

Parameters
----------
X : (n x d) data matrix
each row corresponds to a single instance
constraints : 4-tuple of arrays
(a,b,c,d) indices into X, with (a,b) specifying positive and (c,d)
negative pairs
constrained_dataset : ConstrainedDataset
with constraints being an array of shape [n_constraints, 2]
y : array-like, shape (n x 1)
[Member] should be n_constraints instead of n

[Member Author] Yes, indeed, will change this

labels of the constraints
bounds : list (pos,neg) pairs, optional
bounds on similarity, s.t. d(X[a],X[b]) < pos and d(X[c],X[d]) > neg
"""
X, constraints = unwrap_pairs(constrained_dataset, y)
a,b,c,d = self._process_inputs(X, constraints, bounds)
gamma = self.gamma
num_pos = len(a)
@@ -140,7 +140,7 @@ def metric(self):
return self.A_


class ITML_Supervised(ITML):
class ITML_Supervised(ITML, SupervisedMetricLearner):
"""Information Theoretic Metric Learning (ITML)"""
def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3,
num_labeled=np.inf, num_constraints=None, bounds=None, A0=None,
@@ -195,4 +195,5 @@ def fit(self, X, y, random_state=np.random):
random_state=random_state)
pos_neg = c.positive_negative_pairs(num_constraints,
random_state=random_state)
return ITML.fit(self, X, pos_neg, bounds=self.bounds)
constrained_dataset, y = wrap_pairs(X, pos_neg)
return ITML.fit(self, constrained_dataset, y, bounds=self.bounds)
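
With this change, pairs are passed to ITML as a ConstrainedDataset plus a label vector instead of a 4-tuple of index arrays. A sketch of the new call on toy data (y uses 0 for similar and 1 for dissimilar pairs, matching wrap_pairs):

import numpy as np
from metric_learn import ITML
from metric_learn.constraints import ConstrainedDataset

X = np.random.randn(20, 5)
pairs = np.array([[0, 1], [2, 3], [4, 5], [6, 7]])  # shape (n_constraints, 2)
y = np.array([[0], [0], [1], [1]])  # 0: similar pair, 1: dissimilar pair

itml = ITML()
itml.fit(ConstrainedDataset(X, pairs), y)
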
4 changes: 2 additions & 2 deletions metric_learn/lfda.py
@@ -18,10 +18,10 @@
from sklearn.metrics import pairwise_distances
from sklearn.utils.validation import check_X_y

from .base_metric import BaseMetricLearner
from .base_metric import SupervisedMetricLearner


class LFDA(BaseMetricLearner):
class LFDA(SupervisedMetricLearner):
'''
Local Fisher Discriminant Analysis for Supervised Dimensionality Reduction
Sugiyama, ICML 2006
4 changes: 2 additions & 2 deletions metric_learn/lmnn.py
@@ -17,11 +17,11 @@
from sklearn.utils.validation import check_X_y, check_array
from sklearn.metrics import euclidean_distances

from .base_metric import BaseMetricLearner
from .base_metric import SupervisedMetricLearner


# commonality between LMNN implementations
class _base_LMNN(BaseMetricLearner):
class _base_LMNN(SupervisedMetricLearner):
def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7,
regularization=0.5, convergence_tol=0.001, use_pca=True,
verbose=False):
28 changes: 18 additions & 10 deletions metric_learn/lsml.py
@@ -13,11 +13,11 @@
from six.moves import xrange
from sklearn.utils.validation import check_array, check_X_y

from .base_metric import BaseMetricLearner
from .constraints import Constraints
from .base_metric import SupervisedMetricLearner, QuadrupletsMetricLearner
from .constraints import Constraints, ConstrainedDataset


class LSML(BaseMetricLearner):
class LSML(QuadrupletsMetricLearner):
def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False):
"""Initialize LSML.

@@ -57,18 +57,23 @@ def _prepare_inputs(self, X, constraints, weights):
def metric(self):
return self.M_

def fit(self, X, constraints, weights=None):
def fit(self, constrained_dataset, y=None, weights=None):
"""Learn the LSML model.

Parameters
----------
X : (n x d) data matrix
each row corresponds to a single instance
constraints : 4-tuple of arrays
(a,b,c,d) indices into X, such that d(X[a],X[b]) < d(X[c],X[d])
constrained_dataset : ConstrainedDataset
with constraints being an array of shape [n_constraints, 4]. It
should be the concatenation of 4 column vectors a, b, c and d,
such that: ``d(X[a[i]],X[b[i]]) < d(X[c[i]],X[d[i]])`` for every
constraint index ``i``.
y : object
Not used, for scikit-learn compatibility
weights : (m,) array of floats, optional
scale factor for each constraint
"""
X = constrained_dataset.X
constraints = [constrained_dataset.c[:, i].ravel() for i in range(4)]
self._prepare_inputs(X, constraints, weights)
step_sizes = np.logspace(-10, 0, 10)
# Keep track of the best step size and the loss at that step.
@@ -131,7 +136,7 @@ def _gradient(self, metric):
return dMetric


class LSML_Supervised(LSML):
class LSML_Supervised(LSML, SupervisedMetricLearner):
def __init__(self, tol=1e-3, max_iter=1000, prior=None, num_labeled=np.inf,
num_constraints=None, weights=None, verbose=False):
"""Initialize the learner.
@@ -181,4 +186,7 @@ def fit(self, X, y, random_state=np.random):
random_state=random_state)
pairs = c.positive_negative_pairs(num_constraints, same_length=True,
random_state=random_state)
return LSML.fit(self, X, pairs, weights=self.weights)
constrained_dataset = ConstrainedDataset(X, np.hstack([pairs[i][:, None]
for i in
range(4)]))
[Contributor] np.column_stack(pairs) seems to be what you want here.

[Member Author] Great, thanks!
return LSML.fit(self, constrained_dataset, weights=self.weights)
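
A quick check of the reviewer's suggestion (illustrative only): for a 4-tuple of 1-d index arrays, np.column_stack(pairs) produces the same (n_constraints, 4) array as the hstack of column vectors above:

import numpy as np

pairs = tuple(np.random.randint(0, 10, size=5) for _ in range(4))
stacked = np.hstack([pairs[i][:, None] for i in range(4)])
assert np.array_equal(stacked, np.column_stack(pairs))  # both are (5, 4)
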
4 changes: 2 additions & 2 deletions metric_learn/mlkr.py
@@ -13,12 +13,12 @@
from sklearn.decomposition import PCA
from sklearn.utils.validation import check_X_y

from .base_metric import BaseMetricLearner
from .base_metric import SupervisedMetricLearner

EPS = np.finfo(float).eps


class MLKR(BaseMetricLearner):
class MLKR(SupervisedMetricLearner):
"""Metric Learning for Kernel Regression (MLKR)"""
def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001,
max_iter=1000):
24 changes: 13 additions & 11 deletions metric_learn/mmc.py
@@ -22,13 +22,14 @@
from sklearn.metrics import pairwise_distances
from sklearn.utils.validation import check_array, check_X_y

from .base_metric import BaseMetricLearner
from .constraints import Constraints
from .base_metric import PairsMetricLearner, SupervisedMetricLearner
from .constraints import Constraints, ConstrainedDataset, unwrap_pairs, \
wrap_pairs
[Contributor] Style nitpick: Use parens to break up long import lists:

from .constraints import (
    Constraints, ConstrainedDataset, unwrap_pairs, wrap_pairs)

[Member Author] Thanks!

from ._util import vector_norm



class MMC(BaseMetricLearner):
class MMC(PairsMetricLearner):
"""Mahalanobis Metric for Clustering (MMC)"""
def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3,
A0=None, diagonal=False, diagonal_c=1.0, verbose=False):
@@ -58,17 +59,17 @@ def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3,
self.diagonal_c = diagonal_c
self.verbose = verbose

def fit(self, X, constraints):
def fit(self, constrained_dataset, y):
"""Learn the MMC model.

Parameters
----------
X : (n x d) data matrix
each row corresponds to a single instance
constraints : 4-tuple of arrays
(a,b,c,d) indices into X, with (a,b) specifying similar and (c,d)
dissimilar pairs
constrained_dataset : ConstrainedDataset
with constraints being an array of shape [n_constraints, 2]
y : array-like, shape (n x 1)
[Member] same as above

[Member Author] will do

labels of the constraints
"""
X, constraints = unwrap_pairs(constrained_dataset, y)
constraints = self._process_inputs(X, constraints)
if self.diagonal:
return self._fit_diag(X, constraints)
@@ -380,7 +381,7 @@ def transformer(self):
return V.T * np.sqrt(np.maximum(0, w[:,None]))


class MMC_Supervised(MMC):
class MMC_Supervised(MMC, SupervisedMetricLearner):
"""Mahalanobis Metric for Clustering (MMC)"""
def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-6,
num_labeled=np.inf, num_constraints=None,
@@ -437,4 +438,5 @@ def fit(self, X, y, random_state=np.random):
random_state=random_state)
pos_neg = c.positive_negative_pairs(num_constraints,
random_state=random_state)
return MMC.fit(self, X, pos_neg)
constrained_dataset, y = wrap_pairs(X, pos_neg)
return MMC.fit(self, constrained_dataset, y)
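
Note that the *_Supervised wrappers keep the plain (X, y) interface: label-based constraints are generated internally and wrapped with wrap_pairs before delegating to the weakly-supervised fit. A small sketch on toy data (here y holds class labels, not constraint labels):

import numpy as np
from metric_learn import MMC_Supervised

X = np.vstack([np.random.randn(10, 2) + 3., np.random.randn(10, 2)])
y = np.array([0] * 10 + [1] * 10)

mmc = MMC_Supervised(num_constraints=20)
mmc.fit(X, y)  # internally: positive_negative_pairs -> wrap_pairs -> MMC.fit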