Skip to content

[WIP] New API proposal #85

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 35 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
3e5fbc3
Add new class structure
Feb 26, 2018
0cbf1ae
Put back TransformerMixin in BaseEstimator to inherit Transformer beh…
Feb 26, 2018
300dada
add ConstrainedDataset object
Feb 27, 2018
8615634
simplify constraints to always keep a view on X
Feb 28, 2018
a478baa
add check for input formats
Mar 2, 2018
3744bec
add basic testing to ConstrainedDataset
Mar 2, 2018
214d991
correct asterisk bug
Mar 2, 2018
4f4ce8b
begin work to dissociate classes
Mar 5, 2018
ac00b8b
update MMC with constrained_dataset
Mar 5, 2018
33561ab
Fixes according to review https://github.com/metric-learn/metric-lear…
Mar 6, 2018
7f40c56
make mixins rather than classes hierarchy for inheriting special methods
Mar 6, 2018
402f397
Merge branch 'new_api' into feat/class_dissociation
Mar 6, 2018
47a9372
Make changes according to review https://github.com/metric-learn/metr…
Mar 13, 2018
41dc123
Finalize class dissociation into mixins
Mar 6, 2018
5f63f24
Merge branch 'feat/class_dissociation' into new_api
Mar 19, 2018
fb0d118
separate valid and invalid input testing
Mar 20, 2018
df8a340
correct too long line syntax
Mar 20, 2018
e3e7e0c
clarify definition of variables in tests
Mar 20, 2018
5a9c2e5
simplify unwrap pairs and make it more robust to y dimension
Mar 20, 2018
cf94740
fix bug due to bad refactoring of c_shape
Mar 20, 2018
52f4516
simplify wrap pairs
Mar 20, 2018
079bb13
make QuadrupletsMixin inherit from WeaklySupervisedMixin
Mar 21, 2018
da7c8e7
add NotImplementedError for abstract mixins
Mar 21, 2018
8192d11
put TransformerMixin inline
Mar 21, 2018
2d0f1ca
put random state at top of file
Mar 21, 2018
6c59a1a
add transform, predict, decision_function, and scoring for weakly sup…
Mar 6, 2018
b70163a
Add tests
Mar 19, 2018
a12eb9a
Add documentation
Mar 23, 2018
b1f6c23
fix typo or/of
Mar 30, 2018
b0ec33b
Add tests for sparse matrices, dataframes and lists
Apr 12, 2018
64f5762
Fix Transformer interface (cf. review https://github.com/metric-learn…
Apr 12, 2018
2cf78dd
Do not separate classes if not needed (cf. https://github.com/metric-…
Apr 12, 2018
11a8ff1
Fix ascii invisible character
Apr 12, 2018
a768cbf
Fix test attribute error and numerical problems with new dataset
Apr 12, 2018
335d8f4
Fix unittest hierarchy of classes
Apr 12, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion metric_learn/base_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def fit(self, X, y):

class WeaklySupervisedMetricLearner(BaseMetricLearner):

def fit(self, X, constraints):
def fit(self, constrained_dataset, y):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe think of a more concise naming convention instead of constrained_dataset, as it is going to be used all over the place. X_constrained, perhaps?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, X_constrained seems good

return NotImplementedError


Expand All @@ -77,3 +77,9 @@ def __init__(self):
'instantiated')
# TODO: introduce specific scoring functions etc

class QuadrupletsMetricLearner(WeaklySupervisedMetricLearner):
    """Base class that is not meant to be instantiated directly.

    Its ``__init__`` unconditionally raises ``NotImplementedError``; a
    subclass must define its own ``__init__`` to be constructible.
    """

    # TODO: introduce specific scoring functions etc

    def __init__(self):
        message = ('QuadrupletsMetricLearner should not be '
                   'instantiated')
        raise NotImplementedError(message)
42 changes: 31 additions & 11 deletions metric_learn/constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,6 @@ def __init__(self, partial_labels):
self.known_label_idx, = np.where(partial_labels >= 0)
self.known_labels = partial_labels[self.known_label_idx]

def adjacency_matrix(self, num_constraints, random_state=np.random):
    """Return a symmetric signed adjacency matrix built from sampled pairs.

    Pairs are obtained from ``self.positive_negative_pairs``. Entries for
    the first pair group (a, b) are +1 and entries for the second group
    (c, d) are -1. The matrix has shape ``(self.num_points,
    self.num_points)`` and is symmetrized by adding its transpose.
    """
    pos_i, pos_j, neg_i, neg_j = self.positive_negative_pairs(
        num_constraints, random_state=random_state)
    rows = np.concatenate((pos_i, neg_i))
    cols = np.concatenate((pos_j, neg_j))
    # Start with +1 everywhere, then flip the trailing (c, d) entries to -1.
    signs = np.ones_like(rows, dtype=int)
    signs[len(pos_i):] = -1
    size = self.num_points
    graph = coo_matrix((signs, (rows, cols)), shape=(size, size))
    # symmetrize
    return graph + graph.T

def positive_negative_pairs(self, num_constraints, same_length=False,
random_state=np.random):
a, b = self._pairs(num_constraints, same_label=True,
Expand Down Expand Up @@ -155,3 +144,34 @@ def pairs_from_labels(y):
def triplets_from_labels(y):
    """Placeholder: not implemented yet.

    Every call raises ``NotImplementedError``; ``y`` is currently unused.
    """
    # TODO: to be implemented
    raise NotImplementedError


def unwrap_pairs(constrained_dataset, y):
    """Split labeled pair constraints into four index arrays.

    Parameters
    ----------
    constrained_dataset : object exposing ``X`` and ``c``
        ``c`` is indexed as a 2-D array with one row per constraint; its
        first two columns are read as the two members of each pair.
    y : array-like, shape (n_constraints,) or (n_constraints, 1)
        Label of each constraint. Rows labeled 0 go to ``(a, b)`` and rows
        labeled 1 go to ``(c, d)``. For inputs with more than one
        dimension only the first column is read.

    Returns
    -------
    X : ``constrained_dataset.X``, returned as is.
    [a, b, c, d] : list of four 1-D arrays
        ``a``/``b`` are columns 0/1 of the rows labeled 0, ``c``/``d`` are
        columns 0/1 of the rows labeled 1.
    """
    # Normalise the labels once so 1-D vectors and plain lists work too.
    labels = np.asarray(y)
    if labels.ndim > 1:
        labels = labels[:, 0]
    pairs = constrained_dataset.c
    # Select each group once instead of rebuilding the mask per column.
    zero_rows = pairs[labels == 0]
    one_rows = pairs[labels == 1]
    a, b = zero_rows[:, 0], zero_rows[:, 1]
    c, d = one_rows[:, 0], one_rows[:, 1]
    return constrained_dataset.X, [a, b, c, d]

def wrap_pairs(X, constraints):
a = np.array(constraints[0])
b = np.array(constraints[1])
c = np.array(constraints[2])
d = np.array(constraints[3])
constraints = np.vstack([np.hstack([a[:, None], b[:, None]]),
np.hstack([c[:, None], d[:, None]])])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a, b, c, d = constraints
constraints = np.vstack((np.column_stack((a, b)), np.column_stack((c, d))))
# or if we have numpy 1.13+
constraints = np.block([[a, b], [c, d]])

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks, will do

y = np.vstack([np.zeros((len(a), 1)), np.ones((len(c), 1))])
constrained_dataset = ConstrainedDataset(X, constraints)
return constrained_dataset, y

def unwrap_to_graph(constrained_dataset, y):
    """Convert labeled pair constraints into a signed adjacency graph.

    The pairs are split with ``unwrap_pairs``. Entries for the first group
    (a, b) are +1 and entries for the second group (c, d) are -1. The
    matrix is square with one row per row of ``constrained_dataset.X`` and
    is symmetrized by adding its transpose.

    Returns
    -------
    (X, graph) : ``constrained_dataset.X`` and the symmetrized sparse matrix.
    """
    _, (pos_i, pos_j, neg_i, neg_j) = unwrap_pairs(constrained_dataset, y)
    points = constrained_dataset.X
    rows = np.concatenate((pos_i, neg_i))
    cols = np.concatenate((pos_j, neg_j))
    # Start with +1 everywhere, then flip the trailing (c, d) entries to -1.
    signs = np.ones_like(rows, dtype=int)
    signs[len(pos_i):] = -1
    n_points = points.shape[0]
    graph = coo_matrix((signs, (rows, cols)), shape=(n_points, n_points))
    return points, graph + graph.T
17 changes: 9 additions & 8 deletions metric_learn/itml.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from sklearn.utils.validation import check_array, check_X_y

from .base_metric import PairsMetricLearner, SupervisedMetricLearner
from .constraints import Constraints
from .constraints import Constraints, unwrap_pairs, wrap_pairs
from ._util import vector_norm


Expand Down Expand Up @@ -73,19 +73,19 @@ def _process_inputs(self, X, constraints, bounds):
self.A_ = check_array(self.A0)
return a,b,c,d

def fit(self, X, constraints, bounds=None):
def fit(self, constrained_dataset, y, bounds=None):
"""Learn the ITML model.

Parameters
----------
X : (n x d) data matrix
each row corresponds to a single instance
constraints : 4-tuple of arrays
(a,b,c,d) indices into X, with (a,b) specifying positive and (c,d)
negative pairs
constrained_dataset : ConstrainedDataset
with constraints being an array of shape [n_constraints, 2]
y : array-like, shape (n x 1)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be n_constraints instead of n

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, indeed, will change this

labels of the constraints
bounds : list (pos,neg) pairs, optional
bounds on similarity, s.t. d(X[a],X[b]) < pos and d(X[c],X[d]) > neg
"""
X, constraints = unwrap_pairs(constrained_dataset, y)
a,b,c,d = self._process_inputs(X, constraints, bounds)
gamma = self.gamma
num_pos = len(a)
Expand Down Expand Up @@ -195,4 +195,5 @@ def fit(self, X, y, random_state=np.random):
random_state=random_state)
pos_neg = c.positive_negative_pairs(num_constraints,
random_state=random_state)
return ITML.fit(self, X, pos_neg, bounds=self.bounds)
constrained_dataset, y = wrap_pairs(X, pos_neg)
return ITML.fit(self, constrained_dataset, y, bounds=self.bounds)
26 changes: 17 additions & 9 deletions metric_learn/lsml.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@
from six.moves import xrange
from sklearn.utils.validation import check_array, check_X_y

from .base_metric import PairsMetricLearner, SupervisedMetricLearner
from .constraints import Constraints
from .base_metric import SupervisedMetricLearner, QuadrupletsMetricLearner
from .constraints import Constraints, ConstrainedDataset


class LSML(PairsMetricLearner):
class LSML(QuadrupletsMetricLearner):
def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False):
"""Initialize LSML.

Expand Down Expand Up @@ -57,18 +57,23 @@ def _prepare_inputs(self, X, constraints, weights):
def metric(self):
return self.M_

def fit(self, X, constraints, weights=None):
def fit(self, constrained_dataset, y=None, weights=None):
"""Learn the LSML model.

Parameters
----------
X : (n x d) data matrix
each row corresponds to a single instance
constraints : 4-tuple of arrays
(a,b,c,d) indices into X, such that d(X[a],X[b]) < d(X[c],X[d])
constrained_dataset : ConstrainedDataset
with constraints being an array of shape [n_constraints, 4]. It
should be the concatenation of 4 column vectors a, b, c and d,
such that: ``d(X[a[i]],X[b[i]]) < d(X[c[i]],X[d[i]])`` for every
constraint index ``i``.
y : object
Not used, for scikit-learn compatibility
weights : (m,) array of floats, optional
scale factor for each constraint
"""
X = constrained_dataset.X
constraints = [constrained_dataset.c[:, i].ravel() for i in range(4)]
self._prepare_inputs(X, constraints, weights)
step_sizes = np.logspace(-10, 0, 10)
# Keep track of the best step size and the loss at that step.
Expand Down Expand Up @@ -181,4 +186,7 @@ def fit(self, X, y, random_state=np.random):
random_state=random_state)
pairs = c.positive_negative_pairs(num_constraints, same_length=True,
random_state=random_state)
return LSML.fit(self, X, pairs, weights=self.weights)
constrained_dataset = ConstrainedDataset(X, np.hstack([pairs[i][:, None]
for i in
range(4)]))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

np.column_stack(pairs) seems to be what you want here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great, thanks !

return LSML.fit(self, constrained_dataset, weights=self.weights)
18 changes: 10 additions & 8 deletions metric_learn/mmc.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
from sklearn.utils.validation import check_array, check_X_y

from .base_metric import PairsMetricLearner, SupervisedMetricLearner
from .constraints import Constraints
from .constraints import Constraints, ConstrainedDataset, unwrap_pairs, \
wrap_pairs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Style nitpick: Use parens to break up long import lists:

from .constraints import (
    Constraints, ConstrainedDataset, unwrap_pairs, wrap_pairs)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks !

from ._util import vector_norm


Expand Down Expand Up @@ -58,17 +59,17 @@ def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3,
self.diagonal_c = diagonal_c
self.verbose = verbose

def fit(self, X, constraints):
def fit(self, constrained_dataset, y):
"""Learn the MMC model.

Parameters
----------
X : (n x d) data matrix
each row corresponds to a single instance
constraints : 4-tuple of arrays
(a,b,c,d) indices into X, with (a,b) specifying similar and (c,d)
dissimilar pairs
constrained_dataset : ConstrainedDataset
with constraints being an array of shape [n_constraints, 2]
y : array-like, shape (n x 1)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will do

labels of the constraints
"""
X, constraints = unwrap_pairs(constrained_dataset, y)
constraints = self._process_inputs(X, constraints)
if self.diagonal:
return self._fit_diag(X, constraints)
Expand Down Expand Up @@ -437,4 +438,5 @@ def fit(self, X, y, random_state=np.random):
random_state=random_state)
pos_neg = c.positive_negative_pairs(num_constraints,
random_state=random_state)
return MMC.fit(self, X, pos_neg)
constrained_dataset, y = wrap_pairs(X, pos_neg)
return MMC.fit(self, constrained_dataset, y)
19 changes: 11 additions & 8 deletions metric_learn/sdml.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from sklearn.utils.validation import check_array

from .base_metric import PairsMetricLearner, SupervisedMetricLearner
from .constraints import Constraints
from .constraints import Constraints, wrap_pairs, unwrap_to_graph


class SDML(PairsMetricLearner):
Expand Down Expand Up @@ -56,21 +56,22 @@ def _prepare_inputs(self, X, W):
def metric(self):
return self.M_

def fit(self, X, W):
def fit(self, constrained_dataset, y):
"""Learn the SDML model.

Parameters
----------
X : array-like, shape (n, d)
data matrix, where each row corresponds to a single instance
W : array-like, shape (n, n)
connectivity graph, with +1 for positive pairs and -1 for negative
constrained_dataset : ConstrainedDataset
with constraints being an array of shape [n_constraints, 2]
y : array-like, shape (n x 1)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will do

labels of the constraints

Returns
-------
self : object
Returns the instance.
"""
X, W = unwrap_to_graph(constrained_dataset, y)
loss_matrix = self._prepare_inputs(X, W)
P = self.M_ + self.balance_param * loss_matrix
emp_cov = pinvh(P)
Expand Down Expand Up @@ -131,5 +132,7 @@ def fit(self, X, y, random_state=np.random):

c = Constraints.random_subset(y, self.num_labeled,
random_state=random_state)
adj = c.adjacency_matrix(num_constraints, random_state=random_state)
return SDML.fit(self, X, adj)
pos_neg = c.positive_negative_pairs(num_constraints,
random_state=random_state)
constrained_dataset, y = wrap_pairs(X, pos_neg)
return SDML.fit(self, constrained_dataset, y)
5 changes: 3 additions & 2 deletions test/metric_learn_test.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import unittest
import numpy as np
from metric_learn.constraints import wrap_pairs
from six.moves import xrange
from sklearn.metrics import pairwise_distances
from sklearn.datasets import load_iris
Expand Down Expand Up @@ -160,7 +161,7 @@ def test_iris(self):

# Full metric
mmc = MMC(convergence_threshold=0.01)
mmc.fit(self.iris_points, [a,b,c,d])
mmc.fit(*wrap_pairs(self.iris_points, [a,b,c,d]))
expected = [[+0.00046504, +0.00083371, -0.00111959, -0.00165265],
[+0.00083371, +0.00149466, -0.00200719, -0.00296284],
[-0.00111959, -0.00200719, +0.00269546, +0.00397881],
Expand All @@ -169,7 +170,7 @@ def test_iris(self):

# Diagonal metric
mmc = MMC(diagonal=True)
mmc.fit(self.iris_points, [a,b,c,d])
mmc.fit(*wrap_pairs(self.iris_points, [a,b,c,d]))
expected = [0, 0, 1.21045968, 1.22552608]
assert_array_almost_equal(np.diag(expected), mmc.metric(), decimal=6)

Expand Down