diff --git a/.gitignore b/.gitignore index 8321c7d2..02ee652d 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,7 @@ htmlcov/ .cache/ .pytest_cache/ doc/auto_examples/* -doc/generated/* \ No newline at end of file +doc/generated/* + +# Pycharm artifacts +.idea/ diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 8bdc4bf0..95868f37 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -1,10 +1,17 @@ """ Large Margin Nearest Neighbor Metric learning (LMNN) """ +import sys +import time +import warnings import numpy as np -from collections import Counter +from scipy.optimize import minimize +from scipy.sparse import coo_matrix, csr_matrix, csc_matrix +from sklearn.neighbors import NearestNeighbors from sklearn.metrics import euclidean_distances +from sklearn.utils import gen_batches, check_random_state from sklearn.base import TransformerMixin +from sklearn.exceptions import ConvergenceWarning from ._util import _initialize_components, _check_n_components from .base_metric import MahalanobisMixin @@ -23,6 +30,18 @@ class LMNN(MahalanobisMixin, TransformerMixin): Parameters ---------- + n_neighbors : int, optional (default=3) + Number of neighbors to consider, not including self-edges. + + k : Not used + .. deprecated:: 0.6.3 + `k` was deprecated in version 0.6.3 and will + be removed in 0.6.4. It is replaced by `n_neighbors` to have more + consistent terminology with scikit-learn. + + n_components : int or None, optional (default=None) + Dimensionality of reduced space (if None, defaults to dimension of X). + init : string or numpy array, optional (default='auto') Initialization of the linear transformation. Possible options are 'auto', 'pca', 'identity', 'random', and a numpy array of shape @@ -63,35 +82,41 @@ class LMNN(MahalanobisMixin, TransformerMixin): :meth:`fit` and n_features_a must be less than or equal to that. If ``n_components`` is not None, n_features_a must match it. - k : int, optional (default=3) - Number of neighbors to consider, not including self-edges. + max_impostors : int, optional (default=500_000) + Maximum number of impostors to consider per iteration. In the worst + case this will allow ``max_impostors * n_neighbors`` constraints to be + active. + + neighbors_params : dict, optional (default=None) + Parameters to pass to a :class:`neighbors.NearestNeighbors` instance - + apart from ``n_neighbors`` - that will be used to select the target + neighbors. - min_iter : int, optional (default=50) - Minimum number of iterations of the optimization procedure. + push_loss_weight: float, optional (default=0.5) + Relative weight between pull loss and push loss terms, with 0.5 meaning + equal weight. Note that in practice the push loss weight will be + unweighted and the pull loss will be weighted accordingly by a factor + `(1 - push_loss_weight) / push_loss_weight`. + + regularization : Not used + .. deprecated:: 0.6.3 + `regularization` was deprecated in version 0.6.3 and will + be removed in 0.6.4. It is replaced by `push_loss_weight` that more + explicitly defines which loss term is meant to be weighted. max_iter : int, optional (default=1000) Maximum number of iterations of the optimization procedure. - learn_rate : float, optional (default=1e-7) - Learning rate of the optimization procedure - - tol : float, optional (default=0.001) + tol : float, optional (default=0.00001) Tolerance of the optimization procedure. If the objective value varies less than `tol`, we consider the algorithm has converged and stop it. 
- verbose : bool, optional (default=False) - Whether to print the progress of the optimization procedure. - - regularization: float, optional (default=0.5) - Relative weight between pull and push terms, with 0.5 meaning equal - weight. - preprocessor : array-like, shape=(n_samples, n_features) or callable The preprocessor to call to get tuples from indices. If array-like, tuples will be formed like this: X[indices]. - n_components : int or None, optional (default=None) - Dimensionality of reduced space (if None, defaults to dimension of X). + verbose : bool, optional (default=False) + Whether to print the progress of the optimization procedure. random_state : int or numpy.RandomState or None, optional (default=None) A pseudo random number generator object or a seed for it if int. If @@ -101,233 +126,591 @@ class LMNN(MahalanobisMixin, TransformerMixin): Attributes ---------- + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. + + n_neighbors_ : int + The provided ``n_neighbors`` is decreased if it is greater than or + equal to min(number of elements in each class). + n_iter_ : `int` The number of iterations the solver has run. - components_ : `numpy.ndarray`, shape=(n_components, n_features) - The learned linear transformation ``L``. Examples -------- >>> import numpy as np >>> from metric_learn import LMNN + >>> from sklearn.neighbors import KNeighborsClassifier >>> from sklearn.datasets import load_iris >>> iris_data = load_iris() >>> X = iris_data['data'] >>> Y = iris_data['target'] - >>> lmnn = LMNN(k=5, learn_rate=1e-6) - >>> lmnn.fit(X, Y, verbose=False) + >>> lmnn = LMNN(n_neighbors=3) + >>> lmnn.fit(X, Y) + >>> knn = KNeighborsClassifier(n_neighbors=3) + >>> knn.fit(lmnn.transform(X), Y) References ---------- - .. [1] K. Q. Weinberger, J. Blitzer, L. K. Saul. `Distance Metric - Learning for Large Margin Nearest Neighbor Classification - `_. NIPS - 2005. + .. [1] Weinberger, Kilian Q., and Lawrence K. Saul. + "Distance Metric Learning for Large Margin Nearest Neighbor + Classification." + Journal of Machine Learning Research, Vol. 10, Feb. 2009, + pp. 207-244. + http://jmlr.csail.mit.edu/papers/volume10/weinberger09a/weinberger09a.pdf + + .. 
[2] Wikipedia entry on Large Margin Nearest Neighbor + https://en.wikipedia.org/wiki/Large_margin_nearest_neighbor + """ + def __init__(self, n_neighbors=3, k='deprecated', n_components=None, + init='auto', max_impostors=500_000, neighbors_params=None, + push_loss_weight=0.5, regularization='deprecated', max_iter=50, + tol=1e-5, preprocessor=None, verbose=False, random_state=None): - def __init__(self, init='auto', k=3, min_iter=50, max_iter=1000, - learn_rate=1e-7, regularization=0.5, convergence_tol=0.001, - verbose=False, preprocessor=None, - n_components=None, random_state=None): - self.init = init + self.n_neighbors = n_neighbors self.k = k - self.min_iter = min_iter - self.max_iter = max_iter - self.learn_rate = learn_rate + self.n_components = n_components + self.init = init + self.max_impostors = max_impostors + self.neighbors_params = neighbors_params + self.push_loss_weight = push_loss_weight self.regularization = regularization - self.convergence_tol = convergence_tol + self.max_iter = max_iter + self.tol = tol self.verbose = verbose - self.n_components = n_components self.random_state = random_state super(LMNN, self).__init__(preprocessor) def fit(self, X, y): - k = self.k - reg = self.regularization - learn_rate = self.learn_rate - - X, y = self._prepare_inputs(X, y, dtype=float, - ensure_min_samples=2) - num_pts, d = X.shape - output_dim = _check_n_components(d, self.n_components) - unique_labels, label_inds = np.unique(y, return_inverse=True) - if len(label_inds) != num_pts: - raise ValueError('Must have one label per point.') - self.labels_ = np.arange(len(unique_labels)) - - self.components_ = _initialize_components(output_dim, X, y, self.init, - self.verbose, - random_state=self.random_state) - required_k = np.bincount(label_inds).min() - if self.k > required_k: - raise ValueError('not enough class labels for specified k' - ' (smallest class has %d)' % required_k) - - target_neighbors = self._select_targets(X, label_inds) - - # sum outer products - dfG = _sum_outer_products(X, target_neighbors.flatten(), - np.repeat(np.arange(X.shape[0]), k)) - - # initialize L - L = self.components_ - - # first iteration: we compute variables (including objective and gradient) - # at initialization point - G, objective, total_active = self._loss_grad(X, L, dfG, k, - reg, target_neighbors, - label_inds) - - it = 1 # we already made one iteration + """Train Large Margin Nearest Neighbor model. + + :param X: array, shape (n_samples, n_features), the training samples. + :param y: array, shape (n_samples,), the training labels. + + :return: The trained LMNN model. + """ + + # Check input arrays + X, y = self._prepare_inputs(X, y, dtype=float, ensure_min_samples=2) + + n_samples, n_features = X.shape + n_components = _check_n_components(n_features, self.n_components) + + # Initialize the random generator + self._random_state = check_random_state(self.random_state) + + # Initialize transformation + self.components_ = _initialize_components(n_components, X, y, self.init, + verbose=self.verbose, + random_state=self._random_state) + + # remove singletons after initializing components + X, y, classes = self._validate_params(X, y) + + # Find the target neighbors of each sample + target_neighbors = self._select_target_neighbors(X, y, classes) + + # Compute the pull loss gradient w.r.t. 
M, which remains constant + tn_graph = _make_knn_graph(target_neighbors) + pull_loss_grad_m = _sum_weighted_outer_products(X, tn_graph) + + # Compute the pull loss weight such that the push loss weight becomes 1 + pull_loss_weight = (1 - self.push_loss_weight) / self.push_loss_weight + pull_loss_grad_m *= pull_loss_weight + + int_verbose = int(self.verbose) + disp = int_verbose - 2 if int_verbose > 1 else -1 + optimizer_params = { + 'method': 'L-BFGS-B', + 'fun': self._loss_grad_lbfgs, + 'jac': True, + 'args': (X, y, classes, target_neighbors, pull_loss_grad_m), + 'x0': self.components_, + 'tol': self.tol, + 'options': dict(maxiter=self.max_iter, disp=disp), + } + + # Call the optimizer + self.n_iter_ = 0 + opt_result = minimize(**optimizer_params) + + # Reshape the solution found by the optimizer + self.components_ = opt_result.x.reshape(-1, n_features) if self.verbose: - print("iter | objective | objective difference | active constraints", - "| learning rate") - - # main loop - for it in range(2, self.max_iter): - # then at each iteration, we try to find a value of L that has better - # objective than the previous L, following the gradient: - while True: - # the next point next_L to try out is found by a gradient step - L_next = L - learn_rate * G - # we compute the objective at next point - # we copy variables that can be modified by _loss_grad, because if we - # retry we don t want to modify them several times - (G_next, objective_next, total_active_next) = ( - self._loss_grad(X, L_next, dfG, k, reg, target_neighbors, - label_inds)) - assert not np.isnan(objective) - delta_obj = objective_next - objective - if delta_obj > 0: - # if we did not find a better objective, we retry with an L closer to - # the starting point, by decreasing the learning rate (making the - # gradient step smaller) - learn_rate /= 2 - else: - # otherwise, if we indeed found a better obj, we get out of the loop - break - # when the better L is found (and the related variables), we set the - # old variables to these new ones before next iteration and we - # slightly increase the learning rate - L = L_next - G, objective, total_active = G_next, objective_next, total_active_next - learn_rate *= 1.01 - - if self.verbose: - print(it, objective, delta_obj, total_active, learn_rate) - - # check for convergence - if it > self.min_iter and abs(delta_obj) < self.convergence_tol: - if self.verbose: - print("LMNN converged with objective", objective) - break - else: - if self.verbose: - print("LMNN didn't converge in %d steps." 
% self.max_iter) - - # store the last L - self.components_ = L - self.n_iter_ = it + cls_name = self.__class__.__name__ + + # Warn the user if the algorithm did not converge + if not opt_result.success: + warnings.warn('[{}] LMNN did not converge: {}'.format( + cls_name, opt_result.message), ConvergenceWarning) + return self - def _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds): - # Compute pairwise distances under current metric - Lx = L.dot(X.T).T - - # we need to find the furthest neighbor: - Ni = 1 + _inplace_paired_L2(Lx[target_neighbors], Lx[:, None, :]) - furthest_neighbors = np.take_along_axis(target_neighbors, - Ni.argmax(axis=1)[:, None], 1) - impostors = self._find_impostors(furthest_neighbors.ravel(), X, - label_inds, L) - - g0 = _inplace_paired_L2(*Lx[impostors]) - - # we reorder the target neighbors - g1, g2 = Ni[impostors] - # compute the gradient - total_active = 0 - df = np.zeros((X.shape[1], X.shape[1])) - for nn_idx in reversed(range(k)): # note: reverse not useful here - act1 = g0 < g1[:, nn_idx] - act2 = g0 < g2[:, nn_idx] - total_active += act1.sum() + act2.sum() - - targets = target_neighbors[:, nn_idx] - PLUS, pweight = _count_edges(act1, act2, impostors, targets) - df += _sum_outer_products(X, PLUS[:, 0], PLUS[:, 1], pweight) - - in_imp, out_imp = impostors - df -= _sum_outer_products(X, in_imp[act1], out_imp[act1]) - df -= _sum_outer_products(X, in_imp[act2], out_imp[act2]) - - # do the gradient update - assert not np.isnan(df).any() - G = dfG * reg + df * (1 - reg) - G = L.dot(G) - # compute the objective function - objective = total_active * (1 - reg) - objective += G.flatten().dot(L.flatten()) - return 2 * G, objective, total_active - - def _select_targets(self, X, label_inds): - target_neighbors = np.empty((X.shape[0], self.k), dtype=int) - for label in self.labels_: - inds, = np.nonzero(label_inds == label) - dd = euclidean_distances(X[inds], squared=True) - np.fill_diagonal(dd, np.inf) - nn = np.argsort(dd)[..., :self.k] - target_neighbors[inds] = inds[nn] + def _validate_params(self, X, y): + """Validate parameters as soon as :meth:`fit` is called. + + :param X: array, shape (n_samples, n_features), the training samples. + :param y: array, shape (n_samples,), the training labels. + + :return: + X: array, shape (n_samples, n_features), the validated training samples. + y: array, shape (n_samples,), the validated training labels. + classes: array, shape (n_classes,), the non-singleton classes encoded. + """ + + # TODO: remove these in v0.6.4 + if self.k != 'deprecated': + warnings.warn('"k" parameter from initialization is not used.' + ' It has been deprecated in version 0.6.3 and will be' + ' removed in 0.6.4. Use the "n_neighbors" parameter ' + 'instead.', DeprecationWarning) + + if self.regularization != 'deprecated': + warnings.warn('"regularization" parameter from initialization is not ' + 'used. It has been deprecated in version 0.6.3 and will ' + 'be removed in 0.6.4. Use the "push_loss_weight" parameter' + 'instead.', DeprecationWarning) + + # Find the appearing classes and the class index of each of the samples + classes, y = np.unique(y, return_inverse=True) + classes_inverse = np.arange(len(classes)) + + # Ignore classes that have less than two samples (singleton classes) + class_sizes = np.bincount(y) + mask_singleton_class = class_sizes == 1 + singleton_classes = np.where(mask_singleton_class)[0] + if len(singleton_classes): + warnings.warn('There are {} singleton classes that will be ignored ' + 'during training. 
A copy of the inputs `X` and `y` will ' + 'be made.'.format(len(singleton_classes))) + + mask_singleton_sample = np.asarray([yi in singleton_classes for + yi in y]) + X = X[~mask_singleton_sample] + y = y[~mask_singleton_sample] + + # Check that there are at least 2 non-singleton classes + n_classes_non_singleton = len(classes) - len(singleton_classes) + if n_classes_non_singleton < 2: + raise ValueError('LMNN needs at least 2 non-singleton classes, got {}.' + .format(n_classes_non_singleton)) + + classes_inverse_non_singleton = classes_inverse[~mask_singleton_class] + + # Check the preferred number of neighbors + min_non_singleton_size = class_sizes[~mask_singleton_class].min() + if self.n_neighbors >= min_non_singleton_size: + warnings.warn('`n_neighbors` (={}) is not less than the number of ' + 'samples in the smallest non-singleton class (={}). ' + '`n_neighbors_` will be set to {} for estimation.' + .format(self.n_neighbors, min_non_singleton_size, + min_non_singleton_size - 1)) + + self.n_neighbors_ = min(self.n_neighbors, min_non_singleton_size - 1) + + return X, y, classes_inverse_non_singleton + + def _loss_grad_lbfgs(self, L, X, y, classes, target_neighbors, + pull_loss_grad_m): + """Compute loss and gradient after one optimization iteration. + + :param L: array, shape (n_components * n_features,) + The current (flattened) linear transformation. + :param X: array, shape (n_samples, n_features), the training samples. + :param y: array, shape (n_samples,), the training labels. + :param classes: array, shape (n_classes,), classes encoded as intergers. + :param target_neighbors: array, shape (n_samples, n_neighbors), + the target neighbors of each training sample. + :param pull_loss_grad_m: array, shape (n_features, n_features), + the (weighted) gradient (w.r.t M=L.T@L) due to target + neighbors that stays fixed throughout the algorithm. + + :return: + loss: float, the loss given the current solution. + grad: array, shape (n_components, n_features), the (flattened) gradient. 
+ """ + + # Print headers + if self.n_iter_ == 0 and self.verbose: + header_fields = ['Iteration', 'Objective Value', + '#Active Triplets', 'Time(s)'] + header_fmt = '{:>10} {:>20} {:>20} {:>10}' + header = header_fmt.format(*header_fields) + cls_name = self.__class__.__name__ + print('[{}]'.format(cls_name)) + print('[{}] {}'.format(cls_name, header)) + print('[{}] {}'.format(cls_name, '-' * len(header))) + + # Measure duration of optimization iteration + start_time = time.time() + + # Reshape current solution + n_samples, n_features = X.shape + L = L.reshape(-1, n_features) + + # Transform samples according to the current solution + X_embedded = X @ L.T + + # Compute squared distances to the target neighbors + n_neighbors = target_neighbors.shape[1] + dist_tn = np.zeros((n_samples, n_neighbors)) + for k in range(n_neighbors): + ind_tn = target_neighbors[:, k] + diffs = X_embedded - X_embedded[ind_tn] + dist_tn[:, k] = np.einsum('ij,ij->i', diffs, diffs) # row norms + + # Add margin to the distances + dist_tn += 1 + + # Find impostors + impostors_graph = self._find_impostors(X_embedded, y, classes, + dist_tn[:, -1]) + + # Compute the push loss and its gradient + push_loss, push_loss_grad_m, n_active = _push_loss_grad( + X, target_neighbors, dist_tn, impostors_graph) + + # Compute the total loss + M = L.T @ L + # It holds that trace(M.T @ C) = np.dot(M.ravel(), C.ravel()) + new_pull_loss = np.dot(pull_loss_grad_m.ravel(), M.ravel()) + loss = push_loss + new_pull_loss + + # Compute the total gradient (grad_L = 2L grad_M) + grad = np.dot(L, pull_loss_grad_m + push_loss_grad_m) + grad *= 2 + + # Report iteration metrics + self.n_iter_ += 1 + if self.verbose: + elapsed_time = time.time() - start_time + values_fmt = '[{}] {:>10} {:>20.6e} {:>20,} {:>10.2f}' + cls_name = self.__class__.__name__ + print(values_fmt.format(cls_name, self.n_iter_, loss, + n_active, elapsed_time)) + sys.stdout.flush() + + return loss, grad.ravel() + + def _select_target_neighbors(self, X, y, classes): + """Find the target neighbors of each training sample. + + :param X: array, shape (n_samples, n_features), the training samples. + :param y: array, shape (n_samples,), the training labels. + :param classes: array, shape (n_classes,), the classes encoded as integers. + + :return: array, shape (n_samples, n_neighbors), + the indices of the target neighbors of each training sample. 
+ """ + + start_time = time.time() + cls_name = self.__class__.__name__ + if self.verbose: + print('[{}] Finding the target neighbors...'.format(cls_name)) + sys.stdout.flush() + + nn_kwargs = self.neighbors_params or {} + nn = NearestNeighbors(n_neighbors=self.n_neighbors_, **nn_kwargs) + target_neighbors = np.empty((X.shape[0], self.n_neighbors_), dtype=int) + + for label in classes: + ind_class = np.where(y == label)[0] + nn.fit(X[ind_class]) + ind_neighbors = nn.kneighbors(return_distance=False) + target_neighbors[ind_class] = ind_class[ind_neighbors] + + if self.verbose: + elapsed_time = time.time() - start_time + print('[{}] Found the target neighbors in {:5.2f}s.'.format( + cls_name, elapsed_time)) + return target_neighbors - def _find_impostors(self, furthest_neighbors, X, label_inds, L): - Lx = X.dot(L.T) - margin_radii = 1 + _inplace_paired_L2(Lx[furthest_neighbors], Lx) - impostors = [] - for label in self.labels_[:-1]: - in_inds, = np.nonzero(label_inds == label) - out_inds, = np.nonzero(label_inds > label) - dist = euclidean_distances(Lx[out_inds], Lx[in_inds], squared=True) - i1, j1 = np.nonzero(dist < margin_radii[out_inds][:, None]) - i2, j2 = np.nonzero(dist < margin_radii[in_inds]) - i = np.hstack((i1, i2)) - j = np.hstack((j1, j2)) - if i.size > 0: - # get unique (i,j) pairs using index trickery - shape = (i.max() + 1, j.max() + 1) - tmp = np.ravel_multi_index((i, j), shape) - i, j = np.unravel_index(np.unique(tmp), shape) - impostors.append(np.vstack((in_inds[j], out_inds[i]))) - if len(impostors) == 0: - # No impostors detected - return impostors - return np.hstack(impostors) - - -def _inplace_paired_L2(A, B): - '''Equivalent to ((A-B)**2).sum(axis=-1), but modifies A in place.''' - A -= B - return np.einsum('...ij,...ij->...i', A, A) - - -def _count_edges(act1, act2, impostors, targets): - imp = impostors[0, act1] - c = Counter(zip(imp, targets[imp])) - imp = impostors[1, act2] - c.update(zip(imp, targets[imp])) - if c: - active_pairs = np.array(list(c.keys())) + def _find_impostors(self, X_embedded, y, classes, margin_radii): + """Find the samples that violate the margin. + + :param X_embedded: array, shape (n_samples, n_components), + the training samples in the embedding space. + :param y: array, shape (n_samples,), training labels. + :param classes: array, shape (n_classes,), the classes encoded as integers. + :param margin_radii: array, shape (n_samples,), squared distances of + training samples to their farthest target neighbors + plus margin. + + :return: coo_matrix, shape (n_samples, n_neighbors), + If sample i is an impostor to sample j or vice versa, then the + value of A[i, j] is the squared distance between samples i and j. + Otherwise A[i, j] is zero. 
+ + """ + + # Initialize lists for impostors storage + imp_row, imp_col = [], [] + + for label in classes[:-1]: + ind_a = np.where(y == label)[0] + ind_b = np.where(y > label)[0] + + ind_impostors = _find_impostors_blockwise(X_embedded, margin_radii, + ind_b, ind_a) + + n_impostors = len(ind_impostors) + if n_impostors: + if n_impostors > self.max_impostors: + ind_samples = self._random_state.choice( + n_impostors, self.max_impostors, replace=False) + ind_impostors = ind_impostors[ind_samples] + + dims = (len(ind_b), len(ind_a)) + ib, ia = np.unravel_index(ind_impostors, shape=dims) + + imp_row.extend(ind_b[ib]) + imp_col.extend(ind_a[ia]) + + # turn lists to numpy arrays + imp_row = np.asarray(imp_row, dtype=np.intp) + imp_col = np.asarray(imp_col, dtype=np.intp) + + # Make sure we do not exceed max_impostors + n_impostors = len(imp_row) + if n_impostors > self.max_impostors: + ind_samples = self._random_state.choice( + n_impostors, self.max_impostors, replace=False) + imp_row = imp_row[ind_samples] + imp_col = imp_col[ind_samples] + + # Compute distances + imp_dist = _paired_distances_blockwise(X_embedded, imp_row, imp_col) + + # store impostors as a sparse matrix + n = X_embedded.shape[0] + impostors_graph = coo_matrix((imp_dist, (imp_row, imp_col)), shape=(n, n)) + + return impostors_graph + + +######################## +# Some core functions # +####################### + +def _find_impostors_blockwise(X, radii, ind_a, ind_b, + return_distance=False, block_size=8): + """Find (sample, impostor) pairs in blocks to avoid large memory usage. + + Parameters + ---------- + X : array, shape (n_samples, n_components) + Transformed data samples. + + radii : array, shape (n_samples,) + Squared distances of the samples in ``X`` to their margins. + + ind_a : array, shape (n_samples_a,) + Indices of samples from class A. + + ind_b : array, shape (n_samples_b,) + Indices of samples from class B, where n_samples_b << n_samples_a. + + block_size : int, optional (default=8) + The maximum number of mebibytes (MiB) of memory to use at a time for + calculating paired squared distances. + return_distance : bool, optional (default=False) + Whether to return the squared distances to the impostors. + + Returns + ------- + imp_indices : array, shape (n_impostors,) + Unraveled indices of (sample, impostor) pairs referring to a matrix + of shape (n_samples_a, n_samples_b). + imp_distances : array, shape (n_impostors,), optional + imp_distances[i] is the squared distance between samples imp_row[i] and + imp_col[i], where + imp_row, imp_col = np.unravel_index(imp_indices, shape=(n_samples_a, + n_samples_b)) + """ + + n_samples_a = len(ind_a) + bytes_per_row = len(ind_b) * X.itemsize + block_n_rows = int(block_size * 1024 * 1024 // bytes_per_row) + + imp_indices, imp_distances = [], [] + + # data + X_b = X[ind_b] + radii_a = radii[ind_a] + radii_b = radii[ind_b] + + # X_b squared norm stays constant, so pre-compute it to get a speed-up + X_b_norm_squared = np.einsum('ij,ij->i', X_b, X_b).reshape(1, -1) + for chunk in gen_batches(n_samples_a, block_n_rows): + + dist = euclidean_distances(X[ind_a[chunk]], X_b, squared=True, + Y_norm_squared=X_b_norm_squared) + + ind_ba, = np.where((dist < radii_a[chunk, None]).ravel()) + ind_ab, = np.where((dist < radii_b[None, :]).ravel()) + ind = np.unique(np.concatenate((ind_ab, ind_ba))) + + if len(ind): + ind_plus_offset = ind + chunk.start * X_b.shape[0] + imp_indices.extend(ind_plus_offset) + + if return_distance: + # We only need to do clipping if we return the distances. 
+ distances_chunk = dist.ravel()[ind] + # Clip only the indexed (unique) distances + np.maximum(distances_chunk, 0, out=distances_chunk) + imp_distances.extend(distances_chunk) + + imp_indices = np.asarray(imp_indices) + + if return_distance: + return imp_indices, np.asarray(imp_distances) else: - active_pairs = np.empty((0, 2), dtype=int) - return active_pairs, np.array(list(c.values())) + return imp_indices + + +def _push_loss_grad(X, target_neighbors, inflated_dist_tn, impostors_graph): + """Compute the push loss max(d(a, p) + 1 - d(a, n), 0) and its gradient. + + :param X: array, shape (n_samples, n_features), the training samples. + :param target_neighbors: array, shape (n_samples, n_neighbors), + the target neighbors of each training sample. + :param inflated_dist_tn: array, shape (n_samples, n_neighbors), + squared distances of each sample to their target + neighbors plus margin. + :param impostors_graph: coo_matrix, shape (n_samples, n_samples), + + :return: + loss: float, the push loss due to the given target neighbors and impostors. + grad: array, shape (n_features, n_features), the gradient of the push loss + with respect to M = L.T @ L. + n_active_triplets: int, the number of active triplet constraints. + + """ + + n_samples, n_neighbors = inflated_dist_tn.shape + imp_row = impostors_graph.row + imp_col = impostors_graph.col + dist_impostors = impostors_graph.data + + loss = 0 + shape = (n_samples, n_samples) + A0 = csr_matrix(shape) + sample_range = range(n_samples) + n_active_triplets = 0 + for k in reversed(range(n_neighbors)): + # Consider margin violations to the samples in imp_row + losses1 = np.maximum(inflated_dist_tn[imp_row, k] - dist_impostors, 0) + ac = np.where(losses1 > 0)[0] + n_active_triplets += len(ac) + A1 = csr_matrix((2 * losses1[ac], (imp_row[ac], imp_col[ac])), shape) + + # Consider margin violations to the samples in imp_col + losses2 = np.maximum(inflated_dist_tn[imp_col, k] - dist_impostors, 0) + ac = np.where(losses2 > 0)[0] + n_active_triplets += len(ac) + A2 = csc_matrix((2 * losses2[ac], (imp_row[ac], imp_col[ac])), shape) + + # Update the loss + loss += np.dot(losses1, losses1) + np.dot(losses2, losses2) + + # Update the weight matrix for gradient computation + val = (A1.sum(1).ravel() + A2.sum(0)).getA1() + A3 = csr_matrix((val, (sample_range, target_neighbors[:, k])), shape) + A0 = A0 - A1 - A2 + A3 + + grad = _sum_weighted_outer_products(X, A0) + + return loss, grad, n_active_triplets + + +########################## +# Some helper functions # +######################### + +def _paired_distances_blockwise(X, ind_a, ind_b, squared=True, block_size=8): + """Equivalent to row_norms(X[ind_a] - X[ind_b], squared=squared). + + Parameters + ---------- + X : array, shape (n_samples, n_features) + An array of data samples. + ind_a : array, shape (n_indices,) + An array of sample indices. + ind_b : array, shape (n_indices,) + Another array of sample indices. + squared : bool (default=True) + Whether to return the squared distances. + block_size : int, optional (default=8) + The maximum number of mebibytes (MiB) of memory to use at a time for + calculating paired (squared) distances. + + Returns + ------- + distances: array, shape (n_indices,) + An array of paired, optionally squared, distances. 
+ """ + + bytes_per_row = X.shape[1] * X.itemsize + batch_size = int(block_size * 1024 * 1024 // bytes_per_row) + + n_pairs = len(ind_a) + distances = np.zeros(n_pairs) + for chunk in gen_batches(n_pairs, batch_size): + diffs = X[ind_a[chunk]] - X[ind_b[chunk]] + distances[chunk] = np.einsum('ij,ij->i', diffs, diffs) + + return distances if squared else np.sqrt(distances, out=distances) + + +def _sum_weighted_outer_products(X, weights): + """Compute the sum of weighted outer products. + + Consider the squared Mahalanobis distance between x_i and x_j: + + d_{ij} = (x_i - x_j)^T M (x_i - x_j) + = x_i^T M x_i - x_i^T M x_j - x_j^T M x_i + x_j^T M x_j + + The gradient of d_{ij} with respect to M is a sum of outer products: + + \nabla_M d_{ij} = x_i x_i^T - x_j x_i^T - x_i x_j^T + x_j x_j^T + + + :param X: array, shape (n_samples, n_features), data samples. + :param weights: csr_matrix, shape (n_samples, n_samples), sparse weights. + + :return: array, shape (n_features, n_features), + the sum of all weighted outer products. + """ + + W = weights + weights.T + D = W.sum(1).getA() + + # X.T * (D - W) * X + LX = D * X - W @ X # D is n x 1, W is n x n + ret = X.T @ LX + + return ret + + +def _make_knn_graph(indices): + """Convert a dense indices matrix to a sparse adjacency matrix. + + :param indices: array, shape (n_samples, n_neighbors), + indices of the k nearest neighbors of each sample. + :return: csr_matrix, shape (n_samples, n_samples), the adjacency matrix. + """ + n_samples, n_neighbors = indices.shape + row = np.repeat(range(n_samples), n_neighbors) + col = indices.ravel() + ind = np.ones(indices.size) + shape = (n_samples, n_samples) + knn_graph = csr_matrix((ind, (row, col)), shape=shape) -def _sum_outer_products(data, a_inds, b_inds, weights=None): - Xab = data[a_inds] - data[b_inds] - if weights is not None: - return np.dot(Xab.T, Xab * weights[:, None]) - return np.dot(Xab.T, Xab) + return knn_graph diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 5cae80f2..2fce2fa4 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -2,9 +2,8 @@ import re import pytest import numpy as np -import scipy from scipy.optimize import check_grad, approx_fprime -from sklearn.metrics import pairwise_distances, euclidean_distances +from sklearn.metrics import pairwise_distances from sklearn.datasets import (load_iris, make_classification, make_regression, make_spd_matrix) from numpy.testing import (assert_array_almost_equal, assert_array_equal, @@ -26,7 +25,11 @@ MMC_Supervised, SDML, RCA, ITML, SCML) # Import this specially for testing. 
from metric_learn.constraints import wrap_pairs, Constraints -from metric_learn.lmnn import _sum_outer_products +from metric_learn.lmnn import ( + _sum_weighted_outer_products, + _make_knn_graph, + _push_loss_grad +) def class_separation(X, labels): @@ -362,12 +365,11 @@ def test_bounds_parameters_invalid(bounds): class TestLMNN(MetricTestCase): def test_iris(self): - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, verbose=False) lmnn.fit(self.iris_points, self.iris_labels) - csep = class_separation(lmnn.transform(self.iris_points), - self.iris_labels) - self.assertLess(csep, 0.25) + csep = class_separation(lmnn.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.26) def test_loss_grad_lbfgs(self): """Test gradient of loss function @@ -378,38 +380,46 @@ def test_loss_grad_lbfgs(self): X, y = make_classification(random_state=rng) L = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1]) lmnn = LMNN() + lmnn.n_iter_ = 0 + lmnn.n_neighbors_ = lmnn.n_neighbors - k = lmnn.k - reg = lmnn.regularization - - X, y = lmnn._prepare_inputs(X, y, dtype=float, - ensure_min_samples=2) + X, y = lmnn._prepare_inputs(X, y, dtype=float, ensure_min_samples=2) num_pts, n_components = X.shape - unique_labels, label_inds = np.unique(y, return_inverse=True) - lmnn.labels_ = np.arange(len(unique_labels)) + unique_labels, y_inverse = np.unique(y, return_inverse=True) + classes = np.arange(len(unique_labels)) lmnn.components_ = np.eye(n_components) - target_neighbors = lmnn._select_targets(X, label_inds) + target_neighbors = lmnn._select_target_neighbors(X, y_inverse, classes) # sum outer products - dfG = _sum_outer_products(X, target_neighbors.flatten(), - np.repeat(np.arange(X.shape[0]), k)) + tn_graph = _make_knn_graph(target_neighbors) + pull_loss_grad_m = _sum_weighted_outer_products(X, tn_graph) - # initialize L - def loss_grad(flat_L): - return lmnn._loss_grad(X, flat_L.reshape(-1, X.shape[1]), dfG, - k, reg, target_neighbors, label_inds) + kwargs = { + 'classes': classes, + 'target_neighbors': target_neighbors, + 'pull_loss_grad_m': pull_loss_grad_m, + } - def fun(x): - return loss_grad(x)[1] + def fun(L): + return lmnn._loss_grad_lbfgs(L, X, y, **kwargs)[0] - def grad(x): - return loss_grad(x)[0].ravel() + def grad(L): + return lmnn._loss_grad_lbfgs(L, X, y, **kwargs)[1] - # compute relative error + # compute gradient with and without finite differences epsilon = np.sqrt(np.finfo(float).eps) - rel_diff = (check_grad(fun, grad, L.ravel()) / - np.linalg.norm(approx_fprime(L.ravel(), fun, epsilon))) + grad_fin_diff = approx_fprime(L.ravel(), fun, epsilon) + grad_lmnn = grad(L) + + # compute absolute error + grad_error = np.sqrt((grad_lmnn - grad_fin_diff)**2) + + # compute relative error + # rel_diff1 = grad_error / np.linalg.norm(grad_fin_diff) + # rel_diff2 = grad_error / np.linalg.norm(grad_lmnn) + + rel_diff = grad_error / np.linalg.norm(grad_lmnn) np.testing.assert_almost_equal(rel_diff, 0., decimal=5) @@ -418,8 +428,11 @@ def test_loss_func(capsys): by comparing the results with the actual implementation of metric-learn, with a very simple (but nonperformant) implementation""" + import scipy + from sklearn.metrics import euclidean_distances + # toy dataset to use - X, y = make_classification(n_samples=10, n_classes=2, + X, y = make_classification(n_samples=40, n_classes=2, n_features=6, n_redundant=0, shuffle=True, scale=[1, 1, 20, 20, 20, 20], random_state=42) @@ -471,118 +484,75 @@ def _select_targets(X, y, k): x0 = np.random.randn(1, n_features) def 
loss(x0): - return loss_fn(x0.reshape(-1, X.shape[1]), X, y, target_neighbors, + return loss_fn(x0.reshape(-1, n_features), X, y, target_neighbors, regularization)[1] def grad(x0): - return loss_fn(x0.reshape(-1, X.shape[1]), X, y, target_neighbors, + return loss_fn(x0.reshape(-1, n_features), X, y, target_neighbors, regularization)[0].ravel() scipy.optimize.check_grad(loss, grad, x0.ravel()) - class LMNN_with_callback(LMNN): - """ We will use a callback to get the gradient (see later) + +def test_compute_push_loss(): + """Test if the push loss is computed correctly + + This test continues on the example from test_find_impostors. The push + loss is easy to compute, as we have only 4 violations and all of them + amount to 1 (squared distance to target neighbor + 1 - squared distance + to impostor = 4 + 1 - 4). """ - def __init__(self, callback, *args, **kwargs): - self.callback = callback - super(LMNN_with_callback, self).__init__(*args, **kwargs) - - def _loss_grad(self, *args, **kwargs): - grad, objective, total_active = ( - super(LMNN_with_callback, self)._loss_grad(*args, **kwargs)) - self.callback.append(grad) - return grad, objective, total_active - - class LMNN_nonperformant(LMNN_with_callback): - - def fit(self, X, y): - self.y = y - return super(LMNN_nonperformant, self).fit(X, y) - - def _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds): - grad, loss, total_active = loss_fn(L.ravel(), X, self.y, - target_neighbors, self.regularization) - self.callback.append(grad) - return grad, loss, total_active - - mem1, mem2 = [], [] - lmnn_perf = LMNN_with_callback(verbose=True, random_state=42, - init='identity', max_iter=30, callback=mem1) - lmnn_nonperf = LMNN_nonperformant(verbose=True, random_state=42, - init='identity', max_iter=30, - callback=mem2) - objectives, obj_diffs, learn_rate, total_active = (dict(), dict(), dict(), - dict()) - for algo, name in zip([lmnn_perf, lmnn_nonperf], ['perf', 'nonperf']): - algo.fit(X, y) - out, _ = capsys.readouterr() - lines = re.split("\n+", out) - # we get every variable that is printed from the algorithm in verbose - num = r'(-?\d+.?\d*(e[+|-]\d+)?)' - strings = [re.search(r"\d+ (?:{}) (?:{}) (?:(\d+)) (?:{})" - .format(num, num, num), s) for s in lines] - objectives[name] = [float(match.group(1)) for match in strings if match is - not None] - obj_diffs[name] = [float(match.group(3)) for match in strings if match is - not None] - total_active[name] = [float(match.group(5)) for match in strings if - match is not - None] - learn_rate[name] = [float(match.group(6)) for match in strings if match is - not None] - assert len(strings) >= 10 # we ensure that we actually did more than 10 - # iterations - assert total_active[name][0] >= 2 # we ensure that we have some active - # constraints (that's the case we want to test) - # we remove the last element because it can be equal to the penultimate - # if the last gradient update is null - for i in range(len(mem1)): - np.testing.assert_allclose(lmnn_perf.callback[i], - lmnn_nonperf.callback[i], - err_msg='Gradient different at position ' - '{}'.format(i)) - np.testing.assert_allclose(objectives['perf'], objectives['nonperf']) - np.testing.assert_allclose(obj_diffs['perf'], obj_diffs['nonperf']) - np.testing.assert_allclose(total_active['perf'], total_active['nonperf']) - np.testing.assert_allclose(learn_rate['perf'], learn_rate['nonperf']) + class_distance = 4. 
+ X_a = np.array([[-1., 1], [-1., -1.], [1., 1.], [1., -1.]]) + X_b = X_a + np.array([class_distance, 0]) + X = np.concatenate((X_a, X_b)) + y = np.array([0, 0, 0, 0, 1, 1, 1, 1]) + lmnn = LMNN(n_neighbors=1) + lmnn.n_neighbors_ = 1 + classes = np.unique(y) + target_neighbors = lmnn._select_target_neighbors(X, y, classes) + diffs = X - X[target_neighbors[:, 0]] + dist_tn = np.einsum('ij,ij->i', diffs, diffs) + dist_tn = dist_tn[:, None] + dist_tn += 1 + margin_radii = dist_tn[:, -1] + impostors_graph = lmnn._find_impostors(X, y, classes, margin_radii) + loss, grad, _ = _push_loss_grad(X, target_neighbors, dist_tn, + impostors_graph) + + # The loss should be 4. (1. for each of the 4 violation) + assert loss == 4. @pytest.mark.parametrize('X, y, loss', [(np.array([[0], [1], [2], [3]]), - [1, 1, 0, 0], 3.0), + [1, 1, 0, 0], 6.0), (np.array([[0], [1], [2], [3]]), - [1, 0, 0, 1], 26.)]) + [1, 0, 0, 1], 256.)]) def test_toy_ex_lmnn(X, y, loss): """Test that the loss give the right result on a toy example""" L = np.array([[1]]) - lmnn = LMNN(k=1, regularization=0.5) + lmnn = LMNN(n_neighbors=1, push_loss_weight=0.5) - k = lmnn.k - reg = lmnn.regularization + lmnn.n_neighbors_ = lmnn.n_neighbors - X, y = lmnn._prepare_inputs(X, y, dtype=float, - ensure_min_samples=2) + X, y = lmnn._prepare_inputs(X, y, dtype=float, ensure_min_samples=2) num_pts, n_components = X.shape unique_labels, label_inds = np.unique(y, return_inverse=True) - lmnn.labels_ = np.arange(len(unique_labels)) + classes = np.arange(len(unique_labels)) lmnn.components_ = np.eye(n_components) - target_neighbors = lmnn._select_targets(X, label_inds) + target_neighbors = lmnn._select_target_neighbors(X, label_inds, classes) # sum outer products - dfG = _sum_outer_products(X, target_neighbors.flatten(), - np.repeat(np.arange(X.shape[0]), k)) - - # storage - a1 = [None] * k - a2 = [None] * k - for nn_idx in range(k): - a1[nn_idx] = np.array([]) - a2[nn_idx] = np.array([]) + tn_graph = _make_knn_graph(target_neighbors) + const_grad = _sum_weighted_outer_products(X, tn_graph) # assert that the loss equals the one computed by hand - assert lmnn._loss_grad(X, L.reshape(-1, X.shape[1]), dfG, k, - reg, target_neighbors, label_inds)[1] == loss + lmnn.n_iter_ = 0 + predicted_loss = lmnn._loss_grad_lbfgs(L, X, y, classes, target_neighbors, + const_grad)[0] + assert predicted_loss == loss def test_convergence_simple_example(capsys): @@ -592,28 +562,7 @@ def test_convergence_simple_example(capsys): lmnn = LMNN(verbose=True) lmnn.fit(X, y) out, _ = capsys.readouterr() - assert "LMNN converged with objective" in out - - -def test_no_twice_same_objective(capsys): - # test that the objective function never has twice the same value - # see https://github.com/scikit-learn-contrib/metric-learn/issues/88 - X, y = make_classification(random_state=0) - lmnn = LMNN(verbose=True) - lmnn.fit(X, y) - out, _ = capsys.readouterr() - lines = re.split("\n+", out) - # we get only objectives from each line: - # the regexp matches a float that follows an integer (the iteration - # number), and which is followed by a (signed) float (delta obj). 
It - # matches for instance: - # 3 **1113.7665747189938** -3.182774197440267 46431.0200999999999998e-06 - objectives = [re.search(r"\d* (?:(\d*.\d*))[ | -]\d*.\d*", s) - for s in lines] - objectives = [match.group(1) for match in objectives if match is not None] - # we remove the last element because it can be equal to the penultimate - # if the last gradient update is null - assert len(objectives[:-1]) == len(set(objectives[:-1])) + assert "LMNN did not converge" not in out class TestSDML(MetricTestCase): diff --git a/test/test_base_metric.py b/test/test_base_metric.py index 67f9b6a0..2d8fd69e 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -36,15 +36,17 @@ def test_covariance(self): remove_spaces(f"Covariance({merged_kwargs})")) def test_lmnn(self): - def_kwargs = {'convergence_tol': 0.001, 'init': 'auto', 'k': 3, - 'learn_rate': 1e-07, 'max_iter': 1000, 'min_iter': 50, - 'n_components': None, 'preprocessor': None, - 'random_state': None, 'regularization': 0.5, - 'verbose': False} - nndef_kwargs = {'convergence_tol': 0.01, 'k': 6} + def_kwargs = {'n_neighbors': 3, 'k': 'deprecated', 'n_components': None, + 'init': 'auto', 'max_impostors': 500_000, + 'neighbors_params': None, 'push_loss_weight': 0.5, + 'regularization': 'deprecated', 'max_iter': 50, 'tol': 1e-5, + 'preprocessor': None, 'verbose': False, + 'random_state': None} + def_kwargs = {k: def_kwargs[k] for k in sorted(def_kwargs.keys())} + nndef_kwargs = {'n_neighbors': 6, 'tol': 0.01} merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) self.assertEqual( - remove_spaces(str(metric_learn.LMNN(convergence_tol=0.01, k=6))), + remove_spaces(str(metric_learn.LMNN(n_neighbors=6, tol=0.01))), remove_spaces(f"LMNN({merged_kwargs})")) def test_nca(self): diff --git a/test/test_components_metric_conversion.py b/test/test_components_metric_conversion.py index 5502ad90..f07e31a7 100644 --- a/test/test_components_metric_conversion.py +++ b/test/test_components_metric_conversion.py @@ -42,7 +42,7 @@ def test_itml_supervised(self): assert_array_almost_equal(L.T.dot(L), itml.get_mahalanobis_matrix()) def test_lmnn(self): - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, verbose=False) lmnn.fit(self.X, self.y) L = lmnn.components_ assert_array_almost_equal(L.T.dot(L), lmnn.get_mahalanobis_matrix()) diff --git a/test/test_fit_transform.py b/test/test_fit_transform.py index d4d4bfe0..609db3c9 100644 --- a/test/test_fit_transform.py +++ b/test/test_fit_transform.py @@ -52,11 +52,11 @@ def test_itml_supervised(self): assert_array_almost_equal(res_1, res_2) def test_lmnn(self): - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, verbose=False) lmnn.fit(self.X, self.y) res_1 = lmnn.transform(self.X) - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, verbose=False) res_2 = lmnn.fit_transform(self.X, self.y) assert_array_almost_equal(res_1, res_2) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index 84058b32..5a533eab 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -417,7 +417,7 @@ def test_auto_init_transformation(n_samples, n_features, n_classes, random_state=rng) # To make the test work for LMNN: if 'LMNN' in model_base.__class__.__name__: - model_base.set_params(k=1) + model_base.set_params(n_neighbors=1) # To make the test faster for estimators that have a max_iter: if hasattr(model_base, 'max_iter'): model_base.set_params(max_iter=1)