From 226c80ce0bf1414cc5c08a8627157dc5a93bf374 Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Sat, 20 Mar 2021 16:59:10 +0100 Subject: [PATCH 01/10] add new LMNN --- .gitignore | 5 +- metric_learn/lmnn.py | 626 ++++++++++++++-------- test/metric_learn_test.py | 268 +++------ test/test_base_metric.py | 4 +- test/test_components_metric_conversion.py | 2 +- test/test_fit_transform.py | 4 +- test/test_mahalanobis_mixin.py | 2 +- 7 files changed, 494 insertions(+), 417 deletions(-) diff --git a/.gitignore b/.gitignore index 8321c7d2..02ee652d 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,7 @@ htmlcov/ .cache/ .pytest_cache/ doc/auto_examples/* -doc/generated/* \ No newline at end of file +doc/generated/* + +# Pycharm artifacts +.idea/ diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 8bdc4bf0..b2a3817f 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -1,10 +1,16 @@ """ Large Margin Nearest Neighbor Metric learning (LMNN) """ +import sys +import time +import warnings import numpy as np -from collections import Counter +from scipy.optimize import minimize +from scipy.sparse import coo_matrix, csr_matrix, csc_matrix +from sklearn.neighbors import NearestNeighbors from sklearn.metrics import euclidean_distances from sklearn.base import TransformerMixin +from sklearn.exceptions import ConvergenceWarning from ._util import _initialize_components, _check_n_components from .base_metric import MahalanobisMixin @@ -23,6 +29,12 @@ class LMNN(MahalanobisMixin, TransformerMixin): Parameters ---------- + n_neighbors : int, optional (default=3) + Number of neighbors to consider, not including self-edges. + + n_components : int or None, optional (default=None) + Dimensionality of reduced space (if None, defaults to dimension of X). + init : string or numpy array, optional (default='auto') Initialization of the linear transformation. Possible options are 'auto', 'pca', 'identity', 'random', and a numpy array of shape @@ -63,35 +75,28 @@ class LMNN(MahalanobisMixin, TransformerMixin): :meth:`fit` and n_features_a must be less than or equal to that. If ``n_components`` is not None, n_features_a must match it. - k : int, optional (default=3) - Number of neighbors to consider, not including self-edges. + neighbors_params : dict, optional (default=None) + Parameters to pass to a :class:`neighbors.NearestNeighbors` instance - + apart from ``n_neighbors`` - that will be used to select the target + neighbors. - min_iter : int, optional (default=50) - Minimum number of iterations of the optimization procedure. + push_loss_weight: float, optional (default=0.5) + Relative weight between pull and push terms, with 0.5 meaning equal + weight. max_iter : int, optional (default=1000) Maximum number of iterations of the optimization procedure. - learn_rate : float, optional (default=1e-7) - Learning rate of the optimization procedure - - tol : float, optional (default=0.001) + tol : float, optional (default=0.00001) Tolerance of the optimization procedure. If the objective value varies less than `tol`, we consider the algorithm has converged and stop it. - verbose : bool, optional (default=False) - Whether to print the progress of the optimization procedure. - - regularization: float, optional (default=0.5) - Relative weight between pull and push terms, with 0.5 meaning equal - weight. - preprocessor : array-like, shape=(n_samples, n_features) or callable The preprocessor to call to get tuples from indices. If array-like, tuples will be formed like this: X[indices]. 
- n_components : int or None, optional (default=None) - Dimensionality of reduced space (if None, defaults to dimension of X). + verbose : bool, optional (default=False) + Whether to print the progress of the optimization procedure. random_state : int or numpy.RandomState or None, optional (default=None) A pseudo random number generator object or a seed for it if int. If @@ -101,233 +106,432 @@ class LMNN(MahalanobisMixin, TransformerMixin): Attributes ---------- + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. + + n_neighbors_ : int + The provided ``n_neighbors`` is decreased if it is greater than or + equal to min(number of elements in each class). + n_iter_ : `int` The number of iterations the solver has run. - components_ : `numpy.ndarray`, shape=(n_components, n_features) - The learned linear transformation ``L``. Examples -------- >>> import numpy as np >>> from metric_learn import LMNN + >>> from sklearn.neighbors import KNeighborsClassifier >>> from sklearn.datasets import load_iris >>> iris_data = load_iris() >>> X = iris_data['data'] >>> Y = iris_data['target'] - >>> lmnn = LMNN(k=5, learn_rate=1e-6) - >>> lmnn.fit(X, Y, verbose=False) + >>> lmnn = LMNN(n_neighbors=3) + >>> lmnn.fit(X, Y) + >>> knn = KNeighborsClassifier(n_neighbors=3) + >>> knn.fit(lmnn.transform(X), Y) References ---------- - .. [1] K. Q. Weinberger, J. Blitzer, L. K. Saul. `Distance Metric - Learning for Large Margin Nearest Neighbor Classification - `_. NIPS - 2005. + .. [1] Weinberger, Kilian Q., and Lawrence K. Saul. + "Distance Metric Learning for Large Margin Nearest Neighbor + Classification." + Journal of Machine Learning Research, Vol. 10, Feb. 2009, + pp. 207-244. + http://jmlr.csail.mit.edu/papers/volume10/weinberger09a/weinberger09a.pdf + + .. [2] Wikipedia entry on Large Margin Nearest Neighbor + https://en.wikipedia.org/wiki/Large_margin_nearest_neighbor + """ + def __init__(self, n_neighbors=3, n_components=None, init='auto', + neighbors_params=None, push_loss_weight=0.5, max_iter=50, + tol=1e-5, preprocessor=None, verbose=False, random_state=None): - def __init__(self, init='auto', k=3, min_iter=50, max_iter=1000, - learn_rate=1e-7, regularization=0.5, convergence_tol=0.001, - verbose=False, preprocessor=None, - n_components=None, random_state=None): + self.n_neighbors = n_neighbors + self.n_components = n_components self.init = init - self.k = k - self.min_iter = min_iter + self.neighbors_params = neighbors_params + self.push_loss_weight = push_loss_weight self.max_iter = max_iter - self.learn_rate = learn_rate - self.regularization = regularization - self.convergence_tol = convergence_tol + self.tol = tol self.verbose = verbose - self.n_components = n_components self.random_state = random_state super(LMNN, self).__init__(preprocessor) def fit(self, X, y): - k = self.k - reg = self.regularization - learn_rate = self.learn_rate - - X, y = self._prepare_inputs(X, y, dtype=float, - ensure_min_samples=2) - num_pts, d = X.shape - output_dim = _check_n_components(d, self.n_components) - unique_labels, label_inds = np.unique(y, return_inverse=True) - if len(label_inds) != num_pts: - raise ValueError('Must have one label per point.') - self.labels_ = np.arange(len(unique_labels)) - - self.components_ = _initialize_components(output_dim, X, y, self.init, - self.verbose, + """ Train Large Margin Nearest Neighbor model. + + :param X: array, shape (n_samples, n_features), the training samples. 
+ :param y: array, shape (n_samples,), the training labels. + + :return: The trained LMNN model. + """ + + # Validate inputs + X, y = self._prepare_inputs(X, y, dtype=float, ensure_min_samples=2) + X, y, classes = self._validate_params(X, y) + + n_samples, n_features = X.shape + n_components = _check_n_components(n_features, self.n_components) + + # Initialize transformation + self.components_ = _initialize_components(n_components, X, y, self.init, + verbose=self.verbose, random_state=self.random_state) - required_k = np.bincount(label_inds).min() - if self.k > required_k: - raise ValueError('not enough class labels for specified k' - ' (smallest class has %d)' % required_k) - target_neighbors = self._select_targets(X, label_inds) + # Find the target neighbors of each sample + target_neighbors = self._select_target_neighbors(X, y, classes) + + # Compute the gradient w.r.t. the target neighbors which remains constant + tn_graph = _make_knn_graph(target_neighbors) + const_grad = _sum_weighted_outer_differences(X, tn_graph) - # sum outer products - dfG = _sum_outer_products(X, target_neighbors.flatten(), - np.repeat(np.arange(X.shape[0]), k)) + # Compute the pull loss weight such that the push loss weight becomes 1 + pull_loss_weight = (1 - self.push_loss_weight) / self.push_loss_weight + const_grad *= pull_loss_weight - # initialize L - L = self.components_ + int_verbose = int(self.verbose) + disp = int_verbose - 2 if int_verbose > 1 else -1 + optimizer_params = { + 'method': 'L-BFGS-B', + 'fun': self._loss_grad_lbfgs, + 'jac': True, + 'args': (X, y, classes, target_neighbors, const_grad), + 'x0': self.components_, + 'tol': self.tol, + 'options': dict(maxiter=self.max_iter, disp=disp), + } - # first iteration: we compute variables (including objective and gradient) - # at initialization point - G, objective, total_active = self._loss_grad(X, L, dfG, k, - reg, target_neighbors, - label_inds) + # Call the optimizer + self.n_iter_ = 0 + opt_result = minimize(**optimizer_params) - it = 1 # we already made one iteration + # Reshape the solution found by the optimizer + self.components_ = opt_result.x.reshape(-1, n_features) if self.verbose: - print("iter | objective | objective difference | active constraints", - "| learning rate") - - # main loop - for it in range(2, self.max_iter): - # then at each iteration, we try to find a value of L that has better - # objective than the previous L, following the gradient: - while True: - # the next point next_L to try out is found by a gradient step - L_next = L - learn_rate * G - # we compute the objective at next point - # we copy variables that can be modified by _loss_grad, because if we - # retry we don t want to modify them several times - (G_next, objective_next, total_active_next) = ( - self._loss_grad(X, L_next, dfG, k, reg, target_neighbors, - label_inds)) - assert not np.isnan(objective) - delta_obj = objective_next - objective - if delta_obj > 0: - # if we did not find a better objective, we retry with an L closer to - # the starting point, by decreasing the learning rate (making the - # gradient step smaller) - learn_rate /= 2 - else: - # otherwise, if we indeed found a better obj, we get out of the loop - break - # when the better L is found (and the related variables), we set the - # old variables to these new ones before next iteration and we - # slightly increase the learning rate - L = L_next - G, objective, total_active = G_next, objective_next, total_active_next - learn_rate *= 1.01 - - if self.verbose: - print(it, objective, delta_obj, 
total_active, learn_rate) - - # check for convergence - if it > self.min_iter and abs(delta_obj) < self.convergence_tol: - if self.verbose: - print("LMNN converged with objective", objective) - break - else: - if self.verbose: - print("LMNN didn't converge in %d steps." % self.max_iter) - - # store the last L - self.components_ = L - self.n_iter_ = it + cls_name = self.__class__.__name__ + + # Warn the user if the algorithm did not converge + if not opt_result.success: + warnings.warn('[{}] LMNN did not converge: {}'.format( + cls_name, opt_result.message), ConvergenceWarning) + return self - def _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds): - # Compute pairwise distances under current metric - Lx = L.dot(X.T).T - - # we need to find the furthest neighbor: - Ni = 1 + _inplace_paired_L2(Lx[target_neighbors], Lx[:, None, :]) - furthest_neighbors = np.take_along_axis(target_neighbors, - Ni.argmax(axis=1)[:, None], 1) - impostors = self._find_impostors(furthest_neighbors.ravel(), X, - label_inds, L) - - g0 = _inplace_paired_L2(*Lx[impostors]) - - # we reorder the target neighbors - g1, g2 = Ni[impostors] - # compute the gradient - total_active = 0 - df = np.zeros((X.shape[1], X.shape[1])) - for nn_idx in reversed(range(k)): # note: reverse not useful here - act1 = g0 < g1[:, nn_idx] - act2 = g0 < g2[:, nn_idx] - total_active += act1.sum() + act2.sum() - - targets = target_neighbors[:, nn_idx] - PLUS, pweight = _count_edges(act1, act2, impostors, targets) - df += _sum_outer_products(X, PLUS[:, 0], PLUS[:, 1], pweight) - - in_imp, out_imp = impostors - df -= _sum_outer_products(X, in_imp[act1], out_imp[act1]) - df -= _sum_outer_products(X, in_imp[act2], out_imp[act2]) - - # do the gradient update - assert not np.isnan(df).any() - G = dfG * reg + df * (1 - reg) - G = L.dot(G) - # compute the objective function - objective = total_active * (1 - reg) - objective += G.flatten().dot(L.flatten()) - return 2 * G, objective, total_active - - def _select_targets(self, X, label_inds): - target_neighbors = np.empty((X.shape[0], self.k), dtype=int) - for label in self.labels_: - inds, = np.nonzero(label_inds == label) - dd = euclidean_distances(X[inds], squared=True) - np.fill_diagonal(dd, np.inf) - nn = np.argsort(dd)[..., :self.k] - target_neighbors[inds] = inds[nn] + def _validate_params(self, X, y): + """ Validate parameters as soon as :meth:`fit` is called. + + :param X: array, shape (n_samples, n_features), the training samples. + :param y: array, shape (n_samples,), the training labels. + + :return: + X: array, shape (n_samples, n_features), the validated training samples. + y: array, shape (n_samples,), the validated training labels. + classes: array, shape (n_classes,), the non-singleton classes encoded. + """ + + # Find the appearing classes and the class index of each of the samples + classes, y = np.unique(y, return_inverse=True) + classes_inverse = np.arange(len(classes)) + + # Ignore classes that have less than two samples (singleton classes) + class_sizes = np.bincount(y) + mask_singleton_class = class_sizes == 1 + singleton_classes = np.where(mask_singleton_class)[0] + if len(singleton_classes): + warnings.warn('There are {} singleton classes that will be ignored ' + 'during training. 
A copy of the inputs `X` and `y` will ' + 'be made.'.format(len(singleton_classes))) + + mask_singleton_sample = np.asarray([yi in singleton_classes for + yi in y]) + X = X[~mask_singleton_sample] + y = y[~mask_singleton_sample] + + # Check that there are at least 2 non-singleton classes + n_classes_non_singleton = len(classes) - len(singleton_classes) + if n_classes_non_singleton < 2: + raise ValueError('LMNN needs at least 2 non-singleton classes, got {}.' + .format(n_classes_non_singleton)) + + classes_inverse_non_singleton = classes_inverse[~mask_singleton_class] + + # Check the preferred number of neighbors + min_non_singleton_size = class_sizes[~mask_singleton_class].min() + if self.n_neighbors >= min_non_singleton_size: + warnings.warn('`n_neighbors` (={}) is not less than the number of ' + 'samples in the smallest non-singleton class (={}). ' + '`n_neighbors_` will be set to {} for estimation.' + .format(self.n_neighbors, min_non_singleton_size, + min_non_singleton_size - 1)) + + self.n_neighbors_ = min(self.n_neighbors, min_non_singleton_size - 1) + + return X, y, classes_inverse_non_singleton + + def _loss_grad_lbfgs(self, L, X, y, classes, target_neighbors, const_grad): + """ Compute loss and gradient after one optimization iteration. + + :param L: array, shape (n_components * n_features,) + The current (flattened) linear transformation. + :param X: array, shape (n_samples, n_features), the training samples. + :param y: array, shape (n_samples,), the training labels. + :param classes: array, shape (n_classes,), classes encoded as intergers. + :param target_neighbors: array, shape (n_samples, n_neighbors), + the target neighbors of each training sample. + :param const_grad: array, shape (n_features, n_features), + the (weighted) gradient component due to target + neighbors that stays fixed throughout the algorithm. + + :return: + loss: float, the loss given the current solution. + grad: array, shape (n_components, n_features), the (flattened) gradient. 
+ """ + + # Print headers + if self.n_iter_ == 0 and self.verbose: + header_fields = ['Iteration', 'Objective Value', + '#Active Triplets', 'Time(s)'] + header_fmt = '{:>10} {:>20} {:>20} {:>10}' + header = header_fmt.format(*header_fields) + cls_name = self.__class__.__name__ + print('[{}]'.format(cls_name)) + print('[{}] {}'.format(cls_name, header)) + print('[{}] {}'.format(cls_name, '-' * len(header))) + + # Measure duration of optimization iteration + start_time = time.time() + + # Reshape current solution + n_samples, n_features = X.shape + L = L.reshape(-1, n_features) + + # Transform samples according to the current solution + X_embedded = X @ L.T + + # Compute squared distances to the target neighbors + n_neighbors = target_neighbors.shape[1] + dist_tn = np.zeros((n_samples, n_neighbors)) + for k in range(n_neighbors): + ind_tn = target_neighbors[:, k] + diffs = X_embedded - X_embedded[ind_tn] + dist_tn[:, k] = np.einsum('ij,ij->i', diffs, diffs) # row norms + + # Add margin to the distances + dist_tn += 1 + + # Find impostors + impostors_graph = _find_impostors(X_embedded, y, classes, dist_tn[:, -1]) + + # Compute the push loss and its gradient + push_loss, push_loss_grad, n_active = _compute_push_loss( + X, target_neighbors, dist_tn, impostors_graph) + + # Compute the total loss + M = L.T @ L + new_pull_loss = np.dot(const_grad.ravel(), M.ravel()) + loss = push_loss + new_pull_loss + + # Compute the total gradient + grad = np.dot(L, const_grad + push_loss_grad) + grad *= 2 + + # Report iteration metrics + self.n_iter_ += 1 + if self.verbose: + elapsed_time = time.time() - start_time + values_fmt = '[{}] {:>10} {:>20.6e} {:>20,} {:>10.2f}' + cls_name = self.__class__.__name__ + print(values_fmt.format(cls_name, self.n_iter_, loss, n_active, + elapsed_time)) + sys.stdout.flush() + + return loss, grad.ravel() + + def _select_target_neighbors(self, X, y, classes): + """ Find the target neighbors of each training sample. + + :param X: array, shape (n_samples, n_features), the training samples. + :param y: array, shape (n_samples,), the training labels. + :param classes: array, shape (n_classes,), the classes encoded as integers. + + :return: array, shape (n_samples, n_neighbors), + the indices of the target neighbors of each training sample. 
+ """ + + start_time = time.time() + cls_name = self.__class__.__name__ + if self.verbose: + print('[{}] Finding the target neighbors...'.format(cls_name)) + sys.stdout.flush() + + nn_kwargs = self.neighbors_params or {} + nn = NearestNeighbors(n_neighbors=self.n_neighbors_, **nn_kwargs) + target_neighbors = np.empty((X.shape[0], self.n_neighbors_), dtype=int) + + for label in classes: + ind_class = np.where(y == label)[0] + nn.fit(X[ind_class]) + ind_neighbors = nn.kneighbors(return_distance=False) + target_neighbors[ind_class] = ind_class[ind_neighbors] + + if self.verbose: + elapsed_time = time.time() - start_time + print('[{}] Found the target neighbors in {:5.2f}s.'.format( + cls_name, elapsed_time)) + return target_neighbors - def _find_impostors(self, furthest_neighbors, X, label_inds, L): - Lx = X.dot(L.T) - margin_radii = 1 + _inplace_paired_L2(Lx[furthest_neighbors], Lx) - impostors = [] - for label in self.labels_[:-1]: - in_inds, = np.nonzero(label_inds == label) - out_inds, = np.nonzero(label_inds > label) - dist = euclidean_distances(Lx[out_inds], Lx[in_inds], squared=True) - i1, j1 = np.nonzero(dist < margin_radii[out_inds][:, None]) - i2, j2 = np.nonzero(dist < margin_radii[in_inds]) - i = np.hstack((i1, i2)) - j = np.hstack((j1, j2)) - if i.size > 0: - # get unique (i,j) pairs using index trickery - shape = (i.max() + 1, j.max() + 1) - tmp = np.ravel_multi_index((i, j), shape) - i, j = np.unravel_index(np.unique(tmp), shape) - impostors.append(np.vstack((in_inds[j], out_inds[i]))) - if len(impostors) == 0: - # No impostors detected - return impostors - return np.hstack(impostors) - - -def _inplace_paired_L2(A, B): - '''Equivalent to ((A-B)**2).sum(axis=-1), but modifies A in place.''' - A -= B - return np.einsum('...ij,...ij->...i', A, A) - - -def _count_edges(act1, act2, impostors, targets): - imp = impostors[0, act1] - c = Counter(zip(imp, targets[imp])) - imp = impostors[1, act2] - c.update(zip(imp, targets[imp])) - if c: - active_pairs = np.array(list(c.keys())) - else: - active_pairs = np.empty((0, 2), dtype=int) - return active_pairs, np.array(list(c.values())) - - -def _sum_outer_products(data, a_inds, b_inds, weights=None): - Xab = data[a_inds] - data[b_inds] - if weights is not None: - return np.dot(Xab.T, Xab * weights[:, None]) - return np.dot(Xab.T, Xab) + +def _find_impostors(X_embedded, y, classes, margin_radii): + """ Find the samples that violate the margin. + + :param X_embedded: array, shape (n_samples, n_components), + the training samples in the embedding space. + :param y: array, shape (n_samples,), training labels. + :param classes: array, shape (n_classes,), the classes encoded as integers. + :param margin_radii: array, shape (n_samples,), squared distances of + training samples to their farthest target neighbors + plus margin. + + :return: coo_matrix, shape (n_samples, n_neighbors), + If sample i is an impostor to sample j or vice versa, then the + value of A[i, j] is the squared distance between samples i and j. + Otherwise A[i, j] is zero. 
+ + """ + + # Initialize lists for impostors storage + imp_row, imp_col, imp_dist = [], [], [] + + for label in classes[:-1]: + ind_a = np.where(y == label)[0] + ind_b = np.where(y > label)[0] + + dist = euclidean_distances(X_embedded[ind_b], X_embedded[ind_a], + squared=True) # (b, a) + + radii_a = margin_radii[ind_a] # (a,) + radii_b = margin_radii[ind_b] # (b,) + + # Find samples in B that are impostors to A + imp_ba = np.where((dist < radii_a[None, :]).ravel())[0] + + # Find samples in A that are impostors to B + imp_ab = np.where((dist < radii_b[:, None]).ravel())[0] + + # merge and filter unique impostors + ind_impostors = np.unique(np.concatenate((imp_ba, imp_ab))) + + if len(ind_impostors): + # Map indices back to the original training data indices + ii, jj = np.unravel_index(ind_impostors, dist.shape) + imp_row.extend(ind_b[ii]) + imp_col.extend(ind_a[jj]) + imp_dist.extend(dist.ravel()[ind_impostors]) + + # turn lists to numpy arrays + imp_row = np.asarray(imp_row, dtype=np.intp) + imp_col = np.asarray(imp_col, dtype=np.intp) + imp_dist = np.asarray(imp_dist) + + # store impostors as a sparse matrix + n = X_embedded.shape[0] + impostors_graph = coo_matrix((imp_dist, (imp_row, imp_col)), shape=(n, n)) + + return impostors_graph + + +def _compute_push_loss(X, target_neighbors, inflated_dist_tn, impostors_graph): + """ Compute the push loss L_push = max(d(a, p) + 1 - d(a, n), 0) + + :param X: array, shape (n_samples, n_features), the training samples. + :param target_neighbors: array, shape (n_samples, n_neighbors), + the target neighbors of each training sample. + :param inflated_dist_tn: array, shape (n_samples, n_neighbors), + squared distances of each sample to their target + neighbors plus margin. + :param impostors_graph: coo_matrix, shape (n_samples, n_samples), + + :return: + loss: float, the push loss due to the given target neighbors and impostors. + grad: array, shape (n_features, n_features), the gradient of the push loss. + n_active_triplets: int, the number of active triplet constraints. + + """ + + n_samples, n_neighbors = inflated_dist_tn.shape + imp_row = impostors_graph.row + imp_col = impostors_graph.col + dist_impostors = impostors_graph.data + + loss = 0 + shape = (n_samples, n_samples) + A0 = csr_matrix(shape) + sample_range = range(n_samples) + n_active_triplets = 0 + for k in reversed(range(n_neighbors)): + # Consider margin violations to the samples in imp_row + losses1 = np.maximum(inflated_dist_tn[imp_row, k] - dist_impostors, 0) + ac = np.where(losses1 > 0)[0] + n_active_triplets += len(ac) + A1 = csr_matrix((2 * losses1[ac], (imp_row[ac], imp_col[ac])), shape) + + # Consider margin violations to the samples in imp_col + losses2 = np.maximum(inflated_dist_tn[imp_col, k] - dist_impostors, 0) + ac = np.where(losses2 > 0)[0] + n_active_triplets += len(ac) + A2 = csc_matrix((2 * losses2[ac], (imp_row[ac], imp_col[ac])), shape) + + # Update the loss + loss += np.dot(losses1, losses1) + np.dot(losses2, losses2) + + # Update the weight matrix for gradient computation + val = (A1.sum(1).ravel() + A2.sum(0)).getA1() + A3 = csr_matrix((val, (sample_range, target_neighbors[:, k])), shape) + A0 = A0 - A1 - A2 + A3 + + grad = _sum_weighted_outer_differences(X, A0) + + return loss, grad, n_active_triplets + + +def _sum_weighted_outer_differences(X, weights): + """ Compute the sum of weighted outer pairwise differences. + + :param X: array, shape (n_samples, n_features), data samples. + :param weights: csr_matrix, shape (n_samples, n_samples), sparse weights. 
+ + :return: array, shape (n_features, n_features), + the sum of all outer weighted differences. + """ + + W = weights + weights.T + D = W.sum(1).getA() + + # X.T * (D - W) * X + LX = D * X - W @ X # D is n x 1, W is n x n + ret = X.T @ LX + + return ret + + +def _make_knn_graph(indices): + """ Convert a dense indices matrix to a sparse adjacency matrix. + + :param indices: array, shape (n_samples, n_neighbors), + indices of the k nearest neighbors of each sample. + :return: csr_matrix, shape (n_samples, n_samples), the adjacency matrix. + """ + + n_samples, n_neighbors = indices.shape + row = np.repeat(range(n_samples), n_neighbors) + col = indices.ravel() + ind = np.ones(indices.size) + shape = (n_samples, n_samples) + knn_graph = csr_matrix((ind, (row, col)), shape=shape) + + return knn_graph diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 4db0a1fc..42c6fe5d 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -26,7 +26,12 @@ MMC_Supervised, SDML, RCA, ITML, SCML) # Import this specially for testing. from metric_learn.constraints import wrap_pairs, Constraints -from metric_learn.lmnn import _sum_outer_products +from metric_learn.lmnn import ( + _sum_weighted_outer_differences, + _make_knn_graph, + _find_impostors, + _compute_push_loss +) def class_separation(X, labels): @@ -362,12 +367,11 @@ def test_bounds_parameters_invalid(bounds): class TestLMNN(MetricTestCase): def test_iris(self): - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, verbose=False) lmnn.fit(self.iris_points, self.iris_labels) - csep = class_separation(lmnn.transform(self.iris_points), - self.iris_labels) - self.assertLess(csep, 0.25) + csep = class_separation(lmnn.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.26) def test_loss_grad_lbfgs(self): """Test gradient of loss function @@ -378,211 +382,98 @@ def test_loss_grad_lbfgs(self): X, y = make_classification(random_state=rng) L = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1]) lmnn = LMNN() + lmnn.n_iter_ = 0 + lmnn.n_neighbors_ = lmnn.n_neighbors - k = lmnn.k - reg = lmnn.regularization - - X, y = lmnn._prepare_inputs(X, y, dtype=float, - ensure_min_samples=2) + X, y = lmnn._prepare_inputs(X, y, dtype=float, ensure_min_samples=2) num_pts, n_components = X.shape - unique_labels, label_inds = np.unique(y, return_inverse=True) - lmnn.labels_ = np.arange(len(unique_labels)) + unique_labels, y_inverse = np.unique(y, return_inverse=True) + classes = np.arange(len(unique_labels)) lmnn.components_ = np.eye(n_components) - target_neighbors = lmnn._select_targets(X, label_inds) + target_neighbors = lmnn._select_target_neighbors(X, y_inverse, classes) # sum outer products - dfG = _sum_outer_products(X, target_neighbors.flatten(), - np.repeat(np.arange(X.shape[0]), k)) + tn_graph = _make_knn_graph(target_neighbors) + const_grad = _sum_weighted_outer_differences(X, tn_graph) + - # initialize L - def loss_grad(flat_L): - return lmnn._loss_grad(X, flat_L.reshape(-1, X.shape[1]), dfG, - k, reg, target_neighbors, label_inds) + kwargs = { + 'classes': classes, + 'target_neighbors': target_neighbors, + 'const_grad': const_grad, + } - def fun(x): - return loss_grad(x)[1] + def fun(L): + return lmnn._loss_grad_lbfgs(L, X, y, **kwargs)[0] - def grad(x): - return loss_grad(x)[0].ravel() + def grad(L): + return lmnn._loss_grad_lbfgs(L, X, y, **kwargs)[1] # compute relative error - epsilon = np.sqrt(np.finfo(float).eps) - rel_diff = (check_grad(fun, grad, L.ravel()) / - 
np.linalg.norm(approx_fprime(L.ravel(), fun, epsilon))) + rel_diff = check_grad(fun, grad, L.ravel()) / np.linalg.norm(grad(L)) np.testing.assert_almost_equal(rel_diff, 0., decimal=5) -def test_loss_func(capsys): - """Test the loss function (and its gradient) on a simple example, - by comparing the results with the actual implementation of metric-learn, - with a very simple (but nonperformant) implementation""" - - # toy dataset to use - X, y = make_classification(n_samples=10, n_classes=2, - n_features=6, - n_redundant=0, shuffle=True, - scale=[1, 1, 20, 20, 20, 20], random_state=42) - - def hinge(a): - if a > 0: - return a, 1 - else: - return 0, 0 - - def loss_fn(L, X, y, target_neighbors, reg): - L = L.reshape(-1, X.shape[1]) - Lx = np.dot(X, L.T) - loss = 0 - total_active = 0 - grad = np.zeros_like(L) - for i in range(X.shape[0]): - for j in target_neighbors[i]: - loss += (1 - reg) * np.sum((Lx[i] - Lx[j]) ** 2) - grad += (1 - reg) * np.outer(Lx[i] - Lx[j], X[i] - X[j]) - for k in range(X.shape[0]): - if y[i] != y[k]: - hin, active = hinge(1 + np.sum((Lx[i] - Lx[j])**2) - - np.sum((Lx[i] - Lx[k])**2)) - total_active += active - if active: - loss += reg * hin - grad += (reg * (np.outer(Lx[i] - Lx[j], X[i] - X[j]) - - np.outer(Lx[i] - Lx[k], X[i] - X[k]))) - grad = 2 * grad - return grad, loss, total_active - - # we check that the gradient we have computed in the non-performant implem - # is indeed the true gradient on a toy example: - - def _select_targets(X, y, k): - target_neighbors = np.empty((X.shape[0], k), dtype=int) - for label in np.unique(y): - inds, = np.nonzero(y == label) - dd = euclidean_distances(X[inds], squared=True) - np.fill_diagonal(dd, np.inf) - nn = np.argsort(dd)[..., :k] - target_neighbors[inds] = inds[nn] - return target_neighbors - - target_neighbors = _select_targets(X, y, 2) - regularization = 0.5 - n_features = X.shape[1] - x0 = np.random.randn(1, n_features) - - def loss(x0): - return loss_fn(x0.reshape(-1, X.shape[1]), X, y, target_neighbors, - regularization)[1] - - def grad(x0): - return loss_fn(x0.reshape(-1, X.shape[1]), X, y, target_neighbors, - regularization)[0].ravel() - - scipy.optimize.check_grad(loss, grad, x0.ravel()) - - class LMNN_with_callback(LMNN): - """ We will use a callback to get the gradient (see later) +def test_compute_push_loss(): + """Test if the push loss is computed correctly + + This test continues on the example from test_find_impostors. The push + loss is easy to compute, as we have only 4 violations and all of them + amount to 1 (squared distance to target neighbor + 1 - squared distance + to impostor = 4 + 1 - 4). 
""" - def __init__(self, callback, *args, **kwargs): - self.callback = callback - super(LMNN_with_callback, self).__init__(*args, **kwargs) - - def _loss_grad(self, *args, **kwargs): - grad, objective, total_active = ( - super(LMNN_with_callback, self)._loss_grad(*args, **kwargs)) - self.callback.append(grad) - return grad, objective, total_active - - class LMNN_nonperformant(LMNN_with_callback): - - def fit(self, X, y): - self.y = y - return super(LMNN_nonperformant, self).fit(X, y) - - def _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds): - grad, loss, total_active = loss_fn(L.ravel(), X, self.y, - target_neighbors, self.regularization) - self.callback.append(grad) - return grad, loss, total_active - - mem1, mem2 = [], [] - lmnn_perf = LMNN_with_callback(verbose=True, random_state=42, - init='identity', max_iter=30, callback=mem1) - lmnn_nonperf = LMNN_nonperformant(verbose=True, random_state=42, - init='identity', max_iter=30, - callback=mem2) - objectives, obj_diffs, learn_rate, total_active = (dict(), dict(), dict(), - dict()) - for algo, name in zip([lmnn_perf, lmnn_nonperf], ['perf', 'nonperf']): - algo.fit(X, y) - out, _ = capsys.readouterr() - lines = re.split("\n+", out) - # we get every variable that is printed from the algorithm in verbose - num = r'(-?\d+.?\d*(e[+|-]\d+)?)' - strings = [re.search(r"\d+ (?:{}) (?:{}) (?:(\d+)) (?:{})" - .format(num, num, num), s) for s in lines] - objectives[name] = [float(match.group(1)) for match in strings if match is - not None] - obj_diffs[name] = [float(match.group(3)) for match in strings if match is - not None] - total_active[name] = [float(match.group(5)) for match in strings if - match is not - None] - learn_rate[name] = [float(match.group(6)) for match in strings if match is - not None] - assert len(strings) >= 10 # we ensure that we actually did more than 10 - # iterations - assert total_active[name][0] >= 2 # we ensure that we have some active - # constraints (that's the case we want to test) - # we remove the last element because it can be equal to the penultimate - # if the last gradient update is null - for i in range(len(mem1)): - np.testing.assert_allclose(lmnn_perf.callback[i], - lmnn_nonperf.callback[i], - err_msg='Gradient different at position ' - '{}'.format(i)) - np.testing.assert_allclose(objectives['perf'], objectives['nonperf']) - np.testing.assert_allclose(obj_diffs['perf'], obj_diffs['nonperf']) - np.testing.assert_allclose(total_active['perf'], total_active['nonperf']) - np.testing.assert_allclose(learn_rate['perf'], learn_rate['nonperf']) + class_distance = 4. + X_a = np.array([[-1., 1], [-1., -1.], [1., 1.], [1., -1.]]) + X_b = X_a + np.array([class_distance, 0]) + X = np.concatenate((X_a, X_b)) + y = np.array([0, 0, 0, 0, 1, 1, 1, 1]) + lmnn = LMNN(n_neighbors=1) + lmnn.n_neighbors_ = 1 + classes = np.unique(y) + target_neighbors = lmnn._select_target_neighbors(X, y, classes) + diffs = X - X[target_neighbors[:, 0]] + dist_tn = np.einsum('ij,ij->i', diffs, diffs) + dist_tn = dist_tn[:, None] + dist_tn += 1 + margin_radii = dist_tn[:, -1] + impostors_graph = _find_impostors(X, y, classes, margin_radii) + loss, grad, _ = _compute_push_loss(X, target_neighbors, dist_tn, + impostors_graph) + + # The loss should be 4. (1. for each of the 4 violation) + assert loss == 4. 
@pytest.mark.parametrize('X, y, loss', [(np.array([[0], [1], [2], [3]]), - [1, 1, 0, 0], 3.0), + [1, 1, 0, 0], 6.0), (np.array([[0], [1], [2], [3]]), - [1, 0, 0, 1], 26.)]) + [1, 0, 0, 1], 256.)]) def test_toy_ex_lmnn(X, y, loss): """Test that the loss give the right result on a toy example""" L = np.array([[1]]) - lmnn = LMNN(k=1, regularization=0.5) + lmnn = LMNN(n_neighbors=1, push_loss_weight=0.5) - k = lmnn.k - reg = lmnn.regularization + lmnn.n_neighbors_ = lmnn.n_neighbors - X, y = lmnn._prepare_inputs(X, y, dtype=float, - ensure_min_samples=2) + X, y = lmnn._prepare_inputs(X, y, dtype=float, ensure_min_samples=2) num_pts, n_components = X.shape unique_labels, label_inds = np.unique(y, return_inverse=True) - lmnn.labels_ = np.arange(len(unique_labels)) + classes = np.arange(len(unique_labels)) lmnn.components_ = np.eye(n_components) - target_neighbors = lmnn._select_targets(X, label_inds) + target_neighbors = lmnn._select_target_neighbors(X, label_inds, classes) # sum outer products - dfG = _sum_outer_products(X, target_neighbors.flatten(), - np.repeat(np.arange(X.shape[0]), k)) - - # storage - a1 = [None] * k - a2 = [None] * k - for nn_idx in range(k): - a1[nn_idx] = np.array([]) - a2[nn_idx] = np.array([]) + tn_graph = _make_knn_graph(target_neighbors) + const_grad = _sum_weighted_outer_differences(X, tn_graph) # assert that the loss equals the one computed by hand - assert lmnn._loss_grad(X, L.reshape(-1, X.shape[1]), dfG, k, - reg, target_neighbors, label_inds)[1] == loss + lmnn.n_iter_ = 0 + predicted_loss = lmnn._loss_grad_lbfgs(L, X, y, classes, target_neighbors, + const_grad)[0] + assert predicted_loss == loss def test_convergence_simple_example(capsys): @@ -592,28 +483,7 @@ def test_convergence_simple_example(capsys): lmnn = LMNN(verbose=True) lmnn.fit(X, y) out, _ = capsys.readouterr() - assert "LMNN converged with objective" in out - - -def test_no_twice_same_objective(capsys): - # test that the objective function never has twice the same value - # see https://github.com/scikit-learn-contrib/metric-learn/issues/88 - X, y = make_classification(random_state=0) - lmnn = LMNN(verbose=True) - lmnn.fit(X, y) - out, _ = capsys.readouterr() - lines = re.split("\n+", out) - # we get only objectives from each line: - # the regexp matches a float that follows an integer (the iteration - # number), and which is followed by a (signed) float (delta obj). 
It - # matches for instance: - # 3 **1113.7665747189938** -3.182774197440267 46431.0200999999999998e-06 - objectives = [re.search(r"\d* (?:(\d*.\d*))[ | -]\d*.\d*", s) - for s in lines] - objectives = [match.group(1) for match in objectives if match is not None] - # we remove the last element because it can be equal to the penultimate - # if the last gradient update is null - assert len(objectives[:-1]) == len(set(objectives[:-1])) + assert "LMNN did not converge" not in out class TestSDML(MetricTestCase): diff --git a/test/test_base_metric.py b/test/test_base_metric.py index fed9018a..711e3b6d 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -20,8 +20,8 @@ def test_covariance(self): def test_lmnn(self): self.assertEqual( - remove_spaces(str(metric_learn.LMNN(convergence_tol=0.01, k=6))), - remove_spaces("LMNN(convergence_tol=0.01, k=6)")) + remove_spaces(str(metric_learn.LMNN(tol=0.01, n_neighbors=6))), + remove_spaces("LMNN(n_neighbors=6,tol=0.01)")) def test_nca(self): self.assertEqual(remove_spaces(str(metric_learn.NCA(max_iter=42))), diff --git a/test/test_components_metric_conversion.py b/test/test_components_metric_conversion.py index 04d0d007..40d2f942 100644 --- a/test/test_components_metric_conversion.py +++ b/test/test_components_metric_conversion.py @@ -42,7 +42,7 @@ def test_itml_supervised(self): assert_array_almost_equal(L.T.dot(L), itml.get_mahalanobis_matrix()) def test_lmnn(self): - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, verbose=False) lmnn.fit(self.X, self.y) L = lmnn.components_ assert_array_almost_equal(L.T.dot(L), lmnn.get_mahalanobis_matrix()) diff --git a/test/test_fit_transform.py b/test/test_fit_transform.py index d4d4bfe0..609db3c9 100644 --- a/test/test_fit_transform.py +++ b/test/test_fit_transform.py @@ -52,11 +52,11 @@ def test_itml_supervised(self): assert_array_almost_equal(res_1, res_2) def test_lmnn(self): - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, verbose=False) lmnn.fit(self.X, self.y) res_1 = lmnn.transform(self.X) - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, verbose=False) res_2 = lmnn.fit_transform(self.X, self.y) assert_array_almost_equal(res_1, res_2) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index ab7e972d..bbd5c718 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -417,7 +417,7 @@ def test_auto_init_transformation(n_samples, n_features, n_classes, random_state=rng) # To make the test work for LMNN: if 'LMNN' in model_base.__class__.__name__: - model_base.set_params(k=1) + model_base.set_params(n_neighbors=1) # To make the test faster for estimators that have a max_iter: if hasattr(model_base, 'max_iter'): model_base.set_params(max_iter=1) From e4348621f06530b400fd6f0e75bbdc0470415cb6 Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Fri, 26 Mar 2021 09:59:01 +0100 Subject: [PATCH 02/10] validate params after components initialization --- metric_learn/lmnn.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index b2a3817f..dc920020 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -169,9 +169,8 @@ def fit(self, X, y): :return: The trained LMNN model. 
""" - # Validate inputs + # Check input arrays X, y = self._prepare_inputs(X, y, dtype=float, ensure_min_samples=2) - X, y, classes = self._validate_params(X, y) n_samples, n_features = X.shape n_components = _check_n_components(n_features, self.n_components) @@ -181,6 +180,9 @@ def fit(self, X, y): verbose=self.verbose, random_state=self.random_state) + # remove singletons after initializing components + X, y, classes = self._validate_params(X, y) + # Find the target neighbors of each sample target_neighbors = self._select_target_neighbors(X, y, classes) From f0bbd8fedf5cb011561fb5ae268c39c059e253f1 Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Sat, 17 Apr 2021 00:00:26 +0200 Subject: [PATCH 03/10] add max_impostors param and improve documentation --- metric_learn/lmnn.py | 292 ++++++++++++++++++++++++++++---------- test/metric_learn_test.py | 17 ++- test/test_base_metric.py | 4 +- 3 files changed, 230 insertions(+), 83 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index dc920020..0466d39e 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -9,6 +9,7 @@ from scipy.sparse import coo_matrix, csr_matrix, csc_matrix from sklearn.neighbors import NearestNeighbors from sklearn.metrics import euclidean_distances +from sklearn.utils import gen_batches, check_random_state from sklearn.base import TransformerMixin from sklearn.exceptions import ConvergenceWarning @@ -75,6 +76,11 @@ class LMNN(MahalanobisMixin, TransformerMixin): :meth:`fit` and n_features_a must be less than or equal to that. If ``n_components`` is not None, n_features_a must match it. + max_impostors : int, optional (default=500_000) + Maximum number of impostors to consider per iteration. In the worst + case this will allow ``max_impostors * n_neighbors`` constraints to be + active. + neighbors_params : dict, optional (default=None) Parameters to pass to a :class:`neighbors.NearestNeighbors` instance - apart from ``n_neighbors`` - that will be used to select the target @@ -146,12 +152,14 @@ class LMNN(MahalanobisMixin, TransformerMixin): """ def __init__(self, n_neighbors=3, n_components=None, init='auto', - neighbors_params=None, push_loss_weight=0.5, max_iter=50, - tol=1e-5, preprocessor=None, verbose=False, random_state=None): + max_impostors=500_000, neighbors_params=None, + push_loss_weight=0.5, max_iter=50, tol=1e-5, + preprocessor=None, verbose=False, random_state=None): self.n_neighbors = n_neighbors self.n_components = n_components self.init = init + self.max_impostors = max_impostors self.neighbors_params = neighbors_params self.push_loss_weight = push_loss_weight self.max_iter = max_iter @@ -161,7 +169,7 @@ def __init__(self, n_neighbors=3, n_components=None, init='auto', super(LMNN, self).__init__(preprocessor) def fit(self, X, y): - """ Train Large Margin Nearest Neighbor model. + """Train Large Margin Nearest Neighbor model. :param X: array, shape (n_samples, n_features), the training samples. :param y: array, shape (n_samples,), the training labels. 
@@ -175,10 +183,13 @@ def fit(self, X, y): n_samples, n_features = X.shape n_components = _check_n_components(n_features, self.n_components) + # Initialize the random generator + self.random_state_ = check_random_state(self.random_state) + # Initialize transformation self.components_ = _initialize_components(n_components, X, y, self.init, verbose=self.verbose, - random_state=self.random_state) + random_state=self.random_state_) # remove singletons after initializing components X, y, classes = self._validate_params(X, y) @@ -186,13 +197,13 @@ def fit(self, X, y): # Find the target neighbors of each sample target_neighbors = self._select_target_neighbors(X, y, classes) - # Compute the gradient w.r.t. the target neighbors which remains constant + # Compute the pull loss gradient w.r.t. M, which remains constant tn_graph = _make_knn_graph(target_neighbors) - const_grad = _sum_weighted_outer_differences(X, tn_graph) + pull_loss_grad_m = _sum_weighted_outer_products(X, tn_graph) # Compute the pull loss weight such that the push loss weight becomes 1 pull_loss_weight = (1 - self.push_loss_weight) / self.push_loss_weight - const_grad *= pull_loss_weight + pull_loss_grad_m *= pull_loss_weight int_verbose = int(self.verbose) disp = int_verbose - 2 if int_verbose > 1 else -1 @@ -200,7 +211,7 @@ def fit(self, X, y): 'method': 'L-BFGS-B', 'fun': self._loss_grad_lbfgs, 'jac': True, - 'args': (X, y, classes, target_neighbors, const_grad), + 'args': (X, y, classes, target_neighbors, pull_loss_grad_m), 'x0': self.components_, 'tol': self.tol, 'options': dict(maxiter=self.max_iter, disp=disp), @@ -224,7 +235,7 @@ def fit(self, X, y): return self def _validate_params(self, X, y): - """ Validate parameters as soon as :meth:`fit` is called. + """Validate parameters as soon as :meth:`fit` is called. :param X: array, shape (n_samples, n_features), the training samples. :param y: array, shape (n_samples,), the training labels. @@ -274,8 +285,9 @@ def _validate_params(self, X, y): return X, y, classes_inverse_non_singleton - def _loss_grad_lbfgs(self, L, X, y, classes, target_neighbors, const_grad): - """ Compute loss and gradient after one optimization iteration. + def _loss_grad_lbfgs(self, L, X, y, classes, target_neighbors, + pull_loss_grad_m): + """Compute loss and gradient after one optimization iteration. :param L: array, shape (n_components * n_features,) The current (flattened) linear transformation. @@ -284,8 +296,8 @@ def _loss_grad_lbfgs(self, L, X, y, classes, target_neighbors, const_grad): :param classes: array, shape (n_classes,), classes encoded as intergers. :param target_neighbors: array, shape (n_samples, n_neighbors), the target neighbors of each training sample. - :param const_grad: array, shape (n_features, n_features), - the (weighted) gradient component due to target + :param pull_loss_grad_m: array, shape (n_features, n_features), + the (weighted) gradient (w.r.t M=L.T@L) due to target neighbors that stays fixed throughout the algorithm. 
:return: @@ -326,19 +338,21 @@ def _loss_grad_lbfgs(self, L, X, y, classes, target_neighbors, const_grad): dist_tn += 1 # Find impostors - impostors_graph = _find_impostors(X_embedded, y, classes, dist_tn[:, -1]) + impostors_graph = self._find_impostors(X_embedded, y, classes, + dist_tn[:, -1]) # Compute the push loss and its gradient - push_loss, push_loss_grad, n_active = _compute_push_loss( + push_loss, push_loss_grad_m, n_active = _push_loss_grad( X, target_neighbors, dist_tn, impostors_graph) # Compute the total loss M = L.T @ L - new_pull_loss = np.dot(const_grad.ravel(), M.ravel()) + # It holds that trace(M.T @ C) = np.dot(M.ravel(), C.ravel()) + new_pull_loss = np.dot(pull_loss_grad_m.ravel(), M.ravel()) loss = push_loss + new_pull_loss - # Compute the total gradient - grad = np.dot(L, const_grad + push_loss_grad) + # Compute the total gradient (grad_L = 2L grad_M) + grad = np.dot(L, pull_loss_grad_m + push_loss_grad_m) grad *= 2 # Report iteration metrics @@ -347,14 +361,14 @@ def _loss_grad_lbfgs(self, L, X, y, classes, target_neighbors, const_grad): elapsed_time = time.time() - start_time values_fmt = '[{}] {:>10} {:>20.6e} {:>20,} {:>10.2f}' cls_name = self.__class__.__name__ - print(values_fmt.format(cls_name, self.n_iter_, loss, n_active, - elapsed_time)) + print(values_fmt.format(cls_name, self.n_iter_, loss, + n_active, elapsed_time)) sys.stdout.flush() return loss, grad.ravel() def _select_target_neighbors(self, X, y, classes): - """ Find the target neighbors of each training sample. + """Find the target neighbors of each training sample. :param X: array, shape (n_samples, n_features), the training samples. :param y: array, shape (n_samples,), the training labels. @@ -387,68 +401,152 @@ def _select_target_neighbors(self, X, y, classes): return target_neighbors + def _find_impostors(self, X_embedded, y, classes, margin_radii): + """Find the samples that violate the margin. + + :param X_embedded: array, shape (n_samples, n_components), + the training samples in the embedding space. + :param y: array, shape (n_samples,), training labels. + :param classes: array, shape (n_classes,), the classes encoded as integers. + :param margin_radii: array, shape (n_samples,), squared distances of + training samples to their farthest target neighbors + plus margin. + + :return: coo_matrix, shape (n_samples, n_neighbors), + If sample i is an impostor to sample j or vice versa, then the + value of A[i, j] is the squared distance between samples i and j. + Otherwise A[i, j] is zero. + + """ -def _find_impostors(X_embedded, y, classes, margin_radii): - """ Find the samples that violate the margin. + # Initialize lists for impostors storage + imp_row, imp_col = [], [] - :param X_embedded: array, shape (n_samples, n_components), - the training samples in the embedding space. - :param y: array, shape (n_samples,), training labels. - :param classes: array, shape (n_classes,), the classes encoded as integers. - :param margin_radii: array, shape (n_samples,), squared distances of - training samples to their farthest target neighbors - plus margin. + for label in classes[:-1]: + ind_a = np.where(y == label)[0] + ind_b = np.where(y > label)[0] - :return: coo_matrix, shape (n_samples, n_neighbors), - If sample i is an impostor to sample j or vice versa, then the - value of A[i, j] is the squared distance between samples i and j. - Otherwise A[i, j] is zero. 
+ ind_impostors = _find_impostors_blockwise(X_embedded, margin_radii, + ind_b, ind_a) + + n_impostors = len(ind_impostors) + if n_impostors: + if n_impostors > self.max_impostors: + ind_samples = self.random_state_.choice( + n_impostors, self.max_impostors, replace=False) + ind_impostors = ind_impostors[ind_samples] + + dims = (len(ind_b), len(ind_a)) + ib, ia = np.unravel_index(ind_impostors, shape=dims) + + imp_row.extend(ind_b[ib]) + imp_col.extend(ind_a[ia]) + + # turn lists to numpy arrays + imp_row = np.asarray(imp_row, dtype=np.intp) + imp_col = np.asarray(imp_col, dtype=np.intp) + + # Make sure we do not exceed max_impostors + n_impostors = len(imp_row) + if n_impostors > self.max_impostors: + ind_samples = self.random_state_.choice( + n_impostors, self.max_impostors, replace=False) + imp_row = imp_row[ind_samples] + imp_col = imp_col[ind_samples] + + # Compute distances + imp_dist = _paired_distances_blockwise(X_embedded, imp_row, imp_col) + + # store impostors as a sparse matrix + n = X_embedded.shape[0] + impostors_graph = coo_matrix((imp_dist, (imp_row, imp_col)), shape=(n, n)) + + return impostors_graph - """ - # Initialize lists for impostors storage - imp_row, imp_col, imp_dist = [], [], [] +######################## +# Some core functions # +####################### - for label in classes[:-1]: - ind_a = np.where(y == label)[0] - ind_b = np.where(y > label)[0] +def _find_impostors_blockwise(X, radii, ind_a, ind_b, + return_distance=False, block_size=8): + """Find (sample, impostor) pairs in blocks to avoid large memory usage. - dist = euclidean_distances(X_embedded[ind_b], X_embedded[ind_a], - squared=True) # (b, a) + Parameters + ---------- + X : array, shape (n_samples, n_components) + Transformed data samples. + + radii : array, shape (n_samples,) + Squared distances of the samples in ``X`` to their margins. + + ind_a : array, shape (n_samples_a,) + Indices of samples from class A. + + ind_b : array, shape (n_samples_b,) + Indices of samples from class B. + + block_size : int, optional (default=8) + The maximum number of mebibytes (MiB) of memory to use at a time for + calculating paired squared distances. + return_distance : bool, optional (default=False) + Whether to return the squared distances to the impostors. + + Returns + ------- + imp_indices : array, shape (n_impostors,) + Unraveled indices of (sample, impostor) pairs referring to a matrix + of shape (n_samples_a, n_samples_b). 
+ imp_distances : array, shape (n_impostors,), optional + imp_distances[i] is the squared distance between samples imp_row[i] and + imp_col[i], where + imp_row, imp_col = np.unravel_index(imp_indices, shape=(n_samples_a, + n_samples_b)) + """ + + n_samples_a = len(ind_a) + bytes_per_row = len(ind_b) * X.itemsize + block_n_rows = int(block_size * 1024 * 1024 // bytes_per_row) - radii_a = margin_radii[ind_a] # (a,) - radii_b = margin_radii[ind_b] # (b,) + imp_indices, imp_distances = [], [] - # Find samples in B that are impostors to A - imp_ba = np.where((dist < radii_a[None, :]).ravel())[0] + # data + X_b = X[ind_b] + radii_a = radii[ind_a] + radii_b = radii[ind_b] - # Find samples in A that are impostors to B - imp_ab = np.where((dist < radii_b[:, None]).ravel())[0] + # X_b squared norm stays constant, so pre-compute it to get a speed-up + X_b_norm_squared = np.einsum('ij,ij->i', X_b, X_b).reshape(1, -1) + for chunk in gen_batches(n_samples_a, block_n_rows): - # merge and filter unique impostors - ind_impostors = np.unique(np.concatenate((imp_ba, imp_ab))) + dist = euclidean_distances(X[ind_a[chunk]], X_b, squared=True, + Y_norm_squared=X_b_norm_squared) - if len(ind_impostors): - # Map indices back to the original training data indices - ii, jj = np.unravel_index(ind_impostors, dist.shape) - imp_row.extend(ind_b[ii]) - imp_col.extend(ind_a[jj]) - imp_dist.extend(dist.ravel()[ind_impostors]) + ind_b, = np.where((dist < radii_a[chunk, None]).ravel()) + ind_a, = np.where((dist < radii_b[None, :]).ravel()) + ind = np.unique(np.concatenate((ind_a, ind_b))) - # turn lists to numpy arrays - imp_row = np.asarray(imp_row, dtype=np.intp) - imp_col = np.asarray(imp_col, dtype=np.intp) - imp_dist = np.asarray(imp_dist) + if len(ind): + ind_plus_offset = ind + chunk.start * X_b.shape[0] + imp_indices.extend(ind_plus_offset) - # store impostors as a sparse matrix - n = X_embedded.shape[0] - impostors_graph = coo_matrix((imp_dist, (imp_row, imp_col)), shape=(n, n)) + if return_distance: + # We only need to do clipping if we return the distances. + distances_chunk = dist.ravel()[ind] + # Clip only the indexed (unique) distances + np.maximum(distances_chunk, 0, out=distances_chunk) + imp_distances.extend(distances_chunk) - return impostors_graph + imp_indices = np.asarray(imp_indices) + if return_distance: + return imp_indices, np.asarray(imp_distances) + else: + return imp_indices -def _compute_push_loss(X, target_neighbors, inflated_dist_tn, impostors_graph): - """ Compute the push loss L_push = max(d(a, p) + 1 - d(a, n), 0) + +def _push_loss_grad(X, target_neighbors, inflated_dist_tn, impostors_graph): + """Compute the push loss max(d(a, p) + 1 - d(a, n), 0) and its gradient. :param X: array, shape (n_samples, n_features), the training samples. :param target_neighbors: array, shape (n_samples, n_neighbors), @@ -460,7 +558,8 @@ def _compute_push_loss(X, target_neighbors, inflated_dist_tn, impostors_graph): :return: loss: float, the push loss due to the given target neighbors and impostors. - grad: array, shape (n_features, n_features), the gradient of the push loss. + grad: array, shape (n_features, n_features), the gradient of the push loss + with respect to M = L.T @ L. n_active_triplets: int, the number of active triplet constraints. 
""" @@ -496,19 +595,68 @@ def _compute_push_loss(X, target_neighbors, inflated_dist_tn, impostors_graph): A3 = csr_matrix((val, (sample_range, target_neighbors[:, k])), shape) A0 = A0 - A1 - A2 + A3 - grad = _sum_weighted_outer_differences(X, A0) + grad = _sum_weighted_outer_products(X, A0) return loss, grad, n_active_triplets -def _sum_weighted_outer_differences(X, weights): - """ Compute the sum of weighted outer pairwise differences. +########################## +# Some helper functions # +######################### + +def _paired_distances_blockwise(X, ind_a, ind_b, squared=True, block_size=8): + """Equivalent to row_norms(X[ind_a] - X[ind_b], squared=squared). + + Parameters + ---------- + X : array, shape (n_samples, n_features) + An array of data samples. + ind_a : array, shape (n_indices,) + An array of sample indices. + ind_b : array, shape (n_indices,) + Another array of sample indices. + squared : bool (default=True) + Whether to return the squared distances. + block_size : int, optional (default=8) + The maximum number of mebibytes (MiB) of memory to use at a time for + calculating paired (squared) distances. + + Returns + ------- + distances: array, shape (n_indices,) + An array of paired, optionally squared, distances. + """ + + bytes_per_row = X.shape[1] * X.itemsize + batch_size = int(block_size*1024*1024 // bytes_per_row) + + n_pairs = len(ind_a) + distances = np.zeros(n_pairs) + for chunk in gen_batches(n_pairs, batch_size): + diffs = X[ind_a[chunk]] - X[ind_b[chunk]] + distances[chunk] = np.einsum('ij,ij->i', diffs, diffs) + + return distances if squared else np.sqrt(distances, out=distances) + + +def _sum_weighted_outer_products(X, weights): + """Compute the sum of weighted outer products. + + Consider the squared Mahalanobis distance between x_i and x_j: + + d_{ij} = (x_i - x_j)^T M (x_i - x_j) + = x_i^T M x_i - x_i^T M x_j - x_j^T M x_i + x_j^T M x_j + + The gradient of d_{ij} with respect to M is a sum of outer products: + + \nabla_M d_{ij} = x_i x_i^T - x_j x_i^T - x_i x_j^T + x_j x_j^T + :param X: array, shape (n_samples, n_features), data samples. :param weights: csr_matrix, shape (n_samples, n_samples), sparse weights. :return: array, shape (n_features, n_features), - the sum of all outer weighted differences. + the sum of all weighted outer products. """ W = weights + weights.T @@ -522,7 +670,7 @@ def _sum_weighted_outer_differences(X, weights): def _make_knn_graph(indices): - """ Convert a dense indices matrix to a sparse adjacency matrix. + """Convert a dense indices matrix to a sparse adjacency matrix. :param indices: array, shape (n_samples, n_neighbors), indices of the k nearest neighbors of each sample. diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 4af776cd..f712dc7e 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -27,10 +27,9 @@ # Import this specially for testing. 
from metric_learn.constraints import wrap_pairs, Constraints from metric_learn.lmnn import ( - _sum_weighted_outer_differences, + _sum_weighted_outer_products, _make_knn_graph, - _find_impostors, - _compute_push_loss + _push_loss_grad ) @@ -395,13 +394,13 @@ def test_loss_grad_lbfgs(self): # sum outer products tn_graph = _make_knn_graph(target_neighbors) - const_grad = _sum_weighted_outer_differences(X, tn_graph) + pull_loss_grad_m = _sum_weighted_outer_products(X, tn_graph) kwargs = { 'classes': classes, 'target_neighbors': target_neighbors, - 'const_grad': const_grad, + 'pull_loss_grad_m': pull_loss_grad_m, } def fun(L): @@ -438,9 +437,9 @@ def test_compute_push_loss(): dist_tn = dist_tn[:, None] dist_tn += 1 margin_radii = dist_tn[:, -1] - impostors_graph = _find_impostors(X, y, classes, margin_radii) - loss, grad, _ = _compute_push_loss(X, target_neighbors, dist_tn, - impostors_graph) + impostors_graph = lmnn._find_impostors(X, y, classes, margin_radii) + loss, grad, _ = _push_loss_grad(X, target_neighbors, dist_tn, + impostors_graph) # The loss should be 4. (1. for each of the 4 violation) assert loss == 4. @@ -467,7 +466,7 @@ def test_toy_ex_lmnn(X, y, loss): # sum outer products tn_graph = _make_knn_graph(target_neighbors) - const_grad = _sum_weighted_outer_differences(X, tn_graph) + const_grad = _sum_weighted_outer_products(X, tn_graph) # assert that the loss equals the one computed by hand lmnn.n_iter_ = 0 diff --git a/test/test_base_metric.py b/test/test_base_metric.py index ea42f603..d0db85fd 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -40,10 +40,10 @@ def test_lmnn(self): 'n_components': None, 'preprocessor': None, 'random_state': None, 'push_loss_weight': 0.5, 'verbose': False} - nndef_kwargs = {'tol': 0.01, 'n_neighbors': 6} + nndef_kwargs = {'n_neighbors': 6, 'tol': 0.01} merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) self.assertEqual( - remove_spaces(str(metric_learn.LMNN(tol=0.01, n_neighbors=6))), + remove_spaces(str(metric_learn.LMNN(n_neighbors=6, tol=0.01))), remove_spaces(f"LMNN({merged_kwargs})")) def test_nca(self): From eb609d95626f93baaace2daf1686834b7476e68a Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Sat, 17 Apr 2021 11:27:58 +0200 Subject: [PATCH 04/10] fix LMNN StringRepr test --- test/metric_learn_test.py | 1 - test/test_base_metric.py | 10 ++++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index f712dc7e..ce855afc 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -396,7 +396,6 @@ def test_loss_grad_lbfgs(self): tn_graph = _make_knn_graph(target_neighbors) pull_loss_grad_m = _sum_weighted_outer_products(X, tn_graph) - kwargs = { 'classes': classes, 'target_neighbors': target_neighbors, diff --git a/test/test_base_metric.py b/test/test_base_metric.py index d0db85fd..9a62f8d9 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -36,10 +36,12 @@ def test_covariance(self): remove_spaces(f"Covariance({merged_kwargs})")) def test_lmnn(self): - def_kwargs = {'tol': 0.001, 'init': 'auto', 'n_neighbors': 3, - 'n_components': None, 'preprocessor': None, - 'random_state': None, 'push_loss_weight': 0.5, - 'verbose': False} + def_kwargs = {'n_neighbors': 3, 'n_components': None, 'init': 'auto', + 'max_impostors': 500_000, 'neighbors_params': None, + 'push_loss_weight': 0.5, 'max_iter': 50, 'tol': 1e-5, + 'preprocessor': None, 'verbose': False, + 'random_state': None} + def_kwargs = {k: def_kwargs[k] for k in 
sorted(def_kwargs.keys())} nndef_kwargs = {'n_neighbors': 6, 'tol': 0.01} merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) self.assertEqual( From 53ec6e9dae1e5b65ce62cb9ea9c33222f88ae4ce Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Sat, 17 Apr 2021 11:52:46 +0200 Subject: [PATCH 05/10] fix flake8 errors --- metric_learn/lmnn.py | 8 ++++---- test/metric_learn_test.py | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 0466d39e..c5d94ceb 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -276,10 +276,10 @@ def _validate_params(self, X, y): min_non_singleton_size = class_sizes[~mask_singleton_class].min() if self.n_neighbors >= min_non_singleton_size: warnings.warn('`n_neighbors` (={}) is not less than the number of ' - 'samples in the smallest non-singleton class (={}). ' - '`n_neighbors_` will be set to {} for estimation.' - .format(self.n_neighbors, min_non_singleton_size, - min_non_singleton_size - 1)) + 'samples in the smallest non-singleton class (={}). ' + '`n_neighbors_` will be set to {} for estimation.' + .format(self.n_neighbors, min_non_singleton_size, + min_non_singleton_size - 1)) self.n_neighbors_ = min(self.n_neighbors, min_non_singleton_size - 1) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index ce855afc..44ea0eb3 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -2,9 +2,8 @@ import re import pytest import numpy as np -import scipy from scipy.optimize import check_grad, approx_fprime -from sklearn.metrics import pairwise_distances, euclidean_distances +from sklearn.metrics import pairwise_distances from sklearn.datasets import (load_iris, make_classification, make_regression, make_spd_matrix) from numpy.testing import (assert_array_almost_equal, assert_array_equal, From d407def0b135130a77067fef09638f69ff841f18 Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Wed, 28 Apr 2021 10:45:55 +0200 Subject: [PATCH 06/10] fix double-naming bug in fib --- metric_learn/lmnn.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index c5d94ceb..c37b54b7 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -122,6 +122,8 @@ class LMNN(MahalanobisMixin, TransformerMixin): n_iter_ : `int` The number of iterations the solver has run. + random_state_ : numpy.RandomState + Pseudo random number generator object used during initialization. Examples -------- @@ -484,7 +486,7 @@ def _find_impostors_blockwise(X, radii, ind_a, ind_b, Indices of samples from class A. ind_b : array, shape (n_samples_b,) - Indices of samples from class B. + Indices of samples from class B, where n_samples_b << n_samples_a. 
block_size : int, optional (default=8) The maximum number of mebibytes (MiB) of memory to use at a time for @@ -522,9 +524,9 @@ def _find_impostors_blockwise(X, radii, ind_a, ind_b, dist = euclidean_distances(X[ind_a[chunk]], X_b, squared=True, Y_norm_squared=X_b_norm_squared) - ind_b, = np.where((dist < radii_a[chunk, None]).ravel()) - ind_a, = np.where((dist < radii_b[None, :]).ravel()) - ind = np.unique(np.concatenate((ind_a, ind_b))) + ind_ba, = np.where((dist < radii_a[chunk, None]).ravel()) + ind_ab, = np.where((dist < radii_b[None, :]).ravel()) + ind = np.unique(np.concatenate((ind_ab, ind_ba))) if len(ind): ind_plus_offset = ind + chunk.start * X_b.shape[0] From 131d0b2049a730853a812646aeaa16fdc0fe7a03 Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Sun, 23 May 2021 22:41:59 +0200 Subject: [PATCH 07/10] add deprecation warnings for k, regularization --- metric_learn/lmnn.py | 38 +++++++++++++++++++++++++++++++++----- test/metric_learn_test.py | 15 +++++++++++++-- 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index c37b54b7..dc8e219d 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -33,6 +33,12 @@ class LMNN(MahalanobisMixin, TransformerMixin): n_neighbors : int, optional (default=3) Number of neighbors to consider, not including self-edges. + k : Not used + .. deprecated:: 0.6.3 + `k` was deprecated in version 0.6.3 and will + be removed in 0.6.4. It is replaced by `n_neighbors` to have more + consistent terminology with scikit-learn. + n_components : int or None, optional (default=None) Dimensionality of reduced space (if None, defaults to dimension of X). @@ -90,6 +96,13 @@ class LMNN(MahalanobisMixin, TransformerMixin): Relative weight between pull and push terms, with 0.5 meaning equal weight. + regularization : Not used + .. deprecated:: 0.6.3 + `regularization` was deprecated in version 0.6.3 and will + be removed in 0.6.4. It is replaced by `push_loss_weight` that more + explicitly defines which loss term is meant to be weighted. + + max_iter : int, optional (default=1000) Maximum number of iterations of the optimization procedure. @@ -153,17 +166,19 @@ class LMNN(MahalanobisMixin, TransformerMixin): https://en.wikipedia.org/wiki/Large_margin_nearest_neighbor """ - def __init__(self, n_neighbors=3, n_components=None, init='auto', - max_impostors=500_000, neighbors_params=None, - push_loss_weight=0.5, max_iter=50, tol=1e-5, - preprocessor=None, verbose=False, random_state=None): + def __init__(self, n_neighbors=3, k='deprecated', n_components=None, + init='auto', max_impostors=500_000, neighbors_params=None, + push_loss_weight=0.5, regularization='deprecated', max_iter=50, + tol=1e-5, preprocessor=None, verbose=False, random_state=None): self.n_neighbors = n_neighbors + self.k = k self.n_components = n_components self.init = init self.max_impostors = max_impostors self.neighbors_params = neighbors_params self.push_loss_weight = push_loss_weight + self.regularization = regularization self.max_iter = max_iter self.tol = tol self.verbose = verbose @@ -248,6 +263,19 @@ def _validate_params(self, X, y): classes: array, shape (n_classes,), the non-singleton classes encoded. """ + # TODO: remove these in v0.6.4 + if self.k != 'deprecated': + warnings.warn('"k" parameter from initialization is not used.' + ' It has been deprecated in version 0.6.3 and will be' + ' removed in 0.6.4. 
Use the "n_neighbors" parameter ' + 'instead.', DeprecationWarning) + + if self.regularization != 'deprecated': + warnings.warn('"regularization" parameter from initialization is not ' + 'used. It has been deprecated in version 0.6.3 and will ' + 'be removed in 0.6.4. Use the "push_loss_weight" parameter' + 'instead.', DeprecationWarning) + # Find the appearing classes and the class index of each of the samples classes, y = np.unique(y, return_inverse=True) classes_inverse = np.arange(len(classes)) @@ -630,7 +658,7 @@ def _paired_distances_blockwise(X, ind_a, ind_b, squared=True, block_size=8): """ bytes_per_row = X.shape[1] * X.itemsize - batch_size = int(block_size*1024*1024 // bytes_per_row) + batch_size = int(block_size * 1024 * 1024 // bytes_per_row) n_pairs = len(ind_a) distances = np.zeros(n_pairs) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 44ea0eb3..2355dc40 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -396,7 +396,7 @@ def test_loss_grad_lbfgs(self): pull_loss_grad_m = _sum_weighted_outer_products(X, tn_graph) kwargs = { - 'classes': classes, + 'classes': classes, 'target_neighbors': target_neighbors, 'pull_loss_grad_m': pull_loss_grad_m, } @@ -407,8 +407,19 @@ def fun(L): def grad(L): return lmnn._loss_grad_lbfgs(L, X, y, **kwargs)[1] + # compute gradient with and without finite differences + epsilon = np.sqrt(np.finfo(float).eps) + grad_fin_diff = approx_fprime(L.ravel(), fun, epsilon) + grad_lmnn = grad(L) + + # compute absolute error + grad_error = np.sqrt((grad_lmnn - grad_fin_diff)**2) + # compute relative error - rel_diff = check_grad(fun, grad, L.ravel()) / np.linalg.norm(grad(L)) + # rel_diff1 = grad_error / np.linalg.norm(grad_fin_diff) + # rel_diff2 = grad_error / np.linalg.norm(grad_lmnn) + + rel_diff = grad_error / np.linalg.norm(grad_lmnn) np.testing.assert_almost_equal(rel_diff, 0., decimal=5) From 4b9a01ae1b5ffcd2543cfda84360a58e2def880a Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Sun, 23 May 2021 23:08:43 +0200 Subject: [PATCH 08/10] add deprecation vars in TestStringRepr --- test/test_base_metric.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/test_base_metric.py b/test/test_base_metric.py index 9a62f8d9..2d8fd69e 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -36,9 +36,10 @@ def test_covariance(self): remove_spaces(f"Covariance({merged_kwargs})")) def test_lmnn(self): - def_kwargs = {'n_neighbors': 3, 'n_components': None, 'init': 'auto', - 'max_impostors': 500_000, 'neighbors_params': None, - 'push_loss_weight': 0.5, 'max_iter': 50, 'tol': 1e-5, + def_kwargs = {'n_neighbors': 3, 'k': 'deprecated', 'n_components': None, + 'init': 'auto', 'max_impostors': 500_000, + 'neighbors_params': None, 'push_loss_weight': 0.5, + 'regularization': 'deprecated', 'max_iter': 50, 'tol': 1e-5, 'preprocessor': None, 'verbose': False, 'random_state': None} def_kwargs = {k: def_kwargs[k] for k in sorted(def_kwargs.keys())} From cc5c7131c1d30015bd2a477c45449b79117b50e1 Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Sun, 29 Aug 2021 15:31:49 +0200 Subject: [PATCH 09/10] add back test_loss_func() test, make random state private --- metric_learn/lmnn.py | 10 +++--- test/metric_learn_test.py | 71 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index dc8e219d..dfb3f2fe 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -135,8 +135,6 @@ class 
LMNN(MahalanobisMixin, TransformerMixin): n_iter_ : `int` The number of iterations the solver has run. - random_state_ : numpy.RandomState - Pseudo random number generator object used during initialization. Examples -------- @@ -201,12 +199,12 @@ def fit(self, X, y): n_components = _check_n_components(n_features, self.n_components) # Initialize the random generator - self.random_state_ = check_random_state(self.random_state) + self._random_state = check_random_state(self.random_state) # Initialize transformation self.components_ = _initialize_components(n_components, X, y, self.init, verbose=self.verbose, - random_state=self.random_state_) + random_state=self._random_state) # remove singletons after initializing components X, y, classes = self._validate_params(X, y) @@ -462,7 +460,7 @@ def _find_impostors(self, X_embedded, y, classes, margin_radii): n_impostors = len(ind_impostors) if n_impostors: if n_impostors > self.max_impostors: - ind_samples = self.random_state_.choice( + ind_samples = self._random_state.choice( n_impostors, self.max_impostors, replace=False) ind_impostors = ind_impostors[ind_samples] @@ -479,7 +477,7 @@ def _find_impostors(self, X_embedded, y, classes, margin_radii): # Make sure we do not exceed max_impostors n_impostors = len(imp_row) if n_impostors > self.max_impostors: - ind_samples = self.random_state_.choice( + ind_samples = self._random_state.choice( n_impostors, self.max_impostors, replace=False) imp_row = imp_row[ind_samples] imp_col = imp_col[ind_samples] diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 2355dc40..2fce2fa4 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -423,6 +423,77 @@ def grad(L): np.testing.assert_almost_equal(rel_diff, 0., decimal=5) +def test_loss_func(capsys): + """Test the loss function (and its gradient) on a simple example, + by comparing the results with the actual implementation of metric-learn, + with a very simple (but nonperformant) implementation""" + + import scipy + from sklearn.metrics import euclidean_distances + + # toy dataset to use + X, y = make_classification(n_samples=40, n_classes=2, + n_features=6, + n_redundant=0, shuffle=True, + scale=[1, 1, 20, 20, 20, 20], random_state=42) + + def hinge(a): + if a > 0: + return a, 1 + else: + return 0, 0 + + def loss_fn(L, X, y, target_neighbors, reg): + L = L.reshape(-1, X.shape[1]) + Lx = np.dot(X, L.T) + loss = 0 + total_active = 0 + grad = np.zeros_like(L) + for i in range(X.shape[0]): + for j in target_neighbors[i]: + loss += (1 - reg) * np.sum((Lx[i] - Lx[j]) ** 2) + grad += (1 - reg) * np.outer(Lx[i] - Lx[j], X[i] - X[j]) + for k in range(X.shape[0]): + if y[i] != y[k]: + hin, active = hinge(1 + np.sum((Lx[i] - Lx[j])**2) - + np.sum((Lx[i] - Lx[k])**2)) + total_active += active + if active: + loss += reg * hin + grad += (reg * (np.outer(Lx[i] - Lx[j], X[i] - X[j]) - + np.outer(Lx[i] - Lx[k], X[i] - X[k]))) + grad = 2 * grad + return grad, loss, total_active + + # we check that the gradient we have computed in the non-performant implem + # is indeed the true gradient on a toy example: + + def _select_targets(X, y, k): + target_neighbors = np.empty((X.shape[0], k), dtype=int) + for label in np.unique(y): + inds, = np.nonzero(y == label) + dd = euclidean_distances(X[inds], squared=True) + np.fill_diagonal(dd, np.inf) + nn = np.argsort(dd)[..., :k] + target_neighbors[inds] = inds[nn] + return target_neighbors + + target_neighbors = _select_targets(X, y, 2) + regularization = 0.5 + n_features = X.shape[1] + x0 = 
np.random.randn(1, n_features)
+
+  def loss(x0):
+    return loss_fn(x0.reshape(-1, n_features), X, y, target_neighbors,
+                   regularization)[1]
+
+  def grad(x0):
+    return loss_fn(x0.reshape(-1, n_features), X, y, target_neighbors,
+                   regularization)[0].ravel()
+
+  scipy.optimize.check_grad(loss, grad, x0.ravel())
+
+
 def test_compute_push_loss():
   """Test if the push loss is computed correctly

From e5cd82e04a1f0de38d0352acbc43c8fd3ea0c5c9 Mon Sep 17 00:00:00 2001
From: John Chiotellis
Date: Sun, 29 Aug 2021 16:03:19 +0200
Subject: [PATCH 10/10] add better docstring for push_loss_weight

---
 metric_learn/lmnn.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py
index dfb3f2fe..95868f37 100644
--- a/metric_learn/lmnn.py
+++ b/metric_learn/lmnn.py
@@ -93,8 +93,10 @@ class LMNN(MahalanobisMixin, TransformerMixin):
     neighbors.
 
   push_loss_weight: float, optional (default=0.5)
-    Relative weight between pull and push terms, with 0.5 meaning equal
-    weight.
+    Relative weight between the pull loss and push loss terms, with 0.5
+    meaning equal weight. Note that in practice the push loss term is left
+    unweighted and the pull loss term is weighted accordingly by a factor of
+    `(1 - push_loss_weight) / push_loss_weight`.
 
   regularization : Not used
     .. deprecated:: 0.6.3
@@ -102,7 +104,6 @@ class LMNN(MahalanobisMixin, TransformerMixin):
       be removed in 0.6.4. It is replaced by `push_loss_weight` that more
       explicitly defines which loss term is meant to be weighted.
 
-
   max_iter : int, optional (default=1000)
     Maximum number of iterations of the optimization procedure.
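
As a worked illustration of the reweighting described in the docstring above, and of the
outer-product form of the gradient that the `_sum_weighted_outer_products` docstring relies
on, here is a minimal NumPy sketch; the names `w`, `pull`, `push` and `d` are illustrative
only and are not part of the library:

import numpy as np

rng = np.random.RandomState(0)
x_i, x_j = rng.randn(5), rng.randn(5)
M = rng.randn(5, 5)
diff = x_i - x_j


def d(M_):
    # squared Mahalanobis-style distance d_ij = (x_i - x_j)^T M (x_i - x_j)
    return diff @ M_ @ diff


# The gradient of d_ij with respect to M is the outer product
# (x_i - x_j)(x_i - x_j)^T; check one entry by finite differences.
grad_analytic = np.outer(diff, diff)
eps = 1e-6
E = np.zeros_like(M)
E[2, 3] = eps
grad_fd_23 = (d(M + E) - d(M)) / eps
assert np.isclose(grad_analytic[2, 3], grad_fd_23)

# Reweighting: (1 - w) * pull + w * push and ((1 - w) / w) * pull + push
# differ only by the positive factor 1 / w, so they share the same minimizer.
w = 0.5                # push_loss_weight
pull, push = 3.7, 1.2  # illustrative loss values
assert np.isclose(((1 - w) * pull + w * push) / w,
                  ((1 - w) / w) * pull + push)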