From 226c80ce0bf1414cc5c08a8627157dc5a93bf374 Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Sat, 20 Mar 2021 16:59:10 +0100 Subject: [PATCH 01/10] add new LMNN --- .gitignore | 5 +- metric_learn/lmnn.py | 626 ++++++++++++++-------- test/metric_learn_test.py | 268 +++------ test/test_base_metric.py | 4 +- test/test_components_metric_conversion.py | 2 +- test/test_fit_transform.py | 4 +- test/test_mahalanobis_mixin.py | 2 +- 7 files changed, 494 insertions(+), 417 deletions(-) diff --git a/.gitignore b/.gitignore index 8321c7d2..02ee652d 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,7 @@ htmlcov/ .cache/ .pytest_cache/ doc/auto_examples/* -doc/generated/* \ No newline at end of file +doc/generated/* + +# Pycharm artifacts +.idea/ diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 8bdc4bf0..b2a3817f 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -1,10 +1,16 @@ """ Large Margin Nearest Neighbor Metric learning (LMNN) """ +import sys +import time +import warnings import numpy as np -from collections import Counter +from scipy.optimize import minimize +from scipy.sparse import coo_matrix, csr_matrix, csc_matrix +from sklearn.neighbors import NearestNeighbors from sklearn.metrics import euclidean_distances from sklearn.base import TransformerMixin +from sklearn.exceptions import ConvergenceWarning from ._util import _initialize_components, _check_n_components from .base_metric import MahalanobisMixin @@ -23,6 +29,12 @@ class LMNN(MahalanobisMixin, TransformerMixin): Parameters ---------- + n_neighbors : int, optional (default=3) + Number of neighbors to consider, not including self-edges. + + n_components : int or None, optional (default=None) + Dimensionality of reduced space (if None, defaults to dimension of X). + init : string or numpy array, optional (default='auto') Initialization of the linear transformation. Possible options are 'auto', 'pca', 'identity', 'random', and a numpy array of shape @@ -63,35 +75,28 @@ class LMNN(MahalanobisMixin, TransformerMixin): :meth:`fit` and n_features_a must be less than or equal to that. If ``n_components`` is not None, n_features_a must match it. - k : int, optional (default=3) - Number of neighbors to consider, not including self-edges. + neighbors_params : dict, optional (default=None) + Parameters to pass to a :class:`neighbors.NearestNeighbors` instance - + apart from ``n_neighbors`` - that will be used to select the target + neighbors. - min_iter : int, optional (default=50) - Minimum number of iterations of the optimization procedure. + push_loss_weight: float, optional (default=0.5) + Relative weight between pull and push terms, with 0.5 meaning equal + weight. max_iter : int, optional (default=1000) Maximum number of iterations of the optimization procedure. - learn_rate : float, optional (default=1e-7) - Learning rate of the optimization procedure - - tol : float, optional (default=0.001) + tol : float, optional (default=0.00001) Tolerance of the optimization procedure. If the objective value varies less than `tol`, we consider the algorithm has converged and stop it. - verbose : bool, optional (default=False) - Whether to print the progress of the optimization procedure. - - regularization: float, optional (default=0.5) - Relative weight between pull and push terms, with 0.5 meaning equal - weight. - preprocessor : array-like, shape=(n_samples, n_features) or callable The preprocessor to call to get tuples from indices. If array-like, tuples will be formed like this: X[indices]. 
- n_components : int or None, optional (default=None) - Dimensionality of reduced space (if None, defaults to dimension of X). + verbose : bool, optional (default=False) + Whether to print the progress of the optimization procedure. random_state : int or numpy.RandomState or None, optional (default=None) A pseudo random number generator object or a seed for it if int. If @@ -101,233 +106,432 @@ class LMNN(MahalanobisMixin, TransformerMixin): Attributes ---------- + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. + + n_neighbors_ : int + The provided ``n_neighbors`` is decreased if it is greater than or + equal to min(number of elements in each class). + n_iter_ : `int` The number of iterations the solver has run. - components_ : `numpy.ndarray`, shape=(n_components, n_features) - The learned linear transformation ``L``. Examples -------- >>> import numpy as np >>> from metric_learn import LMNN + >>> from sklearn.neighbors import KNeighborsClassifier >>> from sklearn.datasets import load_iris >>> iris_data = load_iris() >>> X = iris_data['data'] >>> Y = iris_data['target'] - >>> lmnn = LMNN(k=5, learn_rate=1e-6) - >>> lmnn.fit(X, Y, verbose=False) + >>> lmnn = LMNN(n_neighbors=3) + >>> lmnn.fit(X, Y) + >>> knn = KNeighborsClassifier(n_neighbors=3) + >>> knn.fit(lmnn.transform(X), Y) References ---------- - .. [1] K. Q. Weinberger, J. Blitzer, L. K. Saul. `Distance Metric - Learning for Large Margin Nearest Neighbor Classification - `_. NIPS - 2005. + .. [1] Weinberger, Kilian Q., and Lawrence K. Saul. + "Distance Metric Learning for Large Margin Nearest Neighbor + Classification." + Journal of Machine Learning Research, Vol. 10, Feb. 2009, + pp. 207-244. + http://jmlr.csail.mit.edu/papers/volume10/weinberger09a/weinberger09a.pdf + + .. [2] Wikipedia entry on Large Margin Nearest Neighbor + https://en.wikipedia.org/wiki/Large_margin_nearest_neighbor + """ + def __init__(self, n_neighbors=3, n_components=None, init='auto', + neighbors_params=None, push_loss_weight=0.5, max_iter=50, + tol=1e-5, preprocessor=None, verbose=False, random_state=None): - def __init__(self, init='auto', k=3, min_iter=50, max_iter=1000, - learn_rate=1e-7, regularization=0.5, convergence_tol=0.001, - verbose=False, preprocessor=None, - n_components=None, random_state=None): + self.n_neighbors = n_neighbors + self.n_components = n_components self.init = init - self.k = k - self.min_iter = min_iter + self.neighbors_params = neighbors_params + self.push_loss_weight = push_loss_weight self.max_iter = max_iter - self.learn_rate = learn_rate - self.regularization = regularization - self.convergence_tol = convergence_tol + self.tol = tol self.verbose = verbose - self.n_components = n_components self.random_state = random_state super(LMNN, self).__init__(preprocessor) def fit(self, X, y): - k = self.k - reg = self.regularization - learn_rate = self.learn_rate - - X, y = self._prepare_inputs(X, y, dtype=float, - ensure_min_samples=2) - num_pts, d = X.shape - output_dim = _check_n_components(d, self.n_components) - unique_labels, label_inds = np.unique(y, return_inverse=True) - if len(label_inds) != num_pts: - raise ValueError('Must have one label per point.') - self.labels_ = np.arange(len(unique_labels)) - - self.components_ = _initialize_components(output_dim, X, y, self.init, - self.verbose, + """ Train Large Margin Nearest Neighbor model. + + :param X: array, shape (n_samples, n_features), the training samples. 
+ :param y: array, shape (n_samples,), the training labels. + + :return: The trained LMNN model. + """ + + # Validate inputs + X, y = self._prepare_inputs(X, y, dtype=float, ensure_min_samples=2) + X, y, classes = self._validate_params(X, y) + + n_samples, n_features = X.shape + n_components = _check_n_components(n_features, self.n_components) + + # Initialize transformation + self.components_ = _initialize_components(n_components, X, y, self.init, + verbose=self.verbose, random_state=self.random_state) - required_k = np.bincount(label_inds).min() - if self.k > required_k: - raise ValueError('not enough class labels for specified k' - ' (smallest class has %d)' % required_k) - target_neighbors = self._select_targets(X, label_inds) + # Find the target neighbors of each sample + target_neighbors = self._select_target_neighbors(X, y, classes) + + # Compute the gradient w.r.t. the target neighbors which remains constant + tn_graph = _make_knn_graph(target_neighbors) + const_grad = _sum_weighted_outer_differences(X, tn_graph) - # sum outer products - dfG = _sum_outer_products(X, target_neighbors.flatten(), - np.repeat(np.arange(X.shape[0]), k)) + # Compute the pull loss weight such that the push loss weight becomes 1 + pull_loss_weight = (1 - self.push_loss_weight) / self.push_loss_weight + const_grad *= pull_loss_weight - # initialize L - L = self.components_ + int_verbose = int(self.verbose) + disp = int_verbose - 2 if int_verbose > 1 else -1 + optimizer_params = { + 'method': 'L-BFGS-B', + 'fun': self._loss_grad_lbfgs, + 'jac': True, + 'args': (X, y, classes, target_neighbors, const_grad), + 'x0': self.components_, + 'tol': self.tol, + 'options': dict(maxiter=self.max_iter, disp=disp), + } - # first iteration: we compute variables (including objective and gradient) - # at initialization point - G, objective, total_active = self._loss_grad(X, L, dfG, k, - reg, target_neighbors, - label_inds) + # Call the optimizer + self.n_iter_ = 0 + opt_result = minimize(**optimizer_params) - it = 1 # we already made one iteration + # Reshape the solution found by the optimizer + self.components_ = opt_result.x.reshape(-1, n_features) if self.verbose: - print("iter | objective | objective difference | active constraints", - "| learning rate") - - # main loop - for it in range(2, self.max_iter): - # then at each iteration, we try to find a value of L that has better - # objective than the previous L, following the gradient: - while True: - # the next point next_L to try out is found by a gradient step - L_next = L - learn_rate * G - # we compute the objective at next point - # we copy variables that can be modified by _loss_grad, because if we - # retry we don t want to modify them several times - (G_next, objective_next, total_active_next) = ( - self._loss_grad(X, L_next, dfG, k, reg, target_neighbors, - label_inds)) - assert not np.isnan(objective) - delta_obj = objective_next - objective - if delta_obj > 0: - # if we did not find a better objective, we retry with an L closer to - # the starting point, by decreasing the learning rate (making the - # gradient step smaller) - learn_rate /= 2 - else: - # otherwise, if we indeed found a better obj, we get out of the loop - break - # when the better L is found (and the related variables), we set the - # old variables to these new ones before next iteration and we - # slightly increase the learning rate - L = L_next - G, objective, total_active = G_next, objective_next, total_active_next - learn_rate *= 1.01 - - if self.verbose: - print(it, objective, delta_obj, 
total_active, learn_rate) - - # check for convergence - if it > self.min_iter and abs(delta_obj) < self.convergence_tol: - if self.verbose: - print("LMNN converged with objective", objective) - break - else: - if self.verbose: - print("LMNN didn't converge in %d steps." % self.max_iter) - - # store the last L - self.components_ = L - self.n_iter_ = it + cls_name = self.__class__.__name__ + + # Warn the user if the algorithm did not converge + if not opt_result.success: + warnings.warn('[{}] LMNN did not converge: {}'.format( + cls_name, opt_result.message), ConvergenceWarning) + return self - def _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds): - # Compute pairwise distances under current metric - Lx = L.dot(X.T).T - - # we need to find the furthest neighbor: - Ni = 1 + _inplace_paired_L2(Lx[target_neighbors], Lx[:, None, :]) - furthest_neighbors = np.take_along_axis(target_neighbors, - Ni.argmax(axis=1)[:, None], 1) - impostors = self._find_impostors(furthest_neighbors.ravel(), X, - label_inds, L) - - g0 = _inplace_paired_L2(*Lx[impostors]) - - # we reorder the target neighbors - g1, g2 = Ni[impostors] - # compute the gradient - total_active = 0 - df = np.zeros((X.shape[1], X.shape[1])) - for nn_idx in reversed(range(k)): # note: reverse not useful here - act1 = g0 < g1[:, nn_idx] - act2 = g0 < g2[:, nn_idx] - total_active += act1.sum() + act2.sum() - - targets = target_neighbors[:, nn_idx] - PLUS, pweight = _count_edges(act1, act2, impostors, targets) - df += _sum_outer_products(X, PLUS[:, 0], PLUS[:, 1], pweight) - - in_imp, out_imp = impostors - df -= _sum_outer_products(X, in_imp[act1], out_imp[act1]) - df -= _sum_outer_products(X, in_imp[act2], out_imp[act2]) - - # do the gradient update - assert not np.isnan(df).any() - G = dfG * reg + df * (1 - reg) - G = L.dot(G) - # compute the objective function - objective = total_active * (1 - reg) - objective += G.flatten().dot(L.flatten()) - return 2 * G, objective, total_active - - def _select_targets(self, X, label_inds): - target_neighbors = np.empty((X.shape[0], self.k), dtype=int) - for label in self.labels_: - inds, = np.nonzero(label_inds == label) - dd = euclidean_distances(X[inds], squared=True) - np.fill_diagonal(dd, np.inf) - nn = np.argsort(dd)[..., :self.k] - target_neighbors[inds] = inds[nn] + def _validate_params(self, X, y): + """ Validate parameters as soon as :meth:`fit` is called. + + :param X: array, shape (n_samples, n_features), the training samples. + :param y: array, shape (n_samples,), the training labels. + + :return: + X: array, shape (n_samples, n_features), the validated training samples. + y: array, shape (n_samples,), the validated training labels. + classes: array, shape (n_classes,), the non-singleton classes encoded. + """ + + # Find the appearing classes and the class index of each of the samples + classes, y = np.unique(y, return_inverse=True) + classes_inverse = np.arange(len(classes)) + + # Ignore classes that have less than two samples (singleton classes) + class_sizes = np.bincount(y) + mask_singleton_class = class_sizes == 1 + singleton_classes = np.where(mask_singleton_class)[0] + if len(singleton_classes): + warnings.warn('There are {} singleton classes that will be ignored ' + 'during training. 
A copy of the inputs `X` and `y` will ' + 'be made.'.format(len(singleton_classes))) + + mask_singleton_sample = np.asarray([yi in singleton_classes for + yi in y]) + X = X[~mask_singleton_sample] + y = y[~mask_singleton_sample] + + # Check that there are at least 2 non-singleton classes + n_classes_non_singleton = len(classes) - len(singleton_classes) + if n_classes_non_singleton < 2: + raise ValueError('LMNN needs at least 2 non-singleton classes, got {}.' + .format(n_classes_non_singleton)) + + classes_inverse_non_singleton = classes_inverse[~mask_singleton_class] + + # Check the preferred number of neighbors + min_non_singleton_size = class_sizes[~mask_singleton_class].min() + if self.n_neighbors >= min_non_singleton_size: + warnings.warn('`n_neighbors` (={}) is not less than the number of ' + 'samples in the smallest non-singleton class (={}). ' + '`n_neighbors_` will be set to {} for estimation.' + .format(self.n_neighbors, min_non_singleton_size, + min_non_singleton_size - 1)) + + self.n_neighbors_ = min(self.n_neighbors, min_non_singleton_size - 1) + + return X, y, classes_inverse_non_singleton + + def _loss_grad_lbfgs(self, L, X, y, classes, target_neighbors, const_grad): + """ Compute loss and gradient after one optimization iteration. + + :param L: array, shape (n_components * n_features,) + The current (flattened) linear transformation. + :param X: array, shape (n_samples, n_features), the training samples. + :param y: array, shape (n_samples,), the training labels. + :param classes: array, shape (n_classes,), classes encoded as intergers. + :param target_neighbors: array, shape (n_samples, n_neighbors), + the target neighbors of each training sample. + :param const_grad: array, shape (n_features, n_features), + the (weighted) gradient component due to target + neighbors that stays fixed throughout the algorithm. + + :return: + loss: float, the loss given the current solution. + grad: array, shape (n_components, n_features), the (flattened) gradient. 
+ """ + + # Print headers + if self.n_iter_ == 0 and self.verbose: + header_fields = ['Iteration', 'Objective Value', + '#Active Triplets', 'Time(s)'] + header_fmt = '{:>10} {:>20} {:>20} {:>10}' + header = header_fmt.format(*header_fields) + cls_name = self.__class__.__name__ + print('[{}]'.format(cls_name)) + print('[{}] {}'.format(cls_name, header)) + print('[{}] {}'.format(cls_name, '-' * len(header))) + + # Measure duration of optimization iteration + start_time = time.time() + + # Reshape current solution + n_samples, n_features = X.shape + L = L.reshape(-1, n_features) + + # Transform samples according to the current solution + X_embedded = X @ L.T + + # Compute squared distances to the target neighbors + n_neighbors = target_neighbors.shape[1] + dist_tn = np.zeros((n_samples, n_neighbors)) + for k in range(n_neighbors): + ind_tn = target_neighbors[:, k] + diffs = X_embedded - X_embedded[ind_tn] + dist_tn[:, k] = np.einsum('ij,ij->i', diffs, diffs) # row norms + + # Add margin to the distances + dist_tn += 1 + + # Find impostors + impostors_graph = _find_impostors(X_embedded, y, classes, dist_tn[:, -1]) + + # Compute the push loss and its gradient + push_loss, push_loss_grad, n_active = _compute_push_loss( + X, target_neighbors, dist_tn, impostors_graph) + + # Compute the total loss + M = L.T @ L + new_pull_loss = np.dot(const_grad.ravel(), M.ravel()) + loss = push_loss + new_pull_loss + + # Compute the total gradient + grad = np.dot(L, const_grad + push_loss_grad) + grad *= 2 + + # Report iteration metrics + self.n_iter_ += 1 + if self.verbose: + elapsed_time = time.time() - start_time + values_fmt = '[{}] {:>10} {:>20.6e} {:>20,} {:>10.2f}' + cls_name = self.__class__.__name__ + print(values_fmt.format(cls_name, self.n_iter_, loss, n_active, + elapsed_time)) + sys.stdout.flush() + + return loss, grad.ravel() + + def _select_target_neighbors(self, X, y, classes): + """ Find the target neighbors of each training sample. + + :param X: array, shape (n_samples, n_features), the training samples. + :param y: array, shape (n_samples,), the training labels. + :param classes: array, shape (n_classes,), the classes encoded as integers. + + :return: array, shape (n_samples, n_neighbors), + the indices of the target neighbors of each training sample. 
+ """ + + start_time = time.time() + cls_name = self.__class__.__name__ + if self.verbose: + print('[{}] Finding the target neighbors...'.format(cls_name)) + sys.stdout.flush() + + nn_kwargs = self.neighbors_params or {} + nn = NearestNeighbors(n_neighbors=self.n_neighbors_, **nn_kwargs) + target_neighbors = np.empty((X.shape[0], self.n_neighbors_), dtype=int) + + for label in classes: + ind_class = np.where(y == label)[0] + nn.fit(X[ind_class]) + ind_neighbors = nn.kneighbors(return_distance=False) + target_neighbors[ind_class] = ind_class[ind_neighbors] + + if self.verbose: + elapsed_time = time.time() - start_time + print('[{}] Found the target neighbors in {:5.2f}s.'.format( + cls_name, elapsed_time)) + return target_neighbors - def _find_impostors(self, furthest_neighbors, X, label_inds, L): - Lx = X.dot(L.T) - margin_radii = 1 + _inplace_paired_L2(Lx[furthest_neighbors], Lx) - impostors = [] - for label in self.labels_[:-1]: - in_inds, = np.nonzero(label_inds == label) - out_inds, = np.nonzero(label_inds > label) - dist = euclidean_distances(Lx[out_inds], Lx[in_inds], squared=True) - i1, j1 = np.nonzero(dist < margin_radii[out_inds][:, None]) - i2, j2 = np.nonzero(dist < margin_radii[in_inds]) - i = np.hstack((i1, i2)) - j = np.hstack((j1, j2)) - if i.size > 0: - # get unique (i,j) pairs using index trickery - shape = (i.max() + 1, j.max() + 1) - tmp = np.ravel_multi_index((i, j), shape) - i, j = np.unravel_index(np.unique(tmp), shape) - impostors.append(np.vstack((in_inds[j], out_inds[i]))) - if len(impostors) == 0: - # No impostors detected - return impostors - return np.hstack(impostors) - - -def _inplace_paired_L2(A, B): - '''Equivalent to ((A-B)**2).sum(axis=-1), but modifies A in place.''' - A -= B - return np.einsum('...ij,...ij->...i', A, A) - - -def _count_edges(act1, act2, impostors, targets): - imp = impostors[0, act1] - c = Counter(zip(imp, targets[imp])) - imp = impostors[1, act2] - c.update(zip(imp, targets[imp])) - if c: - active_pairs = np.array(list(c.keys())) - else: - active_pairs = np.empty((0, 2), dtype=int) - return active_pairs, np.array(list(c.values())) - - -def _sum_outer_products(data, a_inds, b_inds, weights=None): - Xab = data[a_inds] - data[b_inds] - if weights is not None: - return np.dot(Xab.T, Xab * weights[:, None]) - return np.dot(Xab.T, Xab) + +def _find_impostors(X_embedded, y, classes, margin_radii): + """ Find the samples that violate the margin. + + :param X_embedded: array, shape (n_samples, n_components), + the training samples in the embedding space. + :param y: array, shape (n_samples,), training labels. + :param classes: array, shape (n_classes,), the classes encoded as integers. + :param margin_radii: array, shape (n_samples,), squared distances of + training samples to their farthest target neighbors + plus margin. + + :return: coo_matrix, shape (n_samples, n_neighbors), + If sample i is an impostor to sample j or vice versa, then the + value of A[i, j] is the squared distance between samples i and j. + Otherwise A[i, j] is zero. 
+ + """ + + # Initialize lists for impostors storage + imp_row, imp_col, imp_dist = [], [], [] + + for label in classes[:-1]: + ind_a = np.where(y == label)[0] + ind_b = np.where(y > label)[0] + + dist = euclidean_distances(X_embedded[ind_b], X_embedded[ind_a], + squared=True) # (b, a) + + radii_a = margin_radii[ind_a] # (a,) + radii_b = margin_radii[ind_b] # (b,) + + # Find samples in B that are impostors to A + imp_ba = np.where((dist < radii_a[None, :]).ravel())[0] + + # Find samples in A that are impostors to B + imp_ab = np.where((dist < radii_b[:, None]).ravel())[0] + + # merge and filter unique impostors + ind_impostors = np.unique(np.concatenate((imp_ba, imp_ab))) + + if len(ind_impostors): + # Map indices back to the original training data indices + ii, jj = np.unravel_index(ind_impostors, dist.shape) + imp_row.extend(ind_b[ii]) + imp_col.extend(ind_a[jj]) + imp_dist.extend(dist.ravel()[ind_impostors]) + + # turn lists to numpy arrays + imp_row = np.asarray(imp_row, dtype=np.intp) + imp_col = np.asarray(imp_col, dtype=np.intp) + imp_dist = np.asarray(imp_dist) + + # store impostors as a sparse matrix + n = X_embedded.shape[0] + impostors_graph = coo_matrix((imp_dist, (imp_row, imp_col)), shape=(n, n)) + + return impostors_graph + + +def _compute_push_loss(X, target_neighbors, inflated_dist_tn, impostors_graph): + """ Compute the push loss L_push = max(d(a, p) + 1 - d(a, n), 0) + + :param X: array, shape (n_samples, n_features), the training samples. + :param target_neighbors: array, shape (n_samples, n_neighbors), + the target neighbors of each training sample. + :param inflated_dist_tn: array, shape (n_samples, n_neighbors), + squared distances of each sample to their target + neighbors plus margin. + :param impostors_graph: coo_matrix, shape (n_samples, n_samples), + + :return: + loss: float, the push loss due to the given target neighbors and impostors. + grad: array, shape (n_features, n_features), the gradient of the push loss. + n_active_triplets: int, the number of active triplet constraints. + + """ + + n_samples, n_neighbors = inflated_dist_tn.shape + imp_row = impostors_graph.row + imp_col = impostors_graph.col + dist_impostors = impostors_graph.data + + loss = 0 + shape = (n_samples, n_samples) + A0 = csr_matrix(shape) + sample_range = range(n_samples) + n_active_triplets = 0 + for k in reversed(range(n_neighbors)): + # Consider margin violations to the samples in imp_row + losses1 = np.maximum(inflated_dist_tn[imp_row, k] - dist_impostors, 0) + ac = np.where(losses1 > 0)[0] + n_active_triplets += len(ac) + A1 = csr_matrix((2 * losses1[ac], (imp_row[ac], imp_col[ac])), shape) + + # Consider margin violations to the samples in imp_col + losses2 = np.maximum(inflated_dist_tn[imp_col, k] - dist_impostors, 0) + ac = np.where(losses2 > 0)[0] + n_active_triplets += len(ac) + A2 = csc_matrix((2 * losses2[ac], (imp_row[ac], imp_col[ac])), shape) + + # Update the loss + loss += np.dot(losses1, losses1) + np.dot(losses2, losses2) + + # Update the weight matrix for gradient computation + val = (A1.sum(1).ravel() + A2.sum(0)).getA1() + A3 = csr_matrix((val, (sample_range, target_neighbors[:, k])), shape) + A0 = A0 - A1 - A2 + A3 + + grad = _sum_weighted_outer_differences(X, A0) + + return loss, grad, n_active_triplets + + +def _sum_weighted_outer_differences(X, weights): + """ Compute the sum of weighted outer pairwise differences. + + :param X: array, shape (n_samples, n_features), data samples. + :param weights: csr_matrix, shape (n_samples, n_samples), sparse weights. 
+ + :return: array, shape (n_features, n_features), + the sum of all outer weighted differences. + """ + + W = weights + weights.T + D = W.sum(1).getA() + + # X.T * (D - W) * X + LX = D * X - W @ X # D is n x 1, W is n x n + ret = X.T @ LX + + return ret + + +def _make_knn_graph(indices): + """ Convert a dense indices matrix to a sparse adjacency matrix. + + :param indices: array, shape (n_samples, n_neighbors), + indices of the k nearest neighbors of each sample. + :return: csr_matrix, shape (n_samples, n_samples), the adjacency matrix. + """ + + n_samples, n_neighbors = indices.shape + row = np.repeat(range(n_samples), n_neighbors) + col = indices.ravel() + ind = np.ones(indices.size) + shape = (n_samples, n_samples) + knn_graph = csr_matrix((ind, (row, col)), shape=shape) + + return knn_graph diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 4db0a1fc..42c6fe5d 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -26,7 +26,12 @@ MMC_Supervised, SDML, RCA, ITML, SCML) # Import this specially for testing. from metric_learn.constraints import wrap_pairs, Constraints -from metric_learn.lmnn import _sum_outer_products +from metric_learn.lmnn import ( + _sum_weighted_outer_differences, + _make_knn_graph, + _find_impostors, + _compute_push_loss +) def class_separation(X, labels): @@ -362,12 +367,11 @@ def test_bounds_parameters_invalid(bounds): class TestLMNN(MetricTestCase): def test_iris(self): - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, verbose=False) lmnn.fit(self.iris_points, self.iris_labels) - csep = class_separation(lmnn.transform(self.iris_points), - self.iris_labels) - self.assertLess(csep, 0.25) + csep = class_separation(lmnn.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.26) def test_loss_grad_lbfgs(self): """Test gradient of loss function @@ -378,211 +382,98 @@ def test_loss_grad_lbfgs(self): X, y = make_classification(random_state=rng) L = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1]) lmnn = LMNN() + lmnn.n_iter_ = 0 + lmnn.n_neighbors_ = lmnn.n_neighbors - k = lmnn.k - reg = lmnn.regularization - - X, y = lmnn._prepare_inputs(X, y, dtype=float, - ensure_min_samples=2) + X, y = lmnn._prepare_inputs(X, y, dtype=float, ensure_min_samples=2) num_pts, n_components = X.shape - unique_labels, label_inds = np.unique(y, return_inverse=True) - lmnn.labels_ = np.arange(len(unique_labels)) + unique_labels, y_inverse = np.unique(y, return_inverse=True) + classes = np.arange(len(unique_labels)) lmnn.components_ = np.eye(n_components) - target_neighbors = lmnn._select_targets(X, label_inds) + target_neighbors = lmnn._select_target_neighbors(X, y_inverse, classes) # sum outer products - dfG = _sum_outer_products(X, target_neighbors.flatten(), - np.repeat(np.arange(X.shape[0]), k)) + tn_graph = _make_knn_graph(target_neighbors) + const_grad = _sum_weighted_outer_differences(X, tn_graph) + - # initialize L - def loss_grad(flat_L): - return lmnn._loss_grad(X, flat_L.reshape(-1, X.shape[1]), dfG, - k, reg, target_neighbors, label_inds) + kwargs = { + 'classes': classes, + 'target_neighbors': target_neighbors, + 'const_grad': const_grad, + } - def fun(x): - return loss_grad(x)[1] + def fun(L): + return lmnn._loss_grad_lbfgs(L, X, y, **kwargs)[0] - def grad(x): - return loss_grad(x)[0].ravel() + def grad(L): + return lmnn._loss_grad_lbfgs(L, X, y, **kwargs)[1] # compute relative error - epsilon = np.sqrt(np.finfo(float).eps) - rel_diff = (check_grad(fun, grad, L.ravel()) / - 
np.linalg.norm(approx_fprime(L.ravel(), fun, epsilon))) + rel_diff = check_grad(fun, grad, L.ravel()) / np.linalg.norm(grad(L)) np.testing.assert_almost_equal(rel_diff, 0., decimal=5) -def test_loss_func(capsys): - """Test the loss function (and its gradient) on a simple example, - by comparing the results with the actual implementation of metric-learn, - with a very simple (but nonperformant) implementation""" - - # toy dataset to use - X, y = make_classification(n_samples=10, n_classes=2, - n_features=6, - n_redundant=0, shuffle=True, - scale=[1, 1, 20, 20, 20, 20], random_state=42) - - def hinge(a): - if a > 0: - return a, 1 - else: - return 0, 0 - - def loss_fn(L, X, y, target_neighbors, reg): - L = L.reshape(-1, X.shape[1]) - Lx = np.dot(X, L.T) - loss = 0 - total_active = 0 - grad = np.zeros_like(L) - for i in range(X.shape[0]): - for j in target_neighbors[i]: - loss += (1 - reg) * np.sum((Lx[i] - Lx[j]) ** 2) - grad += (1 - reg) * np.outer(Lx[i] - Lx[j], X[i] - X[j]) - for k in range(X.shape[0]): - if y[i] != y[k]: - hin, active = hinge(1 + np.sum((Lx[i] - Lx[j])**2) - - np.sum((Lx[i] - Lx[k])**2)) - total_active += active - if active: - loss += reg * hin - grad += (reg * (np.outer(Lx[i] - Lx[j], X[i] - X[j]) - - np.outer(Lx[i] - Lx[k], X[i] - X[k]))) - grad = 2 * grad - return grad, loss, total_active - - # we check that the gradient we have computed in the non-performant implem - # is indeed the true gradient on a toy example: - - def _select_targets(X, y, k): - target_neighbors = np.empty((X.shape[0], k), dtype=int) - for label in np.unique(y): - inds, = np.nonzero(y == label) - dd = euclidean_distances(X[inds], squared=True) - np.fill_diagonal(dd, np.inf) - nn = np.argsort(dd)[..., :k] - target_neighbors[inds] = inds[nn] - return target_neighbors - - target_neighbors = _select_targets(X, y, 2) - regularization = 0.5 - n_features = X.shape[1] - x0 = np.random.randn(1, n_features) - - def loss(x0): - return loss_fn(x0.reshape(-1, X.shape[1]), X, y, target_neighbors, - regularization)[1] - - def grad(x0): - return loss_fn(x0.reshape(-1, X.shape[1]), X, y, target_neighbors, - regularization)[0].ravel() - - scipy.optimize.check_grad(loss, grad, x0.ravel()) - - class LMNN_with_callback(LMNN): - """ We will use a callback to get the gradient (see later) +def test_compute_push_loss(): + """Test if the push loss is computed correctly + + This test continues on the example from test_find_impostors. The push + loss is easy to compute, as we have only 4 violations and all of them + amount to 1 (squared distance to target neighbor + 1 - squared distance + to impostor = 4 + 1 - 4). 
""" - def __init__(self, callback, *args, **kwargs): - self.callback = callback - super(LMNN_with_callback, self).__init__(*args, **kwargs) - - def _loss_grad(self, *args, **kwargs): - grad, objective, total_active = ( - super(LMNN_with_callback, self)._loss_grad(*args, **kwargs)) - self.callback.append(grad) - return grad, objective, total_active - - class LMNN_nonperformant(LMNN_with_callback): - - def fit(self, X, y): - self.y = y - return super(LMNN_nonperformant, self).fit(X, y) - - def _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds): - grad, loss, total_active = loss_fn(L.ravel(), X, self.y, - target_neighbors, self.regularization) - self.callback.append(grad) - return grad, loss, total_active - - mem1, mem2 = [], [] - lmnn_perf = LMNN_with_callback(verbose=True, random_state=42, - init='identity', max_iter=30, callback=mem1) - lmnn_nonperf = LMNN_nonperformant(verbose=True, random_state=42, - init='identity', max_iter=30, - callback=mem2) - objectives, obj_diffs, learn_rate, total_active = (dict(), dict(), dict(), - dict()) - for algo, name in zip([lmnn_perf, lmnn_nonperf], ['perf', 'nonperf']): - algo.fit(X, y) - out, _ = capsys.readouterr() - lines = re.split("\n+", out) - # we get every variable that is printed from the algorithm in verbose - num = r'(-?\d+.?\d*(e[+|-]\d+)?)' - strings = [re.search(r"\d+ (?:{}) (?:{}) (?:(\d+)) (?:{})" - .format(num, num, num), s) for s in lines] - objectives[name] = [float(match.group(1)) for match in strings if match is - not None] - obj_diffs[name] = [float(match.group(3)) for match in strings if match is - not None] - total_active[name] = [float(match.group(5)) for match in strings if - match is not - None] - learn_rate[name] = [float(match.group(6)) for match in strings if match is - not None] - assert len(strings) >= 10 # we ensure that we actually did more than 10 - # iterations - assert total_active[name][0] >= 2 # we ensure that we have some active - # constraints (that's the case we want to test) - # we remove the last element because it can be equal to the penultimate - # if the last gradient update is null - for i in range(len(mem1)): - np.testing.assert_allclose(lmnn_perf.callback[i], - lmnn_nonperf.callback[i], - err_msg='Gradient different at position ' - '{}'.format(i)) - np.testing.assert_allclose(objectives['perf'], objectives['nonperf']) - np.testing.assert_allclose(obj_diffs['perf'], obj_diffs['nonperf']) - np.testing.assert_allclose(total_active['perf'], total_active['nonperf']) - np.testing.assert_allclose(learn_rate['perf'], learn_rate['nonperf']) + class_distance = 4. + X_a = np.array([[-1., 1], [-1., -1.], [1., 1.], [1., -1.]]) + X_b = X_a + np.array([class_distance, 0]) + X = np.concatenate((X_a, X_b)) + y = np.array([0, 0, 0, 0, 1, 1, 1, 1]) + lmnn = LMNN(n_neighbors=1) + lmnn.n_neighbors_ = 1 + classes = np.unique(y) + target_neighbors = lmnn._select_target_neighbors(X, y, classes) + diffs = X - X[target_neighbors[:, 0]] + dist_tn = np.einsum('ij,ij->i', diffs, diffs) + dist_tn = dist_tn[:, None] + dist_tn += 1 + margin_radii = dist_tn[:, -1] + impostors_graph = _find_impostors(X, y, classes, margin_radii) + loss, grad, _ = _compute_push_loss(X, target_neighbors, dist_tn, + impostors_graph) + + # The loss should be 4. (1. for each of the 4 violation) + assert loss == 4. 
@pytest.mark.parametrize('X, y, loss', [(np.array([[0], [1], [2], [3]]), - [1, 1, 0, 0], 3.0), + [1, 1, 0, 0], 6.0), (np.array([[0], [1], [2], [3]]), - [1, 0, 0, 1], 26.)]) + [1, 0, 0, 1], 256.)]) def test_toy_ex_lmnn(X, y, loss): """Test that the loss give the right result on a toy example""" L = np.array([[1]]) - lmnn = LMNN(k=1, regularization=0.5) + lmnn = LMNN(n_neighbors=1, push_loss_weight=0.5) - k = lmnn.k - reg = lmnn.regularization + lmnn.n_neighbors_ = lmnn.n_neighbors - X, y = lmnn._prepare_inputs(X, y, dtype=float, - ensure_min_samples=2) + X, y = lmnn._prepare_inputs(X, y, dtype=float, ensure_min_samples=2) num_pts, n_components = X.shape unique_labels, label_inds = np.unique(y, return_inverse=True) - lmnn.labels_ = np.arange(len(unique_labels)) + classes = np.arange(len(unique_labels)) lmnn.components_ = np.eye(n_components) - target_neighbors = lmnn._select_targets(X, label_inds) + target_neighbors = lmnn._select_target_neighbors(X, label_inds, classes) # sum outer products - dfG = _sum_outer_products(X, target_neighbors.flatten(), - np.repeat(np.arange(X.shape[0]), k)) - - # storage - a1 = [None] * k - a2 = [None] * k - for nn_idx in range(k): - a1[nn_idx] = np.array([]) - a2[nn_idx] = np.array([]) + tn_graph = _make_knn_graph(target_neighbors) + const_grad = _sum_weighted_outer_differences(X, tn_graph) # assert that the loss equals the one computed by hand - assert lmnn._loss_grad(X, L.reshape(-1, X.shape[1]), dfG, k, - reg, target_neighbors, label_inds)[1] == loss + lmnn.n_iter_ = 0 + predicted_loss = lmnn._loss_grad_lbfgs(L, X, y, classes, target_neighbors, + const_grad)[0] + assert predicted_loss == loss def test_convergence_simple_example(capsys): @@ -592,28 +483,7 @@ def test_convergence_simple_example(capsys): lmnn = LMNN(verbose=True) lmnn.fit(X, y) out, _ = capsys.readouterr() - assert "LMNN converged with objective" in out - - -def test_no_twice_same_objective(capsys): - # test that the objective function never has twice the same value - # see https://github.com/scikit-learn-contrib/metric-learn/issues/88 - X, y = make_classification(random_state=0) - lmnn = LMNN(verbose=True) - lmnn.fit(X, y) - out, _ = capsys.readouterr() - lines = re.split("\n+", out) - # we get only objectives from each line: - # the regexp matches a float that follows an integer (the iteration - # number), and which is followed by a (signed) float (delta obj). 
It - # matches for instance: - # 3 **1113.7665747189938** -3.182774197440267 46431.0200999999999998e-06 - objectives = [re.search(r"\d* (?:(\d*.\d*))[ | -]\d*.\d*", s) - for s in lines] - objectives = [match.group(1) for match in objectives if match is not None] - # we remove the last element because it can be equal to the penultimate - # if the last gradient update is null - assert len(objectives[:-1]) == len(set(objectives[:-1])) + assert "LMNN did not converge" not in out class TestSDML(MetricTestCase): diff --git a/test/test_base_metric.py b/test/test_base_metric.py index fed9018a..711e3b6d 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -20,8 +20,8 @@ def test_covariance(self): def test_lmnn(self): self.assertEqual( - remove_spaces(str(metric_learn.LMNN(convergence_tol=0.01, k=6))), - remove_spaces("LMNN(convergence_tol=0.01, k=6)")) + remove_spaces(str(metric_learn.LMNN(tol=0.01, n_neighbors=6))), + remove_spaces("LMNN(n_neighbors=6,tol=0.01)")) def test_nca(self): self.assertEqual(remove_spaces(str(metric_learn.NCA(max_iter=42))), diff --git a/test/test_components_metric_conversion.py b/test/test_components_metric_conversion.py index 04d0d007..40d2f942 100644 --- a/test/test_components_metric_conversion.py +++ b/test/test_components_metric_conversion.py @@ -42,7 +42,7 @@ def test_itml_supervised(self): assert_array_almost_equal(L.T.dot(L), itml.get_mahalanobis_matrix()) def test_lmnn(self): - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, verbose=False) lmnn.fit(self.X, self.y) L = lmnn.components_ assert_array_almost_equal(L.T.dot(L), lmnn.get_mahalanobis_matrix()) diff --git a/test/test_fit_transform.py b/test/test_fit_transform.py index d4d4bfe0..609db3c9 100644 --- a/test/test_fit_transform.py +++ b/test/test_fit_transform.py @@ -52,11 +52,11 @@ def test_itml_supervised(self): assert_array_almost_equal(res_1, res_2) def test_lmnn(self): - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, verbose=False) lmnn.fit(self.X, self.y) res_1 = lmnn.transform(self.X) - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, verbose=False) res_2 = lmnn.fit_transform(self.X, self.y) assert_array_almost_equal(res_1, res_2) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index ab7e972d..bbd5c718 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -417,7 +417,7 @@ def test_auto_init_transformation(n_samples, n_features, n_classes, random_state=rng) # To make the test work for LMNN: if 'LMNN' in model_base.__class__.__name__: - model_base.set_params(k=1) + model_base.set_params(n_neighbors=1) # To make the test faster for estimators that have a max_iter: if hasattr(model_base, 'max_iter'): model_base.set_params(max_iter=1) From e4348621f06530b400fd6f0e75bbdc0470415cb6 Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Fri, 26 Mar 2021 09:59:01 +0100 Subject: [PATCH 02/10] validate params after components initialization --- metric_learn/lmnn.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index b2a3817f..dc920020 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -169,9 +169,8 @@ def fit(self, X, y): :return: The trained LMNN model. 
""" - # Validate inputs + # Check input arrays X, y = self._prepare_inputs(X, y, dtype=float, ensure_min_samples=2) - X, y, classes = self._validate_params(X, y) n_samples, n_features = X.shape n_components = _check_n_components(n_features, self.n_components) @@ -181,6 +180,9 @@ def fit(self, X, y): verbose=self.verbose, random_state=self.random_state) + # remove singletons after initializing components + X, y, classes = self._validate_params(X, y) + # Find the target neighbors of each sample target_neighbors = self._select_target_neighbors(X, y, classes) From f0bbd8fedf5cb011561fb5ae268c39c059e253f1 Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Sat, 17 Apr 2021 00:00:26 +0200 Subject: [PATCH 03/10] add max_impostors param and improve documentation --- metric_learn/lmnn.py | 292 ++++++++++++++++++++++++++++---------- test/metric_learn_test.py | 17 ++- test/test_base_metric.py | 4 +- 3 files changed, 230 insertions(+), 83 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index dc920020..0466d39e 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -9,6 +9,7 @@ from scipy.sparse import coo_matrix, csr_matrix, csc_matrix from sklearn.neighbors import NearestNeighbors from sklearn.metrics import euclidean_distances +from sklearn.utils import gen_batches, check_random_state from sklearn.base import TransformerMixin from sklearn.exceptions import ConvergenceWarning @@ -75,6 +76,11 @@ class LMNN(MahalanobisMixin, TransformerMixin): :meth:`fit` and n_features_a must be less than or equal to that. If ``n_components`` is not None, n_features_a must match it. + max_impostors : int, optional (default=500_000) + Maximum number of impostors to consider per iteration. In the worst + case this will allow ``max_impostors * n_neighbors`` constraints to be + active. + neighbors_params : dict, optional (default=None) Parameters to pass to a :class:`neighbors.NearestNeighbors` instance - apart from ``n_neighbors`` - that will be used to select the target @@ -146,12 +152,14 @@ class LMNN(MahalanobisMixin, TransformerMixin): """ def __init__(self, n_neighbors=3, n_components=None, init='auto', - neighbors_params=None, push_loss_weight=0.5, max_iter=50, - tol=1e-5, preprocessor=None, verbose=False, random_state=None): + max_impostors=500_000, neighbors_params=None, + push_loss_weight=0.5, max_iter=50, tol=1e-5, + preprocessor=None, verbose=False, random_state=None): self.n_neighbors = n_neighbors self.n_components = n_components self.init = init + self.max_impostors = max_impostors self.neighbors_params = neighbors_params self.push_loss_weight = push_loss_weight self.max_iter = max_iter @@ -161,7 +169,7 @@ def __init__(self, n_neighbors=3, n_components=None, init='auto', super(LMNN, self).__init__(preprocessor) def fit(self, X, y): - """ Train Large Margin Nearest Neighbor model. + """Train Large Margin Nearest Neighbor model. :param X: array, shape (n_samples, n_features), the training samples. :param y: array, shape (n_samples,), the training labels. 
@@ -175,10 +183,13 @@ def fit(self, X, y): n_samples, n_features = X.shape n_components = _check_n_components(n_features, self.n_components) + # Initialize the random generator + self.random_state_ = check_random_state(self.random_state) + # Initialize transformation self.components_ = _initialize_components(n_components, X, y, self.init, verbose=self.verbose, - random_state=self.random_state) + random_state=self.random_state_) # remove singletons after initializing components X, y, classes = self._validate_params(X, y) @@ -186,13 +197,13 @@ def fit(self, X, y): # Find the target neighbors of each sample target_neighbors = self._select_target_neighbors(X, y, classes) - # Compute the gradient w.r.t. the target neighbors which remains constant + # Compute the pull loss gradient w.r.t. M, which remains constant tn_graph = _make_knn_graph(target_neighbors) - const_grad = _sum_weighted_outer_differences(X, tn_graph) + pull_loss_grad_m = _sum_weighted_outer_products(X, tn_graph) # Compute the pull loss weight such that the push loss weight becomes 1 pull_loss_weight = (1 - self.push_loss_weight) / self.push_loss_weight - const_grad *= pull_loss_weight + pull_loss_grad_m *= pull_loss_weight int_verbose = int(self.verbose) disp = int_verbose - 2 if int_verbose > 1 else -1 @@ -200,7 +211,7 @@ def fit(self, X, y): 'method': 'L-BFGS-B', 'fun': self._loss_grad_lbfgs, 'jac': True, - 'args': (X, y, classes, target_neighbors, const_grad), + 'args': (X, y, classes, target_neighbors, pull_loss_grad_m), 'x0': self.components_, 'tol': self.tol, 'options': dict(maxiter=self.max_iter, disp=disp), @@ -224,7 +235,7 @@ def fit(self, X, y): return self def _validate_params(self, X, y): - """ Validate parameters as soon as :meth:`fit` is called. + """Validate parameters as soon as :meth:`fit` is called. :param X: array, shape (n_samples, n_features), the training samples. :param y: array, shape (n_samples,), the training labels. @@ -274,8 +285,9 @@ def _validate_params(self, X, y): return X, y, classes_inverse_non_singleton - def _loss_grad_lbfgs(self, L, X, y, classes, target_neighbors, const_grad): - """ Compute loss and gradient after one optimization iteration. + def _loss_grad_lbfgs(self, L, X, y, classes, target_neighbors, + pull_loss_grad_m): + """Compute loss and gradient after one optimization iteration. :param L: array, shape (n_components * n_features,) The current (flattened) linear transformation. @@ -284,8 +296,8 @@ def _loss_grad_lbfgs(self, L, X, y, classes, target_neighbors, const_grad): :param classes: array, shape (n_classes,), classes encoded as intergers. :param target_neighbors: array, shape (n_samples, n_neighbors), the target neighbors of each training sample. - :param const_grad: array, shape (n_features, n_features), - the (weighted) gradient component due to target + :param pull_loss_grad_m: array, shape (n_features, n_features), + the (weighted) gradient (w.r.t M=L.T@L) due to target neighbors that stays fixed throughout the algorithm. 
:return: @@ -326,19 +338,21 @@ def _loss_grad_lbfgs(self, L, X, y, classes, target_neighbors, const_grad): dist_tn += 1 # Find impostors - impostors_graph = _find_impostors(X_embedded, y, classes, dist_tn[:, -1]) + impostors_graph = self._find_impostors(X_embedded, y, classes, + dist_tn[:, -1]) # Compute the push loss and its gradient - push_loss, push_loss_grad, n_active = _compute_push_loss( + push_loss, push_loss_grad_m, n_active = _push_loss_grad( X, target_neighbors, dist_tn, impostors_graph) # Compute the total loss M = L.T @ L - new_pull_loss = np.dot(const_grad.ravel(), M.ravel()) + # It holds that trace(M.T @ C) = np.dot(M.ravel(), C.ravel()) + new_pull_loss = np.dot(pull_loss_grad_m.ravel(), M.ravel()) loss = push_loss + new_pull_loss - # Compute the total gradient - grad = np.dot(L, const_grad + push_loss_grad) + # Compute the total gradient (grad_L = 2L grad_M) + grad = np.dot(L, pull_loss_grad_m + push_loss_grad_m) grad *= 2 # Report iteration metrics @@ -347,14 +361,14 @@ def _loss_grad_lbfgs(self, L, X, y, classes, target_neighbors, const_grad): elapsed_time = time.time() - start_time values_fmt = '[{}] {:>10} {:>20.6e} {:>20,} {:>10.2f}' cls_name = self.__class__.__name__ - print(values_fmt.format(cls_name, self.n_iter_, loss, n_active, - elapsed_time)) + print(values_fmt.format(cls_name, self.n_iter_, loss, + n_active, elapsed_time)) sys.stdout.flush() return loss, grad.ravel() def _select_target_neighbors(self, X, y, classes): - """ Find the target neighbors of each training sample. + """Find the target neighbors of each training sample. :param X: array, shape (n_samples, n_features), the training samples. :param y: array, shape (n_samples,), the training labels. @@ -387,68 +401,152 @@ def _select_target_neighbors(self, X, y, classes): return target_neighbors + def _find_impostors(self, X_embedded, y, classes, margin_radii): + """Find the samples that violate the margin. + + :param X_embedded: array, shape (n_samples, n_components), + the training samples in the embedding space. + :param y: array, shape (n_samples,), training labels. + :param classes: array, shape (n_classes,), the classes encoded as integers. + :param margin_radii: array, shape (n_samples,), squared distances of + training samples to their farthest target neighbors + plus margin. + + :return: coo_matrix, shape (n_samples, n_neighbors), + If sample i is an impostor to sample j or vice versa, then the + value of A[i, j] is the squared distance between samples i and j. + Otherwise A[i, j] is zero. + + """ -def _find_impostors(X_embedded, y, classes, margin_radii): - """ Find the samples that violate the margin. + # Initialize lists for impostors storage + imp_row, imp_col = [], [] - :param X_embedded: array, shape (n_samples, n_components), - the training samples in the embedding space. - :param y: array, shape (n_samples,), training labels. - :param classes: array, shape (n_classes,), the classes encoded as integers. - :param margin_radii: array, shape (n_samples,), squared distances of - training samples to their farthest target neighbors - plus margin. + for label in classes[:-1]: + ind_a = np.where(y == label)[0] + ind_b = np.where(y > label)[0] - :return: coo_matrix, shape (n_samples, n_neighbors), - If sample i is an impostor to sample j or vice versa, then the - value of A[i, j] is the squared distance between samples i and j. - Otherwise A[i, j] is zero. 
+ ind_impostors = _find_impostors_blockwise(X_embedded, margin_radii, + ind_b, ind_a) + + n_impostors = len(ind_impostors) + if n_impostors: + if n_impostors > self.max_impostors: + ind_samples = self.random_state_.choice( + n_impostors, self.max_impostors, replace=False) + ind_impostors = ind_impostors[ind_samples] + + dims = (len(ind_b), len(ind_a)) + ib, ia = np.unravel_index(ind_impostors, shape=dims) + + imp_row.extend(ind_b[ib]) + imp_col.extend(ind_a[ia]) + + # turn lists to numpy arrays + imp_row = np.asarray(imp_row, dtype=np.intp) + imp_col = np.asarray(imp_col, dtype=np.intp) + + # Make sure we do not exceed max_impostors + n_impostors = len(imp_row) + if n_impostors > self.max_impostors: + ind_samples = self.random_state_.choice( + n_impostors, self.max_impostors, replace=False) + imp_row = imp_row[ind_samples] + imp_col = imp_col[ind_samples] + + # Compute distances + imp_dist = _paired_distances_blockwise(X_embedded, imp_row, imp_col) + + # store impostors as a sparse matrix + n = X_embedded.shape[0] + impostors_graph = coo_matrix((imp_dist, (imp_row, imp_col)), shape=(n, n)) + + return impostors_graph - """ - # Initialize lists for impostors storage - imp_row, imp_col, imp_dist = [], [], [] +######################## +# Some core functions # +####################### - for label in classes[:-1]: - ind_a = np.where(y == label)[0] - ind_b = np.where(y > label)[0] +def _find_impostors_blockwise(X, radii, ind_a, ind_b, + return_distance=False, block_size=8): + """Find (sample, impostor) pairs in blocks to avoid large memory usage. - dist = euclidean_distances(X_embedded[ind_b], X_embedded[ind_a], - squared=True) # (b, a) + Parameters + ---------- + X : array, shape (n_samples, n_components) + Transformed data samples. + + radii : array, shape (n_samples,) + Squared distances of the samples in ``X`` to their margins. + + ind_a : array, shape (n_samples_a,) + Indices of samples from class A. + + ind_b : array, shape (n_samples_b,) + Indices of samples from class B. + + block_size : int, optional (default=8) + The maximum number of mebibytes (MiB) of memory to use at a time for + calculating paired squared distances. + return_distance : bool, optional (default=False) + Whether to return the squared distances to the impostors. + + Returns + ------- + imp_indices : array, shape (n_impostors,) + Unraveled indices of (sample, impostor) pairs referring to a matrix + of shape (n_samples_a, n_samples_b). 
+ imp_distances : array, shape (n_impostors,), optional + imp_distances[i] is the squared distance between samples imp_row[i] and + imp_col[i], where + imp_row, imp_col = np.unravel_index(imp_indices, shape=(n_samples_a, + n_samples_b)) + """ + + n_samples_a = len(ind_a) + bytes_per_row = len(ind_b) * X.itemsize + block_n_rows = int(block_size * 1024 * 1024 // bytes_per_row) - radii_a = margin_radii[ind_a] # (a,) - radii_b = margin_radii[ind_b] # (b,) + imp_indices, imp_distances = [], [] - # Find samples in B that are impostors to A - imp_ba = np.where((dist < radii_a[None, :]).ravel())[0] + # data + X_b = X[ind_b] + radii_a = radii[ind_a] + radii_b = radii[ind_b] - # Find samples in A that are impostors to B - imp_ab = np.where((dist < radii_b[:, None]).ravel())[0] + # X_b squared norm stays constant, so pre-compute it to get a speed-up + X_b_norm_squared = np.einsum('ij,ij->i', X_b, X_b).reshape(1, -1) + for chunk in gen_batches(n_samples_a, block_n_rows): - # merge and filter unique impostors - ind_impostors = np.unique(np.concatenate((imp_ba, imp_ab))) + dist = euclidean_distances(X[ind_a[chunk]], X_b, squared=True, + Y_norm_squared=X_b_norm_squared) - if len(ind_impostors): - # Map indices back to the original training data indices - ii, jj = np.unravel_index(ind_impostors, dist.shape) - imp_row.extend(ind_b[ii]) - imp_col.extend(ind_a[jj]) - imp_dist.extend(dist.ravel()[ind_impostors]) + ind_b, = np.where((dist < radii_a[chunk, None]).ravel()) + ind_a, = np.where((dist < radii_b[None, :]).ravel()) + ind = np.unique(np.concatenate((ind_a, ind_b))) - # turn lists to numpy arrays - imp_row = np.asarray(imp_row, dtype=np.intp) - imp_col = np.asarray(imp_col, dtype=np.intp) - imp_dist = np.asarray(imp_dist) + if len(ind): + ind_plus_offset = ind + chunk.start * X_b.shape[0] + imp_indices.extend(ind_plus_offset) - # store impostors as a sparse matrix - n = X_embedded.shape[0] - impostors_graph = coo_matrix((imp_dist, (imp_row, imp_col)), shape=(n, n)) + if return_distance: + # We only need to do clipping if we return the distances. + distances_chunk = dist.ravel()[ind] + # Clip only the indexed (unique) distances + np.maximum(distances_chunk, 0, out=distances_chunk) + imp_distances.extend(distances_chunk) - return impostors_graph + imp_indices = np.asarray(imp_indices) + if return_distance: + return imp_indices, np.asarray(imp_distances) + else: + return imp_indices -def _compute_push_loss(X, target_neighbors, inflated_dist_tn, impostors_graph): - """ Compute the push loss L_push = max(d(a, p) + 1 - d(a, n), 0) + +def _push_loss_grad(X, target_neighbors, inflated_dist_tn, impostors_graph): + """Compute the push loss max(d(a, p) + 1 - d(a, n), 0) and its gradient. :param X: array, shape (n_samples, n_features), the training samples. :param target_neighbors: array, shape (n_samples, n_neighbors), @@ -460,7 +558,8 @@ def _compute_push_loss(X, target_neighbors, inflated_dist_tn, impostors_graph): :return: loss: float, the push loss due to the given target neighbors and impostors. - grad: array, shape (n_features, n_features), the gradient of the push loss. + grad: array, shape (n_features, n_features), the gradient of the push loss + with respect to M = L.T @ L. n_active_triplets: int, the number of active triplet constraints. 
""" @@ -496,19 +595,68 @@ def _compute_push_loss(X, target_neighbors, inflated_dist_tn, impostors_graph): A3 = csr_matrix((val, (sample_range, target_neighbors[:, k])), shape) A0 = A0 - A1 - A2 + A3 - grad = _sum_weighted_outer_differences(X, A0) + grad = _sum_weighted_outer_products(X, A0) return loss, grad, n_active_triplets -def _sum_weighted_outer_differences(X, weights): - """ Compute the sum of weighted outer pairwise differences. +########################## +# Some helper functions # +######################### + +def _paired_distances_blockwise(X, ind_a, ind_b, squared=True, block_size=8): + """Equivalent to row_norms(X[ind_a] - X[ind_b], squared=squared). + + Parameters + ---------- + X : array, shape (n_samples, n_features) + An array of data samples. + ind_a : array, shape (n_indices,) + An array of sample indices. + ind_b : array, shape (n_indices,) + Another array of sample indices. + squared : bool (default=True) + Whether to return the squared distances. + block_size : int, optional (default=8) + The maximum number of mebibytes (MiB) of memory to use at a time for + calculating paired (squared) distances. + + Returns + ------- + distances: array, shape (n_indices,) + An array of paired, optionally squared, distances. + """ + + bytes_per_row = X.shape[1] * X.itemsize + batch_size = int(block_size*1024*1024 // bytes_per_row) + + n_pairs = len(ind_a) + distances = np.zeros(n_pairs) + for chunk in gen_batches(n_pairs, batch_size): + diffs = X[ind_a[chunk]] - X[ind_b[chunk]] + distances[chunk] = np.einsum('ij,ij->i', diffs, diffs) + + return distances if squared else np.sqrt(distances, out=distances) + + +def _sum_weighted_outer_products(X, weights): + """Compute the sum of weighted outer products. + + Consider the squared Mahalanobis distance between x_i and x_j: + + d_{ij} = (x_i - x_j)^T M (x_i - x_j) + = x_i^T M x_i - x_i^T M x_j - x_j^T M x_i + x_j^T M x_j + + The gradient of d_{ij} with respect to M is a sum of outer products: + + \nabla_M d_{ij} = x_i x_i^T - x_j x_i^T - x_i x_j^T + x_j x_j^T + :param X: array, shape (n_samples, n_features), data samples. :param weights: csr_matrix, shape (n_samples, n_samples), sparse weights. :return: array, shape (n_features, n_features), - the sum of all outer weighted differences. + the sum of all weighted outer products. """ W = weights + weights.T @@ -522,7 +670,7 @@ def _sum_weighted_outer_differences(X, weights): def _make_knn_graph(indices): - """ Convert a dense indices matrix to a sparse adjacency matrix. + """Convert a dense indices matrix to a sparse adjacency matrix. :param indices: array, shape (n_samples, n_neighbors), indices of the k nearest neighbors of each sample. diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 4af776cd..f712dc7e 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -27,10 +27,9 @@ # Import this specially for testing. 
from metric_learn.constraints import wrap_pairs, Constraints from metric_learn.lmnn import ( - _sum_weighted_outer_differences, + _sum_weighted_outer_products, _make_knn_graph, - _find_impostors, - _compute_push_loss + _push_loss_grad ) @@ -395,13 +394,13 @@ def test_loss_grad_lbfgs(self): # sum outer products tn_graph = _make_knn_graph(target_neighbors) - const_grad = _sum_weighted_outer_differences(X, tn_graph) + pull_loss_grad_m = _sum_weighted_outer_products(X, tn_graph) kwargs = { 'classes': classes, 'target_neighbors': target_neighbors, - 'const_grad': const_grad, + 'pull_loss_grad_m': pull_loss_grad_m, } def fun(L): @@ -438,9 +437,9 @@ def test_compute_push_loss(): dist_tn = dist_tn[:, None] dist_tn += 1 margin_radii = dist_tn[:, -1] - impostors_graph = _find_impostors(X, y, classes, margin_radii) - loss, grad, _ = _compute_push_loss(X, target_neighbors, dist_tn, - impostors_graph) + impostors_graph = lmnn._find_impostors(X, y, classes, margin_radii) + loss, grad, _ = _push_loss_grad(X, target_neighbors, dist_tn, + impostors_graph) # The loss should be 4. (1. for each of the 4 violation) assert loss == 4. @@ -467,7 +466,7 @@ def test_toy_ex_lmnn(X, y, loss): # sum outer products tn_graph = _make_knn_graph(target_neighbors) - const_grad = _sum_weighted_outer_differences(X, tn_graph) + const_grad = _sum_weighted_outer_products(X, tn_graph) # assert that the loss equals the one computed by hand lmnn.n_iter_ = 0 diff --git a/test/test_base_metric.py b/test/test_base_metric.py index ea42f603..d0db85fd 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -40,10 +40,10 @@ def test_lmnn(self): 'n_components': None, 'preprocessor': None, 'random_state': None, 'push_loss_weight': 0.5, 'verbose': False} - nndef_kwargs = {'tol': 0.01, 'n_neighbors': 6} + nndef_kwargs = {'n_neighbors': 6, 'tol': 0.01} merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) self.assertEqual( - remove_spaces(str(metric_learn.LMNN(tol=0.01, n_neighbors=6))), + remove_spaces(str(metric_learn.LMNN(n_neighbors=6, tol=0.01))), remove_spaces(f"LMNN({merged_kwargs})")) def test_nca(self): From eb609d95626f93baaace2daf1686834b7476e68a Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Sat, 17 Apr 2021 11:27:58 +0200 Subject: [PATCH 04/10] fix LMNN StringRepr test --- test/metric_learn_test.py | 1 - test/test_base_metric.py | 10 ++++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index f712dc7e..ce855afc 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -396,7 +396,6 @@ def test_loss_grad_lbfgs(self): tn_graph = _make_knn_graph(target_neighbors) pull_loss_grad_m = _sum_weighted_outer_products(X, tn_graph) - kwargs = { 'classes': classes, 'target_neighbors': target_neighbors, diff --git a/test/test_base_metric.py b/test/test_base_metric.py index d0db85fd..9a62f8d9 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -36,10 +36,12 @@ def test_covariance(self): remove_spaces(f"Covariance({merged_kwargs})")) def test_lmnn(self): - def_kwargs = {'tol': 0.001, 'init': 'auto', 'n_neighbors': 3, - 'n_components': None, 'preprocessor': None, - 'random_state': None, 'push_loss_weight': 0.5, - 'verbose': False} + def_kwargs = {'n_neighbors': 3, 'n_components': None, 'init': 'auto', + 'max_impostors': 500_000, 'neighbors_params': None, + 'push_loss_weight': 0.5, 'max_iter': 50, 'tol': 1e-5, + 'preprocessor': None, 'verbose': False, + 'random_state': None} + def_kwargs = {k: def_kwargs[k] for k in 
sorted(def_kwargs.keys())} nndef_kwargs = {'n_neighbors': 6, 'tol': 0.01} merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) self.assertEqual( From 53ec6e9dae1e5b65ce62cb9ea9c33222f88ae4ce Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Sat, 17 Apr 2021 11:52:46 +0200 Subject: [PATCH 05/10] fix flake8 errors --- metric_learn/lmnn.py | 8 ++++---- test/metric_learn_test.py | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 0466d39e..c5d94ceb 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -276,10 +276,10 @@ def _validate_params(self, X, y): min_non_singleton_size = class_sizes[~mask_singleton_class].min() if self.n_neighbors >= min_non_singleton_size: warnings.warn('`n_neighbors` (={}) is not less than the number of ' - 'samples in the smallest non-singleton class (={}). ' - '`n_neighbors_` will be set to {} for estimation.' - .format(self.n_neighbors, min_non_singleton_size, - min_non_singleton_size - 1)) + 'samples in the smallest non-singleton class (={}). ' + '`n_neighbors_` will be set to {} for estimation.' + .format(self.n_neighbors, min_non_singleton_size, + min_non_singleton_size - 1)) self.n_neighbors_ = min(self.n_neighbors, min_non_singleton_size - 1) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index ce855afc..44ea0eb3 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -2,9 +2,8 @@ import re import pytest import numpy as np -import scipy from scipy.optimize import check_grad, approx_fprime -from sklearn.metrics import pairwise_distances, euclidean_distances +from sklearn.metrics import pairwise_distances from sklearn.datasets import (load_iris, make_classification, make_regression, make_spd_matrix) from numpy.testing import (assert_array_almost_equal, assert_array_equal, From d407def0b135130a77067fef09638f69ff841f18 Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Wed, 28 Apr 2021 10:45:55 +0200 Subject: [PATCH 06/10] fix double-naming bug in fib --- metric_learn/lmnn.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index c5d94ceb..c37b54b7 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -122,6 +122,8 @@ class LMNN(MahalanobisMixin, TransformerMixin): n_iter_ : `int` The number of iterations the solver has run. + random_state_ : numpy.RandomState + Pseudo random number generator object used during initialization. Examples -------- @@ -484,7 +486,7 @@ def _find_impostors_blockwise(X, radii, ind_a, ind_b, Indices of samples from class A. ind_b : array, shape (n_samples_b,) - Indices of samples from class B. + Indices of samples from class B, where n_samples_b << n_samples_a. 
block_size : int, optional (default=8) The maximum number of mebibytes (MiB) of memory to use at a time for @@ -522,9 +524,9 @@ def _find_impostors_blockwise(X, radii, ind_a, ind_b, dist = euclidean_distances(X[ind_a[chunk]], X_b, squared=True, Y_norm_squared=X_b_norm_squared) - ind_b, = np.where((dist < radii_a[chunk, None]).ravel()) - ind_a, = np.where((dist < radii_b[None, :]).ravel()) - ind = np.unique(np.concatenate((ind_a, ind_b))) + ind_ba, = np.where((dist < radii_a[chunk, None]).ravel()) + ind_ab, = np.where((dist < radii_b[None, :]).ravel()) + ind = np.unique(np.concatenate((ind_ab, ind_ba))) if len(ind): ind_plus_offset = ind + chunk.start * X_b.shape[0] From 131d0b2049a730853a812646aeaa16fdc0fe7a03 Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Sun, 23 May 2021 22:41:59 +0200 Subject: [PATCH 07/10] add deprecation warnings for k, regularization --- metric_learn/lmnn.py | 38 +++++++++++++++++++++++++++++++++----- test/metric_learn_test.py | 15 +++++++++++++-- 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index c37b54b7..dc8e219d 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -33,6 +33,12 @@ class LMNN(MahalanobisMixin, TransformerMixin): n_neighbors : int, optional (default=3) Number of neighbors to consider, not including self-edges. + k : Not used + .. deprecated:: 0.6.3 + `k` was deprecated in version 0.6.3 and will + be removed in 0.6.4. It is replaced by `n_neighbors` to have more + consistent terminology with scikit-learn. + n_components : int or None, optional (default=None) Dimensionality of reduced space (if None, defaults to dimension of X). @@ -90,6 +96,13 @@ class LMNN(MahalanobisMixin, TransformerMixin): Relative weight between pull and push terms, with 0.5 meaning equal weight. + regularization : Not used + .. deprecated:: 0.6.3 + `regularization` was deprecated in version 0.6.3 and will + be removed in 0.6.4. It is replaced by `push_loss_weight` that more + explicitly defines which loss term is meant to be weighted. + + max_iter : int, optional (default=1000) Maximum number of iterations of the optimization procedure. @@ -153,17 +166,19 @@ class LMNN(MahalanobisMixin, TransformerMixin): https://en.wikipedia.org/wiki/Large_margin_nearest_neighbor """ - def __init__(self, n_neighbors=3, n_components=None, init='auto', - max_impostors=500_000, neighbors_params=None, - push_loss_weight=0.5, max_iter=50, tol=1e-5, - preprocessor=None, verbose=False, random_state=None): + def __init__(self, n_neighbors=3, k='deprecated', n_components=None, + init='auto', max_impostors=500_000, neighbors_params=None, + push_loss_weight=0.5, regularization='deprecated', max_iter=50, + tol=1e-5, preprocessor=None, verbose=False, random_state=None): self.n_neighbors = n_neighbors + self.k = k self.n_components = n_components self.init = init self.max_impostors = max_impostors self.neighbors_params = neighbors_params self.push_loss_weight = push_loss_weight + self.regularization = regularization self.max_iter = max_iter self.tol = tol self.verbose = verbose @@ -248,6 +263,19 @@ def _validate_params(self, X, y): classes: array, shape (n_classes,), the non-singleton classes encoded. """ + # TODO: remove these in v0.6.4 + if self.k != 'deprecated': + warnings.warn('"k" parameter from initialization is not used.' + ' It has been deprecated in version 0.6.3 and will be' + ' removed in 0.6.4. 
Use the "n_neighbors" parameter ' + 'instead.', DeprecationWarning) + + if self.regularization != 'deprecated': + warnings.warn('"regularization" parameter from initialization is not ' + 'used. It has been deprecated in version 0.6.3 and will ' + 'be removed in 0.6.4. Use the "push_loss_weight" parameter' + 'instead.', DeprecationWarning) + # Find the appearing classes and the class index of each of the samples classes, y = np.unique(y, return_inverse=True) classes_inverse = np.arange(len(classes)) @@ -630,7 +658,7 @@ def _paired_distances_blockwise(X, ind_a, ind_b, squared=True, block_size=8): """ bytes_per_row = X.shape[1] * X.itemsize - batch_size = int(block_size*1024*1024 // bytes_per_row) + batch_size = int(block_size * 1024 * 1024 // bytes_per_row) n_pairs = len(ind_a) distances = np.zeros(n_pairs) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 44ea0eb3..2355dc40 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -396,7 +396,7 @@ def test_loss_grad_lbfgs(self): pull_loss_grad_m = _sum_weighted_outer_products(X, tn_graph) kwargs = { - 'classes': classes, + 'classes': classes, 'target_neighbors': target_neighbors, 'pull_loss_grad_m': pull_loss_grad_m, } @@ -407,8 +407,19 @@ def fun(L): def grad(L): return lmnn._loss_grad_lbfgs(L, X, y, **kwargs)[1] + # compute gradient with and without finite differences + epsilon = np.sqrt(np.finfo(float).eps) + grad_fin_diff = approx_fprime(L.ravel(), fun, epsilon) + grad_lmnn = grad(L) + + # compute absolute error + grad_error = np.sqrt((grad_lmnn - grad_fin_diff)**2) + # compute relative error - rel_diff = check_grad(fun, grad, L.ravel()) / np.linalg.norm(grad(L)) + # rel_diff1 = grad_error / np.linalg.norm(grad_fin_diff) + # rel_diff2 = grad_error / np.linalg.norm(grad_lmnn) + + rel_diff = grad_error / np.linalg.norm(grad_lmnn) np.testing.assert_almost_equal(rel_diff, 0., decimal=5) From 4b9a01ae1b5ffcd2543cfda84360a58e2def880a Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Sun, 23 May 2021 23:08:43 +0200 Subject: [PATCH 08/10] add deprecation vars in TestStringRepr --- test/test_base_metric.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/test_base_metric.py b/test/test_base_metric.py index 9a62f8d9..2d8fd69e 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -36,9 +36,10 @@ def test_covariance(self): remove_spaces(f"Covariance({merged_kwargs})")) def test_lmnn(self): - def_kwargs = {'n_neighbors': 3, 'n_components': None, 'init': 'auto', - 'max_impostors': 500_000, 'neighbors_params': None, - 'push_loss_weight': 0.5, 'max_iter': 50, 'tol': 1e-5, + def_kwargs = {'n_neighbors': 3, 'k': 'deprecated', 'n_components': None, + 'init': 'auto', 'max_impostors': 500_000, + 'neighbors_params': None, 'push_loss_weight': 0.5, + 'regularization': 'deprecated', 'max_iter': 50, 'tol': 1e-5, 'preprocessor': None, 'verbose': False, 'random_state': None} def_kwargs = {k: def_kwargs[k] for k in sorted(def_kwargs.keys())} From cc5c7131c1d30015bd2a477c45449b79117b50e1 Mon Sep 17 00:00:00 2001 From: John Chiotellis Date: Sun, 29 Aug 2021 15:31:49 +0200 Subject: [PATCH 09/10] add back test_loss_func() test, make random state private --- metric_learn/lmnn.py | 10 +++--- test/metric_learn_test.py | 71 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index dc8e219d..dfb3f2fe 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -135,8 +135,6 @@ class 
LMNN(MahalanobisMixin, TransformerMixin): n_iter_ : `int` The number of iterations the solver has run. - random_state_ : numpy.RandomState - Pseudo random number generator object used during initialization. Examples -------- @@ -201,12 +199,12 @@ def fit(self, X, y): n_components = _check_n_components(n_features, self.n_components) # Initialize the random generator - self.random_state_ = check_random_state(self.random_state) + self._random_state = check_random_state(self.random_state) # Initialize transformation self.components_ = _initialize_components(n_components, X, y, self.init, verbose=self.verbose, - random_state=self.random_state_) + random_state=self._random_state) # remove singletons after initializing components X, y, classes = self._validate_params(X, y) @@ -462,7 +460,7 @@ def _find_impostors(self, X_embedded, y, classes, margin_radii): n_impostors = len(ind_impostors) if n_impostors: if n_impostors > self.max_impostors: - ind_samples = self.random_state_.choice( + ind_samples = self._random_state.choice( n_impostors, self.max_impostors, replace=False) ind_impostors = ind_impostors[ind_samples] @@ -479,7 +477,7 @@ def _find_impostors(self, X_embedded, y, classes, margin_radii): # Make sure we do not exceed max_impostors n_impostors = len(imp_row) if n_impostors > self.max_impostors: - ind_samples = self.random_state_.choice( + ind_samples = self._random_state.choice( n_impostors, self.max_impostors, replace=False) imp_row = imp_row[ind_samples] imp_col = imp_col[ind_samples] diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 2355dc40..2fce2fa4 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -423,6 +423,77 @@ def grad(L): np.testing.assert_almost_equal(rel_diff, 0., decimal=5) +def test_loss_func(capsys): + """Test the loss function (and its gradient) on a simple example, + by comparing the results with the actual implementation of metric-learn, + with a very simple (but nonperformant) implementation""" + + import scipy + from sklearn.metrics import euclidean_distances + + # toy dataset to use + X, y = make_classification(n_samples=40, n_classes=2, + n_features=6, + n_redundant=0, shuffle=True, + scale=[1, 1, 20, 20, 20, 20], random_state=42) + + def hinge(a): + if a > 0: + return a, 1 + else: + return 0, 0 + + def loss_fn(L, X, y, target_neighbors, reg): + L = L.reshape(-1, X.shape[1]) + Lx = np.dot(X, L.T) + loss = 0 + total_active = 0 + grad = np.zeros_like(L) + for i in range(X.shape[0]): + for j in target_neighbors[i]: + loss += (1 - reg) * np.sum((Lx[i] - Lx[j]) ** 2) + grad += (1 - reg) * np.outer(Lx[i] - Lx[j], X[i] - X[j]) + for k in range(X.shape[0]): + if y[i] != y[k]: + hin, active = hinge(1 + np.sum((Lx[i] - Lx[j])**2) - + np.sum((Lx[i] - Lx[k])**2)) + total_active += active + if active: + loss += reg * hin + grad += (reg * (np.outer(Lx[i] - Lx[j], X[i] - X[j]) - + np.outer(Lx[i] - Lx[k], X[i] - X[k]))) + grad = 2 * grad + return grad, loss, total_active + + # we check that the gradient we have computed in the non-performant implem + # is indeed the true gradient on a toy example: + + def _select_targets(X, y, k): + target_neighbors = np.empty((X.shape[0], k), dtype=int) + for label in np.unique(y): + inds, = np.nonzero(y == label) + dd = euclidean_distances(X[inds], squared=True) + np.fill_diagonal(dd, np.inf) + nn = np.argsort(dd)[..., :k] + target_neighbors[inds] = inds[nn] + return target_neighbors + + target_neighbors = _select_targets(X, y, 2) + regularization = 0.5 + n_features = X.shape[1] + x0 = 
np.random.randn(1, n_features)
+
+  def loss(x0):
+    return loss_fn(x0.reshape(-1, n_features), X, y, target_neighbors,
+                   regularization)[1]
+
+  def grad(x0):
+    return loss_fn(x0.reshape(-1, n_features), X, y, target_neighbors,
+                   regularization)[0].ravel()
+
+  scipy.optimize.check_grad(loss, grad, x0.ravel())
+
+
 def test_compute_push_loss():
   """Test if the push loss is computed correctly

From e5cd82e04a1f0de38d0352acbc43c8fd3ea0c5c9 Mon Sep 17 00:00:00 2001
From: John Chiotellis
Date: Sun, 29 Aug 2021 16:03:19 +0200
Subject: [PATCH 10/10] add better docstring for push_loss_weight

---
 metric_learn/lmnn.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py
index dfb3f2fe..95868f37 100644
--- a/metric_learn/lmnn.py
+++ b/metric_learn/lmnn.py
@@ -93,8 +93,10 @@ class LMNN(MahalanobisMixin, TransformerMixin):
     neighbors.
 
   push_loss_weight: float, optional (default=0.5)
-    Relative weight between pull and push terms, with 0.5 meaning equal
-    weight.
+    Relative weight between the pull loss and push loss terms, with 0.5
+    meaning equal weight. Note that in practice the push loss term is left
+    unweighted and the pull loss term is weighted accordingly by a factor of
+    `(1 - push_loss_weight) / push_loss_weight`.
 
   regularization : Not used
     .. deprecated:: 0.6.3
@@ -102,7 +104,6 @@ class LMNN(MahalanobisMixin, TransformerMixin):
       be removed in 0.6.4. It is replaced by `push_loss_weight` that more
       explicitly defines which loss term is meant to be weighted.
 
-
   max_iter : int, optional (default=1000)
     Maximum number of iterations of the optimization procedure.
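
As a worked illustration of the reweighting described in the docstring above, and of the
outer-product form of the gradient that the `_sum_weighted_outer_products` docstring relies
on, here is a minimal NumPy sketch; the names `w`, `pull`, `push` and `d` are illustrative
only and are not part of the library:

import numpy as np

rng = np.random.RandomState(0)
x_i, x_j = rng.randn(5), rng.randn(5)
M = rng.randn(5, 5)
diff = x_i - x_j


def d(M_):
    # squared Mahalanobis-style distance d_ij = (x_i - x_j)^T M (x_i - x_j)
    return diff @ M_ @ diff


# The gradient of d_ij with respect to M is the outer product
# (x_i - x_j)(x_i - x_j)^T; check one entry by finite differences.
grad_analytic = np.outer(diff, diff)
eps = 1e-6
E = np.zeros_like(M)
E[2, 3] = eps
grad_fd_23 = (d(M + E) - d(M)) / eps
assert np.isclose(grad_analytic[2, 3], grad_fd_23)

# Reweighting: (1 - w) * pull + w * push and ((1 - w) / w) * pull + push
# differ only by the positive factor 1 / w, so they share the same minimizer.
w = 0.5                # push_loss_weight
pull, push = 3.7, 1.2  # illustrative loss values
assert np.isclose(((1 - w) * pull + w * push) / w,
                  ((1 - w) / w) * pull + push)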