diff --git a/.gitignore b/.gitignore index 8321c7d2..02ee652d 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,7 @@ htmlcov/ .cache/ .pytest_cache/ doc/auto_examples/* -doc/generated/* \ No newline at end of file +doc/generated/* + +# Pycharm artifacts +.idea/ diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 8bdc4bf0..95868f37 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -1,10 +1,17 @@ """ Large Margin Nearest Neighbor Metric learning (LMNN) """ +import sys +import time +import warnings import numpy as np -from collections import Counter +from scipy.optimize import minimize +from scipy.sparse import coo_matrix, csr_matrix, csc_matrix +from sklearn.neighbors import NearestNeighbors from sklearn.metrics import euclidean_distances +from sklearn.utils import gen_batches, check_random_state from sklearn.base import TransformerMixin +from sklearn.exceptions import ConvergenceWarning from ._util import _initialize_components, _check_n_components from .base_metric import MahalanobisMixin @@ -23,6 +30,18 @@ class LMNN(MahalanobisMixin, TransformerMixin): Parameters ---------- + n_neighbors : int, optional (default=3) + Number of neighbors to consider, not including self-edges. + + k : Not used + .. deprecated:: 0.6.3 + `k` was deprecated in version 0.6.3 and will + be removed in 0.6.4. It is replaced by `n_neighbors` to have more + consistent terminology with scikit-learn. + + n_components : int or None, optional (default=None) + Dimensionality of reduced space (if None, defaults to dimension of X). + init : string or numpy array, optional (default='auto') Initialization of the linear transformation. Possible options are 'auto', 'pca', 'identity', 'random', and a numpy array of shape @@ -63,35 +82,41 @@ class LMNN(MahalanobisMixin, TransformerMixin): :meth:`fit` and n_features_a must be less than or equal to that. If ``n_components`` is not None, n_features_a must match it. - k : int, optional (default=3) - Number of neighbors to consider, not including self-edges. + max_impostors : int, optional (default=500_000) + Maximum number of impostors to consider per iteration. In the worst + case this will allow ``max_impostors * n_neighbors`` constraints to be + active. + + neighbors_params : dict, optional (default=None) + Parameters to pass to a :class:`neighbors.NearestNeighbors` instance - + apart from ``n_neighbors`` - that will be used to select the target + neighbors. - min_iter : int, optional (default=50) - Minimum number of iterations of the optimization procedure. + push_loss_weight: float, optional (default=0.5) + Relative weight between pull loss and push loss terms, with 0.5 meaning + equal weight. Note that in practice the push loss weight will be + unweighted and the pull loss will be weighted accordingly by a factor + `(1 - push_loss_weight) / push_loss_weight`. + + regularization : Not used + .. deprecated:: 0.6.3 + `regularization` was deprecated in version 0.6.3 and will + be removed in 0.6.4. It is replaced by `push_loss_weight` that more + explicitly defines which loss term is meant to be weighted. max_iter : int, optional (default=1000) Maximum number of iterations of the optimization procedure. - learn_rate : float, optional (default=1e-7) - Learning rate of the optimization procedure - - tol : float, optional (default=0.001) + tol : float, optional (default=0.00001) Tolerance of the optimization procedure. If the objective value varies less than `tol`, we consider the algorithm has converged and stop it. 
- verbose : bool, optional (default=False) - Whether to print the progress of the optimization procedure. - - regularization: float, optional (default=0.5) - Relative weight between pull and push terms, with 0.5 meaning equal - weight. - preprocessor : array-like, shape=(n_samples, n_features) or callable The preprocessor to call to get tuples from indices. If array-like, tuples will be formed like this: X[indices]. - n_components : int or None, optional (default=None) - Dimensionality of reduced space (if None, defaults to dimension of X). + verbose : bool, optional (default=False) + Whether to print the progress of the optimization procedure. random_state : int or numpy.RandomState or None, optional (default=None) A pseudo random number generator object or a seed for it if int. If @@ -101,233 +126,591 @@ class LMNN(MahalanobisMixin, TransformerMixin): Attributes ---------- + components_ : `numpy.ndarray`, shape=(n_components, n_features) + The learned linear transformation ``L``. + + n_neighbors_ : int + The provided ``n_neighbors`` is decreased if it is greater than or + equal to min(number of elements in each class). + n_iter_ : `int` The number of iterations the solver has run. - components_ : `numpy.ndarray`, shape=(n_components, n_features) - The learned linear transformation ``L``. Examples -------- >>> import numpy as np >>> from metric_learn import LMNN + >>> from sklearn.neighbors import KNeighborsClassifier >>> from sklearn.datasets import load_iris >>> iris_data = load_iris() >>> X = iris_data['data'] >>> Y = iris_data['target'] - >>> lmnn = LMNN(k=5, learn_rate=1e-6) - >>> lmnn.fit(X, Y, verbose=False) + >>> lmnn = LMNN(n_neighbors=3) + >>> lmnn.fit(X, Y) + >>> knn = KNeighborsClassifier(n_neighbors=3) + >>> knn.fit(lmnn.transform(X), Y) References ---------- - .. [1] K. Q. Weinberger, J. Blitzer, L. K. Saul. `Distance Metric - Learning for Large Margin Nearest Neighbor Classification - `_. NIPS - 2005. + .. [1] Weinberger, Kilian Q., and Lawrence K. Saul. + "Distance Metric Learning for Large Margin Nearest Neighbor + Classification." + Journal of Machine Learning Research, Vol. 10, Feb. 2009, + pp. 207-244. + http://jmlr.csail.mit.edu/papers/volume10/weinberger09a/weinberger09a.pdf + + .. 
[2] Wikipedia entry on Large Margin Nearest Neighbor + https://en.wikipedia.org/wiki/Large_margin_nearest_neighbor + """ + def __init__(self, n_neighbors=3, k='deprecated', n_components=None, + init='auto', max_impostors=500_000, neighbors_params=None, + push_loss_weight=0.5, regularization='deprecated', max_iter=50, + tol=1e-5, preprocessor=None, verbose=False, random_state=None): - def __init__(self, init='auto', k=3, min_iter=50, max_iter=1000, - learn_rate=1e-7, regularization=0.5, convergence_tol=0.001, - verbose=False, preprocessor=None, - n_components=None, random_state=None): - self.init = init + self.n_neighbors = n_neighbors self.k = k - self.min_iter = min_iter - self.max_iter = max_iter - self.learn_rate = learn_rate + self.n_components = n_components + self.init = init + self.max_impostors = max_impostors + self.neighbors_params = neighbors_params + self.push_loss_weight = push_loss_weight self.regularization = regularization - self.convergence_tol = convergence_tol + self.max_iter = max_iter + self.tol = tol self.verbose = verbose - self.n_components = n_components self.random_state = random_state super(LMNN, self).__init__(preprocessor) def fit(self, X, y): - k = self.k - reg = self.regularization - learn_rate = self.learn_rate - - X, y = self._prepare_inputs(X, y, dtype=float, - ensure_min_samples=2) - num_pts, d = X.shape - output_dim = _check_n_components(d, self.n_components) - unique_labels, label_inds = np.unique(y, return_inverse=True) - if len(label_inds) != num_pts: - raise ValueError('Must have one label per point.') - self.labels_ = np.arange(len(unique_labels)) - - self.components_ = _initialize_components(output_dim, X, y, self.init, - self.verbose, - random_state=self.random_state) - required_k = np.bincount(label_inds).min() - if self.k > required_k: - raise ValueError('not enough class labels for specified k' - ' (smallest class has %d)' % required_k) - - target_neighbors = self._select_targets(X, label_inds) - - # sum outer products - dfG = _sum_outer_products(X, target_neighbors.flatten(), - np.repeat(np.arange(X.shape[0]), k)) - - # initialize L - L = self.components_ - - # first iteration: we compute variables (including objective and gradient) - # at initialization point - G, objective, total_active = self._loss_grad(X, L, dfG, k, - reg, target_neighbors, - label_inds) - - it = 1 # we already made one iteration + """Train Large Margin Nearest Neighbor model. + + :param X: array, shape (n_samples, n_features), the training samples. + :param y: array, shape (n_samples,), the training labels. + + :return: The trained LMNN model. + """ + + # Check input arrays + X, y = self._prepare_inputs(X, y, dtype=float, ensure_min_samples=2) + + n_samples, n_features = X.shape + n_components = _check_n_components(n_features, self.n_components) + + # Initialize the random generator + self._random_state = check_random_state(self.random_state) + + # Initialize transformation + self.components_ = _initialize_components(n_components, X, y, self.init, + verbose=self.verbose, + random_state=self._random_state) + + # remove singletons after initializing components + X, y, classes = self._validate_params(X, y) + + # Find the target neighbors of each sample + target_neighbors = self._select_target_neighbors(X, y, classes) + + # Compute the pull loss gradient w.r.t. 
M, which remains constant + tn_graph = _make_knn_graph(target_neighbors) + pull_loss_grad_m = _sum_weighted_outer_products(X, tn_graph) + + # Compute the pull loss weight such that the push loss weight becomes 1 + pull_loss_weight = (1 - self.push_loss_weight) / self.push_loss_weight + pull_loss_grad_m *= pull_loss_weight + + int_verbose = int(self.verbose) + disp = int_verbose - 2 if int_verbose > 1 else -1 + optimizer_params = { + 'method': 'L-BFGS-B', + 'fun': self._loss_grad_lbfgs, + 'jac': True, + 'args': (X, y, classes, target_neighbors, pull_loss_grad_m), + 'x0': self.components_, + 'tol': self.tol, + 'options': dict(maxiter=self.max_iter, disp=disp), + } + + # Call the optimizer + self.n_iter_ = 0 + opt_result = minimize(**optimizer_params) + + # Reshape the solution found by the optimizer + self.components_ = opt_result.x.reshape(-1, n_features) if self.verbose: - print("iter | objective | objective difference | active constraints", - "| learning rate") - - # main loop - for it in range(2, self.max_iter): - # then at each iteration, we try to find a value of L that has better - # objective than the previous L, following the gradient: - while True: - # the next point next_L to try out is found by a gradient step - L_next = L - learn_rate * G - # we compute the objective at next point - # we copy variables that can be modified by _loss_grad, because if we - # retry we don t want to modify them several times - (G_next, objective_next, total_active_next) = ( - self._loss_grad(X, L_next, dfG, k, reg, target_neighbors, - label_inds)) - assert not np.isnan(objective) - delta_obj = objective_next - objective - if delta_obj > 0: - # if we did not find a better objective, we retry with an L closer to - # the starting point, by decreasing the learning rate (making the - # gradient step smaller) - learn_rate /= 2 - else: - # otherwise, if we indeed found a better obj, we get out of the loop - break - # when the better L is found (and the related variables), we set the - # old variables to these new ones before next iteration and we - # slightly increase the learning rate - L = L_next - G, objective, total_active = G_next, objective_next, total_active_next - learn_rate *= 1.01 - - if self.verbose: - print(it, objective, delta_obj, total_active, learn_rate) - - # check for convergence - if it > self.min_iter and abs(delta_obj) < self.convergence_tol: - if self.verbose: - print("LMNN converged with objective", objective) - break - else: - if self.verbose: - print("LMNN didn't converge in %d steps." 
% self.max_iter) - - # store the last L - self.components_ = L - self.n_iter_ = it + cls_name = self.__class__.__name__ + + # Warn the user if the algorithm did not converge + if not opt_result.success: + warnings.warn('[{}] LMNN did not converge: {}'.format( + cls_name, opt_result.message), ConvergenceWarning) + return self - def _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds): - # Compute pairwise distances under current metric - Lx = L.dot(X.T).T - - # we need to find the furthest neighbor: - Ni = 1 + _inplace_paired_L2(Lx[target_neighbors], Lx[:, None, :]) - furthest_neighbors = np.take_along_axis(target_neighbors, - Ni.argmax(axis=1)[:, None], 1) - impostors = self._find_impostors(furthest_neighbors.ravel(), X, - label_inds, L) - - g0 = _inplace_paired_L2(*Lx[impostors]) - - # we reorder the target neighbors - g1, g2 = Ni[impostors] - # compute the gradient - total_active = 0 - df = np.zeros((X.shape[1], X.shape[1])) - for nn_idx in reversed(range(k)): # note: reverse not useful here - act1 = g0 < g1[:, nn_idx] - act2 = g0 < g2[:, nn_idx] - total_active += act1.sum() + act2.sum() - - targets = target_neighbors[:, nn_idx] - PLUS, pweight = _count_edges(act1, act2, impostors, targets) - df += _sum_outer_products(X, PLUS[:, 0], PLUS[:, 1], pweight) - - in_imp, out_imp = impostors - df -= _sum_outer_products(X, in_imp[act1], out_imp[act1]) - df -= _sum_outer_products(X, in_imp[act2], out_imp[act2]) - - # do the gradient update - assert not np.isnan(df).any() - G = dfG * reg + df * (1 - reg) - G = L.dot(G) - # compute the objective function - objective = total_active * (1 - reg) - objective += G.flatten().dot(L.flatten()) - return 2 * G, objective, total_active - - def _select_targets(self, X, label_inds): - target_neighbors = np.empty((X.shape[0], self.k), dtype=int) - for label in self.labels_: - inds, = np.nonzero(label_inds == label) - dd = euclidean_distances(X[inds], squared=True) - np.fill_diagonal(dd, np.inf) - nn = np.argsort(dd)[..., :self.k] - target_neighbors[inds] = inds[nn] + def _validate_params(self, X, y): + """Validate parameters as soon as :meth:`fit` is called. + + :param X: array, shape (n_samples, n_features), the training samples. + :param y: array, shape (n_samples,), the training labels. + + :return: + X: array, shape (n_samples, n_features), the validated training samples. + y: array, shape (n_samples,), the validated training labels. + classes: array, shape (n_classes,), the non-singleton classes encoded. + """ + + # TODO: remove these in v0.6.4 + if self.k != 'deprecated': + warnings.warn('"k" parameter from initialization is not used.' + ' It has been deprecated in version 0.6.3 and will be' + ' removed in 0.6.4. Use the "n_neighbors" parameter ' + 'instead.', DeprecationWarning) + + if self.regularization != 'deprecated': + warnings.warn('"regularization" parameter from initialization is not ' + 'used. It has been deprecated in version 0.6.3 and will ' + 'be removed in 0.6.4. Use the "push_loss_weight" parameter' + 'instead.', DeprecationWarning) + + # Find the appearing classes and the class index of each of the samples + classes, y = np.unique(y, return_inverse=True) + classes_inverse = np.arange(len(classes)) + + # Ignore classes that have less than two samples (singleton classes) + class_sizes = np.bincount(y) + mask_singleton_class = class_sizes == 1 + singleton_classes = np.where(mask_singleton_class)[0] + if len(singleton_classes): + warnings.warn('There are {} singleton classes that will be ignored ' + 'during training. 
A copy of the inputs `X` and `y` will ' + 'be made.'.format(len(singleton_classes))) + + mask_singleton_sample = np.asarray([yi in singleton_classes for + yi in y]) + X = X[~mask_singleton_sample] + y = y[~mask_singleton_sample] + + # Check that there are at least 2 non-singleton classes + n_classes_non_singleton = len(classes) - len(singleton_classes) + if n_classes_non_singleton < 2: + raise ValueError('LMNN needs at least 2 non-singleton classes, got {}.' + .format(n_classes_non_singleton)) + + classes_inverse_non_singleton = classes_inverse[~mask_singleton_class] + + # Check the preferred number of neighbors + min_non_singleton_size = class_sizes[~mask_singleton_class].min() + if self.n_neighbors >= min_non_singleton_size: + warnings.warn('`n_neighbors` (={}) is not less than the number of ' + 'samples in the smallest non-singleton class (={}). ' + '`n_neighbors_` will be set to {} for estimation.' + .format(self.n_neighbors, min_non_singleton_size, + min_non_singleton_size - 1)) + + self.n_neighbors_ = min(self.n_neighbors, min_non_singleton_size - 1) + + return X, y, classes_inverse_non_singleton + + def _loss_grad_lbfgs(self, L, X, y, classes, target_neighbors, + pull_loss_grad_m): + """Compute loss and gradient after one optimization iteration. + + :param L: array, shape (n_components * n_features,) + The current (flattened) linear transformation. + :param X: array, shape (n_samples, n_features), the training samples. + :param y: array, shape (n_samples,), the training labels. + :param classes: array, shape (n_classes,), classes encoded as intergers. + :param target_neighbors: array, shape (n_samples, n_neighbors), + the target neighbors of each training sample. + :param pull_loss_grad_m: array, shape (n_features, n_features), + the (weighted) gradient (w.r.t M=L.T@L) due to target + neighbors that stays fixed throughout the algorithm. + + :return: + loss: float, the loss given the current solution. + grad: array, shape (n_components, n_features), the (flattened) gradient. 
+ """ + + # Print headers + if self.n_iter_ == 0 and self.verbose: + header_fields = ['Iteration', 'Objective Value', + '#Active Triplets', 'Time(s)'] + header_fmt = '{:>10} {:>20} {:>20} {:>10}' + header = header_fmt.format(*header_fields) + cls_name = self.__class__.__name__ + print('[{}]'.format(cls_name)) + print('[{}] {}'.format(cls_name, header)) + print('[{}] {}'.format(cls_name, '-' * len(header))) + + # Measure duration of optimization iteration + start_time = time.time() + + # Reshape current solution + n_samples, n_features = X.shape + L = L.reshape(-1, n_features) + + # Transform samples according to the current solution + X_embedded = X @ L.T + + # Compute squared distances to the target neighbors + n_neighbors = target_neighbors.shape[1] + dist_tn = np.zeros((n_samples, n_neighbors)) + for k in range(n_neighbors): + ind_tn = target_neighbors[:, k] + diffs = X_embedded - X_embedded[ind_tn] + dist_tn[:, k] = np.einsum('ij,ij->i', diffs, diffs) # row norms + + # Add margin to the distances + dist_tn += 1 + + # Find impostors + impostors_graph = self._find_impostors(X_embedded, y, classes, + dist_tn[:, -1]) + + # Compute the push loss and its gradient + push_loss, push_loss_grad_m, n_active = _push_loss_grad( + X, target_neighbors, dist_tn, impostors_graph) + + # Compute the total loss + M = L.T @ L + # It holds that trace(M.T @ C) = np.dot(M.ravel(), C.ravel()) + new_pull_loss = np.dot(pull_loss_grad_m.ravel(), M.ravel()) + loss = push_loss + new_pull_loss + + # Compute the total gradient (grad_L = 2L grad_M) + grad = np.dot(L, pull_loss_grad_m + push_loss_grad_m) + grad *= 2 + + # Report iteration metrics + self.n_iter_ += 1 + if self.verbose: + elapsed_time = time.time() - start_time + values_fmt = '[{}] {:>10} {:>20.6e} {:>20,} {:>10.2f}' + cls_name = self.__class__.__name__ + print(values_fmt.format(cls_name, self.n_iter_, loss, + n_active, elapsed_time)) + sys.stdout.flush() + + return loss, grad.ravel() + + def _select_target_neighbors(self, X, y, classes): + """Find the target neighbors of each training sample. + + :param X: array, shape (n_samples, n_features), the training samples. + :param y: array, shape (n_samples,), the training labels. + :param classes: array, shape (n_classes,), the classes encoded as integers. + + :return: array, shape (n_samples, n_neighbors), + the indices of the target neighbors of each training sample. 
+ """ + + start_time = time.time() + cls_name = self.__class__.__name__ + if self.verbose: + print('[{}] Finding the target neighbors...'.format(cls_name)) + sys.stdout.flush() + + nn_kwargs = self.neighbors_params or {} + nn = NearestNeighbors(n_neighbors=self.n_neighbors_, **nn_kwargs) + target_neighbors = np.empty((X.shape[0], self.n_neighbors_), dtype=int) + + for label in classes: + ind_class = np.where(y == label)[0] + nn.fit(X[ind_class]) + ind_neighbors = nn.kneighbors(return_distance=False) + target_neighbors[ind_class] = ind_class[ind_neighbors] + + if self.verbose: + elapsed_time = time.time() - start_time + print('[{}] Found the target neighbors in {:5.2f}s.'.format( + cls_name, elapsed_time)) + return target_neighbors - def _find_impostors(self, furthest_neighbors, X, label_inds, L): - Lx = X.dot(L.T) - margin_radii = 1 + _inplace_paired_L2(Lx[furthest_neighbors], Lx) - impostors = [] - for label in self.labels_[:-1]: - in_inds, = np.nonzero(label_inds == label) - out_inds, = np.nonzero(label_inds > label) - dist = euclidean_distances(Lx[out_inds], Lx[in_inds], squared=True) - i1, j1 = np.nonzero(dist < margin_radii[out_inds][:, None]) - i2, j2 = np.nonzero(dist < margin_radii[in_inds]) - i = np.hstack((i1, i2)) - j = np.hstack((j1, j2)) - if i.size > 0: - # get unique (i,j) pairs using index trickery - shape = (i.max() + 1, j.max() + 1) - tmp = np.ravel_multi_index((i, j), shape) - i, j = np.unravel_index(np.unique(tmp), shape) - impostors.append(np.vstack((in_inds[j], out_inds[i]))) - if len(impostors) == 0: - # No impostors detected - return impostors - return np.hstack(impostors) - - -def _inplace_paired_L2(A, B): - '''Equivalent to ((A-B)**2).sum(axis=-1), but modifies A in place.''' - A -= B - return np.einsum('...ij,...ij->...i', A, A) - - -def _count_edges(act1, act2, impostors, targets): - imp = impostors[0, act1] - c = Counter(zip(imp, targets[imp])) - imp = impostors[1, act2] - c.update(zip(imp, targets[imp])) - if c: - active_pairs = np.array(list(c.keys())) + def _find_impostors(self, X_embedded, y, classes, margin_radii): + """Find the samples that violate the margin. + + :param X_embedded: array, shape (n_samples, n_components), + the training samples in the embedding space. + :param y: array, shape (n_samples,), training labels. + :param classes: array, shape (n_classes,), the classes encoded as integers. + :param margin_radii: array, shape (n_samples,), squared distances of + training samples to their farthest target neighbors + plus margin. + + :return: coo_matrix, shape (n_samples, n_neighbors), + If sample i is an impostor to sample j or vice versa, then the + value of A[i, j] is the squared distance between samples i and j. + Otherwise A[i, j] is zero. 
+ + """ + + # Initialize lists for impostors storage + imp_row, imp_col = [], [] + + for label in classes[:-1]: + ind_a = np.where(y == label)[0] + ind_b = np.where(y > label)[0] + + ind_impostors = _find_impostors_blockwise(X_embedded, margin_radii, + ind_b, ind_a) + + n_impostors = len(ind_impostors) + if n_impostors: + if n_impostors > self.max_impostors: + ind_samples = self._random_state.choice( + n_impostors, self.max_impostors, replace=False) + ind_impostors = ind_impostors[ind_samples] + + dims = (len(ind_b), len(ind_a)) + ib, ia = np.unravel_index(ind_impostors, shape=dims) + + imp_row.extend(ind_b[ib]) + imp_col.extend(ind_a[ia]) + + # turn lists to numpy arrays + imp_row = np.asarray(imp_row, dtype=np.intp) + imp_col = np.asarray(imp_col, dtype=np.intp) + + # Make sure we do not exceed max_impostors + n_impostors = len(imp_row) + if n_impostors > self.max_impostors: + ind_samples = self._random_state.choice( + n_impostors, self.max_impostors, replace=False) + imp_row = imp_row[ind_samples] + imp_col = imp_col[ind_samples] + + # Compute distances + imp_dist = _paired_distances_blockwise(X_embedded, imp_row, imp_col) + + # store impostors as a sparse matrix + n = X_embedded.shape[0] + impostors_graph = coo_matrix((imp_dist, (imp_row, imp_col)), shape=(n, n)) + + return impostors_graph + + +######################## +# Some core functions # +####################### + +def _find_impostors_blockwise(X, radii, ind_a, ind_b, + return_distance=False, block_size=8): + """Find (sample, impostor) pairs in blocks to avoid large memory usage. + + Parameters + ---------- + X : array, shape (n_samples, n_components) + Transformed data samples. + + radii : array, shape (n_samples,) + Squared distances of the samples in ``X`` to their margins. + + ind_a : array, shape (n_samples_a,) + Indices of samples from class A. + + ind_b : array, shape (n_samples_b,) + Indices of samples from class B, where n_samples_b << n_samples_a. + + block_size : int, optional (default=8) + The maximum number of mebibytes (MiB) of memory to use at a time for + calculating paired squared distances. + return_distance : bool, optional (default=False) + Whether to return the squared distances to the impostors. + + Returns + ------- + imp_indices : array, shape (n_impostors,) + Unraveled indices of (sample, impostor) pairs referring to a matrix + of shape (n_samples_a, n_samples_b). + imp_distances : array, shape (n_impostors,), optional + imp_distances[i] is the squared distance between samples imp_row[i] and + imp_col[i], where + imp_row, imp_col = np.unravel_index(imp_indices, shape=(n_samples_a, + n_samples_b)) + """ + + n_samples_a = len(ind_a) + bytes_per_row = len(ind_b) * X.itemsize + block_n_rows = int(block_size * 1024 * 1024 // bytes_per_row) + + imp_indices, imp_distances = [], [] + + # data + X_b = X[ind_b] + radii_a = radii[ind_a] + radii_b = radii[ind_b] + + # X_b squared norm stays constant, so pre-compute it to get a speed-up + X_b_norm_squared = np.einsum('ij,ij->i', X_b, X_b).reshape(1, -1) + for chunk in gen_batches(n_samples_a, block_n_rows): + + dist = euclidean_distances(X[ind_a[chunk]], X_b, squared=True, + Y_norm_squared=X_b_norm_squared) + + ind_ba, = np.where((dist < radii_a[chunk, None]).ravel()) + ind_ab, = np.where((dist < radii_b[None, :]).ravel()) + ind = np.unique(np.concatenate((ind_ab, ind_ba))) + + if len(ind): + ind_plus_offset = ind + chunk.start * X_b.shape[0] + imp_indices.extend(ind_plus_offset) + + if return_distance: + # We only need to do clipping if we return the distances. 
+ distances_chunk = dist.ravel()[ind] + # Clip only the indexed (unique) distances + np.maximum(distances_chunk, 0, out=distances_chunk) + imp_distances.extend(distances_chunk) + + imp_indices = np.asarray(imp_indices) + + if return_distance: + return imp_indices, np.asarray(imp_distances) else: - active_pairs = np.empty((0, 2), dtype=int) - return active_pairs, np.array(list(c.values())) + return imp_indices + + +def _push_loss_grad(X, target_neighbors, inflated_dist_tn, impostors_graph): + """Compute the push loss max(d(a, p) + 1 - d(a, n), 0) and its gradient. + + :param X: array, shape (n_samples, n_features), the training samples. + :param target_neighbors: array, shape (n_samples, n_neighbors), + the target neighbors of each training sample. + :param inflated_dist_tn: array, shape (n_samples, n_neighbors), + squared distances of each sample to their target + neighbors plus margin. + :param impostors_graph: coo_matrix, shape (n_samples, n_samples), + + :return: + loss: float, the push loss due to the given target neighbors and impostors. + grad: array, shape (n_features, n_features), the gradient of the push loss + with respect to M = L.T @ L. + n_active_triplets: int, the number of active triplet constraints. + + """ + + n_samples, n_neighbors = inflated_dist_tn.shape + imp_row = impostors_graph.row + imp_col = impostors_graph.col + dist_impostors = impostors_graph.data + + loss = 0 + shape = (n_samples, n_samples) + A0 = csr_matrix(shape) + sample_range = range(n_samples) + n_active_triplets = 0 + for k in reversed(range(n_neighbors)): + # Consider margin violations to the samples in imp_row + losses1 = np.maximum(inflated_dist_tn[imp_row, k] - dist_impostors, 0) + ac = np.where(losses1 > 0)[0] + n_active_triplets += len(ac) + A1 = csr_matrix((2 * losses1[ac], (imp_row[ac], imp_col[ac])), shape) + + # Consider margin violations to the samples in imp_col + losses2 = np.maximum(inflated_dist_tn[imp_col, k] - dist_impostors, 0) + ac = np.where(losses2 > 0)[0] + n_active_triplets += len(ac) + A2 = csc_matrix((2 * losses2[ac], (imp_row[ac], imp_col[ac])), shape) + + # Update the loss + loss += np.dot(losses1, losses1) + np.dot(losses2, losses2) + + # Update the weight matrix for gradient computation + val = (A1.sum(1).ravel() + A2.sum(0)).getA1() + A3 = csr_matrix((val, (sample_range, target_neighbors[:, k])), shape) + A0 = A0 - A1 - A2 + A3 + + grad = _sum_weighted_outer_products(X, A0) + + return loss, grad, n_active_triplets + + +########################## +# Some helper functions # +######################### + +def _paired_distances_blockwise(X, ind_a, ind_b, squared=True, block_size=8): + """Equivalent to row_norms(X[ind_a] - X[ind_b], squared=squared). + + Parameters + ---------- + X : array, shape (n_samples, n_features) + An array of data samples. + ind_a : array, shape (n_indices,) + An array of sample indices. + ind_b : array, shape (n_indices,) + Another array of sample indices. + squared : bool (default=True) + Whether to return the squared distances. + block_size : int, optional (default=8) + The maximum number of mebibytes (MiB) of memory to use at a time for + calculating paired (squared) distances. + + Returns + ------- + distances: array, shape (n_indices,) + An array of paired, optionally squared, distances. 
+ """ + + bytes_per_row = X.shape[1] * X.itemsize + batch_size = int(block_size * 1024 * 1024 // bytes_per_row) + + n_pairs = len(ind_a) + distances = np.zeros(n_pairs) + for chunk in gen_batches(n_pairs, batch_size): + diffs = X[ind_a[chunk]] - X[ind_b[chunk]] + distances[chunk] = np.einsum('ij,ij->i', diffs, diffs) + + return distances if squared else np.sqrt(distances, out=distances) + + +def _sum_weighted_outer_products(X, weights): + """Compute the sum of weighted outer products. + + Consider the squared Mahalanobis distance between x_i and x_j: + + d_{ij} = (x_i - x_j)^T M (x_i - x_j) + = x_i^T M x_i - x_i^T M x_j - x_j^T M x_i + x_j^T M x_j + + The gradient of d_{ij} with respect to M is a sum of outer products: + + \nabla_M d_{ij} = x_i x_i^T - x_j x_i^T - x_i x_j^T + x_j x_j^T + + + :param X: array, shape (n_samples, n_features), data samples. + :param weights: csr_matrix, shape (n_samples, n_samples), sparse weights. + + :return: array, shape (n_features, n_features), + the sum of all weighted outer products. + """ + + W = weights + weights.T + D = W.sum(1).getA() + + # X.T * (D - W) * X + LX = D * X - W @ X # D is n x 1, W is n x n + ret = X.T @ LX + + return ret + + +def _make_knn_graph(indices): + """Convert a dense indices matrix to a sparse adjacency matrix. + + :param indices: array, shape (n_samples, n_neighbors), + indices of the k nearest neighbors of each sample. + :return: csr_matrix, shape (n_samples, n_samples), the adjacency matrix. + """ + n_samples, n_neighbors = indices.shape + row = np.repeat(range(n_samples), n_neighbors) + col = indices.ravel() + ind = np.ones(indices.size) + shape = (n_samples, n_samples) + knn_graph = csr_matrix((ind, (row, col)), shape=shape) -def _sum_outer_products(data, a_inds, b_inds, weights=None): - Xab = data[a_inds] - data[b_inds] - if weights is not None: - return np.dot(Xab.T, Xab * weights[:, None]) - return np.dot(Xab.T, Xab) + return knn_graph diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 5cae80f2..2fce2fa4 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -2,9 +2,8 @@ import re import pytest import numpy as np -import scipy from scipy.optimize import check_grad, approx_fprime -from sklearn.metrics import pairwise_distances, euclidean_distances +from sklearn.metrics import pairwise_distances from sklearn.datasets import (load_iris, make_classification, make_regression, make_spd_matrix) from numpy.testing import (assert_array_almost_equal, assert_array_equal, @@ -26,7 +25,11 @@ MMC_Supervised, SDML, RCA, ITML, SCML) # Import this specially for testing. 
from metric_learn.constraints import wrap_pairs, Constraints -from metric_learn.lmnn import _sum_outer_products +from metric_learn.lmnn import ( + _sum_weighted_outer_products, + _make_knn_graph, + _push_loss_grad +) def class_separation(X, labels): @@ -362,12 +365,11 @@ def test_bounds_parameters_invalid(bounds): class TestLMNN(MetricTestCase): def test_iris(self): - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, verbose=False) lmnn.fit(self.iris_points, self.iris_labels) - csep = class_separation(lmnn.transform(self.iris_points), - self.iris_labels) - self.assertLess(csep, 0.25) + csep = class_separation(lmnn.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.26) def test_loss_grad_lbfgs(self): """Test gradient of loss function @@ -378,38 +380,46 @@ def test_loss_grad_lbfgs(self): X, y = make_classification(random_state=rng) L = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1]) lmnn = LMNN() + lmnn.n_iter_ = 0 + lmnn.n_neighbors_ = lmnn.n_neighbors - k = lmnn.k - reg = lmnn.regularization - - X, y = lmnn._prepare_inputs(X, y, dtype=float, - ensure_min_samples=2) + X, y = lmnn._prepare_inputs(X, y, dtype=float, ensure_min_samples=2) num_pts, n_components = X.shape - unique_labels, label_inds = np.unique(y, return_inverse=True) - lmnn.labels_ = np.arange(len(unique_labels)) + unique_labels, y_inverse = np.unique(y, return_inverse=True) + classes = np.arange(len(unique_labels)) lmnn.components_ = np.eye(n_components) - target_neighbors = lmnn._select_targets(X, label_inds) + target_neighbors = lmnn._select_target_neighbors(X, y_inverse, classes) # sum outer products - dfG = _sum_outer_products(X, target_neighbors.flatten(), - np.repeat(np.arange(X.shape[0]), k)) + tn_graph = _make_knn_graph(target_neighbors) + pull_loss_grad_m = _sum_weighted_outer_products(X, tn_graph) - # initialize L - def loss_grad(flat_L): - return lmnn._loss_grad(X, flat_L.reshape(-1, X.shape[1]), dfG, - k, reg, target_neighbors, label_inds) + kwargs = { + 'classes': classes, + 'target_neighbors': target_neighbors, + 'pull_loss_grad_m': pull_loss_grad_m, + } - def fun(x): - return loss_grad(x)[1] + def fun(L): + return lmnn._loss_grad_lbfgs(L, X, y, **kwargs)[0] - def grad(x): - return loss_grad(x)[0].ravel() + def grad(L): + return lmnn._loss_grad_lbfgs(L, X, y, **kwargs)[1] - # compute relative error + # compute gradient with and without finite differences epsilon = np.sqrt(np.finfo(float).eps) - rel_diff = (check_grad(fun, grad, L.ravel()) / - np.linalg.norm(approx_fprime(L.ravel(), fun, epsilon))) + grad_fin_diff = approx_fprime(L.ravel(), fun, epsilon) + grad_lmnn = grad(L) + + # compute absolute error + grad_error = np.sqrt((grad_lmnn - grad_fin_diff)**2) + + # compute relative error + # rel_diff1 = grad_error / np.linalg.norm(grad_fin_diff) + # rel_diff2 = grad_error / np.linalg.norm(grad_lmnn) + + rel_diff = grad_error / np.linalg.norm(grad_lmnn) np.testing.assert_almost_equal(rel_diff, 0., decimal=5) @@ -418,8 +428,11 @@ def test_loss_func(capsys): by comparing the results with the actual implementation of metric-learn, with a very simple (but nonperformant) implementation""" + import scipy + from sklearn.metrics import euclidean_distances + # toy dataset to use - X, y = make_classification(n_samples=10, n_classes=2, + X, y = make_classification(n_samples=40, n_classes=2, n_features=6, n_redundant=0, shuffle=True, scale=[1, 1, 20, 20, 20, 20], random_state=42) @@ -471,118 +484,75 @@ def _select_targets(X, y, k): x0 = np.random.randn(1, n_features) def 
loss(x0): - return loss_fn(x0.reshape(-1, X.shape[1]), X, y, target_neighbors, + return loss_fn(x0.reshape(-1, n_features), X, y, target_neighbors, regularization)[1] def grad(x0): - return loss_fn(x0.reshape(-1, X.shape[1]), X, y, target_neighbors, + return loss_fn(x0.reshape(-1, n_features), X, y, target_neighbors, regularization)[0].ravel() scipy.optimize.check_grad(loss, grad, x0.ravel()) - class LMNN_with_callback(LMNN): - """ We will use a callback to get the gradient (see later) + +def test_compute_push_loss(): + """Test if the push loss is computed correctly + + This test continues on the example from test_find_impostors. The push + loss is easy to compute, as we have only 4 violations and all of them + amount to 1 (squared distance to target neighbor + 1 - squared distance + to impostor = 4 + 1 - 4). """ - def __init__(self, callback, *args, **kwargs): - self.callback = callback - super(LMNN_with_callback, self).__init__(*args, **kwargs) - - def _loss_grad(self, *args, **kwargs): - grad, objective, total_active = ( - super(LMNN_with_callback, self)._loss_grad(*args, **kwargs)) - self.callback.append(grad) - return grad, objective, total_active - - class LMNN_nonperformant(LMNN_with_callback): - - def fit(self, X, y): - self.y = y - return super(LMNN_nonperformant, self).fit(X, y) - - def _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds): - grad, loss, total_active = loss_fn(L.ravel(), X, self.y, - target_neighbors, self.regularization) - self.callback.append(grad) - return grad, loss, total_active - - mem1, mem2 = [], [] - lmnn_perf = LMNN_with_callback(verbose=True, random_state=42, - init='identity', max_iter=30, callback=mem1) - lmnn_nonperf = LMNN_nonperformant(verbose=True, random_state=42, - init='identity', max_iter=30, - callback=mem2) - objectives, obj_diffs, learn_rate, total_active = (dict(), dict(), dict(), - dict()) - for algo, name in zip([lmnn_perf, lmnn_nonperf], ['perf', 'nonperf']): - algo.fit(X, y) - out, _ = capsys.readouterr() - lines = re.split("\n+", out) - # we get every variable that is printed from the algorithm in verbose - num = r'(-?\d+.?\d*(e[+|-]\d+)?)' - strings = [re.search(r"\d+ (?:{}) (?:{}) (?:(\d+)) (?:{})" - .format(num, num, num), s) for s in lines] - objectives[name] = [float(match.group(1)) for match in strings if match is - not None] - obj_diffs[name] = [float(match.group(3)) for match in strings if match is - not None] - total_active[name] = [float(match.group(5)) for match in strings if - match is not - None] - learn_rate[name] = [float(match.group(6)) for match in strings if match is - not None] - assert len(strings) >= 10 # we ensure that we actually did more than 10 - # iterations - assert total_active[name][0] >= 2 # we ensure that we have some active - # constraints (that's the case we want to test) - # we remove the last element because it can be equal to the penultimate - # if the last gradient update is null - for i in range(len(mem1)): - np.testing.assert_allclose(lmnn_perf.callback[i], - lmnn_nonperf.callback[i], - err_msg='Gradient different at position ' - '{}'.format(i)) - np.testing.assert_allclose(objectives['perf'], objectives['nonperf']) - np.testing.assert_allclose(obj_diffs['perf'], obj_diffs['nonperf']) - np.testing.assert_allclose(total_active['perf'], total_active['nonperf']) - np.testing.assert_allclose(learn_rate['perf'], learn_rate['nonperf']) + class_distance = 4. 
+ X_a = np.array([[-1., 1], [-1., -1.], [1., 1.], [1., -1.]]) + X_b = X_a + np.array([class_distance, 0]) + X = np.concatenate((X_a, X_b)) + y = np.array([0, 0, 0, 0, 1, 1, 1, 1]) + lmnn = LMNN(n_neighbors=1) + lmnn.n_neighbors_ = 1 + classes = np.unique(y) + target_neighbors = lmnn._select_target_neighbors(X, y, classes) + diffs = X - X[target_neighbors[:, 0]] + dist_tn = np.einsum('ij,ij->i', diffs, diffs) + dist_tn = dist_tn[:, None] + dist_tn += 1 + margin_radii = dist_tn[:, -1] + impostors_graph = lmnn._find_impostors(X, y, classes, margin_radii) + loss, grad, _ = _push_loss_grad(X, target_neighbors, dist_tn, + impostors_graph) + + # The loss should be 4. (1. for each of the 4 violation) + assert loss == 4. @pytest.mark.parametrize('X, y, loss', [(np.array([[0], [1], [2], [3]]), - [1, 1, 0, 0], 3.0), + [1, 1, 0, 0], 6.0), (np.array([[0], [1], [2], [3]]), - [1, 0, 0, 1], 26.)]) + [1, 0, 0, 1], 256.)]) def test_toy_ex_lmnn(X, y, loss): """Test that the loss give the right result on a toy example""" L = np.array([[1]]) - lmnn = LMNN(k=1, regularization=0.5) + lmnn = LMNN(n_neighbors=1, push_loss_weight=0.5) - k = lmnn.k - reg = lmnn.regularization + lmnn.n_neighbors_ = lmnn.n_neighbors - X, y = lmnn._prepare_inputs(X, y, dtype=float, - ensure_min_samples=2) + X, y = lmnn._prepare_inputs(X, y, dtype=float, ensure_min_samples=2) num_pts, n_components = X.shape unique_labels, label_inds = np.unique(y, return_inverse=True) - lmnn.labels_ = np.arange(len(unique_labels)) + classes = np.arange(len(unique_labels)) lmnn.components_ = np.eye(n_components) - target_neighbors = lmnn._select_targets(X, label_inds) + target_neighbors = lmnn._select_target_neighbors(X, label_inds, classes) # sum outer products - dfG = _sum_outer_products(X, target_neighbors.flatten(), - np.repeat(np.arange(X.shape[0]), k)) - - # storage - a1 = [None] * k - a2 = [None] * k - for nn_idx in range(k): - a1[nn_idx] = np.array([]) - a2[nn_idx] = np.array([]) + tn_graph = _make_knn_graph(target_neighbors) + const_grad = _sum_weighted_outer_products(X, tn_graph) # assert that the loss equals the one computed by hand - assert lmnn._loss_grad(X, L.reshape(-1, X.shape[1]), dfG, k, - reg, target_neighbors, label_inds)[1] == loss + lmnn.n_iter_ = 0 + predicted_loss = lmnn._loss_grad_lbfgs(L, X, y, classes, target_neighbors, + const_grad)[0] + assert predicted_loss == loss def test_convergence_simple_example(capsys): @@ -592,28 +562,7 @@ def test_convergence_simple_example(capsys): lmnn = LMNN(verbose=True) lmnn.fit(X, y) out, _ = capsys.readouterr() - assert "LMNN converged with objective" in out - - -def test_no_twice_same_objective(capsys): - # test that the objective function never has twice the same value - # see https://github.com/scikit-learn-contrib/metric-learn/issues/88 - X, y = make_classification(random_state=0) - lmnn = LMNN(verbose=True) - lmnn.fit(X, y) - out, _ = capsys.readouterr() - lines = re.split("\n+", out) - # we get only objectives from each line: - # the regexp matches a float that follows an integer (the iteration - # number), and which is followed by a (signed) float (delta obj). 
It - # matches for instance: - # 3 **1113.7665747189938** -3.182774197440267 46431.0200999999999998e-06 - objectives = [re.search(r"\d* (?:(\d*.\d*))[ | -]\d*.\d*", s) - for s in lines] - objectives = [match.group(1) for match in objectives if match is not None] - # we remove the last element because it can be equal to the penultimate - # if the last gradient update is null - assert len(objectives[:-1]) == len(set(objectives[:-1])) + assert "LMNN did not converge" not in out class TestSDML(MetricTestCase): diff --git a/test/test_base_metric.py b/test/test_base_metric.py index 67f9b6a0..2d8fd69e 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -36,15 +36,17 @@ def test_covariance(self): remove_spaces(f"Covariance({merged_kwargs})")) def test_lmnn(self): - def_kwargs = {'convergence_tol': 0.001, 'init': 'auto', 'k': 3, - 'learn_rate': 1e-07, 'max_iter': 1000, 'min_iter': 50, - 'n_components': None, 'preprocessor': None, - 'random_state': None, 'regularization': 0.5, - 'verbose': False} - nndef_kwargs = {'convergence_tol': 0.01, 'k': 6} + def_kwargs = {'n_neighbors': 3, 'k': 'deprecated', 'n_components': None, + 'init': 'auto', 'max_impostors': 500_000, + 'neighbors_params': None, 'push_loss_weight': 0.5, + 'regularization': 'deprecated', 'max_iter': 50, 'tol': 1e-5, + 'preprocessor': None, 'verbose': False, + 'random_state': None} + def_kwargs = {k: def_kwargs[k] for k in sorted(def_kwargs.keys())} + nndef_kwargs = {'n_neighbors': 6, 'tol': 0.01} merged_kwargs = sk_repr_kwargs(def_kwargs, nndef_kwargs) self.assertEqual( - remove_spaces(str(metric_learn.LMNN(convergence_tol=0.01, k=6))), + remove_spaces(str(metric_learn.LMNN(n_neighbors=6, tol=0.01))), remove_spaces(f"LMNN({merged_kwargs})")) def test_nca(self): diff --git a/test/test_components_metric_conversion.py b/test/test_components_metric_conversion.py index 5502ad90..f07e31a7 100644 --- a/test/test_components_metric_conversion.py +++ b/test/test_components_metric_conversion.py @@ -42,7 +42,7 @@ def test_itml_supervised(self): assert_array_almost_equal(L.T.dot(L), itml.get_mahalanobis_matrix()) def test_lmnn(self): - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, verbose=False) lmnn.fit(self.X, self.y) L = lmnn.components_ assert_array_almost_equal(L.T.dot(L), lmnn.get_mahalanobis_matrix()) diff --git a/test/test_fit_transform.py b/test/test_fit_transform.py index d4d4bfe0..609db3c9 100644 --- a/test/test_fit_transform.py +++ b/test/test_fit_transform.py @@ -52,11 +52,11 @@ def test_itml_supervised(self): assert_array_almost_equal(res_1, res_2) def test_lmnn(self): - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, verbose=False) lmnn.fit(self.X, self.y) res_1 = lmnn.transform(self.X) - lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) + lmnn = LMNN(n_neighbors=5, verbose=False) res_2 = lmnn.fit_transform(self.X, self.y) assert_array_almost_equal(res_1, res_2) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index 84058b32..5a533eab 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -417,7 +417,7 @@ def test_auto_init_transformation(n_samples, n_features, n_classes, random_state=rng) # To make the test work for LMNN: if 'LMNN' in model_base.__class__.__name__: - model_base.set_params(k=1) + model_base.set_params(n_neighbors=1) # To make the test faster for estimators that have a max_iter: if hasattr(model_base, 'max_iter'): model_base.set_params(max_iter=1)