diff --git a/metric_learn/_util.py b/metric_learn/_util.py index fa196a69..77e8d9fa 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -448,45 +448,45 @@ def _initialize_components(n_components, input, y=None, init='auto', The input labels (or not if there are no labels). init : string or numpy array, optional (default='auto') - Initialization of the linear transformation. Possible options are - 'auto', 'pca', 'lda', 'identity', 'random', and a numpy array of shape - (n_features_a, n_features_b). - - 'auto' - Depending on ``n_components``, the most reasonable initialization - will be chosen. If ``n_components <= n_classes`` we use 'lda' (see - the description of 'lda' init), as it uses labels information. If - not, but ``n_components < min(n_features, n_samples)``, we use 'pca', - as it projects data onto meaningful directions (those of higher - variance). Otherwise, we just use 'identity'. - - 'pca' - ``n_components`` principal components of the inputs passed - to :meth:`fit` will be used to initialize the transformation. - (See `sklearn.decomposition.PCA`) - - 'lda' - ``min(n_components, n_classes)`` most discriminative - components of the inputs passed to :meth:`fit` will be used to - initialize the transformation. (If ``n_components > n_classes``, - the rest of the components will be zero.) (See - `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`). - This initialization is possible only if `has_classes == True`. - - 'identity' - The identity matrix. If ``n_components`` is strictly smaller than the - dimensionality of the inputs passed to :meth:`fit`, the identity - matrix will be truncated to the first ``n_components`` rows. - - 'random' - The initial transformation will be a random array of shape - `(n_components, n_features)`. Each value is sampled from the - standard normal distribution. - - numpy array - n_features_b must match the dimensionality of the inputs passed to - :meth:`fit` and n_features_a must be less than or equal to that. - If ``n_components`` is not None, n_features_a must match it. + Initialization of the linear transformation. Possible options are + 'auto', 'pca', 'lda', 'identity', 'random', and a numpy array of shape + (n_features_a, n_features_b). + + 'auto' + Depending on ``n_components``, the most reasonable initialization + will be chosen. If ``n_components <= n_classes`` we use 'lda' (see + the description of 'lda' init), as it uses labels information. If + not, but ``n_components < min(n_features, n_samples)``, we use 'pca', + as it projects data onto meaningful directions (those of higher + variance). Otherwise, we just use 'identity'. + + 'pca' + ``n_components`` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See `sklearn.decomposition.PCA`) + + 'lda' + ``min(n_components, n_classes)`` most discriminative + components of the inputs passed to :meth:`fit` will be used to + initialize the transformation. (If ``n_components > n_classes``, + the rest of the components will be zero.) (See + `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`). + This initialization is possible only if `has_classes == True`. + + 'identity' + The identity matrix. If ``n_components`` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first ``n_components`` rows. + + 'random' + The initial transformation will be a random array of shape + `(n_components, n_features)`. 
Each value is sampled from the + standard normal distribution. + + numpy array + n_features_b must match the dimensionality of the inputs passed to + :meth:`fit` and n_features_a must be less than or equal to that. + If ``n_components`` is not None, n_features_a must match it. verbose : bool Whether to print the details of the initialization or not. @@ -606,26 +606,26 @@ def _initialize_metric_mahalanobis(input, init='identity', random_state=None, The input samples (can be tuples or regular samples). init : string or numpy array, optional (default='identity') - Specification for the matrix to initialize. Possible options are - 'identity', 'covariance', 'random', and a numpy array of shape - (n_features, n_features). - - 'identity' - An identity matrix of shape (n_features, n_features). - - 'covariance' - The (pseudo-)inverse covariance matrix (raises an error if the - covariance matrix is not definite and `strict_pd == True`) - - 'random' - A random positive definite (PD) matrix of shape - `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. - - numpy array - A PSD matrix (or strictly PD if strict_pd==True) of - shape (n_features, n_features), that will be used as such to - initialize the metric, or set the prior. + Specification for the matrix to initialize. Possible options are + 'identity', 'covariance', 'random', and a numpy array of shape + (n_features, n_features). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The (pseudo-)inverse covariance matrix (raises an error if the + covariance matrix is not definite and `strict_pd == True`) + + 'random' + A random positive definite (PD) matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A PSD matrix (or strictly PD if strict_pd==True) of + shape (n_features, n_features), that will be used as such to + initialize the metric, or set the prior. random_state : int or `numpy.RandomState` or None, optional (default=None) A pseudo random number generator object or a seed for it if int. If diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index ee73c793..d19998ff 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -154,12 +154,12 @@ def transform(self, X): Parameters ---------- X : (n x d) matrix - Data to transform. + Data to transform. Returns ------- transformed : (n x d) matrix - Input data transformed to the metric space by :math:`XL^{\\top}` + Input data transformed to the metric space by :math:`XL^{\\top}` """ @@ -180,7 +180,7 @@ class MahalanobisMixin(six.with_metaclass(ABCMeta, BaseMetricLearner, Attributes ---------- components_ : `numpy.ndarray`, shape=(n_components, n_features) - The learned linear transformation ``L``. + The learned linear transformation ``L``. """ def score_pairs(self, pairs): @@ -313,9 +313,9 @@ class _PairsClassifierMixin(BaseMetricLearner): Attributes ---------- threshold_ : `float` - If the distance metric between two points is lower than this threshold, - points will be classified as similar, otherwise they will be - classified as dissimilar. + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. 
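+
+  Examples
+  --------
+  A rough usage sketch (not taken from the library's test suite): `ITML` is
+  one concrete pairs classifier, and the toy pairs below are arbitrary.
+
+  >>> import numpy as np
+  >>> from metric_learn import ITML
+  >>> pairs = np.array([[[1.2, 7.5], [1.3, 1.5]],
+  ...                   [[6.4, 2.6], [6.2, 9.7]],
+  ...                   [[1.3, 4.5], [3.2, 4.6]],
+  ...                   [[6.2, 5.5], [5.4, 5.4]]])
+  >>> y = np.array([-1, -1, 1, 1])  # 1: similar pair, -1: dissimilar pair
+  >>> itml = ITML()
+  >>> _ = itml.fit(pairs, y)  # fitting also calibrates threshold_
+  >>> preds = itml.predict(pairs)  # +1 where distance < threshold_, else -1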
""" _tuple_size = 2 # number of points in a tuple, 2 for pairs diff --git a/metric_learn/constraints.py b/metric_learn/constraints.py index 752ca6e0..36d77194 100644 --- a/metric_learn/constraints.py +++ b/metric_learn/constraints.py @@ -12,17 +12,60 @@ class Constraints(object): """ - Class to build constraints from labels. + Class to build constraints from labeled data. - See more in the :ref:`User Guide ` + See more in the :ref:`User Guide `. + + Parameters + ---------- + partial_labels : `numpy.ndarray` of ints, shape=(n_samples,) + Array of labels, with -1 indicating unknown label. + + Attributes + ---------- + partial_labels : `numpy.ndarray` of ints, shape=(n_samples,) + Array of labels, with -1 indicating unknown label. """ + def __init__(self, partial_labels): - '''partial_labels : int arraylike, -1 indicating unknown label''' partial_labels = np.asanyarray(partial_labels, dtype=int) self.partial_labels = partial_labels def positive_negative_pairs(self, num_constraints, same_length=False, random_state=None): + """ + Generates positive pairs and negative pairs from labeled data. + + Positive pairs are formed by randomly drawing ``num_constraints`` pairs of + points with the same label. Negative pairs are formed by randomly drawing + ``num_constraints`` pairs of points with different label. + + In the case where it is not possible to generate enough positive or + negative pairs, a smaller number of pairs will be returned with a warning. + + Parameters + ---------- + num_constraints : int + Number of positive and negative constraints to generate. + same_length : bool, optional (default=False) + If True, forces the number of positive and negative pairs to be + equal by ignoring some pairs from the larger set. + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. + Returns + ------- + a : array-like, shape=(n_constraints,) + 1D array of indicators for the left elements of positive pairs. + + b : array-like, shape=(n_constraints,) + 1D array of indicators for the right elements of positive pairs. + + c : array-like, shape=(n_constraints,) + 1D array of indicators for the left elements of negative pairs. + + d : array-like, shape=(n_constraints,) + 1D array of indicators for the right elements of negative pairs. + """ random_state = check_random_state(random_state) a, b = self._pairs(num_constraints, same_label=True, random_state=random_state) @@ -60,7 +103,30 @@ def _pairs(self, num_constraints, same_label=True, max_iter=10, def chunks(self, num_chunks=100, chunk_size=2, random_state=None): """ - the random state object to be passed must be a numpy random seed + Generates chunks from labeled data. + + Each of ``num_chunks`` chunks is composed of ``chunk_size`` points from + the same class drawn at random. Each point can belong to at most 1 chunk. + + In the case where there is not enough points to generate ``num_chunks`` + chunks of size ``chunk_size``, a ValueError will be raised. + + Parameters + ---------- + num_chunks : int, optional (default=100) + Number of chunks to generate. + + chunk_size : int, optional (default=2) + Number of points in each chunk. + + random_state : int or numpy.RandomState or None, optional (default=None) + A pseudo random number generator object or a seed for it if int. + + Returns + ------- + chunks : array-like, shape=(n_samples,) + 1D array of chunk indicators, where -1 indicates that the point does not + belong to any chunk. 
""" random_state = check_random_state(random_state) chunks = -np.ones_like(self.partial_labels, dtype=int) diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 50eb41a4..2094e160 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -126,75 +126,75 @@ class ITML(_BaseITML, _PairsClassifierMixin): Parameters ---------- - gamma : float, optional (default=1.) - Value for slack variables + gamma : float, optional (default=1.0) + Value for slack variables max_iter : int, optional (default=1000) - Maximum number of iteration of the optimization procedure. + Maximum number of iteration of the optimization procedure. convergence_threshold : float, optional (default=1e-3) - Convergence tolerance. + Convergence tolerance. prior : string or numpy array, optional (default='identity') - The Mahalanobis matrix to use as a prior. Possible options are - 'identity', 'covariance', 'random', and a numpy array of shape - (n_features, n_features). For ITML, the prior should be strictly - positive definite (PD). + The Mahalanobis matrix to use as a prior. Possible options are + 'identity', 'covariance', 'random', and a numpy array of shape + (n_features, n_features). For ITML, the prior should be strictly + positive definite (PD). - 'identity' - An identity matrix of shape (n_features, n_features). + 'identity' + An identity matrix of shape (n_features, n_features). - 'covariance' - The inverse covariance matrix. + 'covariance' + The inverse covariance matrix. - 'random' - The prior will be a random SPD matrix of shape - `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. + 'random' + The prior will be a random SPD matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. - numpy array - A positive definite (PD) matrix of shape - (n_features, n_features), that will be used as such to set the - prior. + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. A0 : Not used - .. deprecated:: 0.5.0 - `A0` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use 'prior' instead. + .. deprecated:: 0.5.0 + `A0` was deprecated in version 0.5.0 and will + be removed in 0.6.0. Use 'prior' instead. verbose : bool, optional (default=False) - If True, prints information while learning + If True, prints information while learning preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``prior='random'``, ``random_state`` is used to set the prior. + A pseudo random number generator object or a seed for it if int. If + ``prior='random'``, ``random_state`` is used to set the prior. Attributes ---------- bounds_ : `numpy.ndarray`, shape=(2,) - Bounds on similarity, aside slack variables, s.t. - ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` - and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of - dissimilar points ``c`` and ``d``, with ``d`` the learned distance. 
If - not provided at initialization, bounds_[0] and bounds_[1] are set at - train time to the 5th and 95th percentile of the pairwise distances among - all points present in the input `pairs`. + Bounds on similarity, aside slack variables, s.t. + ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` + and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of + dissimilar points ``c`` and ``d``, with ``d`` the learned distance. If + not provided at initialization, bounds_[0] and bounds_[1] are set at + train time to the 5th and 95th percentile of the pairwise distances among + all points present in the input `pairs`. n_iter_ : `int` - The number of iterations the solver has run. + The number of iterations the solver has run. components_ : `numpy.ndarray`, shape=(n_features, n_features) - The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `components_from_metric`.) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) threshold_ : `float` - If the distance metric between two points is lower than this threshold, - points will be classified as similar, otherwise they will be - classified as dissimilar. + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. Examples -------- @@ -226,28 +226,28 @@ def fit(self, pairs, y, bounds=None, calibration_params=None): ---------- pairs: array-like, shape=(n_constraints, 2, n_features) or \ (n_constraints, 2) - 3D Array of pairs with each row corresponding to two points, - or 2D array of indices of pairs if the metric learner uses a - preprocessor. + 3D Array of pairs with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. y: array-like, of shape (n_constraints,) - Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. + Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. bounds : array-like of two numbers - Bounds on similarity, aside slack variables, s.t. - ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` - and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of - dissimilar points ``c`` and ``d``, with ``d`` the learned distance. - If not provided at initialization, bounds_[0] and bounds_[1] will be - set to the 5th and 95th percentile of the pairwise distances among all - points present in the input `pairs`. + Bounds on similarity, aside slack variables, s.t. + ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` + and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of + dissimilar points ``c`` and ``d``, with ``d`` the learned distance. + If not provided at initialization, bounds_[0] and bounds_[1] will be + set to the 5th and 95th percentile of the pairwise distances among all + points present in the input `pairs`. calibration_params : `dict` or `None` - Dictionary of parameters to give to `calibrate_threshold` for the - threshold calibration step done at the end of `fit`. If `None` is - given, `calibrate_threshold` will use the default parameters. + Dictionary of parameters to give to `calibrate_threshold` for the + threshold calibration step done at the end of `fit`. If `None` is + given, `calibrate_threshold` will use the default parameters. Returns ------- self : object - Returns the instance. + Returns the instance. 
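+
+    Examples
+    --------
+    A minimal sketch with arbitrary toy pairs; here explicit `bounds` are
+    passed instead of the percentile-based defaults:
+
+    >>> import numpy as np
+    >>> from metric_learn import ITML
+    >>> pairs = np.array([[[1.1, 2.0], [1.0, 2.1]],
+    ...                   [[5.0, 1.0], [1.0, 5.0]]])
+    >>> y = np.array([1, -1])  # 1: similar pair, -1: dissimilar pair
+    >>> itml = ITML()
+    >>> _ = itml.fit(pairs, y, bounds=(0.2, 2.0))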
""" calibration_params = (calibration_params if calibration_params is not None else dict()) @@ -266,77 +266,88 @@ class ITML_Supervised(_BaseITML, TransformerMixin): Parameters ---------- - gamma : float, optional - value for slack variables - max_iter : int, optional - convergence_threshold : float, optional + gamma : float, optional (default=1.0) + Value for slack variables + + max_iter : int, optional (default=1000) + Maximum number of iterations of the optimization procedure. + + convergence_threshold : float, optional (default=1e-3) + Tolerance of the optimization procedure. + num_labeled : Not used - .. deprecated:: 0.5.0 - `num_labeled` was deprecated in version 0.5.0 and will - be removed in 0.6.0. - num_constraints: int, optional - number of constraints to generate - (`20 * num_classes**2` constraints by default) + .. deprecated:: 0.5.0 + `num_labeled` was deprecated in version 0.5.0 and will + be removed in 0.6.0. + + num_constraints: int, optional (default=None) + Number of constraints to generate. If None, default to `20 * + num_classes**2`. + bounds : Not used - .. deprecated:: 0.5.0 - `bounds` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Set `bounds` at fit time instead : - `itml_supervised.fit(X, y, bounds=...)` + .. deprecated:: 0.5.0 + `bounds` was deprecated in version 0.5.0 and will + be removed in 0.6.0. Set `bounds` at fit time instead : + `itml_supervised.fit(X, y, bounds=...)` prior : string or numpy array, optional (default='identity') - Initialization of the Mahalanobis matrix. Possible options are - 'identity', 'covariance', 'random', and a numpy array of shape - (n_features, n_features). For ITML, the prior should be strictly - positive definite (PD). - - 'identity' - An identity matrix of shape (n_features, n_features). - - 'covariance' - The inverse covariance matrix. - - 'random' - The prior will be a random SPD matrix of shape - `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. - - numpy array - A positive definite (PD) matrix of shape - (n_features, n_features), that will be used as such to set the - prior. + Initialization of the Mahalanobis matrix. Possible options are + 'identity', 'covariance', 'random', and a numpy array of shape + (n_features, n_features). For ITML, the prior should be strictly + positive definite (PD). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The prior will be a random SPD matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + A0 : Not used .. deprecated:: 0.5.0 - `A0` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use 'prior' instead. - verbose : bool, optional - if True, prints information while learning + `A0` was deprecated in version 0.5.0 and will + be removed in 0.6.0. Use 'prior' instead. + + verbose : bool, optional (default=False) + If True, prints information while learning + preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. 
+ random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``prior='random'``, ``random_state`` is used to set the prior. In any - case, `random_state` is also used to randomly sample constraints from - labels. + A pseudo random number generator object or a seed for it if int. If + ``prior='random'``, ``random_state`` is used to set the prior. In any + case, `random_state` is also used to randomly sample constraints from + labels. Attributes ---------- bounds_ : `numpy.ndarray`, shape=(2,) - Bounds on similarity, aside slack variables, s.t. - ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` - and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of - dissimilar points ``c`` and ``d``, with ``d`` the learned distance. - If not provided at initialization, bounds_[0] and bounds_[1] are set at - train time to the 5th and 95th percentile of the pairwise distances - among all points in the training data `X`. + Bounds on similarity, aside slack variables, s.t. + ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` + and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of + dissimilar points ``c`` and ``d``, with ``d`` the learned distance. + If not provided at initialization, bounds_[0] and bounds_[1] are set at + train time to the 5th and 95th percentile of the pairwise distances + among all points in the training data `X`. n_iter_ : `int` - The number of iterations the solver has run. + The number of iterations the solver has run. components_ : `numpy.ndarray`, shape=(n_features, n_features) - The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `components_from_metric`.) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) Examples -------- @@ -355,7 +366,7 @@ class ITML_Supervised(_BaseITML, TransformerMixin): that describes the supervised version of weakly supervised estimators. """ - def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, + def __init__(self, gamma=1.0, max_iter=1000, convergence_threshold=1e-3, num_labeled='deprecated', num_constraints=None, bounds='deprecated', prior='identity', A0='deprecated', verbose=False, preprocessor=None, random_state=None): @@ -374,10 +385,10 @@ def fit(self, X, y, random_state='deprecated', bounds=None): Parameters ---------- X : (n x d) matrix - Input data, where each row corresponds to a single instance. + Input data, where each row corresponds to a single instance. y : (n) array-like - Data labels. + Data labels. random_state : Not used .. deprecated:: 0.5.0 @@ -386,13 +397,13 @@ def fit(self, X, y, random_state='deprecated', bounds=None): instead (when instantiating a new `ITML_Supervised` object). bounds : array-like of two numbers - Bounds on similarity, aside slack variables, s.t. - ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` - and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of - dissimilar points ``c`` and ``d``, with ``d`` the learned distance. - If not provided at initialization, bounds_[0] and bounds_[1] will be - set to the 5th and 95th percentile of the pairwise distances among all - points in the training data `X`. + Bounds on similarity, aside slack variables, s.t. 
+ ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a`` + and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of + dissimilar points ``c`` and ``d``, with ``d`` the learned distance. + If not provided at initialization, bounds_[0] and bounds_[1] will be + set to the 5th and 95th percentile of the pairwise distances among all + points in the training data `X`. """ # TODO: remove these in v0.6.0 if self.num_labeled != 'deprecated': diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index 99e7c978..12617a94 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -27,27 +27,26 @@ class LFDA(MahalanobisMixin, TransformerMixin): Parameters ---------- n_components : int or None, optional (default=None) - Dimensionality of reduced space (if None, defaults to dimension of X). + Dimensionality of reduced space (if None, defaults to dimension of X). num_dims : Not used + .. deprecated:: 0.5.0 + `num_dims` was deprecated in version 0.5.0 and will + be removed in 0.6.0. Use `n_components` instead. - .. deprecated:: 0.5.0 - `num_dims` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use `n_components` instead. + k : int, optional (default=None) + Number of nearest neighbors used in local scaling method. If None, + defaults to min(7, n_features - 1). - k : int, optional - Number of nearest neighbors used in local scaling method. - Defaults to min(7, n_components - 1). - - embedding_type : str, optional - Type of metric in the embedding space (default: 'weighted') - 'weighted' - weighted eigenvectors - 'orthonormalized' - orthonormalized - 'plain' - raw eigenvectors + embedding_type : str, optional (default='weighted') + Type of metric in the embedding space. + 'weighted' - weighted eigenvectors + 'orthonormalized' - orthonormalized + 'plain' - raw eigenvectors preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. Attributes ---------- diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index a1b5a42f..df8fe649 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -28,101 +28,100 @@ class LMNN(MahalanobisMixin, TransformerMixin): Parameters ---------- init : None, string or numpy array, optional (default=None) - Initialization of the linear transformation. Possible options are - 'auto', 'pca', 'identity', 'random', and a numpy array of shape - (n_features_a, n_features_b). If None, will be set automatically to - 'auto' (this option is to raise a warning if 'init' is not set, - and stays to its default value None, in v0.5.0). - - 'auto' - Depending on ``n_components``, the most reasonable initialization - will be chosen. If ``n_components <= n_classes`` we use 'lda', as - it uses labels information. If not, but - ``n_components < min(n_features, n_samples)``, we use 'pca', as - it projects data in meaningful directions (those of higher - variance). Otherwise, we just use 'identity'. - - 'pca' - ``n_components`` principal components of the inputs passed - to :meth:`fit` will be used to initialize the transformation. - (See `sklearn.decomposition.PCA`) - - 'lda' - ``min(n_components, n_classes)`` most discriminative - components of the inputs passed to :meth:`fit` will be used to - initialize the transformation. (If ``n_components > n_classes``, - the rest of the components will be zero.)
(See - `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) - - 'identity' - If ``n_components`` is strictly smaller than the - dimensionality of the inputs passed to :meth:`fit`, the identity - matrix will be truncated to the first ``n_components`` rows. - - 'random' - The initial transformation will be a random array of shape - `(n_components, n_features)`. Each value is sampled from the - standard normal distribution. - - numpy array - n_features_b must match the dimensionality of the inputs passed to - :meth:`fit` and n_features_a must be less than or equal to that. - If ``n_components`` is not None, n_features_a must match it. - - k : int, optional - Number of neighbors to consider, not including self-edges. + Initialization of the linear transformation. Possible options are + 'auto', 'pca', 'identity', 'random', and a numpy array of shape + (n_features_a, n_features_b). If None, will be set automatically to + 'auto' (this option is to raise a warning if 'init' is not set, and + stays to its default value None, in v0.5.0). + + 'auto' + Depending on ``n_components``, the most reasonable initialization + will be chosen. If ``n_components <= n_classes`` we use 'lda', as + it uses labels information. If not, but + ``n_components < min(n_features, n_samples)``, we use 'pca', as + it projects data in meaningful directions (those of higher + variance). Otherwise, we just use 'identity'. + + 'pca' + ``n_components`` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See `sklearn.decomposition.PCA`) + + 'lda' + ``min(n_components, n_classes)`` most discriminative + components of the inputs passed to :meth:`fit` will be used to + initialize the transformation. (If ``n_components > n_classes``, + the rest of the components will be zero.) (See + `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) + + 'identity' + If ``n_components`` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first ``n_components`` rows. + + 'random' + The initial transformation will be a random array of shape + `(n_components, n_features)`. Each value is sampled from the + standard normal distribution. + + numpy array + n_features_b must match the dimensionality of the inputs passed to + :meth:`fit` and n_features_a must be less than or equal to that. + If ``n_components`` is not None, n_features_a must match it. + + k : int, optional (default=3) + Number of neighbors to consider, not including self-edges. min_iter : int, optional (default=50) - Minimum number of iterations of the optimization procedure. + Minimum number of iterations of the optimization procedure. max_iter : int, optional (default=1000) - Maximum number of iterations of the optimization procedure. + Maximum number of iterations of the optimization procedure. learn_rate : float, optional (default=1e-7) - Learning rate of the optimization procedure + Learning rate of the optimization procedure tol : float, optional (default=0.001) - Tolerance of the optimization procedure. If the objective value varies - less than `tol`, we consider the algorithm has converged and stop it. + Tolerance of the optimization procedure. If the objective value varies + less than `tol`, we consider the algorithm has converged and stop it. use_pca : Not used - - .. deprecated:: 0.5.0 - `use_pca` was deprecated in version 0.5.0 and will - be removed in 0.6.0. + .. 
deprecated:: 0.5.0 + `use_pca` was deprecated in version 0.5.0 and will + be removed in 0.6.0. verbose : bool, optional (default=False) - Whether to print the progress of the optimization procedure. + Whether to print the progress of the optimization procedure. - regularization: float, optional - Weighting of pull and push terms, with 0.5 meaning equal weight. + regularization: float, optional (default=0.5) + Relative weight between pull and push terms, with 0.5 meaning equal + weight. preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. n_components : int or None, optional (default=None) - Dimensionality of reduced space (if None, defaults to dimension of X). + Dimensionality of reduced space (if None, defaults to dimension of X). num_dims : Not used - - .. deprecated:: 0.5.0 - `num_dims` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use `n_components` instead. + .. deprecated:: 0.5.0 + `num_dims` was deprecated in version 0.5.0 and will + be removed in 0.6.0. Use `n_components` instead. random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``init='random'``, ``random_state`` is used to initialize the random - transformation. If ``init='pca'``, ``random_state`` is passed as an - argument to PCA when initializing the transformation. + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. If ``init='pca'``, ``random_state`` is passed as an + argument to PCA when initializing the transformation. Attributes ---------- n_iter_ : `int` - The number of iterations the solver has run. + The number of iterations the solver has run. components_ : `numpy.ndarray`, shape=(n_components, n_features) - The learned linear transformation ``L``. + The learned linear transformation ``L``. Examples -------- diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 7989d0b9..c4cdca97 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -140,49 +140,55 @@ class LSML(_BaseLSML, _QuadrupletsClassifierMixin): Parameters ---------- prior : None, string or numpy array, optional (default=None) - Prior to set for the metric. Possible options are - 'identity', 'covariance', 'random', and a numpy array of - shape (n_features, n_features). For LSML, the prior should be strictly - positive definite (PD). If `None`, will be set - automatically to 'identity' (this is to raise a warning if - `prior` is not set, and stays to its default value (None), in v0.5.0). - - 'identity' - An identity matrix of shape (n_features, n_features). - - 'covariance' - The inverse covariance matrix. - - 'random' - The initial Mahalanobis matrix will be a random positive definite - (PD) matrix of shape `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. - - numpy array - A positive definite (PD) matrix of shape - (n_features, n_features), that will be used as such to set the - prior. - - tol : float, optional - max_iter : int, optional - verbose : bool, optional - if True, prints information while learning + Prior to set for the metric. 
Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). For LSML, the prior should be strictly + positive definite (PD). If `None`, will be set + automatically to 'identity' (this is to raise a warning if + `prior` is not set, and stays to its default value (None), in v0.5.0). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The initial Mahalanobis matrix will be a random positive definite + (PD) matrix of shape `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + + tol : float, optional (default=1e-3) + Convergence tolerance of the optimization procedure. + + max_iter : int, optional (default=1000) + Maximum number of iterations of the optimization procedure. + + verbose : bool, optional (default=False) + If True, prints information while learning. + preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``init='random'``, ``random_state`` is used to set the random - prior. + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to set the random + prior. Attributes ---------- n_iter_ : `int` - The number of iterations the solver has run. + The number of iterations the solver has run. components_ : `numpy.ndarray`, shape=(n_features, n_features) - The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `components_from_metric`.) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) Examples -------- @@ -219,18 +225,19 @@ def fit(self, quadruplets, weights=None): ---------- quadruplets : array-like, shape=(n_constraints, 4, n_features) or \ (n_constraints, 4) - 3D array-like of quadruplets of points or 2D array of quadruplets of - indicators. In order to supervise the algorithm in the right way, we - should have the four samples ordered in a way such that: - d(pairs[i, 0],X[i, 1]) < d(X[i, 2], X[i, 3]) for all 0 <= i < - n_constraints. + 3D array-like of quadruplets of points or 2D array of quadruplets of + indices. In order to supervise the algorithm in the right way, we + should have the four samples ordered in a way such that: + d(X[i, 0], X[i, 1]) < d(X[i, 2], X[i, 3]) for all 0 <= i < + n_constraints. + weights : (n_constraints,) array of floats, optional - scale factor for each constraint + Scale factor for each constraint. Returns ------- self : object - Returns the instance. + Returns the instance. """ return self._fit(quadruplets, weights=weights) @@ -246,51 +253,60 @@ class LSML_Supervised(_BaseLSML, TransformerMixin): Parameters ---------- tol : float, optional (default=1e-3) - Tolerance for the convergence procedure. + Convergence tolerance of the optimization procedure. + max_iter : int, optional (default=1000) - Number of maximum iterations of the convergence procedure.
+ Maximum number of iterations of the optimization procedure. + + prior : None, string or numpy array, optional (default=None) - Prior to set for the metric. Possible options are - 'identity', 'covariance', 'random', and a numpy array of - shape (n_features, n_features). For LSML, the prior should be strictly - positive definite (PD). If `None`, will be set - automatically to 'identity' (this is to raise a warning if - `prior` is not set, and stays to its default value (None), in v0.5.0). - - 'identity' - An identity matrix of shape (n_features, n_features). - - 'covariance' - The inverse covariance matrix. - - 'random' - The initial Mahalanobis matrix will be a random positive definite - (PD) matrix of shape `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. - - numpy array - A positive definite (PD) matrix of shape - (n_features, n_features), that will be used as such to set the - prior. + Prior to set for the metric. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). For LSML, the prior should be strictly + positive definite (PD). If `None`, will be set + automatically to 'identity' (this is to raise a warning if + `prior` is not set, and stays to its default value (None), in v0.5.0). + + 'identity' + An identity matrix of shape (n_features, n_features). + + 'covariance' + The inverse covariance matrix. + + 'random' + The initial Mahalanobis matrix will be a random positive definite + (PD) matrix of shape `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. + + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. + num_labeled : Not used .. deprecated:: 0.5.0 - `num_labeled` was deprecated in version 0.5.0 and will - be removed in 0.6.0. - num_constraints: int, optional - number of constraints to generate - (`20 * num_classes**2` constraints by default) - weights : (m,) array of floats, optional - scale factor for each constraint - verbose : bool, optional - if True, prints information while learning + `num_labeled` was deprecated in version 0.5.0 and will + be removed in 0.6.0. + + num_constraints : int, optional (default=None) + Number of constraints to generate. If None, defaults to `20 * + num_classes**2`. + + weights : (num_constraints,) array of floats, optional (default=None) + Relative weight given to each constraint. If None, defaults to uniform + weights. + + verbose : bool, optional (default=False) + If True, prints information while learning. + preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``init='random'``, ``random_state`` is used to set the random - prior. In any case, `random_state` is also used to randomly sample - constraints from labels. + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to set the random + prior. In any case, `random_state` is also used to randomly sample + constraints from labels.
Examples -------- @@ -305,11 +321,11 @@ class LSML_Supervised(_BaseLSML, TransformerMixin): Attributes ---------- n_iter_ : `int` - The number of iterations the solver has run. + The number of iterations the solver has run. components_ : `numpy.ndarray`, shape=(n_features, n_features) - The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `components_from_metric`.) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) """ def __init__(self, tol=1e-3, max_iter=1000, prior=None, @@ -328,10 +344,10 @@ def fit(self, X, y, random_state='deprecated'): Parameters ---------- X : (n x d) matrix - Input data, where each row corresponds to a single instance. + Input data, where each row corresponds to a single instance. y : (n) array-like - Data labels. + Data labels. random_state : Not used .. deprecated:: 0.5.0 diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index 471694b6..3199b518 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -33,78 +33,77 @@ class MLKR(MahalanobisMixin, TransformerMixin): Parameters ---------- n_components : int or None, optional (default=None) - Dimensionality of reduced space (if None, defaults to dimension of X). + Dimensionality of reduced space (if None, defaults to dimension of X). num_dims : Not used - - .. deprecated:: 0.5.0 - `num_dims` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use `n_components` instead. + .. deprecated:: 0.5.0 + `num_dims` was deprecated in version 0.5.0 and will + be removed in 0.6.0. Use `n_components` instead. init : None, string or numpy array, optional (default=None) - Initialization of the linear transformation. Possible options are - 'auto', 'pca', 'identity', 'random', and a numpy array of shape - (n_features_a, n_features_b). If None, will be set automatically to - 'auto' (this option is to raise a warning if 'init' is not set, - and stays to its default value None, in v0.5.0). - - 'auto' - Depending on ``n_components``, the most reasonable initialization - will be chosen. If ``n_components < min(n_features, n_samples)``, - we use 'pca', as it projects data in meaningful directions (those - of higher variance). Otherwise, we just use 'identity'. - - 'pca' - ``n_components`` principal components of the inputs passed - to :meth:`fit` will be used to initialize the transformation. - (See `sklearn.decomposition.PCA`) - - 'identity' - If ``n_components`` is strictly smaller than the - dimensionality of the inputs passed to :meth:`fit`, the identity - matrix will be truncated to the first ``n_components`` rows. - - 'random' - The initial transformation will be a random array of shape - `(n_components, n_features)`. Each value is sampled from the - standard normal distribution. - - numpy array - n_features_b must match the dimensionality of the inputs passed to - :meth:`fit` and n_features_a must be less than or equal to that. - If ``n_components`` is not None, n_features_a must match it. + Initialization of the linear transformation. Possible options are + 'auto', 'pca', 'identity', 'random', and a numpy array of shape + (n_features_a, n_features_b). If None, will be set automatically to + 'auto' (this option is to raise a warning if 'init' is not set, + and stays to its default value None, in v0.5.0). + + 'auto' + Depending on ``n_components``, the most reasonable initialization + will be chosen. 
If ``n_components < min(n_features, n_samples)``, + we use 'pca', as it projects data in meaningful directions (those + of higher variance). Otherwise, we just use 'identity'. + + 'pca' + ``n_components`` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See `sklearn.decomposition.PCA`) + + 'identity' + If ``n_components`` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first ``n_components`` rows. + + 'random' + The initial transformation will be a random array of shape + `(n_components, n_features)`. Each value is sampled from the + standard normal distribution. + + numpy array + n_features_b must match the dimensionality of the inputs passed to + :meth:`fit` and n_features_a must be less than or equal to that. + If ``n_components`` is not None, n_features_a must match it. A0: Not used. - .. deprecated:: 0.5.0 - `A0` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use 'init' instead. + .. deprecated:: 0.5.0 + `A0` was deprecated in version 0.5.0 and will + be removed in 0.6.0. Use 'init' instead. tol: float, optional (default=None) - Convergence tolerance for the optimization. + Convergence tolerance for the optimization. - max_iter: int, optional - Cap on number of conjugate gradient iterations. + max_iter: int, optional (default=1000) + Cap on number of conjugate gradient iterations. verbose : bool, optional (default=False) - Whether to print progress messages or not. + Whether to print progress messages or not. preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``init='random'``, ``random_state`` is used to initialize the random - transformation. If ``init='pca'``, ``random_state`` is passed as an - argument to PCA when initializing the transformation. + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. If ``init='pca'``, ``random_state`` is passed as an + argument to PCA when initializing the transformation. Attributes ---------- n_iter_ : `int` - The number of iterations the solver has run. + The number of iterations the solver has run. components_ : `numpy.ndarray`, shape=(n_components, n_features) - The learned linear transformation ``L``. + The learned linear transformation ``L``. Examples -------- diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 981bec48..3769497e 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -70,10 +70,10 @@ def _fit_full(self, pairs, y): Parameters ---------- X : (n x d) data matrix - each row corresponds to a single instance + Each row corresponds to a single instance. constraints : 4-tuple of arrays - (a,b,c,d) indices into X, with (a,b) specifying similar and (c,d) - dissimilar pairs + (a,b,c,d) indices into X, with (a,b) specifying similar and (c,d) + dissimilar pairs. 
""" num_dim = pairs.shape[2] @@ -195,10 +195,10 @@ def _fit_diag(self, pairs, y): Parameters ---------- X : (n x d) data matrix - each row corresponds to a single instance + Each row corresponds to a single instance. constraints : 4-tuple of arrays - (a,b,c,d) indices into X, with (a,b) specifying similar and (c,d) - dissimilar pairs + (a,b,c,d) indices into X, with (a,b) specifying similar and (c,d) + dissimilar pairs. """ num_dim = pairs.shape[2] pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] @@ -352,77 +352,80 @@ class MMC(_BaseMMC, _PairsClassifierMixin): Parameters ---------- max_iter : int, optional (default=100) - Maximum number of iterations of the convergence procedure. + Maximum number of iterations of the optimization procedure. max_proj : int, optional (default=10000) - Maximum number of projection steps. + Maximum number of projection steps. - convergence_threshold : float, optional (default=1e-6) - Convergence threshold for the convergence procedure. + convergence_threshold : float, optional (default=1e-3) + Convergence threshold for the optimization procedure. init : None, string or numpy array, optional (default=None) - Initialization of the Mahalanobis matrix. Possible options are - 'identity', 'covariance', 'random', and a numpy array of - shape (n_features, n_features). If None, will be set - automatically to 'identity' (this is to raise a warning if - 'init' is not set, and stays to its default value (None), in v0.5.0). + Initialization of the Mahalanobis matrix. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). If None, will be set + automatically to 'identity' (this is to raise a warning if + 'init' is not set, and stays to its default value (None), in v0.5.0). - 'identity' - An identity matrix of shape (n_features, n_features). + 'identity' + An identity matrix of shape (n_features, n_features). - 'covariance' - The (pseudo-)inverse of the covariance matrix. + 'covariance' + The (pseudo-)inverse of the covariance matrix. - 'random' - The initial Mahalanobis matrix will be a random SPD matrix of - shape - `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. + 'random' + The initial Mahalanobis matrix will be a random SPD matrix of + shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. - numpy array - An SPD matrix of shape (n_features, n_features), that will - be used as such to initialize the metric. - - verbose : bool, optional - if True, prints information while learning + numpy array + An SPD matrix of shape (n_features, n_features), that will + be used as such to initialize the metric. preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be gotten like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be gotten like this: X[indices]. + A0 : Not used. - .. deprecated:: 0.5.0 - `A0` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use 'init' instead. - diagonal : bool, optional - if True, a diagonal metric will be learned, - i.e., a simple scaling of dimensions. The initialization will then - be the diagonal coefficients of the matrix given as 'init'. - diagonal_c : float, optional - weight of the dissimilarity constraint for diagonal - metric learning - verbose : bool, optional - if True, prints information while learning + .. 
deprecated:: 0.5.0 + `A0` was deprecated in version 0.5.0 and will + be removed in 0.6.0. Use 'init' instead. + + diagonal : bool, optional (default=False) + If True, a diagonal metric will be learned, + i.e., a simple scaling of dimensions. The initialization will then + be the diagonal coefficients of the matrix given as 'init'. + + diagonal_c : float, optional (default=1.0) + Weight of the dissimilarity constraint for diagonal + metric learning. Ignored if ``diagonal=False``. + + verbose : bool, optional (default=False) + If True, prints information while learning + preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be gotten like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be gotten like this: X[indices]. + random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``init='random'``, ``random_state`` is used to initialize the random - transformation. + A pseudo random number generator object or a seed for it if int. If + ``init='random'``, ``random_state`` is used to initialize the random + transformation. Attributes ---------- n_iter_ : `int` - The number of iterations the solver has run. + The number of iterations the solver has run. components_ : `numpy.ndarray`, shape=(n_features, n_features) - The linear transformation ``L`` deduced from the learned Mahalanobis - metric (See function `components_from_metric`.) + The linear transformation ``L`` deduced from the learned Mahalanobis + metric (See function `components_from_metric`.) threshold_ : `float` - If the distance metric between two points is lower than this threshold, - points will be classified as similar, otherwise they will be - classified as dissimilar. + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. Examples -------- @@ -461,19 +464,22 @@ def fit(self, pairs, y, calibration_params=None): ---------- pairs : array-like, shape=(n_constraints, 2, n_features) or \ (n_constraints, 2) - 3D Array of pairs with each row corresponding to two points, - or 2D array of indices of pairs if the metric learner uses a - preprocessor. + 3D Array of pairs with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. + y : array-like, of shape (n_constraints,) - Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. + Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. + calibration_params : `dict` or `None` - Dictionary of parameters to give to `calibrate_threshold` for the - threshold calibration step done at the end of `fit`. If `None` is - given, `calibrate_threshold` will use the default parameters. + Dictionary of parameters to give to `calibrate_threshold` for the + threshold calibration step done at the end of `fit`. If `None` is + given, `calibrate_threshold` will use the default parameters. + Returns ------- self : object - Returns the instance. + Returns the instance. 
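+
+    Examples
+    --------
+    A small sketch with arbitrary toy pairs; `calibration_params` is
+    forwarded to `calibrate_threshold`, here asking for the threshold that
+    maximizes accuracy on the training pairs:
+
+    >>> import numpy as np
+    >>> from metric_learn import MMC
+    >>> pairs = np.array([[[1.2, 7.5], [1.3, 1.5]],
+    ...                   [[6.4, 2.6], [6.2, 9.7]],
+    ...                   [[1.3, 4.5], [3.2, 4.6]],
+    ...                   [[6.2, 5.5], [5.4, 5.4]]])
+    >>> y = np.array([-1, -1, 1, 1])  # 1: similar pair, -1: dissimilar pair
+    >>> mmc = MMC(init='identity')
+    >>> _ = mmc.fit(pairs, y, calibration_params={'strategy': 'accuracy'})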
""" calibration_params = (calibration_params if calibration_params is not None else dict()) @@ -492,68 +498,76 @@ class MMC_Supervised(_BaseMMC, TransformerMixin): Parameters ---------- - max_iter : int, optional - max_proj : int, optional - convergence_threshold : float, optional + max_iter : int, optional (default=100) + Maximum number of iterations of the optimization procedure. + + max_proj : int, optional (default=10000) + Maximum number of projection steps. + + convergence_threshold : float, optional (default=1e-3) + Convergence threshold for the optimization procedure. + num_labeled : Not used .. deprecated:: 0.5.0 - `num_labeled` was deprecated in version 0.5.0 and will - be removed in 0.6.0. - num_constraints: int, optional - number of constraints to generate - (`20 * num_classes**2` constraints by default) - init : None, string or numpy array, optional (default=None) - Initialization of the Mahalanobis matrix. Possible options are - 'identity', 'covariance', 'random', and a numpy array of - shape (n_features, n_features). If None, will be set - automatically to 'identity' (this is to raise a warning if - 'init' is not set, and stays to its default value (None), in v0.5.0). + `num_labeled` was deprecated in version 0.5.0 and will + be removed in 0.6.0. - 'identity' - An identity matrix of shape (n_features, n_features). + num_constraints: int, optional (default=None) + Number of constraints to generate. If None, default to `20 * + num_classes**2`. - 'covariance' - The (pseudo-)inverse of the covariance matrix. + init : None, string or numpy array, optional (default=None) + Initialization of the Mahalanobis matrix. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). If None, will be set + automatically to 'identity' (this is to raise a warning if + 'init' is not set, and stays to its default value (None), in v0.5.0). + + 'identity' + An identity matrix of shape (n_features, n_features). - 'random' - The initial Mahalanobis matrix will be a random SPD matrix of - shape `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. + 'covariance' + The (pseudo-)inverse of the covariance matrix. - numpy array - A numpy array of shape (n_features, n_features), that will - be used as such to initialize the metric. + 'random' + The initial Mahalanobis matrix will be a random SPD matrix of + shape `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. - verbose : bool, optional - if True, prints information while learning + numpy array + A numpy array of shape (n_features, n_features), that will + be used as such to initialize the metric. preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be gotten like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be gotten like this: X[indices]. + A0 : Not used. - .. deprecated:: 0.5.0 - `A0` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use 'init' instead. - diagonal : bool, optional - if True, a diagonal metric will be learned, - i.e., a simple scaling of dimensions - diagonal_c : float, optional - weight of the dissimilarity constraint for diagonal - metric learning - verbose : bool, optional - if True, prints information while learning + .. deprecated:: 0.5.0 + `A0` was deprecated in version 0.5.0 and will + be removed in 0.6.0. Use 'init' instead. 
+
+  diagonal : bool, optional (default=False)
+    If True, a diagonal metric will be learned,
+    i.e., a simple scaling of dimensions. The initialization will then
+    be the diagonal coefficients of the matrix given as 'init'.
+
+  diagonal_c : float, optional (default=1.0)
+    Weight of the dissimilarity constraint for diagonal
+    metric learning. Ignored if ``diagonal=False``.
+
+  verbose : bool, optional (default=False)
+    If True, prints information while learning.
+
   preprocessor : array-like, shape=(n_samples, n_features) or callable
-    The preprocessor to call to get tuples from indices. If array-like,
-    tuples will be formed like this: X[indices].
-  random_state : int or numpy.RandomState or None, optional (default=None)
-    A pseudo random number generator object or a seed for it if int. If
-    ``init='random'``, ``random_state`` is used to initialize the random
-    Mahalanobis matrix. In any case, `random_state` is also used to
-    randomly sample constraints from labels.
+    The preprocessor to call to get tuples from indices. If array-like,
+    tuples will be formed like this: X[indices].

-  `MMC_Supervised` creates pairs of similar sample by taking same class
-  samples, and pairs of dissimilar samples by taking different class
-  samples. It then passes these pairs to `MMC` for training.
+  random_state : int or numpy.RandomState or None, optional (default=None)
+    A pseudo random number generator object or a seed for it if int. If
+    ``init='random'``, ``random_state`` is used to initialize the random
+    Mahalanobis matrix. In any case, `random_state` is also used to
+    randomly sample constraints from labels.

   Examples
   --------
@@ -568,11 +582,11 @@ class MMC_Supervised(_BaseMMC, TransformerMixin):
   Attributes
   ----------
   n_iter_ : `int`
-    The number of iterations the solver has run.
+    The number of iterations the solver has run.

   components_ : `numpy.ndarray`, shape=(n_features, n_features)
-    The linear transformation ``L`` deduced from the learned Mahalanobis
-    metric (See function `components_from_metric`.)
+    The linear transformation ``L`` deduced from the learned Mahalanobis
+    metric (See function `components_from_metric`.)
   """

   def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-6,
@@ -593,9 +607,11 @@ def fit(self, X, y, random_state='deprecated'):
     Parameters
     ----------
     X : (n x d) matrix
-      Input data, where each row corresponds to a single instance.
+      Input data, where each row corresponds to a single instance.
+
     y : (n) array-like
-      Data labels.
+      Data labels.
+
     random_state : Not used
       .. deprecated:: 0.5.0
        `random_state` in the `fit` function was deprecated in version 0.5.0
diff --git a/metric_learn/nca.py b/metric_learn/nca.py
index 03abdc41..983f1120 100644
--- a/metric_learn/nca.py
+++ b/metric_learn/nca.py
@@ -34,70 +34,69 @@ class NCA(MahalanobisMixin, TransformerMixin):
   Parameters
   ----------
   init : None, string or numpy array, optional (default=None)
-    Initialization of the linear transformation. Possible options are
-    'auto', 'pca', 'identity', 'random', and a numpy array of shape
-    (n_features_a, n_features_b). If None, will be set automatically to
-    'auto' (this option is to raise a warning if 'init' is not set,
-    and stays to its default value None, in v0.5.0).
-
-    'auto'
-      Depending on ``n_components``, the most reasonable initialization
-      will be chosen. If ``n_components <= n_classes`` we use 'lda', as
-      it uses labels information. If not, but
-      ``n_components < min(n_features, n_samples)``, we use 'pca', as
-      it projects data in meaningful directions (those of higher
-      variance). Otherwise, we just use 'identity'.
-
-    'pca'
-      ``n_components`` principal components of the inputs passed
-      to :meth:`fit` will be used to initialize the transformation.
-      (See `sklearn.decomposition.PCA`)
-
-    'lda'
-      ``min(n_components, n_classes)`` most discriminative
-      components of the inputs passed to :meth:`fit` will be used to
-      initialize the transformation. (If ``n_components > n_classes``,
-      the rest of the components will be zero.) (See
-      `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`)
-
-    'identity'
-      If ``n_components`` is strictly smaller than the
-      dimensionality of the inputs passed to :meth:`fit`, the identity
-      matrix will be truncated to the first ``n_components`` rows.
-
-    'random'
-      The initial transformation will be a random array of shape
-      `(n_components, n_features)`. Each value is sampled from the
-      standard normal distribution.
-
-    numpy array
-      n_features_b must match the dimensionality of the inputs passed to
-      :meth:`fit` and n_features_a must be less than or equal to that.
-      If ``n_components`` is not None, n_features_a must match it.
+    Initialization of the linear transformation. Possible options are
+    'auto', 'pca', 'lda', 'identity', 'random', and a numpy array of shape
+    (n_features_a, n_features_b). If None, will be set automatically to
+    'auto' (this option is to raise a warning if 'init' is not set
+    and stays at its default value (None), in v0.5.0).
+
+    'auto'
+      Depending on ``n_components``, the most reasonable initialization
+      will be chosen. If ``n_components <= n_classes`` we use 'lda', as
+      it uses labels information. If not, but
+      ``n_components < min(n_features, n_samples)``, we use 'pca', as
+      it projects data onto meaningful directions (those of higher
+      variance). Otherwise, we just use 'identity'.
+
+    'pca'
+      ``n_components`` principal components of the inputs passed
+      to :meth:`fit` will be used to initialize the transformation.
+      (See `sklearn.decomposition.PCA`)
+
+    'lda'
+      ``min(n_components, n_classes)`` most discriminative
+      components of the inputs passed to :meth:`fit` will be used to
+      initialize the transformation. (If ``n_components > n_classes``,
+      the rest of the components will be zero.) (See
+      `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`)
+
+    'identity'
+      If ``n_components`` is strictly smaller than the
+      dimensionality of the inputs passed to :meth:`fit`, the identity
+      matrix will be truncated to the first ``n_components`` rows.
+
+    'random'
+      The initial transformation will be a random array of shape
+      `(n_components, n_features)`. Each value is sampled from the
+      standard normal distribution.
+
+    numpy array
+      n_features_b must match the dimensionality of the inputs passed to
+      :meth:`fit` and n_features_a must be less than or equal to that.
+      If ``n_components`` is not None, n_features_a must match it.

   n_components : int or None, optional (default=None)
-    Dimensionality of reduced space (if None, defaults to dimension of X).
+    Dimensionality of reduced space (if None, defaults to dimension of X).

   num_dims : Not used
-
-    .. deprecated:: 0.5.0
-      `num_dims` was deprecated in version 0.5.0 and will
-      be removed in 0.6.0. Use `n_components` instead.
+    .. deprecated:: 0.5.0
+      `num_dims` was deprecated in version 0.5.0 and will
+      be removed in 0.6.0. Use `n_components` instead.
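A minimal sketch of how the `init` and `n_components` options above combine in practice (parameter values are illustrative only):

```python
from sklearn.datasets import load_iris
from metric_learn import NCA

X, y = load_iris(return_X_y=True)

# 'pca' seeds the transformation with the top principal components;
# n_components=2 requests a 2D embedding.
nca = NCA(init='pca', n_components=2, max_iter=1000, random_state=42)
nca.fit(X, y)
X_embedded = nca.transform(X)
```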
   max_iter : int, optional (default=100)
     Maximum number of iterations done by the optimization algorithm.

   tol : float, optional (default=None)
-    Convergence tolerance for the optimization.
+    Convergence tolerance for the optimization.

   verbose : bool, optional (default=False)
     Whether to print progress messages or not.

   random_state : int or numpy.RandomState or None, optional (default=None)
-    A pseudo random number generator object or a seed for it if int. If
-    ``init='random'``, ``random_state`` is used to initialize the random
-    transformation. If ``init='pca'``, ``random_state`` is passed as an
-    argument to PCA when initializing the transformation.
+    A pseudo random number generator object or a seed for it if int. If
+    ``init='random'``, ``random_state`` is used to initialize the random
+    transformation. If ``init='pca'``, ``random_state`` is passed as an
+    argument to PCA when initializing the transformation.

   Examples
   --------
@@ -114,10 +113,10 @@ class NCA(MahalanobisMixin, TransformerMixin):
   Attributes
   ----------
   n_iter_ : `int`
-    The number of iterations the solver has run.
+    The number of iterations the solver has run.

   components_ : `numpy.ndarray`, shape=(n_components, n_features)
-    The learned linear transformation ``L``.
+    The learned linear transformation ``L``.

   References
   ----------
diff --git a/metric_learn/rca.py b/metric_learn/rca.py
index 060a797d..f3a2ac89 100644
--- a/metric_learn/rca.py
+++ b/metric_learn/rca.py
@@ -43,22 +43,21 @@ class RCA(MahalanobisMixin, TransformerMixin):
   Parameters
   ----------
   n_components : int or None, optional (default=None)
-    Dimensionality of reduced space (if None, defaults to dimension of X).
+    Dimensionality of reduced space (if None, defaults to dimension of X).

   num_dims : Not used
-
-    .. deprecated:: 0.5.0
-      `num_dims` was deprecated in version 0.5.0 and will
-      be removed in 0.6.0. Use `n_components` instead.
+    .. deprecated:: 0.5.0
+      `num_dims` was deprecated in version 0.5.0 and will
+      be removed in 0.6.0. Use `n_components` instead.

   pca_comps : Not used
-    .. deprecated:: 0.5.0
+    .. deprecated:: 0.5.0
       `pca_comps` was deprecated in version 0.5.0 and will
       be removed in 0.6.0.

   preprocessor : array-like, shape=(n_samples, n_features) or callable
-    The preprocessor to call to get tuples from indices. If array-like,
-    tuples will be formed like this: X[indices].
+    The preprocessor to call to get tuples from indices. If array-like,
+    tuples will be formed like this: X[indices].

   Examples
   --------
@@ -82,7 +81,7 @@ class RCA(MahalanobisMixin, TransformerMixin):
   Attributes
   ----------
   components_ : `numpy.ndarray`, shape=(n_components, n_features)
-    The learned linear transformation ``L``.
+    The learned linear transformation ``L``.
   """

   def __init__(self, n_components=None, num_dims='deprecated',
@@ -112,10 +111,11 @@ def fit(self, X, chunks):
     Parameters
     ----------
-    data : (n x d) data matrix
-      Each row corresponds to a single instance
+    X : (n x d) data matrix
+      Each row corresponds to a single instance.
+
     chunks : (n,) array of ints
-      When ``chunks[i] == -1``, point i doesn't belong to any chunklet.
-      When ``chunks[i] == j``, point i belongs to chunklet j.
+      When ``chunks[i] == -1``, point i doesn't belong to any chunklet.
+      When ``chunks[i] == j``, point i belongs to chunklet j.
     """
     if self.num_dims != 'deprecated':
       warnings.warn('"num_dims" parameter is not used.'
@@ -177,25 +177,26 @@ class RCA_Supervised(RCA):
   Parameters
   ----------
   n_components : int or None, optional (default=None)
-    Dimensionality of reduced space (if None, defaults to dimension of X).
+    Dimensionality of reduced space (if None, defaults to dimension of X).

   num_dims : Not used
+    .. deprecated:: 0.5.0
+      `num_dims` was deprecated in version 0.5.0 and will
+      be removed in 0.6.0. Use `n_components` instead.

-    .. deprecated:: 0.5.0
-      `num_dims` was deprecated in version 0.5.0 and will
-      be removed in 0.6.0. Use `n_components` instead.
-
-  num_chunks: int, optional
+  num_chunks : int, optional (default=100)
+    Number of chunks to generate.

-  chunk_size: int, optional
+  chunk_size : int, optional (default=2)
+    Number of points per chunk.

   preprocessor : array-like, shape=(n_samples, n_features) or callable
-    The preprocessor to call to get tuples from indices. If array-like,
-    tuples will be formed like this: X[indices].
+    The preprocessor to call to get tuples from indices. If array-like,
+    tuples will be formed like this: X[indices].

   random_state : int or numpy.RandomState or None, optional (default=None)
-    A pseudo random number generator object or a seed for it if int.
-    It is used to randomly sample constraints from labels.
+    A pseudo random number generator object or a seed for it if int.
+    It is used to randomly sample constraints from labels.

   Examples
   --------
@@ -210,7 +211,7 @@ class RCA_Supervised(RCA):
   Attributes
   ----------
   components_ : `numpy.ndarray`, shape=(n_components, n_features)
-    The learned linear transformation ``L``.
+    The learned linear transformation ``L``.
   """

   def __init__(self, num_dims='deprecated', n_components=None,
@@ -230,8 +231,10 @@ def fit(self, X, y, random_state='deprecated'):
     Parameters
     ----------
     X : (n x d) data matrix
-      each row corresponds to a single instance
+      Each row corresponds to a single instance.
+
     y : (n) data labels
+
     random_state : Not used
       .. deprecated:: 0.5.0
        `random_state` in the `fit` function was deprecated in version 0.5.0
diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py
index 21fadd74..944739f2 100644
--- a/metric_learn/sdml.py
+++ b/metric_learn/sdml.py
@@ -142,62 +142,62 @@ class SDML(_BaseSDML, _PairsClassifierMixin):
   Parameters
   ----------
-  balance_param : float, optional
-    trade off between sparsity and M0 prior
+  balance_param : float, optional (default=0.5)
+    Trade-off between sparsity and the M0 prior.

-  sparsity_param : float, optional
-    trade off between optimizer and sparseness (see graph_lasso)
+  sparsity_param : float, optional (default=0.01)
+    Trade-off between optimizer and sparseness (see graph_lasso).

   prior : None, string or numpy array, optional (default=None)
-    Prior to set for the metric. Possible options are
-    'identity', 'covariance', 'random', and a numpy array of
-    shape (n_features, n_features). For SDML, the prior should be strictly
-    positive definite (PD). If `None`, will be set
-    automatically to 'identity' (this is to raise a warning if
-    `prior` is not set, and stays to its default value (None), in v0.5.0).
+    Prior to set for the metric. Possible options are
+    'identity', 'covariance', 'random', and a numpy array of
+    shape (n_features, n_features). For SDML, the prior should be strictly
+    positive definite (PD). If `None`, will be set
+    automatically to 'identity' (this is to raise a warning if
+    `prior` is not set and stays at its default value (None), in v0.5.0).

-    'identity'
-      An identity matrix of shape (n_features, n_features).
+    'identity'
+      An identity matrix of shape (n_features, n_features).

-    'covariance'
-      The inverse covariance matrix.
+    'covariance'
+      The inverse covariance matrix.
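To make the `chunks` semantics of the rca.py hunk above concrete, a small sketch (toy data; the last point belongs to no chunklet):

```python
import numpy as np
from metric_learn import RCA

# Five 2D points: points 0-1 form chunklet 0, points 2-3 form chunklet 1,
# and point 4 (chunks[4] == -1) is left out of every chunklet.
X = np.array([[0.0, 0.0], [0.1, 0.2],
              [1.0, 1.0], [1.1, 0.9],
              [5.0, 5.0]])
chunks = np.array([0, 0, 1, 1, -1])

rca = RCA(n_components=2)
rca.fit(X, chunks)
X_transformed = rca.transform(X)
```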
-    'random'
-      The prior will be a random positive definite (PD) matrix of shape
-      `(n_features, n_features)`, generated using
-      `sklearn.datasets.make_spd_matrix`.
+    'random'
+      The prior will be a random positive definite (PD) matrix of shape
+      `(n_features, n_features)`, generated using
+      `sklearn.datasets.make_spd_matrix`.

-    numpy array
-      A positive definite (PD) matrix of shape
-      (n_features, n_features), that will be used as such to set the
-      prior.
+    numpy array
+      A positive definite (PD) matrix of shape
+      (n_features, n_features), that will be used as such to set the
+      prior.

   use_cov : Not used.
-    .. deprecated:: 0.5.0
-      `A0` was deprecated in version 0.5.0 and will
-      be removed in 0.6.0. Use 'prior' instead.
+    .. deprecated:: 0.5.0
+      `use_cov` was deprecated in version 0.5.0 and will
+      be removed in 0.6.0. Use 'prior' instead.

   verbose : bool, optional (default=False)
-    if True, prints information while learning
+    If True, prints information while learning.

   preprocessor : array-like, shape=(n_samples, n_features) or callable
-    The preprocessor to call to get tuples from indices. If array-like,
-    tuples will be gotten like this: X[indices].
+    The preprocessor to call to get tuples from indices. If array-like,
+    tuples will be gotten like this: X[indices].

   random_state : int or numpy.RandomState or None, optional (default=None)
-    A pseudo random number generator object or a seed for it if int. If
-    ``prior='random'``, ``random_state`` is used to set the prior.
+    A pseudo random number generator object or a seed for it if int. If
+    ``prior='random'``, ``random_state`` is used to set the prior.

   Attributes
   ----------
   components_ : `numpy.ndarray`, shape=(n_features, n_features)
-    The linear transformation ``L`` deduced from the learned Mahalanobis
-    metric (See function `components_from_metric`.)
+    The linear transformation ``L`` deduced from the learned Mahalanobis
+    metric (See function `components_from_metric`.)

   threshold_ : `float`
-    If the distance metric between two points is lower than this threshold,
-    points will be classified as similar, otherwise they will be
-    classified as dissimilar.
+    If the distance metric between two points is lower than this threshold,
+    points will be classified as similar, otherwise they will be
+    classified as dissimilar.

   Examples
   --------
@@ -231,20 +231,22 @@ def fit(self, pairs, y, calibration_params=None):
     ----------
     pairs : array-like, shape=(n_constraints, 2, n_features) or \
           (n_constraints, 2)
-      3D Array of pairs with each row corresponding to two points,
-      or 2D array of indices of pairs if the metric learner uses a
-      preprocessor.
+      3D array of pairs with each row corresponding to two points,
+      or 2D array of indices of pairs if the metric learner uses a
+      preprocessor.
+
     y : array-like, of shape (n_constraints,)
-      Labels of constraints. Should be -1 for dissimilar pair, 1 for similar.
+      Labels of constraints. Should be -1 for a dissimilar pair, 1 for a similar one.
+
     calibration_params : `dict` or `None`
-      Dictionary of parameters to give to `calibrate_threshold` for the
-      threshold calibration step done at the end of `fit`. If `None` is
-      given, `calibrate_threshold` will use the default parameters.
+      Dictionary of parameters to give to `calibrate_threshold` for the
+      threshold calibration step done at the end of `fit`. If `None` is
+      given, `calibrate_threshold` will use the default parameters.

     Returns
     -------
     self : object
-      Returns the instance.
+      Returns the instance.
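As with MMC, SDML's pairs interface can be sketched as follows (toy arrays; on inputs this small the inner graphical-lasso step may warn or fail to converge, so this is illustrative only):

```python
import numpy as np
from metric_learn import SDML

# Hypothetical toy pairs: two similar (+1) and two dissimilar (-1).
pairs = np.array([[[-0.3, 0.7], [-0.4, 0.9]],
                  [[2.5, 1.1], [2.6, 1.0]],
                  [[-0.2, 0.8], [2.4, 0.9]],
                  [[2.7, 1.2], [-0.3, 0.6]]])
y = np.array([1, 1, -1, -1])

sdml = SDML(prior='identity', balance_param=0.5, random_state=42)
sdml.fit(pairs, y)
predictions = sdml.predict(pairs)  # +1 / -1 for each pair
```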
""" calibration_params = (calibration_params if calibration_params is not None else dict()) @@ -264,61 +266,67 @@ class SDML_Supervised(_BaseSDML, TransformerMixin): Parameters ---------- balance_param : float, optional (default=0.5) - trade off between sparsity and M0 prior + Trade off between sparsity and M0 prior. + sparsity_param : float, optional (default=0.01) - trade off between optimizer and sparseness (see graph_lasso) + Trade off between optimizer and sparseness (see graph_lasso). + prior : None, string or numpy array, optional (default=None) - Prior to set for the metric. Possible options are - 'identity', 'covariance', 'random', and a numpy array of - shape (n_features, n_features). For SDML, the prior should be strictly - positive definite (PD). If `None`, will be set - automatically to 'identity' (this is to raise a warning if - `prior` is not set, and stays to its default value (None), in v0.5.0). + Prior to set for the metric. Possible options are + 'identity', 'covariance', 'random', and a numpy array of + shape (n_features, n_features). For SDML, the prior should be strictly + positive definite (PD). If `None`, will be set + automatically to 'identity' (this is to raise a warning if + `prior` is not set, and stays to its default value (None), in v0.5.0). - 'identity' - An identity matrix of shape (n_features, n_features). + 'identity' + An identity matrix of shape (n_features, n_features). - 'covariance' - The inverse covariance matrix. + 'covariance' + The inverse covariance matrix. - 'random' - The prior will be a random SPD matrix of shape - `(n_features, n_features)`, generated using - `sklearn.datasets.make_spd_matrix`. + 'random' + The prior will be a random SPD matrix of shape + `(n_features, n_features)`, generated using + `sklearn.datasets.make_spd_matrix`. - numpy array - A positive definite (PD) matrix of shape - (n_features, n_features), that will be used as such to set the - prior. + numpy array + A positive definite (PD) matrix of shape + (n_features, n_features), that will be used as such to set the + prior. use_cov : Not used. - .. deprecated:: 0.5.0 - `A0` was deprecated in version 0.5.0 and will - be removed in 0.6.0. Use 'prior' instead. + .. deprecated:: 0.5.0 + `A0` was deprecated in version 0.5.0 and will + be removed in 0.6.0. Use 'prior' instead. num_labeled : Not used .. deprecated:: 0.5.0 - `num_labeled` was deprecated in version 0.5.0 and will - be removed in 0.6.0. + `num_labeled` was deprecated in version 0.5.0 and will + be removed in 0.6.0. + num_constraints : int, optional (default=None) - number of constraints to generate - (`20 * num_classes**2` constraints by default) + Number of constraints to generate. If None, defaults to `20 * + num_classes**2`. + verbose : bool, optional (default=False) - if True, prints information while learning + If True, prints information while learning. + preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be formed like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. + random_state : int or numpy.RandomState or None, optional (default=None) - A pseudo random number generator object or a seed for it if int. If - ``init='random'``, ``random_state`` is used to set the random - prior. In any case, `random_state` is also used to randomly sample - constraints from labels. 
+    A pseudo random number generator object or a seed for it if int. If
+    ``prior='random'``, ``random_state`` is used to set the random
+    prior. In any case, `random_state` is also used to randomly sample
+    constraints from labels.

   Attributes
   ----------
   components_ : `numpy.ndarray`, shape=(n_features, n_features)
-    The linear transformation ``L`` deduced from the learned Mahalanobis
-    metric (See function `components_from_metric`.)
+    The linear transformation ``L`` deduced from the learned Mahalanobis
+    metric (See function `components_from_metric`.)

   See Also
   --------
@@ -344,9 +352,11 @@ def fit(self, X, y, random_state='deprecated'):
     Parameters
     ----------
     X : array-like, shape (n, d)
-      data matrix, where each row corresponds to a single instance
+      Data matrix, where each row corresponds to a single instance.
+
     y : array-like, shape (n,)
-      data labels, one for each instance
+      Data labels, one for each instance.
+
     random_state : Not used
       .. deprecated:: 0.5.0
        `random_state` in the `fit` function was deprecated in version 0.5.0
@@ -356,7 +366,7 @@ def fit(self, X, y, random_state='deprecated'):
     Returns
     -------
     self : object
-      Returns the instance.
+      Returns the instance.
     """
     if self.num_labeled != 'deprecated':
       warnings.warn('"num_labeled" parameter is not used.'