diff --git a/.gitignore b/.gitignore
index 449f70ea..8321c7d2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ htmlcov/
.cache/
.pytest_cache/
doc/auto_examples/*
+doc/generated/*
\ No newline at end of file
diff --git a/doc/_templates/class.rst b/doc/_templates/class.rst
new file mode 100644
index 00000000..f0c1b5bc
--- /dev/null
+++ b/doc/_templates/class.rst
@@ -0,0 +1,16 @@
+:mod:`{{module}}`.{{objname}}
+{{ underline }}==============
+
+.. currentmodule:: {{ module }}
+
+.. autoclass:: {{ objname }}
+ :members:
+ :undoc-members:
+ :inherited-members:
+ :special-members: __init__
+
+.. include:: {{module}}.{{objname}}.examples
+
+.. raw:: html
+
+
diff --git a/doc/conf.py b/doc/conf.py
index a11f8bba..e7e6a108 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import sys
+import os
extensions = [
'sphinx.ext.autodoc',
@@ -28,7 +29,6 @@
exclude_patterns = ['_build']
pygments_style = 'sphinx'
todo_include_todos = True
-numpydoc_show_class_members = False
# Options for HTML output
html_theme = 'sphinx_rtd_theme'
@@ -50,3 +50,16 @@
'scipy': ('https://docs.scipy.org/doc/scipy/reference', None),
'scikit-learn': ('https://scikit-learn.org/stable/', None)
}
+
+
+# sphinx-gallery configuration
+sphinx_gallery_conf = {
+ # to generate mini-galleries at the end of each docstring in the API
+ # section: (see https://sphinx-gallery.github.io/configuration.html
+ # #references-to-examples)
+ 'doc_module': 'metric_learn',
+ 'backreferences_dir': os.path.join('generated'),
+}
+
+# generate autosummary even if no references
+autosummary_generate = True
diff --git a/doc/index.rst b/doc/index.rst
index 3e4d0ce3..9d303bee 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -25,7 +25,7 @@ Documentation outline
.. toctree::
:maxdepth: 2
- Package Overview
+ Package Contents
.. toctree::
:maxdepth: 2
diff --git a/doc/metric_learn.base_metric.rst b/doc/metric_learn.base_metric.rst
deleted file mode 100644
index 050a360b..00000000
--- a/doc/metric_learn.base_metric.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-metric_learn.base_metric module
-===============================
-
-.. automodule:: metric_learn.base_metric
- :members:
- :undoc-members:
- :show-inheritance:
diff --git a/doc/metric_learn.constraints.rst b/doc/metric_learn.constraints.rst
deleted file mode 100644
index 97d79002..00000000
--- a/doc/metric_learn.constraints.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-metric_learn.constraints module
-===============================
-
-.. automodule:: metric_learn.constraints
- :members:
- :undoc-members:
- :show-inheritance:
diff --git a/doc/metric_learn.covariance.rst b/doc/metric_learn.covariance.rst
deleted file mode 100644
index 493878c1..00000000
--- a/doc/metric_learn.covariance.rst
+++ /dev/null
@@ -1,22 +0,0 @@
-Covariance metric (baseline method)
-===================================
-
-.. automodule:: metric_learn.covariance
- :members:
- :undoc-members:
- :inherited-members:
- :show-inheritance:
- :special-members: __init__
-
-Example Code
-------------
-
-::
-
- from metric_learn import Covariance
- from sklearn.datasets import load_iris
-
- iris = load_iris()['data']
-
- cov = Covariance().fit(iris)
- x = cov.transform(iris)
diff --git a/doc/metric_learn.itml.rst b/doc/metric_learn.itml.rst
deleted file mode 100644
index addb4c76..00000000
--- a/doc/metric_learn.itml.rst
+++ /dev/null
@@ -1,28 +0,0 @@
-Information Theoretic Metric Learning (ITML)
-============================================
-
-.. automodule:: metric_learn.itml
- :members:
- :undoc-members:
- :inherited-members:
- :show-inheritance:
- :special-members: __init__
-
-Example Code
-------------
-
-::
-
- from metric_learn import ITML_Supervised
- from sklearn.datasets import load_iris
-
- iris_data = load_iris()
- X = iris_data['data']
- Y = iris_data['target']
-
- itml = ITML_Supervised(num_constraints=200)
- itml.fit(X, Y)
-
-References
-----------
-`Information-theoretic Metric Learning `_ Jason V. Davis, et al.
diff --git a/doc/metric_learn.lfda.rst b/doc/metric_learn.lfda.rst
deleted file mode 100644
index 41088a68..00000000
--- a/doc/metric_learn.lfda.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-Local Fisher Discriminant Analysis (LFDA)
-=========================================
-
-.. automodule:: metric_learn.lfda
- :members:
- :undoc-members:
- :inherited-members:
- :show-inheritance:
- :special-members: __init__
-
-Example Code
-------------
-
-::
-
- import numpy as np
- from metric_learn import LFDA
- from sklearn.datasets import load_iris
-
- iris_data = load_iris()
- X = iris_data['data']
- Y = iris_data['target']
-
- lfda = LFDA(k=2, dim=2)
- lfda.fit(X, Y)
-
-References
-------------------
-`Dimensionality Reduction of Multimodal Labeled Data by Local Fisher Discriminant Analysis `_ Masashi Sugiyama.
-
-`Local Fisher Discriminant Analysis on Beer Style Clustering `_ Yuan Tang.
diff --git a/doc/metric_learn.lmnn.rst b/doc/metric_learn.lmnn.rst
deleted file mode 100644
index bc65161e..00000000
--- a/doc/metric_learn.lmnn.rst
+++ /dev/null
@@ -1,34 +0,0 @@
-Large Margin Nearest Neighbor (LMNN)
-====================================
-
-.. automodule:: metric_learn.lmnn
- :members:
- :undoc-members:
- :inherited-members:
- :show-inheritance:
- :special-members: __init__
-
-Example Code
-------------
-
-::
-
- import numpy as np
- from metric_learn import LMNN
- from sklearn.datasets import load_iris
-
- iris_data = load_iris()
- X = iris_data['data']
- Y = iris_data['target']
-
- lmnn = LMNN(k=5, learn_rate=1e-6)
- lmnn.fit(X, Y, verbose=False)
-
-If a recent version of the Shogun Python modular (``modshogun``) library
-is available, the LMNN implementation will use the fast C++ version from
-there. Otherwise, the included pure-Python version will be used.
-The two implementations differ slightly, and the C++ version is more complete.
-
-References
-----------
-`Distance Metric Learning for Large Margin Nearest Neighbor Classification `_ Kilian Q. Weinberger, John Blitzer, Lawrence K. Saul
diff --git a/doc/metric_learn.lsml.rst b/doc/metric_learn.lsml.rst
deleted file mode 100644
index 0deae4e6..00000000
--- a/doc/metric_learn.lsml.rst
+++ /dev/null
@@ -1,28 +0,0 @@
-Least Squares Metric Learning (LSML)
-====================================
-
-.. automodule:: metric_learn.lsml
- :members:
- :undoc-members:
- :inherited-members:
- :show-inheritance:
- :special-members: __init__
-
-Example Code
-------------
-
-::
-
- from metric_learn import LSML_Supervised
- from sklearn.datasets import load_iris
-
- iris_data = load_iris()
- X = iris_data['data']
- Y = iris_data['target']
-
- lsml = LSML_Supervised(num_constraints=200)
- lsml.fit(X, Y)
-
-References
-----------
-
diff --git a/doc/metric_learn.mlkr.rst b/doc/metric_learn.mlkr.rst
deleted file mode 100644
index f71697de..00000000
--- a/doc/metric_learn.mlkr.rst
+++ /dev/null
@@ -1,28 +0,0 @@
-Metric Learning for Kernel Regression (MLKR)
-============================================
-
-.. automodule:: metric_learn.mlkr
- :members:
- :undoc-members:
- :inherited-members:
- :show-inheritance:
- :special-members: __init__
-
-Example Code
-------------
-
-::
-
- from metric_learn import MLKR
- from sklearn.datasets import load_iris
-
- iris_data = load_iris()
- X = iris_data['data']
- Y = iris_data['target']
-
- mlkr = MLKR()
- mlkr.fit(X, Y)
-
-References
-----------
-`Information-theoretic Metric Learning `_ Jason V. Davis, et al.
diff --git a/doc/metric_learn.mmc.rst b/doc/metric_learn.mmc.rst
deleted file mode 100644
index bb9031ba..00000000
--- a/doc/metric_learn.mmc.rst
+++ /dev/null
@@ -1,28 +0,0 @@
-Mahalanobis Metric Learning for Clustering (MMC)
-================================================
-
-.. automodule:: metric_learn.mmc
- :members:
- :undoc-members:
- :inherited-members:
- :show-inheritance:
- :special-members: __init__
-
-Example Code
-------------
-
-::
-
- from metric_learn import MMC_Supervised
- from sklearn.datasets import load_iris
-
- iris_data = load_iris()
- X = iris_data['data']
- Y = iris_data['target']
-
- mmc = MMC_Supervised(num_constraints=200)
- mmc.fit(X, Y)
-
-References
-----------
-`Distance metric learning with application to clustering with side-information `_ Xing, Jordan, Russell, Ng.
diff --git a/doc/metric_learn.nca.rst b/doc/metric_learn.nca.rst
deleted file mode 100644
index 00bc4eac..00000000
--- a/doc/metric_learn.nca.rst
+++ /dev/null
@@ -1,29 +0,0 @@
-Neighborhood Components Analysis (NCA)
-======================================
-
-.. automodule:: metric_learn.nca
- :members:
- :undoc-members:
- :inherited-members:
- :show-inheritance:
- :special-members: __init__
-
-Example Code
-------------
-
-::
-
- import numpy as np
- from metric_learn import NCA
- from sklearn.datasets import load_iris
-
- iris_data = load_iris()
- X = iris_data['data']
- Y = iris_data['target']
-
- nca = NCA(max_iter=1000)
- nca.fit(X, Y)
-
-References
-----------
-
diff --git a/doc/metric_learn.rca.rst b/doc/metric_learn.rca.rst
deleted file mode 100644
index 027d583b..00000000
--- a/doc/metric_learn.rca.rst
+++ /dev/null
@@ -1,28 +0,0 @@
-Relative Components Analysis (RCA)
-==================================
-
-.. automodule:: metric_learn.rca
- :members:
- :undoc-members:
- :inherited-members:
- :show-inheritance:
- :special-members: __init__
-
-Example Code
-------------
-
-::
-
- from metric_learn import RCA_Supervised
- from sklearn.datasets import load_iris
-
- iris_data = load_iris()
- X = iris_data['data']
- Y = iris_data['target']
-
- rca = RCA_Supervised(num_chunks=30, chunk_size=2)
- rca.fit(X, Y)
-
-References
-------------------
-`Adjustment learning and relevant component analysis `_ Noam Shental, et al.
diff --git a/doc/metric_learn.rst b/doc/metric_learn.rst
index eb606542..930404d0 100644
--- a/doc/metric_learn.rst
+++ b/doc/metric_learn.rst
@@ -4,16 +4,52 @@ metric_learn package
Module Contents
---------------
-.. toctree::
-
- metric_learn.constraints
- metric_learn.base_metric
- metric_learn.itml
- metric_learn.lfda
- metric_learn.lmnn
- metric_learn.lsml
- metric_learn.mlkr
- metric_learn.mmc
- metric_learn.nca
- metric_learn.rca
- metric_learn.sdml
+Base Classes
+------------
+
+.. autosummary::
+ :toctree: generated/
+ :template: class.rst
+
+ metric_learn.Constraints
+ metric_learn.base_metric.BaseMetricLearner
+ metric_learn.base_metric._PairsClassifierMixin
+ metric_learn.base_metric._QuadrupletsClassifierMixin
+
+Supervised Learning Algorithms
+------------------------------
+.. autosummary::
+ :toctree: generated/
+ :template: class.rst
+
+ metric_learn.LFDA
+ metric_learn.LMNN
+ metric_learn.MLKR
+ metric_learn.NCA
+ metric_learn.RCA
+ metric_learn.ITML_Supervised
+ metric_learn.LSML_Supervised
+ metric_learn.MMC_Supervised
+ metric_learn.SDML_Supervised
+ metric_learn.RCA_Supervised
+
+Weakly Supervised Learning Algorithms
+-------------------------------------
+
+.. autosummary::
+ :toctree: generated/
+ :template: class.rst
+
+ metric_learn.ITML
+ metric_learn.LSML
+ metric_learn.MMC
+ metric_learn.SDML
+
+Unsupervised Learning Algorithms
+--------------------------------
+
+.. autosummary::
+ :toctree: generated/
+ :template: class.rst
+
+ metric_learn.Covariance
\ No newline at end of file
diff --git a/doc/metric_learn.sdml.rst b/doc/metric_learn.sdml.rst
deleted file mode 100644
index 3e350a70..00000000
--- a/doc/metric_learn.sdml.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-Sparse Determinant Metric Learning (SDML)
-=========================================
-
-.. automodule:: metric_learn.sdml
- :members:
- :undoc-members:
- :inherited-members:
- :show-inheritance:
- :special-members: __init__
-
-Example Code
-------------
-
-::
-
- from metric_learn import SDML_Supervised
- from sklearn.datasets import load_iris
-
- iris_data = load_iris()
- X = iris_data['data']
- Y = iris_data['target']
-
- sdml = SDML_Supervised(num_constraints=200)
- sdml.fit(X, Y)
-
-References
-------------------
diff --git a/doc/supervised.rst b/doc/supervised.rst
index c438294f..5520ce8e 100644
--- a/doc/supervised.rst
+++ b/doc/supervised.rst
@@ -8,46 +8,108 @@ labels `y`, and learn a distance matrix that make points from the same class
other, and points from different classes or with distant target values far away
from each other.
-Scikit-learn compatibility
-==========================
+General API
+===========
-All supervised algorithms are scikit-learn `Estimators`, so they are
-compatible with Pipelining and scikit-learn model selection routines.
+Supervised metric learning algorithms are the easiest metric-learn algorithms
+to use, since they use the same API as ``scikit-learn``.
-Algorithms
-==========
-
-Covariance
+Input data
----------
+In order to train a model, you need two `array-like `_ objects, `X` and `y`. `X`
+should be a 2D array-like of shape `(n_samples, n_features)`, where
+`n_samples` is the number of points of your dataset and `n_features` is the
+number of attributes of each of your points. `y` should be a 1D array-like
+of shape `(n_samples,)`, containing for each point in `X` the class it
+belongs to (or the value to regress for this sample, if you use `MLKR` for
+instance).
-.. todo:: Covariance is unsupervised, so its doc should not be here.
+Here is an example of a dataset of two dogs and one
+cat (the classes are 'dog' and 'cat'), each animal being represented by
+two numbers.
-`Covariance` does not "learn" anything, rather it calculates
-the covariance matrix of the input data. This is a simple baseline method.
+>>> import numpy as np
+>>> X = np.array([[2.3, 3.6], [0.2, 0.5], [6.7, 2.1]])
+>>> y = np.array(['dog', 'cat', 'dog'])
-.. topic:: Example Code:
+.. note::
-::
+ You can also use a preprocessor instead of directly giving the inputs as
+ 2D arrays. See the :ref:`preprocessor_section` section for more details.
- from metric_learn import Covariance
- from sklearn.datasets import load_iris
+Fit, transform, and so on
+-------------------------
+The goal of supervised metric-learning algorithms is to transform
+points into a new space in which the distance between two points from the
+same class will be small, and the distance between two points from different
+classes will be large. To do so, we fit the metric learner (for example
+`NCA`).
- iris = load_iris()['data']
+>>> from metric_learn import NCA
+>>> nca = NCA(random_state=42)
+>>> nca.fit(X, y)
+NCA(init=None, max_iter=100, n_components=None, num_dims='deprecated',
+ preprocessor=None, random_state=42, tol=None, verbose=False)
- cov = Covariance().fit(iris)
- x = cov.transform(iris)
-.. topic:: References:
+Now that the estimator is fitted, you can use it on new data for several
+purposes.
+
+First, you can transform the data in the learned space, using `transform`:
+Here we transform two points in the new embedding space.
+
+>>> X_new = np.array([[9.4, 4.1], [2.1, 4.4]])
+>>> nca.transform(X_new)
+array([[ 5.91884732, 10.25406973],
+ [ 3.1545886 , 6.80350083]])
+
+Also, as explained before, our metric learner has learned a distance between
+points. You can use this distance in two main ways:
+
+- You can either return the distance between pairs of points using the
+ `score_pairs` function:
+
+>>> nca.score_pairs([[[3.5, 3.6], [5.6, 2.4]], [[1.2, 4.2], [2.1, 6.4]]])
+array([0.49627072, 3.65287282])
- .. [1] On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936
+- Or you can return a function that will return the distance (in the new
+ space) between two 1D arrays (the coordinates of the points in the original
+ space), similarly to distance functions in `scipy.spatial.distance`.
+
+>>> metric_fun = nca.get_metric()
+>>> metric_fun([3.5, 3.6], [5.6, 2.4])
+0.4962707194621285
+
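+Since this function takes two 1D arrays and returns a scalar, it can also be
+plugged into scikit-learn estimators that accept a callable `metric` (note
+that this is usually slower than transforming the data first). Here is a
+minimal sketch with a nearest-neighbors classifier, reusing the small
+'dog'/'cat' dataset from above:
+
+>>> from sklearn.neighbors import KNeighborsClassifier
+>>> knn = KNeighborsClassifier(n_neighbors=1, metric=metric_fun)
+>>> knn = knn.fit(X, y)  # X, y are the 'dog'/'cat' data defined above
+>>> predictions = knn.predict(X_new)  # predicted classes for the two new points
+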
+.. note::
+
+  If the metric learner that you use learns a Mahalanobis matrix (as is
+  the case for all algorithms currently in metric-learn), you can get the
+  plain learned Mahalanobis matrix using `get_mahalanobis_matrix`.
+
+ >>> nca.get_mahalanobis_matrix()
+ array([[0.43680409, 0.89169412],
+ [0.89169412, 1.9542479 ]])
+
+.. TODO: remove the "like it is the case etc..." if it's not the case anymore
+
+Scikit-learn compatibility
+--------------------------
+
+All supervised algorithms are scikit-learn estimators
+(`sklearn.base.BaseEstimator`) and transformers
+(`sklearn.base.TransformerMixin`), so they are compatible with pipelining and
+scikit-learn model selection routines.
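+
+For instance, a supervised metric learner can be used as a transformer in a
+pipeline and evaluated with scikit-learn model selection tools. Here is a
+minimal sketch (the classifier and the iris dataset are only illustrative
+choices):
+
+>>> from sklearn.datasets import load_iris
+>>> from sklearn.model_selection import cross_val_score
+>>> from sklearn.neighbors import KNeighborsClassifier
+>>> from sklearn.pipeline import make_pipeline
+>>> from metric_learn import NCA
+>>> X_iris, y_iris = load_iris(return_X_y=True)
+>>> pipe = make_pipeline(NCA(random_state=42), KNeighborsClassifier())
+>>> scores = cross_val_score(pipe, X_iris, y_iris, cv=3)  # NCA acts as a transformer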
+
+Algorithms
+==========
.. _lmnn:
-LMNN
------
+:py:class:`LMNN `
+-----------------------------------------
Large Margin Nearest Neighbor Metric Learning
-(:py:class:`LMNN `)
+(:py:class:`LMNN `)
`LMNN` learns a Mahalanobis distance metric in the kNN classification
setting. The learned metric attempts to keep close k-nearest neighbors
@@ -97,10 +159,10 @@ indicates :math:`\mathbf{x}_{i}, \mathbf{x}_{j}` belong to different class,
.. _nca:
-NCA
----
+:py:class:`NCA `
+--------------------------------------
-Neighborhood Components Analysis(:py:class:`NCA `)
+Neighborhood Components Analysis(:py:class:`NCA `)
`NCA` is a distance metric learning algorithm which aims to improve the
accuracy of nearest neighbors classification compared to the standard
@@ -161,10 +223,10 @@ the sum of probability of being correctly classified:
.. _lfda:
-LFDA
-----
+:py:class:`LFDA `
+-----------------------------------------
-Local Fisher Discriminant Analysis(:py:class:`LFDA `)
+Local Fisher Discriminant Analysis(:py:class:`LFDA `)
`LFDA` is a linear supervised dimensionality reduction method. It is
particularly useful when dealing with multi-modality, where one ore more classes
@@ -235,10 +297,10 @@ same class are not imposed to be close.
.. _mlkr:
-MLKR
-----
+:py:class:`MLKR `
+-----------------------------------------
-Metric Learning for Kernel Regression(:py:class:`MLKR `)
+Metric Learning for Kernel Regression(:py:class:`MLKR `)
`MLKR` is an algorithm for supervised metric learning, which learns a
distance function by directly minimizing the leave-one-out regression error.
@@ -298,15 +360,35 @@ calculating a weighted average of all the training samples:
Gerald Tesauro
+.. _supervised_version:
+
Supervised versions of weakly-supervised algorithms
---------------------------------------------------
Note that each :ref:`weakly-supervised algorithm `
has a supervised version of the form `*_Supervised` where similarity tuples are
generated from the labels information and passed to the underlying algorithm.
-
-.. todo:: add more details about that (see issue ``_)
+These constraints are sampled randomly under the hood.
+
+For pairs learners (see :ref:`learning_on_pairs`), pairs (tuples of two points
+from the dataset) and labels (`int` indicating whether the two points are
+similar (+1) or dissimilar (-1)) are sampled with the function
+`metric_learn.constraints.positive_negative_pairs`. To sample positive pairs
+(of label +1), this method will look at all the samples from the same label and
+sample randomly a pair among them. To sample negative pairs (of label -1), this
+method will look at all the samples from a different class and sample randomly
+a pair among them. The method will try to build `num_constraints` positive
+pairs and `num_constraints` negative pairs, but sometimes it cannot find enough
+of one of those, so forcing `same_length=True` will return both times the
+minimum of the two lengths.
+
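+As an illustration, here is a rough sketch of this sampling step (assuming the
+`Constraints` helper exposes `positive_negative_pairs` as described above;
+exact signatures may differ):
+
+>>> import numpy as np
+>>> from metric_learn.constraints import Constraints
+>>> labels = np.array([0, 0, 0, 1, 1, 1])
+>>> constraints = Constraints(labels)
+>>> a, b, c, d = constraints.positive_negative_pairs(5, same_length=True)
+>>> # (a[i], b[i]) index similar points, (c[i], d[i]) index dissimilar points
+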
+To use quadruplets learners (see :ref:`learning_on_quadruplets`) in a
+supervised way, positive and negative pairs are sampled as above and then
+concatenated, giving a 3D array of quadruplets in which the two first points
+of each quadruplet come from the same class and the two last points come from
+different classes (so the two last points should indeed be less similar than
+the two first points).
.. topic:: Example Code:
diff --git a/doc/unsupervised.rst b/doc/unsupervised.rst
new file mode 100644
index 00000000..1d5bef43
--- /dev/null
+++ b/doc/unsupervised.rst
@@ -0,0 +1,37 @@
+============================
+Unsupervised Metric Learning
+============================
+
+Unsupervised metric learning algorithms only take as input points `X`. For
+now, the only such algorithm in metric-learn is `Covariance`, which is a
+simple baseline algorithm (see below).
+
+
+Algorithms
+==========
+.. _covariance:
+
+Covariance
+----------
+
+`Covariance` does not "learn" anything, rather it calculates
+the covariance matrix of the input data. This is a simple baseline method.
+It can be used for ZCA whitening of the data (see the Wikipedia page of
+`whitening transformation `_).
+
+.. topic:: Example Code:
+
+::
+
+ from metric_learn import Covariance
+ from sklearn.datasets import load_iris
+
+ iris = load_iris()['data']
+
+ cov = Covariance().fit(iris)
+ x = cov.transform(iris)
+
+.. topic:: References:
+
+ .. [1] On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936
\ No newline at end of file
diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst
index 351c4e3b..7e488ac7 100644
--- a/doc/weakly_supervised.rst
+++ b/doc/weakly_supervised.rst
@@ -11,17 +11,28 @@ and dissimilar points. Refer to the documentation of each algorithm for its
particular form of input data.
+General API
+===========
+
Input data
-==========
+----------
In the following paragraph we talk about tuples for sake of generality. These
can be pairs, triplets, quadruplets etc, depending on the particular metric
learning algorithm we use.
Basic form
-----------
-Every weakly supervised algorithm will take as input tuples of points, and if
-needed labels for theses tuples.
+^^^^^^^^^^
+
+Every weakly supervised algorithm will take as input tuples of
+points, and if needed, labels for these tuples. The tuples of points can
+also be called "constraints". They are sets of points that we consider
+together (e.g. two points, three points, etc.). The label is some information
+we have about such a set of points (e.g. "these two points are similar").
+Note that some information can be contained in the ordering of these tuples
+(see for instance the section :ref:`learning_on_quadruplets`). For more
+details about the specifics of each algorithm, refer to the appropriate
+section: either :ref:`learning_on_pairs` or :ref:`learning_on_quadruplets`.
The `tuples` argument is the first argument of every method (like the X
@@ -44,7 +55,7 @@ These are two data structures that can be used to represent tuple in metric
learn:
3D array of tuples
-------------------
+^^^^^^^^^^^^^^^^^^
The most intuitive way to represent tuples is to provide the algorithm with a
3D array-like of tuples of shape ``(n_tuples, t, n_features)``, where
@@ -62,10 +73,10 @@ the number of features of each point.
>>> [[-2.16, +0.11, -0.02],
>>> [+1.58, +0.16, +0.93]],
>>>
->>> [[+1.58, +0.16, +0.93 ], # same as tuples[1, 1, :]
+>>> [[+1.58, +0.16, +0.93], # same as tuples[1, 1, :]
>>> [+0.89, -0.34, +2.41]],
>>>
->>> [[-0.12, -1.21, -0.20 ], # same as tuples[0, 0, :]
+>>> [[-0.12, -1.21, -0.20], # same as tuples[0, 0, :]
>>> [-2.16, +0.11, -0.02]]]) # same as tuples[1, 0, :]
>>> y = np.array([-1, 1, 1, -1])
@@ -77,7 +88,7 @@ the number of features of each point.
2D array of indicators + preprocessor
--------------------------------------
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Instead of forming each point in each tuple, a more efficient representation
would be to keep the dataset of points ``X`` aside, and just represent tuples
@@ -101,13 +112,7 @@ the feature dimension there, the resulting array is 2D.
In order to fit metric learning algorithms with this type of input, we need to
give the original dataset of points ``X`` to the estimator so that it knows
the points the indices refer to. We do this when initializing the estimator,
-through the argument `preprocessor`.
-
-.. topic:: Example:
-
->>> from metric_learn import MMC
->>> mmc = MMC(preprocessor=X)
->>> mmc.fit(pairs_indice, y)
+through the argument `preprocessor` (see :ref:`fit_ws` below).
.. note::
@@ -118,17 +123,85 @@ through the argument `preprocessor`.
paths in the filesystem, name of records in a database etc...) See section
:ref:`preprocessor_section` for more details on how to use the preprocessor.
-.. _sklearn_compat_ws:
+.. _fit_ws:
+
+Fit, transform, and so on
+-------------------------
+
+The goal of weakly-supervised metric-learning algorithms is to transform
+points into a new space in which the tuple-wise constraints between points
+are respected.
+
+>>> from metric_learn import MMC
+>>> mmc = MMC(random_state=42)
+>>> mmc.fit(tuples, y)
+MMC(A0='deprecated', convergence_threshold=0.001, diagonal=False,
+ diagonal_c=1.0, init=None, max_iter=100, max_proj=10000,
+ preprocessor=None, random_state=42, verbose=False)
+
+Or alternatively (using a preprocessor):
+
+>>> from metric_learn import MMC
+>>> mmc = MMC(preprocessor=X, random_state=42)
+>>> mmc.fit(pairs_indice, y)
+
+
+Now that the estimator is fitted, you can use it on new data for several
+purposes.
+
+First, you can transform the data in the learned space, using `transform`:
+Here we transform two points in the new embedding space.
+
+>>> X_new = np.array([[9.4, 4.1, 4.2], [2.1, 4.4, 2.3]])
+>>> mmc.transform(X_new)
+array([[-3.24667162e+01, 4.62622348e-07, 3.88325421e-08],
+ [-3.61531114e+01, 4.86778289e-07, 2.12654397e-08]])
+
+Also, as explained before, our metric learner has learned a distance between
+points. You can use this distance in two main ways:
+
+- You can either return the distance between pairs of points using the
+ `score_pairs` function:
+
+>>> mmc.score_pairs([[[3.5, 3.6, 5.2], [5.6, 2.4, 6.7]],
+... [[1.2, 4.2, 7.7], [2.1, 6.4, 0.9]]])
+array([7.27607365, 0.88853014])
+
+- Or you can return a function that will return the distance
+ (in the new space) between two 1D arrays (the coordinates of the points in
+ the original space), similarly to distance functions in
+  `scipy.spatial.distance`. To do that, use the `get_metric` method.
+
+>>> metric_fun = mmc.get_metric()
+>>> metric_fun([3.5, 3.6, 5.2], [5.6, 2.4, 6.7])
+7.276073646278203
+
+.. note::
+
+  If the metric learner that you use learns a Mahalanobis matrix (as is
+  the case for all algorithms currently in metric-learn), you can get the
+  plain Mahalanobis matrix using `get_mahalanobis_matrix`.
+
+>>> mmc.get_mahalanobis_matrix()
+array([[ 0.58603894, -5.69883982, -1.66614919],
+ [-5.69883982, 55.41743549, 16.20219519],
+ [-1.66614919, 16.20219519, 4.73697721]])
+
+.. TODO: remove the "like it is the case etc..." if it's not the case anymore
+
+.. _sklearn_compat_ws:
+
Scikit-learn compatibility
-==========================
+--------------------------
Weakly supervised estimators are compatible with scikit-learn routines for
model selection (grid-search, cross-validation etc). See the scoring section
-for more details on the scoring used in the case of Weakly Supervised
-Metric Learning.
+of the appropriate algorithm (:ref:`pairs learners `
+or :ref:`quadruplets learners `)
+for more details on the scoring used in the case of Weakly Supervised Metric
+Learning.
-.. topic:: Example
+Example:
>>> from metric_learn import MMC
>>> from sklearn.datasets import load_iris
@@ -141,13 +214,22 @@ Metric Learning.
>>> mmc = MMC(preprocessor=X)
>>> cross_val_score(mmc, pairs_indices, y)
-Scoring
-=======
+Prediction and scoring
+----------------------
+
+Weakly supervised learners are also able, after being fitted, to predict, for
+a given tuple, its label (for pairs) or its ordering (for quadruplets). See
+the appropriate section for more details, either :ref:`this
+one ` for pairs, or :ref:`this one
+` for quadruplets.
-Some default scoring are implemented in metric-learn, depending on the kind of
-tuples you're working with (pairs, triplets...). See the docstring of the
-`score` method of the estimator you use.
+They also implement a default scoring method, `score`, that can be
+used to evaluate the performance of a metric-learner on a test dataset. See
+the appropriate section for more details, either :ref:`this
+one ` for pairs, or :ref:`this one `
+for quadruplets.
+.. _learning_on_pairs:
Learning on pairs
=================
@@ -158,15 +240,46 @@ corresponding target containing ``n_samples`` values being either +1 or -1.
These values indicate whether the given pairs are similar points or
dissimilar points.
+Fitting
+-------
+Here is an example for fitting on pairs (see :ref:`fit_ws` for more details on
+the input data format and how to fit, in the general case of learning on
+tuples).
+
+>>> from metric_learn import MMC
+>>> pairs = np.array([[[1.2, 3.2], [2.3, 5.5]],
+>>> [[4.5, 2.3], [2.1, 2.3]]])
+>>> y_pairs = np.array([1, -1])
+>>> mmc = MMC(random_state=42)
+>>> mmc.fit(pairs, y_pairs)
+MMC(A0='deprecated', convergence_threshold=0.001, diagonal=False,
+ diagonal_c=1.0, init=None, max_iter=100, max_proj=10000, preprocessor=None,
+ random_state=42, verbose=False)
+
+Here, we learned a metric that puts the two first points closer
+together in the transformed space, and the two last points further away from
+each other.
+
+.. _pairs_predicting:
+
+Predicting
+----------
+
+When a pairs learner is fitted, it is also able to predict, for an
+upcoming pair, whether it is a pair of similar or dissimilar points.
+
+>>> mmc.predict([[[0.6, 1.6], [1.15, 2.75]],
+... [[3.2, 1.1], [5.4, 6.1]]])
+array([1, -1])
.. _calibration:
Thresholding
------------
In order to predict whether a new pair represents similar or dissimilar
-samples, we need to set a distance threshold, so that points closer (in the
-learned space) than this threshold are predicted as similar, and points further
-away are predicted as dissimilar. Several methods are possible for this
+samples, we in fact need to set a distance threshold, so that points closer (in
+the learned space) than this threshold are predicted as similar, and points
+further away are predicted as dissimilar. Several methods are possible for this
thresholding.
- **At fit time**: The threshold is set with `calibrate_threshold` (see
@@ -177,26 +290,73 @@ thresholding.
overfitting. If you want to avoid that, calibrate the threshold after
fitting, on a validation set.
+ >>> mmc.fit(pairs, y) # will fit the threshold automatically after fitting
+
- **Manual**: calling `set_threshold` will set the threshold to a
particular value.
+ >>> mmc.set_threshold(0.4)
+
- **Calibration**: calling `calibrate_threshold` will calibrate the
threshold to achieve a particular score on a validation set, the score
being among the classical scores for classification (accuracy, f1 score...).
+ >>> mmc.calibrate_threshold(pairs, y)
See also: `sklearn.calibration`.
+.. _pairs_scoring:
+
+Scoring
+-------
+
+Not only can pairs learners predict the label of given pairs, they can also
+return a `decision_function` for a set of pairs. It is basically the "score"
+that will be thresholded to find the prediction for the pair. This
+"score" is the opposite of the distance in the new space (a higher score means
+points are similar, and a lower score means they are dissimilar).
+
+>>> mmc.decision_function([[[0.6, 1.6], [1.15, 2.75]],
+... [[3.2, 1.1], [5.4, 6.1]]])
+array([-0.12811124, -0.74750256])
+
+This makes it possible to compute all kinds of estimator scores usually used
+in classic classification tasks, like `sklearn.metrics.accuracy_score` for
+instance, which can be used inside cross-validation routines:
+
+>>> from sklearn.model_selection import cross_val_score
+>>> pairs_test = np.array([[[0.6, 1.6], [1.15, 2.75]],
+... [[3.2, 1.1], [5.4, 6.1]],
+... [[7.7, 5.6], [1.23, 8.4]]])
+>>> y_test = np.array([-1., 1., -1.])
+>>> cross_val_score(mmc, pairs_test, y_test, scoring='accuracy')
+array([1., 0., 1.])
+
+Pairs learners also have a default score, which basically
+returns the `sklearn.metrics.roc_auc_score` (and is therefore not dependent on
+the threshold).
+
+>>> pairs_test = np.array([[[0.6, 1.6], [1.15, 2.75]],
+... [[3.2, 1.1], [5.4, 6.1]],
+... [[7.7, 5.6], [1.23, 8.4]]])
+>>> y_test = np.array([-1., 1., -1.])
+>>> mmc.score(pairs_test, y_test)
+0.5
+
+.. note::
+ See :ref:`fit_ws` for more details on metric learners functions that are
+ not specific to learning on pairs, like `transform`, `score_pairs`,
+ `get_metric` and `get_mahalanobis_matrix`.
Algorithms
-==========
+----------
.. _itml:
-ITML
-----
+:py:class:`ITML `
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Information Theoretic Metric Learning(:py:class:`ITML `)
+Information Theoretic Metric Learning(:py:class:`ITML `)
`ITML` minimizes the (differential) relative entropy, aka Kullback–Leibler
divergence, between two multivariate Gaussians subject to constraints on the
@@ -270,99 +430,13 @@ is the prior distance metric, set to identity matrix by default,
itml/
-.. _lsml:
-
-LSML
-----
-
-Metric Learning from Relative Comparisons by Minimizing Squared Residual
-(:py:class:`LSML `)
-
-`LSML` proposes a simple, yet effective, algorithm that minimizes a convex
-objective function corresponding to the sum of squared residuals of
-constraints. This algorithm uses the constraints in the form of the
-relative distance comparisons, such method is especially useful where
-pairwise constraints are not natural to obtain, thus pairwise constraints
-based algorithms become infeasible to be deployed. Furthermore, its sparsity
-extension leads to more stable estimation when the dimension is high and
-only a small amount of constraints is given.
-
-The loss function of each constraint
-:math:`d(\mathbf{x}_a, \mathbf{x}_b) < d(\mathbf{x}_c, \mathbf{x}_d)` is
-denoted as:
-
-.. math::
-
- H(d_\mathbf{M}(\mathbf{x}_a, \mathbf{x}_b)
- - d_\mathbf{M}(\mathbf{x}_c, \mathbf{x}_d))
-
-where :math:`H(\cdot)` is the squared Hinge loss function defined as:
-
-.. math::
-
- H(x) = \left\{\begin{aligned}0 \qquad x\leq 0 \\
- \,\,x^2 \qquad x>0\end{aligned}\right.\\
-
-The summed loss function :math:`L(C)` is the simple sum over all constraints
-:math:`C = \{(\mathbf{x}_a , \mathbf{x}_b , \mathbf{x}_c , \mathbf{x}_d)
-: d(\mathbf{x}_a , \mathbf{x}_b) < d(\mathbf{x}_c , \mathbf{x}_d)\}`. The
-original paper suggested here should be a weighted sum since the confidence
-or probability of each constraint might differ. However, for the sake of
-simplicity and assumption of no extra knowledge provided, we just deploy
-the simple sum here as well as what the authors did in the experiments.
-
-The distance metric learning problem becomes minimizing the summed loss
-function of all constraints plus a regularization term w.r.t. the prior
-knowledge:
-
-.. math::
-
- \min_\mathbf{M}(D_{ld}(\mathbf{M, M_0}) + \sum_{(\mathbf{x}_a,
- \mathbf{x}_b, \mathbf{x}_c, \mathbf{x}_d)\in C}H(d_\mathbf{M}(
- \mathbf{x}_a, \mathbf{x}_b) - d_\mathbf{M}(\mathbf{x}_c, \mathbf{x}_c))\\
-
-where :math:`\mathbf{M}_0` is the prior metric matrix, set as identity
-by default, :math:`D_{ld}(\mathbf{\cdot, \cdot})` is the LogDet divergence:
-
-.. math::
-
- D_{ld}(\mathbf{M, M_0}) = \text{tr}(\mathbf{MM_0}) − \text{logdet}
- (\mathbf{M})
-
-.. topic:: Example Code:
-
-::
-
- from metric_learn import LSML
-
- quadruplets = [[[1.2, 7.5], [1.3, 1.5], [6.4, 2.6], [6.2, 9.7]],
- [[1.3, 4.5], [3.2, 4.6], [6.2, 5.5], [5.4, 5.4]],
- [[3.2, 7.5], [3.3, 1.5], [8.4, 2.6], [8.2, 9.7]],
- [[3.3, 4.5], [5.2, 4.6], [8.2, 5.5], [7.4, 5.4]]]
-
- # we want to make closer points where the first feature is close, and
- # further if the second feature is close
-
- lsml = LSML()
- lsml.fit(quadruplets)
-
-.. topic:: References:
-
- .. [1] Liu et al.
- "Metric Learning from Relative Comparisons by Minimizing Squared
- Residual". ICDM 2012. http://www.cs.ucla.edu/~weiwang/paper/ICDM12.pdf
-
- .. [2] Adapted from https://gist.github.com/kcarnold/5439917
-
.. _sdml:
-=======
-
-SDML
-----
+:py:class:`SDML `
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Sparse High-Dimensional Metric Learning
-(:py:class:`SDML `)
+(:py:class:`SDML `)
`SDML` is an efficient sparse metric learning in high-dimensional space via
double regularization: an L1-penalization on the off-diagonal elements of the
@@ -418,10 +492,10 @@ is the off-diagonal L1 norm.
.. _rca:
-RCA
----
+:py:class:`RCA `
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Relative Components Analysis (:py:class:`RCA `)
+Relative Components Analysis (:py:class:`RCA `)
`RCA` learns a full rank Mahalanobis distance metric based on a weighted sum of
in-chunklets covariance matrices. It applies a global linear transformation to
@@ -474,11 +548,11 @@ as the Mahalanobis matrix.
.. _mmc:
-MMC
----
+:py:class:`MMC `
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Metric Learning with Application for Clustering with Side Information
-(:py:class:`MMC `)
+(:py:class:`MMC `)
`MMC` minimizes the sum of squared distances between similar points, while
enforcing the sum of distances between dissimilar ones to be greater than one.
@@ -528,23 +602,185 @@ points, while constrains the sum of distances between dissimilar points:
.. [2] Adapted from Matlab code `here `_.
+
+.. _learning_on_quadruplets:
+
Learning on quadruplets
=======================
-A type of information even weaker than pairs is information about relative
-comparisons between pairs. The user should provide the algorithm with a
-quadruplet of points, where the two first points are closer than the two
-last points. No target vector (``y``) is needed, since the supervision is
-already in the order that points are given in the quadruplet.
+
+
+The goal of weakly-supervised metric-learning algorithms is to transform
+points into a new space in which the tuple-wise constraints between points
+are respected.
+
+Fitting
+-------
+Here is an example for fitting on quadruplets (see :ref:`fit_ws` for more
+details on the input data format and how to fit, in the general case of
+learning on tuples).
+
+>>> from metric_learn import LSML
+>>> quadruplets = np.array([[[1.2, 3.2], [2.3, 5.5], [2.4, 6.7], [2.1, 0.6]],
+>>> [[4.5, 2.3], [2.1, 2.3], [0.6, 1.2], [7.3, 3.4]]])
+>>> lsml = LSML(random_state=42)
+>>> lsml.fit(quadruplets)
+LSML(max_iter=1000, preprocessor=None, prior=None, random_state=42, tol=0.001,
+ verbose=False)
+
+Or alternatively (using a preprocessor):
+
+>>> X = np.array([[1.2, 3.2],
+>>> [2.3, 5.5],
+>>> [2.4, 6.7],
+>>> [2.1, 0.6],
+>>> [4.5, 2.3],
+>>> [2.1, 2.3],
+>>> [0.6, 1.2],
+>>> [7.3, 3.4]])
+>>> quadruplets_indices = np.array([[0, 1, 2, 3], [4, 5, 6, 7]])
+>>> lsml = LSML(preprocessor=X, random_state=42)
+>>> lsml.fit(quadruplets_indices)
+LSML(max_iter=1000,
+ preprocessor=array([[1.2, 3.2],
+ [2.3, 5.5],
+ [2.4, 6.7],
+ [2.1, 0.6],
+ [4.5, 2.3],
+ [2.1, 2.3],
+ [0.6, 1.2],
+ [7.3, 3.4]]),
+ prior=None, random_state=42, tol=0.001, verbose=False)
+
+
+Here, we want to learn a metric that, for each of the two
+`quadruplets`, will put the two first points closer to each other than the
+two last points are.
+
+.. _quadruplets_predicting:
+
+Predicting
+----------
+
+When a quadruplets learner is fitted, it is also able to predict, for an
+upcoming quadruplet, whether the two first points are more similar than the
+two last points (+1), or not (-1).
+
+>>> quadruplets_test = np.array(
+... [[[5.6, 5.3], [2.2, 2.1], [0.4, 0.6], [1.2, 3.4]],
+... [[6.0, 4.2], [4.3, 1.2], [4.5, 0.6], [0.1, 7.8]]])
+>>> lsml.predict(quadruplets_test)
+array([-1., 1.])
+
+.. _quadruplets_scoring:
+
+Scoring
+-------
+
+Not only can quadruplets learners predict the ordering of given quadruplets,
+they can also return a `decision_function` for a set of quadruplets. It is
+basically the "score" whose sign will be taken to find the prediction for the
+quadruplet. This "score" is the difference between the distance between the
+two last points and the distance between the two first points of the
+quadruplet (a higher score means the two last points are more likely to be
+more dissimilar than the two first points, i.e. more likely to give a +1
+prediction, since that is the right ordering).
+
+>>> lsml.decision_function(quadruplets_test)
+array([-1.75700306, 4.98982131])
+
+In the above example, for the first quadruplet in `quadruplets_test`, the
+two first points are predicted to be less similar than the two last points
+(they are further away in the transformed space).
+
+Unlike pairs learners, quadruplets learners do not take a `y` when fitting,
+which means that scikit-learn scoring functions cannot be used with them,
+like:
+
+>>> from sklearn.model_selection import cross_val_score
+>>> cross_val_score(lsml, quadruplets, scoring='f1_score') # this won't work
+
+(This is actually intentional, for more details
+about that, see
+`this comment `_
+on github.)
+
+However, quadruplets learners do have a default scoring function, which will
+basically return the accuracy score on a given test set, i.e. the proportion
+of quadruplets that have the right predicted ordering.
+
+>>> lsml.score(quadruplets_test)
+0.5
+
+.. note::
+ See :ref:`fit_ws` for more details on metric learners functions that are
+ not specific to learning on pairs, like `transform`, `score_pairs`,
+ `get_metric` and `get_mahalanobis_matrix`.
+
+
+
Algorithms
-==========
+----------
+
+.. _lsml:
-LSML
-----
+:py:class:`LSML `
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-`LSML`: Metric Learning from Relative Comparisons by Minimizing Squared
-Residual
+Metric Learning from Relative Comparisons by Minimizing Squared Residual
+(:py:class:`LSML `)
+
+`LSML` proposes a simple, yet effective, algorithm that minimizes a convex
+objective function corresponding to the sum of squared residuals of
+constraints. This algorithm uses constraints in the form of relative
+distance comparisons, which makes it especially useful when pairwise
+constraints are not natural to obtain, and pairwise-constraint-based
+algorithms therefore become infeasible to deploy. Furthermore, its sparsity
+extension leads to more stable estimation when the dimension is high and
+only a small number of constraints is given.
+
+The loss function of each constraint
+:math:`d(\mathbf{x}_a, \mathbf{x}_b) < d(\mathbf{x}_c, \mathbf{x}_d)` is
+denoted as:
+
+.. math::
+
+ H(d_\mathbf{M}(\mathbf{x}_a, \mathbf{x}_b)
+ - d_\mathbf{M}(\mathbf{x}_c, \mathbf{x}_d))
+
+where :math:`H(\cdot)` is the squared Hinge loss function defined as:
+
+.. math::
+
+ H(x) = \left\{\begin{aligned}0 \qquad x\leq 0 \\
+ \,\,x^2 \qquad x>0\end{aligned}\right.\\
+
+The summed loss function :math:`L(C)` is the simple sum over all constraints
+:math:`C = \{(\mathbf{x}_a , \mathbf{x}_b , \mathbf{x}_c , \mathbf{x}_d)
+: d(\mathbf{x}_a , \mathbf{x}_b) < d(\mathbf{x}_c , \mathbf{x}_d)\}`. The
+original paper suggests that this should be a weighted sum, since the
+confidence or probability of each constraint might differ. However, for the
+sake of simplicity and assuming no extra knowledge is provided, we just use
+the simple sum here, as the authors did in their experiments.
+
+The distance metric learning problem becomes minimizing the summed loss
+function of all constraints plus a regularization term w.r.t. the prior
+knowledge:
+
+.. math::
+
+  \min_\mathbf{M}(D_{ld}(\mathbf{M, M_0}) + \sum_{(\mathbf{x}_a,
+  \mathbf{x}_b, \mathbf{x}_c, \mathbf{x}_d)\in C}H(d_\mathbf{M}(
+  \mathbf{x}_a, \mathbf{x}_b) - d_\mathbf{M}(\mathbf{x}_c, \mathbf{x}_d))\\
+
+where :math:`\mathbf{M}_0` is the prior metric matrix, set as identity
+by default, :math:`D_{ld}(\mathbf{\cdot, \cdot})` is the LogDet divergence:
+
+.. math::
+
+ D_{ld}(\mathbf{M, M_0}) = \text{tr}(\mathbf{MM_0}) − \text{logdet}
+ (\mathbf{M})
.. topic:: Example Code:
@@ -570,3 +806,5 @@ Residual
Residual". ICDM 2012. http://www.cs.ucla.edu/~weiwang/paper/ICDM12.pdf
.. [2] Adapted from https://gist.github.com/kcarnold/5439917
+
+
diff --git a/examples/plot_metric_learning_examples.py b/examples/plot_metric_learning_examples.py
index fd6cff20..b46d1adc 100644
--- a/examples/plot_metric_learning_examples.py
+++ b/examples/plot_metric_learning_examples.py
@@ -130,7 +130,7 @@ def plot_tsne(X, y, colormap=plt.cm.Paired):
#
# - See more in the :ref:`User Guide `
# - See more in the documentation of the class :py:class:`LMNN
-# `
+# `
######################################################################
@@ -139,7 +139,7 @@ def plot_tsne(X, y, colormap=plt.cm.Paired):
#
# setting up LMNN
-lmnn = metric_learn.LMNN(k=5, learn_rate=1e-6)
+lmnn = metric_learn.LMNN(k=5, learn_rate=1e-6, init='random')
# fit the data!
lmnn.fit(X, y)
@@ -181,7 +181,7 @@ def plot_tsne(X, y, colormap=plt.cm.Paired):
#
# - See more in the :ref:`User Guide `
# - See more in the documentation of the class :py:class:`ITML
-# `
+# `
itml = metric_learn.ITML_Supervised()
X_itml = itml.fit_transform(X, y)
@@ -200,12 +200,12 @@ def plot_tsne(X, y, colormap=plt.cm.Paired):
#
# - See more in the :ref:`User Guide `
# - See more in the documentation of the class :py:class:`MMC
-# `
+# `
-itml = metric_learn.ITML_Supervised()
-X_itml = itml.fit_transform(X, y)
+mmc = metric_learn.MMC_Supervised()
+X_mmc = mmc.fit_transform(X, y)
-plot_tsne(X_itml, y)
+plot_tsne(X_mmc, y)
######################################################################
# Sparse Determinant Metric Learning
@@ -219,9 +219,10 @@ def plot_tsne(X, y, colormap=plt.cm.Paired):
#
# - See more in the :ref:`User Guide `
# - See more in the documentation of the class :py:class:`SDML
-# `
+# `
-sdml = metric_learn.SDML_Supervised(sparsity_param=0.1, balance_param=0.0015)
+sdml = metric_learn.SDML_Supervised(sparsity_param=0.1, balance_param=0.0015,
+ prior='covariance')
X_sdml = sdml.fit_transform(X, y)
plot_tsne(X_sdml, y)
@@ -238,9 +239,10 @@ def plot_tsne(X, y, colormap=plt.cm.Paired):
#
# - See more in the :ref:`User Guide `
# - See more in the documentation of the class :py:class:`LSML
-# `
+# `
-lsml = metric_learn.LSML_Supervised(tol=0.0001, max_iter=10000)
+lsml = metric_learn.LSML_Supervised(tol=0.0001, max_iter=10000,
+ prior='covariance')
X_lsml = lsml.fit_transform(X, y)
plot_tsne(X_lsml, y)
@@ -265,7 +267,7 @@ def plot_tsne(X, y, colormap=plt.cm.Paired):
#
# - See more in the :ref:`User Guide `
# - See more in the documentation of the class :py:class:`NCA
-# `
+# `
nca = metric_learn.NCA(max_iter=1000)
X_nca = nca.fit_transform(X, y)
@@ -285,7 +287,7 @@ def plot_tsne(X, y, colormap=plt.cm.Paired):
#
# - See more in the :ref:`User Guide `
# - See more in the documentation of the class :py:class:`LFDA
-# `
+# `
lfda = metric_learn.LFDA(k=2, num_dims=2)
X_lfda = lfda.fit_transform(X, y)
@@ -306,7 +308,7 @@ def plot_tsne(X, y, colormap=plt.cm.Paired):
#
# - See more in the :ref:`User Guide `
# - See more in the documentation of the class :py:class:`RCA
-# `
+# `
rca = metric_learn.RCA_Supervised(num_chunks=30, chunk_size=2)
X_rca = rca.fit_transform(X, y)
@@ -326,7 +328,7 @@ def plot_tsne(X, y, colormap=plt.cm.Paired):
#
# - See more in the :ref:`User Guide `
# - See more in the documentation of the class :py:class:`MLKR
-# `
+# `
#
# To illustrate MLKR, let's use the dataset
# `sklearn.datasets.make_regression` the same way as we did with the
@@ -445,8 +447,8 @@ def create_constraints(labels):
######################################################################
# Using our constraints, let's now train ITML again. Note that we are no
# longer calling the supervised class :py:class:`ITML_Supervised
-# ` but the more generic
-# (weakly-supervised) :py:class:`ITML `, which
+# ` but the more generic
+# (weakly-supervised) :py:class:`ITML `, which
# takes the dataset `X` through the `preprocessor` argument (see
# :ref:`this section ` of the documentation to learn
# about more advanced uses of `preprocessor`) and the pair information `pairs`
diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py
index aa7d66dd..1e5fa974 100644
--- a/metric_learn/base_metric.py
+++ b/metric_learn/base_metric.py
@@ -1,3 +1,7 @@
+"""
+Base module.
+"""
+
from sklearn.base import BaseEstimator
from sklearn.utils.extmath import stable_cumsum
from sklearn.utils.validation import _is_arraylike, check_is_fitted
@@ -10,16 +14,17 @@
class BaseMetricLearner(six.with_metaclass(ABCMeta, BaseEstimator)):
+ """
+ Base class for all metric-learners.
- def __init__(self, preprocessor=None):
- """
+ Parameters
+ ----------
+ preprocessor : array-like, shape=(n_samples, n_features) or callable
+ The preprocessor to call to get tuples from indices. If array-like,
+ tuples will be gotten like this: X[indices].
+ """
- Parameters
- ----------
- preprocessor : array-like, shape=(n_samples, n_features) or callable
- The preprocessor to call to get tuples from indices. If array-like,
- tuples will be gotten like this: X[indices].
- """
+ def __init__(self, preprocessor=None):
self.preprocessor = preprocessor
@abstractmethod
@@ -277,6 +282,8 @@ def metric_fun(u, v, squared=False):
get_metric.__doc__ = BaseMetricLearner.get_metric.__doc__
def metric(self):
+ """Deprecated. Will be removed in v0.6.0. Use `get_mahalanobis_matrix`
+ instead"""
# TODO: remove this method in version 0.6.0
warnings.warn(("`metric` is deprecated since version 0.5.0 and will be "
"removed in 0.6.0. Use `get_mahalanobis_matrix` instead."),
@@ -295,7 +302,8 @@ def get_mahalanobis_matrix(self):
class _PairsClassifierMixin(BaseMetricLearner):
- """
+ """Base class for pairs learners.
+
Attributes
----------
threshold_ : `float`
@@ -567,6 +575,8 @@ def _validate_calibration_params(strategy='accuracy', min_rate=None,
class _QuadrupletsClassifierMixin(BaseMetricLearner):
+ """Base class for quadruplets learners.
+ """
_tuple_size = 4 # number of points in a tuple, 4 for quadruplets
@@ -578,7 +588,7 @@ def predict(self, quadruplets):
Parameters
----------
- quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or
+ quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or \
(n_quadruplets, 4)
3D Array of quadruplets to predict, with each row corresponding to four
points, or 2D array of indices of quadruplets if the metric learner
@@ -607,7 +617,7 @@ def decision_function(self, quadruplets):
Parameters
----------
- quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or
+ quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or \
(n_quadruplets, 4)
3D Array of quadruplets to predict, with each row corresponding to four
points, or 2D array of indices of quadruplets if the metric learner
@@ -630,7 +640,7 @@ def score(self, quadruplets):
Parameters
----------
- quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or
+ quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or \
(n_quadruplets, 4)
3D Array of quadruplets to score, with each row corresponding to four
points, or 2D array of indices of quadruplets if the metric learner
diff --git a/metric_learn/constraints.py b/metric_learn/constraints.py
index e591830b..069a6564 100644
--- a/metric_learn/constraints.py
+++ b/metric_learn/constraints.py
@@ -11,6 +11,11 @@
class Constraints(object):
+ """
+ Class to build constraints from labels.
+
+ See more in the :ref:`User Guide `
+ """
def __init__(self, partial_labels):
'''partial_labels : int arraylike, -1 indicating unknown label'''
partial_labels = np.asanyarray(partial_labels, dtype=int)
diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py
index 19dad5d8..b9666494 100644
--- a/metric_learn/covariance.py
+++ b/metric_learn/covariance.py
@@ -1,11 +1,5 @@
"""
Covariance metric (baseline method)
-
-This method does not "learn" anything, rather it calculates
-the covariance matrix of the input data.
-
-This is a simple baseline method first introduced in
-On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936
"""
from __future__ import absolute_import
@@ -20,11 +14,28 @@
class Covariance(MahalanobisMixin, TransformerMixin):
"""Covariance metric (baseline method)
+ This method does not "learn" anything, rather it calculates
+ the covariance matrix of the input data.
+
+ This is a simple baseline method first introduced in
+ On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936
+
+ Read more in the :ref:`User Guide `.
+
Attributes
----------
transformer_ : `numpy.ndarray`, shape=(n_features, n_features)
The linear transformation ``L`` deduced from the learned Mahalanobis
metric (See function `transformer_from_metric`.)
+
+ Examples
+ --------
+ >>> from metric_learn import Covariance
+ >>> from sklearn.datasets import load_iris
+ >>> iris = load_iris()['data']
+ >>> cov = Covariance().fit(iris)
+ >>> x = cov.transform(iris)
+
"""
def __init__(self, preprocessor=None):
diff --git a/metric_learn/itml.py b/metric_learn/itml.py
index 21303c18..16fc21db 100644
--- a/metric_learn/itml.py
+++ b/metric_learn/itml.py
@@ -1,17 +1,5 @@
-r"""
-Information Theoretic Metric Learning(ITML)
-
-`ITML` minimizes the (differential) relative entropy, aka Kullback-Leibler
-divergence, between two multivariate Gaussians subject to constraints on the
-associated Mahalanobis distance, which can be formulated into a Bregman
-optimization problem by minimizing the LogDet divergence subject to
-linear constraints. This algorithm can handle a wide variety of constraints
-and can optionally incorporate a prior on the distance function. Unlike some
-other methods, `ITML` does not rely on an eigenvalue computation or
-semi-definite programming.
-
-Read more in the :ref:`User Guide `.
-
+"""
+Information Theoretic Metric Learning (ITML)
"""
from __future__ import print_function, absolute_import
@@ -34,55 +22,6 @@ class _BaseITML(MahalanobisMixin):
def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3,
prior='identity', A0='deprecated', verbose=False,
preprocessor=None, random_state=None):
- """Initialize ITML.
-
- Parameters
- ----------
- gamma : float, optional
- value for slack variables
-
- max_iter : int, optional
-
- convergence_threshold : float, optional
-
- prior : string or numpy array, optional (default='identity')
- The Mahalanobis matrix to use as a prior. Possible options are
- 'identity', 'covariance', 'random', and a numpy array of shape
- (n_features, n_features). For ITML, the prior should be strictly
- positive definite (PD).
-
- 'identity'
- An identity matrix of shape (n_features, n_features).
-
- 'covariance'
- The inverse covariance matrix.
-
- 'random'
- The prior will be a random SPD matrix of shape
- `(n_features, n_features)`, generated using
- `sklearn.datasets.make_spd_matrix`.
-
- numpy array
- A positive definite (PD) matrix of shape
- (n_features, n_features), that will be used as such to set the
- prior.
-
- A0 : Not used
- .. deprecated:: 0.5.0
- `A0` was deprecated in version 0.5.0 and will
- be removed in 0.6.0. Use 'prior' instead.
-
- verbose : bool, optional
- if True, prints information while learning
-
- preprocessor : array-like, shape=(n_samples, n_features) or callable
- The preprocessor to call to get tuples from indices. If array-like,
- tuples will be formed like this: X[indices].
-
- random_state : int or numpy.RandomState or None, optional (default=None)
- A pseudo random number generator object or a seed for it if int. If
- ``prior='random'``, ``random_state`` is used to set the prior.
- """
self.gamma = gamma
self.max_iter = max_iter
self.convergence_threshold = convergence_threshold
@@ -172,6 +111,66 @@ def _fit(self, pairs, y, bounds=None):
class ITML(_BaseITML, _PairsClassifierMixin):
"""Information Theoretic Metric Learning (ITML)
+ `ITML` minimizes the (differential) relative entropy, aka Kullback-Leibler
+ divergence, between two multivariate Gaussians subject to constraints on the
+ associated Mahalanobis distance, which can be formulated into a Bregman
+ optimization problem by minimizing the LogDet divergence subject to
+ linear constraints. This algorithm can handle a wide variety of constraints
+ and can optionally incorporate a prior on the distance function. Unlike some
+ other methods, `ITML` does not rely on an eigenvalue computation or
+ semi-definite programming.
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
+ ----------
+ gamma : float, optional (default=1.)
+ Value for slack variables
+
+ max_iter : int, optional (default=1000)
+ Maximum number of iteration of the optimization procedure.
+
+ convergence_threshold : float, optional (default=1e-3)
+ Convergence tolerance.
+
+ prior : string or numpy array, optional (default='identity')
+ The Mahalanobis matrix to use as a prior. Possible options are
+ 'identity', 'covariance', 'random', and a numpy array of shape
+ (n_features, n_features). For ITML, the prior should be strictly
+ positive definite (PD).
+
+ 'identity'
+ An identity matrix of shape (n_features, n_features).
+
+ 'covariance'
+ The inverse covariance matrix.
+
+ 'random'
+ The prior will be a random SPD matrix of shape
+ `(n_features, n_features)`, generated using
+ `sklearn.datasets.make_spd_matrix`.
+
+ numpy array
+ A positive definite (PD) matrix of shape
+ (n_features, n_features), that will be used as such to set the
+ prior.
+
+ A0 : Not used
+ .. deprecated:: 0.5.0
+ `A0` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0. Use 'prior' instead.
+
+ verbose : bool, optional (default=False)
+ If True, prints information while learning
+
+ preprocessor : array-like, shape=(n_samples, n_features) or callable
+ The preprocessor to call to get tuples from indices. If array-like,
+ tuples will be formed like this: X[indices].
+
+ random_state : int or numpy.RandomState or None, optional (default=None)
+ A pseudo random number generator object or a seed for it if int. If
+ ``prior='random'``, ``random_state`` is used to set the prior.
+
Attributes
----------
bounds_ : `numpy.ndarray`, shape=(2,)
@@ -194,6 +193,22 @@ class ITML(_BaseITML, _PairsClassifierMixin):
If the distance metric between two points is lower than this threshold,
points will be classified as similar, otherwise they will be
classified as dissimilar.
+
+ Examples
+ --------
+ >>> from metric_learn import ITML_Supervised
+ >>> from sklearn.datasets import load_iris
+ >>> iris_data = load_iris()
+ >>> X = iris_data['data']
+ >>> Y = iris_data['target']
+ >>> itml = ITML_Supervised(num_constraints=200)
+ >>> itml.fit(X, Y)
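+
+  `ITML` itself is fit in the weakly-supervised setting, directly on pairs
+  of points labeled as similar (+1) or dissimilar (-1), as described in the
+  `fit` method below. A minimal sketch, with arbitrary toy coordinates:
+
+  >>> from metric_learn import ITML
+  >>> pairs = [[[1.2, 7.5], [1.3, 1.5]],
+  ...          [[6.4, 2.6], [6.2, 9.7]],
+  ...          [[1.3, 4.5], [3.2, 4.6]],
+  ...          [[6.2, 5.5], [5.4, 5.4]]]
+  >>> y = [1, 1, -1, -1]
+  >>> itml = ITML()
+  >>> itml.fit(pairs, y)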
+
+ References
+ ----------
+ .. [1] `Information-theoretic Metric Learning
+ `_ Jason V. Davis, et al.
"""
def fit(self, pairs, y, bounds=None, calibration_params=None):
@@ -204,7 +219,7 @@ def fit(self, pairs, y, bounds=None, calibration_params=None):
Parameters
----------
- pairs: array-like, shape=(n_constraints, 2, n_features) or
+ pairs: array-like, shape=(n_constraints, 2, n_features) or \
(n_constraints, 2)
3D Array of pairs with each row corresponding to two points,
or 2D array of indices of pairs if the metric learner uses a
@@ -240,6 +255,64 @@ def fit(self, pairs, y, bounds=None, calibration_params=None):
class ITML_Supervised(_BaseITML, TransformerMixin):
"""Supervised version of Information Theoretic Metric Learning (ITML)
+  `ITML_Supervised` creates pairs of similar samples by taking samples from
+  the same class, and pairs of dissimilar samples by taking samples from
+  different classes. It then passes these pairs to `ITML` for training.
+
+ Parameters
+ ----------
+  gamma : float, optional (default=1.)
+    Value for slack variables
+  max_iter : int, optional (default=1000)
+    Maximum number of iterations of the optimization procedure.
+  convergence_threshold : float, optional (default=1e-3)
+    Convergence tolerance.
+  num_labeled : Not used
+    .. deprecated:: 0.5.0
+      `num_labeled` was deprecated in version 0.5.0 and will
+      be removed in 0.6.0.
+  num_constraints : int, optional
+    Number of constraints to generate.
+ bounds : Not used
+ .. deprecated:: 0.5.0
+ `bounds` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0. Set `bounds` at fit time instead :
+ `itml_supervised.fit(X, y, bounds=...)`
+
+ prior : string or numpy array, optional (default='identity')
+ Initialization of the Mahalanobis matrix. Possible options are
+ 'identity', 'covariance', 'random', and a numpy array of shape
+ (n_features, n_features). For ITML, the prior should be strictly
+ positive definite (PD).
+
+ 'identity'
+ An identity matrix of shape (n_features, n_features).
+
+ 'covariance'
+ The inverse covariance matrix.
+
+ 'random'
+ The prior will be a random SPD matrix of shape
+ `(n_features, n_features)`, generated using
+ `sklearn.datasets.make_spd_matrix`.
+
+ numpy array
+ A positive definite (PD) matrix of shape
+ (n_features, n_features), that will be used as such to set the
+ prior.
+
+ A0 : Not used
+ .. deprecated:: 0.5.0
+ `A0` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0. Use 'prior' instead.
+ verbose : bool, optional
+ if True, prints information while learning
+ preprocessor : array-like, shape=(n_samples, n_features) or callable
+ The preprocessor to call to get tuples from indices. If array-like,
+ tuples will be formed like this: X[indices].
+ random_state : int or numpy.RandomState or None, optional (default=None)
+ A pseudo random number generator object or a seed for it if int. If
+ ``prior='random'``, ``random_state`` is used to set the prior.
+
+
Attributes
----------
bounds_ : `numpy.ndarray`, shape=(2,)
@@ -257,71 +330,18 @@ class ITML_Supervised(_BaseITML, TransformerMixin):
transformer_ : `numpy.ndarray`, shape=(n_features, n_features)
The linear transformation ``L`` deduced from the learned Mahalanobis
metric (See function `transformer_from_metric`.)
+
+ See Also
+ --------
+ metric_learn.ITML : The original weakly-supervised algorithm
+ :ref:`supervised_version` : The section of the project documentation
+ that describes the supervised version of weakly supervised estimators.
"""
def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3,
num_labeled='deprecated', num_constraints=None,
bounds='deprecated', prior='identity', A0='deprecated',
verbose=False, preprocessor=None, random_state=None):
- """Initialize the supervised version of `ITML`.
-
- `ITML_Supervised` creates pairs of similar sample by taking same class
- samples, and pairs of dissimilar samples by taking different class
- samples. It then passes these pairs to `ITML` for training.
-
- Parameters
- ----------
- gamma : float, optional
- value for slack variables
- max_iter : int, optional
- convergence_threshold : float, optional
- num_labeled : Not used
- .. deprecated:: 0.5.0
- `num_labeled` was deprecated in version 0.5.0 and will
- be removed in 0.6.0.
- num_constraints: int, optional
- number of constraints to generate
- bounds : Not used
- .. deprecated:: 0.5.0
- `bounds` was deprecated in version 0.5.0 and will
- be removed in 0.6.0. Set `bounds` at fit time instead :
- `itml_supervised.fit(X, y, bounds=...)`
-
- prior : string or numpy array, optional (default='identity')
- Initialization of the Mahalanobis matrix. Possible options are
- 'identity', 'covariance', 'random', and a numpy array of shape
- (n_features, n_features). For ITML, the prior should be strictly
- positive definite (PD).
-
- 'identity'
- An identity matrix of shape (n_features, n_features).
-
- 'covariance'
- The inverse covariance matrix.
-
- 'random'
- The prior will be a random SPD matrix of shape
- `(n_features, n_features)`, generated using
- `sklearn.datasets.make_spd_matrix`.
-
- numpy array
- A positive definite (PD) matrix of shape
- (n_features, n_features), that will be used as such to set the
- prior.
-
- A0 : Not used
- .. deprecated:: 0.5.0
- `A0` was deprecated in version 0.5.0 and will
- be removed in 0.6.0. Use 'prior' instead.
- verbose : bool, optional
- if True, prints information while learning
- preprocessor : array-like, shape=(n_samples, n_features) or callable
- The preprocessor to call to get tuples from indices. If array-like,
- tuples will be formed like this: X[indices].
- random_state : int or numpy.RandomState or None, optional (default=None)
- A pseudo random number generator object or a seed for it if int. If
- ``prior='random'``, ``random_state`` is used to set the prior.
- """
_BaseITML.__init__(self, gamma=gamma, max_iter=max_iter,
convergence_threshold=convergence_threshold,
A0=A0, prior=prior, verbose=verbose,
diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py
index 1851a734..6c651b7b 100644
--- a/metric_learn/lfda.py
+++ b/metric_learn/lfda.py
@@ -1,13 +1,5 @@
-r"""
-Local Fisher Discriminant Analysis(LFDA)
-
-LFDA is a linear supervised dimensionality reduction method. It is
-particularly useful when dealing with multimodality, where one ore more classes
-consist of separate clusters in input space. The core optimization problem of
-LFDA is solved as a generalized eigenvalue problem.
-
-Read more in the :ref:`User Guide `.
-
+"""
+Local Fisher Discriminant Analysis (LFDA)
"""
from __future__ import division, absolute_import
import numpy as np
@@ -24,44 +16,69 @@
class LFDA(MahalanobisMixin, TransformerMixin):
'''
Local Fisher Discriminant Analysis for Supervised Dimensionality Reduction
- Sugiyama, ICML 2006
- Attributes
+ LFDA is a linear supervised dimensionality reduction method. It is
+  particularly useful when dealing with multimodality, where one or more
+ classes consist of separate clusters in input space. The core optimization
+ problem of LFDA is solved as a generalized eigenvalue problem.
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
----------
- transformer_ : `numpy.ndarray`, shape=(n_components, n_features)
- The learned linear transformation ``L``.
- '''
+ n_components : int or None, optional (default=None)
+ Dimensionality of reduced space (if None, defaults to dimension of X).
- def __init__(self, n_components=None, num_dims='deprecated',
- k=None, embedding_type='weighted', preprocessor=None):
- '''
- Initialize LFDA.
+ num_dims : Not used
- Parameters
- ----------
- n_components : int or None, optional (default=None)
- Dimensionality of reduced space (if None, defaults to dimension of X).
+ .. deprecated:: 0.5.0
+ `num_dims` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0. Use `n_components` instead.
- num_dims : Not used
+ k : int, optional
+ Number of nearest neighbors used in local scaling method.
+ Defaults to min(7, n_components - 1).
- .. deprecated:: 0.5.0
- `num_dims` was deprecated in version 0.5.0 and will
- be removed in 0.6.0. Use `n_components` instead.
+ embedding_type : str, optional
+ Type of metric in the embedding space (default: 'weighted')
+ 'weighted' - weighted eigenvectors
+ 'orthonormalized' - orthonormalized
+ 'plain' - raw eigenvectors
- k : int, optional
- Number of nearest neighbors used in local scaling method.
- Defaults to min(7, n_components - 1).
+ preprocessor : array-like, shape=(n_samples, n_features) or callable
+ The preprocessor to call to get tuples from indices. If array-like,
+ tuples will be formed like this: X[indices].
+
+ Attributes
+ ----------
+ transformer_ : `numpy.ndarray`, shape=(n_components, n_features)
+ The learned linear transformation ``L``.
- embedding_type : str, optional
- Type of metric in the embedding space (default: 'weighted')
- 'weighted' - weighted eigenvectors
- 'orthonormalized' - orthonormalized
- 'plain' - raw eigenvectors
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> from metric_learn import LFDA
+ >>> from sklearn.datasets import load_iris
+ >>> iris_data = load_iris()
+ >>> X = iris_data['data']
+ >>> Y = iris_data['target']
+  >>> lfda = LFDA(k=2, n_components=2)
+ >>> lfda.fit(X, Y)
+
+ References
+  ----------
+ .. [1] `Dimensionality Reduction of Multimodal Labeled Data by Local Fisher
+ Discriminant Analysis `_
+ Masashi Sugiyama.
+
+ .. [2] `Local Fisher Discriminant Analysis on Beer Style Clustering
+ `_ Yuan Tang.
+ '''
- preprocessor : array-like, shape=(n_samples, n_features) or callable
- The preprocessor to call to get tuples from indices. If array-like,
- tuples will be formed like this: X[indices].
- '''
+ def __init__(self, n_components=None, num_dims='deprecated',
+ k=None, embedding_type='weighted', preprocessor=None):
if embedding_type not in ('weighted', 'orthonormalized', 'plain'):
raise ValueError('Invalid embedding_type: %r' % embedding_type)
self.n_components = n_components
diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py
index 20eeea3b..600d55c0 100644
--- a/metric_learn/lmnn.py
+++ b/metric_learn/lmnn.py
@@ -1,16 +1,7 @@
-r"""
-Large Margin Nearest Neighbor Metric learning(LMNN)
-
-LMNN learns a Mahalanobis distance metric in the kNN classification
-setting. The learned metric attempts to keep close k-nearest neighbors
-from the same class, while keeping examples from different classes
-separated by a large margin. This algorithm makes no assumptions about
-the distribution of the data.
-
-Read more in the :ref:`User Guide `.
-
"""
-#TODO: periodic recalculation of impostors, PCA initialization
+Large Margin Nearest Neighbor Metric learning (LMNN)
+"""
+# TODO: periodic recalculation of impostors, PCA initialization
from __future__ import print_function, absolute_import
import numpy as np
@@ -26,81 +17,142 @@
class LMNN(MahalanobisMixin, TransformerMixin):
+ """Large Margin Nearest Neighbor (LMNN)
+
+ LMNN learns a Mahalanobis distance metric in the kNN classification
+ setting. The learned metric attempts to keep close k-nearest neighbors
+ from the same class, while keeping examples from different classes
+ separated by a large margin. This algorithm makes no assumptions about
+ the distribution of the data.
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
+ ----------
+ init : None, string or numpy array, optional (default=None)
+ Initialization of the linear transformation. Possible options are
+ 'auto', 'pca', 'identity', 'random', and a numpy array of shape
+ (n_features_a, n_features_b). If None, will be set automatically to
+ 'auto' (this option is to raise a warning if 'init' is not set,
+ and stays to its default value None, in v0.5.0).
+
+ 'auto'
+ Depending on ``n_components``, the most reasonable initialization
+ will be chosen. If ``n_components <= n_classes`` we use 'lda', as
+ it uses labels information. If not, but
+ ``n_components < min(n_features, n_samples)``, we use 'pca', as
+ it projects data in meaningful directions (those of higher
+ variance). Otherwise, we just use 'identity'.
+
+ 'pca'
+ ``n_components`` principal components of the inputs passed
+ to :meth:`fit` will be used to initialize the transformation.
+ (See `sklearn.decomposition.PCA`)
+
+ 'lda'
+ ``min(n_components, n_classes)`` most discriminative
+ components of the inputs passed to :meth:`fit` will be used to
+ initialize the transformation. (If ``n_components > n_classes``,
+ the rest of the components will be zero.) (See
+ `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`)
+
+ 'identity'
+ If ``n_components`` is strictly smaller than the
+ dimensionality of the inputs passed to :meth:`fit`, the identity
+ matrix will be truncated to the first ``n_components`` rows.
+
+ 'random'
+ The initial transformation will be a random array of shape
+ `(n_components, n_features)`. Each value is sampled from the
+ standard normal distribution.
+
+ numpy array
+ n_features_b must match the dimensionality of the inputs passed to
+ :meth:`fit` and n_features_a must be less than or equal to that.
+ If ``n_components`` is not None, n_features_a must match it.
+
+ k : int, optional
+ Number of neighbors to consider, not including self-edges.
+
+ min_iter : int, optional (default=50)
+ Minimum number of iterations of the optimization procedure.
+
+ max_iter : int, optional (default=1000)
+ Maximum number of iterations of the optimization procedure.
+
+ learn_rate : float, optional (default=1e-7)
+ Learning rate of the optimization procedure
+
+  convergence_tol : float, optional (default=0.001)
+    Tolerance of the optimization procedure. If the objective value varies
+    less than `convergence_tol`, we consider the algorithm has converged
+    and stop it.
+
+  verbose : bool, optional (default=False)
+    Whether to print the progress of the optimization procedure.
+
+  regularization : float, optional (default=0.5)
+    Weighting of pull and push terms, with 0.5 meaning equal weight.
+
+ preprocessor : array-like, shape=(n_samples, n_features) or callable
+ The preprocessor to call to get tuples from indices. If array-like,
+ tuples will be formed like this: X[indices].
+
+ n_components : int or None, optional (default=None)
+ Dimensionality of reduced space (if None, defaults to dimension of X).
+
+ num_dims : Not used
+
+ .. deprecated:: 0.5.0
+ `num_dims` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0. Use `n_components` instead.
+
+ random_state : int or numpy.RandomState or None, optional (default=None)
+ A pseudo random number generator object or a seed for it if int. If
+ ``init='random'``, ``random_state`` is used to initialize the random
+ transformation. If ``init='pca'``, ``random_state`` is passed as an
+ argument to PCA when initializing the transformation.
+
+ Attributes
+ ----------
+ n_iter_ : `int`
+ The number of iterations the solver has run.
+
+ transformer_ : `numpy.ndarray`, shape=(n_components, n_features)
+ The learned linear transformation ``L``.
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> from metric_learn import LMNN
+ >>> from sklearn.datasets import load_iris
+ >>> iris_data = load_iris()
+ >>> X = iris_data['data']
+ >>> Y = iris_data['target']
+ >>> lmnn = LMNN(k=5, learn_rate=1e-6)
+  >>> lmnn.fit(X, Y)
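+
+  Once fit, the learned transformation (stored in ``transformer_``) can be
+  applied to embed data; a minimal usage sketch:
+
+  >>> X_embedded = lmnn.transform(X)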
+
+ Notes
+ -----
+
+ If a recent version of the Shogun Python modular (``modshogun``) library
+ is available, the LMNN implementation will use the fast C++ version from
+ there. Otherwise, the included pure-Python version will be used.
+ The two implementations differ slightly, and the C++ version is more
+ complete.
+
+ References
+ ----------
+ .. [1] `Distance Metric Learning for Large Margin Nearest Neighbor
+ Classification `_
+ Kilian Q. Weinberger, John Blitzer, Lawrence K. Saul
+ """
+
def __init__(self, init=None, k=3, min_iter=50, max_iter=1000,
learn_rate=1e-7, regularization=0.5, convergence_tol=0.001,
use_pca=True, verbose=False, preprocessor=None,
n_components=None, num_dims='deprecated', random_state=None):
- """Initialize the LMNN object.
-
- Parameters
- ----------
- init : None, string or numpy array, optional (default=None)
- Initialization of the linear transformation. Possible options are
- 'auto', 'pca', 'identity', 'random', and a numpy array of shape
- (n_features_a, n_features_b). If None, will be set automatically to
- 'auto' (this option is to raise a warning if 'init' is not set,
- and stays to its default value None, in v0.5.0).
-
- 'auto'
- Depending on ``n_components``, the most reasonable initialization
- will be chosen. If ``n_components <= n_classes`` we use 'lda', as
- it uses labels information. If not, but
- ``n_components < min(n_features, n_samples)``, we use 'pca', as
- it projects data in meaningful directions (those of higher
- variance). Otherwise, we just use 'identity'.
-
- 'pca'
- ``n_components`` principal components of the inputs passed
- to :meth:`fit` will be used to initialize the transformation.
- (See `sklearn.decomposition.PCA`)
-
- 'lda'
- ``min(n_components, n_classes)`` most discriminative
- components of the inputs passed to :meth:`fit` will be used to
- initialize the transformation. (If ``n_components > n_classes``,
- the rest of the components will be zero.) (See
- `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`)
-
- 'identity'
- If ``n_components`` is strictly smaller than the
- dimensionality of the inputs passed to :meth:`fit`, the identity
- matrix will be truncated to the first ``n_components`` rows.
-
- 'random'
- The initial transformation will be a random array of shape
- `(n_components, n_features)`. Each value is sampled from the
- standard normal distribution.
-
- numpy array
- n_features_b must match the dimensionality of the inputs passed to
- :meth:`fit` and n_features_a must be less than or equal to that.
- If ``n_components`` is not None, n_features_a must match it.
-
- k : int, optional
- Number of neighbors to consider, not including self-edges.
-
- regularization: float, optional
- Weighting of pull and push terms, with 0.5 meaning equal weight.
-
- preprocessor : array-like, shape=(n_samples, n_features) or callable
- The preprocessor to call to get tuples from indices. If array-like,
- tuples will be formed like this: X[indices].
-
- n_components : int or None, optional (default=None)
- Dimensionality of reduced space (if None, defaults to dimension of X).
-
- num_dims : Not used
-
- .. deprecated:: 0.5.0
- `num_dims` was deprecated in version 0.5.0 and will
- be removed in 0.6.0. Use `n_components` instead.
-
- random_state : int or numpy.RandomState or None, optional (default=None)
- A pseudo random number generator object or a seed for it if int. If
- ``init='random'``, ``random_state`` is used to initialize the random
- transformation. If ``init='pca'``, ``random_state`` is passed as an
- argument to PCA when initializing the transformation.
- """
self.init = init
self.k = k
self.min_iter = min_iter
diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py
index f59392c1..e3b0d323 100644
--- a/metric_learn/lsml.py
+++ b/metric_learn/lsml.py
@@ -1,17 +1,5 @@
-r"""
-Metric Learning from Relative Comparisons by Minimizing Squared Residual(LSML)
-
-`LSML` proposes a simple, yet effective, algorithm that minimizes a convex
-objective function corresponding to the sum of squared residuals of
-constraints. This algorithm uses the constraints in the form of the
-relative distance comparisons, such method is especially useful where
-pairwise constraints are not natural to obtain, thus pairwise constraints
-based algorithms become infeasible to be deployed. Furthermore, its sparsity
-extension leads to more stable estimation when the dimension is high and
-only a small amount of constraints is given.
-
-Read more in the :ref:`User Guide `.
-
+"""
+Metric Learning from Relative Comparisons by Minimizing Squared Residual (LSML)
"""
from __future__ import print_function, absolute_import, division
@@ -33,46 +21,6 @@ class _BaseLSML(MahalanobisMixin):
def __init__(self, tol=1e-3, max_iter=1000, prior=None,
verbose=False, preprocessor=None, random_state=None):
- """Initialize LSML.
-
- Parameters
- ----------
- prior : None, string or numpy array, optional (default=None)
- Prior to set for the metric. Possible options are
- 'identity', 'covariance', 'random', and a numpy array of
- shape (n_features, n_features). For LSML, the prior should be strictly
- positive definite (PD). If `None`, will be set
- automatically to 'identity' (this is to raise a warning if
- `prior` is not set, and stays to its default value (None), in v0.5.0).
-
- 'identity'
- An identity matrix of shape (n_features, n_features).
-
- 'covariance'
- The inverse covariance matrix.
-
- 'random'
- The initial Mahalanobis matrix will be a random positive definite
- (PD) matrix of shape `(n_features, n_features)`, generated using
- `sklearn.datasets.make_spd_matrix`.
-
- numpy array
- A positive definite (PD) matrix of shape
- (n_features, n_features), that will be used as such to set the
- prior.
-
- tol : float, optional
- max_iter : int, optional
- verbose : bool, optional
- if True, prints information while learning
- preprocessor : array-like, shape=(n_samples, n_features) or callable
- The preprocessor to call to get tuples from indices. If array-like,
- tuples will be formed like this: X[indices].
- random_state : int or numpy.RandomState or None, optional (default=None)
- A pseudo random number generator object or a seed for it if int. If
- ``init='random'``, ``random_state`` is used to set the random
- prior.
- """
self.prior = prior
self.tol = tol
self.max_iter = max_iter
@@ -178,6 +126,55 @@ def _gradient(self, metric, vab, vcd, prior_inv):
class LSML(_BaseLSML, _QuadrupletsClassifierMixin):
"""Least Squared-residual Metric Learning (LSML)
+  `LSML` proposes a simple, yet effective, algorithm that minimizes a convex
+  objective function corresponding to the sum of squared residuals of
+  constraints. It uses constraints in the form of relative distance
+  comparisons, which is especially useful when pairwise constraints are not
+  natural to obtain, so that algorithms based on pairwise constraints become
+  infeasible to deploy. Furthermore, its sparsity extension leads to more
+  stable estimation when the dimension is high and only a small number of
+  constraints is given.
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
+ ----------
+ prior : None, string or numpy array, optional (default=None)
+ Prior to set for the metric. Possible options are
+ 'identity', 'covariance', 'random', and a numpy array of
+ shape (n_features, n_features). For LSML, the prior should be strictly
+ positive definite (PD). If `None`, will be set
+ automatically to 'identity' (this is to raise a warning if
+ `prior` is not set, and stays to its default value (None), in v0.5.0).
+
+ 'identity'
+ An identity matrix of shape (n_features, n_features).
+
+ 'covariance'
+ The inverse covariance matrix.
+
+ 'random'
+ The initial Mahalanobis matrix will be a random positive definite
+ (PD) matrix of shape `(n_features, n_features)`, generated using
+ `sklearn.datasets.make_spd_matrix`.
+
+ numpy array
+ A positive definite (PD) matrix of shape
+ (n_features, n_features), that will be used as such to set the
+ prior.
+
+  tol : float, optional (default=1e-3)
+    Tolerance for the convergence procedure.
+  max_iter : int, optional (default=1000)
+    Number of maximum iterations of the convergence procedure.
+  verbose : bool, optional (default=False)
+    If True, prints information while learning.
+ preprocessor : array-like, shape=(n_samples, n_features) or callable
+ The preprocessor to call to get tuples from indices. If array-like,
+ tuples will be formed like this: X[indices].
+ random_state : int or numpy.RandomState or None, optional (default=None)
+ A pseudo random number generator object or a seed for it if int. If
+    ``prior='random'``, ``random_state`` is used to set the random
+ prior.
+
Attributes
----------
n_iter_ : `int`
@@ -186,6 +183,31 @@ class LSML(_BaseLSML, _QuadrupletsClassifierMixin):
transformer_ : `numpy.ndarray`, shape=(n_features, n_features)
The linear transformation ``L`` deduced from the learned Mahalanobis
metric (See function `transformer_from_metric`.)
+
+ Examples
+ --------
+ >>> from metric_learn import LSML_Supervised
+ >>> from sklearn.datasets import load_iris
+ >>> iris_data = load_iris()
+ >>> X = iris_data['data']
+ >>> Y = iris_data['target']
+ >>> lsml = LSML_Supervised(num_constraints=200)
+ >>> lsml.fit(X, Y)
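+
+  `LSML` itself is fit on quadruplets of points, where the first two points
+  of each quadruplet should be closer to each other than the last two, as
+  described in the `fit` method below. A minimal sketch, with arbitrary toy
+  coordinates:
+
+  >>> from metric_learn import LSML
+  >>> quadruplets = [[[1.2, 7.5], [1.3, 1.5], [6.4, 2.6], [6.2, 9.7]],
+  ...                [[1.3, 4.5], [3.2, 4.6], [6.2, 5.5], [5.4, 5.4]],
+  ...                [[3.2, 7.5], [3.3, 1.5], [8.4, 2.6], [8.2, 9.7]]]
+  >>> lsml = LSML()
+  >>> lsml.fit(quadruplets)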
+
+ References
+ ----------
+ .. [1] Liu et al. `Metric Learning from Relative Comparisons by Minimizing
+ Squared Residual
+ `_. ICDM 2012.
+
+ .. [2] Adapted from https://gist.github.com/kcarnold/5439917
+
+ See Also
+ --------
+ metric_learn.LSML : The original weakly-supervised algorithm
+
+ :ref:`supervised_version` : The section of the project documentation
+ that describes the supervised version of weakly supervised estimators.
"""
def fit(self, quadruplets, weights=None):
@@ -193,7 +215,7 @@ def fit(self, quadruplets, weights=None):
Parameters
----------
- quadruplets : array-like, shape=(n_constraints, 4, n_features) or
+ quadruplets : array-like, shape=(n_constraints, 4, n_features) or \
(n_constraints, 4)
3D array-like of quadruplets of points or 2D array of quadruplets of
indicators. In order to supervise the algorithm in the right way, we
@@ -214,6 +236,58 @@ def fit(self, quadruplets, weights=None):
class LSML_Supervised(_BaseLSML, TransformerMixin):
"""Supervised version of Least Squared-residual Metric Learning (LSML)
+ `LSML_Supervised` creates quadruplets from labeled samples by taking two
+ samples from the same class, and two samples from different classes.
+  This way it builds quadruplets where the first two points must be more
+  similar than the last two points.
+
+ Parameters
+ ----------
+ tol : float, optional (default=1e-3)
+ Tolerance for the convergence procedure.
+ max_iter : int, optional (default=1000)
+ Number of maximum iterations of the convergence procedure.
+ prior : None, string or numpy array, optional (default=None)
+ Prior to set for the metric. Possible options are
+ 'identity', 'covariance', 'random', and a numpy array of
+ shape (n_features, n_features). For LSML, the prior should be strictly
+ positive definite (PD). If `None`, will be set
+ automatically to 'identity' (this is to raise a warning if
+ `prior` is not set, and stays to its default value (None), in v0.5.0).
+
+ 'identity'
+ An identity matrix of shape (n_features, n_features).
+
+ 'covariance'
+ The inverse covariance matrix.
+
+ 'random'
+ The initial Mahalanobis matrix will be a random positive definite
+ (PD) matrix of shape `(n_features, n_features)`, generated using
+ `sklearn.datasets.make_spd_matrix`.
+
+ numpy array
+ A positive definite (PD) matrix of shape
+ (n_features, n_features), that will be used as such to set the
+ prior.
+ num_labeled : Not used
+ .. deprecated:: 0.5.0
+ `num_labeled` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0.
+ num_constraints: int, optional
+ number of constraints to generate
+ weights : (m,) array of floats, optional
+ scale factor for each constraint
+ verbose : bool, optional
+ if True, prints information while learning
+ preprocessor : array-like, shape=(n_samples, n_features) or callable
+ The preprocessor to call to get tuples from indices. If array-like,
+ tuples will be formed like this: X[indices].
+ random_state : int or numpy.RandomState or None, optional (default=None)
+ A pseudo random number generator object or a seed for it if int. If
+    ``prior='random'``, ``random_state`` is used to set the random
+ prior.
+
Attributes
----------
n_iter_ : `int`
@@ -227,58 +301,6 @@ class LSML_Supervised(_BaseLSML, TransformerMixin):
def __init__(self, tol=1e-3, max_iter=1000, prior=None,
num_labeled='deprecated', num_constraints=None, weights=None,
verbose=False, preprocessor=None, random_state=None):
- """Initialize the supervised version of `LSML`.
-
- `LSML_Supervised` creates quadruplets from labeled samples by taking two
- samples from the same class, and two samples from different classes.
- This way it builds quadruplets where the two first points must be more
- similar than the two last points.
-
- Parameters
- ----------
- tol : float, optional
- max_iter : int, optional
- prior : None, string or numpy array, optional (default=None)
- Prior to set for the metric. Possible options are
- 'identity', 'covariance', 'random', and a numpy array of
- shape (n_features, n_features). For LSML, the prior should be strictly
- positive definite (PD). If `None`, will be set
- automatically to 'identity' (this is to raise a warning if
- `prior` is not set, and stays to its default value (None), in v0.5.0).
-
- 'identity'
- An identity matrix of shape (n_features, n_features).
-
- 'covariance'
- The inverse covariance matrix.
-
- 'random'
- The initial Mahalanobis matrix will be a random positive definite
- (PD) matrix of shape `(n_features, n_features)`, generated using
- `sklearn.datasets.make_spd_matrix`.
-
- numpy array
- A positive definite (PD) matrix of shape
- (n_features, n_features), that will be used as such to set the
- prior.
- num_labeled : Not used
- .. deprecated:: 0.5.0
- `num_labeled` was deprecated in version 0.5.0 and will
- be removed in 0.6.0.
- num_constraints: int, optional
- number of constraints to generate
- weights : (m,) array of floats, optional
- scale factor for each constraint
- verbose : bool, optional
- if True, prints information while learning
- preprocessor : array-like, shape=(n_samples, n_features) or callable
- The preprocessor to call to get tuples from indices. If array-like,
- tuples will be formed like this: X[indices].
- random_state : int or numpy.RandomState or None, optional (default=None)
- A pseudo random number generator object or a seed for it if int. If
- ``init='random'``, ``random_state`` is used to set the random
- prior.
- """
_BaseLSML.__init__(self, tol=tol, max_iter=max_iter, prior=prior,
verbose=verbose, preprocessor=preprocessor,
random_state=random_state)
diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py
index c625b67c..ea8748be 100644
--- a/metric_learn/mlkr.py
+++ b/metric_learn/mlkr.py
@@ -1,13 +1,5 @@
-r"""
-Metric Learning for Kernel Regression(MLKR)
-
-MLKR is an algorithm for supervised metric learning, which learns a
-distance function by directly minimizing the leave-one-out regression error.
-This algorithm can also be viewed as a supervised variation of PCA and can be
-used for dimensionality reduction and high dimensional data visualization.
-
-Read more in the :ref:`User Guide `.
-
+"""
+Metric Learning for Kernel Regression (MLKR)
"""
from __future__ import division, print_function
import time
@@ -31,6 +23,81 @@
class MLKR(MahalanobisMixin, TransformerMixin):
"""Metric Learning for Kernel Regression (MLKR)
+ MLKR is an algorithm for supervised metric learning, which learns a
+ distance function by directly minimizing the leave-one-out regression error.
+ This algorithm can also be viewed as a supervised variation of PCA and can be
+ used for dimensionality reduction and high dimensional data visualization.
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
+ ----------
+ n_components : int or None, optional (default=None)
+ Dimensionality of reduced space (if None, defaults to dimension of X).
+
+ num_dims : Not used
+
+ .. deprecated:: 0.5.0
+ `num_dims` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0. Use `n_components` instead.
+
+ init : None, string or numpy array, optional (default=None)
+ Initialization of the linear transformation. Possible options are
+ 'auto', 'pca', 'identity', 'random', and a numpy array of shape
+ (n_features_a, n_features_b). If None, will be set automatically to
+ 'auto' (this option is to raise a warning if 'init' is not set,
+ and stays to its default value None, in v0.5.0).
+
+ 'auto'
+ Depending on ``n_components``, the most reasonable initialization
+ will be chosen. If ``n_components < min(n_features, n_samples)``,
+ we use 'pca', as it projects data in meaningful directions (those
+ of higher variance). Otherwise, we just use 'identity'.
+
+ 'pca'
+ ``n_components`` principal components of the inputs passed
+ to :meth:`fit` will be used to initialize the transformation.
+ (See `sklearn.decomposition.PCA`)
+
+ 'identity'
+ If ``n_components`` is strictly smaller than the
+ dimensionality of the inputs passed to :meth:`fit`, the identity
+ matrix will be truncated to the first ``n_components`` rows.
+
+ 'random'
+ The initial transformation will be a random array of shape
+ `(n_components, n_features)`. Each value is sampled from the
+ standard normal distribution.
+
+ numpy array
+ n_features_b must match the dimensionality of the inputs passed to
+ :meth:`fit` and n_features_a must be less than or equal to that.
+ If ``n_components`` is not None, n_features_a must match it.
+
+ A0: Not used.
+ .. deprecated:: 0.5.0
+ `A0` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0. Use 'init' instead.
+
+  tol : float, optional (default=None)
+    Convergence tolerance for the optimization.
+
+  max_iter : int, optional (default=1000)
+    Cap on number of conjugate gradient iterations.
+
+ verbose : bool, optional (default=False)
+ Whether to print progress messages or not.
+
+ preprocessor : array-like, shape=(n_samples, n_features) or callable
+ The preprocessor to call to get tuples from indices. If array-like,
+ tuples will be formed like this: X[indices].
+
+ random_state : int or numpy.RandomState or None, optional (default=None)
+ A pseudo random number generator object or a seed for it if int. If
+ ``init='random'``, ``random_state`` is used to initialize the random
+ transformation. If ``init='pca'``, ``random_state`` is passed as an
+ argument to PCA when initializing the transformation.
+
Attributes
----------
n_iter_ : `int`
@@ -38,82 +105,28 @@ class MLKR(MahalanobisMixin, TransformerMixin):
transformer_ : `numpy.ndarray`, shape=(n_components, n_features)
The learned linear transformation ``L``.
+
+ Examples
+ --------
+
+ >>> from metric_learn import MLKR
+ >>> from sklearn.datasets import load_iris
+ >>> iris_data = load_iris()
+ >>> X = iris_data['data']
+ >>> Y = iris_data['target']
+ >>> mlkr = MLKR()
+ >>> mlkr.fit(X, Y)
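+
+  Since MLKR can also be used for dimensionality reduction (see the
+  ``n_components`` parameter above), a minimal sketch projecting the same
+  data to two dimensions:
+
+  >>> mlkr_2d = MLKR(n_components=2)
+  >>> X_2d = mlkr_2d.fit_transform(X, Y)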
+
+ References
+ ----------
+  .. [1] `Metric Learning for Kernel Regression
+    `_ Kilian Q. Weinberger, Gerald Tesauro.
"""
def __init__(self, n_components=None, num_dims='deprecated', init=None,
A0='deprecated', tol=None, max_iter=1000, verbose=False,
preprocessor=None, random_state=None):
- """
- Initialize MLKR.
-
- Parameters
- ----------
- n_components : int or None, optional (default=None)
- Dimensionality of reduced space (if None, defaults to dimension of X).
-
- num_dims : Not used
-
- .. deprecated:: 0.5.0
- `num_dims` was deprecated in version 0.5.0 and will
- be removed in 0.6.0. Use `n_components` instead.
-
- init : None, string or numpy array, optional (default=None)
- Initialization of the linear transformation. Possible options are
- 'auto', 'pca', 'identity', 'random', and a numpy array of shape
- (n_features_a, n_features_b). If None, will be set automatically to
- 'auto' (this option is to raise a warning if 'init' is not set,
- and stays to its default value None, in v0.5.0).
-
- 'auto'
- Depending on ``n_components``, the most reasonable initialization
- will be chosen. If ``n_components < min(n_features, n_samples)``,
- we use 'pca', as it projects data in meaningful directions (those
- of higher variance). Otherwise, we just use 'identity'.
-
- 'pca'
- ``n_components`` principal components of the inputs passed
- to :meth:`fit` will be used to initialize the transformation.
- (See `sklearn.decomposition.PCA`)
-
- 'identity'
- If ``n_components`` is strictly smaller than the
- dimensionality of the inputs passed to :meth:`fit`, the identity
- matrix will be truncated to the first ``n_components`` rows.
-
- 'random'
- The initial transformation will be a random array of shape
- `(n_components, n_features)`. Each value is sampled from the
- standard normal distribution.
-
- numpy array
- n_features_b must match the dimensionality of the inputs passed to
- :meth:`fit` and n_features_a must be less than or equal to that.
- If ``num_dims`` is not None, n_features_a must match it.
-
- A0: Not used.
- .. deprecated:: 0.5.0
- `A0` was deprecated in version 0.5.0 and will
- be removed in 0.6.0. Use 'init' instead.
-
- tol: float, optional (default=None)
- Convergence tolerance for the optimization.
-
- max_iter: int, optional
- Cap on number of conjugate gradient iterations.
-
- verbose : bool, optional (default=False)
- Whether to print progress messages or not.
-
- preprocessor : array-like, shape=(n_samples, n_features) or callable
- The preprocessor to call to get tuples from indices. If array-like,
- tuples will be formed like this: X[indices].
-
- random_state : int or numpy.RandomState or None, optional (default=None)
- A pseudo random number generator object or a seed for it if int. If
- ``init='random'``, ``random_state`` is used to initialize the random
- transformation. If ``init='pca'``, ``random_state`` is passed as an
- argument to PCA when initializing the transformation.
- """
self.n_components = n_components
self.num_dims = num_dims
self.init = init
diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py
index b3e6c203..9f02425c 100644
--- a/metric_learn/mmc.py
+++ b/metric_learn/mmc.py
@@ -1,21 +1,4 @@
-r"""
-Metric Learning with Application for Clustering with Side Information(MMC)
-
-MMC minimizes the sum of squared distances between similar points, while
-enforcing the sum of distances between dissimilar ones to be greater than one.
-This leads to a convex and, thus, local-minima-free optimization problem that
-can be solved efficiently.
-However, the algorithm involves the computation of eigenvalues, which is the
-main speed-bottleneck. Since it has initially been designed for clustering
-applications, one of the implicit assumptions of MMC is that all classes form
-a compact set, i.e., follow a unimodal distribution, which restricts the
-possible use-cases of this method. However, it is one of the earliest and a
-still often cited technique.
-
-Read more in the :ref:`User Guide `.
-
-"""
-
+"""Mahalanobis Metric for Clustering (MMC)"""
from __future__ import print_function, absolute_import, division
import warnings
import numpy as np
@@ -30,7 +13,6 @@
class _BaseMMC(MahalanobisMixin):
- """Mahalanobis Metric for Clustering (MMC)"""
_tuple_size = 2 # constraints are pairs
@@ -38,61 +20,6 @@ def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3,
init=None, A0='deprecated', diagonal=False,
diagonal_c=1.0, verbose=False, preprocessor=None,
random_state=None):
- """Initialize MMC.
- Parameters
- ----------
- max_iter : int, optional
- max_proj : int, optional
- convergence_threshold : float, optional
- init : None, string or numpy array, optional (default=None)
- Initialization of the Mahalanobis matrix. Possible options are
- 'identity', 'covariance', 'random', and a numpy array of
- shape (n_features, n_features). If None, will be set
- automatically to 'identity' (this is to raise a warning if
- 'init' is not set, and stays to its default value (None), in v0.5.0).
-
- 'identity'
- An identity matrix of shape (n_features, n_features).
-
- 'covariance'
- The (pseudo-)inverse of the covariance matrix.
-
- 'random'
- The initial Mahalanobis matrix will be a random SPD matrix of shape
- `(n_features, n_features)`, generated using
- `sklearn.datasets.make_spd_matrix`.
-
- numpy array
- An SPD matrix of shape (n_features, n_features), that will
- be used as such to initialize the metric.
-
- verbose : bool, optional
- if True, prints information while learning
-
- preprocessor : array-like, shape=(n_samples, n_features) or callable
- The preprocessor to call to get tuples from indices. If array-like,
- tuples will be gotten like this: X[indices].
- A0 : Not used.
- .. deprecated:: 0.5.0
- `A0` was deprecated in version 0.5.0 and will
- be removed in 0.6.0. Use 'init' instead.
- diagonal : bool, optional
- if True, a diagonal metric will be learned,
- i.e., a simple scaling of dimensions. The initialization will then
- be the diagonal coefficients of the matrix given as 'init'.
- diagonal_c : float, optional
- weight of the dissimilarity constraint for diagonal
- metric learning
- verbose : bool, optional
- if True, prints information while learning
- preprocessor : array-like, shape=(n_samples, n_features) or callable
- The preprocessor to call to get tuples from indices. If array-like,
- tuples will be gotten like this: X[indices].
- random_state : int or numpy.RandomState or None, optional (default=None)
- A pseudo random number generator object or a seed for it if int. If
- ``init='random'``, ``random_state`` is used to initialize the random
- transformation.
- """
self.max_iter = max_iter
self.max_proj = max_proj
self.convergence_threshold = convergence_threshold
@@ -403,6 +330,80 @@ def _D_constraint(self, neg_pairs, w):
class MMC(_BaseMMC, _PairsClassifierMixin):
"""Mahalanobis Metric for Clustering (MMC)
+ MMC minimizes the sum of squared distances between similar points, while
+ enforcing the sum of distances between dissimilar ones to be greater than
+ one. This leads to a convex and, thus, local-minima-free optimization
+ problem that can be solved efficiently.
+ However, the algorithm involves the computation of eigenvalues, which is the
+ main speed-bottleneck. Since it has initially been designed for clustering
+ applications, one of the implicit assumptions of MMC is that all classes form
+ a compact set, i.e., follow a unimodal distribution, which restricts the
+  possible use-cases of this method. However, it is one of the earliest
+  metric learning methods and is still often cited.
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
+ ----------
+ max_iter : int, optional (default=100)
+ Maximum number of iterations of the convergence procedure.
+
+ max_proj : int, optional (default=10000)
+ Maximum number of projection steps.
+
+  convergence_threshold : float, optional (default=1e-3)
+ Convergence threshold for the convergence procedure.
+
+ init : None, string or numpy array, optional (default=None)
+ Initialization of the Mahalanobis matrix. Possible options are
+ 'identity', 'covariance', 'random', and a numpy array of
+ shape (n_features, n_features). If None, will be set
+ automatically to 'identity' (this is to raise a warning if
+ 'init' is not set, and stays to its default value (None), in v0.5.0).
+
+ 'identity'
+ An identity matrix of shape (n_features, n_features).
+
+ 'covariance'
+ The (pseudo-)inverse of the covariance matrix.
+
+ 'random'
+      The initial Mahalanobis matrix will be a random SPD matrix of shape
+      `(n_features, n_features)`, generated using
+ `sklearn.datasets.make_spd_matrix`.
+
+ numpy array
+ An SPD matrix of shape (n_features, n_features), that will
+ be used as such to initialize the metric.
+
+ A0 : Not used.
+ .. deprecated:: 0.5.0
+ `A0` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0. Use 'init' instead.
+ diagonal : bool, optional
+ if True, a diagonal metric will be learned,
+ i.e., a simple scaling of dimensions. The initialization will then
+ be the diagonal coefficients of the matrix given as 'init'.
+ diagonal_c : float, optional
+ weight of the dissimilarity constraint for diagonal
+ metric learning
+ verbose : bool, optional
+ if True, prints information while learning
+ preprocessor : array-like, shape=(n_samples, n_features) or callable
+ The preprocessor to call to get tuples from indices. If array-like,
+ tuples will be gotten like this: X[indices].
+ random_state : int or numpy.RandomState or None, optional (default=None)
+ A pseudo random number generator object or a seed for it if int. If
+ ``init='random'``, ``random_state`` is used to initialize the random
+ transformation.
+
Attributes
----------
n_iter_ : `int`
@@ -416,6 +417,29 @@ class MMC(_BaseMMC, _PairsClassifierMixin):
If the distance metric between two points is lower than this threshold,
points will be classified as similar, otherwise they will be
classified as dissimilar.
+
+ Examples
+ --------
+ >>> from metric_learn import MMC_Supervised
+ >>> from sklearn.datasets import load_iris
+ >>> iris_data = load_iris()
+ >>> X = iris_data['data']
+ >>> Y = iris_data['target']
+ >>> mmc = MMC_Supervised(num_constraints=200)
+ >>> mmc.fit(X, Y)
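+
+  `MMC` itself is fit on pairs of points labeled as similar (+1) or
+  dissimilar (-1), as described in the `fit` method below. A minimal
+  sketch, with arbitrary toy coordinates:
+
+  >>> from metric_learn import MMC
+  >>> pairs = [[[1.2, 7.5], [1.3, 1.5]],
+  ...          [[6.4, 2.6], [6.2, 9.7]],
+  ...          [[1.3, 4.5], [3.2, 4.6]],
+  ...          [[6.2, 5.5], [5.4, 5.4]]]
+  >>> y = [1, 1, -1, -1]
+  >>> mmc = MMC()
+  >>> mmc.fit(pairs, y)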
+
+ References
+ ----------
+ .. [1] `Distance metric learning with application to clustering with
+ side-information `_
+ Xing, Jordan, Russell, Ng.
+
+ See Also
+ --------
+ metric_learn.MMC : The original weakly-supervised algorithm
+ :ref:`supervised_version` : The section of the project documentation
+ that describes the supervised version of weakly supervised estimators.
"""
def fit(self, pairs, y, calibration_params=None):
@@ -426,7 +450,7 @@ def fit(self, pairs, y, calibration_params=None):
Parameters
----------
- pairs : array-like, shape=(n_constraints, 2, n_features) or
+ pairs : array-like, shape=(n_constraints, 2, n_features) or \
(n_constraints, 2)
3D Array of pairs with each row corresponding to two points,
or 2D array of indices of pairs if the metric learner uses a
@@ -453,6 +477,73 @@ def fit(self, pairs, y, calibration_params=None):
class MMC_Supervised(_BaseMMC, TransformerMixin):
"""Supervised version of Mahalanobis Metric for Clustering (MMC)
+  `MMC_Supervised` creates pairs of similar samples by taking samples from
+  the same class, and pairs of dissimilar samples by taking samples from
+  different classes. It then passes these pairs to `MMC` for training.
+
+ Parameters
+ ----------
+  max_iter : int, optional (default=100)
+    Maximum number of iterations of the convergence procedure.
+  max_proj : int, optional (default=10000)
+    Maximum number of projection steps.
+  convergence_threshold : float, optional (default=1e-6)
+    Convergence threshold for the convergence procedure.
+ num_labeled : Not used
+ .. deprecated:: 0.5.0
+ `num_labeled` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0.
+ num_constraints: int, optional
+ number of constraints to generate
+ init : None, string or numpy array, optional (default=None)
+ Initialization of the Mahalanobis matrix. Possible options are
+ 'identity', 'covariance', 'random', and a numpy array of
+ shape (n_features, n_features). If None, will be set
+ automatically to 'identity' (this is to raise a warning if
+ 'init' is not set, and stays to its default value (None), in v0.5.0).
+
+ 'identity'
+ An identity matrix of shape (n_features, n_features).
+
+ 'covariance'
+ The (pseudo-)inverse of the covariance matrix.
+
+ 'random'
+ The initial Mahalanobis matrix will be a random SPD matrix of
+ shape `(n_features, n_features)`, generated using
+ `sklearn.datasets.make_spd_matrix`.
+
+ numpy array
+ A numpy array of shape (n_features, n_features), that will
+ be used as such to initialize the metric.
+
+ A0 : Not used.
+ .. deprecated:: 0.5.0
+ `A0` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0. Use 'init' instead.
+ diagonal : bool, optional
+ if True, a diagonal metric will be learned,
+ i.e., a simple scaling of dimensions
+ diagonal_c : float, optional
+ weight of the dissimilarity constraint for diagonal
+ metric learning
+ verbose : bool, optional
+ if True, prints information while learning
+ preprocessor : array-like, shape=(n_samples, n_features) or callable
+ The preprocessor to call to get tuples from indices. If array-like,
+ tuples will be formed like this: X[indices].
+ random_state : int or numpy.RandomState or None, optional (default=None)
+ A pseudo random number generator object or a seed for it if int. If
+ ``init='random'``, ``random_state`` is used to initialize the random
+ Mahalanobis matrix.
+
Attributes
----------
n_iter_ : `int`
@@ -467,71 +558,6 @@ def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-6,
num_labeled='deprecated', num_constraints=None, init=None,
A0='deprecated', diagonal=False, diagonal_c=1.0, verbose=False,
preprocessor=None, random_state=None):
- """Initialize the supervised version of `MMC`.
-
- `MMC_Supervised` creates pairs of similar sample by taking same class
- samples, and pairs of dissimilar samples by taking different class
- samples. It then passes these pairs to `MMC` for training.
-
- Parameters
- ----------
- max_iter : int, optional
- max_proj : int, optional
- convergence_threshold : float, optional
- num_labeled : Not used
- .. deprecated:: 0.5.0
- `num_labeled` was deprecated in version 0.5.0 and will
- be removed in 0.6.0.
- num_constraints: int, optional
- number of constraints to generate
- init : None, string or numpy array, optional (default=None)
- Initialization of the Mahalanobis matrix. Possible options are
- 'identity', 'covariance', 'random', and a numpy array of
- shape (n_features, n_features). If None, will be set
- automatically to 'identity' (this is to raise a warning if
- 'init' is not set, and stays to its default value (None), in v0.5.0).
-
- 'identity'
- An identity matrix of shape (n_features, n_features).
-
- 'covariance'
- The (pseudo-)inverse of the covariance matrix.
-
- 'random'
- The initial Mahalanobis matrix will be a random SPD matrix of
- shape `(n_features, n_features)`, generated using
- `sklearn.datasets.make_spd_matrix`.
-
- numpy array
- A numpy array of shape (n_features, n_features), that will
- be used as such to initialize the metric.
-
- verbose : bool, optional
- if True, prints information while learning
-
- preprocessor : array-like, shape=(n_samples, n_features) or callable
- The preprocessor to call to get tuples from indices. If array-like,
- tuples will be gotten like this: X[indices].
- A0 : Not used.
- .. deprecated:: 0.5.0
- `A0` was deprecated in version 0.5.0 and will
- be removed in 0.6.0. Use 'init' instead.
- diagonal : bool, optional
- if True, a diagonal metric will be learned,
- i.e., a simple scaling of dimensions
- diagonal_c : float, optional
- weight of the dissimilarity constraint for diagonal
- metric learning
- verbose : bool, optional
- if True, prints information while learning
- preprocessor : array-like, shape=(n_samples, n_features) or callable
- The preprocessor to call to get tuples from indices. If array-like,
- tuples will be formed like this: X[indices].
- random_state : int or numpy.RandomState or None, optional (default=None)
- A pseudo random number generator object or a seed for it if int. If
- ``init='random'``, ``random_state`` is used to initialize the random
- Mahalanobis matrix.
- """
_BaseMMC.__init__(self, max_iter=max_iter, max_proj=max_proj,
convergence_threshold=convergence_threshold,
init=init, A0=A0, diagonal=diagonal,
diff --git a/metric_learn/nca.py b/metric_learn/nca.py
index 2b541a64..dcfdac8a 100644
--- a/metric_learn/nca.py
+++ b/metric_learn/nca.py
@@ -1,15 +1,5 @@
-r"""
-Neighborhood Components Analysis(NCA)
-
-NCA is a distance metric learning algorithm which aims to improve the
-accuracy of nearest neighbors classification compared to the standard
-Euclidean distance. The algorithm directly maximizes a stochastic variant
-of the leave-one-out k-nearest neighbors(KNN) score on the training set.
-It can also learn a low-dimensional linear transformation of data that can
-be used for data visualization and fast classification.
-
-Read more in the :ref:`User Guide `.
-
+"""
+Neighborhood Components Analysis (NCA)
"""
from __future__ import absolute_import
@@ -32,6 +22,95 @@
class NCA(MahalanobisMixin, TransformerMixin):
"""Neighborhood Components Analysis (NCA)
+ NCA is a distance metric learning algorithm which aims to improve the
+ accuracy of nearest neighbors classification compared to the standard
+ Euclidean distance. The algorithm directly maximizes a stochastic variant
+  of the leave-one-out k-nearest neighbors (KNN) score on the training set.
+ It can also learn a low-dimensional linear transformation of data that can
+ be used for data visualization and fast classification.
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
+ ----------
+ init : None, string or numpy array, optional (default=None)
+ Initialization of the linear transformation. Possible options are
+ 'auto', 'pca', 'identity', 'random', and a numpy array of shape
+ (n_features_a, n_features_b). If None, will be set automatically to
+ 'auto' (this option is to raise a warning if 'init' is not set,
+ and stays to its default value None, in v0.5.0).
+
+ 'auto'
+ Depending on ``n_components``, the most reasonable initialization
+ will be chosen. If ``n_components <= n_classes`` we use 'lda', as
+ it uses labels information. If not, but
+ ``n_components < min(n_features, n_samples)``, we use 'pca', as
+ it projects data in meaningful directions (those of higher
+ variance). Otherwise, we just use 'identity'.
+
+ 'pca'
+ ``n_components`` principal components of the inputs passed
+ to :meth:`fit` will be used to initialize the transformation.
+ (See `sklearn.decomposition.PCA`)
+
+ 'lda'
+ ``min(n_components, n_classes)`` most discriminative
+ components of the inputs passed to :meth:`fit` will be used to
+ initialize the transformation. (If ``n_components > n_classes``,
+ the rest of the components will be zero.) (See
+ `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`)
+
+ 'identity'
+ If ``n_components`` is strictly smaller than the
+ dimensionality of the inputs passed to :meth:`fit`, the identity
+ matrix will be truncated to the first ``n_components`` rows.
+
+ 'random'
+ The initial transformation will be a random array of shape
+ `(n_components, n_features)`. Each value is sampled from the
+ standard normal distribution.
+
+ numpy array
+ n_features_b must match the dimensionality of the inputs passed to
+ :meth:`fit` and n_features_a must be less than or equal to that.
+ If ``n_components`` is not None, n_features_a must match it.
+
+ n_components : int or None, optional (default=None)
+ Dimensionality of reduced space (if None, defaults to dimension of X).
+
+ num_dims : Not used
+
+ .. deprecated:: 0.5.0
+ `num_dims` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0. Use `n_components` instead.
+
+ max_iter : int, optional (default=100)
+ Maximum number of iterations done by the optimization algorithm.
+
+ tol : float, optional (default=None)
+ Convergence tolerance for the optimization.
+
+ verbose : bool, optional (default=False)
+ Whether to print progress messages or not.
+
+ random_state : int or numpy.RandomState or None, optional (default=None)
+ A pseudo random number generator object or a seed for it if int. If
+ ``init='random'``, ``random_state`` is used to initialize the random
+ transformation. If ``init='pca'``, ``random_state`` is passed as an
+ argument to PCA when initializing the transformation.
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> from metric_learn import NCA
+ >>> from sklearn.datasets import load_iris
+ >>> iris_data = load_iris()
+ >>> X = iris_data['data']
+ >>> Y = iris_data['target']
+ >>> nca = NCA(max_iter=1000)
+ >>> nca.fit(X, Y)
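+
+  Because `NCA` is a transformer, it can also be chained with a k-nearest
+  neighbors classifier in a scikit-learn pipeline; a minimal sketch,
+  assuming scikit-learn's `make_pipeline` and `KNeighborsClassifier`:
+
+  >>> from sklearn.pipeline import make_pipeline
+  >>> from sklearn.neighbors import KNeighborsClassifier
+  >>> nca_knn = make_pipeline(NCA(max_iter=1000),
+  ...                         KNeighborsClassifier(n_neighbors=3))
+  >>> nca_knn.fit(X, Y)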
+
Attributes
----------
n_iter_ : `int`
@@ -39,81 +118,21 @@ class NCA(MahalanobisMixin, TransformerMixin):
transformer_ : `numpy.ndarray`, shape=(n_components, n_features)
The learned linear transformation ``L``.
+
+ References
+ ----------
+ .. [1] J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov. `Neighbourhood
+ Components Analysis
+ `_.
+ Advances in Neural Information Processing Systems. 17, 513-520, 2005.
+
+ .. [2] Wikipedia entry on `Neighborhood Components Analysis
+ `_
"""
def __init__(self, init=None, n_components=None, num_dims='deprecated',
max_iter=100, tol=None, verbose=False, preprocessor=None,
random_state=None):
- """Neighborhood Components Analysis
-
- Parameters
- ----------
- init : None, string or numpy array, optional (default=None)
- Initialization of the linear transformation. Possible options are
- 'auto', 'pca', 'identity', 'random', and a numpy array of shape
- (n_features_a, n_features_b). If None, will be set automatically to
- 'auto' (this option is to raise a warning if 'init' is not set,
- and stays to its default value None, in v0.5.0).
-
- 'auto'
- Depending on ``n_components``, the most reasonable initialization
- will be chosen. If ``n_components <= n_classes`` we use 'lda', as
- it uses labels information. If not, but
- ``n_components < min(n_features, n_samples)``, we use 'pca', as
- it projects data in meaningful directions (those of higher
- variance). Otherwise, we just use 'identity'.
-
- 'pca'
- ``n_components`` principal components of the inputs passed
- to :meth:`fit` will be used to initialize the transformation.
- (See `sklearn.decomposition.PCA`)
-
- 'lda'
- ``min(n_components, n_classes)`` most discriminative
- components of the inputs passed to :meth:`fit` will be used to
- initialize the transformation. (If ``n_components > n_classes``,
- the rest of the components will be zero.) (See
- `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`)
-
- 'identity'
- If ``n_components`` is strictly smaller than the
- dimensionality of the inputs passed to :meth:`fit`, the identity
- matrix will be truncated to the first ``n_components`` rows.
-
- 'random'
- The initial transformation will be a random array of shape
- `(n_components, n_features)`. Each value is sampled from the
- standard normal distribution.
-
- numpy array
- n_features_b must match the dimensionality of the inputs passed to
- :meth:`fit` and n_features_a must be less than or equal to that.
- If ``n_components`` is not None, n_features_a must match it.
-
- n_components : int or None, optional (default=None)
- Dimensionality of reduced space (if None, defaults to dimension of X).
-
- num_dims : Not used
-
- .. deprecated:: 0.5.0
- `num_dims` was deprecated in version 0.5.0 and will
- be removed in 0.6.0. Use `n_components` instead.
-
- max_iter : int, optional (default=100)
- Maximum number of iterations done by the optimization algorithm.
-
- tol : float, optional (default=None)
- Convergence tolerance for the optimization.
-
- verbose : bool, optional (default=False)
- Whether to print progress messages or not.
-
- random_state : int or numpy.RandomState or None, optional (default=None)
- A pseudo random number generator object or a seed for it if int. If
- ``init='random'``, ``random_state`` is used to initialize the random
- transformation. If ``init='pca'``, ``random_state`` is passed as an
- argument to PCA when initializing the transformation.
- """
self.n_components = n_components
self.init = init
self.num_dims = num_dims
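A note on the `init` options documented above: the 'auto' rule amounts to a
small decision function, and choosing an explicit `init` is a one-line change.
The sketch below is illustrative only (the helper `choose_auto_init` is
hypothetical and not part of metric_learn); the usage snippet relies only on
the public NCA parameters shown in this diff (`init`, `n_components`,
`max_iter`, `random_state`)::

    # Hypothetical helper mirroring the 'auto' heuristic described in the
    # NCA docstring above; metric_learn's internal code may differ.
    def choose_auto_init(n_components, n_classes, n_features, n_samples):
        if n_components <= n_classes:
            return 'lda'       # LDA makes use of the label information
        if n_components < min(n_features, n_samples):
            return 'pca'       # PCA keeps the highest-variance directions
        return 'identity'

    # Hedged usage example: pick an explicit initialization instead of 'auto'.
    from sklearn.datasets import load_iris
    from metric_learn import NCA

    X, y = load_iris(return_X_y=True)
    nca = NCA(init='pca', n_components=2, max_iter=1000, random_state=42)
    nca.fit(X, y)
    X_embedded = nca.transform(X)   # shape (n_samples, 2)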
diff --git a/metric_learn/rca.py b/metric_learn/rca.py
index 1dbffdd6..503e2408 100644
--- a/metric_learn/rca.py
+++ b/metric_learn/rca.py
@@ -1,14 +1,5 @@
-r"""
-Relative Components Analysis(RCA)
-
-RCA learns a full rank Mahalanobis distance metric based on a weighted sum of
-in-chunklets covariance matrices. It applies a global linear transformation to
-assign large weights to relevant dimensions and low weights to irrelevant
-dimensions. Those relevant dimensions are estimated using "chunklets", subsets
-of points that are known to belong to the same class.
-
-Read more in the :ref:`User Guide `.
-
+"""
+ Relevant Components Analysis (RCA)
"""
from __future__ import absolute_import
@@ -42,6 +33,52 @@ def _chunk_mean_centering(data, chunks):
class RCA(MahalanobisMixin, TransformerMixin):
"""Relevant Components Analysis (RCA)
+
+ RCA learns a full rank Mahalanobis distance metric based on a weighted sum of
+ in-chunklets covariance matrices. It applies a global linear transformation
+ to assign large weights to relevant dimensions and low weights to irrelevant
+ dimensions. Those relevant dimensions are estimated using "chunklets",
+ subsets of points that are known to belong to the same class.
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
+ ----------
+ n_components : int or None, optional (default=None)
+ Dimensionality of reduced space (if None, defaults to dimension of X).
+
+ num_dims : Not used
+
+ .. deprecated:: 0.5.0
+ `num_dims` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0. Use `n_components` instead.
+
+ pca_comps : Not used
+ .. deprecated:: 0.5.0
+ `pca_comps` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0.
+
+ preprocessor : array-like, shape=(n_samples, n_features) or callable
+ The preprocessor to call to get tuples from indices. If array-like,
+ tuples will be formed like this: X[indices].
+
+ Examples
+ --------
+ >>> from metric_learn import RCA_Supervised
+ >>> from sklearn.datasets import load_iris
+ >>> iris_data = load_iris()
+ >>> X = iris_data['data']
+ >>> Y = iris_data['target']
+ >>> rca = RCA_Supervised(num_chunks=30, chunk_size=2)
+ >>> rca.fit(X, Y)
+
+ References
+ ----------
+ .. [1] Noam Shental, et al. `Adjustment learning and relevant component
+ analysis `_.
+
+
Attributes
----------
transformer_ : `numpy.ndarray`, shape=(n_components, n_features)
@@ -50,28 +87,6 @@ class RCA(MahalanobisMixin, TransformerMixin):
def __init__(self, n_components=None, num_dims='deprecated',
pca_comps='deprecated', preprocessor=None):
- """Initialize the learner.
-
- Parameters
- ----------
- n_components : int or None, optional (default=None)
- Dimensionality of reduced space (if None, defaults to dimension of X).
-
- num_dims : Not used
-
- .. deprecated:: 0.5.0
- `num_dims` was deprecated in version 0.5.0 and will
- be removed in 0.6.0. Use `n_components` instead.
-
- pca_comps : Not used
- .. deprecated:: 0.5.0
- `pca_comps` was deprecated in version 0.5.0 and will
- be removed in 0.6.0.
-
- preprocessor : array-like, shape=(n_samples, n_features) or callable
- The preprocessor to call to get tuples from indices. If array-like,
- tuples will be formed like this: X[indices].
- """
self.n_components = n_components
self.num_dims = num_dims
self.pca_comps = pca_comps
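For readers of the RCA docstring above: the chunklets are typically encoded as
one integer label per sample, with -1 marking samples that belong to no
chunklet, and passed to `fit` as its second argument. The snippet below is a
minimal sketch under that assumption; the toy data and chunk assignments are
made up for illustration::

    import numpy as np
    from metric_learn import RCA

    # Six 2-D points; chunks[i] is the chunklet index of point i, and -1
    # marks points that are not assigned to any chunklet.
    X = np.array([[0.0, 0.0], [0.1, 0.2], [1.0, 1.0],
                  [1.1, 0.9], [5.0, 5.0], [5.2, 4.8]])
    chunks = np.array([0, 0, 1, 1, -1, -1])

    rca = RCA(n_components=2)
    rca.fit(X, chunks)               # learns from within-chunklet covariances
    X_transformed = rca.transform(X)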
@@ -153,6 +168,27 @@ def _inv_sqrtm(x):
class RCA_Supervised(RCA):
"""Supervised version of Relevant Components Analysis (RCA)
+
+ `RCA_Supervised` creates chunks of similar points by first sampling a
+ class, taking `chunk_size` elements in it, and repeating the process
+ `num_chunks` times.
+
+ Parameters
+ ----------
+ n_components : int or None, optional (default=None)
+ Dimensionality of reduced space (if None, defaults to dimension of X).
+
+ num_dims : Not used
+
+ .. deprecated:: 0.5.0
+ `num_dims` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0. Use `n_components` instead.
+
+ num_chunks : int, optional (default=100)
+ Number of chunks to generate.
+
+ chunk_size : int, optional (default=2)
+ Number of points per chunk.
+
+ preprocessor : array-like, shape=(n_samples, n_features) or callable
+ The preprocessor to call to get tuples from indices. If array-like,
+ tuples will be formed like this: X[indices].
+
Attributes
----------
transformer_ : `numpy.ndarray`, shape=(n_components, n_features)
@@ -162,29 +198,6 @@ class RCA_Supervised(RCA):
def __init__(self, num_dims='deprecated', n_components=None,
pca_comps='deprecated', num_chunks=100, chunk_size=2,
preprocessor=None):
- """Initialize the supervised version of `RCA`.
-
- `RCA_Supervised` creates chunks of similar points by first sampling a
- class, taking `chunk_size` elements in it, and repeating the process
- `num_chunks` times.
-
- Parameters
- ----------
- n_components : int or None, optional (default=None)
- Dimensionality of reduced space (if None, defaults to dimension of X).
-
- num_dims : Not used
-
- .. deprecated:: 0.5.0
- `num_dims` was deprecated in version 0.5.0 and will
- be removed in 0.6.0. Use `n_components` instead.
-
- num_chunks: int, optional
- chunk_size: int, optional
- preprocessor : array-like, shape=(n_samples, n_features) or callable
- The preprocessor to call to get tuples from indices. If array-like,
- tuples will be formed like this: X[indices].
- """
RCA.__init__(self, num_dims=num_dims, n_components=n_components,
pca_comps=pca_comps, preprocessor=preprocessor)
self.num_chunks = num_chunks
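The `RCA_Supervised` docstring above describes its chunk generation as: sample
a class, draw `chunk_size` points from it, and repeat `num_chunks` times. A
rough, self-contained sketch of that scheme (an illustration of the idea, not
the library's actual implementation) could look like this::

    import numpy as np

    # Illustrative sketch of the chunk-generation scheme described in the
    # RCA_Supervised docstring; not the library's actual routine.
    def make_chunks(y, num_chunks=100, chunk_size=2, random_state=None):
        rng = np.random.RandomState(random_state)
        y = np.asarray(y)
        chunks = -np.ones(len(y), dtype=int)    # -1 = not in any chunklet
        classes = np.unique(y)
        for chunk_idx in range(num_chunks):
            label = rng.choice(classes)
            free = np.where((y == label) & (chunks == -1))[0]
            if len(free) < chunk_size:
                continue                        # not enough unused points left
            picked = rng.choice(free, chunk_size, replace=False)
            chunks[picked] = chunk_idx
        return chunks

With the iris labels and `num_chunks=30, chunk_size=2` (the values used in the
docstring example above), at most 60 of the 150 points end up in chunklets.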
diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py
index c5e63fa8..70e65c86 100644
--- a/metric_learn/sdml.py
+++ b/metric_learn/sdml.py
@@ -1,15 +1,5 @@
-r"""
-Sparse High-Dimensional Metric Learning(SDML)
-
-SDML is an efficient sparse metric learning in high-dimensional space via
-double regularization: an L1-penalization on the off-diagonal elements of the
-Mahalanobis matrix :math:`\mathbf{M}`, and a log-determinant divergence between
-:math:`\mathbf{M}` and :math:`\mathbf{M_0}` (set as either :math:`\mathbf{I}`
-or :math:`\mathbf{\Omega}^{-1}`, where :math:`\mathbf{\Omega}` is the
-covariance matrix).
-
-Read more in the :ref:`User Guide `.
-
+"""
+Sparse High-Dimensional Metric Learning (SDML)
"""
from __future__ import absolute_import
@@ -38,55 +28,6 @@ class _BaseSDML(MahalanobisMixin):
def __init__(self, balance_param=0.5, sparsity_param=0.01, prior=None,
use_cov='deprecated', verbose=False, preprocessor=None,
random_state=None):
- """
- Parameters
- ----------
- balance_param : float, optional
- trade off between sparsity and M0 prior
-
- sparsity_param : float, optional
- trade off between optimizer and sparseness (see graph_lasso)
-
- prior : None, string or numpy array, optional (default=None)
- Prior to set for the metric. Possible options are
- 'identity', 'covariance', 'random', and a numpy array of
- shape (n_features, n_features). For SDML, the prior should be strictly
- positive definite (PD). If `None`, will be set
- automatically to 'identity' (this is to raise a warning if
- `prior` is not set, and stays to its default value (None), in v0.5.0).
-
- 'identity'
- An identity matrix of shape (n_features, n_features).
-
- 'covariance'
- The inverse covariance matrix.
-
- 'random'
- The prior will be a random positive definite (PD) matrix of shape
- `(n_features, n_features)`, generated using
- `sklearn.datasets.make_spd_matrix`.
-
- numpy array
- A positive definite (PD) matrix of shape
- (n_features, n_features), that will be used as such to set the
- prior.
-
- use_cov : Not used.
- .. deprecated:: 0.5.0
- `A0` was deprecated in version 0.5.0 and will
- be removed in 0.6.0. Use 'prior' instead.
-
- verbose : bool, optional
- if True, prints information while learning
-
- preprocessor : array-like, shape=(n_samples, n_features) or callable
- The preprocessor to call to get tuples from indices. If array-like,
- tuples will be gotten like this: X[indices].
-
- random_state : int or numpy.RandomState or None, optional (default=None)
- A pseudo random number generator object or a seed for it if int. If
- ``prior='random'``, ``random_state`` is used to set the prior.
- """
self.balance_param = balance_param
self.sparsity_param = sparsity_param
self.prior = prior
@@ -190,6 +131,63 @@ def _fit(self, pairs, y):
class SDML(_BaseSDML, _PairsClassifierMixin):
"""Sparse Distance Metric Learning (SDML)
+
+ SDML is an efficient sparse metric learning method for high-dimensional
+ spaces, using double regularization: an L1-penalization on the off-diagonal
+ elements of the Mahalanobis matrix :math:`\mathbf{M}`, and a log-determinant
+ divergence between :math:`\mathbf{M}` and :math:`\mathbf{M_0}` (set as either
+ :math:`\mathbf{I}` or :math:`\mathbf{\Omega}^{-1}`, where
+ :math:`\mathbf{\Omega}` is the covariance matrix).
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
+ ----------
+ balance_param : float, optional (default=0.5)
+ trade off between sparsity and M0 prior
+
+ sparsity_param : float, optional (default=0.01)
+ trade off between optimizer and sparseness (see graph_lasso)
+
+ prior : None, string or numpy array, optional (default=None)
+ Prior to set for the metric. Possible options are
+ 'identity', 'covariance', 'random', and a numpy array of
+ shape (n_features, n_features). For SDML, the prior should be strictly
+ positive definite (PD). If `None`, it will be set
+ automatically to 'identity' (this is only to raise a warning if
+ `prior` is not set and stays at its default value (None), in v0.5.0).
+
+ 'identity'
+ An identity matrix of shape (n_features, n_features).
+
+ 'covariance'
+ The inverse covariance matrix.
+
+ 'random'
+ The prior will be a random positive definite (PD) matrix of shape
+ `(n_features, n_features)`, generated using
+ `sklearn.datasets.make_spd_matrix`.
+
+ numpy array
+ A positive definite (PD) matrix of shape
+ (n_features, n_features), that will be used as such to set the
+ prior.
+
+ use_cov : Not used.
+ .. deprecated:: 0.5.0
+ `use_cov` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0. Use 'prior' instead.
+
+ verbose : bool, optional (default=False)
+ if True, prints information while learning
+
+ preprocessor : array-like, shape=(n_samples, n_features) or callable
+ The preprocessor to call to get tuples from indices. If array-like,
+ tuples will be gotten like this: X[indices].
+
+ random_state : int or numpy.RandomState or None, optional (default=None)
+ A pseudo random number generator object or a seed for it if int. If
+ ``prior='random'``, ``random_state`` is used to set the prior.
+
Attributes
----------
transformer_ : `numpy.ndarray`, shape=(n_features, n_features)
@@ -200,6 +198,27 @@ class SDML(_BaseSDML, _PairsClassifierMixin):
If the distance metric between two points is lower than this threshold,
points will be classified as similar, otherwise they will be
classified as dissimilar.
+
+ Examples
+ --------
+ >>> from metric_learn import SDML_Supervised
+ >>> from sklearn.datasets import load_iris
+ >>> iris_data = load_iris()
+ >>> X = iris_data['data']
+ >>> Y = iris_data['target']
+ >>> sdml = SDML_Supervised(num_constraints=200)
+ >>> sdml.fit(X, Y)
+
+ References
+ ----------
+
+ .. [1] Qi et al.
+ An efficient sparse metric learning in high-dimensional space via
+ L1-penalized log-determinant regularization. ICML 2009.
+ http://lms.comp.nus.edu.sg/sites/default/files/publication\
+-attachments/icml09-guojun.pdf
+
+ .. [2] Adapted from https://gist.github.com/kcarnold/5439945
"""
def fit(self, pairs, y, calibration_params=None):
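To make the SDML example above more concrete at the pairs level: `SDML` itself
is fit on explicit pairs rather than on class labels. The snippet below is a
hedged sketch; it assumes the usual metric_learn convention that `y` is +1 for
similar pairs and -1 for dissimilar pairs, and the tiny toy data is only there
to show the shapes (on such small inputs the underlying solver may emit
convergence warnings)::

    import numpy as np
    from metric_learn import SDML

    # Four pairs of 3-D points: the first two pairs are labelled similar
    # (+1), the last two dissimilar (-1).
    pairs = np.array([[[1.2, 7.5, 1.3], [1.3, 1.5, 1.2]],
                      [[6.4, 2.6, 2.3], [6.2, 9.7, 2.2]],
                      [[1.3, 4.5, 1.2], [3.2, 4.6, 8.0]],
                      [[3.3, 0.5, 2.1], [8.2, 9.7, 3.1]]])
    y = np.array([1, 1, -1, -1])

    sdml = SDML(prior='identity', balance_param=0.5, random_state=42)
    sdml.fit(pairs, y)
    predictions = sdml.predict(pairs)   # +1 / -1 for each pair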
@@ -210,7 +229,7 @@ def fit(self, pairs, y, calibration_params=None):
Parameters
----------
- pairs : array-like, shape=(n_constraints, 2, n_features) or
+ pairs : array-like, shape=(n_constraints, 2, n_features) or \
(n_constraints, 2)
3D Array of pairs with each row corresponding to two points,
or 2D array of indices of pairs if the metric learner uses a
@@ -238,74 +257,78 @@ def fit(self, pairs, y, calibration_params=None):
class SDML_Supervised(_BaseSDML, TransformerMixin):
"""Supervised version of Sparse Distance Metric Learning (SDML)
+
+ `SDML_Supervised` creates pairs of similar samples by taking same-class
+ samples, and pairs of dissimilar samples by taking different-class
+ samples. It then passes these pairs to `SDML` for training.
+
+ Parameters
+ ----------
+ balance_param : float, optional (default=0.5)
+ trade off between sparsity and M0 prior
+ sparsity_param : float, optional (default=0.01)
+ trade off between optimizer and sparseness (see graph_lasso)
+ prior : None, string or numpy array, optional (default=None)
+ Prior to set for the metric. Possible options are
+ 'identity', 'covariance', 'random', and a numpy array of
+ shape (n_features, n_features). For SDML, the prior should be strictly
+ positive definite (PD). If `None`, it will be set
+ automatically to 'identity' (this is only to raise a warning if
+ `prior` is not set and stays at its default value (None), in v0.5.0).
+
+ 'identity'
+ An identity matrix of shape (n_features, n_features).
+
+ 'covariance'
+ The inverse covariance matrix.
+
+ 'random'
+ The prior will be a random SPD matrix of shape
+ `(n_features, n_features)`, generated using
+ `sklearn.datasets.make_spd_matrix`.
+
+ numpy array
+ A positive definite (PD) matrix of shape
+ (n_features, n_features), that will be used as such to set the
+ prior.
+
+ use_cov : Not used.
+ .. deprecated:: 0.5.0
+ `use_cov` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0. Use 'prior' instead.
+
+ num_labeled : Not used
+ .. deprecated:: 0.5.0
+ `num_labeled` was deprecated in version 0.5.0 and will
+ be removed in 0.6.0.
+ num_constraints : int, optional (default=None)
+ number of constraints to generate
+ verbose : bool, optional (default=False)
+ if True, prints information while learning
+ preprocessor : array-like, shape=(n_samples, n_features) or callable
+ The preprocessor to call to get tuples from indices. If array-like,
+ tuples will be formed like this: X[indices].
+ random_state : int or numpy.RandomState or None, optional (default=None)
+ A pseudo random number generator object or a seed for it if int. If
+ ``prior='random'``, ``random_state`` is used to set the random
+ prior.
+
Attributes
----------
transformer_ : `numpy.ndarray`, shape=(n_features, n_features)
The linear transformation ``L`` deduced from the learned Mahalanobis
metric (See function `transformer_from_metric`.)
+
+ See Also
+ --------
+ metric_learn.SDML : The original weakly-supervised algorithm
+ :ref:`supervised_version` : The section of the project documentation
+ that describes the supervised version of weakly supervised estimators.
"""
def __init__(self, balance_param=0.5, sparsity_param=0.01, prior=None,
use_cov='deprecated', num_labeled='deprecated',
num_constraints=None, verbose=False, preprocessor=None,
random_state=None):
- """Initialize the supervised version of `SDML`.
-
- `SDML_Supervised` creates pairs of similar sample by taking same class
- samples, and pairs of dissimilar samples by taking different class
- samples. It then passes these pairs to `SDML` for training.
-
- Parameters
- ----------
- balance_param : float, optional
- trade off between sparsity and M0 prior
- sparsity_param : float, optional
- trade off between optimizer and sparseness (see graph_lasso)
- prior : None, string or numpy array, optional (default=None)
- Prior to set for the metric. Possible options are
- 'identity', 'covariance', 'random', and a numpy array of
- shape (n_features, n_features). For SDML, the prior should be strictly
- positive definite (PD). If `None`, will be set
- automatically to 'identity' (this is to raise a warning if
- `prior` is not set, and stays to its default value (None), in v0.5.0).
-
- 'identity'
- An identity matrix of shape (n_features, n_features).
-
- 'covariance'
- The inverse covariance matrix.
-
- 'random'
- The prior will be a random SPD matrix of shape
- `(n_features, n_features)`, generated using
- `sklearn.datasets.make_spd_matrix`.
-
- numpy array
- A positive definite (PD) matrix of shape
- (n_features, n_features), that will be used as such to set the
- prior.
-
- use_cov : Not used.
- .. deprecated:: 0.5.0
- `A0` was deprecated in version 0.5.0 and will
- be removed in 0.6.0. Use 'prior' instead.
-
- num_labeled : Not used
- .. deprecated:: 0.5.0
- `num_labeled` was deprecated in version 0.5.0 and will
- be removed in 0.6.0.
- num_constraints : int, optional
- number of constraints to generate
- verbose : bool, optional
- if True, prints information while learning
- preprocessor : array-like, shape=(n_samples, n_features) or callable
- The preprocessor to call to get tuples from indices. If array-like,
- tuples will be formed like this: X[indices].
- random_state : int or numpy.RandomState or None, optional (default=None)
- A pseudo random number generator object or a seed for it if int. If
- ``init='random'``, ``random_state`` is used to set the random
- prior.
- """
_BaseSDML.__init__(self, balance_param=balance_param,
sparsity_param=sparsity_param, prior=prior,
use_cov=use_cov, verbose=verbose,