diff --git a/.gitignore b/.gitignore
index 4c81e9fa..c532a6cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ dist/
 .coverage
 htmlcov/
 .cache/
+doc/auto_examples/*
diff --git a/doc/conf.py b/doc/conf.py
index 1c8beeab..ed476edd 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -7,6 +7,7 @@
     'sphinx.ext.viewcode',
     'sphinx.ext.mathjax',
     'numpydoc',
+    'sphinx_gallery.gen_gallery'
 ]
 
 templates_path = ['_templates']
@@ -15,7 +16,7 @@
 # General information about the project.
 project = u'metric-learn'
-copyright = u'2015-2017, CJ Carey and Yuan Tang'
+copyright = u'2015-2018, CJ Carey and Yuan Tang'
 author = u'CJ Carey and Yuan Tang'
 version = '0.4.0'
 release = '0.4.0'
@@ -31,3 +32,6 @@
 html_static_path = ['_static']
 htmlhelp_basename = 'metric-learndoc'
+
+# Option to only need single backticks to refer to symbols
+default_role = 'any'
+
diff --git a/doc/getting_started.rst b/doc/getting_started.rst
new file mode 100644
index 00000000..040adedc
--- /dev/null
+++ b/doc/getting_started.rst
@@ -0,0 +1,42 @@
+###############
+Getting started
+###############
+
+Installation and Setup
+======================
+
+Run ``pip install metric-learn`` to download and install from PyPI.
+
+Alternatively, download the source repository and run:
+
+- ``python setup.py install`` for default installation.
+- ``python setup.py test`` to run all tests.
+
+**Dependencies**
+
+- Python 2.7+, 3.4+
+- numpy, scipy, scikit-learn
+- (for running the examples only: matplotlib)
+
+**Notes**
+
+If a recent version of the Shogun Python modular (``modshogun``) library
+is available, the LMNN implementation will use the fast C++ version from
+there. The two implementations differ slightly, and the C++ version is
+more complete.
+
+
+Quick start
+===========
+
+This example loads the iris dataset and evaluates a k-nearest neighbors
+algorithm on an embedding space learned with `NCA`.
+
+>>> from metric_learn import NCA
+>>> from sklearn.datasets import load_iris
+>>> from sklearn.model_selection import cross_val_score
+>>> from sklearn.neighbors import KNeighborsClassifier
+>>> from sklearn.pipeline import make_pipeline
+>>>
+>>> X, y = load_iris(return_X_y=True)
+>>> clf = make_pipeline(NCA(), KNeighborsClassifier())
+>>> cross_val_score(clf, X, y)
diff --git a/doc/index.rst b/doc/index.rst
index f50781fe..9dbcd9b0 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -2,78 +2,31 @@ metric-learn: Metric Learning in Python
 =======================================
 |License| |PyPI version|
 
-Distance metrics are widely used in the machine learning literature.
-Traditionally, practicioners would choose a standard distance metric
-(Euclidean, City-Block, Cosine, etc.) using a priori knowledge of
-the domain.
-Distance metric learning (or simply, metric learning) is the sub-field of
-machine learning dedicated to automatically constructing optimal distance
-metrics.
-
-This package contains efficient Python implementations of several popular
-metric learning algorithms.
+Welcome to metric-learn's documentation !
+-----------------------------------------
 
 .. toctree::
-   :caption: Algorithms
-   :maxdepth: 1
-
-   metric_learn.covariance
-   metric_learn.lmnn
-   metric_learn.itml
-   metric_learn.sdml
-   metric_learn.lsml
-   metric_learn.nca
-   metric_learn.lfda
-   metric_learn.rca
-
-Each metric supports the following methods:
-
-- ``fit(...)``, which learns the model.
-- ``transformer()``, which returns a transformation matrix
-  :math:`L \in \mathbb{R}^{D \times d}`, which can be used to convert a
-  data matrix :math:`X \in \mathbb{R}^{n \times d}` to the
-  :math:`D`-dimensional learned metric space :math:`X L^{\top}`,
-  in which standard Euclidean distances may be used.
-- ``transform(X)``, which applies the aforementioned transformation.
-- ``metric()``, which returns a Mahalanobis matrix
-  :math:`M = L^{\top}L` such that distance between vectors ``x`` and
-  ``y`` can be computed as :math:`\left(x-y\right)M\left(x-y\right)`.
+   :maxdepth: 2
 
+   getting_started
 
-Installation and Setup
-======================
-
-Run ``pip install metric-learn`` to download and install from PyPI.
-
-Alternately, download the source repository and run:
-
-- ``python setup.py install`` for default installation.
-- ``python setup.py test`` to run all tests.
+.. toctree::
+   :maxdepth: 2
 
-**Dependencies**
+   user_guide
 
-- Python 2.7+, 3.4+
-- numpy, scipy, scikit-learn
-- (for running the examples only: matplotlib)
+.. toctree::
+   :maxdepth: 2
 
-**Notes**
+   Package Overview
 
-If a recent version of the Shogun Python modular (``modshogun``) library
-is available, the LMNN implementation will use the fast C++ version from
-there. The two implementations differ slightly, and the C++ version is
-more complete.
+.. toctree::
+   :maxdepth: 2
 
-Navigation
-----------
+   auto_examples/index
 
 :ref:`genindex` | :ref:`modindex` | :ref:`search`
 
-.. toctree::
-   :maxdepth: 4
-   :hidden:
-
-   Package Overview
-
 .. |PyPI version| image:: https://badge.fury.io/py/metric-learn.svg
    :target: http://badge.fury.io/py/metric-learn
 .. |License| image:: http://img.shields.io/:license-mit-blue.svg?style=flat
diff --git a/doc/introduction.rst b/doc/introduction.rst
new file mode 100644
index 00000000..079b82d0
--- /dev/null
+++ b/doc/introduction.rst
@@ -0,0 +1,38 @@
+============
+Introduction
+============
+
+Distance metrics are widely used in the machine learning literature.
+Traditionally, practitioners would choose a standard distance metric
+(Euclidean, City-Block, Cosine, etc.) using a priori knowledge of
+the domain.
+Distance metric learning (or simply, metric learning) is the sub-field of
+machine learning dedicated to automatically constructing task-specific
+distance metrics from (weakly) supervised data.
+The learned distance metric often corresponds to a Euclidean distance in a new
+embedding space, hence distance metric learning can be seen as a form of
+representation learning.
+
+This package contains efficient Python implementations of several popular
+metric learning algorithms, compatible with scikit-learn. This makes it
+possible to use all the scikit-learn routines for pipelining and model
+selection with metric learning algorithms.
+
+
+Currently, each metric learning algorithm supports the following methods:
+
+- ``fit(...)``, which learns the model.
+- ``metric()``, which returns a Mahalanobis matrix
+  :math:`M = L^{\top}L` such that the distance between vectors ``x`` and
+  ``y`` can be computed as :math:`\left(x-y\right)M\left(x-y\right)^{\top}`.
+- ``transformer_from_metric(metric)``, which returns a transformation matrix
+  :math:`L \in \mathbb{R}^{D \times d}`, which can be used to convert a
+  data matrix :math:`X \in \mathbb{R}^{n \times d}` to the
+  :math:`D`-dimensional learned metric space :math:`X L^{\top}`,
+  in which standard Euclidean distances may be used.
+- ``transform(X)``, which applies the aforementioned transformation.
+- ``score_pairs(pairs)``, which returns the distance between pairs of
+  points. ``pairs`` should be a 3D array-like of pairs of shape ``(n_pairs,
+  2, n_features)``, or it can be a 2D array-like of pair indicators of
+  shape ``(n_pairs, 2)`` (see section :ref:`preprocessor_section` for more
+  details).
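+
+As a quick illustration of these methods, here is a minimal sketch using
+`NCA` on a tiny toy dataset (the data values below are arbitrary and only
+meant to show the calling conventions):
+
+>>> import numpy as np
+>>> from metric_learn import NCA
+>>>
+>>> X = np.array([[0., 0.], [1., 0.], [2., 1.], [3., 1.]])  # toy points
+>>> y = np.array([0, 0, 1, 1])                              # toy labels
+>>>
+>>> nca = NCA()
+>>> nca.fit(X, y)                  # learn the metric from the labeled points
+>>> X_embedded = nca.transform(X)  # map the points to the learned space
+>>> M = nca.metric()               # Mahalanobis matrix M = L^T L
+>>> pairs = np.array([[X[0], X[1]], [X[0], X[2]]])
+>>> nca.score_pairs(pairs)         # distances between the two pairs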
diff --git a/doc/metric_learn.covariance.rst b/doc/metric_learn.covariance.rst
index 92326cc0..493878c1 100644
--- a/doc/metric_learn.covariance.rst
+++ b/doc/metric_learn.covariance.rst
@@ -6,6 +6,7 @@ Covariance metric (baseline method)
    :undoc-members:
    :inherited-members:
    :show-inheritance:
+   :special-members: __init__
 
 Example Code
 ------------
diff --git a/doc/metric_learn.itml.rst b/doc/metric_learn.itml.rst
index d6fb2221..addb4c76 100644
--- a/doc/metric_learn.itml.rst
+++ b/doc/metric_learn.itml.rst
@@ -6,6 +6,7 @@ Information Theoretic Metric Learning (ITML)
    :undoc-members:
    :inherited-members:
    :show-inheritance:
+   :special-members: __init__
 
 Example Code
 ------------
diff --git a/doc/metric_learn.lfda.rst b/doc/metric_learn.lfda.rst
index 95cde90d..41088a68 100644
--- a/doc/metric_learn.lfda.rst
+++ b/doc/metric_learn.lfda.rst
@@ -6,6 +6,7 @@ Local Fisher Discriminant Analysis (LFDA)
    :undoc-members:
    :inherited-members:
    :show-inheritance:
+   :special-members: __init__
 
 Example Code
 ------------
diff --git a/doc/metric_learn.lmnn.rst b/doc/metric_learn.lmnn.rst
index 4062bfa0..bc65161e 100644
--- a/doc/metric_learn.lmnn.rst
+++ b/doc/metric_learn.lmnn.rst
@@ -6,6 +6,7 @@ Large Margin Nearest Neighbor (LMNN)
    :undoc-members:
    :inherited-members:
    :show-inheritance:
+   :special-members: __init__
 
 Example Code
 ------------
diff --git a/doc/metric_learn.lsml.rst b/doc/metric_learn.lsml.rst
index c6c8ede9..0deae4e6 100644
--- a/doc/metric_learn.lsml.rst
+++ b/doc/metric_learn.lsml.rst
@@ -6,6 +6,7 @@ Least Squares Metric Learning (LSML)
    :undoc-members:
    :inherited-members:
    :show-inheritance:
+   :special-members: __init__
 
 Example Code
 ------------
diff --git a/doc/metric_learn.mlkr.rst b/doc/metric_learn.mlkr.rst
index a2f36c4f..f71697de 100644
--- a/doc/metric_learn.mlkr.rst
+++ b/doc/metric_learn.mlkr.rst
@@ -6,6 +6,7 @@ Metric Learning for Kernel Regression (MLKR)
    :undoc-members:
    :inherited-members:
    :show-inheritance:
+   :special-members: __init__
 
 Example Code
 ------------
diff --git a/doc/metric_learn.mmc.rst b/doc/metric_learn.mmc.rst
index f3ddaa9e..bb9031ba 100644
--- a/doc/metric_learn.mmc.rst
+++ b/doc/metric_learn.mmc.rst
@@ -6,6 +6,7 @@ Mahalanobis Metric Learning for Clustering (MMC)
    :undoc-members:
    :inherited-members:
    :show-inheritance:
+   :special-members: __init__
 
 Example Code
 ------------
diff --git a/doc/metric_learn.nca.rst b/doc/metric_learn.nca.rst
index 6a2675e5..7a4ee2c4 100644
--- a/doc/metric_learn.nca.rst
+++ b/doc/metric_learn.nca.rst
@@ -6,6 +6,7 @@ Neighborhood Components Analysis (NCA)
    :undoc-members:
    :inherited-members:
    :show-inheritance:
+   :special-members: __init__
 
 Example Code
 ------------
diff --git a/doc/metric_learn.rca.rst b/doc/metric_learn.rca.rst
index 2430cd82..027d583b 100644
--- a/doc/metric_learn.rca.rst
+++ b/doc/metric_learn.rca.rst
@@ -6,6 +6,7 @@ Relative Components Analysis (RCA)
    :undoc-members:
    :inherited-members:
    :show-inheritance:
+   :special-members: __init__
 
 Example Code
 ------------
diff --git a/doc/metric_learn.rst b/doc/metric_learn.rst
index 70a99a04..c2472408 100644
--- a/doc/metric_learn.rst
+++ b/doc/metric_learn.rst
@@ -1,8 +1,8 @@
 metric_learn package
 ====================
 
-Submodules
-----------
+Module Contents
+---------------
 
 .. toctree::
@@ -16,11 +16,3 @@ Submodules
    metric_learn.nca
    metric_learn.rca
    metric_learn.sdml
-
-Module contents
----------------
-
-.. automodule:: metric_learn
-    :members:
-    :undoc-members:
-    :show-inheritance:
diff --git a/doc/metric_learn.sdml.rst b/doc/metric_learn.sdml.rst
index 83570483..3e350a70 100644
--- a/doc/metric_learn.sdml.rst
+++ b/doc/metric_learn.sdml.rst
@@ -6,6 +6,7 @@ Sparse Determinant Metric Learning (SDML)
    :undoc-members:
    :inherited-members:
    :show-inheritance:
+   :special-members: __init__
 
 Example Code
 ------------
diff --git a/doc/preprocessor.rst b/doc/preprocessor.rst
new file mode 100644
index 00000000..ad1ffd8f
--- /dev/null
+++ b/doc/preprocessor.rst
@@ -0,0 +1,111 @@
+.. _preprocessor_section:
+
+============
+Preprocessor
+============
+
+Estimators in metric-learn all have a ``preprocessor`` option at instantiation.
+Setting this argument allows them to take a more compact input representation
+when fitting, predicting, etc.
+
+If ``preprocessor=None``, no preprocessor will be used and the user must
+provide the classical representation to the fit/predict/score/etc. methods of
+the estimators (see the documentation of the particular estimator to know the
+type of input it accepts). Otherwise, two types of objects can be put in this
+argument:
+
+Array-like
+----------
+You can specify ``preprocessor=X`` where ``X`` is an array-like containing the
+dataset of points. In this case, the fit/predict/score/etc. methods of the
+estimator will be able to take as input an array-like of indices, replacing
+under the hood each index by the corresponding sample.
+
+
+Example with a supervised metric learner:
+
+>>> import numpy as np
+>>> from metric_learn import NCA
+>>>
+>>> X = np.array([[-0.7 , -0.23],
+>>>               [-0.43, -0.49],
+>>>               [ 0.14, -0.37]])  # array of 3 samples of 2 features
+>>> points_indices = np.array([2, 0, 1, 0])
+>>> y = np.array([1, 0, 1, 1])
+>>>
+>>> nca = NCA(preprocessor=X)
+>>> nca.fit(points_indices, y)
+>>> # under the hood the algorithm will create
+>>> # points = np.array([[ 0.14, -0.37],
+>>> #                    [-0.7 , -0.23],
+>>> #                    [-0.43, -0.49],
+>>> #                    [ 0.14, -0.37]]) and fit on it
+
+
+Example with a weakly supervised metric learner:
+
+>>> import numpy as np
+>>> from metric_learn import MMC
+>>>
+>>> X = np.array([[-0.7 , -0.23],
+>>>               [-0.43, -0.49],
+>>>               [ 0.14, -0.37]])  # array of 3 samples of 2 features
+>>> pairs_indices = np.array([[2, 0], [1, 0]])
+>>> y_pairs = np.array([1, -1])
+>>>
+>>> mmc = MMC(preprocessor=X)
+>>> mmc.fit(pairs_indices, y_pairs)
+>>> # under the hood the algorithm will create
+>>> # pairs = np.array([[[ 0.14, -0.37], [-0.7 , -0.23]],
+>>> #                   [[-0.43, -0.49], [-0.7 , -0.23]]]) and fit on it
+
+Callable
+--------
+Alternatively, you can provide a callable as ``preprocessor``. Then the
+estimator will accept indicators of points instead of points. Under the hood,
+the estimator will call this callable on the indicators you provide as input
+when fitting, predicting, etc. Using a callable can be really useful to
+lazily represent a dataset of images stored on the file system, for instance.
+The callable should take as input a 1D array-like, and return a 2D
+array-like. For supervised learners it will be applied on the whole 1D array of
+indicators at once, and for weakly supervised learners it will be applied on
+each column of the 2D array of tuples.
+
+Example with a supervised metric learner:
+
+>>> import numpy as np
+>>> from matplotlib.pyplot import imread  # any image-reading function works
+>>>
+>>> def find_images(file_paths):
+>>>     # each file contains a small image to use as an input datapoint
+>>>     return np.row_stack([imread(f).ravel() for f in file_paths])
+>>>
+>>> nca = NCA(preprocessor=find_images)
+>>> nca.fit(['img01.png', 'img00.png', 'img02.png'], [1, 0, 1])
+>>> # under the hood preprocessor(indicators) will be called
+
+
+Example with a weakly supervised metric learner:
+
+>>> pairs_images_paths = [['img02.png', 'img00.png'],
+>>>                       ['img01.png', 'img00.png']]
+>>> y_pairs = np.array([1, -1])
+>>>
+>>> mmc = MMC(preprocessor=find_images)
+>>> mmc.fit(pairs_images_paths, y_pairs)
+>>> # under the hood preprocessor(pairs_indicators[i]) will be called for each
+>>> # i in [0, 1]
+
+
+.. note:: Note that when you set the ``preprocessor`` option, it allows you
+   to give more compact inputs, but the classical way of providing inputs
+   stays valid (2D array-like for supervised learners and 3D array-like of
+   tuples for weakly supervised learners). If a classical input
+   is provided, the metric learner will not use the preprocessor.
+
+   Example: This will work:
+
+   >>> from metric_learn import MMC
+   >>> def preprocessor_wip(array):
+   >>>     raise NotImplementedError("This preprocessor does nothing yet.")
+   >>>
+   >>> pairs = np.array([[[ 0.14, -0.37], [-0.7 , -0.23]],
+   >>>                   [[-0.43, -0.49], [-0.7 , -0.23]]])
+   >>> y_pairs = np.array([1, -1])
+   >>>
+   >>> mmc = MMC(preprocessor=preprocessor_wip)
+   >>> mmc.fit(pairs, y_pairs)  # preprocessor_wip will not be called here
diff --git a/doc/supervised.rst b/doc/supervised.rst
new file mode 100644
index 00000000..32dba84b
--- /dev/null
+++ b/doc/supervised.rst
@@ -0,0 +1,209 @@
+==========================
+Supervised Metric Learning
+==========================
+
+Supervised metric learning algorithms take as inputs points `X` and target
+labels `y`, and learn a distance matrix that makes points from the same class
+(for classification) or with close target values (for regression) close to
+each other, and points from different classes or with distant target values
+far away from each other.
+
+Scikit-learn compatibility
+==========================
+
+All supervised algorithms are scikit-learn `Estimators`, so they are
+compatible with pipelining and scikit-learn model selection routines.
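+
+For instance, a supervised metric learner can be used as a transformer inside
+a pipeline and tuned by grid search. The sketch below is only illustrative:
+it assumes the iris dataset and arbitrary parameter values::
+
+    from sklearn.datasets import load_iris
+    from sklearn.model_selection import GridSearchCV
+    from sklearn.neighbors import KNeighborsClassifier
+    from sklearn.pipeline import make_pipeline
+    from metric_learn import NCA
+
+    X, y = load_iris(return_X_y=True)
+
+    # learn an NCA metric, then classify with k-nearest neighbors in the
+    # learned space; the number of NCA iterations is tuned by cross-validation
+    pipe = make_pipeline(NCA(), KNeighborsClassifier())
+    grid = GridSearchCV(pipe, {'nca__max_iter': [50, 100]}, cv=3)
+    grid.fit(X, y)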
+
+Algorithms
+==========
+
+Covariance
+----------
+
+.. todo:: Covariance is unsupervised, so its doc should not be here.
+
+`Covariance` does not "learn" anything; rather, it calculates
+the covariance matrix of the input data. This is a simple baseline method.
+
+.. topic:: Example Code:
+
+::
+
+    from metric_learn import Covariance
+    from sklearn.datasets import load_iris
+
+    iris = load_iris()['data']
+
+    cov = Covariance().fit(iris)
+    x = cov.transform(iris)
+
+.. topic:: References:
+
+    .. [1] On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936
+
+LMNN
+-----
+
+Large-margin nearest neighbor metric learning.
+
+`LMNN` learns a Mahalanobis distance metric in the kNN classification
+setting using semidefinite programming. The learned metric attempts to keep
+k-nearest neighbors in the same class, while keeping examples from different
+classes separated by a large margin. This algorithm makes no assumptions about
+the distribution of the data.
+
+.. topic:: Example Code:
+
+::
+
+    import numpy as np
+    from metric_learn import LMNN
+    from sklearn.datasets import load_iris
+
+    iris_data = load_iris()
+    X = iris_data['data']
+    Y = iris_data['target']
+
+    lmnn = LMNN(k=5, learn_rate=1e-6)
+    lmnn.fit(X, Y, verbose=False)
+
+If a recent version of the Shogun Python modular (``modshogun``) library
+is available, the LMNN implementation will use the fast C++ version from
+there. Otherwise, the included pure-Python version will be used.
+The two implementations differ slightly, and the C++ version is more complete.
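+
+The selection between the two implementations is automatic. A quick way to
+check whether the C++ version can be picked up is simply to try the import
+yourself (this check is only illustrative and is not part of the metric-learn
+API)::
+
+    try:
+        import modshogun  # noqa: F401  (only checking availability)
+        print("modshogun found: the fast C++ LMNN implementation will be used")
+    except ImportError:
+        print("modshogun not found: the pure-Python LMNN will be used")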
+
+.. topic:: References:
+
+    .. [1] `Distance Metric Learning for Large Margin Nearest Neighbor
+       Classification
+       `_ Kilian Q. Weinberger, John
+       Blitzer, Lawrence K. Saul
+
+NCA
+---
+
+Neighborhood Components Analysis (`NCA`) is a distance metric learning
+algorithm which aims to improve the accuracy of nearest neighbors
+classification compared to the standard Euclidean distance. The algorithm
+directly maximizes a stochastic variant of the leave-one-out k-nearest
+neighbors (KNN) score on the training set. It can also learn a low-dimensional
+linear embedding of data that can be used for data visualization and fast
+classification.
+
+.. topic:: Example Code:
+
+::
+
+    import numpy as np
+    from metric_learn import NCA
+    from sklearn.datasets import load_iris
+
+    iris_data = load_iris()
+    X = iris_data['data']
+    Y = iris_data['target']
+
+    nca = NCA(max_iter=1000, learning_rate=0.01)
+    nca.fit(X, Y)
+
+.. topic:: References:
+
+    .. [1] J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov.
+       "Neighbourhood Components Analysis". Advances in Neural Information
+       Processing Systems. 17, 513-520, 2005.
+       http://www.cs.nyu.edu/~roweis/papers/ncanips.pdf
+
+    .. [2] Wikipedia entry on Neighborhood Components Analysis
+       https://en.wikipedia.org/wiki/Neighbourhood_components_analysis
+
+LFDA
+----
+
+Local Fisher Discriminant Analysis (LFDA)
+
+`LFDA` is a linear supervised dimensionality reduction method. It is
+particularly useful when dealing with multimodality, where one or more classes
+consist of separate clusters in input space. The core optimization problem of
+LFDA is solved as a generalized eigenvalue problem.
+
+.. topic:: Example Code:
+
+::
+
+    import numpy as np
+    from metric_learn import LFDA
+    from sklearn.datasets import load_iris
+
+    iris_data = load_iris()
+    X = iris_data['data']
+    Y = iris_data['target']
+
+    lfda = LFDA(k=2, dim=2)
+    lfda.fit(X, Y)
+
+.. topic:: References:
+
+    .. [1] `Dimensionality Reduction of Multimodal Labeled Data by Local
+       Fisher Discriminant Analysis `_ Masashi Sugiyama.
+
+    .. [2] `Local Fisher Discriminant Analysis on Beer Style Clustering
+       `_ Yuan Tang.
+
+
+MLKR
+----
+
+Metric Learning for Kernel Regression.
+
+`MLKR` is an algorithm for supervised metric learning, which learns a
+distance function by directly minimising the leave-one-out regression error.
+This algorithm can also be viewed as a supervised variation of PCA and can be
+used for dimensionality reduction and high dimensional data visualization.
+
+.. topic:: Example Code:
+
+::
+
+    from metric_learn import MLKR
+    from sklearn.datasets import load_iris
+
+    iris_data = load_iris()
+    X = iris_data['data']
+    Y = iris_data['target']
+
+    mlkr = MLKR()
+    mlkr.fit(X, Y)
+
+.. topic:: References:
+
+    .. [1] `Metric Learning for Kernel Regression `_ Kilian Q. Weinberger,
+       Gerald Tesauro
+
+
+Supervised versions of weakly-supervised algorithms
+----------------------------------------------------
+
+Note that each :ref:`weakly-supervised algorithm `
+has a supervised version of the form `*_Supervised` where similarity tuples are
+generated from the label information and passed to the underlying algorithm.
+
+.. todo:: add more details about that (see issue ``_)
+
+
+.. topic:: Example Code:
+
+::
+
+    from metric_learn import MMC_Supervised
+    from sklearn.datasets import load_iris
+
+    iris_data = load_iris()
+    X = iris_data['data']
+    Y = iris_data['target']
+
+    mmc = MMC_Supervised(num_constraints=200)
+    mmc.fit(X, Y)
diff --git a/doc/user_guide.rst b/doc/user_guide.rst
new file mode 100644
index 00000000..fb7060ce
--- /dev/null
+++ b/doc/user_guide.rst
@@ -0,0 +1,15 @@
+.. title:: User guide: contents
+
+.. _user_guide:
+
+==========
+User Guide
+==========
+
+.. toctree::
+   :numbered:
+
+   introduction.rst
+   supervised.rst
+   weakly_supervised.rst
+   preprocessor.rst
\ No newline at end of file
diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst
new file mode 100644
index 00000000..deae9b40
--- /dev/null
+++ b/doc/weakly_supervised.rst
@@ -0,0 +1,345 @@
+.. _weakly_supervised_section:
+
+=================================
+Weakly Supervised Metric Learning
+=================================
+
+Weakly supervised algorithms work on weaker information about the data points
+than supervised algorithms. Rather than labeled points, they take as input
+similarity judgments on tuples of data points, for instance pairs of similar
+and dissimilar points. Refer to the documentation of each algorithm for its
+particular form of input data.
+
+
+Input data
+==========
+
+In the following paragraphs we talk about tuples for the sake of generality.
+These can be pairs, triplets, quadruplets, etc., depending on the particular
+metric learning algorithm we use.
+
+Basic form
+----------
+Every weakly supervised algorithm will take as input tuples of points, and,
+if needed, labels for these tuples.
+
+
+The `tuples` argument is the first argument of every method (like the X
+argument for classical algorithms in scikit-learn). The second argument is the
+label of the tuple: its meaning depends on the algorithm used. For instance
+for pairs learners ``y`` is a label indicating whether the pair is of similar
+samples or dissimilar samples.
+
+Then one can fit a Weakly Supervised Metric Learner on these tuples, like this:
+
+>>> my_algo.fit(tuples, y)
+
+Just as in a classical setting we would split the points ``X`` between train
+and test, here we split the ``tuples`` between train and test.
+
+>>> from sklearn.model_selection import train_test_split
+>>> pairs_train, pairs_test, y_train, y_test = train_test_split(pairs, y)
+
+There are two data structures that can be used to represent tuples in
+metric-learn:
+
+3D array of tuples
+------------------
+
+The most intuitive way to represent tuples is to provide the algorithm with a
+3D array-like of tuples of shape ``(n_tuples, tuple_size, n_features)``, where
+``n_tuples`` is the number of tuples, ``tuple_size`` is the number of elements
+in a tuple (2 for pairs, 3 for triplets for instance), and ``n_features`` is
+the number of features of each point.
+
+.. topic:: Example:
+   Here is an artificial dataset of 4 pairs of 2 points of 3 features each:
+
+>>> import numpy as np
+>>> tuples = np.array([[[-0.12, -1.21, -0.20],
+>>>                     [+0.05, -0.19, -0.05]],
+>>>
+>>>                    [[-2.16, +0.11, -0.02],
+>>>                     [+1.58, +0.16, +0.93]],
+>>>
+>>>                    [[+1.58, +0.16, +0.93],  # same as tuples[1, 1, :]
+>>>                     [+0.89, -0.34, +2.41]],
+>>>
+>>>                    [[-0.12, -1.21, -0.20],  # same as tuples[0, 0, :]
+>>>                     [-2.16, +0.11, -0.02]]])  # same as tuples[1, 0, :]
+>>> y = np.array([-1, 1, 1, -1])
+
+.. warning:: This way of specifying pairs is not recommended for a large number
+   of tuples, as it is redundant (see the comments in the example) and hence
+   takes a lot of memory. Indeed each feature vector of a point will be
+   replicated as many times as a point is involved in a tuple. The second way
+   to specify pairs is more efficient.
+
+
+2D array of indicators + preprocessor
+-------------------------------------
+
+Instead of forming each point in each tuple, a more efficient representation
+would be to keep the dataset of points ``X`` aside, and just represent tuples
+as a collection of tuples of *indices* from the points in ``X``. Since we lose
+the feature dimension there, the resulting array is 2D.
+
+.. topic:: Example: An equivalent representation of the above pairs would be:
+
+>>> X = np.array([[-0.12, -1.21, -0.20],
+>>>               [+0.05, -0.19, -0.05],
+>>>               [-2.16, +0.11, -0.02],
+>>>               [+1.58, +0.16, +0.93],
+>>>               [+0.89, -0.34, +2.41]])
+>>>
+>>> tuples_indices = np.array([[0, 1],
+>>>                            [2, 3],
+>>>                            [3, 4],
+>>>                            [0, 2]])
+>>> y = np.array([-1, 1, 1, -1])
+
+In order to fit metric learning algorithms with this type of input, we need to
+give the original dataset of points ``X`` to the estimator so that it knows
+the points the indices refer to. We do this when initializing the estimator,
+through the argument `preprocessor`.
+
+.. topic:: Example:
+
+>>> from metric_learn import MMC
+>>> mmc = MMC(preprocessor=X)
+>>> mmc.fit(tuples_indices, y)
+
+
+.. note::
+
+   Instead of an array-like, you can give a callable in the argument
+   ``preprocessor``, which will go fetch and form the tuples. This makes it
+   possible to give more general indicators than just indices from an array
+   (for instance paths in the filesystem, name of records in a database,
+   etc.). See section :ref:`preprocessor_section` for more details on how to
+   use the preprocessor.
+
+
+Scikit-learn compatibility
+==========================
+
+Weakly supervised estimators are compatible with scikit-learn routines for
+model selection (grid-search, cross-validation, etc.). See the scoring section
+for more details on the scoring used in the case of Weakly Supervised
+Metric Learning.
+
+.. topic:: Example
+
+>>> import numpy as np
+>>> from metric_learn import MMC
+>>> from sklearn.datasets import load_iris
+>>> from sklearn.model_selection import cross_val_score
+>>> rng = np.random.RandomState(42)
+>>> X, _ = load_iris(return_X_y=True)
+>>> # let's sample 30 random pairs and labels of pairs
+>>> pairs_indices = rng.randint(X.shape[0], size=(30, 2))
+>>> y = rng.randint(2, size=30)
+>>> mmc = MMC(preprocessor=X)
+>>> cross_val_score(mmc, pairs_indices, y)
+
+Scoring
+=======
+
+Some default scoring schemes are implemented in metric-learn, depending on the
+kind of tuples you're working with (pairs, triplets...). See the docstring of
+the `score` method of the estimator you use.
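+
+For instance, pairs learners such as `MMC` expose a default `score` that can
+be evaluated on held-out pairs. A minimal sketch, continuing the
+cross-validation example above (the split itself is arbitrary):
+
+>>> pairs_train, pairs_test, y_train, y_test = train_test_split(pairs_indices, y)
+>>> mmc.fit(pairs_train, y_train)
+>>> mmc.score(pairs_test, y_test)  # default score on the held-out pairs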
+
+
+Algorithms
+==========
+
+ITML
+----
+
+Information Theoretic Metric Learning, Davis et al., ICML 2007
+
+`ITML` minimizes the differential relative entropy between two multivariate
+Gaussians under constraints on the distance function, which can be formulated
+into a Bregman optimization problem by minimizing the LogDet divergence subject
+to linear constraints. This algorithm can handle a wide variety of constraints
+and can optionally incorporate a prior on the distance function. Unlike some
+other methods, ITML does not rely on an eigenvalue computation or semi-definite
+programming.
+
+.. topic:: Example Code:
+
+::
+
+    from metric_learn import ITML
+
+    pairs = [[[1.2, 7.5], [1.3, 1.5]],
+             [[6.4, 2.6], [6.2, 9.7]],
+             [[1.3, 4.5], [3.2, 4.6]],
+             [[6.2, 5.5], [5.4, 5.4]]]
+    y = [1, 1, -1, -1]
+
+    # in this task we want points where the first feature is close to be closer
+    # to each other, no matter how close the second feature is
+
+    itml = ITML()
+    itml.fit(pairs, y)
+
+.. topic:: References:
+
+    .. [1] `Information-theoretic Metric Learning `_ Jason V. Davis,
+       et al.
+
+    .. [2] Adapted from Matlab code at http://www.cs.utexas.edu/users/pjain/
+       itml/
+
+
+LSML
+----
+
+`LSML`: Metric Learning from Relative Comparisons by Minimizing Squared
+Residual
+
+.. topic:: Example Code:
+
+::
+
+    from metric_learn import LSML
+
+    quadruplets = [[[1.2, 7.5], [1.3, 1.5], [6.4, 2.6], [6.2, 9.7]],
+                   [[1.3, 4.5], [3.2, 4.6], [6.2, 5.5], [5.4, 5.4]],
+                   [[3.2, 7.5], [3.3, 1.5], [8.4, 2.6], [8.2, 9.7]],
+                   [[3.3, 4.5], [5.2, 4.6], [8.2, 5.5], [7.4, 5.4]]]
+
+    # we want points where the first feature is close to be closer to each
+    # other, and points where the second feature is close to be further away
+    # from each other
+
+    lsml = LSML()
+    lsml.fit(quadruplets)
+
+.. topic:: References:
+
+    .. [1] Liu et al.
+       "Metric Learning from Relative Comparisons by Minimizing Squared
+       Residual". ICDM 2012. http://www.cs.ucla.edu/~weiwang/paper/ICDM12.pdf
+
+    .. [2] Adapted from https://gist.github.com/kcarnold/5439917
+
+
+SDML
+----
+
+`SDML`: An efficient sparse metric learning in high-dimensional space via
+L1-penalized log-determinant regularization
+
+.. topic:: Example Code:
+
+::
+
+    from metric_learn import SDML
+
+    pairs = [[[1.2, 7.5], [1.3, 1.5]],
+             [[6.4, 2.6], [6.2, 9.7]],
+             [[1.3, 4.5], [3.2, 4.6]],
+             [[6.2, 5.5], [5.4, 5.4]]]
+    y = [1, 1, -1, -1]
+
+    # in this task we want points where the first feature is close to be closer
+    # to each other, no matter how close the second feature is
+
+    sdml = SDML()
+    sdml.fit(pairs, y)
+
+.. topic:: References:
+
+    .. [1] Qi et al.
+       An efficient sparse metric learning in high-dimensional space via
+       L1-penalized log-determinant regularization. ICML 2009.
+       http://lms.comp.nus.edu.sg/sites/default/files/publication-attachments/
+       icml09-guojun.pdf
+
+    .. [2] Adapted from https://gist.github.com/kcarnold/5439945
+
+
+RCA
+---
+
+Relative Components Analysis (RCA)
+
+`RCA` learns a full rank Mahalanobis distance metric based on a weighted sum of
+in-class covariance matrices. It applies a global linear transformation to
+assign large weights to relevant dimensions and low weights to irrelevant
+dimensions. Those relevant dimensions are estimated using "chunklets", subsets
+of points that are known to belong to the same class.
+
+.. topic:: Example Code:
+
+::
+
+    from metric_learn import RCA
+
+    # toy data: 8 points grouped into 4 chunklets of known-similar points
+    X = [[-0.05, 3.0], [0.05, -3.0],
+         [0.1, -3.55], [-0.1, 3.55],
+         [-0.95, -0.05], [0.95, 0.05],
+         [0.4, 0.05], [-0.4, -0.05]]
+
+    # chunks[i] is the chunklet that point i belongs to
+    chunks = [0, 0, 1, 1, 2, 2, 3, 3]
+
+    rca = RCA()
+    rca.fit(X, chunks)
+
+
+.. topic:: References:
+
+    .. [1] `Adjustment learning and relevant component analysis
+       `_ Noam Shental, et al.
+
+    .. [2] 'Learning distance functions using equivalence relations', ICML 2003
+
+    .. [3] 'Learning a Mahalanobis metric from equivalence constraints', JMLR
+       2005
+
+MMC
+---
+
+Mahalanobis Metric Learning with Application for Clustering with
+Side-Information, Xing et al., NIPS 2002
+
+`MMC` minimizes the sum of squared distances between similar examples, while
+enforcing the sum of distances between dissimilar examples to be greater than a
+certain margin. This leads to a convex and, thus, local-minima-free
+optimization problem that can be solved efficiently. However, the algorithm
+involves the computation of eigenvalues, which is the main speed-bottleneck.
+Since it has initially been designed for clustering applications, one of the
+implicit assumptions of MMC is that all classes form a compact set, i.e.,
+follow a unimodal distribution, which restricts the possible use-cases of this
+method. However, it is one of the earliest and still most often cited
+techniques.
+
+.. topic:: Example Code:
+
+::
+
+    from metric_learn import MMC
+
+    pairs = [[[1.2, 7.5], [1.3, 1.5]],
+             [[6.4, 2.6], [6.2, 9.7]],
+             [[1.3, 4.5], [3.2, 4.6]],
+             [[6.2, 5.5], [5.4, 5.4]]]
+    y = [1, 1, -1, -1]
+
+    # in this task we want points where the first feature is close to be closer
+    # to each other, no matter how close the second feature is
+
+    mmc = MMC()
+    mmc.fit(pairs, y)
+
+.. topic:: References:
+
+    .. [1] `Distance metric learning with application to clustering with
+       side-information `_ Xing, Jordan, Russell, Ng.
+
+    .. [2] Adapted from Matlab code `here `_.
diff --git a/examples/README.txt b/examples/README.txt
new file mode 100644
index 00000000..10dbe0d5
--- /dev/null
+++ b/examples/README.txt
@@ -0,0 +1,4 @@
+Examples
+========
+
+Below is a gallery of example metric-learn use cases.
\ No newline at end of file
diff --git a/examples/sandwich.py b/examples/plot_sandwich.py
similarity index 98%
rename from examples/sandwich.py
rename to examples/plot_sandwich.py
index 08ec17c5..0e7658d3 100644
--- a/examples/sandwich.py
+++ b/examples/plot_sandwich.py
@@ -1,4 +1,8 @@
+# -*- coding: utf-8 -*-
 """
+Sandwich demo
+=============
+
 Sandwich demo based on code from http://nbviewer.ipython.org/6576096
 """