diff --git a/doc/_static/css/styles.css b/doc/_static/css/styles.css
new file mode 100644
index 00000000..6d350ae4
--- /dev/null
+++ b/doc/_static/css/styles.css
@@ -0,0 +1,36 @@
+.hatnote {
+  border-color: #e1e4e5 ;
+  border-style: solid ;
+  border-width: 1px ;
+  font-size: x-small ;
+  font-style: italic ;
+  margin-left: auto ;
+  margin-right: auto ;
+  margin-bottom: 24px;
+  padding: 12px;
+}
+.hatnote-gray {
+  background-color: #f5f5f5
+}
+.hatnote li {
+  list-style-type: square;
+  margin-left: 12px !important;
+}
+.hatnote ul {
+  list-style-type: square;
+  margin-left: 0px !important;
+  margin-bottom: 0px !important;
+}
+.deprecated {
+  color: #b94a48;
+  background-color: #F3E5E5;
+  border-color: #eed3d7;
+  margin-top: 0.5rem;
+  padding: 0.5rem;
+  border-radius: 0.5rem;
+  margin-bottom: 0.5rem;
+}
+
+.deprecated p {
+  margin-bottom: 0 !important;
+}
\ No newline at end of file
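For context: the classes defined in this stylesheet are consumed by the `container` directives that the .rst hunks further down introduce. Schematically (placeholder reference text, not part of the patch):

    .. rubric:: References


    .. container:: hatnote hatnote-gray

       [1]. Author et al. `Paper title `_. Venue, year.

The `.deprecated` rules restyle the boxes that Sphinx's `.. deprecated::` directive renders as a `div` of that class.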
diff --git a/doc/conf.py b/doc/conf.py
index 94263c7a..5eb312dc 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -38,9 +38,6 @@
 html_static_path = ['_static']
 htmlhelp_basename = 'metric-learndoc'
 
-# Option to only need single backticks to refer to symbols
-default_role = 'any'
-
 # Option to hide doctests comments in the documentation (like # doctest:
 # +NORMALIZE_WHITESPACE for instance)
 trim_doctest_flags = True
@@ -67,10 +64,6 @@
 # generate autosummary even if no references
 autosummary_generate = True
 
-# Switch to old behavior with html4, for a good display of references,
-# as described in https://github.com/sphinx-doc/sphinx/issues/6705
-html4_writer = True
-
 
 # Temporary work-around for spacing problem between parameter and parameter
 # type in the doc, see https://github.com/numpy/numpydoc/issues/215. The bug
@@ -79,8 +72,8 @@
 # In an ideal world, this would get fixed in this PR:
 # https://github.com/readthedocs/sphinx_rtd_theme/pull/747/files
 def setup(app):
-    app.add_javascript('js/copybutton.js')
-    app.add_stylesheet("basic.css")
+    app.add_js_file('js/copybutton.js')
+    app.add_css_file('css/styles.css')
 
 
 # Remove matplotlib agg warnings from generated doc when using plt.show
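For context: `app.add_javascript` and `app.add_stylesheet` were deprecated in Sphinx 1.8 in favor of the `add_js_file`/`add_css_file` calls used above, so this hunk assumes a reasonably recent Sphinx. A defensive sketch for projects that still have to support older versions (illustrative only, not part of the patch):

    def setup(app):
        # Prefer the Sphinx >= 1.8 hooks, fall back to the pre-1.8 names
        add_js = getattr(app, 'add_js_file', None) or app.add_javascript
        add_css = getattr(app, 'add_css_file', None) or app.add_stylesheet
        add_js('js/copybutton.js')
        add_css('css/styles.css')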
diff --git a/doc/index.rst b/doc/index.rst
index 6ec4fb26..f9dfd83d 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -1,6 +1,6 @@
 metric-learn: Metric Learning in Python
 =======================================
-|Travis-CI Build Status| |License| |PyPI version| |Code coverage|
+|GitHub Actions Build Status| |License| |PyPI version| |Code coverage|
 
 `metric-learn <https://github.com/scikit-learn-contrib/metric-learn>`_
 contains efficient Python implementations of several popular supervised and
@@ -57,8 +57,8 @@ Documentation outline
 :ref:`genindex` | :ref:`search`
 
-.. |Travis-CI Build Status| image:: https://api.travis-ci.org/scikit-learn-contrib/metric-learn.svg?branch=master
-   :target: https://travis-ci.org/scikit-learn-contrib/metric-learn
+.. |GitHub Actions Build Status| image:: https://github.com/scikit-learn-contrib/metric-learn/workflows/CI/badge.svg
+   :target: https://github.com/scikit-learn-contrib/metric-learn/actions?query=event%3Apush+branch%3Amaster
 .. |PyPI version| image:: https://badge.fury.io/py/metric-learn.svg
    :target: http://badge.fury.io/py/metric-learn
 .. |License| image:: http://img.shields.io/:license-mit-blue.svg?style=flat
diff --git a/doc/metric_learn.rst b/doc/metric_learn.rst
index 8f91d91c..4d0676b9 100644
--- a/doc/metric_learn.rst
+++ b/doc/metric_learn.rst
@@ -13,6 +13,8 @@ Base Classes
 
    metric_learn.Constraints
    metric_learn.base_metric.BaseMetricLearner
+   metric_learn.base_metric.MetricTransformer
+   metric_learn.base_metric.MahalanobisMixin
   metric_learn.base_metric._PairsClassifierMixin
   metric_learn.base_metric._TripletsClassifierMixin
   metric_learn.base_metric._QuadrupletsClassifierMixin
diff --git a/doc/supervised.rst b/doc/supervised.rst
index e27b58ec..09077dc2 100644
--- a/doc/supervised.rst
+++ b/doc/supervised.rst
@@ -152,7 +152,7 @@ neighbors (with same labels) of :math:`\mathbf{x}_{i}`, :math:`y_{ij}=0`
 indicates :math:`\mathbf{x}_{i}, \mathbf{x}_{j}` belong to different classes,
 :math:`[\cdot]_+=\max(0, \cdot)` is the Hinge loss.
 
-.. topic:: Example Code:
+.. rubric:: Example Code
 
 ::
 
@@ -167,15 +167,15 @@ indicates :math:`\mathbf{x}_{i}, \mathbf{x}_{j}` belong to different classes,
     lmnn = LMNN(k=5, learn_rate=1e-6)
     lmnn.fit(X, Y, verbose=False)
 
-.. topic:: References:
+.. rubric:: References
 
-  .. [1] Weinberger et al. `Distance Metric Learning for Large Margin
-         Nearest Neighbor Classification
-         `_.
-         JMLR 2009
 
-  .. [2] `Wikipedia entry on Large Margin Nearest Neighbor `_
-
+.. container:: hatnote hatnote-gray
+
+   [1]. Weinberger et al. `Distance Metric Learning for Large Margin Nearest Neighbor Classification `_. JMLR 2009.
+
+   [2]. `Wikipedia entry on Large Margin Nearest Neighbor `_.
+
 
 .. _nca:
 
@@ -216,7 +216,7 @@ the sum of probability of being correctly classified:
 
    \mathbf{L} = \text{argmax}\sum_i p_i
 
-.. topic:: Example Code:
+.. rubric:: Example Code
 
 ::
 
@@ -231,13 +231,14 @@ the sum of probability of being correctly classified:
     nca = NCA(max_iter=1000)
     nca.fit(X, Y)
 
-.. topic:: References:
+.. rubric:: References
+
+
+.. container:: hatnote hatnote-gray
 
-  .. [1] Goldberger et al.
-         `Neighbourhood Components Analysis `_.
-         NIPS 2005
+   [1]. Goldberger et al. `Neighbourhood Components Analysis `_. NIPS 2005.
 
-  .. [2] `Wikipedia entry on Neighborhood Components Analysis `_
+   [2]. `Wikipedia entry on Neighborhood Components Analysis `_.
 
 .. _lfda:
 
@@ -289,7 +290,7 @@ nearby data pairs in the same class are made close and the data pairs in
 different classes are separated from each other; far apart data pairs in the
 same class are not imposed to be close.
 
-.. topic:: Example Code:
+.. rubric:: Example Code
 
 ::
 
@@ -309,15 +310,14 @@ same class are not imposed to be close.
    To work around this, fit instances of this class to data once, then keep
    the instance around to do transformations.
 
-.. topic:: References:
+.. rubric:: References
 
-  .. [1] Sugiyama. `Dimensionality Reduction of Multimodal Labeled Data by Local
-         Fisher Discriminant Analysis `_.
-         JMLR 2007
 
-  .. [2] Tang. `Local Fisher Discriminant Analysis on Beer Style Clustering
-         `_.
+.. container:: hatnote hatnote-gray
+
+   [1]. Sugiyama. `Dimensionality Reduction of Multimodal Labeled Data by Local Fisher Discriminant Analysis `_. JMLR 2007.
+
+   [2]. Tang. `Local Fisher Discriminant Analysis on Beer Style Clustering `_.
 
 .. _mlkr:
 
@@ -363,7 +363,7 @@ calculating a weighted average of all the training samples:
 
    \hat{y}_i = \frac{\sum_{j\neq i}y_jk_{ij}}{\sum_{j\neq i}k_{ij}}
 
-.. topic:: Example Code:
+.. rubric:: Example Code
 
 ::
 
@@ -377,10 +377,12 @@ calculating a weighted average of all the training samples:
     mlkr = MLKR()
     mlkr.fit(X, Y)
 
-.. topic:: References:
+.. rubric:: References
+
+
+.. container:: hatnote hatnote-gray
 
-  .. [1] Weinberger et al. `Metric Learning for Kernel Regression `_. AISTATS 2007
+   [1]. Weinberger et al. `Metric Learning for Kernel Regression `_. AISTATS 2007.
 
 .. _supervised_version:
 
@@ -417,7 +419,7 @@ quadruplets, where for each quadruplet the two first points are from the same
 class, and the two last points are from a different class (so indeed the two
 last points should be less similar than the two first points).
 
-.. topic:: Example Code:
+.. rubric:: Example Code
 
 ::
diff --git a/doc/unsupervised.rst b/doc/unsupervised.rst
index 1191e805..110b07f9 100644
--- a/doc/unsupervised.rst
+++ b/doc/unsupervised.rst
@@ -20,7 +20,7 @@ It can be used for ZCA whitening of the data (see the Wikipedia page of
 `whitening transformation `_).
 
-.. topic:: Example Code:
+.. rubric:: Example Code
 
 ::
 
@@ -32,6 +32,9 @@ Whitening_transformation>`_).
     cov = Covariance().fit(iris)
     x = cov.transform(iris)
 
-.. topic:: References:
+.. rubric:: References
 
-  .. [1] On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936
\ No newline at end of file
+
+.. container:: hatnote hatnote-gray
+
+   [1]. On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936.
\ No newline at end of file
diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst
index 02ea4ef6..341d959a 100644
--- a/doc/weakly_supervised.rst
+++ b/doc/weakly_supervised.rst
@@ -62,8 +62,9 @@ The most intuitive way to represent tuples is to provide the algorithm with a
 in a tuple (2 for pairs, 3 for triplets for instance), and `n_features` is
 the number of features of each point.
 
-.. topic:: Example:
-   Here is an artificial dataset of 4 pairs of 2 points of 3 features each:
+.. rubric:: Example Code
+
+Here is an artificial dataset of 4 pairs of 2 points of 3 features each:
 
 >>> import numpy as np
 >>> tuples = np.array([[[-0.12, -1.21, -0.20],
@@ -94,7 +95,9 @@ would be to keep the dataset of points `X` aside, and just represent tuples
 as a collection of tuples of *indices* from the points in `X`. Since we loose
 the feature dimension there, the resulting array is 2D.
 
-.. topic:: Example: An equivalent representation of the above pairs would be:
+.. rubric:: Example Code
+
+An equivalent representation of the above pairs would be:
 
 >>> X = np.array([[-0.12, -1.21, -0.20],
 >>>               [+0.05, -0.19, -0.05],
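For context: the documented way to combine the two representations above is the `preprocessor` argument, which stores the points array once so that tuples can then be passed as plain index arrays. A minimal sketch (MMC stands in for any pairs learner; `pairs_indices` and `y_pairs` are hypothetical names):

    >>> from metric_learn import MMC
    >>> mmc = MMC(preprocessor=X)        # X is the (n_points, n_features) array above
    >>> mmc.fit(pairs_indices, y_pairs)  # shape (n_pairs, 2): indices into X; labels are +1/-1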
@@ -410,7 +413,7 @@ for similar and dissimilar pairs respectively, and :math:`\mathbf{M}_0`
 is the prior distance metric, set to identity matrix by default,
 :math:`D_{\ell \mathrm{d}}(\cdot)` is the log determinant.
 
-.. topic:: Example Code:
+.. rubric:: Example Code
 
 ::
 
@@ -429,11 +432,14 @@ is the prior distance metric, set to identity matrix by default,
     itml = ITML()
     itml.fit(pairs, y)
 
-.. topic:: References:
+.. rubric:: References
+
 
-  .. [1] Jason V. Davis, et al. `Information-theoretic Metric Learning `_. ICML 2007
+.. container:: hatnote hatnote-gray
 
-  .. [2] Adapted from Matlab code at http://www.cs.utexas.edu/users/pjain/itml/
+   [1]. Jason V. Davis, et al. `Information-theoretic Metric Learning `_. ICML 2007.
+
+   [2]. Adapted from Matlab code at http://www.cs.utexas.edu/users/pjain/itml/ .
 
 .. _sdml:
 
@@ -468,7 +474,7 @@ the sums of the row elements of :math:`\mathbf{K}`.,
 :math:`||\cdot||_{1, off}` is the off-diagonal L1 norm.
 
-.. topic:: Example Code:
+.. rubric:: Example Code
 
 ::
 
@@ -486,19 +492,19 @@ is the off-diagonal L1 norm.
     sdml = SDML()
     sdml.fit(pairs, y)
 
-.. topic:: References:
+.. rubric:: References
+
 
-  .. [1] Qi et al.
-         `An efficient sparse metric learning in high-dimensional space via
-         L1-penalized log-determinant regularization `_.
-         ICML 2009.
+.. container:: hatnote hatnote-gray
 
-  .. [2] Code adapted from https://gist.github.com/kcarnold/5439945
+   [1]. Qi et al. `An efficient sparse metric learning in high-dimensional space via L1-penalized log-determinant regularization `_. ICML 2009.
+
+   [2]. Code adapted from https://gist.github.com/kcarnold/5439945 .
 
 .. _rca:
 
 :py:class:`RCA `
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Relative Components Analysis (:py:class:`RCA `)
 
@@ -522,7 +528,7 @@ where chunklet :math:`j` consists of :math:`\{\mathbf{x}_{ji}\}_{i=1}^{n_j}`
 with a mean :math:`\hat{m}_j`. The inverse of :math:`\mathbf{C}^{-1}` is used
 as the Mahalanobis matrix.
 
-.. topic:: Example Code:
+.. rubric:: Example Code
 
 ::
 
@@ -537,15 +543,16 @@ as the Mahalanobis matrix.
     rca = RCA()
     rca.fit(X, chunks)
 
-.. topic:: References:
+.. rubric:: References
+
 
-  .. [1] Shental et al. `Adjustment learning and relevant component analysis
-         `_. ECCV 2002
+.. container:: hatnote hatnote-gray
 
-  .. [2] Bar-Hillel et al. `Learning distance functions using equivalence relations `_. ICML 2003
+   [1]. Shental et al. `Adjustment learning and relevant component analysis `_. ECCV 2002.
 
-  .. [3] Bar-Hillel et al. `Learning a Mahalanobis metric from equivalence constraints `_. JMLR 2005
+   [2]. Bar-Hillel et al. `Learning distance functions using equivalence relations `_. ICML 2003.
+
+   [3]. Bar-Hillel et al. `Learning a Mahalanobis metric from equivalence constraints `_. JMLR 2005.
 
 .. _mmc:
 
@@ -576,7 +583,7 @@ points, while constrains the sum of distances between dissimilar points:
 
                       \qquad \qquad \text{s.t.} \qquad \sum_{(\mathbf{x}_i,
                       \mathbf{x}_j) \in D} d^2_{\mathbf{M}}(\mathbf{x}_i,
                       \mathbf{x}_j) \geq 1
 
-.. topic:: Example Code:
+.. rubric:: Example Code
 
 ::
 
@@ -594,13 +601,14 @@ points, while constrains the sum of distances between dissimilar points:
     mmc = MMC()
     mmc.fit(pairs, y)
 
-.. topic:: References:
+.. rubric:: References
+
 
-  .. [1] Xing et al. `Distance metric learning with application to clustering with
-         side-information `_. NIPS 2002
-  .. [2] Adapted from Matlab code http://www.cs.cmu.edu/%7Eepxing/papers/Old_papers/code_Metric_online.tar.gz
+.. container:: hatnote hatnote-gray
+
+   [1]. Xing et al. `Distance metric learning with application to clustering with side-information `_. NIPS 2002.
+
+   [2]. Adapted from Matlab code http://www.cs.cmu.edu/%7Eepxing/papers/Old_papers/code_Metric_online.tar.gz .
 
 .. _learning_on_triplets:
 
@@ -744,7 +752,7 @@ is added to yield a sparse combination. The formulation is the following:
 
 where :math:`[\cdot]_+` is the hinge loss.
 
-.. topic:: Example Code:
+.. rubric:: Example Code
 
 ::
 
@@ -758,14 +766,14 @@ where :math:`[\cdot]_+` is the hinge loss.
     scml = SCML()
     scml.fit(triplets)
 
-.. topic:: References:
+.. rubric:: References
+
 
-  .. [1] Y. Shi, A. Bellet and F. Sha. `Sparse Compositional Metric Learning.
-         `_. \
-         (AAAI), 2014.
+.. container:: hatnote hatnote-gray
 
-  .. [2] Adapted from original \
-         `Matlab implementation.`_.
+   [1]. Y. Shi, A. Bellet and F. Sha. `Sparse Compositional Metric Learning. `_. (AAAI), 2014.
+
+   [2]. Adapted from original `Matlab implementation. `_.
 
 .. _learning_on_quadruplets:
 
@@ -937,7 +945,7 @@ by default, :math:`D_{ld}(\mathbf{\cdot, \cdot})` is the LogDet divergence:
 
    D_{ld}(\mathbf{M, M_0}) = \text{tr}(\mathbf{MM_0}) − \text{logdet} (\mathbf{M})
 
-.. topic:: Example Code:
+.. rubric:: Example Code
 
 ::
 
@@ -954,12 +962,13 @@ by default, :math:`D_{ld}(\mathbf{\cdot, \cdot})` is the LogDet divergence:
     lsml = LSML()
     lsml.fit(quadruplets)
 
-.. topic:: References:
+.. rubric:: References
+
+
+.. container:: hatnote hatnote-gray
 
-  .. [1] Liu et al.
-         `Metric Learning from Relative Comparisons by Minimizing Squared
-         Residual `_. ICDM 2012
+   [1]. Liu et al. `Metric Learning from Relative Comparisons by Minimizing Squared Residual `_. ICDM 2012.
 
-  .. [2] Code adapted from https://gist.github.com/kcarnold/5439917
+   [2]. Code adapted from https://gist.github.com/kcarnold/5439917 .
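For context: quadruplets follow the same 3D tuple convention described earlier, with the ordering from the LSML section above (the first two points of each quadruplet should end up closer than the last two). A minimal shape sketch with synthetic values:

    >>> import numpy as np
    >>> quadruplets = np.array([[[1.2, 0.5], [1.3, 0.6],
    ...                          [4.0, 3.3], [9.2, 8.0]]])
    >>> quadruplets.shape  # (n_quadruplets, 4, n_features)
    (1, 4, 2)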
diff --git a/examples/plot_metric_learning_examples.py b/examples/plot_metric_learning_examples.py
index 71229554..f00f838c 100644
--- a/examples/plot_metric_learning_examples.py
+++ b/examples/plot_metric_learning_examples.py
@@ -35,9 +35,9 @@
 # We will be using a synthetic dataset to illustrate the plotting,
 # using the function `sklearn.datasets.make_classification` from
 # scikit-learn. The dataset will contain:
-# - 100 points in 3 classes with 2 clusters per class
-# - 5 features, among which 3 are informative (correlated with the class
-# labels) and two are random noise with large magnitude
+#  - 100 points in 3 classes with 2 clusters per class
+#  - 5 features, among which 3 are informative (correlated with the class
+#    labels) and two are random noise with large magnitude
 
 X, y = make_classification(n_samples=100, n_classes=3, n_clusters_per_class=2,
                            n_informative=3, class_sep=4., n_features=5,
diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py
index 9064c100..7b449c8e 100644
--- a/metric_learn/base_metric.py
+++ b/metric_learn/base_metric.py
@@ -29,16 +29,17 @@ def __init__(self, preprocessor=None):
   @abstractmethod
   def score_pairs(self, pairs):
     """
-    .. deprecated:: 0.7.0 Refer to `pair_distance` and `pair_score`.
+    Returns the score between pairs
+    (can be a similarity, or a distance/metric depending on the algorithm)
+
+    .. deprecated:: 0.7.0
+      Refer to `pair_distance` and `pair_score`.
 
     .. warning::
       This method will be removed in 0.8.0. Please refer to `pair_distance`
      or `pair_score`. This change will occur in order to add learners
      that don't necessarily learn a Mahalanobis distance.
 
-    Returns the score between pairs
-    (can be a similarity, or a distance/metric depending on the algorithm)
-
     Parameters
     ----------
     pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
@@ -225,7 +226,10 @@ def get_metric(self):
 
 
 class MetricTransformer(metaclass=ABCMeta):
-
+  """
+  Base class for all learners that can transform data into a new space
+  with the metric learned.
+  """
   @abstractmethod
   def transform(self, X):
     """Applies the metric transformation.
@@ -264,14 +268,6 @@ class MahalanobisMixin(BaseMetricLearner, MetricTransformer,
 
   def score_pairs(self, pairs):
     r"""
-    .. deprecated:: 0.7.0
-      This method is deprecated. Please use `pair_distance` instead.
-
-    .. warning::
-      This method will be removed in 0.8.0. Please refer to `pair_distance`
-      or `pair_score`. This change will occur in order to add learners
-      that don't necessarily learn a Mahalanobis distance.
-
     Returns the learned Mahalanobis distance between pairs.
 
     This distance is defined as: :math:`d_M(x, x') = \\sqrt{(x-x')^T M (x-x')}`
@@ -282,6 +278,14 @@ def score_pairs(self, pairs):
     x_e')^T (x_e- x_e')}`, with :math:`x_e = L x` (See :class:`MahalanobisMixin`).
 
+    .. deprecated:: 0.7.0
+      Please use `pair_distance` instead.
+
+    .. warning::
+      This method will be removed in 0.8.0. Please refer to `pair_distance`
+      or `pair_score`. This change will occur in order to add learners
+      that don't necessarily learn a Mahalanobis distance.
+
     Parameters
     ----------
     pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
@@ -632,7 +636,7 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy',
       evaluation tool in clinical medicine, MH Zweig, G Campbell -
       Clinical chemistry, 1993
 
-    .. [2] most of the code of this function is from scikit-learn's PR #10117
+    .. [2] Most of the code of this function is from scikit-learn's PR #10117
 
     See Also
     --------
@@ -745,7 +749,8 @@ def _validate_calibration_params(strategy='accuracy', min_rate=None,
 
 
 class _TripletsClassifierMixin(BaseMetricLearner):
-  """Base class for triplets learners.
+  """
+  Base class for triplets learners.
   """
 
   _tuple_size = 3  # number of points in a tuple, 3 for triplets
@@ -829,7 +834,8 @@ def score(self, triplets):
 
 
 class _QuadrupletsClassifierMixin(BaseMetricLearner):
-  """Base class for quadruplets learners.
+  """
+  Base class for quadruplets learners.
   """
 
   _tuple_size = 4  # number of points in a tuple, 4 for quadruplets
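For context: the reworded docstrings above steer users from `score_pairs` to the two explicit methods. A migration sketch (NCA is only an example of a fitted Mahalanobis learner; `X`, `y` and `pairs` are assumed to exist):

    >>> from metric_learn import NCA
    >>> nca = NCA().fit(X, y)
    >>> dists = nca.pair_distance(pairs)  # learned Mahalanobis distance; smaller = more similar
    >>> scores = nca.pair_score(pairs)    # similarity score; larger = more similar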
diff --git a/metric_learn/constraints.py b/metric_learn/constraints.py
index 2d86b819..68e205f6 100644
--- a/metric_learn/constraints.py
+++ b/metric_learn/constraints.py
@@ -95,12 +95,14 @@ def generate_knntriplets(self, X, k_genuine, k_impostor):
 
     Parameters
     ----------
-      X : (n x d) matrix
-       Input data, where each row corresponds to a single instance.
-      k_genuine : int
-       Number of neighbors of the same class to be taken into account.
-      k_impostor : int
-       Number of neighbors of different classes to be taken into account.
+    X : (n x d) matrix
+      Input data, where each row corresponds to a single instance.
+
+    k_genuine : int
+      Number of neighbors of the same class to be taken into account.
+
+    k_impostor : int
+      Number of neighbors of different classes to be taken into account.
 
     Returns
     -------
diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py
index 3b218e6d..2c05b28d 100644
--- a/metric_learn/covariance.py
+++ b/metric_learn/covariance.py
@@ -42,6 +42,10 @@ def __init__(self, preprocessor=None):
 
   def fit(self, X, y=None):
     """
+    Calculates the covariance matrix of the input data.
+
+    Parameters
+    ----------
     X : data matrix, (n x d)
     y : unused
     """
diff --git a/metric_learn/itml.py b/metric_learn/itml.py
index 9fa3b75e..af87f530 100644
--- a/metric_learn/itml.py
+++ b/metric_learn/itml.py
@@ -198,7 +198,7 @@ class ITML(_BaseITML, _PairsClassifierMixin):
   ----------
   .. [1] Jason V. Davis, et al. `Information-theoretic Metric Learning
-         `_. ICML 2007.
+         /DavisKJSD07_ICML.pdf>`_. ICML 2007.
   """
 
   def fit(self, pairs, y, bounds=None, calibration_params=None):
diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py
index bfa3275e..82ae20eb 100644
--- a/metric_learn/lfda.py
+++ b/metric_learn/lfda.py
@@ -65,7 +65,7 @@ class LFDA(MahalanobisMixin, TransformerMixin):
   >>> lfda.fit(X, Y)
 
   References
-  ------------------
+  ----------
   .. [1] Masashi Sugiyama. `Dimensionality Reduction of Multimodal Labeled
          Data by Local Fisher Discriminant Analysis `_. JMLR 2007.
diff --git a/metric_learn/rca.py b/metric_learn/rca.py
index 1da00062..a63aa1d8 100644
--- a/metric_learn/rca.py
+++ b/metric_learn/rca.py
@@ -58,7 +58,7 @@ class RCA(MahalanobisMixin, TransformerMixin):
   >>> rca.fit(X, chunks)
 
   References
-  ------------------
+  ----------
   .. [1] Noam Shental, et al. `Adjustment learning and relevant component
          analysis `_ .
diff --git a/metric_learn/scml.py b/metric_learn/scml.py
index b86c6fe1..2bdd0d57 100644
--- a/metric_learn/scml.py
+++ b/metric_learn/scml.py
@@ -377,8 +377,8 @@ class SCML(_BaseSCML, _TripletsClassifierMixin):
          `_. \
          (AAAI), 2014.
 
-  .. [2] Adapted from original \
-         `Matlab implementation.`_.
+  .. [2] Adapted from original `Matlab implementation. \
+         `_.
 
   See Also
   --------
@@ -492,8 +492,8 @@ class SCML_Supervised(_BaseSCML, TransformerMixin):
          `_. \
          (AAAI), 2014.
 
-  .. [2] Adapted from original \
-         `Matlab implementation.`_.
+  .. [2] Adapted from original `Matlab implementation. \
+         `_.
 
   See Also
   --------
diff --git a/setup.py b/setup.py
index 255671a2..23392077 100755
--- a/setup.py
+++ b/setup.py
@@ -68,7 +68,8 @@
           'scikit-learn>=0.21.3',
         ],
         extras_require=dict(
-          docs=['sphinx', 'shinx_rtd_theme', 'numpydoc'],
+          docs=['sphinx', 'sphinx_rtd_theme', 'numpydoc', 'sphinx-gallery',
+                'matplotlib'],
           demo=['matplotlib'],
           sdml=['skggm>=0.2.9']
         ),
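For context: with the `shinx_rtd_theme` typo fixed and `sphinx-gallery`/`matplotlib` added, the documentation dependencies install through the extra. A local-build sketch, assuming a checkout of the repository root:

    pip install -e ".[docs]"
    sphinx-build -b html doc doc/_build/html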