From afe0fb2e295aa09cb7a94ba2c89893384be1115b Mon Sep 17 00:00:00 2001 From: hansen7 Date: Wed, 6 Mar 2019 23:19:27 +0000 Subject: [PATCH 1/9] Update_Doc_hc --- .gitignore | 2 ++ doc/supervised.rst | 53 ++++++++++++++++++++++++++++++++++++--- doc/weakly_supervised.rst | 44 +++++++++++++++++++++++++++----- metric_learn/lmnn.py | 19 ++++++++++++++ metric_learn/mmc.py | 11 ++++++++ metric_learn/nca.py | 34 +++++++++++++++++++++++++ metric_learn/rca.py | 13 ++++++++++ metric_learn/sdml.py | 9 ++++--- 8 files changed, 172 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index c532a6cb..03c22a1c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ dist/ htmlcov/ .cache/ doc/auto_examples/* +.idea/workspace.xml +.DS_Store diff --git a/doc/supervised.rst b/doc/supervised.rst index 26934a47..3c1026dd 100644 --- a/doc/supervised.rst +++ b/doc/supervised.rst @@ -46,12 +46,31 @@ LMNN Large-margin nearest neighbor metric learning. -`LMNN` learns a Mahanalobis distance metric in the kNN classification +`LMNN` learns a Mahalanobis distance metric in the kNN classification setting using semidefinite programming. The learned metric attempts to keep k-nearest neighbors in the same class, while keeping examples from different classes separated by a large margin. This algorithm makes no assumptions about the distribution of the data. +The distance is learned using the following convex optimization: + +.. math:: + + \min_\mathbf{M}\sum_{i, j}\eta_{ij}||\mathbf{L}(x_i-x_j)||^2 + + c\sum_{i, j, k}\eta_{ij}(1-y_{ij})[1+||\mathbf{L}(x_i-x_j)||^2-|| + \mathbf{L}(x_i-x_l)||^2]_+) + +where :math:`x_i` is the 'target', :math:`x_j` are its k nearest neighbors +sharing the same label, and :math:`x_l` are all the other instances within +that region with different labels, :math:`\eta_{ij}, y_{ij} \in \{0, 1\}` +are both the indicators, :math:`\eta_{ij}` represents :math:`x_{j}` is the +k nearest neighbors(with same labels) of :math:`x_{i}`, :math:`y_{ij}=0` +indicates :math:`x_{i}, x_{j}` belong to different class, :math:`[\cdot]_+` +is Hinge loss. In the optimization process, the second term is replaced +by the slack variables :math:`\xi_{ijk}` for the sake of convexity. + + + .. topic:: Example Code: :: @@ -86,11 +105,37 @@ NCA Neighborhood Components Analysis (`NCA`) is a distance metric learning algorithm which aims to improve the accuracy of nearest neighbors classification compared to the standard Euclidean distance. The algorithm -directly maximizes a stochastic variant of the leave-one-out k-nearest -neighbors (KNN) score on the training set. It can also learn a low-dimensional -linear embedding of data that can be used for data visualization and fast +directly maximizes a stochastic variant of the leave-one-out k-nearest +neighbors (KNN) score on the training set. It can also learn a low-dimensional +linear embedding of data that can be used for data visualization and fast classification. +They use the decomposition :math:`\mathbf{M} = \mathbf{L}^T\mathbf{L}` and +define the probability :math:`p_{ij}` that :math:`x_i` is the neighbor of +:math:`x_j` by calculating the softmax likelihood of the Mahalanobis distance: + +.. math:: + + p_{ij} = \frac{\exp(-|| \mathbf{L}x_i - \mathbf{L}x_j ||_2^2)} + {\sum_{l\neq i}\exp(-||\mathbf{L}x_i - \mathbf{L}x_l||_2^2)}, + \qquad p_{ii}=0 + + +Then the probability that :math:`x_i` will be correctly classified is: + +.. 
math:: + + p_{i} = \sum_{j:j\neq i, y_j=y_i}p_{ij} + +The optimization is to find matrix :math:`\mathbf{L}` that maximizes +the sum of probability of being correctly classified: + +.. math:: + + \mathbf{L} = \text{argmax}\sum_i p_i + + + .. topic:: Example Code: :: diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst index deae9b40..d991e097 100644 --- a/doc/weakly_supervised.rst +++ b/doc/weakly_supervised.rst @@ -228,8 +228,12 @@ Residual SDML ---- -`SDML`: An efficient sparse metric learning in high-dimensional space via -L1-penalized log-determinant regularization +`SDML`: An efficient sparse metric learning in high-dimensional space via +double regularization: L1-penalized on the off-diagonal elements of Mahalanobis +matrix :math:`\mathbf{M}` and the log-determinant divergence between +:math:`\mathbf{M}` and :math:`\mathbf{M_0}` (set as either :math:`\mathbf{I}` +or :math:`\mathbf{\Omega}^{-1}`, where :math:`\mathbf{\Omega}` is the +covariance matrix). .. topic:: Example Code: @@ -266,10 +270,24 @@ RCA Relative Components Analysis (RCA) `RCA` learns a full rank Mahalanobis distance metric based on a weighted sum of -in-class covariance matrices. It applies a global linear transformation to -assign large weights to relevant dimensions and low weights to irrelevant -dimensions. Those relevant dimensions are estimated using "chunklets", subsets -of points that are known to belong to the same class. +in-class(only utilizes the instances share the similarity) covariance matrices. +It applies a global linear transformation to assign large weights to relevant +dimensions and low weights to irrelevant dimensions. Those relevant dimensions +are estimated using "chunklets", subsets of points that are known to belong to +the same class. + +For a training set with :math:`n` training points in :math:`k` chunklets, the +algorithm is efficient since it simply amounts to computing + +.. math:: + + \mathbf{C} = \frac{1}{n}\sum_{j=1}^k\sum_{i=1}^{n_j}(x_{ji}-\hat{m}_j) + (x_{ji}-\hat{m}_j)^T + + +where chunklet :math:`j` consists of :math:`\{x_{ji}\}_{i=1}^{n_j}` with a +mean :math:`\hat{m}_j`. The inverse of :math:`\mathbf{C}` is used as the +Mahalanobis matrix. .. topic:: Example Code: @@ -317,6 +335,20 @@ implicit assumptions of MMC is that all classes form a compact set, i.e., follow a unimodal distribution, which restricts the possible use-cases of this method. However, it is one of the earliest and a still often cited technique. +This is the first Mahalanobis distance learning method, the algorithm aims at +maximizing the sum of distances between all the instances from the dissimilar +set :math:`\mathbf{D}`, while constrains the sum of distances between examples +from the similar set :math:`\mathbf{S}`. + +.. math:: + + \max_{\mathbf{M}\in\mathbb{S}_+^d}\sum_{(x_i, x_j)\in\mathbf{D}} + d_{\mathbf{M}}(x_i, x_j)\qquad \qquad \text{s.t.} \qquad + \sum_{(x'_i, x'_j)\in\mathbf{S}} d^2_{\mathbf{M}}(x'_i, x'_j) \leq 1 + + + + .. topic:: Example Code: :: diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index f9cd0e91..7a0ae722 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -6,6 +6,25 @@ The learned metric attempts to keep k-nearest neighbors in the same class, while keeping examples from different classes separated by a large margin. This algorithm makes no assumptions about the distribution of the data. + + +The distance is learned using the following convex optimization: + +.. 
math:: + + \min_\mathbf{M}\sum_{i, j}\eta_{ij}||\mathbf{L}(x_i-x_j)||^2 + + c\sum_{i, j, k}\eta_{ij}(1-y_{ij})[1+||\mathbf{L}(x_i-x_j)||^2-|| + \mathbf{L}(x_i-x_l)||^2]_+) + +where :math:`x_i` is the 'target', :math:`x_j` are its k nearest neighbors +sharing the same label, and :math:`x_l` are all the other instances within +that region with different labels, :math:`\eta_{ij}, y_{ij} \in \{0, 1\}` +are both the indicators, :math:`\eta_{ij}` represents :math:`x_{j}` is the +k nearest neighbors(with same labels) of :math:`x_{i}`, :math:`y_{ij}=0` +indicates :math:`x_{i}, x_{j}` belong to different class, :math:`[\cdot]_+` +is Hinge loss. In the optimization process, the second term is replaced +by the slack variables :math:`\xi_{ijk}` for the sake of convexity. + """ #TODO: periodic recalculation of impostors, PCA initialization diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index f9d3690b..b6fdbb99 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -13,6 +13,17 @@ follow a unimodal distribution, which restricts the possible use-cases of this method. However, it is one of the earliest and a still often cited technique. +This is the first Mahalanobis distance learning method, the algorithm aims at +maximizing the sum of distances between all the instances from the dissimilar +set :math:`\mathbf{D}`, while constrains the sum of distances between examples +from the similar set :math:`\mathbf{S}`. + +.. math:: + + \max_{\mathbf{M}\in\mathbb{S}_+^d}\sum_{(x_i, x_j)\in\mathbf{D}} + d_{\mathbf{M}}(x_i, x_j)\qquad \qquad \text{s.t.} \qquad + \sum_{(x'_i, x'_j)\in\mathbf{S}} d^2_{\mathbf{M}}(x'_i, x'_j) \leq 1 + Adapted from Matlab code at http://www.cs.cmu.edu/%7Eepxing/papers/Old_papers/code_Metric_online.tar.gz """ diff --git a/metric_learn/nca.py b/metric_learn/nca.py index 5abe52e3..8535c6cf 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -1,6 +1,40 @@ """ Neighborhood Components Analysis (NCA) Ported to Python from https://github.com/vomjom/nca + +Neighborhood Components Analysis (`NCA`) is a distance +metric learning algorithm which aims to improve the accuracy +of nearest neighbors classification compared to the standard +Euclidean distance. The algorithm directly maximizes a stochastic +variant of the leave-one-out k-nearest neighbors (kNN) score +on the training set. It can also learn a low-dimensional linear +embedding of data that can be used for data visualization and fast +classification. + +They use the decomposition :math:`\mathbf{M} = \mathbf{L}^T\mathbf{L}` +and define the probability :math:`p_{ij}` that :math:`x_i` is +the neighbor of :math:`x_j` by calculating the softmax likelihood +of the Mahalanobis distance: + +.. math:: + + p_{ij} = \frac{\exp(-|| \mathbf{L}x_i - \mathbf{L}x_j ||_2^2)} + {\sum_{l\neq i}\exp(-||\mathbf{L}x_i - \mathbf{L}x_l||_2^2)}, + \qquad p_{ii}=0 + + +Then the probability that :math:`x_i` will be correctly classified is: + +.. math:: + + p_{i} = \sum_{j:j\neq i, y_j=y_i}p_{ij} + +The optimization is to find matrix :math:`\mathbf{L}` that maximizes +the sum of probability of being correctly classified: + +.. math:: + + \mathbf{L} = \text{argmax}\sum_i p_i """ from __future__ import absolute_import diff --git a/metric_learn/rca.py b/metric_learn/rca.py index c9fedd59..6efba0e4 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -7,6 +7,19 @@ Those relevant dimensions are estimated using "chunklets", subsets of points that are known to belong to the same class. 
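
A minimal NumPy sketch of this chunklet-based estimate (formalized in the math block that follows) could look like the code below; the array names and the `chunks` labelling are purely illustrative, not part of the package's API:

::

    import numpy as np

    # Illustrative data: six points grouped into two chunklets.
    X = np.array([[0.0, 0.0], [0.1, 0.2], [0.2, 0.1],
                  [5.0, 5.0], [5.2, 4.9], [4.9, 5.1]])
    chunks = np.array([0, 0, 0, 1, 1, 1])

    n, d = X.shape
    C = np.zeros((d, d))
    for j in np.unique(chunks):
        Xj = X[chunks == j]
        centered = Xj - Xj.mean(axis=0)   # subtract the chunklet mean m_j
        C += centered.T @ centered        # accumulate the within-chunklet scatter
    C /= n

    M = np.linalg.inv(C)                  # the inverse of C plays the role of the Mahalanobis matrix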
+For a training set with :math:`n` training points in :math:`k` +chunklets, the algorithm is efficient since it simply amounts to +computing + +.. math:: + + \mathbf{C} = \frac{1}{n}\sum_{j=1}^k\sum_{i=1}^{n_j} + (x_{ji}-\hat{m}_j)(x_{ji}-\hat{m}_j)^T + +where chunklet :math:`j` consists of :math:`\{x_{ji}\}_{i=1}^{n_j}` +and :math:`\hat{m}_j` is its mean. The inverse of :math:`\mathbf{C}` +is used as the Mahalanobis matrix. + 'Learning distance functions using equivalence relations', ICML 2003 'Learning a Mahalanobis metric from equivalence constraints', JMLR 2005 """ diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 78fc4ebc..3f551ecf 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -1,8 +1,11 @@ """ -Qi et al. +Qi et al. ICML 2009 An efficient sparse metric learning in high-dimensional space via -L1-penalized log-determinant regularization. -ICML 2009 +double regularization: L1-penalized on the off-diagonal elements of Mahalanobis +matrix :math:`\mathbf{M}` and the log-determinant divergence between +:math:`\mathbf{M}` and :math:`\mathbf{M_0}` (set as either :math:`\mathbf{I}` +or :math:`\mathbf{\Omega}^{-1}`, where :math:`\mathbf{\Omega}` is the +covariance matrix). Adapted from https://gist.github.com/kcarnold/5439945 Paper: http://lms.comp.nus.edu.sg/sites/default/files/publication-attachments/icml09-guojun.pdf From 2fcd8ca33798df39452e2dfe6a8fb907268da60a Mon Sep 17 00:00:00 2001 From: hansen7 Date: Thu, 7 Mar 2019 17:25:42 +0000 Subject: [PATCH 2/9] Update lmnn.py, test if the issue was raised up by the unicode encode --- metric_learn/lmnn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 7a0ae722..9643b16f 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -1,4 +1,4 @@ -""" +r""" Large-margin nearest neighbor metric learning. (Weinberger 2005) LMNN learns a Mahanalobis distance metric in the kNN classification setting From ecedf1d269e0ea8d15ab09bc816a1d8c31dd7132 Mon Sep 17 00:00:00 2001 From: hansen7 Date: Wed, 13 Mar 2019 12:27:12 +0000 Subject: [PATCH 3/9] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 03c22a1c..8a7eb41f 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ htmlcov/ doc/auto_examples/* .idea/workspace.xml .DS_Store +.idea/ From 7f2f0f681f87ebba535b9d1eed26050122594341 Mon Sep 17 00:00:00 2001 From: hansen7 Date: Thu, 28 Mar 2019 23:08:09 +0000 Subject: [PATCH 4/9] Update with mispelled notations, wrong interpretations, and add more details of sdml --- doc/supervised.rst | 36 ++++++++++++++++++------------------ doc/weakly_supervised.rst | 36 ++++++++++++++++++++++++++++-------- metric_learn/lmnn.py | 18 +----------------- metric_learn/mmc.py | 20 +++----------------- metric_learn/nca.py | 30 ++---------------------------- metric_learn/rca.py | 2 +- metric_learn/sdml.py | 2 ++ 7 files changed, 55 insertions(+), 89 deletions(-) diff --git a/doc/supervised.rst b/doc/supervised.rst index 3c1026dd..1bd15a9e 100644 --- a/doc/supervised.rst +++ b/doc/supervised.rst @@ -41,35 +41,35 @@ the covariance matrix of the input data. This is a simple baseline method. .. [1] On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936 + +.. _lmnn: + LMNN ----- Large-margin nearest neighbor metric learning. `LMNN` learns a Mahalanobis distance metric in the kNN classification -setting using semidefinite programming. 
The learned metric attempts to keep -k-nearest neighbors in the same class, while keeping examples from different -classes separated by a large margin. This algorithm makes no assumptions about +setting. The learned metric attempts to keep close k-nearest neighbors +from the same class, while keeping examples from different classes +separated by a large margin. This algorithm makes no assumptions about the distribution of the data. -The distance is learned using the following convex optimization: +The distance is learned using the following optimization: .. math:: - \min_\mathbf{M}\sum_{i, j}\eta_{ij}||\mathbf{L}(x_i-x_j)||^2 + - c\sum_{i, j, k}\eta_{ij}(1-y_{ij})[1+||\mathbf{L}(x_i-x_j)||^2-|| + \min_\mathbf{L}\sum_{i, j}\eta_{ij}||\mathbf{L}(x_i-x_j)||^2 + + c\sum_{i, j, l}\eta_{ij}(1-y_{ij})[1+||\mathbf{L}(x_i-x_j)||^2-|| \mathbf{L}(x_i-x_l)||^2]_+) -where :math:`x_i` is the 'target', :math:`x_j` are its k nearest neighbors +where :math:`x_i` is an data point, :math:`x_j` are its k nearest neighbors sharing the same label, and :math:`x_l` are all the other instances within that region with different labels, :math:`\eta_{ij}, y_{ij} \in \{0, 1\}` are both the indicators, :math:`\eta_{ij}` represents :math:`x_{j}` is the k nearest neighbors(with same labels) of :math:`x_{i}`, :math:`y_{ij}=0` indicates :math:`x_{i}, x_{j}` belong to different class, :math:`[\cdot]_+` -is Hinge loss. In the optimization process, the second term is replaced -by the slack variables :math:`\xi_{ijk}` for the sake of convexity. - - +is the Hinge loss :math:`[\cdot]_+=\max(0, \cdot)`. .. topic:: Example Code: @@ -99,6 +99,9 @@ The two implementations differ slightly, and the C++ version is more complete. -margin -nearest-neighbor-classification>`_ Kilian Q. Weinberger, John Blitzer, Lawrence K. Saul + +.. _nca: + NCA --- @@ -107,7 +110,7 @@ algorithm which aims to improve the accuracy of nearest neighbors classification compared to the standard Euclidean distance. The algorithm directly maximizes a stochastic variant of the leave-one-out k-nearest neighbors (KNN) score on the training set. It can also learn a low-dimensional -linear embedding of data that can be used for data visualization and fast +linear transformation of data that can be used for data visualization and fast classification. They use the decomposition :math:`\mathbf{M} = \mathbf{L}^T\mathbf{L}` and @@ -120,22 +123,20 @@ define the probability :math:`p_{ij}` that :math:`x_i` is the neighbor of {\sum_{l\neq i}\exp(-||\mathbf{L}x_i - \mathbf{L}x_l||_2^2)}, \qquad p_{ii}=0 - -Then the probability that :math:`x_i` will be correctly classified is: +Then the probability that :math:`x_i` will be correctly classified by the +stochastic nearest neighbors rule is: .. math:: p_{i} = \sum_{j:j\neq i, y_j=y_i}p_{ij} -The optimization is to find matrix :math:`\mathbf{L}` that maximizes +The optimization problem is to find matrix :math:`\mathbf{L}` that maximizes the sum of probability of being correctly classified: .. math:: \mathbf{L} = \text{argmax}\sum_i p_i - - .. topic:: Example Code: :: @@ -238,7 +239,6 @@ generated from the labels information and passed to the underlying algorithm. .. todo:: add more details about that (see issue ``_) - .. topic:: Example Code: :: diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst index d991e097..299e4287 100644 --- a/doc/weakly_supervised.rst +++ b/doc/weakly_supervised.rst @@ -225,16 +225,34 @@ Residual .. [2] Adapted from https://gist.github.com/kcarnold/5439917 +.. 
_sdml: + SDML ---- `SDML`: An efficient sparse metric learning in high-dimensional space via -double regularization: L1-penalized on the off-diagonal elements of Mahalanobis -matrix :math:`\mathbf{M}` and the log-determinant divergence between +double regularization: an L1-penalization on the off-diagonal elements of the +Mahalanobis matrix :math:`\mathbf{M}`, and a log-determinant divergence between :math:`\mathbf{M}` and :math:`\mathbf{M_0}` (set as either :math:`\mathbf{I}` or :math:`\mathbf{\Omega}^{-1}`, where :math:`\mathbf{\Omega}` is the covariance matrix). +The formulated optimization on the semidfinite matrix :math:`M` is convex: + +.. math:: + + \min_{\mathbf{M}} = \text{tr}((M_0 + \eta XLX^{T})\cdot M) - \log\det M + + \lambda ||M||_{1, off} + +where :math:`\mathbf{X}=[x_1, x_2, ..., x_n]`, :math:`\mathbf{L = D − K}` is +the Laplacian matrix, :math:`\mathbf{D}` is a diagonal matrix whose diagonal +elements are the sums of the row elements of :math:`\mathbf{K}`, +:math:`\mathbf{K}` is the incidence matrix to encode the (dis)similarity +information as :math:`\mathbf{K}_{ij} = 1` if :math:`(x_i,x_j)\in \mathbf{S}`, +:math:`\mathbf{K}_{ij} = -1` if :math:`(x_i,x_j)\in \mathbf{D}`, +:math:`||\cdot||_{1, off}` is the off-diagonal L1 norm of :math:`\mathbf{M}`. + + .. topic:: Example Code: :: @@ -270,7 +288,7 @@ RCA Relative Components Analysis (RCA) `RCA` learns a full rank Mahalanobis distance metric based on a weighted sum of -in-class(only utilizes the instances share the similarity) covariance matrices. +in-chunklets (see below for the definition of chunklets) covariance matrices. It applies a global linear transformation to assign large weights to relevant dimensions and low weights to irrelevant dimensions. Those relevant dimensions are estimated using "chunklets", subsets of points that are known to belong to @@ -319,6 +337,9 @@ Mahalanobis matrix. .. [3]'Learning a Mahalanobis metric from equivalence constraints', JMLR 2005 + +.. _mmc: + MMC --- @@ -327,7 +348,7 @@ Side-Information, Xing et al., NIPS 2002 `MMC` minimizes the sum of squared distances between similar examples, while enforcing the sum of distances between dissimilar examples to be greater than a -certain margin. This leads to a convex and, thus, local-minima-free +certain margin, default is 1. This leads to a convex and, thus, local-minima-free optimization problem that can be solved efficiently. However, the algorithm involves the computation of eigenvalues, which is the main speed-bottleneck. Since it has initially been designed for clustering applications, one of the @@ -335,10 +356,9 @@ implicit assumptions of MMC is that all classes form a compact set, i.e., follow a unimodal distribution, which restricts the possible use-cases of this method. However, it is one of the earliest and a still often cited technique. -This is the first Mahalanobis distance learning method, the algorithm aims at -maximizing the sum of distances between all the instances from the dissimilar -set :math:`\mathbf{D}`, while constrains the sum of distances between examples -from the similar set :math:`\mathbf{S}`. +The algorithm aims at maximizing the sum of distances between all the instances +from the dissimilar set :math:`\mathbf{D}`, while constrains the sum of distances +between examples from the similar set :math:`\mathbf{S}`. .. 
math:: diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 9643b16f..9881188d 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -7,23 +7,7 @@ while keeping examples from different classes separated by a large margin. This algorithm makes no assumptions about the distribution of the data. - -The distance is learned using the following convex optimization: - -.. math:: - - \min_\mathbf{M}\sum_{i, j}\eta_{ij}||\mathbf{L}(x_i-x_j)||^2 + - c\sum_{i, j, k}\eta_{ij}(1-y_{ij})[1+||\mathbf{L}(x_i-x_j)||^2-|| - \mathbf{L}(x_i-x_l)||^2]_+) - -where :math:`x_i` is the 'target', :math:`x_j` are its k nearest neighbors -sharing the same label, and :math:`x_l` are all the other instances within -that region with different labels, :math:`\eta_{ij}, y_{ij} \in \{0, 1\}` -are both the indicators, :math:`\eta_{ij}` represents :math:`x_{j}` is the -k nearest neighbors(with same labels) of :math:`x_{i}`, :math:`y_{ij}=0` -indicates :math:`x_{i}, x_{j}` belong to different class, :math:`[\cdot]_+` -is Hinge loss. In the optimization process, the second term is replaced -by the slack variables :math:`\xi_{ijk}` for the sake of convexity. +Read more in the :ref:`User Guide `. """ #TODO: periodic recalculation of impostors, PCA initialization diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index b6fdbb99..c5772bc0 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -6,23 +6,9 @@ greater than a certain margin. This leads to a convex and, thus, local-minima-free optimization problem that can be solved efficiently. -However, the algorithm involves the computation of eigenvalues, which is the -main speed-bottleneck. -Since it has initially been designed for clustering applications, one of the -implicit assumptions of MMC is that all classes form a compact set, i.e., -follow a unimodal distribution, which restricts the possible use-cases of -this method. However, it is one of the earliest and a still often cited technique. - -This is the first Mahalanobis distance learning method, the algorithm aims at -maximizing the sum of distances between all the instances from the dissimilar -set :math:`\mathbf{D}`, while constrains the sum of distances between examples -from the similar set :math:`\mathbf{S}`. - -.. math:: - - \max_{\mathbf{M}\in\mathbb{S}_+^d}\sum_{(x_i, x_j)\in\mathbf{D}} - d_{\mathbf{M}}(x_i, x_j)\qquad \qquad \text{s.t.} \qquad - \sum_{(x'_i, x'_j)\in\mathbf{S}} d^2_{\mathbf{M}}(x'_i, x'_j) \leq 1 + +Read more in the :ref:`User Guide `. + Adapted from Matlab code at http://www.cs.cmu.edu/%7Eepxing/papers/Old_papers/code_Metric_online.tar.gz """ diff --git a/metric_learn/nca.py b/metric_learn/nca.py index 8535c6cf..7bcf056e 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -5,36 +5,10 @@ Neighborhood Components Analysis (`NCA`) is a distance metric learning algorithm which aims to improve the accuracy of nearest neighbors classification compared to the standard -Euclidean distance. The algorithm directly maximizes a stochastic -variant of the leave-one-out k-nearest neighbors (kNN) score -on the training set. It can also learn a low-dimensional linear -embedding of data that can be used for data visualization and fast -classification. +Euclidean distance. -They use the decomposition :math:`\mathbf{M} = \mathbf{L}^T\mathbf{L}` -and define the probability :math:`p_{ij}` that :math:`x_i` is -the neighbor of :math:`x_j` by calculating the softmax likelihood -of the Mahalanobis distance: +Read more in the :ref:`User Guide `. -.. 
math:: - - p_{ij} = \frac{\exp(-|| \mathbf{L}x_i - \mathbf{L}x_j ||_2^2)} - {\sum_{l\neq i}\exp(-||\mathbf{L}x_i - \mathbf{L}x_l||_2^2)}, - \qquad p_{ii}=0 - - -Then the probability that :math:`x_i` will be correctly classified is: - -.. math:: - - p_{i} = \sum_{j:j\neq i, y_j=y_i}p_{ij} - -The optimization is to find matrix :math:`\mathbf{L}` that maximizes -the sum of probability of being correctly classified: - -.. math:: - - \mathbf{L} = \text{argmax}\sum_i p_i """ from __future__ import absolute_import diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 6efba0e4..3af4b131 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -1,4 +1,4 @@ -"""Relative Components Analysis (RCA) +r"""Relative Components Analysis (RCA) RCA learns a full rank Mahalanobis distance metric based on a weighted sum of in-class covariance matrices. diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 3f551ecf..641877ac 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -7,6 +7,8 @@ or :math:`\mathbf{\Omega}^{-1}`, where :math:`\mathbf{\Omega}` is the covariance matrix). +Read more in the :ref:`User Guide `. + Adapted from https://gist.github.com/kcarnold/5439945 Paper: http://lms.comp.nus.edu.sg/sites/default/files/publication-attachments/icml09-guojun.pdf """ From abea1743b4e0e4d6edc03548602dc665b554e8f4 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 9 Apr 2019 16:31:46 +0200 Subject: [PATCH 5/9] Test: fix nca's reference --- doc/supervised.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/supervised.rst b/doc/supervised.rst index 1bd15a9e..8302785f 100644 --- a/doc/supervised.rst +++ b/doc/supervised.rst @@ -105,13 +105,13 @@ The two implementations differ slightly, and the C++ version is more complete. NCA --- -Neighborhood Components Analysis (`NCA`) is a distance metric learning -algorithm which aims to improve the accuracy of nearest neighbors -classification compared to the standard Euclidean distance. The algorithm -directly maximizes a stochastic variant of the leave-one-out k-nearest -neighbors (KNN) score on the training set. It can also learn a low-dimensional -linear transformation of data that can be used for data visualization and fast -classification. +Neighborhood Components Analysis (:py:class:`NCA `) +is a distance metric learning algorithm which aims to improve the accuracy of +nearest neighbors classification compared to the standard Euclidean distance. +The algorithm directly maximizes a stochastic variant of the leave-one-out +k-nearest neighbors (KNN) score on the training set. It can also learn a +low-dimensional linear transformation of data that can be used for data +visualization and fast classification. 
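
The stochastic leave-one-out score that `NCA` maximizes (the probabilities :math:`p_{ij}` and :math:`p_i` spelled out just below) can be sketched in a few lines of NumPy; `nca_objective` is an illustrative helper, not part of the package's API:

::

    import numpy as np

    def nca_objective(L, X, y):
        # p_ij: softmax over squared distances in the projected space,
        # with p_ii forced to zero; the objective is sum_i p_i.
        LX = X @ L.T
        d2 = ((LX[:, None, :] - LX[None, :, :]) ** 2).sum(-1)
        np.fill_diagonal(d2, np.inf)           # enforce p_ii = 0
        p = np.exp(-d2)
        p /= p.sum(axis=1, keepdims=True)
        same_class = (y[:, None] == y[None, :])
        return (p * same_class).sum()

    # Toy usage with the identity transformation (illustrative only).
    X = np.array([[0., 0.], [0., 1.], [5., 5.], [5., 6.]])
    y = np.array([0, 0, 1, 1])
    print(nca_objective(np.eye(2), X, y))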
They use the decomposition :math:`\mathbf{M} = \mathbf{L}^T\mathbf{L}` and define the probability :math:`p_{ij}` that :math:`x_i` is the neighbor of From ffc5e3ec31af3281f728df8cd2a86ac85228f821 Mon Sep 17 00:00:00 2001 From: hansen7 Date: Tue, 16 Apr 2019 18:27:32 +0100 Subject: [PATCH 6/9] Update Doc and Docstring --- doc/supervised.rst | 87 ++++++++++++----- doc/weakly_supervised.rst | 194 +++++++++++++++++++++++++++----------- metric_learn/itml.py | 27 +++--- metric_learn/lmnn.py | 12 +-- metric_learn/lsml.py | 20 ++-- metric_learn/mmc.py | 23 +++-- metric_learn/nca.py | 12 +-- metric_learn/rca.py | 29 ++---- metric_learn/sdml.py | 7 +- 9 files changed, 262 insertions(+), 149 deletions(-) diff --git a/doc/supervised.rst b/doc/supervised.rst index 8302785f..f12661fd 100644 --- a/doc/supervised.rst +++ b/doc/supervised.rst @@ -42,12 +42,11 @@ the covariance matrix of the input data. This is a simple baseline method. .. [1] On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936 -.. _lmnn: - LMNN ----- -Large-margin nearest neighbor metric learning. +Large Margin Nearest Neighbor Metric Learning +(:py:class:`LMNN `) `LMNN` learns a Mahalanobis distance metric in the kNN classification setting. The learned metric attempts to keep close k-nearest neighbors @@ -63,13 +62,14 @@ The distance is learned using the following optimization: c\sum_{i, j, l}\eta_{ij}(1-y_{ij})[1+||\mathbf{L}(x_i-x_j)||^2-|| \mathbf{L}(x_i-x_l)||^2]_+) -where :math:`x_i` is an data point, :math:`x_j` are its k nearest neighbors -sharing the same label, and :math:`x_l` are all the other instances within -that region with different labels, :math:`\eta_{ij}, y_{ij} \in \{0, 1\}` -are both the indicators, :math:`\eta_{ij}` represents :math:`x_{j}` is the -k nearest neighbors(with same labels) of :math:`x_{i}`, :math:`y_{ij}=0` -indicates :math:`x_{i}, x_{j}` belong to different class, :math:`[\cdot]_+` -is the Hinge loss :math:`[\cdot]_+=\max(0, \cdot)`. +where :math:`\mathbf{x}_i` is an data point, :math:`\mathbf{x}_j` are its +k nearest neighbors sharing the same label, and :math:`\mathbf{x}_l` are +all the other instances within that region with different labels, +:math:`\eta_{ij}, y_{ij} \in \{0, 1\}` are both the indicators, +:math:`\eta_{ij}` represents :math:`\mathbf{x}_{j}` is the k nearest +neighbors(with same labels) of :math:`\mathbf{x}_{i}`, :math:`y_{ij}=0` +indicates :math:`\mathbf{x}_{i}, \mathbf{x}_{j}` belong to different class, +:math:`[\cdot]_+` is the Hinge loss :math:`[\cdot]_+=\max(0, \cdot)`. .. topic:: Example Code: @@ -100,31 +100,30 @@ The two implementations differ slightly, and the C++ version is more complete. Blitzer, Lawrence K. Saul -.. _nca: - NCA --- -Neighborhood Components Analysis (:py:class:`NCA `) -is a distance metric learning algorithm which aims to improve the accuracy of -nearest neighbors classification compared to the standard Euclidean distance. -The algorithm directly maximizes a stochastic variant of the leave-one-out -k-nearest neighbors (KNN) score on the training set. It can also learn a -low-dimensional linear transformation of data that can be used for data -visualization and fast classification. +Neighborhood Components Analysis(:py:class:`NCA `) + +`NCA` is a distance metric learning algorithm which aims to improve the accuracy of nearest neighbors classification compared to the standard +Euclidean distance. The algorithm directly maximizes a stochastic variant +of the leave-one-out k-nearest neighbors(KNN) score on the training set. 
+It can also learn a low-dimensional linear transformation of data that can +be used for data visualization and fast classification. They use the decomposition :math:`\mathbf{M} = \mathbf{L}^T\mathbf{L}` and -define the probability :math:`p_{ij}` that :math:`x_i` is the neighbor of -:math:`x_j` by calculating the softmax likelihood of the Mahalanobis distance: +define the probability :math:`p_{ij}` that :math:`\mathbf{x}_i` is the +neighbor of :math:`\mathbf{x}_j` by calculating the softmax likelihood of +the Mahalanobis distance: .. math:: - p_{ij} = \frac{\exp(-|| \mathbf{L}x_i - \mathbf{L}x_j ||_2^2)} - {\sum_{l\neq i}\exp(-||\mathbf{L}x_i - \mathbf{L}x_l||_2^2)}, + p_{ij} = \frac{\exp(-|| \mathbf{Lx}_i - \mathbf{Lx}_j ||_2^2)} + {\sum_{l\neq i}\exp(-||\mathbf{Lx}_i - \mathbf{Lx}_l||_2^2)}, \qquad p_{ii}=0 -Then the probability that :math:`x_i` will be correctly classified by the -stochastic nearest neighbors rule is: +Then the probability that :math:`\mathbf{x}_i` will be correctly classified +by the stochastic nearest neighbors rule is: .. math:: @@ -162,16 +161,52 @@ the sum of probability of being correctly classified: .. [2] Wikipedia entry on Neighborhood Components Analysis https://en.wikipedia.org/wiki/Neighbourhood_components_analysis + LFDA ---- -Local Fisher Discriminant Analysis (LFDA) +Local Fisher Discriminant Analysis(:py:class:`LFDA `) `LFDA` is a linear supervised dimensionality reduction method. It is particularly useful when dealing with multimodality, where one ore more classes consist of separate clusters in input space. The core optimization problem of LFDA is solved as a generalized eigenvalue problem. + +The algorithm define the Fisher local within-/between-class scatter matrix +:math: `\mathbf{S}^(w)/\mathbf{S}^(b)` in a pairwise fashion: + +..math:: + + \mathbf{S}^(w) = \frac{1}{2}\sum_{i,j=1}^nW_{ij}^{(w)}(\mathbf{x}_i - + \mathbf{x}_j)(\mathbf{x}_i - \mathbf{x}_j)^T,\\ + \mathbf{S}^(b) = \frac{1}{2}\sum_{i,j=1}^nW_{ij}^{(b)}(\mathbf{x}_i - + \mathbf{x}_j)(\mathbf{x}_i - \mathbf{x}_j)^T,\\ + +where + +..math:: + + W_{ij}^{(w)} = \left\{\begin{aligned}0 \qquad y_i\neq y_j \\ + \,\,\mathbf{A}_{i,j}/n_l \qquad y_i = y_j\end{aligned}\right.\\ + W_{ij}^{(b)} = \left\{\begin{aligned}1/n \qquad y_i\neq y_j \\ + \,\,\mathbf{A}_{i,j}(1/n-1/n_l) \qquad y_i = y_j\end{aligned}\right.\\ + +here :math:`\mathbf{A}_{i,j}` is the :math:`(i,j)`-th entry of the affinity +matrix :math:`\mathbf{A}`: + +Then the learning problem becomes derive the LFDA transformation matrix +:math:`\mathbf{T}_{LFDA}`: + +..math:: + + \mathbf{T}_{LFDA} = \arg\max[\text{tr}(\mathbf{T}^T\mathbf{S}^{(w)} + \mathbf{T})^{-1}\mathbf{T}^T\mathbf{S}^{(b)}\mathbf{T})] + +That is, the algorithm is looking for a transformation matrix T such that +nearby data pairs in the same class are made close and the data pairs in different classes are separated from each other; far apart data pairs in the +same class are not imposed to be close. + .. topic:: Example Code: :: diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst index 299e4287..eb83c008 100644 --- a/doc/weakly_supervised.rst +++ b/doc/weakly_supervised.rst @@ -151,18 +151,50 @@ tuples you're working with (pairs, triplets...). 
See the docstring of the Algorithms ================== + ITML ---- -Information Theoretic Metric Learning, Davis et al., ICML 2007 +Information Theoretic Metric Learning(:py:class:`ITML `) -`ITML` minimizes the differential relative entropy between two multivariate -Gaussians under constraints on the distance function, which can be formulated -into a Bregman optimization problem by minimizing the LogDet divergence subject -to linear constraints. This algorithm can handle a wide variety of constraints +`ITML` minimizes the (differential) relative entropy, aka Kullback–Leibler +divergence, between two multivariate Gaussians subject to constraints on the +associated Mahalanobis distance, which can be formulated into a Bregman +optimization problem by minimizing the LogDet divergence subject to +linear constraints. This algorithm can handle a wide variety of constraints and can optionally incorporate a prior on the distance function. Unlike some -other methods, ITML does not rely on an eigenvalue computation or semi-definite -programming. +other methods, `ITML` does not rely on an eigenvalue computation or +semi-definite programming. + + +Given a Mahalanobis distance parameterized by :math:`A`, its corresponding +multivariate Gaussian is denoted as: + +.. math:: + p(\mathbf{x}; \mathbf{A}) = \frac{1}{Z}\exp(-\frac{1}{2}d_\mathbf{A} + (\mathbf{x}, \mu)) + = \frac{1}{Z}\exp(-\frac{1}{2}((\mathbf{x} - \mu)^T\mathbf{A} + (\mathbf{x} - \mu)) + +where :math:`Z` is the normalization constant, the inverse of Mahalanobis +matrix :math:`\mathbf{A}^{-1}` is the covariance of the Gaussian. + +Given pairs of similar points :math:`S` and pairs of dissimilar points +:math:`D`, the distance metric learning problem is: + +.. math:: + + \min_\mathbf{A} \textbf{KL}(p(\mathbf{x}; \mathbf{A}_0) || p(\mathbf{x}; + \mathbf{A})) \qquad\qquad\\ + \text{subject to } \quad d_\mathbf{A}(\mathbf{x}_i, \mathbf{x}_j) + \leq u \qquad (\mathbf{x}_i, \mathbf{x}_j)\in S \\ + d_\mathbf{A}(\mathbf{x}_i, \mathbf{x}_j) \geq l \qquad (\mathbf{x}_i, + \mathbf{x}_j)\in D + + +where :math:`u` and :math:`l` is the upper and the lower bound of distance +for similar and dissimilar pairs respectively, and :math:`\mathbf{A}_0` +is the prior distance metric, set to identity matrix by default. .. topic:: Example Code: @@ -196,8 +228,60 @@ programming. LSML ---- -`LSML`: Metric Learning from Relative Comparisons by Minimizing Squared -Residual +Metric Learning from Relative Comparisons by Minimizing Squared Residual +(:py:class:`LSML `) + +`LSML` proposes a simple, yet effective, algorithm that minimizes a convex +objective function corresponding to the sum of squared residuals of +constraints. This algorithm uses the constraints in the form of the +relative distance comparisons, such method is especially useful where +pairwise constraints are not natural to obtain, thus pairwise constraints +based algorithms become infeasible to be deployed. Furthermore, its sparsity +extension leads to more stable estimation when the dimension is high and +only a small amount of constraints is given. + +The loss function of each constraint +:math:`d(\mathbf{x}_a, \mathbf{x}_b) < d(\mathbf{x}_c, \mathbf{x}_d)` is +denoted as: + +.. math:: + + L(d(\mathbf{x}_a, \mathbf{x}_b) < d(\mathbf{x}_c, \mathbf{x}_d)) = + H(d_\mathbf{M}(\mathbf{x}_a, \mathbf{x}_b) + - d_\mathbf{M}(\mathbf{x}_c, \mathbf{x}_d)) + +where :math:`H(\cdot)` is the squared Hinge loss function defined as: + +.. 
math:: + + H(x) = \left\{\begin{aligned}0 \qquad x\leq 0 \\ + \,\,x^2 \qquad x>0\end{aligned}\right.\\ + +The summed loss function :math:`L(C)` is the simple sum over all constraints +:math:`C = \{(\mathbf{x}_a , \mathbf{x}_b , \mathbf{x}_c , \mathbf{x}_d) +: d(\mathbf{x}_a , \mathbf{x}_b) < d(\mathbf{x}_c , \mathbf{x}_d)\}`. The +original paper suggested here should be a weighted sum since the confidence +or probability of each constraint might differ. However, for the sake of +simplicity and assumption of no extra knowledge provided, we just deploy +the simple sum here as well as what the authors did in the experiments. + +The distance metric learning problem becomes minimizing the summed loss +function of all constraints plus a regularization term w.r.t. the prior +knowledge: + +.. math:: + + \min_\mathbf{M}(D_{ld}(\mathbf{M, M_0}) + \sum_{(\mathbf{x}_a, + \mathbf{x}_b, \mathbf{x}_c, \mathbf{x}_d)\in C}H(d_\mathbf{M}( + \mathbf{x}_a, \mathbf{x}_b) - d_\mathbf{M}(\mathbf{x}_c, \mathbf{x}_c))\\ + +where :math:`\mathbf{M}_0` is the prior metric matrix, set as identity +by default, :math:`D_{ld}(\mathbf{\cdot, \cdot})` is the LogDet divergence: + +.. math:: + + D_{ld}(\mathbf{M, M_0}) = \text{tr}(\mathbf{MM_0}) − \text{logdet} + (\mathbf{M}) .. topic:: Example Code: @@ -225,32 +309,35 @@ Residual .. [2] Adapted from https://gist.github.com/kcarnold/5439917 -.. _sdml: - SDML ---- -`SDML`: An efficient sparse metric learning in high-dimensional space via +Sparse High-Dimensional Metric Learning +(:py:class:`SDML `) + +`SDML` is an efficient sparse metric learning in high-dimensional space via double regularization: an L1-penalization on the off-diagonal elements of the Mahalanobis matrix :math:`\mathbf{M}`, and a log-determinant divergence between :math:`\mathbf{M}` and :math:`\mathbf{M_0}` (set as either :math:`\mathbf{I}` or :math:`\mathbf{\Omega}^{-1}`, where :math:`\mathbf{\Omega}` is the covariance matrix). -The formulated optimization on the semidfinite matrix :math:`M` is convex: +The formulated optimization on the semidefinite matrix :math:`\mathbf{M}` +is convex: .. math:: - \min_{\mathbf{M}} = \text{tr}((M_0 + \eta XLX^{T})\cdot M) - \log\det M - + \lambda ||M||_{1, off} + \min_{\mathbf{M}} = \text{tr}((\mathbf{M}_0 + \eta \mathbf{XLX}^{T}) + \cdot \mathbf{M}) - \log\det \mathbf{M} + \lambda ||\mathbf{M}||_{1, off} -where :math:`\mathbf{X}=[x_1, x_2, ..., x_n]`, :math:`\mathbf{L = D − K}` is -the Laplacian matrix, :math:`\mathbf{D}` is a diagonal matrix whose diagonal -elements are the sums of the row elements of :math:`\mathbf{K}`, -:math:`\mathbf{K}` is the incidence matrix to encode the (dis)similarity -information as :math:`\mathbf{K}_{ij} = 1` if :math:`(x_i,x_j)\in \mathbf{S}`, -:math:`\mathbf{K}_{ij} = -1` if :math:`(x_i,x_j)\in \mathbf{D}`, -:math:`||\cdot||_{1, off}` is the off-diagonal L1 norm of :math:`\mathbf{M}`. +where :math:`\mathbf{X}=[\mathbf{x}_1, \mathbf{x}_2, ..., \mathbf{x}_n]`, +:math:`\mathbf{L = D − K}` is the Laplacian matrix, :math:`\mathbf{D}` is +a diagonal matrix whose diagonal elements are the sums of the row elements +of :math:`\mathbf{K}`, :math:`\mathbf{K}` is the incidence matrix to encode +the (dis)similarity information as :math:`\mathbf{K}_{ij} = 1` if +:math:`(\mathbf{x}_i, \mathbf{x}_j)\in S`, :math:`\mathbf{K}_{ij} = -1` if +:math:`(\mathbf{x}_i, \mathbf{x}_j)\in D`, :math:`||\cdot||_{1, off}` is the +off-diagonal L1 norm of :math:`\mathbf{M}`. .. 
topic:: Example Code: @@ -285,27 +372,27 @@ information as :math:`\mathbf{K}_{ij} = 1` if :math:`(x_i,x_j)\in \mathbf{S}`, RCA --- -Relative Components Analysis (RCA) +Relative Components Analysis (:py:class:`RCA `) `RCA` learns a full rank Mahalanobis distance metric based on a weighted sum of -in-chunklets (see below for the definition of chunklets) covariance matrices. -It applies a global linear transformation to assign large weights to relevant -dimensions and low weights to irrelevant dimensions. Those relevant dimensions -are estimated using "chunklets", subsets of points that are known to belong to -the same class. +in-chunklets covariance matrices. It applies a global linear transformation to +assign large weights to relevant dimensions and low weights to irrelevant +dimensions. Those relevant dimensions are estimated using "chunklets", subsets +of points that are known to belong to the same class. For a training set with :math:`n` training points in :math:`k` chunklets, the algorithm is efficient since it simply amounts to computing .. math:: - \mathbf{C} = \frac{1}{n}\sum_{j=1}^k\sum_{i=1}^{n_j}(x_{ji}-\hat{m}_j) - (x_{ji}-\hat{m}_j)^T + \mathbf{C} = \frac{1}{n}\sum_{j=1}^k\sum_{i=1}^{n_j} + (\mathbf{x}_{ji}-\hat{\mathbf{m}}_j) + (\mathbf{x}_{ji}-\hat{\mathbf{m}}_j)^T -where chunklet :math:`j` consists of :math:`\{x_{ji}\}_{i=1}^{n_j}` with a -mean :math:`\hat{m}_j`. The inverse of :math:`\mathbf{C}` is used as the -Mahalanobis matrix. +where chunklet :math:`j` consists of :math:`\{\mathbf{x}_{ji}\}_{i=1}^{n_j}` +with a mean :math:`\hat{m}_j`. The inverse of :math:`\mathbf{C}` is used as +the Mahalanobis matrix. .. topic:: Example Code: @@ -325,7 +412,6 @@ Mahalanobis matrix. rca = RCA() rca.fit(pairs, y) - .. topic:: References: .. [1] `Adjustment learning and relevant component analysis @@ -338,36 +424,32 @@ Mahalanobis matrix. 2005 -.. _mmc: - MMC --- -Mahalanobis Metric Learning with Application for Clustering with -Side-Information, Xing et al., NIPS 2002 +Metric Learning with Application for Clustering with Side Information +(:py:class:`MMC `) -`MMC` minimizes the sum of squared distances between similar examples, while -enforcing the sum of distances between dissimilar examples to be greater than a -certain margin, default is 1. This leads to a convex and, thus, local-minima-free -optimization problem that can be solved efficiently. However, the algorithm -involves the computation of eigenvalues, which is the main speed-bottleneck. -Since it has initially been designed for clustering applications, one of the -implicit assumptions of MMC is that all classes form a compact set, i.e., -follow a unimodal distribution, which restricts the possible use-cases of this -method. However, it is one of the earliest and a still often cited technique. +`MMC` minimizes the sum of squared distances between similar points, while +enforcing the sum of distances between dissimilar ones to be greater than one. +This leads to a convex and, thus, local-minima-free optimization problem that +can be solved efficiently. +However, the algorithm involves the computation of eigenvalues, which is the +main speed-bottleneck. Since it has initially been designed for clustering +applications, one of the implicit assumptions of MMC is that all classes form +a compact set, i.e., follow a unimodal distribution, which restricts the +possible use-cases of this method. However, it is one of the earliest and a +still often cited technique. 
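
To make the two sums in the optimization written out below concrete, the following hedged NumPy sketch simply evaluates them for a given PSD matrix :math:`\mathbf{M}`; it returns both the sum of distances and the sum of squared distances over a pair set, so either placement of the square can be read off (`mahalanobis_sums` and the toy pairs are illustrative, not the estimator's API):

::

    import numpy as np

    def mahalanobis_sums(M, pairs):
        # Returns (sum of d_M over pairs, sum of d_M**2 over pairs)
        # for the Mahalanobis distance induced by the PSD matrix M.
        d2 = np.array([(a - b) @ M @ (a - b) for a, b in pairs])
        return np.sqrt(d2).sum(), d2.sum()

    # Toy usage with the identity metric (illustrative only).
    similar = [(np.array([0., 0.]), np.array([0., 1.]))]
    dissimilar = [(np.array([0., 0.]), np.array([3., 4.]))]
    print(mahalanobis_sums(np.eye(2), similar))      # (1.0, 1.0)
    print(mahalanobis_sums(np.eye(2), dissimilar))   # (5.0, 25.0)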
-The algorithm aims at maximizing the sum of distances between all the instances -from the dissimilar set :math:`\mathbf{D}`, while constrains the sum of distances -between examples from the similar set :math:`\mathbf{S}`. +The algorithm aims at minimizing the sum of distances between all the similar +points, while constrains the sum of distances between dissimilar points: .. math:: - \max_{\mathbf{M}\in\mathbb{S}_+^d}\sum_{(x_i, x_j)\in\mathbf{D}} - d_{\mathbf{M}}(x_i, x_j)\qquad \qquad \text{s.t.} \qquad - \sum_{(x'_i, x'_j)\in\mathbf{S}} d^2_{\mathbf{M}}(x'_i, x'_j) \leq 1 - - - + \min_{\mathbf{M}\in\mathbb{S}_+^d}\sum_{(\mathbf{x}_i, + \mathbf{x}_j)\in S} d_{\mathbf{M}}(\mathbf{x}_i, \mathbf{x}_j) + \qquad \qquad \text{s.t.} \qquad \sum_{(\mathbf{x}'_i, \mathbf{x}'_j) + \in D} d^2_{\mathbf{M}}(\mathbf{x}'_i, \mathbf{x}'_j) \geq 1 .. topic:: Example Code: diff --git a/metric_learn/itml.py b/metric_learn/itml.py index a0ff05f9..b637ec17 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -1,16 +1,17 @@ -""" -Information Theoretic Metric Learning, Kulis et al., ICML 2007 - -ITML minimizes the differential relative entropy between two multivariate -Gaussians under constraints on the distance function, -which can be formulated into a Bregman optimization problem by minimizing the -LogDet divergence subject to linear constraints. -This algorithm can handle a wide variety of constraints and can optionally -incorporate a prior on the distance function. -Unlike some other methods, ITML does not rely on an eigenvalue computation -or semi-definite programming. - -Adapted from Matlab code at http://www.cs.utexas.edu/users/pjain/itml/ +r""" +Information Theoretic Metric Learning(ITML) + +`ITML` minimizes the (differential) relative entropy, aka Kullback–Leibler +divergence, between two multivariate Gaussians subject to constraints on the +associated Mahalanobis distance, which can be formulated into a Bregman +optimization problem by minimizing the LogDet divergence subject to +linear constraints. This algorithm can handle a wide variety of constraints +and can optionally incorporate a prior on the distance function. Unlike some +other methods, `ITML` does not rely on an eigenvalue computation or +semi-definite programming. + +Read more in the :ref:`User Guide `. + """ from __future__ import print_function, absolute_import diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 9881188d..36153384 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -1,11 +1,11 @@ r""" -Large-margin nearest neighbor metric learning. (Weinberger 2005) +Large Margin Nearest Neighbor Metric learning(LMNN) -LMNN learns a Mahanalobis distance metric in the kNN classification setting -using semidefinite programming. -The learned metric attempts to keep k-nearest neighbors in the same class, -while keeping examples from different classes separated by a large margin. -This algorithm makes no assumptions about the distribution of the data. +`LMNN` learns a Mahalanobis distance metric in the kNN classification +setting. The learned metric attempts to keep close k-nearest neighbors +from the same class, while keeping examples from different classes +separated by a large margin. This algorithm makes no assumptions about +the distribution of the data. Read more in the :ref:`User Guide `. diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 312990ab..d527ebe5 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -1,10 +1,16 @@ -""" -Liu et al. 
-"Metric Learning from Relative Comparisons by Minimizing Squared Residual". -ICDM 2012. - -Adapted from https://gist.github.com/kcarnold/5439917 -Paper: http://www.cs.ucla.edu/~weiwang/paper/ICDM12.pdf +r""" +Metric Learning from Relative Comparisons by Minimizing Squared Residual(LSML) + +`LSML` proposes a simple, yet effective, algorithm that minimizes a convex +objective function corresponding to the sum of squared residuals of +constraints. This algorithm uses the constraints in the form of the +relative distance comparisons, such method is especially useful where +pairwise constraints are not natural to obtain, thus pairwise constraints +based algorithms become infeasible to be deployed. Furthermore, its sparsity +extension leads to more stable estimation when the dimension is high and +only a small amount of constraints is given. + +Read more in the :ref:`User Guide `. """ from __future__ import print_function, absolute_import, division diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index c5772bc0..630ed01f 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -1,16 +1,19 @@ -""" -Mahalanobis Metric Learning with Application for Clustering with Side-Information, Xing et al., NIPS 2002 - -MMC minimizes the sum of squared distances between similar examples, -while enforcing the sum of distances between dissimilar examples to be -greater than a certain margin. -This leads to a convex and, thus, local-minima-free optimization problem -that can be solved efficiently. +r""" +Metric Learning with Application for Clustering with Side Information(MMC) + +`MMC` minimizes the sum of squared distances between similar points, while +enforcing the sum of distances between dissimilar ones to be greater than one. +This leads to a convex and, thus, local-minima-free optimization problem that +can be solved efficiently. +However, the algorithm involves the computation of eigenvalues, which is the +main speed-bottleneck. Since it has initially been designed for clustering +applications, one of the implicit assumptions of MMC is that all classes form +a compact set, i.e., follow a unimodal distribution, which restricts the +possible use-cases of this method. However, it is one of the earliest and a +still often cited technique. Read more in the :ref:`User Guide `. - -Adapted from Matlab code at http://www.cs.cmu.edu/%7Eepxing/papers/Old_papers/code_Metric_online.tar.gz """ from __future__ import print_function, absolute_import, division diff --git a/metric_learn/nca.py b/metric_learn/nca.py index 7bcf056e..3f8f48fe 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -1,11 +1,11 @@ -""" +r""" Neighborhood Components Analysis (NCA) -Ported to Python from https://github.com/vomjom/nca -Neighborhood Components Analysis (`NCA`) is a distance -metric learning algorithm which aims to improve the accuracy -of nearest neighbors classification compared to the standard -Euclidean distance. +`NCA` is a distance metric learning algorithm which aims to improve the accuracy of nearest neighbors classification compared to the standard +Euclidean distance. The algorithm directly maximizes a stochastic variant +of the leave-one-out k-nearest neighbors(KNN) score on the training set. +It can also learn a low-dimensional linear transformation of data that can +be used for data visualization and fast classification. Read more in the :ref:`User Guide `. 
diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 3af4b131..41b0ac93 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -1,27 +1,14 @@ -r"""Relative Components Analysis (RCA) +r""" +Relative Components Analysis(RCA) -RCA learns a full rank Mahalanobis distance metric based on a -weighted sum of in-class covariance matrices. -It applies a global linear transformation to assign large weights to -relevant dimensions and low weights to irrelevant dimensions. -Those relevant dimensions are estimated using "chunklets", -subsets of points that are known to belong to the same class. +`RCA` learns a full rank Mahalanobis distance metric based on a weighted sum of +in-chunklets covariance matrices. It applies a global linear transformation to +assign large weights to relevant dimensions and low weights to irrelevant +dimensions. Those relevant dimensions are estimated using "chunklets", subsets +of points that are known to belong to the same class. -For a training set with :math:`n` training points in :math:`k` -chunklets, the algorithm is efficient since it simply amounts to -computing +Read more in the :ref:`User Guide `. -.. math:: - - \mathbf{C} = \frac{1}{n}\sum_{j=1}^k\sum_{i=1}^{n_j} - (x_{ji}-\hat{m}_j)(x_{ji}-\hat{m}_j)^T - -where chunklet :math:`j` consists of :math:`\{x_{ji}\}_{i=1}^{n_j}` -and :math:`\hat{m}_j` is its mean. The inverse of :math:`\mathbf{C}` -is used as the Mahalanobis matrix. - -'Learning distance functions using equivalence relations', ICML 2003 -'Learning a Mahalanobis metric from equivalence constraints', JMLR 2005 """ from __future__ import absolute_import diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 641877ac..b4d32989 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -1,5 +1,6 @@ -""" -Qi et al. ICML 2009 +r""" +Sparse High-Dimensional Metric Learning(SDML) + An efficient sparse metric learning in high-dimensional space via double regularization: L1-penalized on the off-diagonal elements of Mahalanobis matrix :math:`\mathbf{M}` and the log-determinant divergence between @@ -9,8 +10,6 @@ Read more in the :ref:`User Guide `. -Adapted from https://gist.github.com/kcarnold/5439945 -Paper: http://lms.comp.nus.edu.sg/sites/default/files/publication-attachments/icml09-guojun.pdf """ from __future__ import absolute_import From 19a00d228d59dc81a6161e61821469f093ef045e Mon Sep 17 00:00:00 2001 From: hansen7 Date: Tue, 16 Apr 2019 23:19:34 +0100 Subject: [PATCH 7/9] Update Doc and Docstring --- doc/supervised.rst | 86 +++++++++++++++++++++++++++++---------- doc/weakly_supervised.rst | 22 +++++----- metric_learn/lfda.py | 17 ++++---- metric_learn/lmnn.py | 2 +- metric_learn/lsml.py | 1 + metric_learn/mlkr.py | 15 ++++--- metric_learn/mmc.py | 2 +- metric_learn/nca.py | 5 ++- metric_learn/rca.py | 2 +- metric_learn/sdml.py | 6 +-- 10 files changed, 102 insertions(+), 56 deletions(-) diff --git a/doc/supervised.rst b/doc/supervised.rst index f12661fd..a8900264 100644 --- a/doc/supervised.rst +++ b/doc/supervised.rst @@ -54,22 +54,22 @@ from the same class, while keeping examples from different classes separated by a large margin. This algorithm makes no assumptions about the distribution of the data. -The distance is learned using the following optimization: +The distance is learned by solving the following optimization problem: .. 
math::

-    \min_\mathbf{L}\sum_{i, j}\eta_{ij}||\mathbf{L}(x_i-x_j)||^2 +
-    c\sum_{i, j, l}\eta_{ij}(1-y_{ij})[1+||\mathbf{L}(x_i-x_j)||^2-||
-    \mathbf{L}(x_i-x_l)||^2]_+)
+    \min_\mathbf{L}\sum_{i, j}\eta_{ij}||\mathbf{L(x_i-x_j)}||^2 +
+    c\sum_{i, j, l}\eta_{ij}(1-y_{ij})[1+||\mathbf{L(x_i-x_j)}||^2-||
+    \mathbf{L(x_i-x_l)}||^2]_+

-where :math:`\mathbf{x}_i` is an data point, :math:`\mathbf{x}_j` are its
-k nearest neighbors sharing the same label, and :math:`\mathbf{x}_l` are
-all the other instances within that region with different labels,
+where :math:`\mathbf{x}_i` is a data point, :math:`\mathbf{x}_j` is one
+of its k nearest neighbors sharing the same label, and :math:`\mathbf{x}_l`
+are all the other instances within that region with different labels.
 :math:`\eta_{ij}, y_{ij} \in \{0, 1\}` are indicator variables:
 :math:`\eta_{ij}=1` indicates that :math:`\mathbf{x}_{j}` is one of the k
 nearest neighbors (with the same label) of :math:`\mathbf{x}_{i}`, and
 :math:`y_{ij}=0` indicates that :math:`\mathbf{x}_{i}, \mathbf{x}_{j}` belong
 to different classes.
-:math:`[\cdot]_+` is the Hinge loss :math:`[\cdot]_+=\max(0, \cdot)`.
+:math:`[\cdot]_+=\max(0, \cdot)` is the Hinge loss.

 .. topic:: Example Code:

@@ -105,7 +105,8 @@ NCA

 Neighborhood Components Analysis(:py:class:`NCA `)

-`NCA` is a distance metric learning algorithm which aims to improve the accuracy of nearest neighbors classification compared to the standard
+`NCA` is a distance metric learning algorithm which aims to improve the
+accuracy of nearest neighbors classification compared to the standard
 Euclidean distance. The algorithm directly maximizes a stochastic variant
 of the leave-one-out k-nearest neighbors(KNN) score on the training set.
 It can also learn a low-dimensional linear transformation of data that can
@@ -168,24 +169,24 @@ LFDA

 Local Fisher Discriminant Analysis(:py:class:`LFDA `)

 `LFDA` is a linear supervised dimensionality reduction method. It is
-particularly useful when dealing with multimodality, where one ore more classes
+particularly useful when dealing with multi-modality, where one or more classes
 consist of separate clusters in input space. The core optimization problem of
 LFDA is solved as a generalized eigenvalue problem.

 The algorithm defines the Fisher local within-/between-class scatter matrix
-:math: `\mathbf{S}^(w)/\mathbf{S}^(b)` in a pairwise fashion:
+:math:`\mathbf{S}^{(w)}/ \mathbf{S}^{(b)}` in a pairwise fashion:

-..math::
+.. math::

-  \mathbf{S}^(w) = \frac{1}{2}\sum_{i,j=1}^nW_{ij}^{(w)}(\mathbf{x}_i -
+  \mathbf{S}^{(w)} = \frac{1}{2}\sum_{i,j=1}^nW_{ij}^{(w)}(\mathbf{x}_i -
   \mathbf{x}_j)(\mathbf{x}_i - \mathbf{x}_j)^T,\\
-  \mathbf{S}^(b) = \frac{1}{2}\sum_{i,j=1}^nW_{ij}^{(b)}(\mathbf{x}_i -
+  \mathbf{S}^{(b)} = \frac{1}{2}\sum_{i,j=1}^nW_{ij}^{(b)}(\mathbf{x}_i -
   \mathbf{x}_j)(\mathbf{x}_i - \mathbf{x}_j)^T,\\

 where

-..math::
+.. math::

   W_{ij}^{(w)} = \left\{\begin{aligned}0 \qquad y_i\neq y_j \\
   \,\,\mathbf{A}_{i,j}/n_l \qquad y_i = y_j\end{aligned}\right.\\
@@ -198,13 +199,15 @@ matrix :math:`\mathbf{A}`:

 Then the learning problem becomes deriving the LFDA transformation matrix
 :math:`\mathbf{T}_{LFDA}`:

-..math::
+.. math::

-  \mathbf{T}_{LFDA} = \arg\max[\text{tr}(\mathbf{T}^T\mathbf{S}^{(w)}
+  \mathbf{T}_{LFDA} = \arg\max_\mathbf{T}
+  [\text{tr}((\mathbf{T}^T\mathbf{S}^{(w)}
   \mathbf{T})^{-1}\mathbf{T}^T\mathbf{S}^{(b)}\mathbf{T})]

-That is, the algorithm is looking for a transformation matrix T such that
-nearby data pairs in the same class are made close and the data pairs in different classes are separated from each other; far apart data pairs in the
+That is, it is looking for a transformation matrix :math:`\mathbf{T}` such that
+nearby data pairs in the same class are made close and the data pairs in
+different classes are separated from each other; far apart data pairs in the
 same class are not imposed to be close.

 .. topic:: Example Code:

@@ -236,13 +239,54 @@ MLKR
 ----

-Metric Learning for Kernel Regression.
+Metric Learning for Kernel Regression(:py:class:`MLKR `)

 `MLKR` is an algorithm for supervised metric learning, which learns a
-distance function by directly minimising the leave-one-out regression error.
+distance function by directly minimizing the leave-one-out regression error.
 This algorithm can also be viewed as a supervised variation of PCA and can be
 used for dimensionality reduction and high dimensional data visualization.

+Theoretically, `MLKR` can be applied with many types of kernel functions and
+distance metrics; the exposition here focuses on one particular instance, the
+Gaussian kernel combined with the Mahalanobis metric. The Gaussian kernel is
+denoted as:
+
+.. math::
+
+    k_{ij} = \frac{1}{\sqrt{2\pi}\sigma}\exp(-\frac{d(\mathbf{x}_i,
+    \mathbf{x}_j)}{\sigma^2})
+
+where :math:`d(\cdot, \cdot)` is the squared distance under the chosen metric;
+for the Mahalanobis metric it is :math:`d(\mathbf{x}_i,
+\mathbf{x}_j) = ||\mathbf{A}(\mathbf{x}_i - \mathbf{x}_j)||^2`, where the
+transformation matrix :math:`\mathbf{A}` is obtained from the decomposition of
+the Mahalanobis matrix :math:`\mathbf{M=A^TA}`.
+
+Since :math:`\sigma^2` can be integrated into :math:`d(\cdot)`, we can set
+:math:`\sigma^2=1` for the sake of simplicity. Here we use the cumulative
+leave-one-out quadratic regression error of the training samples as the
+loss function:
+
+.. math::
+
+    \mathcal{L} = \sum_i(y_i - \hat{y}_i)^2
+
+where the prediction :math:`\hat{y}_i` is derived from kernel regression by
+calculating a weighted average of all the training samples:
+
+.. math::
+
+    \hat{y}_i = \frac{\sum_{j\neq i}y_jk_{ij}}{\sum_{j\neq i}k_{ij}}
+
+The tractable property has enabled the distance metric learning problem to
+be solved by stochastic gradient descent, where the gradient is:
+
+.. math::
+
+    \frac{\partial\mathcal{L}}{\partial\mathbf{A}} = 4\mathbf{A}\sum_i
+    (\hat{y}_i - y_i)\sum_j(\hat{y}_j - y_j)k_{ij}(\mathbf{x}_i -
+    \mathbf{x}_j)(\mathbf{x}_i - \mathbf{x}_j)^T
+
 .. topic:: Example Code:

 ::

diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst
index eb83c008..344156d1 100644
--- a/doc/weakly_supervised.rst
+++ b/doc/weakly_supervised.rst
@@ -330,14 +330,12 @@ is convex:

   \min_{\mathbf{M}} \text{tr}((\mathbf{M}_0 + \eta \mathbf{XLX}^{T})
   \cdot \mathbf{M}) - \log\det \mathbf{M} + \lambda ||\mathbf{M}||_{1, off}

-where :math:`\mathbf{X}=[\mathbf{x}_1, \mathbf{x}_2, ..., \mathbf{x}_n]`,
-:math:`\mathbf{L = D − K}` is the Laplacian matrix, :math:`\mathbf{D}` is
-a diagonal matrix whose diagonal elements are the sums of the row elements
-of :math:`\mathbf{K}`, :math:`\mathbf{K}` is the incidence matrix to encode
-the (dis)similarity information as :math:`\mathbf{K}_{ij} = 1` if
-:math:`(\mathbf{x}_i, \mathbf{x}_j)\in S`, :math:`\mathbf{K}_{ij} = -1` if
-:math:`(\mathbf{x}_i, \mathbf{x}_j)\in D`, :math:`||\cdot||_{1, off}` is the
-off-diagonal L1 norm of :math:`\mathbf{M}`.
+where :math:`\mathbf{X}=[\mathbf{x}_1, \mathbf{x}_2, ..., \mathbf{x}_n]` is
+the training data, incidence matrix :math:`\mathbf{K}_{ij} = 1` if
+:math:`(\mathbf{x}_i, \mathbf{x}_j)` is a similar pair, otherwise -1. The
+Laplacian matrix :math:`\mathbf{L}` is calculated from :math:`\mathbf{K}`
+(see original paper for more details), :math:`||\cdot||_{1, off}` is the
+off-diagonal L1 norm.

 .. topic:: Example Code:

@@ -391,8 +389,8 @@ algorithm is efficient since it simply amounts to computing

 where chunklet :math:`j` consists of :math:`\{\mathbf{x}_{ji}\}_{i=1}^{n_j}`
-with a mean :math:`\hat{m}_j`. The inverse of :math:`\mathbf{C}` is used as
-the Mahalanobis matrix.
+with a mean :math:`\hat{m}_j`. The inverse :math:`\mathbf{C}^{-1}` is used
+as the Mahalanobis matrix.

 .. topic:: Example Code:

@@ -448,8 +446,8 @@ points, while constrains the sum of distances between dissimilar points:

     \min_{\mathbf{M}\in\mathbb{S}_+^d}\sum_{(\mathbf{x}_i, \mathbf{x}_j)\in S}
     d^2_{\mathbf{M}}(\mathbf{x}_i, \mathbf{x}_j)
-    \qquad \qquad \text{s.t.} \qquad \sum_{(\mathbf{x}'_i, \mathbf{x}'_j)
-    \in D} d^2_{\mathbf{M}}(\mathbf{x}'_i, \mathbf{x}'_j) \geq 1
+    \qquad \qquad \text{s.t.} \qquad \sum_{(\mathbf{x}_i, \mathbf{x}_j)
+    \in D} d_{\mathbf{M}}(\mathbf{x}_i, \mathbf{x}_j) \geq 1

 .. topic:: Example Code:

diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py
index 2feff211..2ca085d4 100644
--- a/metric_learn/lfda.py
+++ b/metric_learn/lfda.py
@@ -1,14 +1,13 @@
-"""
-Local Fisher Discriminant Analysis (LFDA)
+r"""
+Local Fisher Discriminant Analysis(LFDA)
+
+LFDA is a linear supervised dimensionality reduction method. It is
+particularly useful when dealing with multimodality, where one or more classes
+consist of separate clusters in input space. The core optimization problem of
+LFDA is solved as a generalized eigenvalue problem.

-Local Fisher Discriminant Analysis for Supervised Dimensionality Reduction
-Sugiyama, ICML 2006
+Read more in the :ref:`User Guide `.

-LFDA is a linear supervised dimensionality reduction method.
-It is particularly useful when dealing with multimodality,
-where one ore more classes consist of separate clusters in input space.
-The core optimization problem of LFDA is solved as a generalized
-eigenvalue problem.
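The generalized eigenvalue step described in this docstring can be made concrete
with a short NumPy/SciPy sketch. The snippet below is illustrative only and is
not part of the patch; it assumes the within-/between-class scatter matrices
``S_w`` and ``S_b`` have already been built as in the equations of the user
guide, and ``lfda_components`` is a hypothetical helper name::

    import numpy as np
    from scipy import linalg

    def lfda_components(S_w, S_b, n_components):
        # Solve the generalized eigenproblem S_b v = lambda * S_w v and keep
        # the eigenvectors with the largest eigenvalues as the rows of T.
        evals, evecs = linalg.eigh(S_b, S_w)          # eigenvalues in ascending order
        top = np.argsort(evals)[::-1][:n_components]
        return evecs[:, top].T                        # shape (n_components, n_features)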
""" from __future__ import division, absolute_import import numpy as np diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 36153384..9e606c56 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -1,7 +1,7 @@ r""" Large Margin Nearest Neighbor Metric learning(LMNN) -`LMNN` learns a Mahalanobis distance metric in the kNN classification +LMNN learns a Mahalanobis distance metric in the kNN classification setting. The learned metric attempts to keep close k-nearest neighbors from the same class, while keeping examples from different classes separated by a large margin. This algorithm makes no assumptions about diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index d527ebe5..e4e3553b 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -11,6 +11,7 @@ only a small amount of constraints is given. Read more in the :ref:`User Guide `. + """ from __future__ import print_function, absolute_import, division diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index 74a21a82..927c64e3 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -1,10 +1,13 @@ -""" -Metric Learning for Kernel Regression (MLKR), Weinberger et al., +r""" +Metric Learning for Kernel Regression(MLKR) + +MLKR is an algorithm for supervised metric learning, which learns a +distance function by directly minimizing the leave-one-out regression error. +This algorithm can also be viewed as a supervised variation of PCA and can be +used for dimensionality reduction and high dimensional data visualization. + +Read more in the :ref:`User Guide `. -MLKR is an algorithm for supervised metric learning, which learns a distance -function by directly minimising the leave-one-out regression error. This -algorithm can also be viewed as a supervised variation of PCA and can be used -for dimensionality reduction and high dimensional data visualization. """ from __future__ import division, print_function import time diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 630ed01f..c620fbae 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -1,7 +1,7 @@ r""" Metric Learning with Application for Clustering with Side Information(MMC) -`MMC` minimizes the sum of squared distances between similar points, while +MMC minimizes the sum of squared distances between similar points, while enforcing the sum of distances between dissimilar ones to be greater than one. This leads to a convex and, thus, local-minima-free optimization problem that can be solved efficiently. diff --git a/metric_learn/nca.py b/metric_learn/nca.py index 3f8f48fe..7139f0ff 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -1,7 +1,8 @@ r""" -Neighborhood Components Analysis (NCA) +Neighborhood Components Analysis(NCA) -`NCA` is a distance metric learning algorithm which aims to improve the accuracy of nearest neighbors classification compared to the standard +NCA is a distance metric learning algorithm which aims to improve the +accuracy of nearest neighbors classification compared to the standard Euclidean distance. The algorithm directly maximizes a stochastic variant of the leave-one-out k-nearest neighbors(KNN) score on the training set. 
It can also learn a low-dimensional linear transformation of data that can diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 41b0ac93..88538e8b 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -1,7 +1,7 @@ r""" Relative Components Analysis(RCA) -`RCA` learns a full rank Mahalanobis distance metric based on a weighted sum of +RCA learns a full rank Mahalanobis distance metric based on a weighted sum of in-chunklets covariance matrices. It applies a global linear transformation to assign large weights to relevant dimensions and low weights to irrelevant dimensions. Those relevant dimensions are estimated using "chunklets", subsets diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index b4d32989..bf08adc3 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -1,9 +1,9 @@ r""" Sparse High-Dimensional Metric Learning(SDML) -An efficient sparse metric learning in high-dimensional space via -double regularization: L1-penalized on the off-diagonal elements of Mahalanobis -matrix :math:`\mathbf{M}` and the log-determinant divergence between +SDML is an efficient sparse metric learning in high-dimensional space via +double regularization: an L1-penalization on the off-diagonal elements of the +Mahalanobis matrix :math:`\mathbf{M}`, and a log-determinant divergence between :math:`\mathbf{M}` and :math:`\mathbf{M_0}` (set as either :math:`\mathbf{I}` or :math:`\mathbf{\Omega}^{-1}`, where :math:`\mathbf{\Omega}` is the covariance matrix). From 70af371828c1ea00be0708220c1eb24773e7f91a Mon Sep 17 00:00:00 2001 From: hansen7 Date: Sun, 28 Apr 2019 00:02:01 +0100 Subject: [PATCH 8/9] Minor Equation Update and Reference --- doc/supervised.rst | 17 ++++++----------- doc/weakly_supervised.rst | 27 ++++++++++++++++++--------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/doc/supervised.rst b/doc/supervised.rst index a8900264..83bf4449 100644 --- a/doc/supervised.rst +++ b/doc/supervised.rst @@ -41,6 +41,7 @@ the covariance matrix of the input data. This is a simple baseline method. .. [1] On the Generalized Distance in Statistics, P.C.Mahalanobis, 1936 +.. _lmnn: LMNN ----- @@ -99,6 +100,7 @@ The two implementations differ slightly, and the C++ version is more complete. -margin -nearest-neighbor-classification>`_ Kilian Q. Weinberger, John Blitzer, Lawrence K. Saul +.. _nca: NCA --- @@ -108,7 +110,7 @@ Neighborhood Components Analysis(:py:class:`NCA `) `NCA` is a distance metric learning algorithm which aims to improve the accuracy of nearest neighbors classification compared to the standard Euclidean distance. The algorithm directly maximizes a stochastic variant -of the leave-one-out k-nearest neighbors(KNN) score on the training set. +of the leave-one-out k-nearest neighbors (KNN) score on the training set. It can also learn a low-dimensional linear transformation of data that can be used for data visualization and fast classification. @@ -162,6 +164,7 @@ the sum of probability of being correctly classified: .. [2] Wikipedia entry on Neighborhood Components Analysis https://en.wikipedia.org/wiki/Neighbourhood_components_analysis +.. _lfda: LFDA ---- @@ -194,7 +197,7 @@ where \,\,\mathbf{A}_{i,j}(1/n-1/n_l) \qquad y_i = y_j\end{aligned}\right.\\ here :math:`\mathbf{A}_{i,j}` is the :math:`(i,j)`-th entry of the affinity -matrix :math:`\mathbf{A}`: +matrix :math:`\mathbf{A}`:, which can be calculated with local scaling methods. 
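One common way to obtain such a locally scaled affinity matrix is the
self-tuning heuristic of Zelnik-Manor and Perona, sketched below for reference
(``local_scaling_affinity`` and ``knn`` are assumed names, not part of this
patch)::

    import numpy as np

    def local_scaling_affinity(X, knn=7):
        # A_ij = exp(-||x_i - x_j||^2 / (sigma_i * sigma_j)), where sigma_i is
        # the distance from x_i to its knn-th nearest neighbor.
        dist = np.sqrt(((X[:, None, :] - X[None, :, :]) ** 2).sum(-1))
        sigma = np.sort(dist, axis=1)[:, knn]           # column 0 is the self-distance
        A = np.exp(-dist ** 2 / (sigma[:, None] * sigma[None, :]))
        np.fill_diagonal(A, 0.0)
        return A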
 Then the learning problem becomes deriving the LFDA transformation matrix
 :math:`\mathbf{T}_{LFDA}`:

@@ -235,6 +238,7 @@ same class are not imposed to be close.
    `_ Yuan Tang.

+.. _mlkr:

 MLKR
 ----
@@ -278,15 +282,6 @@ calculating a weighted average of all the training samples:

     \hat{y}_i = \frac{\sum_{j\neq i}y_jk_{ij}}{\sum_{j\neq i}k_{ij}}

-The tractable property has enabled the distance metric learning problem to
-be solved by stochastic gradient descent, where the gradient is:
-
-.. math::
-
-    \frac{\partial\mathcal{L}}{\partial\mathbf{A}} = 4\mathbf{A}\sum_i
-    (\hat{y}_i - y_i)\sum_j(\hat{y}_j - y_j)k_{ij}(\mathbf{x}_i -
-    \mathbf{x}_j)(\mathbf{x}_i - \mathbf{x}_j)^T
-
 .. topic:: Example Code:

 ::

diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst
index 344156d1..728d3450 100644
--- a/doc/weakly_supervised.rst
+++ b/doc/weakly_supervised.rst
@@ -151,6 +151,7 @@ tuples you're working with (pairs, triplets...). See the docstring of the
 Algorithms
 ==================

+.. _itml:

 ITML
 ----
@@ -180,12 +181,15 @@ where :math:`Z` is the normalization constant, the inverse of Mahalanobis
 matrix :math:`\mathbf{A}^{-1}` is the covariance of the Gaussian.

 Given pairs of similar points :math:`S` and pairs of dissimilar points
-:math:`D`, the distance metric learning problem is:
+:math:`D`, the distance metric learning problem is to minimize the LogDet
+divergence, which is equivalent to minimizing :math:`\textbf{KL}(p(\mathbf{x};
+\mathbf{A}_0) || p(\mathbf{x}; \mathbf{A}))`:

 .. math::

-  \min_\mathbf{A} \textbf{KL}(p(\mathbf{x}; \mathbf{A}_0) || p(\mathbf{x};
-  \mathbf{A})) \qquad\qquad\\
+  \min_\mathbf{A} D_{\ell \mathrm{d}}\left(A, A_{0}\right) =
+  \operatorname{tr}\left(A A_{0}^{-1}\right)-\log \operatorname{det}
+  \left(A A_{0}^{-1}\right)-n\\
   \text{subject to } \quad d_\mathbf{A}(\mathbf{x}_i, \mathbf{x}_j)
   \leq u \qquad (\mathbf{x}_i, \mathbf{x}_j)\in S \\
   d_\mathbf{A}(\mathbf{x}_i, \mathbf{x}_j) \geq l \qquad (\mathbf{x}_i,
@@ -194,7 +198,8 @@ where :math:`u` and :math:`l` is the upper and the lower bound of distance

 for similar and dissimilar pairs respectively, and :math:`\mathbf{A}_0`
-is the prior distance metric, set to identity matrix by default.
+is the prior distance metric, set to identity matrix by default,
+:math:`D_{\ell \mathrm{d}}(\cdot, \cdot)` is the LogDet divergence.

 .. topic:: Example Code:

@@ -224,6 +229,7 @@ is the prior distance metric, set to identity matrix by default.
 .. [2] Adapted from Matlab code at http://www.cs.utexas.edu/users/pjain/
    itml/

+.. _lsml:

 LSML
 ----
@@ -246,7 +252,6 @@ denoted as:

 .. math::

-
   L(d(\mathbf{x}_a, \mathbf{x}_b) < d(\mathbf{x}_c, \mathbf{x}_d)) =
   H(d_\mathbf{M}(\mathbf{x}_a, \mathbf{x}_b)
   - d_\mathbf{M}(\mathbf{x}_c, \mathbf{x}_d))
@@ -308,6 +313,7 @@ by default, :math:`D_{ld}(\mathbf{\cdot, \cdot})` is the LogDet divergence:

 .. [2] Adapted from https://gist.github.com/kcarnold/5439917

+.. _sdml:

 SDML
 ----
@@ -331,11 +337,12 @@ is convex:

   \cdot \mathbf{M}) - \log\det \mathbf{M} + \lambda ||\mathbf{M}||_{1, off}

 where :math:`\mathbf{X}=[\mathbf{x}_1, \mathbf{x}_2, ..., \mathbf{x}_n]` is
-the training data, incidence matrix :math:`\mathbf{K}_{ij} = 1` if
+the training data, the incidence matrix :math:`\mathbf{K}_{ij} = 1` if
 :math:`(\mathbf{x}_i, \mathbf{x}_j)` is a similar pair, otherwise -1. The
-Laplacian matrix :math:`\mathbf{L}` is calculated from :math:`\mathbf{K}`
-(see original paper for more details), :math:`||\cdot||_{1, off}` is the
-off-diagonal L1 norm.
+Laplacian matrix :math:`\mathbf{L}=\mathbf{D}-\mathbf{K}` is calculated from
+:math:`\mathbf{K}` and :math:`\mathbf{D}`, a diagonal matrix whose entries are
+the sums of the row elements of :math:`\mathbf{K}`, :math:`||\cdot||_{1, off}`
+is the off-diagonal L1 norm.

 .. topic:: Example Code:

@@ -366,6 +373,7 @@ off-diagonal L1 norm.

 .. [2] Adapted from https://gist.github.com/kcarnold/5439945

+.. _rca:

 RCA
 ---
@@ -421,6 +429,7 @@ as the Mahalanobis matrix.

 .. [3] 'Learning a Mahalanobis metric from equivalence constraints', JMLR 2005

+.. _mmc:

 MMC
 ---

From c50fbaf11538beaf2985d67f3dce73ae8b89bacb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien?=
Date: Fri, 3 May 2019 15:35:13 +0200
Subject: [PATCH 9/9] fix bad hyphen

---
 metric_learn/itml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metric_learn/itml.py b/metric_learn/itml.py
index 0d91c9fe..6cb34313 100644
--- a/metric_learn/itml.py
+++ b/metric_learn/itml.py
@@ -1,7 +1,7 @@
 r"""
 Information Theoretic Metric Learning(ITML)

-`ITML` minimizes the (differential) relative entropy, aka Kullback–Leibler
+`ITML` minimizes the (differential) relative entropy, aka Kullback-Leibler
 divergence, between two multivariate Gaussians subject to constraints on the
 associated Mahalanobis distance, which can be formulated into a Bregman
 optimization problem by minimizing the LogDet divergence subject to
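For reference, the LogDet divergence used in the ITML formulation above (and,
in regularized form, by SDML) can be evaluated directly from its definition
:math:`D_{\ell d}(A, A_0) = \text{tr}(AA_0^{-1}) - \log\det(AA_0^{-1}) - n`.
The sketch below is illustrative only and assumes dense, positive-definite
inputs; ``logdet_divergence`` is an assumed helper name::

    import numpy as np

    def logdet_divergence(A, A0):
        # D_ld(A, A0) = tr(A A0^{-1}) - log det(A A0^{-1}) - n, for A, A0 > 0.
        n = A.shape[0]
        M = A @ np.linalg.inv(A0)
        sign, logabsdet = np.linalg.slogdet(M)   # sign is +1 for valid inputs
        return np.trace(M) - logabsdet - n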