From f2cb2e95d9ddb08c44d47ede93f521a29e83fdc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Barz?= Date: Tue, 23 May 2017 08:51:57 +0200 Subject: [PATCH 1/7] Implementation of PGDM --- README.rst | 1 + metric_learn/__init__.py | 1 + metric_learn/pgdm.py | 436 ++++++++++++++++++++++++++++++++++++ test/metric_learn_test.py | 47 +++- test/test_base_metric.py | 11 + test/test_fit_transform.py | 14 +- test/test_sklearn_compat.py | 9 +- 7 files changed, 515 insertions(+), 4 deletions(-) create mode 100644 metric_learn/pgdm.py diff --git a/README.rst b/README.rst index 9bb762b4..af692623 100644 --- a/README.rst +++ b/README.rst @@ -15,6 +15,7 @@ Metric Learning algorithms in Python. - Local Fisher Discriminant Analysis (LFDA) - Relative Components Analysis (RCA) - Metric Learning for Kernel Regression (MLKR) +- Probabilistic Global Distance Metric Learning (PGDM) **Dependencies** diff --git a/metric_learn/__init__.py b/metric_learn/__init__.py index 5a7508c0..adc5307e 100644 --- a/metric_learn/__init__.py +++ b/metric_learn/__init__.py @@ -10,3 +10,4 @@ from .lfda import LFDA from .rca import RCA, RCA_Supervised from .mlkr import MLKR +from .pgdm import PGDM, PGDM_Supervised diff --git a/metric_learn/pgdm.py b/metric_learn/pgdm.py new file mode 100644 index 00000000..09cc8ab5 --- /dev/null +++ b/metric_learn/pgdm.py @@ -0,0 +1,436 @@ +""" +Probabilistic Global Distance Metric Learning, Xing et al., NIPS 2002 + +PGDM minimizes the sum of squared distances between similar examples, +while enforcing the sum of distances between dissimilar examples to be +greater than a certain margin. +This leads to a convex and, thus, local-minima-free optimization problem +that can be solved efficiently. +However, the algorithm involves the computation of eigenvalues, which is the +main speed-bottleneck. +Since it has initially been designed for clustering applications, one of the +implicit assumptions of PGDM is that all classes form a compact set, i.e., +follow a unimodal distribution, which restricts the possible use-cases of +this method. However, it is one of the earliest and a still often cited technique. + +Adapted from Matlab code at http://www.cs.cmu.edu/%7Eepxing/papers/Old_papers/code_Metric_online.tar.gz +""" + +from __future__ import print_function, absolute_import +import numpy as np +from six.moves import xrange +from sklearn.metrics import pairwise_distances +from sklearn.utils.validation import check_array, check_X_y + +from .base_metric import BaseMetricLearner +from .constraints import Constraints + + +# hack around lack of axis kwarg in older numpy versions +try: + np.linalg.norm([[4]], axis=1) +except TypeError: + def _vector_norm(X): + return np.apply_along_axis(np.linalg.norm, 1, X) +else: + def _vector_norm(X): + return np.linalg.norm(X, axis=1) + + +class PGDM(BaseMetricLearner): + """Probabilistic Global Distance Metric Learning (PGDM)""" + def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, + A0=None, diagonal=False, diagonal_c=1, verbose=False): + """Initialize PGDM. 
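As a sketch of the problem stated in the module docstring above (notation follows Xing et al.; S and D are the sets of similar and dissimilar index pairs, and ||x||_A = \sqrt{x' A x}):

    \min_A    \sum_{(i,j) \in S} ||x_i - x_j||_A^2
    s.t.      \sum_{(i,j) \in D} ||x_i - x_j||_A >= 1
              A \succeq 0   (positive semi-definite)

Keeping non-squared distances in the constraint rules out the trivial solution A = 0, and the problem remains convex.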
+ Parameters + ---------- + max_iter : int, optional + max_proj : int, optional + convergence_threshold : float, optional + A0 : (d x d) matrix, optional + initial metric, defaults to identity + only the main diagonal is taken if `diagonal == True` + diagonal : bool, optional + if True, a diagonal metric will be learned, + i.e., a simple scaling of dimensions + diagonal_c : float, optional + weight of the dissimilarity constraint for diagonal + metric learning + verbose : bool, optional + if True, prints information while learning + """ + self.max_iter = max_iter + self.max_proj = max_proj + self.convergence_threshold = convergence_threshold + self.A0 = A0 + self.diagonal = diagonal + self.diagonal_c = diagonal_c + self.verbose = verbose + + def fit(self, X, constraints): + """Learn the PGDM model. + Parameters + ---------- + X : (n x d) data matrix + each row corresponds to a single instance + constraints : 4-tuple of arrays + (a,b,c,d) indices into X, with (a,b) specifying similar and (c,d) + dissimilar pairs + """ + constraints = self._process_inputs(X, constraints) + if self.diagonal: + return self._fit_diag(X, constraints) + else: + return self._fit_full(X, constraints) + + def _process_inputs(self, X, constraints): + + self.X_ = X = check_array(X) + + # check to make sure that no two constrained vectors are identical + a,b,c,d = constraints + ident = _vector_norm(X[a] - X[b]) > 1e-9 + a, b = a[ident], b[ident] + ident = _vector_norm(X[c] - X[d]) > 1e-9 + c, d = c[ident], d[ident] + + # init metric + if self.A0 is None: + self.A_ = np.identity(X.shape[1]) + if not self.diagonal: + # Don't know why division by 10... it's in the original code + # and seems to affect the overall scale of the learned metric. + self.A_ /= 10 + else: + self.A_ = check_array(self.A0) + + return a,b,c,d + + def _fit_full(self, X, constraints): + """Learn full metric using PGDM. + Parameters + ---------- + X : (n x d) data matrix + each row corresponds to a single instance + constraints : 4-tuple of arrays + (a,b,c,d) indices into X, with (a,b) specifying similar and (c,d) + dissimilar pairs + """ + a,b,c,d = constraints + num_pos = len(a) + num_neg = len(c) + num_samples, num_dim = X.shape + + error1 = error2 = 1e10 + eps = 0.01 # error-bound of iterative projection on C1 and C2 + A = self.A_ + + # Create weight vector from similar samples + pos_diff = X[a] - X[b] + w = np.einsum('ij,ik->jk', pos_diff, pos_diff).ravel() + # `w` is the sum of all outer products of the rows in `pos_diff`. 
+ # The above `einsum` is equivalent to the much more inefficient: + # w = np.apply_along_axis( + # lambda x: np.outer(x,x).ravel(), + # 1, + # X[a] - X[b] + # ).sum(axis = 0) + t = w.dot(A.ravel() / 100.0) + + w1 = w / np.linalg.norm(w) # make `w` a unit vector + t1 = t / np.linalg.norm(w) # distance from origin to `w^T*x=t` plane + + cycle = 1 + alpha = 0.1 # initial step size along gradient + + grad1 = self._fS1(X, a, b, A) # gradient of similarity constraint function + grad2 = self._fD1(X, c, d, A) # gradient of dissimilarity constraint function + M = self._grad_projection(grad1, grad2) # gradient of fD1 orthogonal to fS1 + + A_old = A.copy() + + for cycle in xrange(self.max_iter): + + # projection of constraints C1 and C2 + satisfy = False + + for it in xrange(self.max_proj): + + # First constraint: + # f(A) = \sum_{i,j \in S} d_ij' A d_ij <= t (1) + # (1) can be rewritten as a linear constraint: w^T x = t, + # where x is the unrolled matrix of A, + # w is also an unrolled matrix of W where + # W_{kl}= \sum_{i,j \in S}d_ij^k * d_ij^l + x0 = A.ravel() + if w.dot(x0) <= t: + x = x0 + else: + x = x0 + (t1 - w1.dot(x0)) * w1 + A[:] = x.reshape(num_dim, num_dim) + + # Second constraint: + # PSD constraint A >= 0 + # project A onto domain A>0 + l, V = np.linalg.eigh((A + A.T) / 2) + A[:] = np.dot(V * np.maximum(0, l[None,:]), V.T) + + fDC2 = w.dot(A.ravel()) + error2 = (fDC2 - t) / t + if error2 < eps: + satisfy = True + break + + # third constraint: gradient ascent + # max: g(A) >= 1 + # here we suppose g(A) = fD(A) = \sum_{I,J \in D} sqrt(d_ij' A d_ij) + + obj_previous = self._fD(X, c, d, A_old) # g(A_old) + obj = self._fD(X, c, d, A) # g(A) + + if ((obj > obj_previous) or (cycle == 0)) and (satisfy): + + # If projection of 1 and 2 is successful, and such projection + # imprives objective function, slightly increase learning rate + # and update from the current A. + alpha *= 1.05 + A_old[:] = A + grad2 = self._fS1(X, a, b, A) + grad1 = self._fD1(X, c, d, A) + M = self._grad_projection(grad1, grad2) + A += alpha * M + + else: + + # If projection of 1 and 2 failed, or obj <= obj_previous due + # to projection of 1 and 2, shrink learning rate and re-update + # from the previous A. + alpha /= 2 + A[:] = A_old + alpha * M + + delta = np.linalg.norm(alpha * M) / np.linalg.norm(A_old) + if delta < self.convergence_threshold: + break + if self.verbose: + print('pgdm iter: %d, conv = %f, projections = %d' % (cycle, delta, it+1)) + + if delta > self.convergence_threshold: + self.converged_ = False + if self.verbose: + print('pgdm did not converge, conv = %f' % (delta,)) + else: + self.converged_ = True + if self.verbose: + print('pgdm converged at iter %d, conv = %f' % (cycle, delta)) + self.A_[:] = A_old + self.n_iter_ = cycle + return self + + def _fit_diag(self, X, constraints): + """Learn diagonal metric using PGDM. 
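The two projections inside the inner loop of `_fit_full` above can be hard to read in the unrolled-matrix form; the following is a minimal standalone sketch on a toy 2x2 matrix (toy values, not taken from the patch):

    import numpy as np

    A = np.array([[1.0, 0.9],
                  [0.9, 0.2]])
    w = np.array([1.0, 0.0, 0.0, 1.0])   # stands in for the unrolled \sum d_ij' d_ij
    t = 0.5
    w1 = w / np.linalg.norm(w)
    t1 = t / np.linalg.norm(w)

    # C1: project the unrolled A onto the half-space {x : w'x <= t}
    x = A.ravel()
    if w.dot(x) > t:
        x = x + (t1 - w1.dot(x)) * w1
    A = x.reshape(2, 2)

    # C2: project onto the PSD cone by clipping negative eigenvalues
    lam, V = np.linalg.eigh((A + A.T) / 2)
    A = np.dot(V * np.maximum(lam, 0), V.T)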
+ Parameters + ---------- + X : (n x d) data matrix + each row corresponds to a single instance + constraints : 4-tuple of arrays + (a,b,c,d) indices into X, with (a,b) specifying similar and (c,d) + dissimilar pairs + """ + a,b,c,d = constraints + num_pos = len(a) + num_neg = len(c) + num_samples, num_dim = X.shape + + s_sum = np.sum((X[a] - X[b]) ** 2, axis = 0) + + it = 0 + error = 1 + eps = 1e-6 + reduction = 2 + w = np.diag(self.A_).copy() + + while error > self.convergence_threshold: + + fD0, fD_1st_d, fD_2nd_d = self._D_constraint(X, c, d, w) + obj_initial = np.dot(s_sum, w) + self.diagonal_c * fD0 + fS_1st_d = s_sum # first derivative of the similarity constraints + + gradient = fS_1st_d - self.diagonal_c * fD_1st_d # gradient of the objective + hessian = -self.diagonal_c * fD_2nd_d + eps * np.eye(num_dim) # Hessian of the objective + step = np.dot(np.linalg.inv(hessian), gradient); + + # Newton-Rapshon update + # search over optimal lambda + lambd = 1 # initial step-size + w_tmp = np.maximum(0, w - lambd * step) + + obj = np.dot(s_sum, w_tmp) + self.diagonal_c * self._D_objective(X, c, d, w_tmp) + obj_previous = obj * 1.1 # just to get the while-loop started + + inner_it = 0 + while obj < obj_previous: + obj_previous = obj + w_previous = w_tmp.copy() + lambd /= reduction + w_tmp = np.maximum(0, w - lambd * step) + obj = np.dot(s_sum, w_tmp) + self.diagonal_c * self._D_objective(X, c, d, w_tmp) + inner_it += 1 + + w[:] = w_previous + error = np.abs((obj_previous - obj_initial) / obj_previous) + if self.verbose: + print('pgdm iter: %d, conv = %f' % (it, error)) + it += 1 + + self.A_ = np.diag(w) + return self + + def _fD(self, X, c, d, A): + """The value of the dissimilarity constraint function. + + f = f(\sum_{ij \in D} distance(x_i, x_j)) + i.e. distance can be L1: \sqrt{(x_i-x_j)A(x_i-x_j)'} + """ + diff = X[c] - X[d] + return np.log(np.sum(np.sqrt(np.sum(np.dot(diff, A) * diff, axis = 1))) + 1e-6) + + def _fD1(self, X, c, d, A): + """The gradient of the dissimilarity constraint function w.r.t. A. + + For example, let distance by L1 norm: + f = f(\sum_{ij \in D} \sqrt{(x_i-x_j)A(x_i-x_j)'}) + df/dA_{kl} = f'* d(\sum_{ij \in D} \sqrt{(x_i-x_j)^k*(x_i-x_j)^l})/dA_{kl} + + Note that d_ij*A*d_ij' = tr(d_ij*A*d_ij') = tr(d_ij'*d_ij*A) + so, d(d_ij*A*d_ij')/dA = d_ij'*d_ij + df/dA = f'(\sum_{ij \in D} \sqrt{tr(d_ij'*d_ij*A)}) + * 0.5*(\sum_{ij \in D} (1/sqrt{tr(d_ij'*d_ij*A)})*(d_ij'*d_ij)) + """ + dim = X.shape[1] + diff = X[c] - X[d] + M = np.einsum('ij,ik->ijk', diff, diff) # outer products of all rows in `diff` + dist = np.sqrt(M.dot(A).trace(axis1 = 1, axis2 = 2)) + sum_deri = np.sum(0.5 * (M / (dist[:,None,None] + 1e-6)), axis = 0) + sum_dist = dist.sum() + return sum_deri / (sum_dist + 1e-6) + + def _fS1(self, X, a, b, A): + """The gradient of the similarity constraint function w.r.t. A. 
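The `einsum` calls in `_fD1` above and `_fS1` below are compact; a small numeric check (random toy data, illustrative only) spells out the "outer products of rows" reading used in the comments:

    import numpy as np

    rng = np.random.RandomState(0)
    diff = rng.randn(5, 3)

    per_row = np.einsum('ij,ik->ijk', diff, diff)   # one outer product per row
    summed = np.einsum('ij,ik->jk', diff, diff)     # sum of all outer products

    assert np.allclose(per_row[2], np.outer(diff[2], diff[2]))
    assert np.allclose(summed, sum(np.outer(row, row) for row in diff))
    assert np.allclose(per_row.sum(axis=0), summed)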
+ + f = \sum_{ij}(x_i-x_j)A(x_i-x_j)' = \sum_{ij}d_ij*A*d_ij' + df/dA = d(d_ij*A*d_ij')/dA + + Note that d_ij*A*d_ij' = tr(d_ij*A*d_ij') = tr(d_ij'*d_ij*A) + so, d(d_ij*A*d_ij')/dA = d_ij'*d_ij + """ + dim = X.shape[1] + diff = X[a] - X[b] + return np.einsum('ij,ik->jk', diff, diff) # sum of outer products of all rows in `diff` + + def _grad_projection(self, grad1, grad2): + grad2 = grad2 / np.linalg.norm(grad2) + gtemp = grad1 - np.sum(grad1 * grad2) * grad2 + gtemp /= np.linalg.norm(gtemp) + return gtemp + + def _D_objective(self, X, c, d, w): + return np.log(np.sum(np.sqrt(np.sum(((X[c] - X[d]) ** 2) * w[None,:], axis = 1) + 1e-6))) + + def _D_constraint(self, X, c, d, w): + """Compute the value, 1st derivative, second derivative (Hessian) of + a dissimilarity constraint function gF(sum_ij distance(d_ij A d_ij)) + where A is a diagonal matrix (in the form of a column vector 'w'). + """ + diff = X[c] - X[d] + diff_sq = diff * diff + dist = np.sqrt(diff_sq.dot(w)) + sum_deri1 = np.sum(diff_sq / (2 * np.maximum(dist, 1e-6))[:,None], axis = 0) + sum_deri2 = np.sum( + np.einsum('ij,ik->ijk', diff_sq, diff_sq) / (-4 * np.maximum(1e-6, dist**3)[:,None,None]), + axis = 0 + ) + sum_dist = dist.sum() + return ( + np.log(sum_dist), + sum_deri1 / sum_dist, + sum_deri2 / sum_dist - np.outer(sum_deri1, sum_deri1) / (sum_dist * sum_dist) + ) + + def metric(self): + return self.A_ + + def transformer(self): + """Computes the transformation matrix from the Mahalanobis matrix. + L = V.T * w^(-1/2), with A = V*w*V.T being the eigenvector decomposition of A with + the eigenvalues in the diagonal matrix w and the columns of V being the eigenvectors. + + The Cholesky decomposition cannot be applied here, since PGDM learns only a positive + *semi*-definite Mahalanobis matrix. + + Returns + ------- + L : (d x d) matrix + """ + if self.diagonal: + return np.sqrt(self.A_) + else: + w, V = np.linalg.eigh(self.A_) + return V.T * np.sqrt(np.maximum(0, w[:,None])) + + +class PGDM_Supervised(PGDM): + """Probabilistic Global Distance Metric Learning (PGDM)""" + def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-6, + num_labeled=np.inf, num_constraints=None, + A0=None, diagonal=False, diagonal_c=1, verbose=False): + """Initialize the learner. + Parameters + ---------- + max_iter : int, optional + max_proj : int, optional + convergence_threshold : float, optional + num_labeled : int, optional + number of labels to preserve for training + num_constraints: int, optional + number of constraints to generate + A0 : (d x d) matrix, optional + initial metric, defaults to identity + only the main diagonal is taken if `diagonal == True` + diagonal : bool, optional + if True, a diagonal metric will be learned, + i.e., a simple scaling of dimensions + diagonal_c : float, optional + weight of the dissimilarity constraint for diagonal + metric learning + verbose : bool, optional + if True, prints information while learning + """ + PGDM.__init__(self, max_iter=max_iter, max_proj=max_proj, + convergence_threshold=convergence_threshold, + A0=A0, diagonal=diagonal, diagonal_c=diagonal_c, + verbose=verbose) + self.num_labeled = num_labeled + self.num_constraints = num_constraints + + def fit(self, X, y, random_state=np.random): + """Create constraints from labels and learn the PGDM model. + Parameters + ---------- + X : (n x d) matrix + Input data, where each row corresponds to a single instance. + y : (n) array-like + Data labels. 
+ random_state : numpy.random.RandomState, optional + If provided, controls random number generation. + """ + X, y = check_X_y(X, y) + num_constraints = self.num_constraints + if num_constraints is None: + num_classes = len(np.unique(y)) + num_constraints = 20 * num_classes**2 + + c = Constraints.random_subset(y, self.num_labeled, + random_state=random_state) + pos_neg = c.positive_negative_pairs(num_constraints, + random_state=random_state) + return PGDM.fit(self, X, pos_neg) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 1e7f31fe..9694ec02 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -6,8 +6,8 @@ from numpy.testing import assert_array_almost_equal from metric_learn import ( - LMNN, NCA, LFDA, Covariance, MLKR, - LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised) + LMNN, NCA, LFDA, Covariance, MLKR, PGDM, + LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, PGDM_Supervised) # Import this specially for testing. from metric_learn.lmnn import python_LMNN @@ -149,5 +149,48 @@ def test_iris(self): self.assertLess(csep, 0.25) +class TestPGDM(MetricTestCase): + def test_iris(self): + + # Generate full set of constraints for comparison with reference implementation + n = self.iris_points.shape[0] + a, b, c, d = [], [], [], [] + for i in range(n): + for j in range(i+1, n): + if self.iris_labels[i] == self.iris_labels[j]: + a.append(i) + b.append(j) + else: + c.append(i) + d.append(j) + + # Full metric + pgdm = PGDM(convergence_threshold = 0.01) + pgdm.fit(self.iris_points, [np.asarray(x) for x in [a,b,c,d]]) + expected = [[+0.00046504, +0.00083371, -0.00111959, -0.00165265], + [+0.00083371, +0.00149466, -0.00200719, -0.00296284], + [-0.00111959, -0.00200719, +0.00269546, +0.00397881], + [-0.00165265, -0.00296284, +0.00397881, +0.00587320]] + assert_array_almost_equal(expected, pgdm.metric(), decimal=6) + + # Diagonal metric + pgdm = PGDM(diagonal = True) + pgdm.fit(self.iris_points, [np.asarray(x) for x in [a,b,c,d]]) + expected = [0, 0, 1.21045968, 1.22552608] + assert_array_almost_equal(np.diag(expected), pgdm.metric(), decimal=6) + + # Supervised Full + pgdm = PGDM_Supervised() + pgdm.fit(self.iris_points, self.iris_labels) + csep = class_separation(pgdm.transform(), self.iris_labels) + self.assertLess(csep, 0.15) + + # Supervised Diagonal + pgdm = PGDM_Supervised(diagonal = True) + pgdm.fit(self.iris_points, self.iris_labels) + csep = class_separation(pgdm.transform(), self.iris_labels) + self.assertLess(csep, 0.2) + + if __name__ == '__main__': unittest.main() diff --git a/test/test_base_metric.py b/test/test_base_metric.py index d73138cd..6b35dd78 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -63,5 +63,16 @@ def test_mlkr(self): "MLKR(A0=None, alpha=0.0001, epsilon=0.01, " "max_iter=1000, num_dims=None)") + def test_pgdm(self): + self.assertEqual(str(metric_learn.PGDM()), """ +PGDM(A0=None, convergence_threshold=0.001, diagonal=False, diagonal_c=1, + max_iter=100, max_proj=10000, verbose=False) +""".strip('\n')) + self.assertEqual(str(metric_learn.PGDM_Supervised()), """ +PGDM_Supervised(A0=None, convergence_threshold=1e-06, diagonal=False, + diagonal_c=1, max_iter=100, max_proj=10000, num_constraints=None, + num_labeled=inf, verbose=False) +""".strip('\n')) + if __name__ == '__main__': unittest.main() diff --git a/test/test_fit_transform.py b/test/test_fit_transform.py index eff8fa01..9e687a63 100644 --- a/test/test_fit_transform.py +++ b/test/test_fit_transform.py @@ -5,7 +5,7 
@@ from metric_learn import ( LMNN, NCA, LFDA, Covariance, MLKR, - LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised) + LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, PGDM_Supervised) class TestFitTransform(unittest.TestCase): @@ -118,6 +118,18 @@ def test_mlkr(self): assert_array_almost_equal(res_1, res_2) + def test_pgdm_supervised(self): + seed = np.random.RandomState(1234) + pgdm = PGDM_Supervised(num_constraints=200) + pgdm.fit(self.X, self.y, random_state=seed) + res_1 = pgdm.transform() + + seed = np.random.RandomState(1234) + pgdm = PGDM_Supervised(num_constraints=200) + res_2 = pgdm.fit_transform(self.X, self.y, random_state=seed) + + assert_array_almost_equal(res_1, res_2) + if __name__ == '__main__': unittest.main() diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index 58c7cd05..156a6c63 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -4,7 +4,7 @@ from metric_learn import ( LMNN, NCA, LFDA, Covariance, MLKR, - LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised) + LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, PGDM_Supervised) # Wrap the _Supervised methods with a deterministic wrapper for testing. @@ -22,6 +22,10 @@ class dITML(deterministic_mixin, ITML_Supervised): pass +class dPGDM(deterministic_mixin, PGDM_Supervised): + pass + + class dSDML(deterministic_mixin, SDML_Supervised): pass @@ -52,6 +56,9 @@ def test_lsml(self): def test_itml(self): check_estimator(dITML) + def test_pgdm(self): + check_estimator(dPGDM) + # This fails due to a FloatingPointError # def test_sdml(self): # check_estimator(dSDML) From fc1d026bc630fb0ca9b76e872cb1ca7b0ac109b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Barz?= Date: Tue, 23 May 2017 11:40:10 +0200 Subject: [PATCH 2/7] Python2 compatibility --- metric_learn/pgdm.py | 10 +++++----- test/test_base_metric.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/metric_learn/pgdm.py b/metric_learn/pgdm.py index 09cc8ab5..4b6c6ce5 100644 --- a/metric_learn/pgdm.py +++ b/metric_learn/pgdm.py @@ -40,7 +40,7 @@ def _vector_norm(X): class PGDM(BaseMetricLearner): """Probabilistic Global Distance Metric Learning (PGDM)""" def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, - A0=None, diagonal=False, diagonal_c=1, verbose=False): + A0=None, diagonal=False, diagonal_c=1.0, verbose=False): """Initialize PGDM. Parameters ---------- @@ -100,7 +100,7 @@ def _process_inputs(self, X, constraints): if not self.diagonal: # Don't know why division by 10... it's in the original code # and seems to affect the overall scale of the learned metric. - self.A_ /= 10 + self.A_ /= 10.0 else: self.A_ = check_array(self.A0) @@ -244,9 +244,9 @@ def _fit_diag(self, X, constraints): s_sum = np.sum((X[a] - X[b]) ** 2, axis = 0) it = 0 - error = 1 + error = 1.0 eps = 1e-6 - reduction = 2 + reduction = 2.0 w = np.diag(self.A_).copy() while error > self.convergence_threshold: @@ -382,7 +382,7 @@ class PGDM_Supervised(PGDM): """Probabilistic Global Distance Metric Learning (PGDM)""" def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-6, num_labeled=np.inf, num_constraints=None, - A0=None, diagonal=False, diagonal_c=1, verbose=False): + A0=None, diagonal=False, diagonal_c=1.0, verbose=False): """Initialize the learner. 
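The float literals introduced by this patch guard against Python 2's truncating integer division (illustrative values; assumes no `from __future__ import division`):

    lambd, reduction = 1, 2
    lambd /= reduction        # Python 2: 1 / 2 == 0, so the line-search step collapses
    lambd, reduction = 1, 2.0
    lambd /= reduction        # 0.5 on both Python 2 and Python 3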
Parameters ---------- diff --git a/test/test_base_metric.py b/test/test_base_metric.py index 6b35dd78..3292260a 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -65,12 +65,12 @@ def test_mlkr(self): def test_pgdm(self): self.assertEqual(str(metric_learn.PGDM()), """ -PGDM(A0=None, convergence_threshold=0.001, diagonal=False, diagonal_c=1, +PGDM(A0=None, convergence_threshold=0.001, diagonal=False, diagonal_c=1.0, max_iter=100, max_proj=10000, verbose=False) """.strip('\n')) self.assertEqual(str(metric_learn.PGDM_Supervised()), """ PGDM_Supervised(A0=None, convergence_threshold=1e-06, diagonal=False, - diagonal_c=1, max_iter=100, max_proj=10000, num_constraints=None, + diagonal_c=1.0, max_iter=100, max_proj=10000, num_constraints=None, num_labeled=inf, verbose=False) """.strip('\n')) From 23067e44c5df92d4adbba8c6ac022c3e5d728932 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Barz?= Date: Tue, 23 May 2017 13:32:40 +0200 Subject: [PATCH 3/7] Speed up PGDM on high-dimensional data --- metric_learn/pgdm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metric_learn/pgdm.py b/metric_learn/pgdm.py index 4b6c6ce5..6b48bfed 100644 --- a/metric_learn/pgdm.py +++ b/metric_learn/pgdm.py @@ -309,8 +309,8 @@ def _fD1(self, X, c, d, A): dim = X.shape[1] diff = X[c] - X[d] M = np.einsum('ij,ik->ijk', diff, diff) # outer products of all rows in `diff` - dist = np.sqrt(M.dot(A).trace(axis1 = 1, axis2 = 2)) - sum_deri = np.sum(0.5 * (M / (dist[:,None,None] + 1e-6)), axis = 0) + dist = np.sqrt(np.sum(M * A[None,:,:], axis = (1,2))) + sum_deri = np.sum(M / (2 * (dist[:,None,None] + 1e-6)), axis = 0) sum_dist = dist.sum() return sum_deri / (sum_dist + 1e-6) From 758276179ac79e24b0e23a3ea1b09be8ea516987 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Barz?= Date: Thu, 25 May 2017 10:34:23 +0200 Subject: [PATCH 4/7] Optimized some summations using `np.einsum` --- metric_learn/pgdm.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/metric_learn/pgdm.py b/metric_learn/pgdm.py index 6b48bfed..89d0e451 100644 --- a/metric_learn/pgdm.py +++ b/metric_learn/pgdm.py @@ -308,9 +308,9 @@ def _fD1(self, X, c, d, A): """ dim = X.shape[1] diff = X[c] - X[d] - M = np.einsum('ij,ik->ijk', diff, diff) # outer products of all rows in `diff` - dist = np.sqrt(np.sum(M * A[None,:,:], axis = (1,2))) - sum_deri = np.sum(M / (2 * (dist[:,None,None] + 1e-6)), axis = 0) + M = np.einsum('ij,ik->ijk', diff, diff) # outer products of all rows in `diff` + dist = np.sqrt(np.einsum('ijk,jk', M, A)) # equivalent to: np.sqrt(np.sum(M * A[None,:,:], axis = (1,2))) + sum_deri = np.einsum('ijk,i->jk', M, 0.5 / (dist + 1e-6)) # equivalent to: np.sum(M / (2 * (dist[:,None,None] + 1e-6)), axis = 0) sum_dist = dist.sum() return sum_deri / (sum_dist + 1e-6) @@ -344,10 +344,11 @@ def _D_constraint(self, X, c, d, w): diff = X[c] - X[d] diff_sq = diff * diff dist = np.sqrt(diff_sq.dot(w)) - sum_deri1 = np.sum(diff_sq / (2 * np.maximum(dist, 1e-6))[:,None], axis = 0) - sum_deri2 = np.sum( - np.einsum('ij,ik->ijk', diff_sq, diff_sq) / (-4 * np.maximum(1e-6, dist**3)[:,None,None]), - axis = 0 + sum_deri1 = np.einsum('ij,i', diff_sq, 0.5 / np.maximum(dist, 1e-6)) + sum_deri2 = np.einsum( + 'ijk,i', + np.einsum('ij,ik->ijk', diff_sq, diff_sq), + -0.25 / np.maximum(1e-6, dist**3) ) sum_dist = dist.sum() return ( From 9a29405b32a02d464d1f0854761fdfb99e09cf4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Barz?= Date: Thu, 25 May 2017 11:29:21 +0200 Subject: [PATCH 
5/7] Addressed requests from review by perimosocordiae --- metric_learn/_util.py | 12 +++++++ metric_learn/itml.py | 19 +++-------- metric_learn/pgdm.py | 70 ++++++++++++++++++--------------------- test/metric_learn_test.py | 22 +++++------- 4 files changed, 58 insertions(+), 65 deletions(-) create mode 100644 metric_learn/_util.py diff --git a/metric_learn/_util.py b/metric_learn/_util.py new file mode 100644 index 00000000..b34860d6 --- /dev/null +++ b/metric_learn/_util.py @@ -0,0 +1,12 @@ +import numpy as np + + +# hack around lack of axis kwarg in older numpy versions +try: + np.linalg.norm([[4]], axis=1) +except TypeError: + def vector_norm(X): + return np.apply_along_axis(np.linalg.norm, 1, X) +else: + def vector_norm(X): + return np.linalg.norm(X, axis=1) \ No newline at end of file diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 4c154ad4..7169fb36 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -21,6 +21,7 @@ from .base_metric import BaseMetricLearner from .constraints import Constraints +from ._util import vector_norm class ITML(BaseMetricLearner): @@ -54,10 +55,10 @@ def _process_inputs(self, X, constraints, bounds): self.X_ = X = check_array(X) # check to make sure that no two constrained vectors are identical a,b,c,d = constraints - ident = _vector_norm(X[a] - X[b]) > 1e-9 - a, b = a[ident], b[ident] - ident = _vector_norm(X[c] - X[d]) > 1e-9 - c, d = c[ident], d[ident] + no_ident = vector_norm(X[a] - X[b]) > 1e-9 + a, b = a[no_ident], b[no_ident] + no_ident = vector_norm(X[c] - X[d]) > 1e-9 + c, d = c[no_ident], d[no_ident] # init bounds if bounds is None: self.bounds_ = np.percentile(pairwise_distances(X), (5, 95)) @@ -138,16 +139,6 @@ def fit(self, X, constraints, bounds=None): def metric(self): return self.A_ -# hack around lack of axis kwarg in older numpy versions -try: - np.linalg.norm([[4]], axis=1) -except TypeError: - def _vector_norm(X): - return np.apply_along_axis(np.linalg.norm, 1, X) -else: - def _vector_norm(X): - return np.linalg.norm(X, axis=1) - class ITML_Supervised(ITML): """Information Theoretic Metric Learning (ITML)""" diff --git a/metric_learn/pgdm.py b/metric_learn/pgdm.py index 89d0e451..8e2e02a0 100644 --- a/metric_learn/pgdm.py +++ b/metric_learn/pgdm.py @@ -16,7 +16,7 @@ Adapted from Matlab code at http://www.cs.cmu.edu/%7Eepxing/papers/Old_papers/code_Metric_online.tar.gz """ -from __future__ import print_function, absolute_import +from __future__ import print_function, absolute_import, division import numpy as np from six.moves import xrange from sklearn.metrics import pairwise_distances @@ -24,18 +24,9 @@ from .base_metric import BaseMetricLearner from .constraints import Constraints +from ._util import vector_norm -# hack around lack of axis kwarg in older numpy versions -try: - np.linalg.norm([[4]], axis=1) -except TypeError: - def _vector_norm(X): - return np.apply_along_axis(np.linalg.norm, 1, X) -else: - def _vector_norm(X): - return np.linalg.norm(X, axis=1) - class PGDM(BaseMetricLearner): """Probabilistic Global Distance Metric Learning (PGDM)""" @@ -89,10 +80,14 @@ def _process_inputs(self, X, constraints): # check to make sure that no two constrained vectors are identical a,b,c,d = constraints - ident = _vector_norm(X[a] - X[b]) > 1e-9 - a, b = a[ident], b[ident] - ident = _vector_norm(X[c] - X[d]) > 1e-9 - c, d = c[ident], d[ident] + no_ident = vector_norm(X[a] - X[b]) > 1e-9 + a, b = a[no_ident], b[no_ident] + no_ident = vector_norm(X[c] - X[d]) > 1e-9 + c, d = c[no_ident], d[no_ident] + if len(a) == 
0: + raise RuntimeError('No similarity constraints given for PGDM.') + if len(c) == 0: + raise RuntimeError('No dissimilarity constraints given for PGDM.') # init metric if self.A0 is None: @@ -135,17 +130,18 @@ def _fit_full(self, X, constraints): # 1, # X[a] - X[b] # ).sum(axis = 0) - t = w.dot(A.ravel() / 100.0) + t = w.dot(A.ravel()) / 100.0 - w1 = w / np.linalg.norm(w) # make `w` a unit vector - t1 = t / np.linalg.norm(w) # distance from origin to `w^T*x=t` plane + w_norm = np.linalg.norm(w) + w1 = w / w_norm # make `w` a unit vector + t1 = t / w_norm # distance from origin to `w^T*x=t` plane cycle = 1 - alpha = 0.1 # initial step size along gradient + alpha = 0.1 # initial step size along gradient - grad1 = self._fS1(X, a, b, A) # gradient of similarity constraint function - grad2 = self._fD1(X, c, d, A) # gradient of dissimilarity constraint function - M = self._grad_projection(grad1, grad2) # gradient of fD1 orthogonal to fS1 + grad1 = self._fS1(X, a, b, A) # gradient of similarity constraint function + grad2 = self._fD1(X, c, d, A) # gradient of dissimilarity constraint function + M = self._grad_projection(grad1, grad2) # gradient of fD1 orthogonal to fS1 A_old = A.copy() @@ -185,13 +181,13 @@ def _fit_full(self, X, constraints): # max: g(A) >= 1 # here we suppose g(A) = fD(A) = \sum_{I,J \in D} sqrt(d_ij' A d_ij) - obj_previous = self._fD(X, c, d, A_old) # g(A_old) - obj = self._fD(X, c, d, A) # g(A) + obj_previous = self._fD(X, c, d, A_old) # g(A_old) + obj = self._fD(X, c, d, A) # g(A) - if ((obj > obj_previous) or (cycle == 0)) and (satisfy): + if satisfy and (obj > obj_previous or cycle == 0): # If projection of 1 and 2 is successful, and such projection - # imprives objective function, slightly increase learning rate + # improves objective function, slightly increase learning rate # and update from the current A. alpha *= 1.05 A_old[:] = A @@ -241,7 +237,7 @@ def _fit_diag(self, X, constraints): num_neg = len(c) num_samples, num_dim = X.shape - s_sum = np.sum((X[a] - X[b]) ** 2, axis = 0) + s_sum = np.sum((X[a] - X[b]) ** 2, axis=0) it = 0 error = 1.0 @@ -253,19 +249,19 @@ def _fit_diag(self, X, constraints): fD0, fD_1st_d, fD_2nd_d = self._D_constraint(X, c, d, w) obj_initial = np.dot(s_sum, w) + self.diagonal_c * fD0 - fS_1st_d = s_sum # first derivative of the similarity constraints + fS_1st_d = s_sum # first derivative of the similarity constraints - gradient = fS_1st_d - self.diagonal_c * fD_1st_d # gradient of the objective - hessian = -self.diagonal_c * fD_2nd_d + eps * np.eye(num_dim) # Hessian of the objective + gradient = fS_1st_d - self.diagonal_c * fD_1st_d # gradient of the objective + hessian = -self.diagonal_c * fD_2nd_d + eps * np.eye(num_dim) # Hessian of the objective step = np.dot(np.linalg.inv(hessian), gradient); # Newton-Rapshon update # search over optimal lambda - lambd = 1 # initial step-size + lambd = 1 # initial step-size w_tmp = np.maximum(0, w - lambd * step) obj = np.dot(s_sum, w_tmp) + self.diagonal_c * self._D_objective(X, c, d, w_tmp) - obj_previous = obj * 1.1 # just to get the while-loop started + obj_previous = obj * 1.1 # just to get the while-loop started inner_it = 0 while obj < obj_previous: @@ -292,7 +288,7 @@ def _fD(self, X, c, d, A): i.e. 
distance can be L1: \sqrt{(x_i-x_j)A(x_i-x_j)'} """ diff = X[c] - X[d] - return np.log(np.sum(np.sqrt(np.sum(np.dot(diff, A) * diff, axis = 1))) + 1e-6) + return np.log(np.sum(np.sqrt(np.sum(np.dot(diff, A) * diff, axis=1))) + 1e-6) def _fD1(self, X, c, d, A): """The gradient of the dissimilarity constraint function w.r.t. A. @@ -309,8 +305,8 @@ def _fD1(self, X, c, d, A): dim = X.shape[1] diff = X[c] - X[d] M = np.einsum('ij,ik->ijk', diff, diff) # outer products of all rows in `diff` - dist = np.sqrt(np.einsum('ijk,jk', M, A)) # equivalent to: np.sqrt(np.sum(M * A[None,:,:], axis = (1,2))) - sum_deri = np.einsum('ijk,i->jk', M, 0.5 / (dist + 1e-6)) # equivalent to: np.sum(M / (2 * (dist[:,None,None] + 1e-6)), axis = 0) + dist = np.sqrt(np.einsum('ijk,jk', M, A)) # equivalent to: np.sqrt(np.sum(M * A[None,:,:], axis=(1,2))) + sum_deri = np.einsum('ijk,i->jk', M, 0.5 / (dist + 1e-6)) # equivalent to: np.sum(M / (2 * (dist[:,None,None] + 1e-6)), axis=0) sum_dist = dist.sum() return sum_deri / (sum_dist + 1e-6) @@ -325,7 +321,7 @@ def _fS1(self, X, a, b, A): """ dim = X.shape[1] diff = X[a] - X[b] - return np.einsum('ij,ik->jk', diff, diff) # sum of outer products of all rows in `diff` + return np.einsum('ij,ik->jk', diff, diff) # sum of outer products of all rows in `diff` def _grad_projection(self, grad1, grad2): grad2 = grad2 / np.linalg.norm(grad2) @@ -334,7 +330,7 @@ def _grad_projection(self, grad1, grad2): return gtemp def _D_objective(self, X, c, d, w): - return np.log(np.sum(np.sqrt(np.sum(((X[c] - X[d]) ** 2) * w[None,:], axis = 1) + 1e-6))) + return np.log(np.sum(np.sqrt(np.sum(((X[c] - X[d]) ** 2) * w[None,:], axis=1) + 1e-6))) def _D_constraint(self, X, c, d, w): """Compute the value, 1st derivative, second derivative (Hessian) of diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 9694ec02..2aa37687 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -154,19 +154,13 @@ def test_iris(self): # Generate full set of constraints for comparison with reference implementation n = self.iris_points.shape[0] - a, b, c, d = [], [], [], [] - for i in range(n): - for j in range(i+1, n): - if self.iris_labels[i] == self.iris_labels[j]: - a.append(i) - b.append(j) - else: - c.append(i) - d.append(j) + mask = (self.iris_labels[None] == self.iris_labels[:,None]) + a, b = np.nonzero(np.triu(mask, k=1)) + c, d = np.nonzero(np.triu(~mask, k=1)) # Full metric - pgdm = PGDM(convergence_threshold = 0.01) - pgdm.fit(self.iris_points, [np.asarray(x) for x in [a,b,c,d]]) + pgdm = PGDM(convergence_threshold=0.01) + pgdm.fit(self.iris_points, [a,b,c,d]) expected = [[+0.00046504, +0.00083371, -0.00111959, -0.00165265], [+0.00083371, +0.00149466, -0.00200719, -0.00296284], [-0.00111959, -0.00200719, +0.00269546, +0.00397881], @@ -174,8 +168,8 @@ def test_iris(self): assert_array_almost_equal(expected, pgdm.metric(), decimal=6) # Diagonal metric - pgdm = PGDM(diagonal = True) - pgdm.fit(self.iris_points, [np.asarray(x) for x in [a,b,c,d]]) + pgdm = PGDM(diagonal=True) + pgdm.fit(self.iris_points, [a,b,c,d]) expected = [0, 0, 1.21045968, 1.22552608] assert_array_almost_equal(np.diag(expected), pgdm.metric(), decimal=6) @@ -186,7 +180,7 @@ def test_iris(self): self.assertLess(csep, 0.15) # Supervised Diagonal - pgdm = PGDM_Supervised(diagonal = True) + pgdm = PGDM_Supervised(diagonal=True) pgdm.fit(self.iris_points, self.iris_labels) csep = class_separation(pgdm.transform(), self.iris_labels) self.assertLess(csep, 0.2) From e9893c3c0dd5b81ef017086e88646f0a46122d2a Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Barz?= Date: Thu, 25 May 2017 11:46:08 +0200 Subject: [PATCH 6/7] Renamed PGDM to MMC --- README.rst | 2 +- metric_learn/__init__.py | 2 +- metric_learn/{pgdm.py => mmc.py} | 48 ++++++++++++++++---------------- test/metric_learn_test.py | 30 ++++++++++---------- test/test_base_metric.py | 12 ++++---- test/test_fit_transform.py | 14 +++++----- test/test_sklearn_compat.py | 8 +++--- 7 files changed, 58 insertions(+), 58 deletions(-) rename metric_learn/{pgdm.py => mmc.py} (91%) diff --git a/README.rst b/README.rst index af692623..1e8adbe7 100644 --- a/README.rst +++ b/README.rst @@ -15,7 +15,7 @@ Metric Learning algorithms in Python. - Local Fisher Discriminant Analysis (LFDA) - Relative Components Analysis (RCA) - Metric Learning for Kernel Regression (MLKR) -- Probabilistic Global Distance Metric Learning (PGDM) +- Mahalanobis Metric for Clustering (MMC) **Dependencies** diff --git a/metric_learn/__init__.py b/metric_learn/__init__.py index adc5307e..b86c10e1 100644 --- a/metric_learn/__init__.py +++ b/metric_learn/__init__.py @@ -10,4 +10,4 @@ from .lfda import LFDA from .rca import RCA, RCA_Supervised from .mlkr import MLKR -from .pgdm import PGDM, PGDM_Supervised +from .mmc import MMC, MMC_Supervised diff --git a/metric_learn/pgdm.py b/metric_learn/mmc.py similarity index 91% rename from metric_learn/pgdm.py rename to metric_learn/mmc.py index 8e2e02a0..36c16812 100644 --- a/metric_learn/pgdm.py +++ b/metric_learn/mmc.py @@ -1,7 +1,7 @@ """ -Probabilistic Global Distance Metric Learning, Xing et al., NIPS 2002 +Mahalanobis Metric Learning with Application for Clustering with Side-Information, Xing et al., NIPS 2002 -PGDM minimizes the sum of squared distances between similar examples, +MMC minimizes the sum of squared distances between similar examples, while enforcing the sum of distances between dissimilar examples to be greater than a certain margin. This leads to a convex and, thus, local-minima-free optimization problem @@ -9,7 +9,7 @@ However, the algorithm involves the computation of eigenvalues, which is the main speed-bottleneck. Since it has initially been designed for clustering applications, one of the -implicit assumptions of PGDM is that all classes form a compact set, i.e., +implicit assumptions of MMC is that all classes form a compact set, i.e., follow a unimodal distribution, which restricts the possible use-cases of this method. However, it is one of the earliest and a still often cited technique. @@ -28,11 +28,11 @@ -class PGDM(BaseMetricLearner): - """Probabilistic Global Distance Metric Learning (PGDM)""" +class MMC(BaseMetricLearner): + """Mahalanobis Metric for Clustering (MMC)""" def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, A0=None, diagonal=False, diagonal_c=1.0, verbose=False): - """Initialize PGDM. + """Initialize MMC. Parameters ---------- max_iter : int, optional @@ -59,7 +59,7 @@ def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, self.verbose = verbose def fit(self, X, constraints): - """Learn the PGDM model. + """Learn the MMC model. 
Parameters ---------- X : (n x d) data matrix @@ -85,9 +85,9 @@ def _process_inputs(self, X, constraints): no_ident = vector_norm(X[c] - X[d]) > 1e-9 c, d = c[no_ident], d[no_ident] if len(a) == 0: - raise RuntimeError('No similarity constraints given for PGDM.') + raise RuntimeError('No similarity constraints given for MMC.') if len(c) == 0: - raise RuntimeError('No dissimilarity constraints given for PGDM.') + raise RuntimeError('No dissimilarity constraints given for MMC.') # init metric if self.A0 is None: @@ -102,7 +102,7 @@ def _process_inputs(self, X, constraints): return a,b,c,d def _fit_full(self, X, constraints): - """Learn full metric using PGDM. + """Learn full metric using MMC. Parameters ---------- X : (n x d) data matrix @@ -208,22 +208,22 @@ def _fit_full(self, X, constraints): if delta < self.convergence_threshold: break if self.verbose: - print('pgdm iter: %d, conv = %f, projections = %d' % (cycle, delta, it+1)) + print('mmc iter: %d, conv = %f, projections = %d' % (cycle, delta, it+1)) if delta > self.convergence_threshold: self.converged_ = False if self.verbose: - print('pgdm did not converge, conv = %f' % (delta,)) + print('mmc did not converge, conv = %f' % (delta,)) else: self.converged_ = True if self.verbose: - print('pgdm converged at iter %d, conv = %f' % (cycle, delta)) + print('mmc converged at iter %d, conv = %f' % (cycle, delta)) self.A_[:] = A_old self.n_iter_ = cycle return self def _fit_diag(self, X, constraints): - """Learn diagonal metric using PGDM. + """Learn diagonal metric using MMC. Parameters ---------- X : (n x d) data matrix @@ -275,7 +275,7 @@ def _fit_diag(self, X, constraints): w[:] = w_previous error = np.abs((obj_previous - obj_initial) / obj_previous) if self.verbose: - print('pgdm iter: %d, conv = %f' % (it, error)) + print('mmc iter: %d, conv = %f' % (it, error)) it += 1 self.A_ = np.diag(w) @@ -361,7 +361,7 @@ def transformer(self): L = V.T * w^(-1/2), with A = V*w*V.T being the eigenvector decomposition of A with the eigenvalues in the diagonal matrix w and the columns of V being the eigenvectors. - The Cholesky decomposition cannot be applied here, since PGDM learns only a positive + The Cholesky decomposition cannot be applied here, since MMC learns only a positive *semi*-definite Mahalanobis matrix. Returns @@ -375,8 +375,8 @@ def transformer(self): return V.T * np.sqrt(np.maximum(0, w[:,None])) -class PGDM_Supervised(PGDM): - """Probabilistic Global Distance Metric Learning (PGDM)""" +class MMC_Supervised(MMC): + """Mahalanobis Metric for Clustering (MMC)""" def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-6, num_labeled=np.inf, num_constraints=None, A0=None, diagonal=False, diagonal_c=1.0, verbose=False): @@ -402,15 +402,15 @@ def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-6, verbose : bool, optional if True, prints information while learning """ - PGDM.__init__(self, max_iter=max_iter, max_proj=max_proj, - convergence_threshold=convergence_threshold, - A0=A0, diagonal=diagonal, diagonal_c=diagonal_c, - verbose=verbose) + MMC.__init__(self, max_iter=max_iter, max_proj=max_proj, + convergence_threshold=convergence_threshold, + A0=A0, diagonal=diagonal, diagonal_c=diagonal_c, + verbose=verbose) self.num_labeled = num_labeled self.num_constraints = num_constraints def fit(self, X, y, random_state=np.random): - """Create constraints from labels and learn the PGDM model. + """Create constraints from labels and learn the MMC model. 
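A usage sketch of the two entry points (assumes scikit-learn's iris data; the explicit index pairs are arbitrary illustrative picks, not taken from the patch):

    import numpy as np
    from sklearn.datasets import load_iris
    from metric_learn import MMC, MMC_Supervised

    iris = load_iris()
    X, y = iris.data, iris.target

    # Supervised wrapper: similar/dissimilar pairs are sampled from the labels.
    mmc = MMC_Supervised(num_constraints=200)
    X_mmc = mmc.fit_transform(X, y)

    # Plain MMC: pass explicit (a, b, c, d) index arrays, where (a[k], b[k])
    # are similar pairs and (c[k], d[k]) are dissimilar pairs.
    a, b = np.array([0, 1]), np.array([1, 2])
    c, d = np.array([0, 1]), np.array([50, 51])
    mmc = MMC().fit(X, (a, b, c, d))
    M = mmc.metric()          # the learned Mahalanobis matrix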
Parameters ---------- X : (n x d) matrix @@ -430,4 +430,4 @@ def fit(self, X, y, random_state=np.random): random_state=random_state) pos_neg = c.positive_negative_pairs(num_constraints, random_state=random_state) - return PGDM.fit(self, X, pos_neg) + return MMC.fit(self, X, pos_neg) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 2aa37687..351b6298 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -6,8 +6,8 @@ from numpy.testing import assert_array_almost_equal from metric_learn import ( - LMNN, NCA, LFDA, Covariance, MLKR, PGDM, - LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, PGDM_Supervised) + LMNN, NCA, LFDA, Covariance, MLKR, MMC, + LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, MMC_Supervised) # Import this specially for testing. from metric_learn.lmnn import python_LMNN @@ -149,7 +149,7 @@ def test_iris(self): self.assertLess(csep, 0.25) -class TestPGDM(MetricTestCase): +class TestMMC(MetricTestCase): def test_iris(self): # Generate full set of constraints for comparison with reference implementation @@ -159,30 +159,30 @@ def test_iris(self): c, d = np.nonzero(np.triu(~mask, k=1)) # Full metric - pgdm = PGDM(convergence_threshold=0.01) - pgdm.fit(self.iris_points, [a,b,c,d]) + mmc = MMC(convergence_threshold=0.01) + mmc.fit(self.iris_points, [a,b,c,d]) expected = [[+0.00046504, +0.00083371, -0.00111959, -0.00165265], [+0.00083371, +0.00149466, -0.00200719, -0.00296284], [-0.00111959, -0.00200719, +0.00269546, +0.00397881], [-0.00165265, -0.00296284, +0.00397881, +0.00587320]] - assert_array_almost_equal(expected, pgdm.metric(), decimal=6) + assert_array_almost_equal(expected, mmc.metric(), decimal=6) # Diagonal metric - pgdm = PGDM(diagonal=True) - pgdm.fit(self.iris_points, [a,b,c,d]) + mmc = MMC(diagonal=True) + mmc.fit(self.iris_points, [a,b,c,d]) expected = [0, 0, 1.21045968, 1.22552608] - assert_array_almost_equal(np.diag(expected), pgdm.metric(), decimal=6) + assert_array_almost_equal(np.diag(expected), mmc.metric(), decimal=6) # Supervised Full - pgdm = PGDM_Supervised() - pgdm.fit(self.iris_points, self.iris_labels) - csep = class_separation(pgdm.transform(), self.iris_labels) + mmc = MMC_Supervised() + mmc.fit(self.iris_points, self.iris_labels) + csep = class_separation(mmc.transform(), self.iris_labels) self.assertLess(csep, 0.15) # Supervised Diagonal - pgdm = PGDM_Supervised(diagonal=True) - pgdm.fit(self.iris_points, self.iris_labels) - csep = class_separation(pgdm.transform(), self.iris_labels) + mmc = MMC_Supervised(diagonal=True) + mmc.fit(self.iris_points, self.iris_labels) + csep = class_separation(mmc.transform(), self.iris_labels) self.assertLess(csep, 0.2) diff --git a/test/test_base_metric.py b/test/test_base_metric.py index 3292260a..31db4e6f 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -63,13 +63,13 @@ def test_mlkr(self): "MLKR(A0=None, alpha=0.0001, epsilon=0.01, " "max_iter=1000, num_dims=None)") - def test_pgdm(self): - self.assertEqual(str(metric_learn.PGDM()), """ -PGDM(A0=None, convergence_threshold=0.001, diagonal=False, diagonal_c=1.0, - max_iter=100, max_proj=10000, verbose=False) + def test_mmc(self): + self.assertEqual(str(metric_learn.MMC()), """ +MMC(A0=None, convergence_threshold=0.001, diagonal=False, diagonal_c=1.0, + max_iter=100, max_proj=10000, verbose=False) """.strip('\n')) - self.assertEqual(str(metric_learn.PGDM_Supervised()), """ -PGDM_Supervised(A0=None, convergence_threshold=1e-06, diagonal=False, + 
self.assertEqual(str(metric_learn.MMC_Supervised()), """ +MMC_Supervised(A0=None, convergence_threshold=1e-06, diagonal=False, diagonal_c=1.0, max_iter=100, max_proj=10000, num_constraints=None, num_labeled=inf, verbose=False) """.strip('\n')) diff --git a/test/test_fit_transform.py b/test/test_fit_transform.py index 9e687a63..707815ec 100644 --- a/test/test_fit_transform.py +++ b/test/test_fit_transform.py @@ -5,7 +5,7 @@ from metric_learn import ( LMNN, NCA, LFDA, Covariance, MLKR, - LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, PGDM_Supervised) + LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, MMC_Supervised) class TestFitTransform(unittest.TestCase): @@ -118,15 +118,15 @@ def test_mlkr(self): assert_array_almost_equal(res_1, res_2) - def test_pgdm_supervised(self): + def test_mmc_supervised(self): seed = np.random.RandomState(1234) - pgdm = PGDM_Supervised(num_constraints=200) - pgdm.fit(self.X, self.y, random_state=seed) - res_1 = pgdm.transform() + mmc = MMC_Supervised(num_constraints=200) + mmc.fit(self.X, self.y, random_state=seed) + res_1 = mmc.transform() seed = np.random.RandomState(1234) - pgdm = PGDM_Supervised(num_constraints=200) - res_2 = pgdm.fit_transform(self.X, self.y, random_state=seed) + mmc = MMC_Supervised(num_constraints=200) + res_2 = mmc.fit_transform(self.X, self.y, random_state=seed) assert_array_almost_equal(res_1, res_2) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index 156a6c63..f1e1a09d 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -4,7 +4,7 @@ from metric_learn import ( LMNN, NCA, LFDA, Covariance, MLKR, - LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, PGDM_Supervised) + LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, MMC_Supervised) # Wrap the _Supervised methods with a deterministic wrapper for testing. 
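The `deterministic_mixin` referenced here is not part of this diff; a plausible sketch of what such a wrapper does (an assumption about the existing test helper, not the repository's actual code) is to pin the random state so `check_estimator` sees reproducible fits:

    import numpy as np

    class deterministic_mixin(object):
        def fit(self, X, y):
            # assumed behaviour: forward a fixed RandomState to the wrapped fit
            rs = np.random.RandomState(1234)
            return super(deterministic_mixin, self).fit(X, y, random_state=rs)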
@@ -22,7 +22,7 @@ class dITML(deterministic_mixin, ITML_Supervised): pass -class dPGDM(deterministic_mixin, PGDM_Supervised): +class dMMC(deterministic_mixin, MMC_Supervised): pass @@ -56,8 +56,8 @@ def test_lsml(self): def test_itml(self): check_estimator(dITML) - def test_pgdm(self): - check_estimator(dPGDM) + def test_mmc(self): + check_estimator(dMMC) # This fails due to a FloatingPointError # def test_sdml(self): From 11d7d0a64c5b270a0f71defef133a6ec1d77a761 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Barz?= Date: Thu, 25 May 2017 16:07:23 +0200 Subject: [PATCH 7/7] Addressed 2nd review by perimosocordiae --- metric_learn/mmc.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 36c16812..7760e1b1 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -85,9 +85,9 @@ def _process_inputs(self, X, constraints): no_ident = vector_norm(X[c] - X[d]) > 1e-9 c, d = c[no_ident], d[no_ident] if len(a) == 0: - raise RuntimeError('No similarity constraints given for MMC.') + raise ValueError('No non-trivial similarity constraints given for MMC.') if len(c) == 0: - raise RuntimeError('No dissimilarity constraints given for MMC.') + raise ValueError('No non-trivial dissimilarity constraints given for MMC.') # init metric if self.A0 is None: @@ -304,9 +304,12 @@ def _fD1(self, X, c, d, A): """ dim = X.shape[1] diff = X[c] - X[d] - M = np.einsum('ij,ik->ijk', diff, diff) # outer products of all rows in `diff` - dist = np.sqrt(np.einsum('ijk,jk', M, A)) # equivalent to: np.sqrt(np.sum(M * A[None,:,:], axis=(1,2))) - sum_deri = np.einsum('ijk,i->jk', M, 0.5 / (dist + 1e-6)) # equivalent to: np.sum(M / (2 * (dist[:,None,None] + 1e-6)), axis=0) + # outer products of all rows in `diff` + M = np.einsum('ij,ik->ijk', diff, diff) + # faster version of: dist = np.sqrt(np.sum(M * A[None,:,:], axis=(1,2))) + dist = np.sqrt(np.einsum('ijk,jk', M, A)) + # faster version of: sum_deri = np.sum(M / (2 * (dist[:,None,None] + 1e-6)), axis=0) + sum_deri = np.einsum('ijk,i->jk', M, 0.5 / (dist + 1e-6)) sum_dist = dist.sum() return sum_deri / (sum_dist + 1e-6) @@ -342,9 +345,9 @@ def _D_constraint(self, X, c, d, w): dist = np.sqrt(diff_sq.dot(w)) sum_deri1 = np.einsum('ij,i', diff_sq, 0.5 / np.maximum(dist, 1e-6)) sum_deri2 = np.einsum( - 'ijk,i', - np.einsum('ij,ik->ijk', diff_sq, diff_sq), - -0.25 / np.maximum(1e-6, dist**3) + 'ij,ik->jk', + diff_sq, + diff_sq / (-4 * np.maximum(1e-6, dist**3))[:,None] ) sum_dist = dist.sum() return (