From e568d671a41b11aee2a64995269b7e7a7fda3a84 Mon Sep 17 00:00:00 2001
From: aje <leo_g_autheron@hotmail.fr>
Date: Fri, 24 Nov 2017 10:27:30 +0100
Subject: [PATCH 1/4] Fix mistake in LMNN

Issue in function _find_impostors:
 - the squared euclidean distance is used to compute the margins in variable "margin_radii"
 - the euclidean distance is used (through the function sklearn.metrics.pairwise.pairwise_distances) to compute distances between samples of different labels in variable "dist"
 - the issue is that the impostors are found by testing "dist < margin_radii", which is wrong because "dist" holds distances whereas "margin_radii" holds squared distances.

I propose to solve this problem by always computing squared distances.
---
 metric_learn/lmnn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py
index 3682f3f6..177954db 100644
--- a/metric_learn/lmnn.py
+++ b/metric_learn/lmnn.py
@@ -185,7 +185,7 @@ def _select_targets(self):
     target_neighbors = np.empty((self.X_.shape[0], self.k), dtype=int)
     for label in self.labels_:
       inds, = np.nonzero(self.label_inds_ == label)
-      dd = pairwise_distances(self.X_[inds])
+      dd = pairwise_distances(self.X_[inds], metric='seuclidean')
       np.fill_diagonal(dd, np.inf)
       nn = np.argsort(dd)[..., :self.k]
       target_neighbors[inds] = inds[nn]
@@ -198,7 +198,7 @@ def _find_impostors(self, furthest_neighbors):
     for label in self.labels_[:-1]:
       in_inds, = np.nonzero(self.label_inds_ == label)
       out_inds, = np.nonzero(self.label_inds_ > label)
-      dist = pairwise_distances(Lx[out_inds], Lx[in_inds])
+      dist = pairwise_distances(Lx[out_inds], Lx[in_inds], metric='seuclidean')
       i1,j1 = np.nonzero(dist < margin_radii[out_inds][:,None])
       i2,j2 = np.nonzero(dist < margin_radii[in_inds])
       i = np.hstack((i1,i2))

From bd7a4ba2170adf04987295acaff09728a57e44cc Mon Sep 17 00:00:00 2001
From: aje <leo_g_autheron@hotmail.fr>
Date: Fri, 24 Nov 2017 10:35:52 +0100
Subject: [PATCH 2/4] Faster LMNN

Using the function sklearn.metrics.pairwise_distances gives very poor performance.
Replace it with a faster function.

Below is a small script that measures the computation time of LMNN:

import time
from sklearn.datasets import load_breast_cancer
from metric_learn import lmnn
dataset = load_breast_cancer()
s = time.time()
l1 = lmnn.LMNN(k=3)
l1.fit(dataset["data"], dataset["target"])
e = time.time()
print(e - s)

Before this commit, this script executes in 32 seconds
After this commit, this script executes in 1 second
---
 metric_learn/lmnn.py | 54 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 3 deletions(-)

diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py
index 177954db..cd5d9548 100644
--- a/metric_learn/lmnn.py
+++ b/metric_learn/lmnn.py
@@ -14,7 +14,6 @@
 import warnings
 from collections import Counter
 from six.moves import xrange
-from sklearn.metrics import pairwise_distances
 from sklearn.utils.validation import check_X_y, check_array
 
 from .base_metric import BaseMetricLearner
@@ -185,7 +184,7 @@ def _select_targets(self):
     target_neighbors = np.empty((self.X_.shape[0], self.k), dtype=int)
     for label in self.labels_:
       inds, = np.nonzero(self.label_inds_ == label)
-      dd = pairwise_distances(self.X_[inds], metric='seuclidean')
+      dd = pairwiseEuclidean(self.X_[inds], self.X_[inds], squared=True)
       np.fill_diagonal(dd, np.inf)
       nn = np.argsort(dd)[..., :self.k]
       target_neighbors[inds] = inds[nn]
@@ -198,7 +197,7 @@ def _find_impostors(self, furthest_neighbors):
     for label in self.labels_[:-1]:
       in_inds, = np.nonzero(self.label_inds_ == label)
       out_inds, = np.nonzero(self.label_inds_ > label)
-      dist = pairwise_distances(Lx[out_inds], Lx[in_inds], metric='seuclidean')
+      dist = pairwiseEuclidean(Lx[out_inds], Lx[in_inds], squared=True)
       i1,j1 = np.nonzero(dist < margin_radii[out_inds][:,None])
       i2,j2 = np.nonzero(dist < margin_radii[in_inds])
       i = np.hstack((i1,i2))
@@ -221,6 +220,55 @@ def _inplace_paired_L2(A, B):
   return np.einsum('...ij,...ij->...i', A, A)
 
 
+def pairwiseEuclidean(a, b, squared=False):
+    """
+    Compute the pairwise euclidean distance between matrices a and b.
+
+
+    Parameters
+    ----------
+    a : np.ndarray (n, f)
+        first matrix
+    b : np.ndarray (m, f)
+        second matrix
+    squared : boolean, optional (default False)
+        if True, return squared euclidean distance matrix
+
+
+    Returns
+    -------
+    c : (n x m) np.ndarray
+        pairwise euclidean distance distance matrix
+    """
+    # a is shape (n, f) and b shape (m, f). Return matrix c of shape (n, m).
+    # First compute in c the squared euclidean distance. And return its
+    # square root. At each cell [i,j] of c, we want to have
+    # sum{k in range(f)} ( (a[i,k] - b[j,k])^2 ). We know that
+    # (a-b)^2 = a^2 -2ab +b^2. Thus we want to have in each cell of c:
+    # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] +b[j,k]^2).
+
+    # Multiply a by b transpose to obtain in each cell [i,j] of c the
+    # value sum{k in range(f)} ( a[i,k]b[j,k] )
+    c = a.dot(b.T)
+    # multiply by -2 to have sum{k in range(f)} ( -2a[i,k]b[j,k] )
+    np.multiply(c, -2, out=c)
+
+    # Compute the vectors of the sum of squared elements.
+    a = np.power(a, 2).sum(axis=1)
+    b = np.power(b, 2).sum(axis=1)
+
+    # Add the vectors in each columns (respectivly rows) of c.
+    # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] )
+    c += a.reshape(-1, 1)
+    # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] +b[j,k]^2)
+    c += b
+
+    if not squared:
+        np.sqrt(c, out=c)
+
+    return c
+
+
 def _count_edges(act1, act2, impostors, targets):
   imp = impostors[0,act1]
   c = Counter(zip(imp, targets[imp]))

From 27eac1f2e7b54d25bd8d87b36f34b354dc1facfa Mon Sep 17 00:00:00 2001
From: aje <leo_g_autheron@hotmail.fr>
Date: Sat, 25 Nov 2017 09:03:59 +0100
Subject: [PATCH 3/4] Use euclidean_distances from sklearn

---
 metric_learn/lmnn.py | 54 +++-----------------------------------------
 1 file changed, 3 insertions(+), 51 deletions(-)

diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py
index cd5d9548..d4154e74 100644
--- a/metric_learn/lmnn.py
+++ b/metric_learn/lmnn.py
@@ -15,6 +15,7 @@
 from collections import Counter
 from six.moves import xrange
 from sklearn.utils.validation import check_X_y, check_array
+from sklearn.metrics import euclidean_distances
 
 from .base_metric import BaseMetricLearner
 
@@ -184,7 +185,7 @@ def _select_targets(self):
     target_neighbors = np.empty((self.X_.shape[0], self.k), dtype=int)
     for label in self.labels_:
       inds, = np.nonzero(self.label_inds_ == label)
-      dd = pairwiseEuclidean(self.X_[inds], self.X_[inds], squared=True)
+      dd = euclidean_distances(self.X_[inds], self.X_[inds], squared=True)
       np.fill_diagonal(dd, np.inf)
       nn = np.argsort(dd)[..., :self.k]
       target_neighbors[inds] = inds[nn]
@@ -197,7 +198,7 @@ def _find_impostors(self, furthest_neighbors):
     for label in self.labels_[:-1]:
       in_inds, = np.nonzero(self.label_inds_ == label)
       out_inds, = np.nonzero(self.label_inds_ > label)
-      dist = pairwiseEuclidean(Lx[out_inds], Lx[in_inds], squared=True)
+      dist = euclidean_distances(Lx[out_inds], Lx[in_inds], squared=True)
       i1,j1 = np.nonzero(dist < margin_radii[out_inds][:,None])
       i2,j2 = np.nonzero(dist < margin_radii[in_inds])
       i = np.hstack((i1,i2))
@@ -220,55 +221,6 @@ def _inplace_paired_L2(A, B):
   return np.einsum('...ij,...ij->...i', A, A)
 
 
-def pairwiseEuclidean(a, b, squared=False):
-    """
-    Compute the pairwise euclidean distance between matrices a and b.
-
-
-    Parameters
-    ----------
-    a : np.ndarray (n, f)
-        first matrix
-    b : np.ndarray (m, f)
-        second matrix
-    squared : boolean, optional (default False)
-        if True, return squared euclidean distance matrix
-
-
-    Returns
-    -------
-    c : (n x m) np.ndarray
-        pairwise euclidean distance distance matrix
-    """
-    # a is shape (n, f) and b shape (m, f). Return matrix c of shape (n, m).
-    # First compute in c the squared euclidean distance. And return its
-    # square root. At each cell [i,j] of c, we want to have
-    # sum{k in range(f)} ( (a[i,k] - b[j,k])^2 ). We know that
-    # (a-b)^2 = a^2 -2ab +b^2. Thus we want to have in each cell of c:
-    # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] +b[j,k]^2).
-
-    # Multiply a by b transpose to obtain in each cell [i,j] of c the
-    # value sum{k in range(f)} ( a[i,k]b[j,k] )
-    c = a.dot(b.T)
-    # multiply by -2 to have sum{k in range(f)} ( -2a[i,k]b[j,k] )
-    np.multiply(c, -2, out=c)
-
-    # Compute the vectors of the sum of squared elements.
-    a = np.power(a, 2).sum(axis=1)
-    b = np.power(b, 2).sum(axis=1)
-
-    # Add the vectors in each columns (respectivly rows) of c.
-    # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] )
-    c += a.reshape(-1, 1)
-    # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] +b[j,k]^2)
-    c += b
-
-    if not squared:
-        np.sqrt(c, out=c)
-
-    return c
-
-
 def _count_edges(act1, act2, impostors, targets):
   imp = impostors[0,act1]
   c = Counter(zip(imp, targets[imp]))

From 6f3786557d0f4be7c658b8254dae943469eaf415 Mon Sep 17 00:00:00 2001
From: aje <leo_g_autheron@hotmail.fr>
Date: Sun, 26 Nov 2017 19:53:50 +0100
Subject: [PATCH 4/4] Remove unneeded parameter

---
 metric_learn/lmnn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py
index d4154e74..dea12f0c 100644
--- a/metric_learn/lmnn.py
+++ b/metric_learn/lmnn.py
@@ -185,7 +185,7 @@ def _select_targets(self):
     target_neighbors = np.empty((self.X_.shape[0], self.k), dtype=int)
     for label in self.labels_:
       inds, = np.nonzero(self.label_inds_ == label)
-      dd = euclidean_distances(self.X_[inds], self.X_[inds], squared=True)
+      dd = euclidean_distances(self.X_[inds], squared=True)
       np.fill_diagonal(dd, np.inf)
       nn = np.argsort(dd)[..., :self.k]
       target_neighbors[inds] = inds[nn]