From e568d671a41b11aee2a64995269b7e7a7fda3a84 Mon Sep 17 00:00:00 2001 From: aje Date: Fri, 24 Nov 2017 10:27:30 +0100 Subject: [PATCH 1/4] Fix mistake in LMNN Issue in function _find_impostors: - the squared euclidean distance is used to compute the margins in variable "margin_radii" - the euclidean distance is used (through the function sklearn.metrics.pairwise.pairwise_distances) to compute distances between samples of different labels in variable "dist" - the issue is that the impostors are found by testing "dist < margin_radii", which is wrong because "dist" represents distances, and "margin_radii" represents squared distances. I propose to solve this problem by always computing squared distances. Reviewer note: as written, this patch passes metric='seuclidean', which selects the standardized Euclidean distance; the squared Euclidean metric is named 'sqeuclidean'. These calls are replaced by a squared-distance helper in PATCH 2/4. --- metric_learn/lmnn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 3682f3f6..177954db 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -185,7 +185,7 @@ def _select_targets(self): target_neighbors = np.empty((self.X_.shape[0], self.k), dtype=int) for label in self.labels_: inds, = np.nonzero(self.label_inds_ == label) - dd = pairwise_distances(self.X_[inds]) + dd = pairwise_distances(self.X_[inds], metric='seuclidean') np.fill_diagonal(dd, np.inf) nn = np.argsort(dd)[..., :self.k] target_neighbors[inds] = inds[nn] @@ -198,7 +198,7 @@ def _find_impostors(self, furthest_neighbors): for label in self.labels_[:-1]: in_inds, = np.nonzero(self.label_inds_ == label) out_inds, = np.nonzero(self.label_inds_ > label) - dist = pairwise_distances(Lx[out_inds], Lx[in_inds]) + dist = pairwise_distances(Lx[out_inds], Lx[in_inds], metric='seuclidean') i1,j1 = np.nonzero(dist < margin_radii[out_inds][:,None]) i2,j2 = np.nonzero(dist < margin_radii[in_inds]) i = np.hstack((i1,i2)) From bd7a4ba2170adf04987295acaff09728a57e44cc Mon Sep 17 00:00:00 2001 From: aje Date: Fri, 24 Nov 2017 10:35:52 +0100 Subject: [PATCH 2/4] Faster LMNN The use of the function sklearn.metrics.pairwise_distances gives
very poor performance. Replace it with a faster function. Below is a small script that measures the computation time of LMNN: import time from sklearn.datasets import load_breast_cancer from metric_learn import lmnn dataset = load_breast_cancer() s = time.time() l1 = lmnn.LMNN(k=3) l1.fit(dataset["data"], dataset["target"]) e = time.time() print(e - s) Before this commit, this script executes in 32 seconds. After this commit, it executes in 1 second. --- metric_learn/lmnn.py | 54 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 177954db..cd5d9548 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -14,7 +14,6 @@ import warnings from collections import Counter from six.moves import xrange -from sklearn.metrics import pairwise_distances from sklearn.utils.validation import check_X_y, check_array from .base_metric import BaseMetricLearner @@ -185,7 +184,7 @@ def _select_targets(self): target_neighbors = np.empty((self.X_.shape[0], self.k), dtype=int) for label in self.labels_: inds, = np.nonzero(self.label_inds_ == label) - dd = pairwise_distances(self.X_[inds], metric='seuclidean') + dd = pairwiseEuclidean(self.X_[inds], self.X_[inds], squared=True) np.fill_diagonal(dd, np.inf) nn = np.argsort(dd)[..., :self.k] target_neighbors[inds] = inds[nn] @@ -198,7 +197,7 @@ def _find_impostors(self, furthest_neighbors): for label in self.labels_[:-1]: in_inds, = np.nonzero(self.label_inds_ == label) out_inds, = np.nonzero(self.label_inds_ > label) - dist = pairwise_distances(Lx[out_inds], Lx[in_inds], metric='seuclidean') + dist = pairwiseEuclidean(Lx[out_inds], Lx[in_inds], squared=True) i1,j1 = np.nonzero(dist < margin_radii[out_inds][:,None]) i2,j2 = np.nonzero(dist < margin_radii[in_inds]) i = np.hstack((i1,i2)) @@ -221,6 +220,55 @@ def _inplace_paired_L2(A, B): return np.einsum('...ij,...ij->...i', A, A) +def pairwiseEuclidean(a, b, squared=False): + """
+ Compute the pairwise euclidean distance between matrices a and b. + + + Parameters + ---------- + a : np.ndarray (n, f) + first matrix + b : np.ndarray (m, f) + second matrix + squared : boolean, optional (default False) + if True, return squared euclidean distance matrix + + + Returns + ------- + c : (n x m) np.ndarray + pairwise euclidean distance distance matrix + """ + # a is shape (n, f) and b shape (m, f). Return matrix c of shape (n, m). + # First compute in c the squared euclidean distance. And return its + # square root. At each cell [i,j] of c, we want to have + # sum{k in range(f)} ( (a[i,k] - b[j,k])^2 ). We know that + # (a-b)^2 = a^2 -2ab +b^2. Thus we want to have in each cell of c: + # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] +b[j,k]^2). + + # Multiply a by b transpose to obtain in each cell [i,j] of c the + # value sum{k in range(f)} ( a[i,k]b[j,k] ) + c = a.dot(b.T) + # multiply by -2 to have sum{k in range(f)} ( -2a[i,k]b[j,k] ) + np.multiply(c, -2, out=c) + + # Compute the vectors of the sum of squared elements. + a = np.power(a, 2).sum(axis=1) + b = np.power(b, 2).sum(axis=1) + + # Add the vectors in each columns (respectivly rows) of c. 
+ # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] ) + c += a.reshape(-1, 1) + # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] +b[j,k]^2) + c += b + + if not squared: + np.sqrt(c, out=c) + + return c + + def _count_edges(act1, act2, impostors, targets): imp = impostors[0,act1] c = Counter(zip(imp, targets[imp])) From 27eac1f2e7b54d25bd8d87b36f34b354dc1facfa Mon Sep 17 00:00:00 2001 From: aje Date: Sat, 25 Nov 2017 09:03:59 +0100 Subject: [PATCH 3/4] Use euclidean_distances from sklearn --- metric_learn/lmnn.py | 54 +++----------------------------------------- 1 file changed, 3 insertions(+), 51 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index cd5d9548..d4154e74 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -15,6 +15,7 @@ from collections import Counter from six.moves import xrange from sklearn.utils.validation import check_X_y, check_array +from sklearn.metrics import euclidean_distances from .base_metric import BaseMetricLearner @@ -184,7 +185,7 @@ def _select_targets(self): target_neighbors = np.empty((self.X_.shape[0], self.k), dtype=int) for label in self.labels_: inds, = np.nonzero(self.label_inds_ == label) - dd = pairwiseEuclidean(self.X_[inds], self.X_[inds], squared=True) + dd = euclidean_distances(self.X_[inds], self.X_[inds], squared=True) np.fill_diagonal(dd, np.inf) nn = np.argsort(dd)[..., :self.k] target_neighbors[inds] = inds[nn] @@ -197,7 +198,7 @@ def _find_impostors(self, furthest_neighbors): for label in self.labels_[:-1]: in_inds, = np.nonzero(self.label_inds_ == label) out_inds, = np.nonzero(self.label_inds_ > label) - dist = pairwiseEuclidean(Lx[out_inds], Lx[in_inds], squared=True) + dist = euclidean_distances(Lx[out_inds], Lx[in_inds], squared=True) i1,j1 = np.nonzero(dist < margin_radii[out_inds][:,None]) i2,j2 = np.nonzero(dist < margin_radii[in_inds]) i = np.hstack((i1,i2)) @@ -220,55 +221,6 @@ def _inplace_paired_L2(A, B): return np.einsum('...ij,...ij->...i', A, A) -def pairwiseEuclidean(a, b, 
squared=False): - """ - Compute the pairwise euclidean distance between matrices a and b. - - - Parameters - ---------- - a : np.ndarray (n, f) - first matrix - b : np.ndarray (m, f) - second matrix - squared : boolean, optional (default False) - if True, return squared euclidean distance matrix - - - Returns - ------- - c : (n x m) np.ndarray - pairwise euclidean distance distance matrix - """ - # a is shape (n, f) and b shape (m, f). Return matrix c of shape (n, m). - # First compute in c the squared euclidean distance. And return its - # square root. At each cell [i,j] of c, we want to have - # sum{k in range(f)} ( (a[i,k] - b[j,k])^2 ). We know that - # (a-b)^2 = a^2 -2ab +b^2. Thus we want to have in each cell of c: - # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] +b[j,k]^2). - - # Multiply a by b transpose to obtain in each cell [i,j] of c the - # value sum{k in range(f)} ( a[i,k]b[j,k] ) - c = a.dot(b.T) - # multiply by -2 to have sum{k in range(f)} ( -2a[i,k]b[j,k] ) - np.multiply(c, -2, out=c) - - # Compute the vectors of the sum of squared elements. - a = np.power(a, 2).sum(axis=1) - b = np.power(b, 2).sum(axis=1) - - # Add the vectors in each columns (respectivly rows) of c. 
- # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] ) - c += a.reshape(-1, 1) - # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] +b[j,k]^2) - c += b - - if not squared: - np.sqrt(c, out=c) - - return c - - def _count_edges(act1, act2, impostors, targets): imp = impostors[0,act1] c = Counter(zip(imp, targets[imp])) From 6f3786557d0f4be7c658b8254dae943469eaf415 Mon Sep 17 00:00:00 2001 From: aje Date: Sun, 26 Nov 2017 19:53:50 +0100 Subject: [PATCH 4/4] Remove unneeded parameter --- metric_learn/lmnn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index d4154e74..dea12f0c 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -185,7 +185,7 @@ def _select_targets(self): target_neighbors = np.empty((self.X_.shape[0], self.k), dtype=int) for label in self.labels_: inds, = np.nonzero(self.label_inds_ == label) - dd = euclidean_distances(self.X_[inds], self.X_[inds], squared=True) + dd = euclidean_distances(self.X_[inds], squared=True) np.fill_diagonal(dd, np.inf) nn = np.argsort(dd)[..., :self.k] target_neighbors[inds] = inds[nn]