Vectorize gradient for sinkhorn l1l2

kachayev · kachayev · commit eb0ca02048bf · 2023-08-20T21:45:29.000+02:00
diff --git a/ot/backend.py b/ot/backend.py
@@ -407,7 +407,7 @@ def power(self, a, exponents):
         """
         raise NotImplementedError()
 
-    def norm(self, a, axis=None):
+    def norm(self, a, axis=None, keepdims=False):
         r"""
         Computes the matrix frobenius norm.
 
@@ -1087,8 +1087,8 @@ def sqrt(self, a):
     def power(self, a, exponents):
         return np.power(a, exponents)
 
-    def norm(self, a, axis=None):
-        return np.linalg.norm(a, axis=axis)
+    def norm(self, a, axis=None, keepdims=False):
+        return np.linalg.norm(a, axis=axis, keepdims=keepdims)
 
     def any(self, a):
         return np.any(a)
@@ -1461,8 +1461,8 @@ def sqrt(self, a):
     def power(self, a, exponents):
         return jnp.power(a, exponents)
 
-    def norm(self, a, axis=None):
-        return jnp.linalg.norm(a, axis=axis)
+    def norm(self, a, axis=None, keepdims=False):
+        return jnp.linalg.norm(a, axis=axis, keepdims=keepdims)
 
     def any(self, a):
         return jnp.any(a)
@@ -1881,8 +1881,8 @@ def sqrt(self, a):
     def power(self, a, exponents):
         return torch.pow(a, exponents)
 
-    def norm(self, a, axis=None):
-        return torch.linalg.norm(a.double(), dim=axis)
+    def norm(self, a, axis=None, keepdims=False):
+        return torch.linalg.norm(a.double(), dim=axis, keepdim=keepdims)  # torch's documented name is `keepdim`
 
     def any(self, a):
         return torch.any(a)
@@ -2306,8 +2306,8 @@ def power(self, a, exponents):
     def dot(self, a, b):
         return cp.dot(a, b)
 
-    def norm(self, a, axis=None):
-        return cp.linalg.norm(a, axis=axis)
+    def norm(self, a, axis=None, keepdims=False):
+        return cp.linalg.norm(a, axis=axis, keepdims=keepdims)
 
     def any(self, a):
         return cp.any(a)
@@ -2717,8 +2717,8 @@ def sqrt(self, a):
     def power(self, a, exponents):
         return tnp.power(a, exponents)
 
-    def norm(self, a, axis=None):
-        return tf.math.reduce_euclidean_norm(a, axis=axis)
+    def norm(self, a, axis=None, keepdims=False):
+        return tf.math.reduce_euclidean_norm(a, axis=axis, keepdims=keepdims)
 
     def any(self, a):
         return tnp.any(a)
diff --git a/ot/da.py b/ot/da.py
@@ -148,8 +148,8 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M, reg, eta=0.1, numItermax=10,
 
 
 def sinkhorn_l1l2_gl(a, labels_a, b, M, reg, eta=0.1, numItermax=10,
-                     numInnerItermax=200, stopInnerThr=1e-9, verbose=False,
-                     log=False):
+                     numInnerItermax=200, stopInnerThr=1e-9, eps=1e-12,
+                     verbose=False, log=False):
     r"""
     Solve the entropic regularization optimal transport problem with group
     lasso regularization
@@ -202,6 +202,8 @@ def sinkhorn_l1l2_gl(a, labels_a, b, M, reg, eta=0.1, numItermax=10,
         Max number of iterations (inner sinkhorn solver)
     stopInnerThr : float, optional
         Stop threshold on error (inner sinkhorn solver) (>0)
+    eps : float, optional (default=1e-12)
+        Small value to avoid division by zero
     verbose : bool, optional
         Print information along iterations
     log : bool, optional
@@ -241,19 +243,13 @@ def sinkhorn_l1l2_gl(a, labels_a, b, M, reg, eta=0.1, numItermax=10,
 
     def f(G):
         G_split = nx.repeat(G.T[:, :, None], n_labels, axis=2)
-        return nx.norm(G_split * unroll_labels_idx, axis=1).sum()
-
-    lstlab = nx.unique(labels_a)
+        return nx.sum(nx.norm(G_split * unroll_labels_idx, axis=1))
 
     def df(G):
-        W = nx.zeros(G.shape, type_as=G)
-        for i in range(G.shape[1]):
-            for lab in lstlab:
-                temp = G[labels_a == lab, i]
-                n = nx.norm(temp)
-                if n:
-                    W[labels_a == lab, i] = temp / n
-        return W
+        G_split = nx.repeat(G.T[:, :, None], n_labels, axis=2) * unroll_labels_idx
+        W = nx.norm(G_split, axis=1, keepdims=True)  # G_split is already masked per label
+        G_norm = G_split / nx.clip(W, eps, None)  # clip avoids 0/0 for empty groups
+        return nx.sum(G_norm, axis=2).T
 
     return gcg(a, b, M, reg, eta, f, df, G0=None, numItermax=numItermax,
                numInnerItermax=numInnerItermax, stopThr=stopInnerThr,
diff --git a/test/test_da.py b/test/test_da.py
@@ -802,30 +802,48 @@ def test_emd_laplace_class(nx):
     assert_equal(transp_ys.shape[1], len(np.unique(nx.to_numpy(yt))))
 
 
-def test_sinkhorn_l1l2_gl_cost_vectorized():
+def test_sinkhorn_l1l2_gl_cost_vectorized(nx):
     n_samples, n_labels = 150, 3
     rng = np.random.RandomState(42)
     G = rng.rand(n_samples, n_samples)
     labels_a = rng.randint(n_labels, size=(n_samples,))
+    G, labels_a = nx.from_numpy(G), nx.from_numpy(labels_a)
 
     # previously used implementation for the cost estimator
-    lstlab = np.unique(labels_a)
+    lstlab = nx.unique(labels_a)
 
     def f(G):
         res = 0
         for i in range(G.shape[1]):
             for lab in lstlab:
                 temp = G[labels_a == lab, i]
-                res += np.linalg.norm(temp)
+                res += nx.norm(temp)
         return res
 
+    def df(G):
+        W = nx.zeros(G.shape, type_as=G)
+        for i in range(G.shape[1]):
+            for lab in lstlab:
+                temp = G[labels_a == lab, i]
+                n = nx.norm(temp)
+                if n:
+                    W[labels_a == lab, i] = temp / n
+        return W
+
     # new vectorized implementation for the cost estimator
-    lstlab, lstlab_idx = np.unique(labels_a, return_inverse=True)
-    n_samples = lstlab.shape[0]
-    midx = np.eye(n_samples, dtype='int32')[None, lstlab_idx]
+    labels_u, labels_idx = nx.unique(labels_a, return_inverse=True)
+    n_labels = labels_u.shape[0]
+    unroll_labels_idx = nx.eye(n_labels, type_as=labels_u)[None, labels_idx]
 
     def f2(G):
-        G_split = np.repeat(G.T[:, :, None], n_samples, axis=2)
-        return np.linalg.norm(G_split * midx, axis=1).sum()
+        G_split = nx.repeat(G.T[:, :, None], n_labels, axis=2)
+        return nx.sum(nx.norm(G_split * unroll_labels_idx, axis=1))
+
+    def df2(G):
+        G_split = nx.repeat(G.T[:, :, None], n_labels, axis=2) * unroll_labels_idx
+        W = nx.norm(G_split, axis=1, keepdims=True)  # G_split is already masked per label
+        G_norm = G_split / nx.clip(W, 1e-12, None)
+        return nx.sum(G_norm, axis=2).T
 
     assert np.allclose(f(G), f2(G))
+    assert np.allclose(df(G), df2(G))