import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-torch.set_default_device(device)

######################################################################
# NestedTensor Initialization
# From the Python frontend, a nestedtensor can be created from a list of tensors.
# We denote nt[i] as the ith tensor component of a nestedtensor.
nt = torch.nested.nested_tensor([torch.arange(12).reshape(
-    2, 6), torch.arange(18).reshape(3, 6)], dtype=torch.float)
+    2, 6), torch.arange(18).reshape(3, 6)], dtype=torch.float, device=device)
print(f"{nt=}")
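As an aside (not part of this diff), here is a minimal sketch of the indexing semantics described in the comment above. It assumes the `nt` built in this hunk and only inspects it; each component keeps its own shape.

    # Sketch only: inspect the components of the nestedtensor constructed above.
    print(nt.is_nested)   # True
    print(nt[0].shape)    # torch.Size([2, 6]) -- first component
    print(nt[1].shape)    # torch.Size([3, 6]) -- second component
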
######################################################################
# Applying the operation on a nestedtensor is equivalent to
# applying the operation to the underlying tensor components,
# with the result being a nestedtensor as well.
-nt_mm = torch.nested.nested_tensor([torch.randn((2, 3, 4)), torch.randn((2, 3, 5))])
+nt_mm = torch.nested.nested_tensor([torch.randn((2, 3, 4)), torch.randn((2, 3, 5))], device=device)
nt3 = torch.matmul(nt_transposed, nt_mm)
print(f"Result of Matmul:\n{nt3}")
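The per-component claim in the comment above can be sanity-checked with a small sketch (not part of the diff; it reuses `nt_transposed`, `nt_mm`, and `nt3` from this hunk):

    # Sketch only: the nested matmul agrees with matmul applied to each unbound component pair.
    for a, b, c in zip(nt_transposed.unbind(), nt_mm.unbind(), nt3.unbind()):
        print(torch.allclose(torch.matmul(a, b), c))   # expected: True for every component
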
@@ -319,7 +318,7 @@ def mha_padded(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, nhea

    # Have to manipulate masks in order to apply them to the attention weights
    key_padding_mask = attn_mask_q.view(N, 1, 1, L_t).expand(-1, nheads, -1, -1).reshape(N * nheads, 1, L_t).to(device=device)
-    attn_mask = torch.zeros(key_padding_mask.shape, dtype=torch.float32)
+    attn_mask = torch.zeros(key_padding_mask.shape, device=device, dtype=torch.float32)
    attn_mask = attn_mask.masked_fill_(key_padding_mask, float("-inf"))

    # Zero out the attention weights where the mask is True by adding -inf prior to softmax
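The additive-mask trick used in this hunk is worth spelling out with a toy example (sketch only, arbitrary numbers not taken from the PR): a score that receives -inf before softmax ends up with exactly zero attention weight.

    # Sketch only: softmax over scores with -inf added at the padded position.
    scores = torch.tensor([1.0, 2.0, 3.0], device=device)
    pad = torch.tensor([False, True, False], device=device)        # True marks padding
    additive = torch.zeros(3, device=device).masked_fill_(pad, float("-inf"))
    print(F.softmax(scores + additive, dim=-1))                    # middle weight is exactly 0
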
@@ -385,10 +384,10 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray:
# create inputs

# create parameters
-W_q, b_q = torch.randn((E_total, E_q)), torch.randn(E_total)
-W_k, b_k = torch.randn((E_total, E_k)), torch.randn(E_total)
-W_v, b_v = torch.randn((E_total, E_v)), torch.randn(E_total)
-W_out, b_out = torch.randn((E_out, E_total)), torch.randn(E_out)
+W_q, b_q = torch.randn((E_total, E_q), device=device), torch.randn(E_total, device=device)
+W_k, b_k = torch.randn((E_total, E_k), device=device), torch.randn(E_total, device=device)
+W_v, b_v = torch.randn((E_total, E_v), device=device), torch.randn(E_total, device=device)
+W_out, b_out = torch.randn((E_out, E_total), device=device), torch.randn(E_out, device=device)

# create nested input
queries = []
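These weights are consumed by the attention implementations defined elsewhere in the tutorial file. As a hedged sketch (assuming, as is standard for in-projections, that they are applied via F.linear, i.e. x @ W.T + b, and that E_q/E_total are the dimensions defined earlier in the file), W_q maps the last dimension from E_q to E_total:

    # Sketch only (toy dense input, not from the diff): project with W_q/b_q.
    x = torch.randn((2, E_q), device=device)
    print(F.linear(x, W_q, b_q).shape)    # torch.Size([2, E_total])
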
@@ -397,9 +396,9 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray:
for i in range(N):
    l = sentence_lengths[i]
    s = l
-    queries.append(torch.randn((l, E_q)))
-    keys.append(torch.randn((s, E_k)))
-    values.append(torch.randn((s, E_v)))
+    queries.append(torch.randn((l, E_q), device=device))
+    keys.append(torch.randn((s, E_k), device=device))
+    values.append(torch.randn((s, E_v), device=device))
query = torch.nested.nested_tensor(queries)
key = torch.nested.nested_tensor(keys)
value = torch.nested.nested_tensor(values)
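For comparison against the padded code path, the nested inputs built in this hunk can be densified. A sketch (not part of the diff; 0.0 chosen here as the padding value):

    # Sketch only: convert the nested query into a dense, zero-padded tensor.
    padded_query = torch.nested.to_padded_tensor(query, padding=0.0)
    print(padded_query.shape)    # (N, max(sentence_lengths), E_q)
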
@@ -454,7 +453,7 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray:

# embeddings are assumed to be the same
E = E_total
-mha_lib = torch.nn.MultiheadAttention(E, nheads, batch_first=True)
+mha_lib = torch.nn.MultiheadAttention(E, nheads, batch_first=True, device=device)
mha_lib.eval()

######################################################################
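Later in the tutorial this module is fed the nestedtensor inputs directly. A hedged usage sketch, assuming `query`/`key`/`value` from the hunks above (all with embedding dim E, as the comment notes) and that the inference fast path applies (eval mode, no autograd, need_weights=False):

    # Sketch only: nn.MultiheadAttention can take nestedtensor q/k/v and return a nestedtensor.
    with torch.inference_mode():
        out_lib, _ = mha_lib(query, key, value, need_weights=False)
    print(out_lib.is_nested)    # True
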