diff --git a/.gitignore b/.gitignore
index 715947e8bfa..0d01ed4e467 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .ipynb_checkpoints
-*~
\ No newline at end of file
+data
+*~
diff --git a/Creating extensions using numpy and scipy.ipynb b/Creating extensions using numpy and scipy.ipynb
index f87b473706a..8112cd1360e 100644
--- a/Creating extensions using numpy and scipy.ipynb	
+++ b/Creating extensions using numpy and scipy.ipynb	
@@ -86,25 +86,25 @@
      "output_type": "stream",
      "text": [
       "\n",
-      "  4.8235   2.8210   4.0698  11.3034   4.0292\n",
-      "  2.8678   6.5351   0.9228  15.3372   3.8725\n",
-      "  4.0342   7.6988   3.8099   4.0796  12.9163\n",
-      "  9.2561   6.0761   6.0133   5.7306   5.8949\n",
-      "  6.1931   3.0239   8.3571   9.1645   2.3575\n",
-      "  9.2561   7.6919   5.4074   9.5971   5.8949\n",
-      "  4.0342  13.2063   9.0728   5.2962  12.9163\n",
-      "  2.8678   2.4252   2.0834   4.9570   3.8725\n",
+      "  6.9997  11.0343   9.7395   6.0724   6.0526\n",
+      "  7.0250  11.4841   7.1110   5.6337   8.6441\n",
+      "  7.8062  10.9281   9.8279  23.4972   7.4842\n",
+      "  6.4962   4.5987   0.7936   3.9360   4.9595\n",
+      "  9.7913  10.3732   1.6261   2.0874  14.5295\n",
+      "  6.4962   5.7111   1.9793   8.8037   4.9595\n",
+      "  7.8062   8.7752   6.4442  14.1250   7.4842\n",
+      "  7.0250   5.4642   1.7983   4.4346   8.6441\n",
       "[torch.FloatTensor of size 8x5]\n",
       "\n",
-      "\n",
-      " 0.1849 -0.0055  0.0743 -0.0751  0.1089 -0.0751  0.0743 -0.0055\n",
-      "-0.0662  0.1506  0.1307 -0.0629 -0.1199  0.0800 -0.0873  0.1036\n",
-      "-0.0024 -0.0936  0.0083  0.0327 -0.1370 -0.2486 -0.0117 -0.0216\n",
-      "-0.0074 -0.1277  0.0631  0.0348  0.0422  0.1335  0.0221 -0.0900\n",
-      " 0.1353  0.0098  0.0030  0.0408 -0.0442  0.0408  0.0030  0.0098\n",
-      "-0.0074 -0.0900  0.0221  0.1335  0.0422  0.0348  0.0631 -0.1277\n",
-      "-0.0024 -0.0216 -0.0117 -0.2486 -0.1370  0.0327  0.0083 -0.0936\n",
-      "-0.0662  0.1036 -0.0873  0.0800 -0.1199 -0.0629  0.1307  0.1506\n",
+      "Variable containing:\n",
+      "-0.0129  0.0330  0.0036 -0.0737  0.2354 -0.0737  0.0036  0.0330\n",
+      " 0.0542  0.0986 -0.0382 -0.1137 -0.0944 -0.0973 -0.0172 -0.0021\n",
+      "-0.1538 -0.1444  0.0356  0.1590  0.0588 -0.0188 -0.0611  0.0346\n",
+      " 0.1511  0.0370 -0.2513 -0.1518  0.1513 -0.2312 -0.0896 -0.1450\n",
+      "-0.1668 -0.0814  0.1954  0.1405  0.2191  0.1405  0.1954 -0.0814\n",
+      " 0.1511 -0.1450 -0.0896 -0.2312  0.1513 -0.1518 -0.2513  0.0370\n",
+      "-0.1538  0.0346 -0.0611 -0.0188  0.0588  0.1590  0.0356 -0.1444\n",
+      " 0.0542 -0.0021 -0.0172 -0.0973 -0.0944 -0.1137 -0.0382  0.0986\n",
       "[torch.FloatTensor of size 8x8]\n",
       "\n"
      ]
@@ -139,7 +139,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 4,
    "metadata": {
     "collapsed": false
    },
@@ -182,7 +182,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 5,
    "metadata": {
     "collapsed": false
    },
@@ -192,33 +192,33 @@
      "output_type": "stream",
      "text": [
       "[Parameter containing:\n",
-      "-0.1271  0.8109  0.4178\n",
-      "-0.5126 -1.1835 -0.2776\n",
-      " 0.4214  0.0886  1.0216\n",
+      " 0.0460  0.5052  0.9281\n",
+      " 0.8355  1.2642 -0.1283\n",
+      " 1.7027 -0.3146 -0.6927\n",
       "[torch.FloatTensor of size 3x3]\n",
       "]\n",
       "Variable containing:\n",
-      " 4.9120  0.6210  1.7908 -0.6933  3.4223  0.4025  2.0330  1.9110\n",
-      "-1.6563 -0.9113 -2.3579  0.5714 -2.4509 -1.1477 -2.2362  0.2235\n",
-      "-2.5879 -0.2629  0.0876  1.1707  1.2481  1.6186  1.2425  3.4960\n",
-      " 2.6881  2.0360  1.5574 -0.8602 -2.8442 -2.2571 -1.5803 -2.0943\n",
-      "-1.0176 -2.6565 -0.5091  1.5564  1.4575  3.1081  0.9570  1.0759\n",
-      "-2.7108  0.7214 -3.5160 -2.0563  0.6138 -2.6700 -1.1769 -1.0721\n",
-      " 2.8449  3.2918  1.6901  1.5427  2.4700  0.3433  2.2472  1.3501\n",
-      "-3.9733 -1.4927 -0.6596 -1.2467 -2.1322  1.1351 -1.4640 -0.5982\n",
+      " 1.4619 -4.0543  0.4391 -0.5423 -4.3719  3.9728 -0.4084 -2.8224\n",
+      "-3.6799 -3.9278  4.9932 -3.8952  3.0663  1.6303  2.9775  1.1806\n",
+      "-3.1694  2.1434  0.4432  1.6941  1.9344 -0.1196  1.1259  4.3571\n",
+      "-0.7934 -1.4610  2.2360  0.6406  0.3729  1.9140  0.2427  0.4298\n",
+      "-2.2961 -0.4189  5.6658  0.8090 -1.3030  2.2934  0.7164 -0.0272\n",
+      " 1.0649  1.0400 -1.3774 -0.2026 -0.9841  1.7192  3.0843  3.4241\n",
+      " 3.2743 -1.8780 -2.3084  0.8508  1.1622  0.6060  2.5559  1.0228\n",
+      "-2.3282 -1.1790 -2.4604 -1.9252 -1.3962  1.1054  3.6035  3.1302\n",
       "[torch.FloatTensor of size 8x8]\n",
       "\n",
-      "\n",
-      "-0.0466 -0.1651  0.2731  0.0436 -0.1487 -0.3089  0.7800  0.4718  0.1200 -0.5005\n",
-      " 0.1600 -0.8988  1.6481  0.3330  0.3586 -2.7015  0.7774 -0.2702  1.4118  0.0614\n",
-      " 1.1303 -2.6691  0.4635  1.2966  2.5482 -3.1470  2.8663 -1.8794 -1.9309 -0.8698\n",
-      "-0.0614  1.5925 -0.7043 -0.9832 -0.7737 -4.6351  5.2933  0.2257 -0.9895  0.9198\n",
-      "-0.9014  2.8442 -2.7092  2.2500  1.1892 -5.0975  2.4289  0.2922 -2.1747  0.8316\n",
-      "-2.7050  3.6107 -1.7208 -0.4780 -0.3891 -2.2356  1.2152 -1.4541 -0.5707  1.2749\n",
-      "-2.1614  2.0130 -4.0183  0.6822  0.9159  0.5670  3.7633 -0.9087  0.0326 -0.0958\n",
-      " 0.3509  0.1484 -1.2759 -0.8248  0.8566 -2.6416  2.8875 -1.2788  1.1253  0.5939\n",
-      "-0.0029  0.4912  1.8060 -1.4529  2.6439 -0.9157  0.5279 -3.4779  0.2804 -0.2260\n",
-      "-0.1932  0.1283 -0.1745 -0.4872  1.0467 -0.1953  0.3003  1.3696  0.8338  0.4173\n",
+      "Variable containing:\n",
+      " 0.0427  0.7780  1.7383  1.8333  3.8198  0.1135 -3.5576 -4.3994 -0.4354 -0.6021\n",
+      " 0.4661  1.2470  2.1080  6.3960  0.6894 -4.5144 -3.2005 -0.2762  0.3508  1.7803\n",
+      " 0.8492  0.9083  4.1836  0.6133 -3.4092 -1.8541  0.2254  3.6970  1.0382  0.5031\n",
+      " 0.0919  1.7864  1.5422  0.2942  2.0176  1.0741  0.8390  2.6984  2.4786  0.2636\n",
+      " 0.2600  0.5248  2.3759  2.1921 -3.4520 -3.2025  2.6008 -0.7395  0.3200  0.0964\n",
+      " 0.1632  1.9750  2.5973 -2.0378 -5.2213  1.2097  1.3411  1.6995 -1.4448 -2.6965\n",
+      " 0.5332  0.8034 -3.0446 -6.2269 -3.4281 -0.5354 -0.4278 -0.7310 -1.1542  0.7947\n",
+      " 0.1243 -1.0476 -2.9011 -5.9247 -2.5209 -3.1030 -4.4343 -2.7956  1.4640  0.0090\n",
+      "-0.9033 -0.4323 -2.5873 -1.8884 -1.4657 -1.4747 -0.0032  1.4012 -0.7892 -0.1049\n",
+      " 0.0739 -0.7349 -0.3925 -0.9291 -1.1198  0.5321  1.9748  0.1242 -0.4062  0.3108\n",
       "[torch.FloatTensor of size 10x10]\n",
       "\n"
      ]
@@ -233,34 +233,25 @@
     "output.backward(torch.randn(8, 8))\n",
     "print(input.grad)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 2",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "python2"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 2
+    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.12"
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
   }
  },
  "nbformat": 4,
diff --git a/Deep Learning with PyTorch.ipynb b/Deep Learning with PyTorch.ipynb
index acc44b32e87..b0761451771 100644
--- a/Deep Learning with PyTorch.ipynb	
+++ b/Deep Learning with PyTorch.ipynb	
@@ -19,7 +19,9 @@
     "It's a Python based scientific computing package targeted at two sets of audiences:\n",
     "\n",
     "- A replacement for numpy to use the power of GPUs\n",
-    "- a deep learning research platform that provides maximum flexibility and speed"
+    "- a deep learning research platform that provides maximum flexibility and speed\n",
+    "\n",
+    "**If you want to complete the full tutorial, including training a neural network for image classification, you have to install the `torchvision` package.**"
    ]
   },
   {
@@ -88,6 +90,13 @@
     "x.size()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "*NOTE: `torch.Size` is in fact a tuple, so it supports the same operations*"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -293,23 +302,23 @@
     "## Autograd: automatic differentiation\n",
     "\n",
     "The `autograd` package provides automatic differentiation for all operations on Tensors.  \n",
-    "It is a define-by-run framework, which means that your backprop is defined by how your code is run. \n",
+    "It is a define-by-run framework, which means that your backprop is defined by how your code is run, and that every single iteration can be different. \n",
     "\n",
     "Let us see this in more simple terms with some examples.\n",
     "\n",
     "`autograd.Variable` is the central class of the package. \n",
-    "It wraps a Tensor, and afterwards you can run tensor operations on it, and finally call `.backward()`\n",
+    "It wraps a Tensor and supports nearly all of the operations defined on it. Once you finish your computation, you can call `.backward()` and have all the gradients computed automatically.\n",
     "\n",
-    "You can access the raw tensor through the `.data` attribute, and after computing the backward pass, a gradient w.r.t. this variable is accumulated into `.grad` attribute.\n",
+    "You can access the raw tensor through the `.data` attribute, while the gradient w.r.t. this variable is accumulated into `.grad`.\n",
     "\n",
     "![Variable](images/Variable.png)\n",
     "\n",
     "There's one more class which is very important for autograd implementation - a `Function`. \n",
     "\n",
-    "`Variable` and `Function` are interconnected and build up an acyclic graph, that encodes a complete history of computation. Each variable has a `.creator` attribute that references a `Function` that has created the `Variable` (except for Variables created by the user - these have `creator=None`).\n",
+    "`Variable` and `Function` are interconnected and build up an acyclic graph, that encodes a complete history of computation. Each variable has a `.creator` attribute that references a `Function` that has created the `Variable` (except for Variables created by the user - their `creator is None`).\n",
     "\n",
     "If you want to compute the derivatives, you can call `.backward()` on a `Variable`. \n",
-    "If `Variable` is a scalar (i.e. it holds a one element tensor), you don't need to specify any arguments to `backward()`, however if it has more elements, you need to specify a `grad_output` argument that is a tensor of matching shape.\n"
+    "If `Variable` is a scalar (i.e. its data holds a single element), you don't need to specify any arguments to `backward()`; however, if it has more elements, you need to specify a `grad_output` argument that is a tensor of matching shape.\n"
    ]
   },
   {
@@ -523,26 +532,31 @@
    "outputs": [],
    "source": [
     "import torch.nn as nn\n",
+    "import torch.nn.functional as F\n",
+    "# Some more python helpers\n",
+    "import functools\n",
+    "import operator\n",
     "\n",
     "class Net(nn.Container):\n",
     "    def __init__(self):\n",
     "        super(Net, self).__init__()\n",
     "        self.conv1 = nn.Conv2d(1, 6, 5) # 1 input image channel, 6 output channels, 5x5 square convolution kernel\n",
-    "        self.pool  = nn.MaxPool2d(2,2)  # A max-pooling operation that looks at 2x2 windows and finds the max.\n",
     "        self.conv2 = nn.Conv2d(6, 16, 5)\n",
     "        self.fc1   = nn.Linear(16*5*5, 120) # an affine operation: y = Wx + b\n",
     "        self.fc2   = nn.Linear(120, 84)\n",
     "        self.fc3   = nn.Linear(84, 10)\n",
-    "        self.relu  = nn.ReLU()\n",
     "\n",
     "    def forward(self, x):\n",
-    "        x = self.pool(self.relu(self.conv1(x)))\n",
-    "        x = self.pool(self.relu(self.conv2(x)))\n",
-    "        x = x.view(-1, 16*5*5)\n",
-    "        x = self.relu(self.fc1(x))\n",
-    "        x = self.relu(self.fc2(x))\n",
+    "        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) # Max pooling over a (2, 2) window\n",
+    "        x = F.max_pool2d(F.relu(self.conv2(x)), 2) # If the window is square you can specify just a single number\n",
+    "        x = x.view(-1, self.num_flat_features(x))\n",
+    "        x = F.relu(self.fc1(x))\n",
+    "        x = F.relu(self.fc2(x))\n",
     "        x = self.fc3(x)\n",
     "        return x\n",
+    "    \n",
+    "    def num_flat_features(self, x):\n",
+    "        return functools.reduce(operator.mul, x.size()[1:])\n",
     "\n",
     "net = Net()\n",
     "net"
@@ -610,15 +624,25 @@
    "source": [
     "> #### NOTE: `torch.nn` only supports mini-batches\n",
     "The entire `torch.nn` package only supports inputs that are a mini-batch of samples, and not a single sample.  \n",
-    "For example, `nn.Conv2d` will take in a 4D Tensor of `nSamples x nChannels x Height x Width`  \n",
-    "*This is done to simplify developer code and eliminate bugs*"
+    "For example, `nn.Conv2d` will take in a 4D Tensor of `nSamples x nChannels x Height x Width`.\n",
+    "\n",
+    "> *If you have a single sample, just use `input.unsqueeze(0)` to add a fake batch dimension.*"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "##### Review of what you learnt so far:\n",
+    "### Recap of all the classes you've seen so far:\n",
+    "\n",
+    "* `torch.Tensor` - A **multi-dimensional array**.\n",
+    "* `autograd.Variable` - **Wraps a Tensor and records the history of operations** applied to it. Has the same API as a `Tensor`, with some additions like `backward()`. Also **holds the gradient** w.r.t. the tensor.\n",
+    "* `nn.Module` - Neural network module. **Convenient way of encapsulating parameters**, with helpers for moving them to GPU, exporting, loading, etc.\n",
+    "* `nn.Container` - `Module` that is a **container for other Modules**.\n",
+    "* `nn.Parameter` - A kind of Variable that is **automatically registered as a parameter when assigned as an attribute to a `Module`**.\n",
+    "* `autograd.Function` - Implements **forward and backward definitions of an autograd operation**. Every `Variable` operation creates at least one `Function` node, which connects to the functions that created the `Variable` and **encodes its history**.\n",
+    "\n",
+    "##### At this point, we covered:\n",
     "- Defining a neural network\n",
     "- Processing inputs and calling backward.\n",
     "\n",
@@ -670,7 +694,7 @@
     "      -> loss\n",
     "```\n",
     "\n",
-    "So, when we call `loss.backward()`, the whole graph is differentiated w.r.t. the loss, and all Variables in the graph will have their `.grad` Tensor accumulated with the gradient.\n",
+    "So, when we call `loss.backward()`, the whole graph is differentiated w.r.t. the loss, and all Variables in the graph will have their `.grad` Variable accumulated with the gradient.\n",
     "       "
    ]
   },
@@ -727,7 +751,7 @@
     "```python\n",
     "learning_rate = 0.01\n",
     "for f in net.parameters():\n",
-    "    f.data.sub_(f.grad * learning_rate)\n",
+    "    f.data.sub_(f.grad.data * learning_rate)\n",
     "```\n",
     "\n",
     "However, as you use neural networks, you want to use various different update rules such as SGD, Nesterov-SGD, Adam, RMSProp, etc.\n",
@@ -822,13 +846,11 @@
     "transform=transforms.Compose([transforms.ToTensor(),\n",
     "                              transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),\n",
     "                             ])\n",
-    "trainset = torchvision.datasets.CIFAR10(root='/Users/soumith/code/pytorch-vision/test/cifar', \n",
-    "                                        train=True, download=True, transform=transform)\n",
+    "trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)\n",
     "trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, \n",
     "                                          shuffle=True, num_workers=2)\n",
     "\n",
-    "testset = torchvision.datasets.CIFAR10(root='/Users/soumith/code/pytorch-vision/test/cifar', \n",
-    "                                        train=False, download=True, transform=transform)\n",
+    "testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)\n",
     "testloader = torch.utils.data.DataLoader(testset, batch_size=4, \n",
     "                                          shuffle=False, num_workers=2)\n",
     "classes = ('plane', 'car', 'bird', 'cat',\n",
@@ -1163,8 +1185,8 @@
    "metadata": {},
    "source": [
     "#### Training on the GPU\n",
-    "The idea is pretty simple.  \n",
-    "Just like how you transfer a Tensor on to the GPU, you transfer the neural net onto the GPU."
+    "Just like how you transfer a Tensor on to the GPU, you transfer the neural net onto the GPU.\n",
+    "This will recursively go over all modules and convert their parameters and buffers to CUDA tensors."
    ]
   },
   {
@@ -1207,34 +1229,25 @@
     "- [More tutorials](https://github.com/pytorch/tutorials)\n",
     "- [Chat with other users on Slack](pytorch.slack.com/messages/beginner/)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 2",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "python2"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 2
+    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.12"
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
   }
  },
  "nbformat": 4,
diff --git a/Introduction to PyTorch for former Torchies.ipynb b/Introduction to PyTorch for former Torchies.ipynb
index 920ada91583..d1f081c40b1 100644
--- a/Introduction to PyTorch for former Torchies.ipynb	
+++ b/Introduction to PyTorch for former Torchies.ipynb	
@@ -439,13 +439,13 @@
    },
    "outputs": [],
    "source": [
-    "gradient = torch.randn(2, 2)\n",
     "# just backproping random gradients\n",
-    "\n",
-    "z.backward(gradient)\n",
+    "gradient = torch.randn(2, 2)\n",
     "\n",
     "# this would fail if we didn't specify \n",
     "# that we want to retain variables\n",
+    "z.backward(gradient)\n",
+    "\n",
     "x.grad"
    ]
   },
@@ -638,7 +638,7 @@
    "outputs": [],
    "source": [
     "print(net.conv1.weight.data.norm()) # norm of the weight\n",
-    "print(net.conv1.weight.grad.norm()) # norm of the gradients"
+    "print(net.conv1.weight.grad.data.norm()) # norm of the gradients"
    ]
   },
   {
@@ -705,7 +705,7 @@
     "    print('')    \n",
     "    print('grad_input size:', grad_input[0].size())\n",
     "    print('grad_output size:', grad_output[0].size())\n",
-    "    print('grad_input norm:', grad_input[0].norm())\n",
+    "    print('grad_input norm:', grad_input[0].data.norm())\n",
     "\n",
     "net.conv2.register_backward_hook(printgradnorm)\n",
     "\n",
@@ -910,34 +910,25 @@
     "        output_device = -1 if not input.is_cuda else input.get_device()\n",
     "    return gather(outputs, output_device)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 2",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "python2"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 2
+    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.12"
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
   }
  },
  "nbformat": 4,
diff --git a/Reinforcement (Q-)Learning with PyTorch.ipynb b/Reinforcement (Q-)Learning with PyTorch.ipynb
index cc05dcaf226..09282ca2e28 100644
--- a/Reinforcement (Q-)Learning with PyTorch.ipynb	
+++ b/Reinforcement (Q-)Learning with PyTorch.ipynb	
@@ -330,7 +330,8 @@
     "    non_final_mask = torch.ByteTensor(tuple(map(lambda s: s is not None, batch.next_state)))\n",
     "    if USE_CUDA:\n",
     "        non_final_mask = non_final_mask.cuda()\n",
-    "    # This will be used in inference-mode only so let's use volatile for efficiency\n",
+    "    # We don't want to backprop through the expected action values, and using volatile saves us\n",
+    "    # from having to temporarily set the model parameters' requires_grad to False!\n",
     "    non_final_next_states = Variable(torch.cat(tuple(s for s in batch.next_state if s is not None)), volatile=True)\n",
     "    state_batch = Variable(torch.cat(batch.state))\n",
     "    action_batch = Variable(torch.cat(batch.action))\n",
@@ -342,9 +343,10 @@
     "    # Compute V(s_{t+1}) for all next states.\n",
     "    next_state_values = Variable(torch.zeros(BATCH_SIZE))\n",
     "    next_state_values[non_final_mask] = model(non_final_next_states).max(1)[0]\n",
-    "    # Now, we don't want to mess up the loss with a volatile flag, so let's clear it\n",
+    "    # Now, we don't want to mess up the loss with a volatile flag, so let's clear it.\n",
+    "    # After this, we'll just end up with a Variable that has requires_grad=False\n",
     "    next_state_values.volatile = False\n",
-    "    # Compute the Q values\n",
+    "    # Compute the expected Q values\n",
     "    expected_state_action_values = (next_state_values * GAMMA) + reward_batch\n",
     "\n",
     "    # Compute Huber loss\n",
@@ -354,7 +356,7 @@
     "    optimizer.zero_grad()\n",
     "    loss.backward()\n",
     "    for param in model.parameters():\n",
-    "        param.grad.clamp(-1, 1)\n",
+    "        param.grad.data.clamp_(-1, 1)  # in-place: clamp() returns a new tensor and would leave the gradient unclipped\n",
     "    optimizer.step()\n",
     "\n",
     "\n",