From d5e6f41dcc8dee25c555c89fadb8dece9fd87c4b Mon Sep 17 00:00:00 2001 From: Alanna Burke Date: Wed, 30 Oct 2024 16:13:34 -0400 Subject: [PATCH 1/7] Redirecting autograd_tutorial_old. --- .../former_torchies/autograd_tutorial_old.py | 130 ------------------ .../former_torchies/autograd_tutorial_old.rst | 8 ++ 2 files changed, 8 insertions(+), 130 deletions(-) delete mode 100644 beginner_source/former_torchies/autograd_tutorial_old.py create mode 100644 beginner_source/former_torchies/autograd_tutorial_old.rst diff --git a/beginner_source/former_torchies/autograd_tutorial_old.py b/beginner_source/former_torchies/autograd_tutorial_old.py deleted file mode 100644 index 4030831b8ef..00000000000 --- a/beginner_source/former_torchies/autograd_tutorial_old.py +++ /dev/null @@ -1,130 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Autograd -======== - -Autograd is now a core torch package for automatic differentiation. -It uses a tape based system for automatic differentiation. - -In the forward phase, the autograd tape will remember all the operations -it executed, and in the backward phase, it will replay the operations. - -Tensors that track history --------------------------- - -In autograd, if any input ``Tensor`` of an operation has ``requires_grad=True``, -the computation will be tracked. After computing the backward pass, a gradient -w.r.t. this tensor is accumulated into ``.grad`` attribute. - -There’s one more class which is very important for autograd -implementation - a ``Function``. ``Tensor`` and ``Function`` are -interconnected and build up an acyclic graph, that encodes a complete -history of computation. Each variable has a ``.grad_fn`` attribute that -references a function that has created a function (except for Tensors -created by the user - these have ``None`` as ``.grad_fn``). - -If you want to compute the derivatives, you can call ``.backward()`` on -a ``Tensor``. If ``Tensor`` is a scalar (i.e. it holds a one element -tensor), you don’t need to specify any arguments to ``backward()``, -however if it has more elements, you need to specify a ``grad_output`` -argument that is a tensor of matching shape. -""" - -import torch - -############################################################### -# Create a tensor and set requires_grad=True to track computation with it -x = torch.ones(2, 2, requires_grad=True) -print(x) - -############################################################### -# -print(x.data) - -############################################################### -# -print(x.grad) - -############################################################### -# - -print(x.grad_fn) # we've created x ourselves - -############################################################### -# Do an operation of x: - -y = x + 2 -print(y) - -############################################################### -# y was created as a result of an operation, -# so it has a grad_fn -print(y.grad_fn) - -############################################################### -# More operations on y: - -z = y * y * 3 -out = z.mean() - -print(z, out) - -################################################################ -# ``.requires_grad_( ... )`` changes an existing Tensor's ``requires_grad`` -# flag in-place. The input flag defaults to ``True`` if not given. 
-a = torch.randn(2, 2) -a = ((a * 3) / (a - 1)) -print(a.requires_grad) -a.requires_grad_(True) -print(a.requires_grad) -b = (a * a).sum() -print(b.grad_fn) - -############################################################### -# Gradients -# --------- -# -# let's backprop now and print gradients d(out)/dx - -out.backward() -print(x.grad) - - -############################################################### -# By default, gradient computation flushes all the internal buffers -# contained in the graph, so if you even want to do the backward on some -# part of the graph twice, you need to pass in ``retain_variables = True`` -# during the first pass. - -x = torch.ones(2, 2, requires_grad=True) -y = x + 2 -y.backward(torch.ones(2, 2), retain_graph=True) -# the retain_variables flag will prevent the internal buffers from being freed -print(x.grad) - -############################################################### -# -z = y * y -print(z) - -############################################################### -# -# just backprop random gradients - -gradient = torch.randn(2, 2) - -# this would fail if we didn't specify -# that we want to retain variables -y.backward(gradient) - -print(x.grad) - -############################################################### -# You can also stop autograd from tracking history on Tensors -# with requires_grad=True by wrapping the code block in -# ``with torch.no_grad():`` -print(x.requires_grad) -print((x ** 2).requires_grad) - -with torch.no_grad(): - print((x ** 2).requires_grad) diff --git a/beginner_source/former_torchies/autograd_tutorial_old.rst b/beginner_source/former_torchies/autograd_tutorial_old.rst new file mode 100644 index 00000000000..8c887e00c8a --- /dev/null +++ b/beginner_source/former_torchies/autograd_tutorial_old.rst @@ -0,0 +1,8 @@ +Autograd +============== + +This tutorial is out of date. You'll be redirected to the new tutorial in 3 seconds: https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html + +.. raw:: html + + From db1ef5411c765880650179ae0220cc271ff720c7 Mon Sep 17 00:00:00 2001 From: Alanna Burke Date: Wed, 30 Oct 2024 16:23:07 -0400 Subject: [PATCH 2/7] Redirecting parallelism_tutorial. --- .../former_torchies/parallelism_tutorial.py | 145 ------------------ .../former_torchies/parallelism_tutorial.rst | 8 + 2 files changed, 8 insertions(+), 145 deletions(-) delete mode 100644 beginner_source/former_torchies/parallelism_tutorial.py create mode 100644 beginner_source/former_torchies/parallelism_tutorial.rst diff --git a/beginner_source/former_torchies/parallelism_tutorial.py b/beginner_source/former_torchies/parallelism_tutorial.py deleted file mode 100644 index a11d844e1bd..00000000000 --- a/beginner_source/former_torchies/parallelism_tutorial.py +++ /dev/null @@ -1,145 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Multi-GPU Examples -================== - -Data Parallelism is when we split the mini-batch of samples into -multiple smaller mini-batches and run the computation for each of the -smaller mini-batches in parallel. - -Data Parallelism is implemented using ``torch.nn.DataParallel``. -One can wrap a Module in ``DataParallel`` and it will be parallelized -over multiple GPUs in the batch dimension. 
- - -DataParallel -------------- -""" -import torch -import torch.nn as nn - - -class DataParallelModel(nn.Module): - - def __init__(self): - super().__init__() - self.block1 = nn.Linear(10, 20) - - # wrap block2 in DataParallel - self.block2 = nn.Linear(20, 20) - self.block2 = nn.DataParallel(self.block2) - - self.block3 = nn.Linear(20, 20) - - def forward(self, x): - x = self.block1(x) - x = self.block2(x) - x = self.block3(x) - return x - -######################################################################## -# The code does not need to be changed in CPU-mode. -# -# The documentation for DataParallel can be found -# `here `_. -# -# **Attributes of the wrapped module** -# -# After wrapping a Module with ``DataParallel``, the attributes of the module -# (e.g. custom methods) became inaccessible. This is because ``DataParallel`` -# defines a few new members, and allowing other attributes might lead to -# clashes in their names. For those who still want to access the attributes, -# a workaround is to use a subclass of ``DataParallel`` as below. - -class MyDataParallel(nn.DataParallel): - def __getattr__(self, name): - try: - return super().__getattr__(name) - except AttributeError: - return getattr(self.module, name) - -######################################################################## -# **Primitives on which DataParallel is implemented upon:** -# -# -# In general, pytorch’s `nn.parallel` primitives can be used independently. -# We have implemented simple MPI-like primitives: -# -# - replicate: replicate a Module on multiple devices -# - scatter: distribute the input in the first-dimension -# - gather: gather and concatenate the input in the first-dimension -# - parallel\_apply: apply a set of already-distributed inputs to a set of -# already-distributed models. -# -# To give a better clarity, here function ``data_parallel`` composed using -# these collectives - - -def data_parallel(module, input, device_ids, output_device=None): - if not device_ids: - return module(input) - - if output_device is None: - output_device = device_ids[0] - - replicas = nn.parallel.replicate(module, device_ids) - inputs = nn.parallel.scatter(input, device_ids) - replicas = replicas[:len(inputs)] - outputs = nn.parallel.parallel_apply(replicas, inputs) - return nn.parallel.gather(outputs, output_device) - -######################################################################## -# Part of the model on CPU and part on the GPU -# -------------------------------------------- -# -# Let’s look at a small example of implementing a network where part of it -# is on the CPU and part on the GPU - -device = torch.device("cuda:0") - -class DistributedModel(nn.Module): - - def __init__(self): - super().__init__( - embedding=nn.Embedding(1000, 10), - rnn=nn.Linear(10, 10).to(device), - ) - - def forward(self, x): - # Compute embedding on CPU - x = self.embedding(x) - - # Transfer to GPU - x = x.to(device) - - # Compute RNN on GPU - x = self.rnn(x) - return x - -######################################################################## -# -# This was a small introduction to PyTorch for former Torch users. -# There’s a lot more to learn. -# -# Look at our more comprehensive introductory tutorial which introduces -# the ``optim`` package, data loaders etc.: :doc:`/beginner/deep_learning_60min_blitz`. 
-#
-# Also look at
-#
-# - :doc:`Train neural nets to play video games `
-# - `Train a state-of-the-art ResNet network on imagenet`_
-# - `Train a face generator using Generative Adversarial Networks`_
-# - `Train a word-level language model using Recurrent LSTM networks`_
-# - `More examples`_
-# - `More tutorials`_
-# - `Discuss PyTorch on the Forums`_
-# - `Chat with other users on Slack`_
-#
-# .. _`Deep Learning with PyTorch: a 60-minute blitz`: https://github.com/pytorch/tutorials/blob/main/Deep%20Learning%20with%20PyTorch.ipynb
-# .. _Train a state-of-the-art ResNet network on imagenet: https://github.com/pytorch/examples/tree/master/imagenet
-# .. _Train a face generator using Generative Adversarial Networks: https://github.com/pytorch/examples/tree/master/dcgan
-# .. _Train a word-level language model using Recurrent LSTM networks: https://github.com/pytorch/examples/tree/master/word_language_model
-# .. _More examples: https://github.com/pytorch/examples
-# .. _More tutorials: https://github.com/pytorch/tutorials
-# .. _Discuss PyTorch on the Forums: https://discuss.pytorch.org/
-# .. _Chat with other users on Slack: https://pytorch.slack.com/messages/beginner/
diff --git a/beginner_source/former_torchies/parallelism_tutorial.rst b/beginner_source/former_torchies/parallelism_tutorial.rst
new file mode 100644
index 00000000000..04bb1d69e57
--- /dev/null
+++ b/beginner_source/former_torchies/parallelism_tutorial.rst
@@ -0,0 +1,8 @@
+Multi-GPU Examples
+==================
+
+This tutorial is out of date. You'll be redirected to the new tutorial in 3 seconds: https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html
+
+.. raw:: html
+
+   <meta http-equiv="Refresh" content="3; url='https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html'" />

From a7440f7be15925cf452e9c8de403c65d1e0a65de Mon Sep 17 00:00:00 2001
From: Alanna Burke
Date: Wed, 30 Oct 2024 16:24:58 -0400
Subject: [PATCH 3/7] Redirecting nnft_tutorial.

---
 .../former_torchies/{nnft_tutorial.py => nnft_tutorial.rst} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename beginner_source/former_torchies/{nnft_tutorial.py => nnft_tutorial.rst} (100%)

diff --git a/beginner_source/former_torchies/nnft_tutorial.py b/beginner_source/former_torchies/nnft_tutorial.rst
similarity index 100%
rename from beginner_source/former_torchies/nnft_tutorial.py
rename to beginner_source/former_torchies/nnft_tutorial.rst

From 43cc28609bfdc3b868e2ef7e79f3da491e9bccf4 Mon Sep 17 00:00:00 2001
From: Alanna Burke
Date: Wed, 30 Oct 2024 16:42:21 -0400
Subject: [PATCH 4/7] Redirecting tensor_tutorial_old.

---
 .../former_torchies/tensor_tutorial_old.py  | 143 ------------------
 .../former_torchies/tensor_tutorial_old.rst |   8 +
 2 files changed, 8 insertions(+), 143 deletions(-)
 delete mode 100644 beginner_source/former_torchies/tensor_tutorial_old.py
 create mode 100644 beginner_source/former_torchies/tensor_tutorial_old.rst

diff --git a/beginner_source/former_torchies/tensor_tutorial_old.py b/beginner_source/former_torchies/tensor_tutorial_old.py
deleted file mode 100644
index 10a9d81fadb..00000000000
--- a/beginner_source/former_torchies/tensor_tutorial_old.py
+++ /dev/null
@@ -1,143 +0,0 @@
-"""
-Tensors
-=======
-
-Tensors behave almost exactly the same way in PyTorch as they do in
-Torch.
- -Create a tensor of size (5 x 7) with uninitialized memory: - -""" - -import torch -a = torch.empty(5, 7, dtype=torch.float) - -############################################################### -# Initialize a double tensor randomized with a normal distribution with mean=0, -# var=1: - -a = torch.randn(5, 7, dtype=torch.double) -print(a) -print(a.size()) - -############################################################### -# .. note:: -# ``torch.Size`` is in fact a tuple, so it supports the same operations -# -# Inplace / Out-of-place -# ---------------------- -# -# The first difference is that ALL operations on the tensor that operate -# in-place on it will have an ``_`` postfix. For example, ``add`` is the -# out-of-place version, and ``add_`` is the in-place version. - -a.fill_(3.5) -# a has now been filled with the value 3.5 - -b = a.add(4.0) -# a is still filled with 3.5 -# new tensor b is returned with values 3.5 + 4.0 = 7.5 - -print(a, b) - -############################################################### -# Some operations like ``narrow`` do not have in-place versions, and -# hence, ``.narrow_`` does not exist. Similarly, some operations like -# ``fill_`` do not have an out-of-place version, so ``.fill`` does not -# exist. -# -# Zero Indexing -# ------------- -# -# Another difference is that Tensors are zero-indexed. (In lua, tensors are -# one-indexed) - -b = a[0, 3] # select 1st row, 4th column from a - -############################################################### -# Tensors can be also indexed with Python's slicing - -b = a[:, 3:5] # selects all rows, 4th column and 5th column from a - -############################################################### -# No camel casing -# --------------- -# -# The next small difference is that all functions are now NOT camelCase -# anymore. For example ``indexAdd`` is now called ``index_add_`` - - -x = torch.ones(5, 5) -print(x) - -############################################################### -# - -z = torch.empty(5, 2) -z[:, 0] = 10 -z[:, 1] = 100 -print(z) - -############################################################### -# -x.index_add_(1, torch.tensor([4, 0], dtype=torch.long), z) -print(x) - -############################################################### -# Numpy Bridge -# ------------ -# -# Converting a torch Tensor to a numpy array and vice versa is a breeze. -# The torch Tensor and numpy array will share their underlying memory -# locations, and changing one will change the other. -# -# Converting torch Tensor to numpy Array -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -a = torch.ones(5) -print(a) - -############################################################### -# - -b = a.numpy() -print(b) - -############################################################### -# -a.add_(1) -print(a) -print(b) # see how the numpy array changed in value - - -############################################################### -# Converting numpy Array to torch Tensor -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -import numpy as np -a = np.ones(5) -b = torch.from_numpy(a) -np.add(a, 1, out=a) -print(a) -print(b) # see how changing the np array changed the torch Tensor automatically - -############################################################### -# All the Tensors on the CPU except a CharTensor support converting to -# NumPy and back. -# -# CUDA Tensors -# ------------ -# -# CUDA Tensors are nice and easy in pytorch, and transfering a CUDA tensor -# from the CPU to GPU will retain its underlying type. 
-
-# let us run this cell only if CUDA is available
-if torch.cuda.is_available():
-
-    # creates a LongTensor and transfers it
-    # to GPU as torch.cuda.LongTensor
-    a = torch.full((10,), 3, device=torch.device("cuda"))
-    print(type(a))
-    b = a.to(torch.device("cpu"))
-    # transfers it to CPU, back to
-    # being a torch.LongTensor
diff --git a/beginner_source/former_torchies/tensor_tutorial_old.rst b/beginner_source/former_torchies/tensor_tutorial_old.rst
new file mode 100644
index 00000000000..939a6855c27
--- /dev/null
+++ b/beginner_source/former_torchies/tensor_tutorial_old.rst
@@ -0,0 +1,8 @@
+Tensors
+==============
+
+This tutorial is out of date. You'll be redirected to the new tutorial in 3 seconds: https://pytorch.org/tutorials/beginner/basics/tensorqs_tutorial.html
+
+.. raw:: html
+
+   <meta http-equiv="Refresh" content="3; url='https://pytorch.org/tutorials/beginner/basics/tensorqs_tutorial.html'" />

From 17108d5bdd1a5ad5ecdac0f0727c69affef8b012 Mon Sep 17 00:00:00 2001
From: Alanna Burke
Date: Wed, 30 Oct 2024 17:03:13 -0400
Subject: [PATCH 5/7] Redirecting former_torchies_tutorial.

---
 beginner_source/former_torchies_tutorial.rst | 37 +++-----------------
 1 file changed, 4 insertions(+), 33 deletions(-)

diff --git a/beginner_source/former_torchies_tutorial.rst b/beginner_source/former_torchies_tutorial.rst
index e6ae59b7082..79aac42f3b8 100644
--- a/beginner_source/former_torchies_tutorial.rst
+++ b/beginner_source/former_torchies_tutorial.rst
@@ -1,37 +1,8 @@
 PyTorch for Former Torch Users
-------------------------------
-**Author**: `Soumith Chintala `_
-
-In this tutorial, you will learn the following:
-
-1. Using torch Tensors, and important difference against (Lua)Torch
-2. Using the autograd package
-3. Building neural networks
-
-   - Building a ConvNet
-   - Building a Recurrent Net
-
-4. Use multiple GPUs
-
-
-.. toctree::
-   :hidden:
-
-   /beginner/former_torchies/tensor_tutorial_old
-   /beginner/former_torchies/autograd_tutorial_old
-   /beginner/former_torchies/nnft_tutorial
-   /beginner/former_torchies/parallelism_tutorial
-
-.. galleryitem:: /beginner/former_torchies/tensor_tutorial_old.py
-    :figure: /_static/img/tensor_illustration_flat.png
-
-.. galleryitem:: /beginner/former_torchies/autograd_tutorial_old.py
-
-.. galleryitem:: /beginner/former_torchies/nnft_tutorial.py
-    :figure: /_static/img/torch-nn-vs-pytorch-nn.png
-
-.. galleryitem:: /beginner/former_torchies/parallelism_tutorial.py
+==============================
+This tutorial is out of date. Please check out the PyTorch tutorials here: https://pytorch.org/tutorials/
+You will be redirected in 3 seconds.
 
 .. raw:: html
 
-
+ From 0428b86e339e30cae6f13ace76c34a7e65d049e1 Mon Sep 17 00:00:00 2001 From: Alanna Burke Date: Wed, 30 Oct 2024 17:04:01 -0400 Subject: [PATCH 6/7] Removing README as .py files have been changed to .rst. --- beginner_source/former_torchies/README.txt | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 beginner_source/former_torchies/README.txt diff --git a/beginner_source/former_torchies/README.txt b/beginner_source/former_torchies/README.txt deleted file mode 100644 index 5bb8c93f00c..00000000000 --- a/beginner_source/former_torchies/README.txt +++ /dev/null @@ -1,18 +0,0 @@ - PyTorch for former Torch users - ------------------------------ - -1. tensor_tutorial_old.py - Tensors - https://pytorch.org/tutorials/beginner/former_torchies/tensor_tutorial_old.html - -2. autograd_tutorial_old.py - Autograd - https://pytorch.org/tutorials/beginner/former_torchies/autograd_tutorial_old.html - -3. nnft_tutorial.py - nn package - https://pytorch.org/tutorials/beginner/former_torchies/nnft_tutorial.html - -4. parallelism_tutorial.py - Multi-GPU examples - https://pytorch.org/tutorials/beginner/former_torchies/parallelism_tutorial.html From 2880d6bc4f94169e0491227546c2f70810d33acb Mon Sep 17 00:00:00 2001 From: Alanna Burke Date: Wed, 30 Oct 2024 18:05:08 -0400 Subject: [PATCH 7/7] Redirecting nnft_tutorial. --- .../former_torchies/nnft_tutorial.rst | 268 +----------------- 1 file changed, 5 insertions(+), 263 deletions(-) diff --git a/beginner_source/former_torchies/nnft_tutorial.rst b/beginner_source/former_torchies/nnft_tutorial.rst index 316bf03a985..db378a7162b 100644 --- a/beginner_source/former_torchies/nnft_tutorial.rst +++ b/beginner_source/former_torchies/nnft_tutorial.rst @@ -1,266 +1,8 @@ -# -*- coding: utf-8 -*- -""" -nn package -========== +nn Package +=============== -We’ve redesigned the nn package, so that it’s fully integrated with -autograd. Let's review the changes. +This tutorial is out of date. You'll be redirected to the new tutorial in 3 seconds: https://pytorch.org/tutorials/beginner/nn_tutorial.html -**Replace containers with autograd:** +.. raw:: html - You no longer have to use Containers like ``ConcatTable``, or modules like - ``CAddTable``, or use and debug with nngraph. We will seamlessly use - autograd to define our neural networks. For example, - - * ``output = nn.CAddTable():forward({input1, input2})`` simply becomes - ``output = input1 + input2`` - * ``output = nn.MulConstant(0.5):forward(input)`` simply becomes - ``output = input * 0.5`` - -**State is no longer held in the module, but in the network graph:** - - Using recurrent networks should be simpler because of this reason. If - you want to create a recurrent network, simply use the same Linear layer - multiple times, without having to think about sharing weights. - - .. figure:: /_static/img/torch-nn-vs-pytorch-nn.png - :alt: torch-nn-vs-pytorch-nn - - torch-nn-vs-pytorch-nn - -**Simplified debugging:** - - Debugging is intuitive using Python’s pdb debugger, and **the debugger - and stack traces stop at exactly where an error occurred.** What you see - is what you get. - -Example 1: ConvNet ------------------- - -Let’s see how to create a small ConvNet. - -All of your networks are derived from the base class ``nn.Module``: - -- In the constructor, you declare all the layers you want to use. 
-- In the forward function, you define how your model is going to be - run, from input to output -""" - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class MNISTConvNet(nn.Module): - - def __init__(self): - # this is the place where you instantiate all your modules - # you can later access them using the same names you've given them in - # here - super(MNISTConvNet, self).__init__() - self.conv1 = nn.Conv2d(1, 10, 5) - self.pool1 = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(10, 20, 5) - self.pool2 = nn.MaxPool2d(2, 2) - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) - - # it's the forward function that defines the network structure - # we're accepting only a single input in here, but if you want, - # feel free to use more - def forward(self, input): - x = self.pool1(F.relu(self.conv1(input))) - x = self.pool2(F.relu(self.conv2(x))) - - # in your model definition you can go full crazy and use arbitrary - # python code to define your model structure - # all these are perfectly legal, and will be handled correctly - # by autograd: - # if x.gt(0) > x.numel() / 2: - # ... - # - # you can even do a loop and reuse the same module inside it - # modules no longer hold ephemeral state, so you can use them - # multiple times during your forward pass - # while x.norm(2) < 10: - # x = self.conv1(x) - - x = x.view(x.size(0), -1) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - return x - -############################################################### -# Let's use the defined ConvNet now. -# You create an instance of the class first. - - -net = MNISTConvNet() -print(net) - -######################################################################## -# .. note:: -# -# ``torch.nn`` only supports mini-batches The entire ``torch.nn`` -# package only supports inputs that are a mini-batch of samples, and not -# a single sample. -# -# For example, ``nn.Conv2d`` will take in a 4D Tensor of -# ``nSamples x nChannels x Height x Width``. -# -# If you have a single sample, just use ``input.unsqueeze(0)`` to add -# a fake batch dimension. -# -# Create a mini-batch containing a single sample of random data and send the -# sample through the ConvNet. - -input = torch.randn(1, 1, 28, 28) -out = net(input) -print(out.size()) - -######################################################################## -# Define a dummy target label and compute error using a loss function. - -target = torch.tensor([3], dtype=torch.long) -loss_fn = nn.CrossEntropyLoss() # LogSoftmax + ClassNLL Loss -err = loss_fn(out, target) -err.backward() - -print(err) - -######################################################################## -# The output of the ConvNet ``out`` is a ``Tensor``. We compute the loss -# using that, and that results in ``err`` which is also a ``Tensor``. -# Calling ``.backward`` on ``err`` hence will propagate gradients all the -# way through the ConvNet to it’s weights -# -# Let's access individual layer weights and gradients: - -print(net.conv1.weight.grad.size()) - -######################################################################## -print(net.conv1.weight.data.norm()) # norm of the weight -print(net.conv1.weight.grad.data.norm()) # norm of the gradients - -######################################################################## -# Forward and Backward Function Hooks -# ----------------------------------- -# -# We’ve inspected the weights and the gradients. But how about inspecting -# / modifying the output and grad\_output of a layer? 
-# -# We introduce **hooks** for this purpose. -# -# You can register a function on a ``Module`` or a ``Tensor``. -# The hook can be a forward hook or a backward hook. -# The forward hook will be executed when a forward call is executed. -# The backward hook will be executed in the backward phase. -# Let’s look at an example. -# -# We register a forward hook on conv2 and print some information - - -def printnorm(self, input, output): - # input is a tuple of packed inputs - # output is a Tensor. output.data is the Tensor we are interested - print('Inside ' + self.__class__.__name__ + ' forward') - print('') - print('input: ', type(input)) - print('input[0]: ', type(input[0])) - print('output: ', type(output)) - print('') - print('input size:', input[0].size()) - print('output size:', output.data.size()) - print('output norm:', output.data.norm()) - - -net.conv2.register_forward_hook(printnorm) - -out = net(input) - -######################################################################## -# -# We register a backward hook on conv2 and print some information - - -def printgradnorm(self, grad_input, grad_output): - print('Inside ' + self.__class__.__name__ + ' backward') - print('Inside class:' + self.__class__.__name__) - print('') - print('grad_input: ', type(grad_input)) - print('grad_input[0]: ', type(grad_input[0])) - print('grad_output: ', type(grad_output)) - print('grad_output[0]: ', type(grad_output[0])) - print('') - print('grad_input size:', grad_input[0].size()) - print('grad_output size:', grad_output[0].size()) - print('grad_input norm:', grad_input[0].norm()) - - -net.conv2.register_backward_hook(printgradnorm) - -out = net(input) -err = loss_fn(out, target) -err.backward() - -######################################################################## -# A full and working MNIST example is located here -# https://github.com/pytorch/examples/tree/master/mnist -# -# Example 2: Recurrent Net -# ------------------------ -# -# Next, let’s look at building recurrent nets with PyTorch. -# -# Since the state of the network is held in the graph and not in the -# layers, you can simply create an nn.Linear and reuse it over and over -# again for the recurrence. - - -class RNN(nn.Module): - - # you can also accept arguments in your model constructor - def __init__(self, data_size, hidden_size, output_size): - super(RNN, self).__init__() - - self.hidden_size = hidden_size - input_size = data_size + hidden_size - - self.i2h = nn.Linear(input_size, hidden_size) - self.h2o = nn.Linear(hidden_size, output_size) - - def forward(self, data, last_hidden): - input = torch.cat((data, last_hidden), 1) - hidden = self.i2h(input) - output = self.h2o(hidden) - return hidden, output - - -rnn = RNN(50, 20, 10) - -######################################################################## -# -# A more complete Language Modeling example using LSTMs and Penn Tree-bank -# is located -# `here `_ -# -# PyTorch by default has seamless CuDNN integration for ConvNets and -# Recurrent Nets - -loss_fn = nn.MSELoss() - -batch_size = 10 -TIMESTEPS = 5 - -# Create some fake data -batch = torch.randn(batch_size, 50) -hidden = torch.zeros(batch_size, 20) -target = torch.zeros(batch_size, 10) - -loss = 0 -for t in range(TIMESTEPS): - # yes! you can reuse the same network several times, - # sum up the losses, and call backward! - hidden, output = rnn(batch, hidden) - loss += loss_fn(output, target) -loss.backward() +
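+   <meta http-equiv="Refresh" content="3; url='https://pytorch.org/tutorials/beginner/nn_tutorial.html'" />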