From d8d7f406783cf1b94c9642a40f10dcfe984b532c Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Tue, 6 Sep 2022 23:34:47 -0400 Subject: [PATCH 01/18] increased model capacity and input resolution --- intermediate_source/reinforcement_q_learning.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index cb9abc229c9..acbdef9c0e6 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -205,12 +205,12 @@ class DQN(nn.Module): def __init__(self, h, w, outputs): super(DQN, self).__init__() - self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2) + self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=2) self.bn1 = nn.BatchNorm2d(16) self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2) self.bn2 = nn.BatchNorm2d(32) - self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2) - self.bn3 = nn.BatchNorm2d(32) + self.conv3 = nn.Conv2d(32, 64, kernel_size=5, stride=2) + self.bn3 = nn.BatchNorm2d(64) # Number of Linear input connections depends on output of conv2d layers # and therefore the input image size, so compute it. @@ -218,7 +218,7 @@ def conv2d_size_out(size, kernel_size = 5, stride = 2): return (size - (kernel_size - 1) - 1) // stride + 1 convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w))) convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h))) - linear_input_size = convw * convh * 32 + linear_input_size = convw * convh * 64 self.head = nn.Linear(linear_input_size, outputs) # Called with either one element to determine next action, or a batch @@ -242,7 +242,7 @@ def forward(self, x): # resize = T.Compose([T.ToPILImage(), - T.Resize(40, interpolation=Image.CUBIC), + T.Resize(64, interpolation=Image.BILINEAR), T.ToTensor()]) From 9f43173868d3308a94e4690b3017db9b18673144 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Tue, 6 Sep 2022 23:36:48 -0400 Subject: [PATCH 02/18] changed out optimizer and greatly increased replay buffer --- intermediate_source/reinforcement_q_learning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index acbdef9c0e6..8cd1fbdf14f 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -328,8 +328,8 @@ def get_screen(): target_net.load_state_dict(policy_net.state_dict()) target_net.eval() -optimizer = optim.RMSprop(policy_net.parameters()) -memory = ReplayMemory(10000) +optimizer = optim.AdamW(policy_net.parameters(), lr=0.0003, weight_decay=0.01, amsgrad=True) +memory = ReplayMemory(100000) steps_done = 0 From ee25c65c4e9433173d7d5706a111e03ee0dbd939 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Tue, 6 Sep 2022 23:42:59 -0400 Subject: [PATCH 03/18] added reward shaping based solely on duration. 
intention was to reduce penalty in the first few episodes to enable more exploration --- intermediate_source/reinforcement_q_learning.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 8cd1fbdf14f..13184e66804 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -461,8 +461,13 @@ def optimize_model(): for t in count(): # Select and perform an action action = select_action(state) - _, reward, done, _, _ = env.step(action.item()) - reward = torch.tensor([reward], device=device) + _, _, done, _, _ = env.step(action.item()) + # Reward shaping + if i_episode < 100: + reward = t * np.clip((i_episode / 500), None, 1) + else: + reward = (t - 50 * (i_episode / 500)) * np.clip((i_episode / 500), None, 1) + reward = torch.tensor([reward], device=device).type(torch.float32) # Observe new state last_screen = current_screen From dffcd1f41ca52b9af4561af2c196e8104d411ab0 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Wed, 7 Sep 2022 21:55:10 -0400 Subject: [PATCH 04/18] forgot to included my increase to EPS_DECAY. Might as well change the # of episodes to match for now --- intermediate_source/reinforcement_q_learning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 13184e66804..f08c0fab671 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -311,7 +311,7 @@ def get_screen(): GAMMA = 0.999 EPS_START = 0.9 EPS_END = 0.05 -EPS_DECAY = 200 +EPS_DECAY = 2000 TARGET_UPDATE = 10 # Get screen size so that we can initialize layers correctly based on shape @@ -451,7 +451,7 @@ def optimize_model(): # duration improvements. # -num_episodes = 50 +num_episodes = 1000 for i_episode in range(num_episodes): # Initialize the environment and state env.reset() From d899d8dbd0e2d2a2f62e8f39afca92c392d72ce7 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Wed, 7 Sep 2022 22:02:49 -0400 Subject: [PATCH 05/18] revisions to batch norm behavior as suggested by vmoens --- intermediate_source/reinforcement_q_learning.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index f08c0fab671..e2359d6b1ec 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -326,7 +326,6 @@ def get_screen(): policy_net = DQN(screen_height, screen_width, n_actions).to(device) target_net = DQN(screen_height, screen_width, n_actions).to(device) target_net.load_state_dict(policy_net.state_dict()) -target_net.eval() optimizer = optim.AdamW(policy_net.parameters(), lr=0.0003, weight_decay=0.01, amsgrad=True) memory = ReplayMemory(100000) @@ -422,7 +421,8 @@ def optimize_model(): # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. 
next_state_values = torch.zeros(BATCH_SIZE, device=device) - next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach() + with torch.no_grad(): + next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0] # Compute the expected Q values expected_state_action_values = (next_state_values * GAMMA) + reward_batch @@ -460,7 +460,10 @@ def optimize_model(): state = current_screen - last_screen for t in count(): # Select and perform an action + policy_net.eval() action = select_action(state) + policy_net.train() + _, _, done, _, _ = env.step(action.item()) # Reward shaping if i_episode < 100: From 2a59217cb6a0f9f3809d77c657254e911abba2b4 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Sat, 24 Sep 2022 16:28:04 -0400 Subject: [PATCH 06/18] switched to state vector input and modified hyps so training suceeds in a few 100 episodes. removed all code related to image processing. added timelimit wrapper. added soft updates. --- .../reinforcement_q_learning.py | 160 +++++------------- 1 file changed, 40 insertions(+), 120 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index e2359d6b1ec..1cf10434e3c 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -58,6 +58,7 @@ """ import gym +from gym.wrappers import TimeLimit import math import random import numpy as np @@ -74,7 +75,7 @@ import torchvision.transforms as T -env = gym.make('CartPole-v0', new_step_api=True, render_mode='single_rgb_array').unwrapped +env = TimeLimit(gym.make('CartPole-v1'), max_episode_steps=500) # set up matplotlib is_ipython = 'inline' in matplotlib.get_backend() @@ -203,86 +204,18 @@ def __len__(self): class DQN(nn.Module): - def __init__(self, h, w, outputs): + def __init__(self, outputs): super(DQN, self).__init__() - self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=2) - self.bn1 = nn.BatchNorm2d(16) - self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2) - self.bn2 = nn.BatchNorm2d(32) - self.conv3 = nn.Conv2d(32, 64, kernel_size=5, stride=2) - self.bn3 = nn.BatchNorm2d(64) - - # Number of Linear input connections depends on output of conv2d layers - # and therefore the input image size, so compute it. - def conv2d_size_out(size, kernel_size = 5, stride = 2): - return (size - (kernel_size - 1) - 1) // stride + 1 - convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w))) - convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h))) - linear_input_size = convw * convh * 64 - self.head = nn.Linear(linear_input_size, outputs) + self.layer1 = nn.Linear(4, 128) + self.layer2 = nn.Linear(128, 128) + self.layer3 = nn.Linear(128, outputs) # Called with either one element to determine next action, or a batch # during optimization. Returns tensor([[left0exp,right0exp]...]). def forward(self, x): - x = x.to(device) - x = F.relu(self.bn1(self.conv1(x))) - x = F.relu(self.bn2(self.conv2(x))) - x = F.relu(self.bn3(self.conv3(x))) - return self.head(x.view(x.size(0), -1)) - - -###################################################################### -# Input extraction -# ^^^^^^^^^^^^^^^^ -# -# The code below are utilities for extracting and processing rendered -# images from the environment. It uses the ``torchvision`` package, which -# makes it easy to compose image transforms. Once you run the cell it will -# display an example patch that it extracted. 
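A quick sanity check for the fully-connected DQN that PATCH 06 introduces above (illustrative only: `net` and `dummy_batch` are throwaway names, and the snippet assumes the `DQN` class and the torch imports from this file). The network should map a batch of CartPole's 4-dimensional state vectors straight to one Q-value per action, with no image processing in between:

    net = DQN(outputs=2)              # CartPole-v1 has two discrete actions
    dummy_batch = torch.zeros(32, 4)  # (batch, observation) input
    q_values = net(dummy_batch)       # three Linear layers with ReLU activations in between
    print(q_values.shape)             # expected: torch.Size([32, 2])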
-# - -resize = T.Compose([T.ToPILImage(), - T.Resize(64, interpolation=Image.BILINEAR), - T.ToTensor()]) - - -def get_cart_location(screen_width): - world_width = env.x_threshold * 2 - scale = screen_width / world_width - return int(env.state[0] * scale + screen_width / 2.0) # MIDDLE OF CART - -def get_screen(): - # Returned screen requested by gym is 400x600x3, but is sometimes larger - # such as 800x1200x3. Transpose it into torch order (CHW). - screen = env.render().transpose((2, 0, 1)) - # Cart is in the lower half, so strip off the top and bottom of the screen - _, screen_height, screen_width = screen.shape - screen = screen[:, int(screen_height*0.4):int(screen_height * 0.8)] - view_width = int(screen_width * 0.6) - cart_location = get_cart_location(screen_width) - if cart_location < view_width // 2: - slice_range = slice(view_width) - elif cart_location > (screen_width - view_width // 2): - slice_range = slice(-view_width, None) - else: - slice_range = slice(cart_location - view_width // 2, - cart_location + view_width // 2) - # Strip off the edges, so that we have a square image centered on a cart - screen = screen[:, :, slice_range] - # Convert to float, rescale, convert to torch tensor - # (this doesn't require a copy) - screen = np.ascontiguousarray(screen, dtype=np.float32) / 255 - screen = torch.from_numpy(screen) - # Resize, and add a batch dimension (BCHW) - return resize(screen).unsqueeze(0) - - -env.reset() -plt.figure() -plt.imshow(get_screen().cpu().squeeze(0).permute(1, 2, 0).numpy(), - interpolation='none') -plt.title('Example extracted screen') -plt.show() + x = F.relu(self.layer1(x)) + x = F.relu(self.layer2(x)) + return self.layer3(x) ###################################################################### @@ -305,30 +238,24 @@ def get_screen(): # the official evaluations). The plot will be underneath the cell # containing the main training loop, and will update after every # episode. -# BATCH_SIZE = 128 -GAMMA = 0.999 +GAMMA = 0.99 EPS_START = 0.9 EPS_END = 0.05 -EPS_DECAY = 2000 -TARGET_UPDATE = 10 - -# Get screen size so that we can initialize layers correctly based on shape -# returned from AI gym. 
Typical dimensions at this point are close to 3x40x90 -# which is the result of a clamped and down-scaled render buffer in get_screen() -init_screen = get_screen() -_, _, screen_height, screen_width = init_screen.shape +EPS_DECAY = 1000 +TAU = 0.005 +LR = 1e-4 # Get number of actions from gym action space n_actions = env.action_space.n -policy_net = DQN(screen_height, screen_width, n_actions).to(device) -target_net = DQN(screen_height, screen_width, n_actions).to(device) +policy_net = DQN(n_actions).to(device) +target_net = DQN(n_actions).to(device) target_net.load_state_dict(policy_net.state_dict()) -optimizer = optim.AdamW(policy_net.parameters(), lr=0.0003, weight_decay=0.01, amsgrad=True) -memory = ReplayMemory(100000) +optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True) +memory = ReplayMemory(10000) steps_done = 0 @@ -433,8 +360,8 @@ def optimize_model(): # Optimize the model optimizer.zero_grad() loss.backward() - for param in policy_net.parameters(): - param.grad.data.clamp_(-1, 1) + # In-place gradient clipping + torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100) optimizer.step() @@ -453,32 +380,20 @@ def optimize_model(): num_episodes = 1000 for i_episode in range(num_episodes): - # Initialize the environment and state - env.reset() - last_screen = get_screen() - current_screen = get_screen() - state = current_screen - last_screen + # Initialize the environment and get it's state + state, _ = env.reset() + state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0) for t in count(): - # Select and perform an action - policy_net.eval() action = select_action(state) - policy_net.train() + observation, reward, terminated, truncated, _ = env.step(action.item()) + reward = torch.tensor([reward], device=device) + if truncated: + terminated = True - _, _, done, _, _ = env.step(action.item()) - # Reward shaping - if i_episode < 100: - reward = t * np.clip((i_episode / 500), None, 1) - else: - reward = (t - 50 * (i_episode / 500)) * np.clip((i_episode / 500), None, 1) - reward = torch.tensor([reward], device=device).type(torch.float32) - - # Observe new state - last_screen = current_screen - current_screen = get_screen() - if not done: - next_state = current_screen - last_screen - else: + if terminated: next_state = None + else: + next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0) # Store the transition in memory memory.push(state, action, next_state, reward) @@ -488,17 +403,22 @@ def optimize_model(): # Perform one step of the optimization (on the policy network) optimize_model() - if done: + + # Soft update of the target network's weights at every step + # https://arxiv.org/pdf/1509.02971.pdf + # θ′ ← τ θ + (1 −τ )θ′ + target_net_state_dict = target_net.state_dict() + policy_net_state_dict = policy_net.state_dict() + for key in policy_net_state_dict: + target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU) + target_net.load_state_dict(target_net_state_dict) + + if terminated: episode_durations.append(t + 1) plot_durations() break - # Update the target network, copying all weights and biases in DQN - if i_episode % TARGET_UPDATE == 0: - target_net.load_state_dict(policy_net.state_dict()) print('Complete') -env.render() -env.close() plt.ioff() plt.show() From db90afd72769e41dbec8bed78590937f2da0caa8 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Sat, 24 Sep 2022 17:20:59 -0400 Subject: [PATCH 07/18] numerous doc changes. removed unused imports. 
used gym's action space sampler --- .../reinforcement_q_learning.py | 64 +++++++++---------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 1cf10434e3c..328fa34a3cd 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -6,7 +6,7 @@ This tutorial shows how to use PyTorch to train a Deep Q Learning (DQN) agent -on the CartPole-v0 task from the `OpenAI Gym `__. +on the CartPole-v1 task from the `OpenAI Gym `__. **Task** @@ -30,30 +30,24 @@ The CartPole task is designed so that the inputs to the agent are 4 real values representing the environment state (position, velocity, etc.). -However, neural networks can solve the task purely by looking at the -scene, so we'll use a patch of the screen centered on the cart as an -input. Because of this, our results aren't directly comparable to the -ones from the official leaderboard - our task is much harder. -Unfortunately this does slow down the training, because we have to -render all the frames. +We take these 4 inputs without any scaling and pass them through a +small fully-connected network with 2 outputs, one for each action. +The network is trained to predict the expected value for each action, +given the input state. The action with the highest expected value is +then chosen. -Strictly speaking, we will present the state as the difference between -the current screen patch and the previous one. This will allow the agent -to take the velocity of the pole into account from one image. **Packages** First, let's import needed packages. Firstly, we need `gym `__ for the environment -(Install using `pip install gym`). +(Install using `pip install gym`). Developed on v0.26.1 of gym. We'll also use the following from PyTorch: - neural networks (``torch.nn``) - optimization (``torch.optim``) - automatic differentiation (``torch.autograd``) -- utilities for vision tasks (``torchvision`` - `a separate - package `__). """ @@ -66,15 +60,13 @@ import matplotlib.pyplot as plt from collections import namedtuple, deque from itertools import count -from PIL import Image import torch import torch.nn as nn import torch.optim as optim import torch.nn.functional as F -import torchvision.transforms as T - +# Wrap the environment to limit the number of steps per episode env = TimeLimit(gym.make('CartPole-v1'), max_episode_steps=500) # set up matplotlib @@ -145,9 +137,11 @@ def __len__(self): # :math:`R_{t_0} = \sum_{t=t_0}^{\infty} \gamma^{t - t_0} r_t`, where # :math:`R_{t_0}` is also known as the *return*. The discount, # :math:`\gamma`, should be a constant between :math:`0` and :math:`1` -# that ensures the sum converges. It makes rewards from the uncertain far -# future less important for our agent than the ones in the near future -# that it can be fairly confident about. +# that ensures the sum converges. A lower :math:`\gamma` makes +# rewards from the uncertain far future less important for our agent +# than the ones in the near future that it can be fairly confident +# about. It also encourages agents to collect reward closer in time +# than equivalent rewards temporally future away. # # The main idea behind Q-learning is that if we had a function # :math:`Q^*: State \times Action \rightarrow \mathbb{R}`, that could tell @@ -170,7 +164,7 @@ def __len__(self): # The difference between the two sides of the equality is known as the # temporal difference error, :math:`\delta`: # -# .. 
math:: \delta = Q(s, a) - (r + \gamma \max_a Q(s', a)) +# .. math:: \delta = Q(s, a) - (r + \gamma \max_a' Q(s', a)) # # To minimise this error, we will use the `Huber # loss `__. The Huber loss acts @@ -239,6 +233,13 @@ def forward(self, x): # containing the main training loop, and will update after every # episode. +# BATCH_SIZE is the number of transitions sampled from the replay buffer +# GAMMA is the discount factor as mentioned in the previous section +# EPS_START is the starting value of epsilon +# EPS_END is the final value of epsilon +# EPS_DECAY controls the rate of exponential decay of epsilon, higher means a slower decay +# TAU is the update rate of the target network +# LR is the learning rate of the AdamW optimizer BATCH_SIZE = 128 GAMMA = 0.99 EPS_START = 0.9 @@ -274,7 +275,7 @@ def select_action(state): # found, so we pick action with the larger expected reward. return policy_net(state).max(1)[1].view(1, 1) else: - return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long) + return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long) episode_durations = [] @@ -312,11 +313,9 @@ def plot_durations(): # :math:`V(s_{t+1}) = \max_a Q(s_{t+1}, a)`, and combines them into our # loss. By definition we set :math:`V(s) = 0` if :math:`s` is a terminal # state. We also use a target network to compute :math:`V(s_{t+1})` for -# added stability. The target network has its weights kept frozen most of -# the time, but is updated with the policy network's weights every so often. -# This is usually a set number of steps but we shall use episodes for -# simplicity. -# +# added stability. The target network is updated at every step with a +# `soft update `__ controlled by +# the hyperparameter ``TAU``, which was previously defined. def optimize_model(): if len(memory) < BATCH_SIZE: @@ -368,15 +367,13 @@ def optimize_model(): ###################################################################### # # Below, you can find the main training loop. At the beginning we reset -# the environment and initialize the ``state`` Tensor. Then, we sample -# an action, execute it, observe the next screen and the reward (always +# the environment and obtain the initial ``state`` Tensor. Then, we sample +# an action, execute it, observe the next state and the reward (always # 1), and optimize our model once. When the episode ends (our model # fails), we restart the loop. # -# Below, `num_episodes` is set small. You should download -# the notebook and run lot more epsiodes, such as 300+ for meaningful -# duration improvements. -# +# Below, `num_episodes` to 1000, but you should the model constantly +# achieve 500 steps within 600 training episodes. num_episodes = 1000 for i_episode in range(num_episodes): @@ -404,8 +401,7 @@ def optimize_model(): # Perform one step of the optimization (on the policy network) optimize_model() - # Soft update of the target network's weights at every step - # https://arxiv.org/pdf/1509.02971.pdf + # Soft update of the target network's weights # θ′ ← τ θ + (1 −τ )θ′ target_net_state_dict = target_net.state_dict() policy_net_state_dict = policy_net.state_dict() From 8e507a2e079ffcd5b8968fc69f9b03b9eb919785 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Sat, 24 Sep 2022 17:39:39 -0400 Subject: [PATCH 08/18] minor doc change. 
removed hard coding of network input size --- intermediate_source/reinforcement_q_learning.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 328fa34a3cd..42ab5ad436c 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -66,7 +66,7 @@ import torch.optim as optim import torch.nn.functional as F -# Wrap the environment to limit the number of steps per episode +# Wrap the environment to limit the maximum number of steps per episode env = TimeLimit(gym.make('CartPole-v1'), max_episode_steps=500) # set up matplotlib @@ -198,11 +198,11 @@ def __len__(self): class DQN(nn.Module): - def __init__(self, outputs): + def __init__(self, n_observations, n_actions): super(DQN, self).__init__() - self.layer1 = nn.Linear(4, 128) + self.layer1 = nn.Linear(n_observations, 128) self.layer2 = nn.Linear(128, 128) - self.layer3 = nn.Linear(128, outputs) + self.layer3 = nn.Linear(128, n_actions) # Called with either one element to determine next action, or a batch # during optimization. Returns tensor([[left0exp,right0exp]...]). @@ -250,9 +250,12 @@ def forward(self, x): # Get number of actions from gym action space n_actions = env.action_space.n +# Get the number of state observations +state, _ = env.reset() +n_observations = len(state) -policy_net = DQN(n_actions).to(device) -target_net = DQN(n_actions).to(device) +policy_net = DQN(n_observations, n_actions).to(device) +target_net = DQN(n_observations, n_actions).to(device) target_net.load_state_dict(policy_net.state_dict()) optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True) From 0f0c07bcf8236d085718ff7ec7905d0814115b56 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Sun, 25 Sep 2022 12:47:52 -0400 Subject: [PATCH 09/18] removed unneeded timelimit wrapper --- intermediate_source/reinforcement_q_learning.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 42ab5ad436c..10f8ca19a58 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -66,8 +66,7 @@ import torch.optim as optim import torch.nn.functional as F -# Wrap the environment to limit the maximum number of steps per episode -env = TimeLimit(gym.make('CartPole-v1'), max_episode_steps=500) +env = gym.make('CartPole-v1') # set up matplotlib is_ipython = 'inline' in matplotlib.get_backend() From d3bff00f93185e6a8501fb26c688e9d932a5a7c4 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Tue, 27 Sep 2022 19:07:18 -0400 Subject: [PATCH 10/18] Fixed termination vs truncation behavior. 
Remove the timelimit import that wasn't used anymore --- intermediate_source/reinforcement_q_learning.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 10f8ca19a58..da67325ada3 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -52,7 +52,6 @@ """ import gym -from gym.wrappers import TimeLimit import math import random import numpy as np @@ -386,8 +385,7 @@ def optimize_model(): action = select_action(state) observation, reward, terminated, truncated, _ = env.step(action.item()) reward = torch.tensor([reward], device=device) - if truncated: - terminated = True + done = terminated or truncated if terminated: next_state = None @@ -411,7 +409,7 @@ def optimize_model(): target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU) target_net.load_state_dict(target_net_state_dict) - if terminated: + if done: episode_durations.append(t + 1) plot_durations() break From 70bffde64f77c33e63c970cf8c1bd8590c56ca1b Mon Sep 17 00:00:00 2001 From: SiftingSands <43226539+SiftingSands@users.noreply.github.com> Date: Wed, 28 Sep 2022 13:12:46 -0400 Subject: [PATCH 11/18] Added missing # to see if the webpage will rende --- intermediate_source/reinforcement_q_learning.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index da67325ada3..9450ae50b2d 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -230,6 +230,7 @@ def forward(self, x): # the official evaluations). The plot will be underneath the cell # containing the main training loop, and will update after every # episode. +# # BATCH_SIZE is the number of transitions sampled from the replay buffer # GAMMA is the discount factor as mentioned in the previous section @@ -316,7 +317,8 @@ def plot_durations(): # state. We also use a target network to compute :math:`V(s_{t+1})` for # added stability. The target network is updated at every step with a # `soft update `__ controlled by -# the hyperparameter ``TAU``, which was previously defined. +# the hyperparameter ``TAU``, which was previously defined. +# def optimize_model(): if len(memory) < BATCH_SIZE: @@ -375,6 +377,7 @@ def optimize_model(): # # Below, `num_episodes` to 1000, but you should the model constantly # achieve 500 steps within 600 training episodes. 
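The `terminated`/`truncated` split that PATCH 10 introduces matters for the Q-learning target: termination (the pole falls or the cart leaves the track) is a true terminal state whose value is defined as zero, whereas truncation only means the episode hit its step limit, so the next state is still worth bootstrapping from. A sketch of that pattern with the reasoning spelled out in comments, assuming the same gym step API and tensors used above:

    observation, reward, terminated, truncated, _ = env.step(action.item())
    done = terminated or truncated    # either condition ends the episode loop
    if terminated:
        next_state = None             # terminal state: V(s') is defined as 0
    else:
        # truncated (time limit) or still running: keep the observation so the
        # target network can still bootstrap from this state
        next_state = torch.tensor(observation, dtype=torch.float32,
                                  device=device).unsqueeze(0)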
+# num_episodes = 1000 for i_episode in range(num_episodes): From 04c3b320d3358593a5006a77f26f9982929e8ede Mon Sep 17 00:00:00 2001 From: SiftingSands <43226539+SiftingSands@users.noreply.github.com> Date: Wed, 28 Sep 2022 13:54:43 -0400 Subject: [PATCH 12/18] Remove comment w/ special chars for webpage render --- intermediate_source/reinforcement_q_learning.py | 1 - 1 file changed, 1 deletion(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 9450ae50b2d..761e118121e 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -405,7 +405,6 @@ def optimize_model(): optimize_model() # Soft update of the target network's weights - # θ′ ← τ θ + (1 −τ )θ′ target_net_state_dict = target_net.state_dict() policy_net_state_dict = policy_net.state_dict() for key in policy_net_state_dict: From 26f774f088dff278fc60b8bb00b6b41f923c069d Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Wed, 28 Sep 2022 19:09:50 -0400 Subject: [PATCH 13/18] Naive removal of block beginning with comments to see if it fixes the webpage --- intermediate_source/reinforcement_q_learning.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 761e118121e..a735164a6c1 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -232,13 +232,6 @@ def forward(self, x): # episode. # -# BATCH_SIZE is the number of transitions sampled from the replay buffer -# GAMMA is the discount factor as mentioned in the previous section -# EPS_START is the starting value of epsilon -# EPS_END is the final value of epsilon -# EPS_DECAY controls the rate of exponential decay of epsilon, higher means a slower decay -# TAU is the update rate of the target network -# LR is the learning rate of the AdamW optimizer BATCH_SIZE = 128 GAMMA = 0.99 EPS_START = 0.9 From 2e629a6b0bbab4ba8e937c7aa4743d01120d7cd9 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Wed, 28 Sep 2022 19:36:42 -0400 Subject: [PATCH 14/18] undid more docstring changes for testing --- .../reinforcement_q_learning.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index a735164a6c1..a5725ead356 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -135,11 +135,9 @@ def __len__(self): # :math:`R_{t_0} = \sum_{t=t_0}^{\infty} \gamma^{t - t_0} r_t`, where # :math:`R_{t_0}` is also known as the *return*. The discount, # :math:`\gamma`, should be a constant between :math:`0` and :math:`1` -# that ensures the sum converges. A lower :math:`\gamma` makes -# rewards from the uncertain far future less important for our agent -# than the ones in the near future that it can be fairly confident -# about. It also encourages agents to collect reward closer in time -# than equivalent rewards temporally future away. +# that ensures the sum converges. It makes rewards from the uncertain far +# future less important for our agent than the ones in the near future +# that it can be fairly confident about. 
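Since CartPole hands out a reward of 1 on every step, the return defined above can be evaluated directly, which makes the effect of lowering ``GAMMA`` from 0.999 to 0.99 (PATCH 06) concrete. An illustrative snippet (`discounted_return` is a throwaway helper, not part of the tutorial):

    def discounted_return(rewards, gamma):
        # R = sum over t of gamma**t * r_t, starting from the current step
        return sum(gamma ** t * r for t, r in enumerate(rewards))

    print(discounted_return([1.0] * 500, 0.99))   # ~99.3: far-future steps barely count
    print(discounted_return([1.0] * 500, 0.999))  # ~393.6: much closer to the undiscounted 500

The lower discount keeps the bootstrapped targets in a smaller range while still rewarding long episodes.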
# # The main idea behind Q-learning is that if we had a function # :math:`Q^*: State \times Action \rightarrow \mathbb{R}`, that could tell @@ -162,7 +160,7 @@ def __len__(self): # The difference between the two sides of the equality is known as the # temporal difference error, :math:`\delta`: # -# .. math:: \delta = Q(s, a) - (r + \gamma \max_a' Q(s', a)) +# .. math:: \delta = Q(s, a) - (r + \gamma \max_a Q(s', a)) # # To minimise this error, we will use the `Huber # loss `__. The Huber loss acts @@ -308,9 +306,10 @@ def plot_durations(): # :math:`V(s_{t+1}) = \max_a Q(s_{t+1}, a)`, and combines them into our # loss. By definition we set :math:`V(s) = 0` if :math:`s` is a terminal # state. We also use a target network to compute :math:`V(s_{t+1})` for -# added stability. The target network is updated at every step with a -# `soft update `__ controlled by -# the hyperparameter ``TAU``, which was previously defined. +# added stability. The target network has its weights kept frozen most of +# the time, but is updated with the policy network's weights every so often. +# This is usually a set number of steps but we shall use episodes for +# simplicity. # def optimize_model(): @@ -368,8 +367,9 @@ def optimize_model(): # 1), and optimize our model once. When the episode ends (our model # fails), we restart the loop. # -# Below, `num_episodes` to 1000, but you should the model constantly -# achieve 500 steps within 600 training episodes. +# Below, `num_episodes` is set small. You should download +# the notebook and run lot more epsiodes, such as 300+ for meaningful +# duration improvements. # num_episodes = 1000 From 068260df45102e471c7adeafc75cd72fc9b3ed9e Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Wed, 28 Sep 2022 19:55:25 -0400 Subject: [PATCH 15/18] undid the last few plausible docstring changes and added the input extraction section back in for testing --- .../reinforcement_q_learning.py | 68 ++++++++++++++++--- 1 file changed, 60 insertions(+), 8 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index a5725ead356..1f6946bbf72 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -30,24 +30,30 @@ The CartPole task is designed so that the inputs to the agent are 4 real values representing the environment state (position, velocity, etc.). -We take these 4 inputs without any scaling and pass them through a -small fully-connected network with 2 outputs, one for each action. -The network is trained to predict the expected value for each action, -given the input state. The action with the highest expected value is -then chosen. +However, neural networks can solve the task purely by looking at the +scene, so we'll use a patch of the screen centered on the cart as an +input. Because of this, our results aren't directly comparable to the +ones from the official leaderboard - our task is much harder. +Unfortunately this does slow down the training, because we have to +render all the frames. +Strictly speaking, we will present the state as the difference between +the current screen patch and the previous one. This will allow the agent +to take the velocity of the pole into account from one image. **Packages** First, let's import needed packages. Firstly, we need `gym `__ for the environment -(Install using `pip install gym`). Developed on v0.26.1 of gym. +(Install using `pip install gym`). 
We'll also use the following from PyTorch: - neural networks (``torch.nn``) - optimization (``torch.optim``) - automatic differentiation (``torch.autograd``) +- utilities for vision tasks (``torchvision`` - `a separate + package `__). """ @@ -208,6 +214,52 @@ def forward(self, x): return self.layer3(x) +###################################################################### +# Input extraction +# ^^^^^^^^^^^^^^^^ +# +# The code below are utilities for extracting and processing rendered +# images from the environment. It uses the ``torchvision`` package, which +# makes it easy to compose image transforms. Once you run the cell it will +# display an example patch that it extracted. +# + +resize = T.Compose([T.ToPILImage(), + T.Resize(40, interpolation=Image.CUBIC), + T.ToTensor()]) + + +def get_cart_location(screen_width): + world_width = env.x_threshold * 2 + scale = screen_width / world_width + return int(env.state[0] * scale + screen_width / 2.0) # MIDDLE OF CART + +def get_screen(): + # Returned screen requested by gym is 400x600x3, but is sometimes larger + # such as 800x1200x3. Transpose it into torch order (CHW). + screen = env.render().transpose((2, 0, 1)) + # Cart is in the lower half, so strip off the top and bottom of the screen + _, screen_height, screen_width = screen.shape + screen = screen[:, int(screen_height*0.4):int(screen_height * 0.8)] + view_width = int(screen_width * 0.6) + cart_location = get_cart_location(screen_width) + if cart_location < view_width // 2: + slice_range = slice(view_width) + elif cart_location > (screen_width - view_width // 2): + slice_range = slice(-view_width, None) + else: + slice_range = slice(cart_location - view_width // 2, + cart_location + view_width // 2) + # Strip off the edges, so that we have a square image centered on a cart + screen = screen[:, :, slice_range] + # Convert to float, rescale, convert to torch tensor + # (this doesn't require a copy) + screen = np.ascontiguousarray(screen, dtype=np.float32) / 255 + screen = torch.from_numpy(screen) + # Resize, and add a batch dimension (BCHW) + return resize(screen).unsqueeze(0) + + ###################################################################### # Training # -------- @@ -362,8 +414,8 @@ def optimize_model(): ###################################################################### # # Below, you can find the main training loop. At the beginning we reset -# the environment and obtain the initial ``state`` Tensor. Then, we sample -# an action, execute it, observe the next state and the reward (always +# the environment and initialize the ``state`` Tensor. Then, we sample +# an action, execute it, observe the next screen and the reward (always # 1), and optimize our model once. When the episode ends (our model # fails), we restart the loop. 
# From 80d133a4ccd28ae1e18d2021b5a5c87b0bc80973 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Sat, 1 Oct 2022 11:43:23 -0400 Subject: [PATCH 16/18] minor docstring changes --- intermediate_source/reinforcement_q_learning.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 9450ae50b2d..af03572dc95 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -284,7 +284,7 @@ def select_action(state): def plot_durations(): - plt.figure(2) + plt.figure(1) plt.clf() durations_t = torch.tensor(episode_durations, dtype=torch.float) plt.title('Training...') @@ -375,7 +375,7 @@ def optimize_model(): # 1), and optimize our model once. When the episode ends (our model # fails), we restart the loop. # -# Below, `num_episodes` to 1000, but you should the model constantly +# Below, `num_episodes` to 1000, but you should see the model constantly # achieve 500 steps within 600 training episodes. # @@ -430,6 +430,6 @@ def optimize_model(): # step sample from the gym environment. We record the results in the # replay memory and also run optimization step on every iteration. # Optimization picks a random batch from the replay memory to do training of the -# new policy. "Older" target_net is also used in optimization to compute the -# expected Q values; it is updated occasionally to keep it current. +# new policy. The "older" target_net is also used in optimization to compute the +# expected Q values. A soft update of its weights are performed at every step. # From b3d65d03b654a63f434374630e46361782e29fc5 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Tue, 1 Nov 2022 21:48:26 -0400 Subject: [PATCH 17/18] gym version handling authored by https://github.com/pseudo-rnd-thoughts; hardware dependent # episodes w/ more writing --- .../reinforcement_q_learning.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index af03572dc95..38676a84479 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -65,7 +65,12 @@ import torch.optim as optim import torch.nn.functional as F -env = gym.make('CartPole-v1') +if gym.__version__[:4] == '0.26': + env = gym.make('CartPole-v1') +elif gym.__version__[:4] == '0.25': + env = gym.make('CartPole-v1', new_step_api=True) +else: + raise ImportError(f"Requires gym v25 or v26, actual version: {gym.__version__}") # set up matplotlib is_ipython = 'inline' in matplotlib.get_backend() @@ -375,11 +380,19 @@ def optimize_model(): # 1), and optimize our model once. When the episode ends (our model # fails), we restart the loop. # -# Below, `num_episodes` to 1000, but you should see the model constantly -# achieve 500 steps within 600 training episodes. +# Below, `num_episodes` is set to 600 if a GPU is available, otherwise 50 +# episodes are scheduled so training does not take too long. However, 50 +# episodes is insufficient for to observe good performance on cartpole. +# You should see the model constantly achieve 500 steps within 600 training +# episodes. Training RL agents can be a noisy process, so restarting training +# can produce better results if convergence is not observed. 
# -num_episodes = 1000 +if torch.cuda.is_available(): + num_episodes = 600 +else: + num_episodes = 50 + for i_episode in range(num_episodes): # Initialize the environment and get it's state state, _ = env.reset() From 1015af66648e39c08a9fce2558ea9e0150a1d3a8 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Fri, 4 Nov 2022 20:45:27 -0400 Subject: [PATCH 18/18] more version handling for v.25 and v.26 --- intermediate_source/reinforcement_q_learning.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index b33d7abd1b0..4095f90206c 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -261,7 +261,10 @@ def forward(self, x): # Get number of actions from gym action space n_actions = env.action_space.n # Get the number of state observations -state, _ = env.reset() +if gym.__version__[:4] == '0.26': + state, _ = env.reset() +elif gym.__version__[:4] == '0.25': + state, _ = env.reset(return_info=True) n_observations = len(state) policy_net = DQN(n_observations, n_actions).to(device) @@ -401,7 +404,10 @@ def optimize_model(): for i_episode in range(num_episodes): # Initialize the environment and get it's state - state, _ = env.reset() + if gym.__version__[:4] == '0.26': + state, _ = env.reset() + elif gym.__version__[:4] == '0.25': + state, _ = env.reset(return_info=True) state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0) for t in count(): action = select_action(state)
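With PATCH 17 and PATCH 18 in place, the version-dependent `reset` call appears both at setup time and at the start of every episode. One way to avoid the duplication would be a small helper; this is only a sketch (the `reset_env` name is made up here), covering the same v0.25/v0.26 cases as the checks above:

    def reset_env(env):
        # gym v0.26 returns (observation, info) by default; v0.25 needs
        # return_info=True to produce the same tuple.
        if gym.__version__[:4] == '0.26':
            return env.reset()
        elif gym.__version__[:4] == '0.25':
            return env.reset(return_info=True)
        raise ImportError(f"Requires gym v25 or v26, actual version: {gym.__version__}")

    # Usage at both call sites:
    state, _ = reset_env(env)
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)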