From d8d7f406783cf1b94c9642a40f10dcfe984b532c Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Tue, 6 Sep 2022 23:34:47 -0400 Subject: [PATCH 01/18] increased model capacity and input resolution --- intermediate_source/reinforcement_q_learning.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index cb9abc229c9..acbdef9c0e6 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -205,12 +205,12 @@ class DQN(nn.Module): def __init__(self, h, w, outputs): super(DQN, self).__init__() - self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2) + self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=2) self.bn1 = nn.BatchNorm2d(16) self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2) self.bn2 = nn.BatchNorm2d(32) - self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2) - self.bn3 = nn.BatchNorm2d(32) + self.conv3 = nn.Conv2d(32, 64, kernel_size=5, stride=2) + self.bn3 = nn.BatchNorm2d(64) # Number of Linear input connections depends on output of conv2d layers # and therefore the input image size, so compute it. @@ -218,7 +218,7 @@ def conv2d_size_out(size, kernel_size = 5, stride = 2): return (size - (kernel_size - 1) - 1) // stride + 1 convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w))) convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h))) - linear_input_size = convw * convh * 32 + linear_input_size = convw * convh * 64 self.head = nn.Linear(linear_input_size, outputs) # Called with either one element to determine next action, or a batch @@ -242,7 +242,7 @@ def forward(self, x): # resize = T.Compose([T.ToPILImage(), - T.Resize(40, interpolation=Image.CUBIC), + T.Resize(64, interpolation=Image.BILINEAR), T.ToTensor()]) From 9f43173868d3308a94e4690b3017db9b18673144 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Tue, 6 Sep 2022 23:36:48 -0400 Subject: [PATCH 02/18] changed out optimizer and greatly increased replay buffer --- intermediate_source/reinforcement_q_learning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index acbdef9c0e6..8cd1fbdf14f 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -328,8 +328,8 @@ def get_screen(): target_net.load_state_dict(policy_net.state_dict()) target_net.eval() -optimizer = optim.RMSprop(policy_net.parameters()) -memory = ReplayMemory(10000) +optimizer = optim.AdamW(policy_net.parameters(), lr=0.0003, weight_decay=0.01, amsgrad=True) +memory = ReplayMemory(100000) steps_done = 0 From ee25c65c4e9433173d7d5706a111e03ee0dbd939 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Tue, 6 Sep 2022 23:42:59 -0400 Subject: [PATCH 03/18] added reward shaping based solely on duration. 
intention was to reduce penalty in the first few episodes to enable more exploration --- intermediate_source/reinforcement_q_learning.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 8cd1fbdf14f..13184e66804 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -461,8 +461,13 @@ def optimize_model(): for t in count(): # Select and perform an action action = select_action(state) - _, reward, done, _, _ = env.step(action.item()) - reward = torch.tensor([reward], device=device) + _, _, done, _, _ = env.step(action.item()) + # Reward shaping + if i_episode < 100: + reward = t * np.clip((i_episode / 500), None, 1) + else: + reward = (t - 50 * (i_episode / 500)) * np.clip((i_episode / 500), None, 1) + reward = torch.tensor([reward], device=device).type(torch.float32) # Observe new state last_screen = current_screen From dffcd1f41ca52b9af4561af2c196e8104d411ab0 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Wed, 7 Sep 2022 21:55:10 -0400 Subject: [PATCH 04/18] forgot to included my increase to EPS_DECAY. Might as well change the # of episodes to match for now --- intermediate_source/reinforcement_q_learning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 13184e66804..f08c0fab671 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -311,7 +311,7 @@ def get_screen(): GAMMA = 0.999 EPS_START = 0.9 EPS_END = 0.05 -EPS_DECAY = 200 +EPS_DECAY = 2000 TARGET_UPDATE = 10 # Get screen size so that we can initialize layers correctly based on shape @@ -451,7 +451,7 @@ def optimize_model(): # duration improvements. # -num_episodes = 50 +num_episodes = 1000 for i_episode in range(num_episodes): # Initialize the environment and state env.reset() From d899d8dbd0e2d2a2f62e8f39afca92c392d72ce7 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Wed, 7 Sep 2022 22:02:49 -0400 Subject: [PATCH 05/18] revisions to batch norm behavior as suggested by vmoens --- intermediate_source/reinforcement_q_learning.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index f08c0fab671..e2359d6b1ec 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -326,7 +326,6 @@ def get_screen(): policy_net = DQN(screen_height, screen_width, n_actions).to(device) target_net = DQN(screen_height, screen_width, n_actions).to(device) target_net.load_state_dict(policy_net.state_dict()) -target_net.eval() optimizer = optim.AdamW(policy_net.parameters(), lr=0.0003, weight_decay=0.01, amsgrad=True) memory = ReplayMemory(100000) @@ -422,7 +421,8 @@ def optimize_model(): # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. 
next_state_values = torch.zeros(BATCH_SIZE, device=device) - next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach() + with torch.no_grad(): + next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0] # Compute the expected Q values expected_state_action_values = (next_state_values * GAMMA) + reward_batch @@ -460,7 +460,10 @@ def optimize_model(): state = current_screen - last_screen for t in count(): # Select and perform an action + policy_net.eval() action = select_action(state) + policy_net.train() + _, _, done, _, _ = env.step(action.item()) # Reward shaping if i_episode < 100: From 2a59217cb6a0f9f3809d77c657254e911abba2b4 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Sat, 24 Sep 2022 16:28:04 -0400 Subject: [PATCH 06/18] switched to state vector input and modified hyps so training suceeds in a few 100 episodes. removed all code related to image processing. added timelimit wrapper. added soft updates. --- .../reinforcement_q_learning.py | 160 +++++------------- 1 file changed, 40 insertions(+), 120 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index e2359d6b1ec..1cf10434e3c 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -58,6 +58,7 @@ """ import gym +from gym.wrappers import TimeLimit import math import random import numpy as np @@ -74,7 +75,7 @@ import torchvision.transforms as T -env = gym.make('CartPole-v0', new_step_api=True, render_mode='single_rgb_array').unwrapped +env = TimeLimit(gym.make('CartPole-v1'), max_episode_steps=500) # set up matplotlib is_ipython = 'inline' in matplotlib.get_backend() @@ -203,86 +204,18 @@ def __len__(self): class DQN(nn.Module): - def __init__(self, h, w, outputs): + def __init__(self, outputs): super(DQN, self).__init__() - self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=2) - self.bn1 = nn.BatchNorm2d(16) - self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2) - self.bn2 = nn.BatchNorm2d(32) - self.conv3 = nn.Conv2d(32, 64, kernel_size=5, stride=2) - self.bn3 = nn.BatchNorm2d(64) - - # Number of Linear input connections depends on output of conv2d layers - # and therefore the input image size, so compute it. - def conv2d_size_out(size, kernel_size = 5, stride = 2): - return (size - (kernel_size - 1) - 1) // stride + 1 - convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w))) - convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h))) - linear_input_size = convw * convh * 64 - self.head = nn.Linear(linear_input_size, outputs) + self.layer1 = nn.Linear(4, 128) + self.layer2 = nn.Linear(128, 128) + self.layer3 = nn.Linear(128, outputs) # Called with either one element to determine next action, or a batch # during optimization. Returns tensor([[left0exp,right0exp]...]). def forward(self, x): - x = x.to(device) - x = F.relu(self.bn1(self.conv1(x))) - x = F.relu(self.bn2(self.conv2(x))) - x = F.relu(self.bn3(self.conv3(x))) - return self.head(x.view(x.size(0), -1)) - - -###################################################################### -# Input extraction -# ^^^^^^^^^^^^^^^^ -# -# The code below are utilities for extracting and processing rendered -# images from the environment. It uses the ``torchvision`` package, which -# makes it easy to compose image transforms. Once you run the cell it will -# display an example patch that it extracted. 
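A quick sanity check for the fully-connected DQN that PATCH 06 introduces above (illustrative only: `net` and `dummy_batch` are throwaway names, and the snippet assumes the `DQN` class and the torch imports from this file). The network should map a batch of CartPole's 4-dimensional state vectors straight to one Q-value per action, with no image processing in between:

    net = DQN(outputs=2)              # CartPole-v1 has two discrete actions
    dummy_batch = torch.zeros(32, 4)  # (batch, observation) input
    q_values = net(dummy_batch)       # three Linear layers with ReLU activations in between
    print(q_values.shape)             # expected: torch.Size([32, 2])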
-# - -resize = T.Compose([T.ToPILImage(), - T.Resize(64, interpolation=Image.BILINEAR), - T.ToTensor()]) - - -def get_cart_location(screen_width): - world_width = env.x_threshold * 2 - scale = screen_width / world_width - return int(env.state[0] * scale + screen_width / 2.0) # MIDDLE OF CART - -def get_screen(): - # Returned screen requested by gym is 400x600x3, but is sometimes larger - # such as 800x1200x3. Transpose it into torch order (CHW). - screen = env.render().transpose((2, 0, 1)) - # Cart is in the lower half, so strip off the top and bottom of the screen - _, screen_height, screen_width = screen.shape - screen = screen[:, int(screen_height*0.4):int(screen_height * 0.8)] - view_width = int(screen_width * 0.6) - cart_location = get_cart_location(screen_width) - if cart_location < view_width // 2: - slice_range = slice(view_width) - elif cart_location > (screen_width - view_width // 2): - slice_range = slice(-view_width, None) - else: - slice_range = slice(cart_location - view_width // 2, - cart_location + view_width // 2) - # Strip off the edges, so that we have a square image centered on a cart - screen = screen[:, :, slice_range] - # Convert to float, rescale, convert to torch tensor - # (this doesn't require a copy) - screen = np.ascontiguousarray(screen, dtype=np.float32) / 255 - screen = torch.from_numpy(screen) - # Resize, and add a batch dimension (BCHW) - return resize(screen).unsqueeze(0) - - -env.reset() -plt.figure() -plt.imshow(get_screen().cpu().squeeze(0).permute(1, 2, 0).numpy(), - interpolation='none') -plt.title('Example extracted screen') -plt.show() + x = F.relu(self.layer1(x)) + x = F.relu(self.layer2(x)) + return self.layer3(x) ###################################################################### @@ -305,30 +238,24 @@ def get_screen(): # the official evaluations). The plot will be underneath the cell # containing the main training loop, and will update after every # episode. -# BATCH_SIZE = 128 -GAMMA = 0.999 +GAMMA = 0.99 EPS_START = 0.9 EPS_END = 0.05 -EPS_DECAY = 2000 -TARGET_UPDATE = 10 - -# Get screen size so that we can initialize layers correctly based on shape -# returned from AI gym. 
Typical dimensions at this point are close to 3x40x90 -# which is the result of a clamped and down-scaled render buffer in get_screen() -init_screen = get_screen() -_, _, screen_height, screen_width = init_screen.shape +EPS_DECAY = 1000 +TAU = 0.005 +LR = 1e-4 # Get number of actions from gym action space n_actions = env.action_space.n -policy_net = DQN(screen_height, screen_width, n_actions).to(device) -target_net = DQN(screen_height, screen_width, n_actions).to(device) +policy_net = DQN(n_actions).to(device) +target_net = DQN(n_actions).to(device) target_net.load_state_dict(policy_net.state_dict()) -optimizer = optim.AdamW(policy_net.parameters(), lr=0.0003, weight_decay=0.01, amsgrad=True) -memory = ReplayMemory(100000) +optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True) +memory = ReplayMemory(10000) steps_done = 0 @@ -433,8 +360,8 @@ def optimize_model(): # Optimize the model optimizer.zero_grad() loss.backward() - for param in policy_net.parameters(): - param.grad.data.clamp_(-1, 1) + # In-place gradient clipping + torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100) optimizer.step() @@ -453,32 +380,20 @@ def optimize_model(): num_episodes = 1000 for i_episode in range(num_episodes): - # Initialize the environment and state - env.reset() - last_screen = get_screen() - current_screen = get_screen() - state = current_screen - last_screen + # Initialize the environment and get it's state + state, _ = env.reset() + state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0) for t in count(): - # Select and perform an action - policy_net.eval() action = select_action(state) - policy_net.train() + observation, reward, terminated, truncated, _ = env.step(action.item()) + reward = torch.tensor([reward], device=device) + if truncated: + terminated = True - _, _, done, _, _ = env.step(action.item()) - # Reward shaping - if i_episode < 100: - reward = t * np.clip((i_episode / 500), None, 1) - else: - reward = (t - 50 * (i_episode / 500)) * np.clip((i_episode / 500), None, 1) - reward = torch.tensor([reward], device=device).type(torch.float32) - - # Observe new state - last_screen = current_screen - current_screen = get_screen() - if not done: - next_state = current_screen - last_screen - else: + if terminated: next_state = None + else: + next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0) # Store the transition in memory memory.push(state, action, next_state, reward) @@ -488,17 +403,22 @@ def optimize_model(): # Perform one step of the optimization (on the policy network) optimize_model() - if done: + + # Soft update of the target network's weights at every step + # https://arxiv.org/pdf/1509.02971.pdf + # θ′ ← τ θ + (1 −τ )θ′ + target_net_state_dict = target_net.state_dict() + policy_net_state_dict = policy_net.state_dict() + for key in policy_net_state_dict: + target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU) + target_net.load_state_dict(target_net_state_dict) + + if terminated: episode_durations.append(t + 1) plot_durations() break - # Update the target network, copying all weights and biases in DQN - if i_episode % TARGET_UPDATE == 0: - target_net.load_state_dict(policy_net.state_dict()) print('Complete') -env.render() -env.close() plt.ioff() plt.show() From db90afd72769e41dbec8bed78590937f2da0caa8 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Sat, 24 Sep 2022 17:20:59 -0400 Subject: [PATCH 07/18] numerous doc changes. removed unused imports. 
used gym's action space sampler --- .../reinforcement_q_learning.py | 64 +++++++++---------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 1cf10434e3c..328fa34a3cd 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -6,7 +6,7 @@ This tutorial shows how to use PyTorch to train a Deep Q Learning (DQN) agent -on the CartPole-v0 task from the `OpenAI Gym `__. +on the CartPole-v1 task from the `OpenAI Gym `__. **Task** @@ -30,30 +30,24 @@ The CartPole task is designed so that the inputs to the agent are 4 real values representing the environment state (position, velocity, etc.). -However, neural networks can solve the task purely by looking at the -scene, so we'll use a patch of the screen centered on the cart as an -input. Because of this, our results aren't directly comparable to the -ones from the official leaderboard - our task is much harder. -Unfortunately this does slow down the training, because we have to -render all the frames. +We take these 4 inputs without any scaling and pass them through a +small fully-connected network with 2 outputs, one for each action. +The network is trained to predict the expected value for each action, +given the input state. The action with the highest expected value is +then chosen. -Strictly speaking, we will present the state as the difference between -the current screen patch and the previous one. This will allow the agent -to take the velocity of the pole into account from one image. **Packages** First, let's import needed packages. Firstly, we need `gym `__ for the environment -(Install using `pip install gym`). +(Install using `pip install gym`). Developed on v0.26.1 of gym. We'll also use the following from PyTorch: - neural networks (``torch.nn``) - optimization (``torch.optim``) - automatic differentiation (``torch.autograd``) -- utilities for vision tasks (``torchvision`` - `a separate - package `__). """ @@ -66,15 +60,13 @@ import matplotlib.pyplot as plt from collections import namedtuple, deque from itertools import count -from PIL import Image import torch import torch.nn as nn import torch.optim as optim import torch.nn.functional as F -import torchvision.transforms as T - +# Wrap the environment to limit the number of steps per episode env = TimeLimit(gym.make('CartPole-v1'), max_episode_steps=500) # set up matplotlib @@ -145,9 +137,11 @@ def __len__(self): # :math:`R_{t_0} = \sum_{t=t_0}^{\infty} \gamma^{t - t_0} r_t`, where # :math:`R_{t_0}` is also known as the *return*. The discount, # :math:`\gamma`, should be a constant between :math:`0` and :math:`1` -# that ensures the sum converges. It makes rewards from the uncertain far -# future less important for our agent than the ones in the near future -# that it can be fairly confident about. +# that ensures the sum converges. A lower :math:`\gamma` makes +# rewards from the uncertain far future less important for our agent +# than the ones in the near future that it can be fairly confident +# about. It also encourages agents to collect reward closer in time +# than equivalent rewards temporally future away. # # The main idea behind Q-learning is that if we had a function # :math:`Q^*: State \times Action \rightarrow \mathbb{R}`, that could tell @@ -170,7 +164,7 @@ def __len__(self): # The difference between the two sides of the equality is known as the # temporal difference error, :math:`\delta`: # -# .. 
math:: \delta = Q(s, a) - (r + \gamma \max_a Q(s', a)) +# .. math:: \delta = Q(s, a) - (r + \gamma \max_a' Q(s', a)) # # To minimise this error, we will use the `Huber # loss `__. The Huber loss acts @@ -239,6 +233,13 @@ def forward(self, x): # containing the main training loop, and will update after every # episode. +# BATCH_SIZE is the number of transitions sampled from the replay buffer +# GAMMA is the discount factor as mentioned in the previous section +# EPS_START is the starting value of epsilon +# EPS_END is the final value of epsilon +# EPS_DECAY controls the rate of exponential decay of epsilon, higher means a slower decay +# TAU is the update rate of the target network +# LR is the learning rate of the AdamW optimizer BATCH_SIZE = 128 GAMMA = 0.99 EPS_START = 0.9 @@ -274,7 +275,7 @@ def select_action(state): # found, so we pick action with the larger expected reward. return policy_net(state).max(1)[1].view(1, 1) else: - return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long) + return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long) episode_durations = [] @@ -312,11 +313,9 @@ def plot_durations(): # :math:`V(s_{t+1}) = \max_a Q(s_{t+1}, a)`, and combines them into our # loss. By definition we set :math:`V(s) = 0` if :math:`s` is a terminal # state. We also use a target network to compute :math:`V(s_{t+1})` for -# added stability. The target network has its weights kept frozen most of -# the time, but is updated with the policy network's weights every so often. -# This is usually a set number of steps but we shall use episodes for -# simplicity. -# +# added stability. The target network is updated at every step with a +# `soft update `__ controlled by +# the hyperparameter ``TAU``, which was previously defined. def optimize_model(): if len(memory) < BATCH_SIZE: @@ -368,15 +367,13 @@ def optimize_model(): ###################################################################### # # Below, you can find the main training loop. At the beginning we reset -# the environment and initialize the ``state`` Tensor. Then, we sample -# an action, execute it, observe the next screen and the reward (always +# the environment and obtain the initial ``state`` Tensor. Then, we sample +# an action, execute it, observe the next state and the reward (always # 1), and optimize our model once. When the episode ends (our model # fails), we restart the loop. # -# Below, `num_episodes` is set small. You should download -# the notebook and run lot more epsiodes, such as 300+ for meaningful -# duration improvements. -# +# Below, `num_episodes` to 1000, but you should the model constantly +# achieve 500 steps within 600 training episodes. num_episodes = 1000 for i_episode in range(num_episodes): @@ -404,8 +401,7 @@ def optimize_model(): # Perform one step of the optimization (on the policy network) optimize_model() - # Soft update of the target network's weights at every step - # https://arxiv.org/pdf/1509.02971.pdf + # Soft update of the target network's weights # θ′ ← τ θ + (1 −τ )θ′ target_net_state_dict = target_net.state_dict() policy_net_state_dict = policy_net.state_dict() From 8e507a2e079ffcd5b8968fc69f9b03b9eb919785 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Sat, 24 Sep 2022 17:39:39 -0400 Subject: [PATCH 08/18] minor doc change. 
removed hard coding of network input size --- intermediate_source/reinforcement_q_learning.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 328fa34a3cd..42ab5ad436c 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -66,7 +66,7 @@ import torch.optim as optim import torch.nn.functional as F -# Wrap the environment to limit the number of steps per episode +# Wrap the environment to limit the maximum number of steps per episode env = TimeLimit(gym.make('CartPole-v1'), max_episode_steps=500) # set up matplotlib @@ -198,11 +198,11 @@ def __len__(self): class DQN(nn.Module): - def __init__(self, outputs): + def __init__(self, n_observations, n_actions): super(DQN, self).__init__() - self.layer1 = nn.Linear(4, 128) + self.layer1 = nn.Linear(n_observations, 128) self.layer2 = nn.Linear(128, 128) - self.layer3 = nn.Linear(128, outputs) + self.layer3 = nn.Linear(128, n_actions) # Called with either one element to determine next action, or a batch # during optimization. Returns tensor([[left0exp,right0exp]...]). @@ -250,9 +250,12 @@ def forward(self, x): # Get number of actions from gym action space n_actions = env.action_space.n +# Get the number of state observations +state, _ = env.reset() +n_observations = len(state) -policy_net = DQN(n_actions).to(device) -target_net = DQN(n_actions).to(device) +policy_net = DQN(n_observations, n_actions).to(device) +target_net = DQN(n_observations, n_actions).to(device) target_net.load_state_dict(policy_net.state_dict()) optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True) From 0f0c07bcf8236d085718ff7ec7905d0814115b56 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Sun, 25 Sep 2022 12:47:52 -0400 Subject: [PATCH 09/18] removed unneeded timelimit wrapper --- intermediate_source/reinforcement_q_learning.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 42ab5ad436c..10f8ca19a58 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -66,8 +66,7 @@ import torch.optim as optim import torch.nn.functional as F -# Wrap the environment to limit the maximum number of steps per episode -env = TimeLimit(gym.make('CartPole-v1'), max_episode_steps=500) +env = gym.make('CartPole-v1') # set up matplotlib is_ipython = 'inline' in matplotlib.get_backend() From d3bff00f93185e6a8501fb26c688e9d932a5a7c4 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Tue, 27 Sep 2022 19:07:18 -0400 Subject: [PATCH 10/18] Fixed termination vs truncation behavior. 
Remove the timelimit import that wasn't used anymore --- intermediate_source/reinforcement_q_learning.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 10f8ca19a58..da67325ada3 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -52,7 +52,6 @@ """ import gym -from gym.wrappers import TimeLimit import math import random import numpy as np @@ -386,8 +385,7 @@ def optimize_model(): action = select_action(state) observation, reward, terminated, truncated, _ = env.step(action.item()) reward = torch.tensor([reward], device=device) - if truncated: - terminated = True + done = terminated or truncated if terminated: next_state = None @@ -411,7 +409,7 @@ def optimize_model(): target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU) target_net.load_state_dict(target_net_state_dict) - if terminated: + if done: episode_durations.append(t + 1) plot_durations() break From 70bffde64f77c33e63c970cf8c1bd8590c56ca1b Mon Sep 17 00:00:00 2001 From: SiftingSands <43226539+SiftingSands@users.noreply.github.com> Date: Wed, 28 Sep 2022 13:12:46 -0400 Subject: [PATCH 11/18] Added missing # to see if the webpage will rende --- intermediate_source/reinforcement_q_learning.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index da67325ada3..9450ae50b2d 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -230,6 +230,7 @@ def forward(self, x): # the official evaluations). The plot will be underneath the cell # containing the main training loop, and will update after every # episode. +# # BATCH_SIZE is the number of transitions sampled from the replay buffer # GAMMA is the discount factor as mentioned in the previous section @@ -316,7 +317,8 @@ def plot_durations(): # state. We also use a target network to compute :math:`V(s_{t+1})` for # added stability. The target network is updated at every step with a # `soft update `__ controlled by -# the hyperparameter ``TAU``, which was previously defined. +# the hyperparameter ``TAU``, which was previously defined. +# def optimize_model(): if len(memory) < BATCH_SIZE: @@ -375,6 +377,7 @@ def optimize_model(): # # Below, `num_episodes` to 1000, but you should the model constantly # achieve 500 steps within 600 training episodes. 
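The `terminated`/`truncated` split that PATCH 10 introduces matters for the Q-learning target: termination (the pole falls or the cart leaves the track) is a true terminal state whose value is defined as zero, whereas truncation only means the episode hit its step limit, so the next state is still worth bootstrapping from. A sketch of that pattern with the reasoning spelled out in comments, assuming the same gym step API and tensors used above:

    observation, reward, terminated, truncated, _ = env.step(action.item())
    done = terminated or truncated    # either condition ends the episode loop
    if terminated:
        next_state = None             # terminal state: V(s') is defined as 0
    else:
        # truncated (time limit) or still running: keep the observation so the
        # target network can still bootstrap from this state
        next_state = torch.tensor(observation, dtype=torch.float32,
                                  device=device).unsqueeze(0)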
+# num_episodes = 1000 for i_episode in range(num_episodes): From 04c3b320d3358593a5006a77f26f9982929e8ede Mon Sep 17 00:00:00 2001 From: SiftingSands <43226539+SiftingSands@users.noreply.github.com> Date: Wed, 28 Sep 2022 13:54:43 -0400 Subject: [PATCH 12/18] Remove comment w/ special chars for webpage render --- intermediate_source/reinforcement_q_learning.py | 1 - 1 file changed, 1 deletion(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 9450ae50b2d..761e118121e 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -405,7 +405,6 @@ def optimize_model(): optimize_model() # Soft update of the target network's weights - # θ′ ← τ θ + (1 −τ )θ′ target_net_state_dict = target_net.state_dict() policy_net_state_dict = policy_net.state_dict() for key in policy_net_state_dict: From 26f774f088dff278fc60b8bb00b6b41f923c069d Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Wed, 28 Sep 2022 19:09:50 -0400 Subject: [PATCH 13/18] Naive removal of block beginning with comments to see if it fixes the webpage --- intermediate_source/reinforcement_q_learning.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 761e118121e..a735164a6c1 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -232,13 +232,6 @@ def forward(self, x): # episode. # -# BATCH_SIZE is the number of transitions sampled from the replay buffer -# GAMMA is the discount factor as mentioned in the previous section -# EPS_START is the starting value of epsilon -# EPS_END is the final value of epsilon -# EPS_DECAY controls the rate of exponential decay of epsilon, higher means a slower decay -# TAU is the update rate of the target network -# LR is the learning rate of the AdamW optimizer BATCH_SIZE = 128 GAMMA = 0.99 EPS_START = 0.9 From 2e629a6b0bbab4ba8e937c7aa4743d01120d7cd9 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Wed, 28 Sep 2022 19:36:42 -0400 Subject: [PATCH 14/18] undid more docstring changes for testing --- .../reinforcement_q_learning.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index a735164a6c1..a5725ead356 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -135,11 +135,9 @@ def __len__(self): # :math:`R_{t_0} = \sum_{t=t_0}^{\infty} \gamma^{t - t_0} r_t`, where # :math:`R_{t_0}` is also known as the *return*. The discount, # :math:`\gamma`, should be a constant between :math:`0` and :math:`1` -# that ensures the sum converges. A lower :math:`\gamma` makes -# rewards from the uncertain far future less important for our agent -# than the ones in the near future that it can be fairly confident -# about. It also encourages agents to collect reward closer in time -# than equivalent rewards temporally future away. +# that ensures the sum converges. It makes rewards from the uncertain far +# future less important for our agent than the ones in the near future +# that it can be fairly confident about. 
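Since CartPole hands out a reward of 1 on every step, the return defined above can be evaluated directly, which makes the effect of lowering ``GAMMA`` from 0.999 to 0.99 (PATCH 06) concrete. An illustrative snippet (`discounted_return` is a throwaway helper, not part of the tutorial):

    def discounted_return(rewards, gamma):
        # R = sum over t of gamma**t * r_t, starting from the current step
        return sum(gamma ** t * r for t, r in enumerate(rewards))

    print(discounted_return([1.0] * 500, 0.99))   # ~99.3: far-future steps barely count
    print(discounted_return([1.0] * 500, 0.999))  # ~393.6: much closer to the undiscounted 500

The lower discount keeps the bootstrapped targets in a smaller range while still rewarding long episodes.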
# # The main idea behind Q-learning is that if we had a function # :math:`Q^*: State \times Action \rightarrow \mathbb{R}`, that could tell @@ -162,7 +160,7 @@ def __len__(self): # The difference between the two sides of the equality is known as the # temporal difference error, :math:`\delta`: # -# .. math:: \delta = Q(s, a) - (r + \gamma \max_a' Q(s', a)) +# .. math:: \delta = Q(s, a) - (r + \gamma \max_a Q(s', a)) # # To minimise this error, we will use the `Huber # loss `__. The Huber loss acts @@ -308,9 +306,10 @@ def plot_durations(): # :math:`V(s_{t+1}) = \max_a Q(s_{t+1}, a)`, and combines them into our # loss. By definition we set :math:`V(s) = 0` if :math:`s` is a terminal # state. We also use a target network to compute :math:`V(s_{t+1})` for -# added stability. The target network is updated at every step with a -# `soft update `__ controlled by -# the hyperparameter ``TAU``, which was previously defined. +# added stability. The target network has its weights kept frozen most of +# the time, but is updated with the policy network's weights every so often. +# This is usually a set number of steps but we shall use episodes for +# simplicity. # def optimize_model(): @@ -368,8 +367,9 @@ def optimize_model(): # 1), and optimize our model once. When the episode ends (our model # fails), we restart the loop. # -# Below, `num_episodes` to 1000, but you should the model constantly -# achieve 500 steps within 600 training episodes. +# Below, `num_episodes` is set small. You should download +# the notebook and run lot more epsiodes, such as 300+ for meaningful +# duration improvements. # num_episodes = 1000 From 068260df45102e471c7adeafc75cd72fc9b3ed9e Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Wed, 28 Sep 2022 19:55:25 -0400 Subject: [PATCH 15/18] undid the last few plausible docstring changes and added the input extraction section back in for testing --- .../reinforcement_q_learning.py | 68 ++++++++++++++++--- 1 file changed, 60 insertions(+), 8 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index a5725ead356..1f6946bbf72 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -30,24 +30,30 @@ The CartPole task is designed so that the inputs to the agent are 4 real values representing the environment state (position, velocity, etc.). -We take these 4 inputs without any scaling and pass them through a -small fully-connected network with 2 outputs, one for each action. -The network is trained to predict the expected value for each action, -given the input state. The action with the highest expected value is -then chosen. +However, neural networks can solve the task purely by looking at the +scene, so we'll use a patch of the screen centered on the cart as an +input. Because of this, our results aren't directly comparable to the +ones from the official leaderboard - our task is much harder. +Unfortunately this does slow down the training, because we have to +render all the frames. +Strictly speaking, we will present the state as the difference between +the current screen patch and the previous one. This will allow the agent +to take the velocity of the pole into account from one image. **Packages** First, let's import needed packages. Firstly, we need `gym `__ for the environment -(Install using `pip install gym`). Developed on v0.26.1 of gym. +(Install using `pip install gym`). 
We'll also use the following from PyTorch: - neural networks (``torch.nn``) - optimization (``torch.optim``) - automatic differentiation (``torch.autograd``) +- utilities for vision tasks (``torchvision`` - `a separate + package `__). """ @@ -208,6 +214,52 @@ def forward(self, x): return self.layer3(x) +###################################################################### +# Input extraction +# ^^^^^^^^^^^^^^^^ +# +# The code below are utilities for extracting and processing rendered +# images from the environment. It uses the ``torchvision`` package, which +# makes it easy to compose image transforms. Once you run the cell it will +# display an example patch that it extracted. +# + +resize = T.Compose([T.ToPILImage(), + T.Resize(40, interpolation=Image.CUBIC), + T.ToTensor()]) + + +def get_cart_location(screen_width): + world_width = env.x_threshold * 2 + scale = screen_width / world_width + return int(env.state[0] * scale + screen_width / 2.0) # MIDDLE OF CART + +def get_screen(): + # Returned screen requested by gym is 400x600x3, but is sometimes larger + # such as 800x1200x3. Transpose it into torch order (CHW). + screen = env.render().transpose((2, 0, 1)) + # Cart is in the lower half, so strip off the top and bottom of the screen + _, screen_height, screen_width = screen.shape + screen = screen[:, int(screen_height*0.4):int(screen_height * 0.8)] + view_width = int(screen_width * 0.6) + cart_location = get_cart_location(screen_width) + if cart_location < view_width // 2: + slice_range = slice(view_width) + elif cart_location > (screen_width - view_width // 2): + slice_range = slice(-view_width, None) + else: + slice_range = slice(cart_location - view_width // 2, + cart_location + view_width // 2) + # Strip off the edges, so that we have a square image centered on a cart + screen = screen[:, :, slice_range] + # Convert to float, rescale, convert to torch tensor + # (this doesn't require a copy) + screen = np.ascontiguousarray(screen, dtype=np.float32) / 255 + screen = torch.from_numpy(screen) + # Resize, and add a batch dimension (BCHW) + return resize(screen).unsqueeze(0) + + ###################################################################### # Training # -------- @@ -362,8 +414,8 @@ def optimize_model(): ###################################################################### # # Below, you can find the main training loop. At the beginning we reset -# the environment and obtain the initial ``state`` Tensor. Then, we sample -# an action, execute it, observe the next state and the reward (always +# the environment and initialize the ``state`` Tensor. Then, we sample +# an action, execute it, observe the next screen and the reward (always # 1), and optimize our model once. When the episode ends (our model # fails), we restart the loop. 
# From 80d133a4ccd28ae1e18d2021b5a5c87b0bc80973 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Sat, 1 Oct 2022 11:43:23 -0400 Subject: [PATCH 16/18] minor docstring changes --- intermediate_source/reinforcement_q_learning.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index 9450ae50b2d..af03572dc95 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -284,7 +284,7 @@ def select_action(state): def plot_durations(): - plt.figure(2) + plt.figure(1) plt.clf() durations_t = torch.tensor(episode_durations, dtype=torch.float) plt.title('Training...') @@ -375,7 +375,7 @@ def optimize_model(): # 1), and optimize our model once. When the episode ends (our model # fails), we restart the loop. # -# Below, `num_episodes` to 1000, but you should the model constantly +# Below, `num_episodes` to 1000, but you should see the model constantly # achieve 500 steps within 600 training episodes. # @@ -430,6 +430,6 @@ def optimize_model(): # step sample from the gym environment. We record the results in the # replay memory and also run optimization step on every iteration. # Optimization picks a random batch from the replay memory to do training of the -# new policy. "Older" target_net is also used in optimization to compute the -# expected Q values; it is updated occasionally to keep it current. +# new policy. The "older" target_net is also used in optimization to compute the +# expected Q values. A soft update of its weights are performed at every step. # From b3d65d03b654a63f434374630e46361782e29fc5 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Tue, 1 Nov 2022 21:48:26 -0400 Subject: [PATCH 17/18] gym version handling authored by https://github.com/pseudo-rnd-thoughts; hardware dependent # episodes w/ more writing --- .../reinforcement_q_learning.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index af03572dc95..38676a84479 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -65,7 +65,12 @@ import torch.optim as optim import torch.nn.functional as F -env = gym.make('CartPole-v1') +if gym.__version__[:4] == '0.26': + env = gym.make('CartPole-v1') +elif gym.__version__[:4] == '0.25': + env = gym.make('CartPole-v1', new_step_api=True) +else: + raise ImportError(f"Requires gym v25 or v26, actual version: {gym.__version__}") # set up matplotlib is_ipython = 'inline' in matplotlib.get_backend() @@ -375,11 +380,19 @@ def optimize_model(): # 1), and optimize our model once. When the episode ends (our model # fails), we restart the loop. # -# Below, `num_episodes` to 1000, but you should see the model constantly -# achieve 500 steps within 600 training episodes. +# Below, `num_episodes` is set to 600 if a GPU is available, otherwise 50 +# episodes are scheduled so training does not take too long. However, 50 +# episodes is insufficient for to observe good performance on cartpole. +# You should see the model constantly achieve 500 steps within 600 training +# episodes. Training RL agents can be a noisy process, so restarting training +# can produce better results if convergence is not observed. 
# -num_episodes = 1000 +if torch.cuda.is_available(): + num_episodes = 600 +else: + num_episodes = 50 + for i_episode in range(num_episodes): # Initialize the environment and get it's state state, _ = env.reset() From 1015af66648e39c08a9fce2558ea9e0150a1d3a8 Mon Sep 17 00:00:00 2001 From: SiftingSands Date: Fri, 4 Nov 2022 20:45:27 -0400 Subject: [PATCH 18/18] more version handling for v.25 and v.26 --- intermediate_source/reinforcement_q_learning.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py index b33d7abd1b0..4095f90206c 100644 --- a/intermediate_source/reinforcement_q_learning.py +++ b/intermediate_source/reinforcement_q_learning.py @@ -261,7 +261,10 @@ def forward(self, x): # Get number of actions from gym action space n_actions = env.action_space.n # Get the number of state observations -state, _ = env.reset() +if gym.__version__[:4] == '0.26': + state, _ = env.reset() +elif gym.__version__[:4] == '0.25': + state, _ = env.reset(return_info=True) n_observations = len(state) policy_net = DQN(n_observations, n_actions).to(device) @@ -401,7 +404,10 @@ def optimize_model(): for i_episode in range(num_episodes): # Initialize the environment and get it's state - state, _ = env.reset() + if gym.__version__[:4] == '0.26': + state, _ = env.reset() + elif gym.__version__[:4] == '0.25': + state, _ = env.reset(return_info=True) state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0) for t in count(): action = select_action(state)
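With PATCH 17 and PATCH 18 in place, the version-dependent `reset` call appears both at setup time and at the start of every episode. One way to avoid the duplication would be a small helper; this is only a sketch (the `reset_env` name is made up here), covering the same v0.25/v0.26 cases as the checks above:

    def reset_env(env):
        # gym v0.26 returns (observation, info) by default; v0.25 needs
        # return_info=True to produce the same tuple.
        if gym.__version__[:4] == '0.26':
            return env.reset()
        elif gym.__version__[:4] == '0.25':
            return env.reset(return_info=True)
        raise ImportError(f"Requires gym v25 or v26, actual version: {gym.__version__}")

    # Usage at both call sites:
    state, _ = reset_env(env)
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)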