From 869fbbe8340633a7597ddbdcd02bc27b6dd863be Mon Sep 17 00:00:00 2001
From: Nikita Shulga
Date: Fri, 7 Oct 2022 17:46:23 -0700
Subject: [PATCH] Update RL Mario Tutorial

By switching to the new API if gym-v0.25 is used, and thus making it more
compatible with gym-v0.26.

Also, slightly optimize how tensors are created on CUDA vs. CPU devices.
---
 intermediate_source/mario_rl_tutorial.py | 65 +++++++++++-------------
 1 file changed, 31 insertions(+), 34 deletions(-)

diff --git a/intermediate_source/mario_rl_tutorial.py b/intermediate_source/mario_rl_tutorial.py
index a2d856c4129..4445704cd1b 100755
--- a/intermediate_source/mario_rl_tutorial.py
+++ b/intermediate_source/mario_rl_tutorial.py
@@ -31,8 +31,10 @@
 ######################################################################
 #
 #
-
-# !pip install gym-super-mario-bros==7.3.0
+# .. code-block:: bash
+#
+#     %%bash
+#     pip install gym-super-mario-bros==7.4.0
 
 import torch
 from torch import nn
@@ -95,8 +97,11 @@
 # (next) state, reward and other info.
 #
 
-# Initialize Super Mario environment
-env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")
+# Initialize Super Mario environment (in v0.26 change render mode to 'human' to see results on the screen)
+if gym.__version__ < '0.26':
+    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0", new_step_api=True)
+else:
+    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0", render_mode='rgb', apply_api_compatibility=True)
 
 # Limit the action-space to
 #   0. walk right
@@ -104,7 +109,7 @@
 env = JoypadSpace(env, [["right"], ["right", "A"]])
 
 env.reset()
-next_state, reward, done, info = env.step(action=0)
+next_state, reward, done, trunc, info = env.step(action=0)
 print(f"{next_state.shape},\n {reward},\n {done},\n {info}")
 
 
@@ -151,14 +156,13 @@ def __init__(self, env, skip):
     def step(self, action):
         """Repeat action, and sum reward"""
         total_reward = 0.0
-        done = False
         for i in range(self._skip):
             # Accumulate reward and repeat the same action
-            obs, reward, done, info = self.env.step(action)
+            obs, reward, done, trunk, info = self.env.step(action)
             total_reward += reward
             if done:
                 break
-        return obs, total_reward, done, info
+        return obs, total_reward, done, trunk, info
 
 
 class GrayScaleObservation(gym.ObservationWrapper):
@@ -203,7 +207,10 @@ def observation(self, observation):
 env = SkipFrame(env, skip=4)
 env = GrayScaleObservation(env)
 env = ResizeObservation(env, shape=84)
-env = FrameStack(env, num_stack=4)
+if gym.__version__ < '0.26':
+    env = FrameStack(env, num_stack=4, new_step_api=True)
+else:
+    env = FrameStack(env, num_stack=4)
 
 
 ######################################################################
@@ -283,12 +290,11 @@ def __init__(self, state_dim, action_dim, save_dir):
         self.action_dim = action_dim
         self.save_dir = save_dir
 
-        self.use_cuda = torch.cuda.is_available()
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
         # Mario's DNN to predict the most optimal action - we implement this in the Learn section
         self.net = MarioNet(self.state_dim, self.action_dim).float()
-        if self.use_cuda:
-            self.net = self.net.to(device="cuda")
+        self.net = self.net.to(device=self.device)
 
         self.exploration_rate = 1
         self.exploration_rate_decay = 0.99999975
@@ -312,12 +318,8 @@ def act(self, state):
 
         # EXPLOIT
         else:
-            state = state.__array__()
-            if self.use_cuda:
-                state = torch.tensor(state).cuda()
-            else:
-                state = torch.tensor(state)
-            state = state.unsqueeze(0)
+            state = state[0].__array__() if isinstance(state, tuple) else state.__array__()
+            state = torch.tensor(state, device=self.device).unsqueeze(0)
             action_values = self.net(state, model="online")
             action_idx = torch.argmax(action_values, axis=1).item()
 
@@ -363,21 +365,16 @@ def cache(self, state, next_state, action, reward, done):
         reward (float),
         done(bool))
         """
-        state = state.__array__()
-        next_state = next_state.__array__()
-
-        if self.use_cuda:
-            state = torch.tensor(state).cuda()
-            next_state = torch.tensor(next_state).cuda()
-            action = torch.tensor([action]).cuda()
-            reward = torch.tensor([reward]).cuda()
-            done = torch.tensor([done]).cuda()
-        else:
-            state = torch.tensor(state)
-            next_state = torch.tensor(next_state)
-            action = torch.tensor([action])
-            reward = torch.tensor([reward])
-            done = torch.tensor([done])
+        def first_if_tuple(x):
+            return x[0] if isinstance(x, tuple) else x
+        state = first_if_tuple(state).__array__()
+        next_state = first_if_tuple(next_state).__array__()
+
+        state = torch.tensor(state, device=self.device)
+        next_state = torch.tensor(next_state, device=self.device)
+        action = torch.tensor([action], device=self.device)
+        reward = torch.tensor([reward], device=self.device)
+        done = torch.tensor([done], device=self.device)
 
         self.memory.append((state, next_state, action, reward, done,))
 
@@ -753,7 +750,7 @@ def record(self, episode, epsilon, step):
         action = mario.act(state)
 
         # Agent performs action
-        next_state, reward, done, info = env.step(action)
+        next_state, reward, done, trunc, info = env.step(action)
 
         # Remember
         mario.cache(state, next_state, action, reward, done)
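
For reference, below is a minimal, self-contained sketch of the two patterns this patch applies throughout
the tutorial: tolerating both the old 4-tuple and the new 5-tuple return of env.step() (and the
tuple-returning reset() of gym v0.26), and creating tensors directly on the target device instead of moving
them with .cuda() afterwards. It is illustration only and not part of the patched file; "CartPole-v1" is a
stand-in environment, and it assumes gym and PyTorch are installed.

    import gym
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    env = gym.make("CartPole-v1")  # stand-in environment for illustration

    # gym v0.26 returns (obs, info) from reset(); older versions return obs alone
    out = env.reset()
    state = out[0] if isinstance(out, tuple) else out

    result = env.step(env.action_space.sample())
    if len(result) == 5:
        # New step API: obs, reward, terminated, truncated, info
        next_state, reward, terminated, truncated, info = result
        done = terminated or truncated
    else:
        # Old step API: obs, reward, done, info
        next_state, reward, done, info = result

    # Build tensors on the target device up front rather than calling .cuda() after the fact
    state = torch.tensor(state, device=device).unsqueeze(0)
    reward = torch.tensor([reward], device=device)
    done = torch.tensor([done], device=device)

The same version checks and the first_if_tuple helper in the patch implement this pattern inside the
tutorial's Mario agent and environment setup.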