Update RL Mario Tutorial #2075

Merged: 1 commit, Oct 8, 2022
65 changes: 31 additions & 34 deletions intermediate_source/mario_rl_tutorial.py
@@ -31,8 +31,10 @@
######################################################################
#
#

# !pip install gym-super-mario-bros==7.3.0
# .. code-block:: bash
#
#     %%bash
#     pip install gym-super-mario-bros==7.4.0

import torch
from torch import nn
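The version split in the hunks below depends on which gym release is installed. A quick way to check both packages before running the tutorial (a convenience snippet, not part of this patch):

import importlib.metadata as metadata

print(metadata.version("gym"))                   # e.g. 0.25.x or 0.26.x
print(metadata.version("gym-super-mario-bros"))  # expected 7.4.0 after the install above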
@@ -95,16 +97,19 @@
# (next) state, reward and other info.
#

# Initialize Super Mario environment
env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")
# Initialize Super Mario environment (in v0.26 change render mode to 'human' to see results on the screen)
if gym.__version__ < '0.26':
    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0", new_step_api=True)
else:
    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0", render_mode='rgb', apply_api_compatibility=True)

# Limit the action-space to
# 0. walk right
# 1. jump right
env = JoypadSpace(env, [["right"], ["right", "A"]])

env.reset()
next_state, reward, done, info = env.step(action=0)
next_state, reward, done, trunc, info = env.step(action=0)
print(f"{next_state.shape},\n {reward},\n {done},\n {info}")


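The extra trunc value comes from the Gym 0.26 step API, which splits the old done flag into terminated and truncated; both branches above opt into the five-tuple return (via new_step_api=True or apply_api_compatibility=True). A small helper that tolerates either API, shown here only as an illustrative sketch and not taken from the patch:

def step_compat(env, action):
    """Unpack env.step() for both the 4-tuple (pre-0.26) and 5-tuple (0.26+) Gym APIs."""
    result = env.step(action)
    if len(result) == 5:
        obs, reward, terminated, truncated, info = result
        return obs, reward, terminated or truncated, truncated, info
    obs, reward, done, info = result
    return obs, reward, done, False, info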
@@ -151,14 +156,13 @@ def __init__(self, env, skip):
    def step(self, action):
        """Repeat action, and sum reward"""
        total_reward = 0.0
        done = False
        for i in range(self._skip):
            # Accumulate reward and repeat the same action
            obs, reward, done, info = self.env.step(action)
            obs, reward, done, trunk, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info
        return obs, total_reward, done, trunk, info


class GrayScaleObservation(gym.ObservationWrapper):
@@ -203,7 +207,10 @@ def observation(self, observation):
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env)
env = ResizeObservation(env, shape=84)
env = FrameStack(env, num_stack=4)
if gym.__version__ < '0.26':
    env = FrameStack(env, num_stack=4, new_step_api=True)
else:
    env = FrameStack(env, num_stack=4)


######################################################################
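After the four wrappers are applied, each observation should be a stack of four 84x84 grayscale frames. A rough sanity check, assuming the wrapper names above and that reset() returns an (obs, info) tuple on Gym 0.26+:

import numpy as np

obs = env.reset()
obs = obs[0] if isinstance(obs, tuple) else obs   # unwrap the (obs, info) tuple on Gym >= 0.26
print(np.array(obs).shape)                        # expected: (4, 84, 84)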
@@ -283,12 +290,11 @@ def __init__(self, state_dim, action_dim, save_dir):
        self.action_dim = action_dim
        self.save_dir = save_dir

        self.use_cuda = torch.cuda.is_available()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Mario's DNN to predict the most optimal action - we implement this in the Learn section
        self.net = MarioNet(self.state_dim, self.action_dim).float()
        if self.use_cuda:
            self.net = self.net.to(device="cuda")
        self.net = self.net.to(device=self.device)

        self.exploration_rate = 1
        self.exploration_rate_decay = 0.99999975
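With exploration_rate_decay = 0.99999975, epsilon shrinks very slowly: if the exploration floor is 0.1 (the floor itself is not shown in this hunk), roughly nine million act() calls are needed to reach it. A quick back-of-the-envelope check:

import math

decay = 0.99999975
steps_to_floor = math.log(0.1) / math.log(decay)
print(int(steps_to_floor))  # ~9.2 million steps before epsilon decays from 1 to 0.1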
@@ -312,12 +318,8 @@ def act(self, state):

        # EXPLOIT
        else:
            state = state.__array__()
            if self.use_cuda:
                state = torch.tensor(state).cuda()
            else:
                state = torch.tensor(state)
            state = state.unsqueeze(0)
            state = state[0].__array__() if isinstance(state, tuple) else state.__array__()
            state = torch.tensor(state, device=self.device).unsqueeze(0)
            action_values = self.net(state, model="online")
            action_idx = torch.argmax(action_values, axis=1).item()

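The new EXPLOIT branch has to cope with two observation formats: a plain LazyFrames stack from step(), or an (obs, info) tuple from a Gym 0.26 reset(). A standalone illustration of the same conversion, using a stand-in array with shapes assumed from the wrappers above:

import numpy as np
import torch

fake_obs = np.zeros((4, 84, 84), dtype=np.float32)       # stand-in for the FrameStack output
maybe_tuple = (fake_obs, {})                              # what reset() returns on Gym >= 0.26

state = maybe_tuple[0] if isinstance(maybe_tuple, tuple) else maybe_tuple
batched = torch.tensor(state).unsqueeze(0)                # shape: (1, 4, 84, 84), ready for the network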
@@ -363,21 +365,16 @@ def cache(self, state, next_state, action, reward, done):
        reward (float),
        done(bool))
        """
        state = state.__array__()
        next_state = next_state.__array__()

        if self.use_cuda:
            state = torch.tensor(state).cuda()
            next_state = torch.tensor(next_state).cuda()
            action = torch.tensor([action]).cuda()
            reward = torch.tensor([reward]).cuda()
            done = torch.tensor([done]).cuda()
        else:
            state = torch.tensor(state)
            next_state = torch.tensor(next_state)
            action = torch.tensor([action])
            reward = torch.tensor([reward])
            done = torch.tensor([done])
        def first_if_tuple(x):
            return x[0] if isinstance(x, tuple) else x
        state = first_if_tuple(state).__array__()
        next_state = first_if_tuple(next_state).__array__()

        state = torch.tensor(state, device=self.device)
        next_state = torch.tensor(next_state, device=self.device)
        action = torch.tensor([action], device=self.device)
        reward = torch.tensor([reward], device=self.device)
        done = torch.tensor([done], device=self.device)

        self.memory.append((state, next_state, action, reward, done,))

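cache() now stores device-resident tensors in the replay memory. For context, a minimal sketch of how a matching batch could be sampled for learning, not necessarily the tutorial's exact recall():

import random
import torch

def sample_batch(memory, batch_size=32):
    """Draw a random batch of transitions from the replay memory filled by cache()."""
    batch = random.sample(memory, batch_size)
    state, next_state, action, reward, done = map(torch.stack, zip(*batch))
    return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()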
@@ -753,7 +750,7 @@ def record(self, episode, epsilon, step):
        action = mario.act(state)

        # Agent performs action
        next_state, reward, done, info = env.step(action)
        next_state, reward, done, trunc, info = env.step(action)

        # Remember
        mario.cache(state, next_state, action, reward, done)
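The diff cuts off here, so the rest of the episode loop is not shown. For orientation only, a condensed, hypothetical version of one episode; the learn() method and the flag_get info key are assumed from the broader tutorial, not from this patch:

state = env.reset()
while True:
    action = mario.act(state)                                 # epsilon-greedy action
    next_state, reward, done, trunc, info = env.step(action)  # five-tuple step API
    mario.cache(state, next_state, action, reward, done)      # store the transition
    mario.learn()                                             # assumed: update from replayed experience
    state = next_state
    if done or trunc or info.get("flag_get", False):
        break                                                 # death, truncation, or flag reached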