@@ -31,8 +31,10 @@
 ######################################################################
 #
 #
-
-# !pip install gym-super-mario-bros==7.3.0
+# .. code-block:: bash
+#
+#    %%bash
+#    pip install gym-super-mario-bros==7.4.0
 
 import torch
 from torch import nn
@@ -95,16 +97,19 @@
 # (next) state, reward and other info.
 #
 
-# Initialize Super Mario environment
-env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")
+# Initialize Super Mario environment (in v0.26 change render mode to 'human' to see results on the screen)
+if gym.__version__ < '0.26':
+    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0", new_step_api=True)
+else:
+    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0", render_mode='rgb', apply_api_compatibility=True)
 
 # Limit the action-space to
 #   0. walk right
 #   1. jump right
 env = JoypadSpace(env, [["right"], ["right", "A"]])
 
 env.reset()
-next_state, reward, done, info = env.step(action=0)
+next_state, reward, done, trunc, info = env.step(action=0)
 
 print(f"{next_state.shape},\n {reward},\n {done},\n {info}")
 
 
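A note on the branch above: gym >= 0.26 returns five values from step(), the fourth being a truncation flag, and older gym versions can opt into the same signature with new_step_api=True, which is why both make() calls lead to the same unpacking. A minimal sketch of handling it (variable names below are illustrative, not from the tutorial):

    # Sketch only: the new-style step() returns (obs, reward, terminated, truncated, info);
    # an episode is over when either flag is set.
    obs, reward, terminated, truncated, info = env.step(action=0)
    episode_over = terminated or truncated
    print(episode_over)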
@@ -151,14 +156,13 @@ def __init__(self, env, skip):
     def step(self, action):
         """Repeat action, and sum reward"""
         total_reward = 0.0
-        done = False
         for i in range(self._skip):
             # Accumulate reward and repeat the same action
-            obs, reward, done, info = self.env.step(action)
+            obs, reward, done, trunk, info = self.env.step(action)
             total_reward += reward
             if done:
                 break
-        return obs, total_reward, done, info
+        return obs, total_reward, done, trunk, info
 
 
 class GrayScaleObservation(gym.ObservationWrapper):
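A brief usage sketch for the wrapper above (the wrapped-environment name is illustrative, not from the tutorial): one call to step() now repeats the chosen action over skip consecutive frames and reports the summed reward.

    # Sketch only: wrap the environment and take one "skipped" step.
    skipped_env = SkipFrame(env, skip=4)
    skipped_env.reset()
    obs, total_reward, done, trunk, info = skipped_env.step(action=0)
    print(total_reward)  # reward accumulated over the 4 underlying frames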
@@ -203,7 +207,10 @@ def observation(self, observation):
 env = SkipFrame(env, skip=4)
 env = GrayScaleObservation(env)
 env = ResizeObservation(env, shape=84)
-env = FrameStack(env, num_stack=4)
+if gym.__version__ < '0.26':
+    env = FrameStack(env, num_stack=4, new_step_api=True)
+else:
+    env = FrameStack(env, num_stack=4)
 
 
 ######################################################################
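As a quick sanity check of the composed wrappers (a sketch, not part of the diff): each observation is now four stacked 84x84 grayscale frames, and in gym >= 0.26 reset() returns an (observation, info) tuple, hence the tuple check below.

    # Sketch: inspect the shape of a freshly reset, fully wrapped observation.
    state = env.reset()
    obs = state[0] if isinstance(state, tuple) else state  # gym >= 0.26 reset() -> (obs, info)
    print(obs.__array__().shape)  # expected: (4, 84, 84)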
@@ -283,12 +290,11 @@ def __init__(self, state_dim, action_dim, save_dir):
         self.action_dim = action_dim
         self.save_dir = save_dir
 
-        self.use_cuda = torch.cuda.is_available()
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
         # Mario's DNN to predict the most optimal action - we implement this in the Learn section
         self.net = MarioNet(self.state_dim, self.action_dim).float()
-        if self.use_cuda:
-            self.net = self.net.to(device=self.device)
+        self.net = self.net.to(device=self.device)
 
         self.exploration_rate = 1
         self.exploration_rate_decay = 0.99999975
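The single device string above replaces the scattered .cuda() calls. A short sketch of the pattern (the dummy batch is illustrative, and MarioNet is the network defined later in the tutorial's Learn section):

    # Sketch: choose the device once, then move the network and every tensor to it.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    net = MarioNet((4, 84, 84), env.action_space.n).float().to(device=device)
    dummy_batch = torch.rand(1, 4, 84, 84, device=device)  # one stacked-frame observation
    print(net(dummy_batch, model="online").shape)  # (1, number of actions)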
@@ -312,12 +318,8 @@ def act(self, state):
 
         # EXPLOIT
         else:
-            state = state.__array__()
-            if self.use_cuda:
-                state = torch.tensor(state).cuda()
-            else:
-                state = torch.tensor(state)
-            state = state.unsqueeze(0)
+            state = state[0].__array__() if isinstance(state, tuple) else state.__array__()
+            state = torch.tensor(state, device=self.device).unsqueeze(0)
             action_values = self.net(state, model="online")
             action_idx = torch.argmax(action_values, axis=1).item()
 
@@ -363,21 +365,16 @@ def cache(self, state, next_state, action, reward, done):
         reward (float),
         done(bool))
         """
-        state = state.__array__()
-        next_state = next_state.__array__()
-
-        if self.use_cuda:
-            state = torch.tensor(state).cuda()
-            next_state = torch.tensor(next_state).cuda()
-            action = torch.tensor([action]).cuda()
-            reward = torch.tensor([reward]).cuda()
-            done = torch.tensor([done]).cuda()
-        else:
-            state = torch.tensor(state)
-            next_state = torch.tensor(next_state)
-            action = torch.tensor([action])
-            reward = torch.tensor([reward])
-            done = torch.tensor([done])
+        def first_if_tuple(x):
+            return x[0] if isinstance(x, tuple) else x
+        state = first_if_tuple(state).__array__()
+        next_state = first_if_tuple(next_state).__array__()
+
+        state = torch.tensor(state, device=self.device)
+        next_state = torch.tensor(next_state, device=self.device)
+        action = torch.tensor([action], device=self.device)
+        reward = torch.tensor([reward], device=self.device)
+        done = torch.tensor([done], device=self.device)
 
         self.memory.append((state, next_state, action, reward, done,))
 
@@ -753,7 +750,7 @@ def record(self, episode, epsilon, step):
         action = mario.act(state)
 
         # Agent performs action
-        next_state, reward, done, trunc, info = env.step(action)
+        next_state, reward, done, trunc, info = env.step(action)
 
         # Remember
         mario.cache(state, next_state, action, reward, done)
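To place this one-line change in context, here is a condensed sketch of a single episode after the update. mario.act, mario.cache, and mario.learn are the tutorial's agent methods; breaking on trunc and on info.get("flag_get") is an assumption about the surrounding loop, which this hunk does not show.

    # Sketch of one episode with the five-value step API (not the tutorial's full loop).
    state = env.reset()
    while True:
        action = mario.act(state)                                  # epsilon-greedy action selection
        next_state, reward, done, trunc, info = env.step(action)   # new-style step
        mario.cache(state, next_state, action, reward, done)       # store the transition in replay memory
        q, loss = mario.learn()                                    # sample a recall batch and update the online network
        state = next_state
        if done or trunc or info.get("flag_get", False):           # stop on termination, truncation, or reaching the flag
            break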