From 79bb1c95dd2bac4fe8a63278bbc5d902d0abeb48 Mon Sep 17 00:00:00 2001
From: Vincent Moens <vmoens@meta.com>
Date: Mon, 13 Jan 2025 16:14:28 +0000
Subject: [PATCH] use DETERMINISTIC sampling in PPO

---
 intermediate_source/reinforcement_ppo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/intermediate_source/reinforcement_ppo.py b/intermediate_source/reinforcement_ppo.py
index 30216ff880c..ec2dc0a488d 100644
--- a/intermediate_source/reinforcement_ppo.py
+++ b/intermediate_source/reinforcement_ppo.py
@@ -639,7 +639,7 @@
         # number of steps (1000, which is our ``env`` horizon).
         # The ``rollout`` method of the ``env`` can take a policy as argument:
         # it will then execute this policy at each step.
-        with set_exploration_type(ExplorationType.MEAN), torch.no_grad():
+        with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad():
             # execute a rollout with the trained policy
             eval_rollout = env.rollout(1000, policy_module)
             logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())