diff --git a/mindspore_rl/algorithm/maddpg/maddpg.py b/mindspore_rl/algorithm/maddpg/maddpg.py
index 2ad91392..ba7e0377 100644
--- a/mindspore_rl/algorithm/maddpg/maddpg.py
+++ b/mindspore_rl/algorithm/maddpg/maddpg.py
@@ -240,8 +240,8 @@ def __init__(self, params):
         self.actor_net = params.get('actor_net')
 
         # optimizer network
-        critic_optimizer = nn.Adam(self.critic_net.trainable_params(), learning_rate=params.get('learning_rate'))
-        actor_optimizer = nn.Adam(self.actor_net.trainable_params(), learning_rate=params.get('learning_rate'))
+        critic_optimizer = nn.Adam(self.critic_net.trainable_params(), learning_rate=params.get('learning_rate'), eps=1e-5)
+        actor_optimizer = nn.Adam(self.actor_net.trainable_params(), learning_rate=params.get('learning_rate'), eps=1e-5)
 
         # loss network
         self.target_actor_net = params.get('target_actor_net')
diff --git a/mindspore_rl/algorithm/maddpg/maddpg_trainer.py b/mindspore_rl/algorithm/maddpg/maddpg_trainer.py
index 9948c80e..31da8908 100644
--- a/mindspore_rl/algorithm/maddpg/maddpg_trainer.py
+++ b/mindspore_rl/algorithm/maddpg/maddpg_trainer.py
@@ -96,7 +96,7 @@ def train(self, episodes, callbacks=None, ckpt_path=None):
                     and represent for `loss, rewards, steps, [optional]others.` in order"
                 )
             episode_rewards.append(float(rewards.asnumpy()))
-            if i % 1000 == 0:
+            if (i % 1000 == 0) and (i != 0):
                 print("-----------------------------------------")
                 # pylint: disable=C0209
                 print(
diff --git a/mindspore_rl/algorithm/ppo/ppo.py b/mindspore_rl/algorithm/ppo/ppo.py
index 2495fa91..6246c0bc 100644
--- a/mindspore_rl/algorithm/ppo/ppo.py
+++ b/mindspore_rl/algorithm/ppo/ppo.py
@@ -273,7 +273,7 @@ def __init__(self, params):
         trainable_parameter = (
             self.critic_net.trainable_params() + self.actor_net.trainable_params()
         )
-        optimizer_ppo = nn.Adam(trainable_parameter, learning_rate=params["lr"])
+        optimizer_ppo = nn.Adam(trainable_parameter, learning_rate=params["lr"], eps=1e-5)
         ppo_loss_net = self.PPOLossCell(
             self.actor_net,
             self.critic_net,
diff --git a/mindspore_rl/algorithm/qmix/qmix.py b/mindspore_rl/algorithm/qmix/qmix.py
index 772990f4..7759be02 100644
--- a/mindspore_rl/algorithm/qmix/qmix.py
+++ b/mindspore_rl/algorithm/qmix/qmix.py
@@ -813,7 +813,7 @@ def __init__(self, params):
         trainable_params = (
             self.policy_net.trainable_params() + self.mixer_net.trainable_params()
         )
-        optimizer = nn.Adam(trainable_params, learning_rate=params["lr"])
+        optimizer = nn.Adam(trainable_params, learning_rate=params["lr"], eps=1e-5)
 
         qmix_loss_cell = self.QMIXLossCell(
             params, self.policy_net, self.mixer_net, self.target_mixer_net
diff --git a/mindspore_rl/environment/petting_zoo_mpe_environment.py b/mindspore_rl/environment/petting_zoo_mpe_environment.py
index 6c1c1695..09711f4c 100644
--- a/mindspore_rl/environment/petting_zoo_mpe_environment.py
+++ b/mindspore_rl/environment/petting_zoo_mpe_environment.py
@@ -87,7 +87,7 @@ def __init__(self, params, env_id=0):
         observation_space = gym2ms_adapter(list(self._env.observation_spaces.values()))
         env_action_space = self._env.action_spaces["agent_0"]
         action_space = Space(
-            (env_action_space.n,), env_action_space.dtype.type, batch_shape=(self._num,)
+            (env_action_space.n,), np.float32, batch_shape=(self._num,)
         )
         reward_space = Space((1,), np.float32, batch_shape=(self._num,))
         done_space = Space((1,), np.bool_, low=0, high=2, batch_shape=(self._num,))