From 7b742c33aafa73b0673d5e8d1402cd1958a36a16 Mon Sep 17 00:00:00 2001
From: wilfChen
Date: Tue, 7 Nov 2023 21:30:59 +0800
Subject: [PATCH] adapte to 910B

---
 example/sac/train.py                        |  2 +-
 mindspore_rl/algorithm/gail/gail_session.py |  2 +-
 mindspore_rl/algorithm/sac/sac.py           |  6 +++---
 mindspore_rl/algorithm/sac/sac_trainer.py   |  5 ++---
 mindspore_rl/algorithm/td3/config.py        |  2 +-
 mindspore_rl/algorithm/td3/td3.py           | 23 +++++++++++++++------
 6 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/example/sac/train.py b/example/sac/train.py
index b57c4a19..4e8c8de6 100644
--- a/example/sac/train.py
+++ b/example/sac/train.py
@@ -49,7 +49,7 @@ def train(episode=options.episode):
     if compute_type == mstype.float16 and options.device_target != 'Ascend':
         raise ValueError("Fp16 mode is supported by Ascend backend.")
 
-    context.set_context(mode=context.GRAPH_MODE)
+    context.set_context(mode=context.GRAPH_MODE, ascend_config={'precision_mode': 'allow_mix_precision'})
     sac_session = SACSession(options.env_yaml, options.algo_yaml)
     sac_session.run(class_type=SACTrainer, episode=episode)
 
diff --git a/mindspore_rl/algorithm/gail/gail_session.py b/mindspore_rl/algorithm/gail/gail_session.py
index 32eae6ba..cd38c574 100644
--- a/mindspore_rl/algorithm/gail/gail_session.py
+++ b/mindspore_rl/algorithm/gail/gail_session.py
@@ -50,7 +50,7 @@ def __init__(self,
                                                    shapes=[obs_space.shape, action_space.shape],
                                                    dtypes=[obs_space.ms_dtype, action_space.ms_dtype])
 
-        policy_replay_buffer = UniformReplayBuffer(batch_size=policy_batch_size,
+        policy_replay_buffer = UniformReplayBuffer(sample_size=policy_batch_size,
                                                    capacity=policy_buffer_size,
                                                    shapes=[obs_space.shape, action_space.shape, obs_space.shape, (1,)],
                                                    types=[obs_space.ms_dtype, action_space.ms_dtype,
diff --git a/mindspore_rl/algorithm/sac/sac.py b/mindspore_rl/algorithm/sac/sac.py
index 393f30c5..f8e89261 100644
--- a/mindspore_rl/algorithm/sac/sac.py
+++ b/mindspore_rl/algorithm/sac/sac.py
@@ -432,10 +432,10 @@ def __init__(self, params):
             critic_net1.trainable_params() + critic_net2.trainable_params()
         )
         critic_optim = nn.Adam(
-            critic_trainable_params, learning_rate=params["critic_lr"]
+            critic_trainable_params, learning_rate=params["critic_lr"], eps=1e-5
         )
         actor_optim = nn.Adam(
-            actor_net.trainable_params(), learning_rate=params["actor_lr"]
+            actor_net.trainable_params(), learning_rate=params["actor_lr"], eps=1e-5
         )
 
         self.critic_train = nn.TrainOneStepCell(critic_loss_net, critic_optim)
@@ -449,7 +449,7 @@ def __init__(self, params):
             params["alpha_loss_weight"],
             actor_net,
         )
-        alpha_optim = nn.Adam([log_alpha], learning_rate=params["alpha_lr"])
+        alpha_optim = nn.Adam([log_alpha], learning_rate=params["alpha_lr"], eps=1e-5)
         self.alpha_train = nn.TrainOneStepCell(alpha_loss_net, alpha_optim)
 
         factor, interval = params["update_factor"], params["update_interval"]
diff --git a/mindspore_rl/algorithm/sac/sac_trainer.py b/mindspore_rl/algorithm/sac/sac_trainer.py
index 050edbb5..6085f0e8 100644
--- a/mindspore_rl/algorithm/sac/sac_trainer.py
+++ b/mindspore_rl/algorithm/sac/sac_trainer.py
@@ -30,7 +30,6 @@ def __init__(self, msrl, params=None):
         super(SACTrainer, self).__init__(msrl)
         self.inited = Parameter(Tensor([False], mindspore.bool_), name="init_flag")
         self.zero = Tensor([0], mindspore.float32)
-        self.fill_value = Tensor([10000], mindspore.float32)
         self.false = Tensor([False], mindspore.bool_)
         self.true = Tensor([True], mindspore.bool_)
         self.less = P.Less()
@@ -47,8 +46,8 @@ def init_training(self):
         """Initialize training"""
         state = self.msrl.collect_environment.reset()
         done = self.false
-        i = self.zero
-        while self.less(i, self.fill_value):
+        i = Tensor([0], mindspore.int32)
+        while self.less(i, Tensor([10000], mindspore.int32)):
             new_state, action, reward, done = self.msrl.agent_act(trainer.INIT, state)
             self.msrl.replay_buffer_insert([state, action, reward, new_state, done])
             state = new_state
diff --git a/mindspore_rl/algorithm/td3/config.py b/mindspore_rl/algorithm/td3/config.py
index 1f27cf11..8af58cd2 100644
--- a/mindspore_rl/algorithm/td3/config.py
+++ b/mindspore_rl/algorithm/td3/config.py
@@ -60,7 +60,7 @@
         "type": TD3Actor,
         "params": actor_params,
         "policies": [],
-        "networks": ["actor_net"],
+        "networks": ["actor_net", "init_policy"],
     },
     "learner": {
         "number": 1,
diff --git a/mindspore_rl/algorithm/td3/td3.py b/mindspore_rl/algorithm/td3/td3.py
index 96299871..c7cd80a1 100644
--- a/mindspore_rl/algorithm/td3/td3.py
+++ b/mindspore_rl/algorithm/td3/td3.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ============================================================================
 """TD3"""
+import numpy as np
 import mindspore
 import mindspore.nn.probability.distribution as msd
 from mindspore import Parameter, Tensor, nn
@@ -40,11 +41,10 @@ class GaussianNoise(nn.Cell):
 
     def __init__(self, mean, stddev, clip=None):
         super().__init__()
-        self.abs = P.Abs()
         self.clip = clip
         if self.clip is not None:
-            self.high_clip = self.abs(Tensor(self.clip))
-            self.low_clip = -self.high_clip
+            self.high_clip = Tensor(np.abs(self.clip))
+            self.low_clip = Tensor(-np.abs(self.clip))
         self.normal = msd.Normal(mean, stddev)
 
     def construct(self, actions):
@@ -164,6 +164,15 @@ def construct(self, observation, action):
 
             return q
 
+    class RandomPolicy(nn.Cell):
+        def __init__(self, action_space_dim):
+            super(TD3Policy.RandomPolicy, self).__init__()
+            self.uniform = P.UniformReal()
+            self.shape = (action_space_dim,)
+
+        def construct(self):
+            return self.uniform(self.shape) * 2 - 1
+
     def __init__(self, params):
         self.actor_net = self.TD3ActorNet(
             params["state_space_dim"],
@@ -217,6 +226,7 @@ def __init__(self, params):
             params["compute_type"],
             name="target_critic_net_2.",
         )
+        self.init_policy = self.RandomPolicy(params['action_space_dim'])
 
 
 class TD3Actor(Actor):
@@ -225,6 +235,7 @@ class TD3Actor(Actor):
     def __init__(self, params=None):
         super().__init__()
         self.actor_net = params["actor_net"]
+        self.init_policy = params["init_policy"]
         self.env = params["collect_environment"]
         self.expand_dims = P.ExpandDims()
         self.squeeze = P.Squeeze()
@@ -242,7 +253,7 @@ def act(self, phase, params):
 
     def get_action(self, phase, params):
         if phase == 1:
-            actions = Tensor(self.env.action_space.sample(), mindspore.float32)
+            return self.init_policy()
         else:
             obs = self.expand_dims(params, 0)
             actions = self.actor_net(obs)
@@ -353,10 +364,10 @@ def __init__(self, params):
         # optimizer network
         critic_optimizer = nn.Adam(
             self.critic_net_1.trainable_params() + self.critic_net_2.trainable_params(),
-            learning_rate=params["critic_lr"],
+            learning_rate=params["critic_lr"], eps=1e-5
         )
         actor_optimizer = nn.Adam(
-            self.actor_net.trainable_params(), learning_rate=params["actor_lr"]
+            self.actor_net.trainable_params(), learning_rate=params["actor_lr"], eps=1e-5
         )
 
         # target networks and their initializations