Commit: update configs
wenzhangliu committed Aug 16, 2023
1 parent 9673fa0 commit 83e3715
Showing 33 changed files with 130 additions and 127 deletions.
29 changes: 15 additions & 14 deletions README.md
@@ -322,20 +322,21 @@ $ tensorboard --logdir ./logs/dqn/torch/CartPole-v0

### Atari Environment (Ongoing)

-| Task              | DQN      | C51     | PPO     |
-|-------------------|----------|---------|---------|
-| ALE/AirRaid-v5    | 7316.67  | 5450.00 | 9283.33 |
-| ALE/Alien-v5      | 2676.67  | 2413.33 | 2313.33 |
-| ALE/Amidar-v5     | 627.00   | 293.0   | 964.67  |
-| ALE/Assault-v5    | 9981.67  |         | 6265.67 |
-| ALE/Asterix-v5    | 30516.67 |         | 2900.00 |
-| ALE/Asteroids-v5  |          |         | 3430.00 |
-| ALE/Bowling-v5    | 92.00    | 56.67   | 76.00   |
-| ALE/Breakout-v5   | 415.33   | 431.0   | 371.67  |
-| ALE/Freeway-v5    | 34.00    | 33.0    | 34.0    |
-| ALE/MsPacman-v5   | 4650.00  | 4690.00 | 4120.00 |
-| ALE/Pong-v5       | 21.0     | 20.0    | 21.0    |
-| ALE/Qbert-v5      | 16350.0  | 12875.0 | 20050.0 |
+| Task              | DQN      | C51      | PPO     |
+|-------------------|----------|----------|---------|
+| ALE/AirRaid-v5    | 7316.67  | 5450.00  | 9283.33 |
+| ALE/Alien-v5      | 2676.67  | 2413.33  | 2313.33 |
+| ALE/Amidar-v5     | 627.00   | 293.0    | 964.67  |
+| ALE/Assault-v5    | 9981.67  | 9088.67  | 6265.67 |
+| ALE/Asterix-v5    | 30516.67 | 12866.67 | 2900.00 |
+| ALE/Asteroids-v5  | 1393.33  | 2180.0   | 3430.00 |
+| ALE/Atlantis-v5   |          |          |         |
+| ALE/Bowling-v5    | 92.00    | 56.67    | 76.00   |
+| ALE/Breakout-v5   | 415.33   | 431.0    | 371.67  |
+| ALE/Freeway-v5    | 34.00    | 33.0     | 34.0    |
+| ALE/MsPacman-v5   | 4650.00  | 4690.00  | 4120.00 |
+| ALE/Pong-v5       | 21.0     | 20.0     | 21.0    |
+| ALE/Qbert-v5      | 16350.0  | 12875.0  | 20050.0 |



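As a rough illustration of how a single row of the table above might be reproduced, the sketch below assumes a `get_runner(method, env, env_id, is_test)` entry point in the package; the argument names and the `"atari"` environment grouping are assumptions for illustration, not something defined by this commit.

```python
# Hypothetical reproduction sketch (assumed API): train DQN on one Atari task,
# corresponding to the DQN column of the table above.
import xuanpolicy as xp

runner = xp.get_runner(method="dqn",            # algorithm column to reproduce
                       env="atari",             # environment family (assumed name)
                       env_id="ALE/Breakout-v5",
                       is_test=False)           # False: train; True: evaluate
runner.run()
```
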
7 changes: 0 additions & 7 deletions train_c51.sh

This file was deleted.

7 changes: 0 additions & 7 deletions train_ppo.sh

This file was deleted.

14 changes: 7 additions & 7 deletions xuanpolicy/configs/dcg/sc2/25m.yaml
@@ -17,26 +17,26 @@ recurrent_hidden_size: 64
N_recurrent_layers: 1
dropout: 0

-representation_hidden_size: [32, ]
-q_hidden_size: [128, ]  # the units for each hidden layer
-hidden_utility_dim: 256  # hidden units of the utility function
-hidden_payoff_dim: 256  # hidden units of the payoff function
+representation_hidden_size: [64, ]
+q_hidden_size: [64, ]  # the units for each hidden layer
+hidden_utility_dim: 64  # hidden units of the utility function
+hidden_payoff_dim: 64  # hidden units of the payoff function
bias_net: "Basic_MLP"
-hidden_bias_dim: [256, ]  # hidden units of the bias network with global states as input
+hidden_bias_dim: [64, ]  # hidden units of the bias network with global states as input
activation: "ReLU"

low_rank_payoff: False  # low-rank approximation of payoff function
payoff_rank: 5  # the rank K in the paper
graph_type: "FULL"  # specific type of the coordination graph
-n_msg_iterations: 1  # number of iterations for message passing during belief propagation
+n_msg_iterations: 8  # number of iterations for message passing during belief propagation
msg_normalized: True  # Message normalization during greedy action selection (Kok and Vlassis, 2006)

seed: 1
parallels: 1
buffer_size: 5000
batch_size: 32
learning_rate: 0.0007
-gamma: 0.95  # discount factor
+gamma: 0.99  # discount factor
double_q: True  # use double q learning

start_greedy: 1.0
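
The DCG configs in this commit (this file and the ones below) raise `n_msg_iterations` from 1 to 8. That parameter controls how many rounds of max-plus message passing run over the coordination graph when greedy joint actions are selected, and `msg_normalized: True` corresponds to re-centering each message. The NumPy sketch below is only an illustration of that mechanism on a tiny graph, not the repository's implementation; the function name, data layout, and toy numbers are assumptions made for the example.

```python
import numpy as np

def maxplus_select(utilities, payoffs, edges, n_msg_iterations=8, msg_normalized=True):
    """Greedy joint action via max-plus message passing on a coordination graph.

    utilities: list of per-agent arrays u_i(a_i)
    payoffs:   dict {(i, j): array of shape (|A_i|, |A_j|)} for each edge (i, j), i < j
    edges:     list of (i, j) pairs, i < j
    """
    # one message per directed edge, initialised to zero
    msgs = {}
    for (i, j) in edges:
        msgs[(i, j)] = np.zeros(len(utilities[j]))
        msgs[(j, i)] = np.zeros(len(utilities[i]))

    for _ in range(n_msg_iterations):
        new_msgs = {}
        for (sender, receiver) in msgs:
            # orient the pairwise payoff as (A_sender, A_receiver)
            pay = (payoffs[(sender, receiver)] if (sender, receiver) in payoffs
                   else payoffs[(receiver, sender)].T)
            # sender's local belief: own utility plus messages from all other neighbours
            belief = utilities[sender].copy()
            for (src, dst) in msgs:
                if dst == sender and src != receiver:
                    belief = belief + msgs[(src, dst)]
            msg = np.max(belief[:, None] + pay, axis=0)  # maximise over the sender's action
            if msg_normalized:                           # msg_normalized: True in the config
                msg = msg - msg.mean()
            new_msgs[(sender, receiver)] = msg
        msgs = new_msgs

    # each agent maximises its utility plus all incoming messages
    actions = []
    for i, u in enumerate(utilities):
        score = u.copy()
        for (src, dst) in msgs:
            if dst == i:
                score = score + msgs[(src, dst)]
        actions.append(int(np.argmax(score)))
    return actions

# Toy 3-agent chain with 2 actions per agent: edges connect agents 0-1 and 1-2.
utilities = [np.array([0.0, 1.0]), np.array([0.5, 0.0]), np.array([0.0, 0.2])]
payoffs = {(0, 1): np.array([[2.0, 0.0], [0.0, 1.0]]),
           (1, 2): np.array([[1.0, 0.0], [0.0, 2.0]])}
print(maxplus_select(utilities, payoffs, edges=[(0, 1), (1, 2)]))
```

With a single iteration, information only travels one edge; on the toy chain above, the middle agent's preferences reach the endpoints only after a second round, which is why more iterations can matter on larger graphs.
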
14 changes: 7 additions & 7 deletions xuanpolicy/configs/dcg/sc2/2m_vs_1z.yaml
@@ -17,26 +17,26 @@ recurrent_hidden_size: 64
N_recurrent_layers: 1
dropout: 0

-representation_hidden_size: [32, ]
-q_hidden_size: [128, ]  # the units for each hidden layer
-hidden_utility_dim: 256  # hidden units of the utility function
-hidden_payoff_dim: 256  # hidden units of the payoff function
+representation_hidden_size: [64, ]
+q_hidden_size: [64, ]  # the units for each hidden layer
+hidden_utility_dim: 64  # hidden units of the utility function
+hidden_payoff_dim: 64  # hidden units of the payoff function
bias_net: "Basic_MLP"
-hidden_bias_dim: [256, ]  # hidden units of the bias network with global states as input
+hidden_bias_dim: [64, ]  # hidden units of the bias network with global states as input
activation: "ReLU"

low_rank_payoff: False  # low-rank approximation of payoff function
payoff_rank: 5  # the rank K in the paper
graph_type: "FULL"  # specific type of the coordination graph
-n_msg_iterations: 1  # number of iterations for message passing during belief propagation
+n_msg_iterations: 8  # number of iterations for message passing during belief propagation
msg_normalized: True  # Message normalization during greedy action selection (Kok and Vlassis, 2006)

seed: 1
parallels: 1
buffer_size: 5000
batch_size: 32
learning_rate: 0.0007
-gamma: 0.95  # discount factor
+gamma: 0.99  # discount factor
double_q: True  # use double q learning

start_greedy: 1.0
14 changes: 7 additions & 7 deletions xuanpolicy/configs/dcg/sc2/3m.yaml
@@ -17,26 +17,26 @@ recurrent_hidden_size: 64
N_recurrent_layers: 1
dropout: 0

-representation_hidden_size: [32, ]
-q_hidden_size: [128, ]  # the units for each hidden layer
-hidden_utility_dim: 256  # hidden units of the utility function
-hidden_payoff_dim: 256  # hidden units of the payoff function
+representation_hidden_size: [64, ]
+q_hidden_size: [64, ]  # the units for each hidden layer
+hidden_utility_dim: 64  # hidden units of the utility function
+hidden_payoff_dim: 64  # hidden units of the payoff function
bias_net: "Basic_MLP"
-hidden_bias_dim: [256, ]  # hidden units of the bias network with global states as input
+hidden_bias_dim: [64, ]  # hidden units of the bias network with global states as input
activation: "ReLU"

low_rank_payoff: False  # low-rank approximation of payoff function
payoff_rank: 5  # the rank K in the paper
graph_type: "FULL"  # specific type of the coordination graph
-n_msg_iterations: 1  # number of iterations for message passing during belief propagation
+n_msg_iterations: 8  # number of iterations for message passing during belief propagation
msg_normalized: True  # Message normalization during greedy action selection (Kok and Vlassis, 2006)

seed: 1
parallels: 1
buffer_size: 5000
batch_size: 32
learning_rate: 0.0007
-gamma: 0.95  # discount factor
+gamma: 0.99  # discount factor
double_q: True  # use double q learning

start_greedy: 1.0
14 changes: 7 additions & 7 deletions xuanpolicy/configs/dcg/sc2/5m_vs_6m.yaml
@@ -17,26 +17,26 @@ recurrent_hidden_size: 64
N_recurrent_layers: 1
dropout: 0

-representation_hidden_size: [32, ]
-q_hidden_size: [128, ]  # the units for each hidden layer
-hidden_utility_dim: 256  # hidden units of the utility function
-hidden_payoff_dim: 256  # hidden units of the payoff function
+representation_hidden_size: [64, ]
+q_hidden_size: [64, ]  # the units for each hidden layer
+hidden_utility_dim: 64  # hidden units of the utility function
+hidden_payoff_dim: 64  # hidden units of the payoff function
bias_net: "Basic_MLP"
-hidden_bias_dim: [256, ]  # hidden units of the bias network with global states as input
+hidden_bias_dim: [64, ]  # hidden units of the bias network with global states as input
activation: "ReLU"

low_rank_payoff: False  # low-rank approximation of payoff function
payoff_rank: 5  # the rank K in the paper
graph_type: "FULL"  # specific type of the coordination graph
-n_msg_iterations: 1  # number of iterations for message passing during belief propagation
+n_msg_iterations: 8  # number of iterations for message passing during belief propagation
msg_normalized: True  # Message normalization during greedy action selection (Kok and Vlassis, 2006)

seed: 1
parallels: 1
buffer_size: 5000
batch_size: 32
learning_rate: 0.0007
-gamma: 0.95  # discount factor
+gamma: 0.99  # discount factor
double_q: True  # use double q learning

start_greedy: 1.0
14 changes: 7 additions & 7 deletions xuanpolicy/configs/dcg/sc2/8m.yaml
@@ -17,26 +17,26 @@ recurrent_hidden_size: 64
N_recurrent_layers: 1
dropout: 0

-representation_hidden_size: [32, ]
-q_hidden_size: [128, ]  # the units for each hidden layer
-hidden_utility_dim: 256  # hidden units of the utility function
-hidden_payoff_dim: 256  # hidden units of the payoff function
+representation_hidden_size: [64, ]
+q_hidden_size: [64, ]  # the units for each hidden layer
+hidden_utility_dim: 64  # hidden units of the utility function
+hidden_payoff_dim: 64  # hidden units of the payoff function
bias_net: "Basic_MLP"
-hidden_bias_dim: [256, ]  # hidden units of the bias network with global states as input
+hidden_bias_dim: [64, ]  # hidden units of the bias network with global states as input
activation: "ReLU"

low_rank_payoff: False  # low-rank approximation of payoff function
payoff_rank: 5  # the rank K in the paper
graph_type: "FULL"  # specific type of the coordination graph
-n_msg_iterations: 1  # number of iterations for message passing during belief propagation
+n_msg_iterations: 8  # number of iterations for message passing during belief propagation
msg_normalized: True  # Message normalization during greedy action selection (Kok and Vlassis, 2006)

seed: 1
parallels: 1
buffer_size: 5000
batch_size: 32
learning_rate: 0.0007
-gamma: 0.95  # discount factor
+gamma: 0.99  # discount factor
double_q: True  # use double q learning

start_greedy: 1.0
14 changes: 7 additions & 7 deletions xuanpolicy/configs/dcg/sc2/8m_vs_9m.yaml
@@ -17,26 +17,26 @@ recurrent_hidden_size: 64
N_recurrent_layers: 1
dropout: 0

-representation_hidden_size: [32, ]
-q_hidden_size: [128, ]  # the units for each hidden layer
-hidden_utility_dim: 256  # hidden units of the utility function
-hidden_payoff_dim: 256  # hidden units of the payoff function
+representation_hidden_size: [64, ]
+q_hidden_size: [64, ]  # the units for each hidden layer
+hidden_utility_dim: 64  # hidden units of the utility function
+hidden_payoff_dim: 64  # hidden units of the payoff function
bias_net: "Basic_MLP"
-hidden_bias_dim: [256, ]  # hidden units of the bias network with global states as input
+hidden_bias_dim: [64, ]  # hidden units of the bias network with global states as input
activation: "ReLU"

low_rank_payoff: False  # low-rank approximation of payoff function
payoff_rank: 5  # the rank K in the paper
graph_type: "FULL"  # specific type of the coordination graph
-n_msg_iterations: 1  # number of iterations for message passing during belief propagation
+n_msg_iterations: 8  # number of iterations for message passing during belief propagation
msg_normalized: True  # Message normalization during greedy action selection (Kok and Vlassis, 2006)

seed: 1
parallels: 1
buffer_size: 5000
batch_size: 32
learning_rate: 0.0007
-gamma: 0.95  # discount factor
+gamma: 0.99  # discount factor
double_q: True  # use double q learning

start_greedy: 1.0
14 changes: 7 additions & 7 deletions xuanpolicy/configs/dcg/sc2/MMM2.yaml
@@ -17,26 +17,26 @@ recurrent_hidden_size: 64
N_recurrent_layers: 1
dropout: 0

-representation_hidden_size: [32, ]
-q_hidden_size: [128, ]  # the units for each hidden layer
-hidden_utility_dim: 256  # hidden units of the utility function
-hidden_payoff_dim: 256  # hidden units of the payoff function
+representation_hidden_size: [64, ]
+q_hidden_size: [64, ]  # the units for each hidden layer
+hidden_utility_dim: 64  # hidden units of the utility function
+hidden_payoff_dim: 64  # hidden units of the payoff function
bias_net: "Basic_MLP"
-hidden_bias_dim: [256, ]  # hidden units of the bias network with global states as input
+hidden_bias_dim: [64, ]  # hidden units of the bias network with global states as input
activation: "ReLU"

low_rank_payoff: False  # low-rank approximation of payoff function
payoff_rank: 5  # the rank K in the paper
graph_type: "FULL"  # specific type of the coordination graph
-n_msg_iterations: 1  # number of iterations for message passing during belief propagation
+n_msg_iterations: 8  # number of iterations for message passing during belief propagation
msg_normalized: True  # Message normalization during greedy action selection (Kok and Vlassis, 2006)

seed: 1
parallels: 1
buffer_size: 5000
batch_size: 32
learning_rate: 0.0007
-gamma: 0.95  # discount factor
+gamma: 0.99  # discount factor
double_q: True  # use double q learning

start_greedy: 1.0
14 changes: 7 additions & 7 deletions xuanpolicy/configs/dcg/sc2/corridor.yaml
@@ -17,26 +17,26 @@ recurrent_hidden_size: 64
N_recurrent_layers: 1
dropout: 0

-representation_hidden_size: [32, ]
-q_hidden_size: [128, ]  # the units for each hidden layer
-hidden_utility_dim: 256  # hidden units of the utility function
-hidden_payoff_dim: 256  # hidden units of the payoff function
+representation_hidden_size: [64, ]
+q_hidden_size: [64, ]  # the units for each hidden layer
+hidden_utility_dim: 64  # hidden units of the utility function
+hidden_payoff_dim: 64  # hidden units of the payoff function
bias_net: "Basic_MLP"
-hidden_bias_dim: [256, ]  # hidden units of the bias network with global states as input
+hidden_bias_dim: [64, ]  # hidden units of the bias network with global states as input
activation: "ReLU"

low_rank_payoff: False  # low-rank approximation of payoff function
payoff_rank: 5  # the rank K in the paper
graph_type: "FULL"  # specific type of the coordination graph
-n_msg_iterations: 1  # number of iterations for message passing during belief propagation
+n_msg_iterations: 8  # number of iterations for message passing during belief propagation
msg_normalized: True  # Message normalization during greedy action selection (Kok and Vlassis, 2006)

seed: 1
parallels: 1
buffer_size: 5000
batch_size: 32
learning_rate: 0.0007
-gamma: 0.95  # discount factor
+gamma: 0.99  # discount factor
double_q: True  # use double q learning

start_greedy: 1.0
2 changes: 1 addition & 1 deletion xuanpolicy/configs/mappo/sc2/25m.yaml
@@ -27,7 +27,7 @@ activation: "ReLU"
seed: 1
parallels: 1
n_size: 128
-n_epoch: 15
+n_epoch: 10
n_minibatch: 1
learning_rate: 0.0007  # 7e-4
weight_decay: 0
6 changes: 3 additions & 3 deletions xuanpolicy/configs/mappo/sc2/5m_vs_6m.yaml
@@ -27,15 +27,15 @@ activation: "ReLU"
seed: 1
parallels: 1
n_size: 128
-n_epoch: 15
+n_epoch: 10
n_minibatch: 1
learning_rate: 0.0007  # 7e-4
weight_decay: 0

vf_coef: 1.0
ent_coef: 0.01
target_kl: 0.25
-clip_range: 0.2
+clip_range: 0.05
clip_type: 1  # Gradient clip for Mindspore: 0: ms.ops.clip_by_value; 1: ms.nn.ClipByNorm()
gamma: 0.99  # discount factor

@@ -46,7 +46,7 @@ use_global_state: False  # if use global state to replace joint observations
use_grad_norm: True  # gradient normalization
max_grad_norm: 10.0
use_value_clip: True  # limit the value range
-value_clip_range: 0.2
+value_clip_range: 0.05
use_value_norm: True  # use running mean and std to normalize rewards.
use_huber_loss: True  # True: use huber loss; False: use MSE loss.
huber_delta: 10.0
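
The MAPPO configs for 5m_vs_6m (above) and 8m_vs_9m (next) tighten both `clip_range` and `value_clip_range` from 0.2 to 0.05. For reference, the sketch below shows the standard PPO clipped surrogate and clipped value loss that these two settings bound. It is a generic NumPy illustration, not the repository's learner; in particular it uses a squared error for brevity, whereas the config above selects a Huber loss via `use_huber_loss: True`.

```python
import numpy as np

def mappo_clipped_losses(logp_new, logp_old, adv, v_new, v_old, returns,
                         clip_range=0.05, value_clip_range=0.05):
    """Generic PPO-style clipped policy and value losses (element-wise mean)."""
    ratio = np.exp(logp_new - logp_old)
    # policy term: pessimistic minimum of the unclipped and clipped surrogates
    unclipped = ratio * adv
    clipped = np.clip(ratio, 1.0 - clip_range, 1.0 + clip_range) * adv
    policy_loss = -np.mean(np.minimum(unclipped, clipped))

    # value term: the new prediction may move at most value_clip_range from the old one
    v_clipped = v_old + np.clip(v_new - v_old, -value_clip_range, value_clip_range)
    value_loss = np.mean(np.maximum((v_new - returns) ** 2,
                                    (v_clipped - returns) ** 2))
    return policy_loss, value_loss
```

A smaller `clip_range` keeps each epoch's update closer to the data-collecting policy, which is a common way to stabilise training on harder SMAC maps.
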
4 changes: 2 additions & 2 deletions xuanpolicy/configs/mappo/sc2/8m_vs_9m.yaml
@@ -35,7 +35,7 @@ weight_decay: 0
vf_coef: 1.0
ent_coef: 0.01
target_kl: 0.25
-clip_range: 0.2
+clip_range: 0.05
clip_type: 1  # Gradient clip for Mindspore: 0: ms.ops.clip_by_value; 1: ms.nn.ClipByNorm()
gamma: 0.99  # discount factor

@@ -46,7 +46,7 @@ use_global_state: False  # if use global state to replace joint observations
use_grad_norm: True  # gradient normalization
max_grad_norm: 10.0
use_value_clip: True  # limit the value range
-value_clip_range: 0.2
+value_clip_range: 0.05
use_value_norm: True  # use running mean and std to normalize rewards.
use_huber_loss: True  # True: use huber loss; False: use MSE loss.
huber_delta: 10.0
6 changes: 3 additions & 3 deletions xuanpolicy/configs/mappo/sc2/MMM2.yaml
@@ -18,7 +18,7 @@ N_recurrent_layers: 1
dropout: 0
normalize: "LayerNorm"
initialize: "orthogonal"
-gain: 0.01
+gain: 1.0

actor_hidden_size: []
critic_hidden_size: []
@@ -27,8 +27,8 @@ activation: "ReLU"
seed: 1
parallels: 1
n_size: 128
-n_epoch: 15
-n_minibatch: 1
+n_epoch: 5
+n_minibatch: 2
learning_rate: 0.0007  # 7e-4
weight_decay: 0

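The MMM2 MAPPO config above also moves the orthogonal-initialization `gain` from 0.01 to 1.0. In PyTorch terms this corresponds to the `gain` argument of `nn.init.orthogonal_`; the snippet below is a generic illustration of what that knob does, not the repository's initializer.

```python
import torch.nn as nn

layer = nn.Linear(64, 64)
# gain rescales the orthogonal weight matrix: gain=0.01 yields near-zero initial
# outputs (often used for policy heads), while gain=1.0 preserves the scale of
# activations flowing through the layer.
nn.init.orthogonal_(layer.weight, gain=1.0)
nn.init.zeros_(layer.bias)
```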