Fix mesh_axes and data_sharding for LLaMA 2 GPU configs by listing 'data' before 'stage'.

PiperOrigin-RevId: 646795068
AI-Hypercomputer · Jun 26, 2024 · 679ec8c
1 parent 5a215db
commit 679ec8c
Show file tree

Hide file tree

Showing 2 changed files with 4 additions and 4 deletions.
diff --git a/MaxText/configs/llama2_70b_gpu.yml b/MaxText/configs/llama2_70b_gpu.yml
@@ -17,7 +17,7 @@ logits_dot_in_fp32: False
 per_device_batch_size: 6
 max_target_length: 4096
 
-mesh_axes: ['stage', 'data', 'fsdp', 'fsdp_transpose', 'sequence', 'tensor', 'autoregressive']
+mesh_axes: ['data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'tensor', 'autoregressive']
 logical_axis_rules: [
                       ['activation_batch', ['data', 'fsdp', 'fsdp_transpose',]],
                        # For pipeline parallelism the pre and post decoder layer tensors' batch dimension is sharded by stages.
@@ -52,4 +52,4 @@ logical_axis_rules: [
                       ['cache_sequence', []],
                     ]
 # Axes used for DCN must be earlier in this list than ICI, see (b/339009148) for details
-data_sharding: [['stage', 'data', 'fsdp', 'fsdp_transpose', 'sequence', 'tensor', 'autoregressive']]
+data_sharding: [['data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'tensor', 'autoregressive']]
diff --git a/MaxText/configs/llama2_7b_gpu.yml b/MaxText/configs/llama2_7b_gpu.yml
@@ -18,7 +18,7 @@ logits_dot_in_fp32: False
 per_device_batch_size: 4
 max_target_length: 4096
 
-mesh_axes: ['stage', 'data', 'fsdp', 'fsdp_transpose', 'sequence', 'tensor', 'autoregressive']
+mesh_axes: ['data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'tensor', 'autoregressive']
 logical_axis_rules: [
                       ['activation_batch', ['data', 'fsdp', 'fsdp_transpose',]],
                        # For pipeline parallelism the pre and post decoder layer tensors' batch dimension is sharded by stages.
@@ -54,4 +54,4 @@ logical_axis_rules: [
                     ]
 
 # Axes used for DCN must be earlier in this list than ICI, see (b/339009148) for details
-data_sharding: [['stage', 'data', 'fsdp', 'fsdp_transpose', 'sequence', 'tensor', 'autoregressive']]
+data_sharding: [['data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'tensor', 'autoregressive']]