moe-scattermoe-granite-ep4-padding-free-foak-sample-configuration.yaml
# FMS Acceleration Plugin Configuration.
#
# Each stanza incorporates various configurations for
# different fine-tuning / training tasks.
plugins:
  # Configurations to accelerate data packing/padding in training
  training:

    # attention module configurations
    # e.g. padding-free modifications to attention layer
    attention:

      # this controls the configurations for padding-free computation of flash attention
      padding_free:
        method: huggingface

    fused_ops_and_kernels:

      # if under the training stanza, then putting
      # base_layer and fused_lora will be a misnomer
      # - this should be in peft.quantized
      # However, if it is specified, it will still
      # be read. This is useful in use cases where
      # the yaml is system generated and not shown
      # to a user.

      # activate various unsloth optimizations
      # there are two versions of the plugin
      # - the FastKernel version supports individual kernels
      # - the FastQuantized version is all-or-nothing

      # fast loss triton kernels
      fast_loss: true

      # fast rms norm triton kernels
      fast_rms_layernorm: true

      # fast RoPE embedding triton kernels
      fast_rope_embeddings: true

    moe:

      # expert-parallel for MoE
      scattermoe:

        # The level of expert-parallel sharding.
        # - 1 means no sharding
        # - if > 1, ensure that it divides the world_size, because the devices
        #   are grouped in sets of ep_degree and the experts are sharded
        #   within each group.
        # - if > 1, also ensure that it divides the number of experts, as each
        #   device will then hold num_of_experts / ep_degree experts.
        ep_degree: 4
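
The divisibility constraints on ep_degree described in the comments above can be checked before launching a run. The following Python sketch is illustrative only and is not part of this configuration or of the fms-acceleration API; check_ep_degree is a hypothetical helper, and world_size and num_experts are assumed example values (the GPU count and the model's expert count).

# Illustrative sketch only: checks the ep_degree constraints described in the
# scattermoe comments above. check_ep_degree, world_size and num_experts are
# assumptions for this example, not names used by fms-acceleration.
def check_ep_degree(ep_degree: int, world_size: int, num_experts: int) -> None:
    if ep_degree < 1:
        raise ValueError("ep_degree must be >= 1 (1 means no sharding)")
    if ep_degree > 1 and world_size % ep_degree != 0:
        raise ValueError(f"ep_degree={ep_degree} must divide world_size={world_size}")
    if ep_degree > 1 and num_experts % ep_degree != 0:
        raise ValueError(f"ep_degree={ep_degree} must divide num_experts={num_experts}")
    groups = world_size // ep_degree
    experts_per_device = num_experts // ep_degree
    print(f"{groups} expert-parallel group(s), {experts_per_device} expert(s) per device")

# Example: 8 GPUs and a hypothetical MoE model with 40 experts,
# using ep_degree: 4 as in this file.
check_ep_degree(ep_degree=4, world_size=8, num_experts=40)
# prints: 2 expert-parallel group(s), 10 expert(s) per device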