accelerated-peft-bnb-nf4-foak-padding-free-sample-configuration.yaml
# FMS Acceleration Plugin Configuration.
#
# Each stanza incorporates various configurations for
# different fine-tuning / training tasks.
plugins:
  # Configurations to accelerate data packing/padding in training
  training:

    # attention module configurations
    # e.g. padding-free modifications to attention layer
    attention:

      # this controls the configurations for padding-free computation of flash attention
      padding_free:
        method: huggingface
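        # Note (general background, not taken from the plugin docs): padding-free
        # training drops pad tokens and packs each batch into one concatenated
        # sequence, computing attention per example with flash-attention's
        # variable-length kernels; the "huggingface" method is assumed here to do
        # this through the transformers flash-attention integration.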
  peft:

    # quantization-related acceleration
    # e.g., kernels for quantized base weights
    quantization:

      # For loading BitsAndBytes quantized layers
      # to serve as 4-bit base weights for LoRA PEFT-tuning.
      # NOTE: AutoGPTQ is currently not properly integrated into huggingface /
      # bitsandbytes, so the recommended quant_type is either "nf4"
      # or "fp4".
      bitsandbytes:
        quant_type: nf4
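        # per the NOTE above, "fp4" is the other supported choice, e.g.:
        # quant_type: fp4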

        # If True, then get_peft_model and prepare_model_for_kbit_training
        # will not be called.
        no_peft_model: false

      fused_ops_and_kernels:

        # load unsloth optimizations for these 4-bit base layer weights.
        # currently only "auto_gptq" and "bitsandbytes" are supported
        base_layer: bitsandbytes
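        # this choice should match the quantized-peft plugin configured above
        # (bitsandbytes in this sample); "auto_gptq" would instead pair with a
        # GPTQ-based accelerated-peft configuration.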

        # activate various unsloth optimizations
        # there are two versions of the plugin
        # - the FastKernel version supports individual kernels
        # - the FastQuantized version is all-or-nothing

        # fused kernels for lora linear layers
        fused_lora: true

        # fast loss triton kernels
        fast_loss: true

        # fast rms norm triton kernels
        fast_rms_layernorm: true

        # fast RoPE embedding triton kernels
        fast_rope_embeddings: true
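
# Usage note (an assumption, not part of this sample file): configurations like
# this are read by the fms-acceleration framework, which activates one plugin
# per stanza above; with fms-hf-tuning, the path to this YAML is typically
# passed to the sft_trainer entrypoint through its acceleration framework
# config argument; see the repository README for the exact flag name.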