accelerated-peft-bnb-nf4-foak-padding-free-sample-configuration.yaml
# FMS Acceleration Plugin Configuration.
#
# Each stanza incorporates various configurations for
# different fine-tuning / training tasks.
plugins:
  # Configurations to accelerate data packing/padding in training
  training:

    # attention module configurations
    # e.g. padding-free modifications to attention layer
    attention:

      # this controls the configurations for padding-free computation of flash attention
      padding_free:
        method: huggingface
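        # Note (general background, not taken from the plugin docs): padding-free
        # training drops pad tokens and packs each batch into one concatenated
        # sequence, computing attention per example with flash-attention's
        # variable-length kernels; the "huggingface" method is assumed here to do
        # this through the transformers flash-attention integration.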
  peft:

    # quantization-related acceleration
    # e.g., kernels for quantized base weights
    quantization:

      # For loading BitsAndBytes quantized layers
      # to serve as 4-bit base weights for LoRA PEFT-tuning.
      # NOTE: AutoGPTQ is currently not properly integrated into huggingface /
      # bitsandbytes, so the recommended quant_type is either "nf4"
      # or "fp4".
      bitsandbytes:
        quant_type: nf4
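        # per the NOTE above, "fp4" is the other supported choice, e.g.:
        # quant_type: fp4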

        # If True, then get_peft_model and prepare_model_for_kbit_training
        # will not be called.
        no_peft_model: false

      fused_ops_and_kernels:

        # load unsloth optimizations for these 4-bit base layer weights.
        # currently only "auto_gptq" and "bitsandbytes" are supported
        base_layer: bitsandbytes
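        # this choice should match the quantized-peft plugin configured above
        # (bitsandbytes in this sample); "auto_gptq" would instead pair with a
        # GPTQ-based accelerated-peft configuration.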

        # activate various unsloth optimizations
        # there are two versions of the plugin
        # - the FastKernel version supports individual kernels
        # - the FastQuantized version is all-or-nothing

        # fused kernels for lora linear layers
        fused_lora: true

        # fast loss triton kernels
        fast_loss: true

        # fast rms norm triton kernels
        fast_rms_layernorm: true

        # fast RoPE embedding triton kernels
        fast_rope_embeddings: true
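
# Usage note (an assumption, not part of this sample file): configurations like
# this are read by the fms-acceleration framework, which activates one plugin
# per stanza above; with fms-hf-tuning, the path to this YAML is typically
# passed to the sft_trainer entrypoint through its acceleration framework
# config argument; see the repository README for the exact flag name.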