# config-base.yaml
# Mostly set explicitly from outside this file
adapter: qlora
base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
debug: False
micro_batch_size: 1
revision_of_model: null
sequence_len: 2048
val_set_size: 0.1
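# val_set_size: fraction of the training data held out for evaluation (10% here) when no explicit test split is provided.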
# ---------------------
# Auto-computed and set by the script based on the environment and external state.
# Only edit these if you know what you are doing.
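# The 'auto' values below are placeholders resolved at runtime by the script (e.g. bf16/fp16 and flash attention presumably chosen from the detected GPU).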
data_dir: auto # type: string
datasets: auto # type: list
test_datasets: auto # type: list
batch_flattening: auto # type: bool
bf16: auto # type: bool
bfloat16: auto # type: bool
flash_attention: auto # type: bool
flash_attn_cross_entropy: auto # type: bool
flash_attn_fuse_mlp: auto # type: bool
flash_attn_fuse_qkv: auto # type: bool
flash_attn_rms_norm: auto # type: bool
float16: auto # type: bool
fp16: auto # type: bool
load_in_4bit: auto # type: bool
lora_modules_to_save: auto # type: list
optimizer: auto # type: string
resume_from_checkpoint: auto # type: bool
special_tokens: auto # type: dict
unsloth_cross_entropy_loss: auto # type: bool
unsloth_lora_mlp: auto # type: bool
unsloth_lora_qkv: auto # type: bool
unsloth_lora_o: auto # type: bool
unsloth_rms_norm: auto # type: bool
unsloth_rope: auto # type: bool
tf32: auto # type: bool
# ---------------------
# Defaults
auto_find_batch_size: False
bnb_config_kwargs:
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: True
  llm_int8_has_fp16_weight: False
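# nf4 with double quantization is the standard QLoRA 4-bit setup; these kwargs only take effect when the model is loaded quantized.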
adam_beta1: 0.9
adam_beta2: 0.99
auto_resume_from_checkpoints: False
base_model_ignore_patterns:
  - '*.h5'
  - '*.ot'
  - '*.tflite'
  - '*.msgpack'
chat_template: tokenizer_default_fallback_chatml
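# Uses the chat template bundled with the tokenizer, falling back to ChatML if the tokenizer does not define one.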
dataset_prepared_path: ./outputs/data/last_run_prepared
dataset_processes: 1
ddp_timeout: 21600
deepspeed: ./deepspeed_configs/3_ds_z2_config.json
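# Judging by the filename, this DeepSpeed config enables ZeRO stage 2 partitioning.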
default_system_message: null
device_map: null
early_stopping_patience: 10
eval_sample_packing: False
eval_steps: 0.1
eval_strategy: steps
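# Per the Hugging Face Trainer, float eval_steps/save_steps below 1 are interpreted as a fraction of total training steps (every 10% here).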
fix_untrained_tokens: False
gradient_accumulation_steps: 4
gradient_checkpointing: True
gradient_checkpointing_kwargs:
  use_reentrant: True
learning_rate: 0.0002
load_best_model_at_end: True
load_in_8bit: False
logging_steps: 5
lora_alpha: 64
lora_dropout: 0.05
lora_on_cpu: False
lora_r: 32
lora_target_linear: True
lora_target_modules: null
loraplus_lr_ratio: 16
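# loraplus_lr_ratio applies a higher learning rate to the LoRA B matrices (16x the base LR here), per the LoRA+ method.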
low_cpu_mem_usage: True
lr_scheduler: cosine
max_grad_norm: 1.0
mean_resizing_embeddings: True
multipack_real_batches: False
num_epochs: 10
output_dir: ./outputs
peft_use_dora: True
peft_use_rslora: True
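# DoRA (weight-decomposed LoRA) and rank-stabilized LoRA (scaling by alpha/sqrt(r) instead of alpha/r) are both enabled.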
plugins:
  - axolotl_truefoundry.TrueFoundryMLPlugin
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
pad_to_sequence_len: True
remove_unused_columns: True
report_to: tensorboard
resize_token_embeddings_to_32x: False
sample_packing: True
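# sample_packing packs multiple short examples into each 2048-token window to reduce padding waste; pad_to_sequence_len pads whatever remains.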
save_safetensors: True
save_steps: 0.1
save_strategy: steps
save_total_limit: 1
seed: 42
strict: False
tokenizer_type: AutoTokenizer
train_on_inputs: False
trust_remote_code: True
type_of_model: AutoModelForCausalLM
warmup_ratio: 0.1
weight_decay: 0.01
use_mlflow: False
use_wandb: False
use_tensorboard: True
# ---------------------
# Plugin-specific configs
## TrueFoundry
cleanup_output_dir_on_start: False
dataset_type: chat # Can be completion | chat
drop_long_sequences: False
extra_hf_training_args:
  average_tokens_across_devices: True
  eval_on_start: True
  logging_dir: ./tensorboard_logs
merge_adapters_post_train: True
save_model_on_interrupt: False
train_data_uri: null
truefoundry_ml_checkpoint_artifact_name: auto # type: string
truefoundry_ml_enable_reporting: False
truefoundry_ml_log_checkpoints: True
truefoundry_ml_log_gpu_metrics: False
truefoundry_ml_log_merged_model: True
truefoundry_ml_repo: null
truefoundry_ml_run_name: auto # type: string
val_data_uri: null
## Liger
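# Liger swaps in fused Triton kernels (RoPE, RMSNorm, GLU activation, layer norm, fused linear cross-entropy) to cut memory use and improve throughput.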
liger_rope: True
liger_rms_norm: True
liger_glu_activation: True
liger_layer_norm: True
liger_fused_linear_cross_entropy: True
## CutCrossEntropy
cut_cross_entropy: False
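# Cut Cross-Entropy computes the loss without materializing the full logits matrix; disabled here, presumably because Liger's fused linear cross-entropy is already enabled above.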