SpeechLM Update #12430

Open
wants to merge 165 commits into base: main
Commits (165)
d967c51
fix type bugs
stevehuang52 Sep 16, 2024
84eaa59
Merge branch 'main' of https://github.com/NVIDIA/NeMo into main
stevehuang52 Sep 16, 2024
22217d0
Merge remote-tracking branch 'origin/main' into slm_v2
stevehuang52 Sep 19, 2024
799a5ec
Merge remote-tracking branch 'origin/main' into slm_v2
stevehuang52 Sep 19, 2024
568f073
Update mixin.py
stevehuang52 Sep 20, 2024
f91030d
Apply isort and black reformatting
stevehuang52 Sep 20, 2024
c9256ca
Update mixin.py
stevehuang52 Sep 21, 2024
e41e4f9
Apply isort and black reformatting
stevehuang52 Sep 21, 2024
2a70bd3
Update mixin.py
stevehuang52 Sep 23, 2024
725fd88
Merge branch 'main' into slm_v2
stevehuang52 Sep 23, 2024
de70200
add datamodule
stevehuang52 Sep 25, 2024
98a2673
Merge remote-tracking branch 'origin/main' into slm_v2
stevehuang52 Oct 1, 2024
a00e5dd
add speechlm peft train, continue train, validation and misc
stevehuang52 Oct 8, 2024
52a1b0e
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Oct 8, 2024
9d57bc6
resolve merge confict
stevehuang52 Oct 8, 2024
ae3c5be
update datamodule
stevehuang52 Oct 8, 2024
c2ef987
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Oct 8, 2024
dc951e2
add script
stevehuang52 Oct 8, 2024
248f4e2
fix speechlm inference
stevehuang52 Oct 14, 2024
c42e17e
update
stevehuang52 Oct 14, 2024
9ec5a96
mergin origin/main
stevehuang52 Oct 14, 2024
90dcb6a
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Oct 15, 2024
0a93523
fix tp support
stevehuang52 Oct 15, 2024
5c87470
Apply isort and black reformatting
stevehuang52 Oct 15, 2024
934b0db
Apply isort and black reformatting
artbataev Oct 15, 2024
e7af9a7
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Oct 22, 2024
0c32fa5
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Oct 29, 2024
9d98e89
update
stevehuang52 Oct 30, 2024
b659912
refactor
stevehuang52 Nov 1, 2024
3653320
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Nov 13, 2024
31bc21c
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Nov 14, 2024
2256602
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Nov 18, 2024
6a18e9a
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Nov 19, 2024
5ca4729
updated for latest changes in main, tested for interleave audio-text …
stevehuang52 Nov 20, 2024
31378a5
fix nemo tp>1, and lhotse tp=1
stevehuang52 Nov 20, 2024
6bf47f7
update prompt formatter
stevehuang52 Nov 26, 2024
ccd78ba
update cfg
stevehuang52 Nov 26, 2024
239d0f9
update asr
stevehuang52 Nov 26, 2024
dd9aba4
update tokenizer
stevehuang52 Nov 26, 2024
2bd5ce7
update prompt format fn
stevehuang52 Nov 26, 2024
34531ca
update prompt format fn
stevehuang52 Nov 26, 2024
41c36ea
update lhotse
stevehuang52 Nov 26, 2024
2f240bf
update data sampler
stevehuang52 Nov 26, 2024
644c0b8
update data sampler
stevehuang52 Nov 26, 2024
cb2661b
update lhotse
stevehuang52 Nov 26, 2024
3db0764
update lhotse
stevehuang52 Nov 26, 2024
a932673
update lhotse dataset
stevehuang52 Nov 26, 2024
bd8e3e8
debug
stevehuang52 Nov 26, 2024
44bfb2c
debug
stevehuang52 Nov 26, 2024
43bb800
debug
stevehuang52 Nov 27, 2024
08acb79
debug
stevehuang52 Nov 27, 2024
ea538ed
debug
stevehuang52 Nov 27, 2024
4045085
update
stevehuang52 Nov 27, 2024
108eb9d
update
stevehuang52 Nov 27, 2024
7d9f855
add wandb to cfg yaml
stevehuang52 Nov 27, 2024
1338245
add support for max_time_per_run
stevehuang52 Dec 2, 2024
1bc0210
add debug info
stevehuang52 Dec 2, 2024
988e585
add check
stevehuang52 Dec 3, 2024
7a7052d
update sampler
stevehuang52 Dec 3, 2024
e8ea92f
update dataset
stevehuang52 Dec 4, 2024
b3b0f33
update dataset
stevehuang52 Dec 4, 2024
63590b3
try fix peft resume
stevehuang52 Dec 4, 2024
e07d88a
try fix peft resume
stevehuang52 Dec 4, 2024
bc1d434
clean up
stevehuang52 Dec 4, 2024
354c2a9
refactor
stevehuang52 Dec 5, 2024
811ac6c
refactor
stevehuang52 Dec 5, 2024
55d1bb8
fix val dl name
stevehuang52 Dec 5, 2024
1425f82
fix val/test names/outputs
stevehuang52 Dec 5, 2024
d187a03
fix lhotse name
stevehuang52 Dec 5, 2024
1dc2d91
fix loss
stevehuang52 Dec 6, 2024
47c07c9
update data statedict
stevehuang52 Dec 9, 2024
ad35350
update data module
stevehuang52 Dec 10, 2024
5447525
update
stevehuang52 Dec 16, 2024
ea6618e
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Dec 16, 2024
d94b7d1
fix
stevehuang52 Dec 18, 2024
d51a675
fix data
stevehuang52 Dec 18, 2024
366dad6
fix data
stevehuang52 Dec 18, 2024
01f3d5a
fix data
stevehuang52 Dec 19, 2024
63b7097
fix data
stevehuang52 Jan 2, 2025
d9a1206
fix peft load ckpt
stevehuang52 Jan 3, 2025
2b1ecd9
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Jan 6, 2025
8f84111
fix megatron batch sampler
stevehuang52 Jan 7, 2025
8c21769
fix and clean up
stevehuang52 Jan 8, 2025
ae9009b
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Jan 13, 2025
bea40e5
fix for lhotse
stevehuang52 Jan 14, 2025
674289d
clean up
stevehuang52 Jan 14, 2025
809a4c4
refactor
stevehuang52 Jan 14, 2025
7844ae8
clean up
stevehuang52 Jan 14, 2025
fa34d07
Merge branch 'main' into heh/speechlm_nemo2.0
stevehuang52 Jan 14, 2025
bfa9599
clean up
stevehuang52 Jan 14, 2025
bdf8d26
clean up
stevehuang52 Jan 14, 2025
25062b3
clean up
stevehuang52 Jan 15, 2025
c9b5da4
clean up
stevehuang52 Jan 15, 2025
aa935ee
refactor data
stevehuang52 Jan 16, 2025
9e56f03
refactor data
stevehuang52 Jan 16, 2025
6949ef5
update llama prompt
stevehuang52 Jan 16, 2025
8769553
update llama prompt
stevehuang52 Jan 16, 2025
417afaa
update lhotse
stevehuang52 Jan 16, 2025
01cf0dc
diable warning
stevehuang52 Jan 16, 2025
b463408
revert changes
stevehuang52 Jan 16, 2025
0a3fccc
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Jan 16, 2025
bcd5711
refactor dataset
stevehuang52 Jan 17, 2025
1cdf312
clean up
stevehuang52 Jan 17, 2025
fb36e06
clean up
stevehuang52 Jan 23, 2025
7c2f159
refactor and clean up
stevehuang52 Jan 23, 2025
1e64395
clean up
stevehuang52 Jan 23, 2025
f96bfd7
clean up
stevehuang52 Jan 23, 2025
eaaa5f4
fix codeQL
stevehuang52 Jan 23, 2025
4a28305
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Jan 23, 2025
a130e13
fix codeQL
stevehuang52 Jan 23, 2025
855e7e4
address comments
stevehuang52 Jan 28, 2025
9318efd
fix when consumed samples more than total samples
stevehuang52 Jan 29, 2025
da2f8bf
add ci test
stevehuang52 Jan 29, 2025
687903c
Merge branch 'main' into heh/speechlm_nemo2.0
ko3n1g Jan 30, 2025
a8cb982
Update .github/workflows/cicd-main.yml
stevehuang52 Jan 30, 2025
cabd8f1
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Jan 30, 2025
e95b44c
disable test_subword_decoding_greedy_forward_hypotheses
stevehuang52 Jan 30, 2025
498db3d
fix ci
stevehuang52 Jan 30, 2025
1dab028
fix ci
stevehuang52 Jan 30, 2025
f653988
update for CI
stevehuang52 Jan 31, 2025
712dffa
fix megatron sampler and clean up
stevehuang52 Jan 31, 2025
a277e33
clean up
stevehuang52 Feb 3, 2025
610a1c4
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Feb 3, 2025
235686d
update ckpt import from HF
stevehuang52 Feb 3, 2025
8778e68
fix dist ckpt load
stevehuang52 Feb 3, 2025
93b2123
add exception handling
stevehuang52 Feb 3, 2025
d317129
refactor loading pretrained llm
stevehuang52 Feb 4, 2025
bca15f0
refactor and clean up
stevehuang52 Feb 4, 2025
4c2825d
Merge branch 'main' into heh/speechlm_nemo2.0
stevehuang52 Feb 4, 2025
c04abe4
refactor and fix
stevehuang52 Feb 6, 2025
721aba3
Merge branch 'heh/speechlm_nemo2.0' of https://github.com/NVIDIA/NeMo…
stevehuang52 Feb 6, 2025
bf42258
Merge remote-tracking branch 'origin/main' into heh/speechlm_nemo2.0
stevehuang52 Feb 6, 2025
54dfa56
refactor
stevehuang52 Feb 6, 2025
24f316d
refactor for peft
stevehuang52 Feb 7, 2025
d825e83
fix CI config
stevehuang52 Feb 8, 2025
b66f195
debug
stevehuang52 Feb 10, 2025
095c24e
added multimodal conversation
stevehuang52 Feb 11, 2025
66d3ece
update
stevehuang52 Feb 12, 2025
3901303
debug
stevehuang52 Feb 12, 2025
d9b29c4
debug
stevehuang52 Feb 13, 2025
07755a4
update cfg
stevehuang52 Feb 13, 2025
3b3ae9e
clean up
stevehuang52 Feb 13, 2025
1dcbd6c
Merge remote-tracking branch 'origin/main' into heh/speechlm_dev
stevehuang52 Feb 14, 2025
1fc1ffd
add cp and fix tp
stevehuang52 Feb 18, 2025
b93bcf0
add missing __init__
stevehuang52 Feb 18, 2025
144e39f
fix
stevehuang52 Feb 18, 2025
c0040d3
fix import ckpt
stevehuang52 Feb 18, 2025
c6791b0
update io
stevehuang52 Feb 19, 2025
74a04be
fix hf tokenizer remove_special_tokens
stevehuang52 Feb 20, 2025
957e729
refactor
stevehuang52 Feb 20, 2025
dd725e7
comment out lhotse assert
stevehuang52 Feb 20, 2025
04c6e77
update cfg
stevehuang52 Feb 20, 2025
902b0f6
update cfg
stevehuang52 Feb 21, 2025
d05cbd4
refactor and update inference
stevehuang52 Feb 25, 2025
b2a853f
update infer
stevehuang52 Feb 25, 2025
c0da97f
update
stevehuang52 Feb 25, 2025
f2f288a
update cfg
stevehuang52 Feb 25, 2025
fd6de0d
fix peft trainable params and update
stevehuang52 Feb 26, 2025
26bb5a4
add support for whisper encoder
stevehuang52 Feb 26, 2025
601364e
Merge remote-tracking branch 'origin/main' into heh/speechlm_dev
stevehuang52 Feb 26, 2025
b88f0c0
update
stevehuang52 Feb 28, 2025
d7eca13
clean up
stevehuang52 Mar 3, 2025
64e027e
clean up
stevehuang52 Mar 3, 2025
a66d99b
Merge remote-tracking branch 'origin/main' into heh/speechlm_dev
stevehuang52 Mar 3, 2025
0188d8d
clean up
stevehuang52 Mar 3, 2025
17 changes: 10 additions & 7 deletions examples/speechlm/conf/salm/salm_llama3-8b_fc_fc_peft.yaml
@@ -6,7 +6,7 @@ data:
common:
global_batch_size: 2
micro_batch_size: 2
- max_seq_length: 2048
+ max_seq_length: 4096
min_seq_length: 1
sample_rate: 16000
end_string: null
@@ -120,10 +120,11 @@ model:
_target_: nemo.collections.asr.models.EncDecMultiTaskModel
pretrained_model: "nvidia/canary-1b"
target_module: "encoder"
+ sample_rate: ${data.common.sample_rate}
spec_augment_config:
_target_: nemo.collections.asr.modules.SpectrogramAugmentation
- freq_masks: 2 # set to zero to disable it
- time_masks: 10 # set to zero to disable it
+ freq_masks: 0 # set to zero to disable it, otherwise use something like 2
+ time_masks: 0 # set to zero to disable it, otherwise use something like 10
freq_width: 27
time_width: 0.05

@@ -214,23 +215,25 @@ trainer:
devices: -1
accelerator: gpu
num_nodes: 1
- max_epochs: 1000 # used to keep epoch logging correctly, but training will stop based on max_steps
+ max_epochs: -1
max_steps: 1000000 # 1M steps
- log_every_n_steps: 2 # frequency with which training steps are logged
- val_check_interval: 1.0 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch
accumulate_grad_batches: 1
+ log_every_n_steps: 10 # frequency with which training steps are logged
+ val_check_interval: 2000 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch
num_sanity_val_steps: 0
sync_batchnorm: true # used for convolution modules like FC

strategy:
_target_: nemo.collections.speechlm.strategies.SpeechLMMegatronStrategy
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
+ context_parallel_size: 1
+ ckpt_async_save: true

callbacks:
checkpoint:
_target_: nemo.lightning.pytorch.callbacks.ModelCheckpoint
- filename: '${name}--{${callbacks.checkpoint.monitor}:.3f}-{step}-{epoch}'
+ filename: '${name}--{${callbacks.checkpoint.monitor}:.5f}-{step}'
monitor: "val_loss"
mode: "min"
save_last: true
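The SpecAugment change above zeroes out both mask counts by default. As a minimal sketch of the opt-in path the new comments describe (not part of the PR; it only assumes the config is consumed through OmegaConf, which the ${...} interpolation syntax implies), re-enabling masking with the suggested values would look like:

from omegaconf import OmegaConf

# Load the updated SALM config and turn SpecAugment back on with the values
# suggested in the comments ("something like 2" / "something like 10").
cfg = OmegaConf.load("examples/speechlm/conf/salm/salm_llama3-8b_fc_fc_peft.yaml")
cfg.model.speech_encoder.spec_augment_config.freq_masks = 2   # PR default is 0 (disabled)
cfg.model.speech_encoder.spec_augment_config.time_masks = 10  # PR default is 0 (disabled)

The same keys apply to the other SALM configs in this PR, which add an identical spec_augment_config block under model.speech_encoder.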
19 changes: 14 additions & 5 deletions examples/speechlm/conf/salm/salm_llama3-8b_fc_linear_peft.yaml
@@ -6,7 +6,7 @@ data:
common:
global_batch_size: 2
micro_batch_size: 2
- max_seq_length: 2048
+ max_seq_length: 4096
min_seq_length: 1
sample_rate: 16000
end_string: null
@@ -118,6 +118,13 @@ model:
speech_encoder:
pretrained_model: "stt_en_fastconformer_transducer_large"
target_module: "encoder"
+ sample_rate: ${data.common.sample_rate}
+ spec_augment_config:
+ _target_: nemo.collections.asr.modules.SpectrogramAugmentation
+ freq_masks: 0 # set to zero to disable it
+ time_masks: 0 # set to zero to disable it
+ freq_width: 27
+ time_width: 0.05

modality_adapter:
input_key_from: "d_model" # attribute of model dim in the speech model
@@ -161,23 +168,25 @@ trainer:
devices: -1
accelerator: gpu
num_nodes: 1
- max_epochs: 1000 # used to keep epoch logging correctly, but training will stop based on max_steps
+ max_epochs: -1
max_steps: 1000000 # 1M steps
- log_every_n_steps: 2 # frequency with which training steps are logged
- val_check_interval: 1.0 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch
accumulate_grad_batches: 1
+ log_every_n_steps: 10 # frequency with which training steps are logged
+ val_check_interval: 2000 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch
num_sanity_val_steps: 0
sync_batchnorm: true # used for convolution modules like FC

strategy:
_target_: nemo.collections.speechlm.strategies.SpeechLMMegatronStrategy
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
+ context_parallel_size: 1
+ ckpt_async_save: true

callbacks:
checkpoint:
_target_: nemo.lightning.pytorch.callbacks.ModelCheckpoint
- filename: '${name}--{${callbacks.checkpoint.monitor}:.3f}-{step}-{epoch}'
+ filename: '${name}--{${callbacks.checkpoint.monitor}:.5f}-{step}'
monitor: "val_loss"
mode: "min"
save_last: true
270 changes: 270 additions & 0 deletions examples/speechlm/conf/salm/salm_llama3.1-8b_fc_fc_peft.yaml
@@ -0,0 +1,270 @@
name: megatron_audio_gpt_peft


############ Data ############
data:
common:
global_batch_size: 2
micro_batch_size: 2
max_seq_length: 4096
min_seq_length: 1
sample_rate: 16000
end_string: null
context_key: 'context'
answer_key: 'answer'
prompt_format: null
prompt_template: "Q: {context}\nA: {answer}" # fstring to use for assistant prompt.
separate_prompt_and_response_with_newline: False
truncation_field: 'context'
add_eos: true
add_sep: false
add_bos: false
tokens_to_generate: 128
audio_locator: null
add_boa_eoa: false

train_ds:
# Example of how to specify paths to multiple datasets
# manifest_filepath:
# - /path/to/squad.jsonl
# - /path/to/mnli.jsonl
# - /path/to/boolq.jsonl
# Example of how each dataset is formatted
# {'audio_filepath': 'audio1.wav', 'offset': 0.0, 'duration': 12.3, 'context': 'transcribe this audio', 'answer': 'I have a dream...'}
# the 'answer' field can also be 'text', and a default 'context' field is added if missing in manifests, so as to work with ASR manifests
manifest_filepath: ??? # Path to a list of JSONL files corresponding to the source data.
global_batch_size: ${data.common.global_batch_size}
micro_batch_size: ${data.common.micro_batch_size}
shuffle: True
num_workers: 0
pin_memory: True
max_seq_length: ${data.common.max_seq_length}
min_seq_length: ${data.common.min_seq_length}
drop_last: True
# Notably, the data weights are controlled by either bucketing_weights
# or concat_sampling_probabilities depending on the dataset type (tar and
# non-tar).
concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random'
context_key: ${data.common.context_key}
answer_key: ${data.common.answer_key}
end_string: ${data.common.end_string}
add_eos: ${data.common.add_eos}
add_sep: ${data.common.add_sep}
add_bos: ${data.common.add_bos}
separate_prompt_and_response_with_newline: ${data.common.separate_prompt_and_response_with_newline}
truncation_field: ${data.common.truncation_field} # Options: ['context', 'answer']
prompt_template: ${data.common.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
# ASR configs
sample_rate: ${data.common.sample_rate}
max_duration: 24 # it is set for LibriSpeech, you may need to update it for your dataset
min_duration: 0.1
# tarred datasets
is_concat: false
is_tarred: false
tarred_audio_filepaths: null
shuffle_n: 2048
# bucketing params
bucketing_strategy: "fully_randomized"
bucketing_batch_size: null
audio_locator: ${data.common.audio_locator}
prompt_format: ${data.common.prompt_format}

validation_ds:
manifest_filepath: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
global_batch_size: ${data.common.global_batch_size}
micro_batch_size: ${data.common.micro_batch_size}
shuffle: False
num_workers: 0
pin_memory: True
max_seq_length: ${data.common.max_seq_length}
min_seq_length: ${data.common.min_seq_length}
drop_last: true # no effect, the dataloader will drop last for train and validation anyway
context_key: ${data.common.context_key}
answer_key: ${data.common.answer_key}
add_eos: ${data.common.add_eos}
end_string: ${data.common.end_string}
add_sep: ${data.common.add_sep}
add_bos: ${data.common.add_bos}
separate_prompt_and_response_with_newline: ${data.common.separate_prompt_and_response_with_newline}
output_file_path_prefix: null # Prefix of the file to write predictions to.
truncation_field: ${data.common.truncation_field} # Options: ['context', 'answer']
index_mapping_dir: null # Path to a directory to write index mapping files.
prompt_template: ${data.common.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
tokens_to_generate: ${data.common.tokens_to_generate}
write_predictions_to_file: False
# ASR configs
sample_rate: ${data.common.sample_rate}
audio_locator: ${data.common.audio_locator}
prompt_format: ${data.common.prompt_format}

log_every_n_steps: 10
metric:
name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss', 'wer', 'bleu', 'rouge']
average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
num_classes: null


############ Model ############
model:
freeze_language_model: true
freeze_speech_model: false
freeze_modality_adapter: false

llm:
pretrained_model: "meta-llama/Llama-3.1-8B"
_target_: nemo.collections.llm.LlamaModel
config:
_target_: nemo.collections.llm.Llama31Config8B

speech_encoder:
_target_: nemo.collections.asr.models.EncDecMultiTaskModel
pretrained_model: "nvidia/canary-1b"
target_module: "encoder"
sample_rate: ${data.common.sample_rate}
spec_augment_config:
_target_: nemo.collections.asr.modules.SpectrogramAugmentation
freq_masks: 0 # set to zero to disable it, otherwise use something like 2
time_masks: 0 # set to zero to disable it, otherwise use something like 10
freq_width: 27
time_width: 0.05

modality_adapter:
input_key_from: "d_model" # attribute of model dim in the speech model
input_key_to: "feat_in" # attribute of input dim in the modality adapter
output_key: "feat_out" # attribute of output dim in the modality adapter
config:
_target_: nemo.collections.asr.modules.ConformerEncoder
feat_in: -1
feat_out: -1 # you may set it if you need different output size other than the default d_model
n_layers: 2
d_model: 512

# Sub-sampling parameters
subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
subsampling_factor: 1 # must be power of 2 for striding and vggnet
subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model
causal_downsampling: false

# Reduction parameters: Can be used to add another subsampling layer at a given position.
# Having a 2x reduction will speed up the training and inference while keeping similar WER.
# Adding it at the end will give the best WER while adding it at the beginning will give the best speedup.
reduction: null # pooling, striding, or null
reduction_position: null # Encoder block index or -1 for subsampling at the end of encoder
reduction_factor: 1

# Feed forward module's params
ff_expansion_factor: 4

# Multi-headed Attention Module's params
self_attention_model: rel_pos # rel_pos or abs_pos
n_heads: 8 # may need to be lower for smaller d_models
# [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
att_context_size: [-1, -1] # -1 means unlimited context
att_context_style: regular # regular or chunked_limited
xscaling: true # scales up the input embeddings by sqrt(d_model)
untie_biases: true # unties the biases of the TransformerXL layers
pos_emb_max_len: 5000

# Convolution module's params
conv_kernel_size: 9
conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
# conv_context_size can be "causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size
# null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
conv_context_size: null

### regularization
dropout: 0.1 # The dropout used in most of the Conformer Modules
dropout_pre_encoder: 0.1 # The dropout used before the encoder
dropout_emb: 0.0 # The dropout used for embeddings
dropout_att: 0.1 # The dropout for multi-headed attention modules

# set to non-zero to enable stochastic depth
stochastic_depth_drop_prob: 0.0
stochastic_depth_mode: linear # linear or uniform
stochastic_depth_start_layer: 1

peft:
_target_: nemo.collections.llm.peft.LoRA
dim: 32


############ Optimizer ############
optim:
_target_: nemo.lightning.MegatronOptimizerModule
config:
_target_: megatron.core.optimizer.OptimizerConfig
optimizer: adam
lr: 1e-4
clip_grad: 1.0
weight_decay: 0.0001
lr_scheduler:
_target_: nemo.lightning.pytorch.optim.CosineAnnealingScheduler
max_steps: ${trainer.max_steps}
warmup_steps: 250
constant_steps: 10000
min_lr: 5e-5

############ Trainer ############

# Set this to "DD:HH:MM:SS" format to limit the max time for this job
# If `max_time_per_run` is set, `strategy.ckpt_async_save` must be set to false
max_time_per_run: null

trainer:
# _target_: nemo.lightning.Trainer
devices: -1
accelerator: gpu
num_nodes: 1
max_epochs: -1
max_steps: 1000000 # 1M steps
accumulate_grad_batches: 1
log_every_n_steps: 10 # frequency with which training steps are logged
val_check_interval: 2000 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch
num_sanity_val_steps: 0
sync_batchnorm: true # used for convolution modules like FC

strategy:
_target_: nemo.collections.speechlm.strategies.SpeechLMMegatronStrategy
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
context_parallel_size: 1
ckpt_async_save: true

callbacks:
checkpoint:
_target_: nemo.lightning.pytorch.callbacks.ModelCheckpoint
filename: '${name}--{${callbacks.checkpoint.monitor}:.5f}-{step}'
monitor: "val_loss"
mode: "min"
save_last: true
save_top_k: 1
save_weights_only: false
always_save_context: true

plugins:
_target_: nemo.lightning.MegatronMixedPrecision
precision: "bf16-mixed"
autocast_enabled: null

############ AutoResume ############
resume:
_target_: nemo.collections.speechlm.utils.resume.SpeechLMAutoResume
resume_from_directory: null
resume_from_path: null
adapter_path: null
resume_if_exists: true
resume_past_end: false
resume_ignore_no_checkpoint: true


############ Logging ############
logger:
_target_: nemo.lightning.NeMoLogger
log_dir: null # default to ./nemo_experiments
name: ${name}
wandb:
_target_: lightning.pytorch.loggers.WandbLogger
project: null
name: ${logger.name}
resume: false
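
For orientation, a minimal sketch of consuming this new config (not part of the PR; the OmegaConf usage is implied by the ??? placeholders and ${...} interpolations, and the manifest paths below are placeholders, not real files):

from omegaconf import OmegaConf

# Load the new SALM PEFT config for Llama-3.1-8B.
cfg = OmegaConf.load("examples/speechlm/conf/salm/salm_llama3.1-8b_fc_fc_peft.yaml")

# Both manifest_filepath fields are mandatory ("???") and must point to JSONL manifests
# in the format described in the train_ds comments; these paths are placeholders.
cfg.data.train_ds.manifest_filepath = ["/data/train_manifest.jsonl"]
cfg.data.validation_ds.manifest_filepath = ["/data/val_manifest.jsonl"]

# ${...} references resolve against the root config, e.g. the speech encoder's
# sample_rate picks up data.common.sample_rate (16000).
assert cfg.model.speech_encoder.sample_rate == 16000

As the comment above max_time_per_run notes, setting it to a DD:HH:MM:SS value also requires switching strategy.ckpt_async_save to false.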
