From 2e569e0941fc641434367fc4cadf5d70dcf20e0c Mon Sep 17 00:00:00 2001
From: Mamta Singh
Date: Wed, 8 Jan 2025 08:12:48 +0000
Subject: [PATCH] update documentation and config_utils

Signed-off-by: Mamta Singh
---
 QEfficient/finetune/configs/training.py   |  2 +-
 QEfficient/finetune/utils/config_utils.py | 30 ++++++++++-------------
 docs/source/finetune.md                   | 14 ++++++++++-
 3 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/QEfficient/finetune/configs/training.py b/QEfficient/finetune/configs/training.py
index e92aeb41..b714259e 100644
--- a/QEfficient/finetune/configs/training.py
+++ b/QEfficient/finetune/configs/training.py
@@ -37,7 +37,7 @@ class train_config:
     save_model: bool = True
     save_metrics: bool = True  # saves training metrics to a json file for later plotting
     intermediate_step_save: int = 1000
-    batching_strategy: str = "padding"
+    batching_strategy: str = "packing"
 
     # TODO: vbaddi: Uncomment post adding qaic to Pytorch Profiler
     # flop_counter: bool = False # Enable flop counter to measure model throughput, can not be used with pytorch profiler at the same time.
diff --git a/QEfficient/finetune/utils/config_utils.py b/QEfficient/finetune/utils/config_utils.py
index 8f2780eb..7b13f8a6 100644
--- a/QEfficient/finetune/utils/config_utils.py
+++ b/QEfficient/finetune/utils/config_utils.py
@@ -21,7 +21,7 @@
 import QEfficient.finetune.configs.dataset_config as datasets
 from QEfficient.finetune.configs.peft_config import lora_config, prefix_config
 from QEfficient.finetune.configs.training import train_config
-from QEfficient.finetune.data.sampler import DistributedLengthBasedBatchSampler, LengthBasedBatchSampler
+from QEfficient.finetune.data.sampler import DistributedLengthBasedBatchSampler
 from QEfficient.finetune.dataset.dataset_config import DATASET_PREPROC
 
 
@@ -74,29 +74,25 @@ def generate_dataset_config(train_config, kwargs):
 def get_dataloader_kwargs(train_config, dataset, dataset_processer, mode):
     kwargs = {}
     batch_size = train_config.batch_size_training if mode == "train" else train_config.val_batch_size
-    if train_config.batching_strategy == "padding":
-        if train_config.enable_ddp:
-            kwargs["batch_sampler"] = DistributedLengthBasedBatchSampler(
-                dataset,
-                batch_size=batch_size,
-                rank=dist.get_rank(),
-                num_replicas=dist.get_world_size(),
+    if train_config.enable_ddp:
+        if train_config.context_length:
+            kwargs["sampler"] = data_utils.DistributedSampler(
+                dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank(), shuffle=True
             )
+            kwargs["batch_size"] = batch_size
+            kwargs["drop_last"] = True
+            kwargs["collate_fn"] = default_data_collator
         else:
-            kwargs["batch_sampler"] = LengthBasedBatchSampler(dataset, batch_size, drop_last=True)
-        kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer)
-    elif train_config.batching_strategy == "packing":
-        if train_config.enable_ddp:
-            kwargs["sampler"] = data_utils.DistributedSampler(
+            kwargs["batch_sampler"] = DistributedLengthBasedBatchSampler(
                 dataset,
+                batch_size=batch_size,
                 rank=dist.get_rank(),
                 num_replicas=dist.get_world_size(),
-                shuffle=mode == "train",
-                drop_last=True,
+                shuffle=False,
             )
+            kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer)
+    else:
         kwargs["batch_size"] = batch_size
         kwargs["drop_last"] = True
         kwargs["collate_fn"] = default_data_collator
-    else:
-        raise ValueError(f"Unknown batching strategy: {train_config.batching_strategy}")
     return kwargs
diff --git a/docs/source/finetune.md b/docs/source/finetune.md
index c42a53f3..966750ba 100644
--- a/docs/source/finetune.md
+++ b/docs/source/finetune.md
@@ -6,6 +6,9 @@ Same CLI can be used to run Finetuning on gpu by setting the device flag.(for fi
 
 ## Installation
 Same as QEfficient along with QAIC PyTorch Eager mode.
+
+For the QEfficient library, see https://github.com/quic/efficient-transformers
+
 For torch_qaic, assuming QEfficient is already installed,
 ```bash
 pip install /opt/qti-aic/integrations/torch_qaic/py310/torch_qaic-0.1.0-cp310-cp310-linux_x86_64.whl
@@ -51,4 +54,13 @@ python -m QEfficient.cloud.finetune --device qaic:0 --use-peft --output_dir ./me
 ```python
 QAIC_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc-per-node 4 -m QEfficient.cloud.finetune --device qaic --enable_ddp --dist_backend qccl --num_epochs 2 --model_name "meta-llama/Llama-3.2-1B"
 ```
-**nproc-per-node is number of workers(gpus) running locally.
\ No newline at end of file
+**nproc-per-node** is the number of workers (GPUs) running locally.
+
+## Visualization
+
+TensorBoard logs are generated inside the runs/ directory with a date and time stamp.
+To visualize the data, run:
+
+```bash
+tensorboard --logdir runs/ --bind_all
+```