
Commit 2e569e0

update documentation and config_utils
Signed-off-by: Mamta Singh <[email protected]>
quic-mamta committed Jan 8, 2025
1 parent 793d44a commit 2e569e0
Showing 3 changed files with 27 additions and 19 deletions.
2 changes: 1 addition & 1 deletion QEfficient/finetune/configs/training.py
@@ -37,7 +37,7 @@ class train_config:
     save_model: bool = True
     save_metrics: bool = True # saves training metrics to a json file for later plotting
     intermediate_step_save: int = 1000
-    batching_strategy: str = "padding"
+    batching_strategy: str = "packing"
 
     # TODO: vbaddi: Uncomment post adding qaic to Pytorch Profiler
     # flop_counter: bool = False # Enable flop counter to measure model throughput, can not be used with pytorch profiler at the same time.
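Editor's aside: the change above only moves a dataclass default, so the old behaviour stays available per run. A minimal, hypothetical sketch of that pattern (TrainConfig and its fields here are stand-ins mirroring the snippet above, not the project's actual class):

```python
# Hypothetical illustration: dataclass defaults vs. per-run overrides.
from dataclasses import dataclass


@dataclass
class TrainConfig:
    save_model: bool = True
    save_metrics: bool = True           # training metrics written to JSON for later plotting
    intermediate_step_save: int = 1000
    batching_strategy: str = "packing"  # new default introduced by this commit

default_cfg = TrainConfig()
padding_cfg = TrainConfig(batching_strategy="padding")  # old behaviour, opted into per run
print(default_cfg.batching_strategy, padding_cfg.batching_strategy)  # -> packing padding
```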
30 changes: 13 additions & 17 deletions QEfficient/finetune/utils/config_utils.py
@@ -21,7 +21,7 @@
 import QEfficient.finetune.configs.dataset_config as datasets
 from QEfficient.finetune.configs.peft_config import lora_config, prefix_config
 from QEfficient.finetune.configs.training import train_config
-from QEfficient.finetune.data.sampler import DistributedLengthBasedBatchSampler, LengthBasedBatchSampler
+from QEfficient.finetune.data.sampler import DistributedLengthBasedBatchSampler
 from QEfficient.finetune.dataset.dataset_config import DATASET_PREPROC
 
 
@@ -74,29 +74,25 @@ def generate_dataset_config(train_config, kwargs):
 def get_dataloader_kwargs(train_config, dataset, dataset_processer, mode):
     kwargs = {}
     batch_size = train_config.batch_size_training if mode == "train" else train_config.val_batch_size
-    if train_config.batching_strategy == "padding":
-        if train_config.enable_ddp:
-            kwargs["batch_sampler"] = DistributedLengthBasedBatchSampler(
-                dataset,
-                batch_size=batch_size,
-                rank=dist.get_rank(),
-                num_replicas=dist.get_world_size(),
+    if train_config.enable_ddp:
+        if train_config.context_length:
+            kwargs["sampler"] = data_utils.DistributedSampler(
+                dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank(), shuffle=True
             )
+            kwargs["batch_size"] = batch_size
+            kwargs["drop_last"] = True
+            kwargs["collate_fn"] = default_data_collator
         else:
-            kwargs["batch_sampler"] = LengthBasedBatchSampler(dataset, batch_size, drop_last=True)
-        kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer)
-    elif train_config.batching_strategy == "packing":
-        if train_config.enable_ddp:
-            kwargs["sampler"] = data_utils.DistributedSampler(
+            kwargs["batch_sampler"] = DistributedLengthBasedBatchSampler(
                 dataset,
+                batch_size=batch_size,
                 rank=dist.get_rank(),
                 num_replicas=dist.get_world_size(),
-                shuffle=mode == "train",
-                drop_last=True,
+                shuffle=False,
             )
-        kwargs["batch_size"] = batch_size
-        kwargs["drop_last"] = True
-        kwargs["collate_fn"] = default_data_collator
-    else:
-        raise ValueError(f"Unknown batching strategy: {train_config.batching_strategy}")
+            kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer)
+    else:
+        kwargs["batch_size"] = batch_size
+        kwargs["drop_last"] = True
+        kwargs["collate_fn"] = default_data_collator
     return kwargs
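For readers outside the repo, the dictionary returned above is meant to be unpacked into a PyTorch DataLoader. A minimal, hypothetical sketch of that usage (the toy dataset, collate function, and values below are stand-ins, not code from this repository):

```python
# Hypothetical usage sketch: feeding get_dataloader_kwargs-style kwargs to a DataLoader.
import torch
from torch.utils.data import DataLoader, Dataset, default_collate


class ToyDataset(Dataset):
    """Stand-in for a tokenized fine-tuning dataset."""

    def __init__(self, n: int = 32, seq_len: int = 8):
        self.samples = [
            {"input_ids": torch.arange(seq_len), "labels": torch.arange(seq_len)} for _ in range(n)
        ]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


# Mirrors the non-DDP branch above: fixed batch size, drop incomplete batches,
# and a simple collate function (default_data_collator in the real code).
kwargs = {"batch_size": 4, "drop_last": True, "collate_fn": default_collate}
loader = DataLoader(ToyDataset(), **kwargs)

for batch in loader:
    print(batch["input_ids"].shape)  # torch.Size([4, 8])
    break
```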
14 changes: 13 additions & 1 deletion docs/source/finetune.md
@@ -6,6 +6,9 @@ Same CLI can be used to run Finetuning on gpu by setting the device flag.(for fi
 ## Installation
 
 Same as QEfficient along with QAIC PyTorch Eager mode.
+
+For the QEfficient library: https://github.com/quic/efficient-transformers
+
 For torch_qaic, assuming QEfficient is already installed,
 ```bash
 pip install /opt/qti-aic/integrations/torch_qaic/py310/torch_qaic-0.1.0-cp310-cp310-linux_x86_64.whl
@@ -51,4 +54,13 @@ python -m QEfficient.cloud.finetune --device qaic:0 --use-peft --output_dir ./me
 ```python
 QAIC_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc-per-node 4 -m QEfficient.cloud.finetune --device qaic --enable_ddp --dist_backend qccl --num_epochs 2 --model_name "meta-llama/Llama-3.2-1B"
 ```
-**nproc-per-node is number of workers(gpus) running locally.
+**nproc-per-node is the number of workers (gpus) running locally.
+
+## Visualization
+
+TensorBoard logs are generated inside the runs/ directory with a date and time stamp.
+To visualize the data, run:
+
+```python
+tensorboard --logdir runs/<file> --bind_all
+```
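Editor's aside on the Visualization section added above: a runs/ directory with timestamped subfolders is the default layout produced by torch.utils.tensorboard. A minimal, hypothetical sketch of writing such logs and then viewing them (the log directory name and scalar tag are assumptions, not the trainer's actual names):

```python
# Hypothetical sketch of producing TensorBoard logs like those in runs/.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="runs/finetune_demo")  # defaults to runs/<date-time-stamp> if omitted
for step, loss in enumerate([1.2, 0.9, 0.7, 0.55]):
    writer.add_scalar("train/loss", loss, global_step=step)  # one curve per tag
writer.close()

# Then, as in the docs above:
#   tensorboard --logdir runs/finetune_demo --bind_all
```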
