diff --git a/large_language_model_pretraining/nemo/Dockerfile b/large_language_model_pretraining/nemo/Dockerfile index a609ea89e..ee30dfd2d 100644 --- a/large_language_model_pretraining/nemo/Dockerfile +++ b/large_language_model_pretraining/nemo/Dockerfile @@ -18,6 +18,7 @@ FROM ${NEMO_BASE_IMAGE} AS nemo-base-image RUN pip uninstall transformers -y RUN pip install transformers==4.47.1 blobfile==3.0.0 RUN pip install prettytable==3.12.0 +RUN pip install toml==0.10.2 RUN pip install git+https://github.com/mlcommons/logging.git@4.1.0-rc3 # setup workspace diff --git a/large_language_model_pretraining/nemo/pretrain_llama31.py b/large_language_model_pretraining/nemo/pretrain_llama31.py index 2a58dd6cd..6ae450b3b 100644 --- a/large_language_model_pretraining/nemo/pretrain_llama31.py +++ b/large_language_model_pretraining/nemo/pretrain_llama31.py @@ -75,8 +75,8 @@ def slurm_executor( gpus_per_node=devices, mem="0", exclusive=True, - gres="gpu:8", - packager=run.GitArchivePackager(subpath="large_language_model_pretraining/nemo", ref="HEAD"), + gres=f"gpu:{devices}", + packager=run.GitArchivePackager(), dependencies=dependencies, )