diff --git a/finetune.ipynb b/finetune.ipynb
index afa221a..086ffd7 100644
--- a/finetune.ipynb
+++ b/finetune.ipynb
@@ -366,7 +366,8 @@
     "--mlfoundry_checkpoint_artifact_name {mlfoundry_checkpoint_artifact_name} \\\n",
     "--mlfoundry_log_checkpoints {mlfoundry_log_checkpoints} \\\n",
     "--cleanup_output_dir_on_start False \\\n",
-    "--resume_from_checkpoint True\n",
+    "--resume_from_checkpoint True \\\n",
+    "| tee train.log\n",
     "\"\"\"\n",
     "\n",
     "print(f\"Command to run: {COMMAND}\")"
@@ -381,7 +382,7 @@
    },
    "outputs": [],
    "source": [
-    "!{COMMAND} | tee train.log"
+    "!{COMMAND} "
   ]
  }
 ],
diff --git a/train.py b/train.py
index 60235fe..06da0c8 100644
--- a/train.py
+++ b/train.py
@@ -127,6 +127,10 @@ class OtherArguments:
         default="NA",
         metadata={"help": "URL to the jsonl evaluation dataset. Overrides eval_size. Leave as NA if not available"},
     )
+    revision: Optional[str] = field(
+        default=None,
+        metadata={"help": "Model Revision to load"},
+    )
     train_on_prompt: bool = field(
         default=False,
         metadata={"help": "If to train on prompt and include it in the loss"},
@@ -302,13 +306,15 @@ def _cleanup_gpus():
 
 def merge_adapters_if_any(
     model_id: str,
+    revision: Optional[str],
     torch_dtype,
     output_dir: str,
 ):
-    check_if_model_will_fit_only_with_gpus(model_id=model_id, torch_dtype=torch_dtype)
+    check_if_model_will_fit_only_with_gpus(model_id=model_id, revision=revision, torch_dtype=torch_dtype)
     logger.info("Loading model and lora layers for merging ...")
     model = AutoPeftModelForCausalLM.from_pretrained(
         output_dir,
+        revision=revision,
         trust_remote_code=True,
         low_cpu_mem_usage=True,
         torch_dtype=torch_dtype,
@@ -494,6 +500,7 @@ def get_model(
         )
         model = AutoModelForCausalLM.from_pretrained(
             model_source,
+            revision=other_arguments.revision,
             trust_remote_code=True,
             torch_dtype=torch_dtype,
             quantization_config=bnb_config,
@@ -514,6 +521,7 @@
         model_load_kwargs.pop("low_cpu_mem_usage", None)
         model = AutoModelForCausalLM.from_pretrained(
             model_source,
+            revision=other_arguments.revision,
             trust_remote_code=True,
             torch_dtype=get_torch_dtype(training_arguments),
             device_map=device_map,
@@ -596,14 +604,17 @@
     return model
 
 
-def get_tokenizer(model_source: str):
+def get_tokenizer(model_source: str, revision: Optional[str]):
     logger.info("Loading tokenizer...")
     try:
         # Note: First we try loading with use_fast=False because for some models conversion takes too long
-        tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True, use_fast=False)
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_source, revision=revision, trust_remote_code=True, use_fast=False
+        )
     except ValueError:
         tokenizer = AutoTokenizer.from_pretrained(
             model_source,
+            revision=revision,
             trust_remote_code=True,
         )
     logger.info(f"Tokenizer's padding side is {tokenizer.padding_side}")
@@ -675,11 +686,17 @@ def deepspeed_zero3_disabled(training_arguments: HFTrainingArguments):
 
 def check_if_model_will_fit_only_with_gpus(
     model_id: str,
+    revision: Optional[str],
     torch_dtype,
 ):
     with init_empty_weights():
-        model = AutoModelForCausalLM.from_pretrained(
+        config = AutoConfig.from_pretrained(
             model_id,
+            revision=revision,
+            trust_remote_code=True,
+        )
+        model = AutoModelForCausalLM.from_config(
+            config=config,
             trust_remote_code=True,
             torch_dtype=torch_dtype,
             # low_cpu_mem_usage=True,
@@ -796,14 +813,16 @@ def _train(
     )
 
     logger.info("Loading config ...")
-    model_config = AutoConfig.from_pretrained(other_arguments.model_id, trust_remote_code=True)
+    model_config = AutoConfig.from_pretrained(
+        other_arguments.model_id, revision=other_arguments.revision, trust_remote_code=True
+    )
 
     if last_checkpoint_dir:
         model_source = last_checkpoint_dir
     else:
         model_source = other_arguments.model_id
 
-    tokenizer, num_new_tokens = get_tokenizer(model_source)
+    tokenizer, num_new_tokens = get_tokenizer(model_source, revision=other_arguments.revision)
 
     max_length = get_max_length(
         max_length=other_arguments.max_length,
@@ -985,7 +1004,9 @@ def train(training_arguments: HFTrainingArguments, other_arguments: OtherArgumen
     if other_arguments.use_lora or other_arguments.use_qlora:
         with deepspeed_zero3_disabled(training_arguments):
             check_if_model_will_fit_only_with_gpus(
-                model_id=other_arguments.model_id, torch_dtype=get_torch_dtype(training_arguments)
+                model_id=other_arguments.model_id,
+                revision=other_arguments.revision,
+                torch_dtype=get_torch_dtype(training_arguments),
             )
 
     logger.info(get_gpu_metrics())
@@ -1004,6 +1025,7 @@ def train(training_arguments: HFTrainingArguments, other_arguments: OtherArgumen
         with deepspeed_zero3_disabled(training_arguments):
             merge_adapters_if_any(
                 model_id=other_arguments.model_id,
+                revision=other_arguments.revision,
                 torch_dtype=get_torch_dtype(training_arguments),
                 output_dir=training_arguments.output_dir,
             )
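
Usage note, not part of the patch: the new OtherArguments.revision field should surface as a --revision CLI flag, assuming the dataclass fields map to flags the same way the existing ones in the notebook's COMMAND do (e.g. --resume_from_checkpoint). A minimal sketch of an invocation that pins a Hub revision, with placeholder values rather than anything taken from this patch, could look like:

    # Sketch only: <model-id> and <branch-tag-or-commit-sha> are hypothetical placeholders
    python train.py \
      --model_id <model-id> \
      --revision <branch-tag-or-commit-sha> \
      --output_dir ./output \
      | tee train.log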