Update on "Modifying memory estimation options and minor changes"

As per suggestions from tianyu-l in #425, the config options are now: `./run_llama_train.sh --memory_estimation.enabled --memory_estimation.disable_fake_mode` [ghstack-poisoned]
pytorch · Jul 1, 2024 · 6dc4cb0 · 6dc4cb0
1 parent a273a49
commit 6dc4cb0
Show file tree

Hide file tree

Showing 3 changed files with 6 additions and 6 deletions.
diff --git a/estimation.py b/estimation.py
@@ -49,7 +49,7 @@ def estimate_memory(job_config: JobConfig):
     # fake tensor doesn't work with fused rmsnorm
     if (
         job_config.model.norm_type == "fused_rmsnorm"
-        and job_config.memory_estimation.fake_mode_only
+        and not job_config.memory_estimation.disable_fake_mode
     ):
         logger.info(
             "Fused RMSNorm is not supported yet under fake estimation mode. "
@@ -111,7 +111,7 @@ def loss_fn(pred, labels):
     model_config.vocab_size = tokenizer.n_words
     model_config.max_seq_len = job_config.training.seq_len
 
-    with FakeTensorMode() if job_config.memory_estimation.fake_mode_only else contextlib.nullcontext():
+    with FakeTensorMode() if not job_config.memory_estimation.disable_fake_mode else contextlib.nullcontext():
 
         logger.info(
             f"Building {model_name} {job_config.model.flavor} with {model_config}"
@@ -202,7 +202,7 @@ def loss_fn(pred, labels):
             f" {peak_reserved / gib} GiB | num_retries: {num_retries}"
         )
         print(f"Tracker Max: {tracker_peak / gib} GiB")
-        if not job_config.memory_estimation.fake_mode_only and peak_active > 0:
+        if job_config.memory_estimation.disable_fake_mode and peak_active > 0:
             print(f"Tracker Accuracy: {tracker_peak/peak_active}")
         gc.enable()
 

diff --git a/test_runner.py b/test_runner.py
@@ -267,12 +267,11 @@ def build_test_list():
             [
                 [
                     "--memory_estimation.enabled",
-                    "--memory_estimation.fake_mode_only",
                 ]
             ],
             "FSDP2 Memory Tracking and Estimation",
             "fsdp2_mem_tracker",
-            ngpu=8,
+            ngpu=4,
         ),
     ]
     return integration_tests_flavors

diff --git a/torchtitan/config_manager.py b/torchtitan/config_manager.py
@@ -488,8 +488,9 @@ def __init__(self):
         )
 
         self.parser.add_argument(
-            "--memory_estimation.fake_mode_only",
+            "--memory_estimation.disable_fake_mode",
             help="Whether to estimate memory under FakeTensorMode",
+            default=False,
             action="store_true",
         )