Fix CPU (host memory) OOM when prefetch_pipeline is disabled

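Summary: Issue: when prefetch_pipeline is disabled, we still keep appending to ssd_scratch_pads but never pop from it, so the queue keeps growing and eventually exhausts CPU memory. Fix: only create and fill ssd_scratch_pads when prefetch_pipeline is enabled. Differential Revision: D63547432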
pytorch · Sep 27, 2024 · 5bc5565 · 5bc5565
1 parent d056aa3
commit 5bc5565
Showing 1 changed file with 12 additions and 10 deletions.
diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py b/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
@@ -499,15 +499,16 @@ def __init__(
         # pyre-fixme[4]: Attribute must be annotated.
         self.ssd_prefetch_data = []
 
-        # Scratch pad value queue
-        self.ssd_scratch_pads: List[Tuple[Tensor, Tensor, Tensor]] = []
         # Scratch pad eviction data queue
         self.ssd_scratch_pad_eviction_data: List[
             Tuple[Tensor, Tensor, Tensor, bool]
         ] = []
         self.ssd_location_update_data: List[Tuple[Tensor, Tensor]] = []
 
         if self.prefetch_pipeline:
+            # Scratch pad value queue
+            self.ssd_scratch_pads: List[Tuple[Tensor, Tensor, Tensor]] = []
+
             # pyre-ignore[4]
             # Scratch pad index queue
             self.scratch_pad_idx_queue = torch.classes.fbgemm.SSDScratchPadIndicesQueue(
@@ -1407,15 +1408,16 @@ def prefetch(  # noqa C901
                     if t.is_cuda:
                         t.record_stream(forward_stream)
 
-            # Store scratch pad info for the lookup in the next iteration
-            # prefetch
-            self.ssd_scratch_pads.append(
-                (
-                    inserted_rows,
-                    post_bwd_evicted_indices_cpu,
-                    actions_count_cpu,
+            if self.prefetch_pipeline:
+                # Store scratch pad info for the lookup in the next iteration
+                # prefetch
+                self.ssd_scratch_pads.append(
+                    (
+                        inserted_rows,
+                        post_bwd_evicted_indices_cpu,
+                        actions_count_cpu,
+                    )
                 )
-            )
 
             # Store scratch pad info for post backward eviction
             self.ssd_scratch_pad_eviction_data.append(