Skip to content

Commit

Permalink
Fix for CPU OOM
Browse files Browse the repository at this point in the history
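Summary: Issue: when prefetch_pipeline was disabled, we still kept appending to ssd_scratch_pads but never popped from it, so the list grew on every prefetch and eventually caused a CPU OOM. The fix appends to ssd_scratch_pads only when prefetch_pipeline is enabled.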

Differential Revision: D63547432
  • Loading branch information
duduyi2013 authored and facebook-github-bot committed Sep 27, 2024
1 parent d056aa3 commit 5bc5565
Showing 1 changed file with 12 additions and 10 deletions.
22 changes: 12 additions & 10 deletions fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,15 +499,16 @@ def __init__(
# pyre-fixme[4]: Attribute must be annotated.
self.ssd_prefetch_data = []

# Scratch pad value queue
self.ssd_scratch_pads: List[Tuple[Tensor, Tensor, Tensor]] = []
# Scratch pad eviction data queue
self.ssd_scratch_pad_eviction_data: List[
Tuple[Tensor, Tensor, Tensor, bool]
] = []
self.ssd_location_update_data: List[Tuple[Tensor, Tensor]] = []

if self.prefetch_pipeline:
# Scratch pad value queue
self.ssd_scratch_pads: List[Tuple[Tensor, Tensor, Tensor]] = []

# pyre-ignore[4]
# Scratch pad index queue
self.scratch_pad_idx_queue = torch.classes.fbgemm.SSDScratchPadIndicesQueue(
Expand Down Expand Up @@ -1407,15 +1408,16 @@ def prefetch( # noqa C901
if t.is_cuda:
t.record_stream(forward_stream)

# Store scratch pad info for the lookup in the next iteration
# prefetch
self.ssd_scratch_pads.append(
(
inserted_rows,
post_bwd_evicted_indices_cpu,
actions_count_cpu,
if self.prefetch_pipeline:
# Store scratch pad info for the lookup in the next iteration
# prefetch
self.ssd_scratch_pads.append(
(
inserted_rows,
post_bwd_evicted_indices_cpu,
actions_count_cpu,
)
)
)

# Store scratch pad info for post backward eviction
self.ssd_scratch_pad_eviction_data.append(
Expand Down

0 comments on commit 5bc5565

Please sign in to comment.