Skip to content

Commit

Permalink
[bugfix][distributed] fix multi-node bug for shared memory (vllm-proj…
Browse files Browse the repository at this point in the history
  • Loading branch information
youkaichao authored Jul 19, 2024
1 parent 9dd56cf commit 98fb2f4
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions vllm/distributed/device_communicators/shm_broadcast.py
Original file line number Diff line number Diff line change
@@ -108,8 +108,14 @@ def __init__(self,
# created by the process. The following patch is a workaround.
with patch("multiprocessing.resource_tracker.register",
lambda *args, **kwargs: None):
- self.shared_memory = shared_memory.SharedMemory(name=name)
- assert self.shared_memory.size == self.total_bytes_of_buffer
+ try:
+     self.shared_memory = shared_memory.SharedMemory(name=name)
+     assert self.shared_memory.size == self.total_bytes_of_buffer # noqa
+ except FileNotFoundError:
+     # we might deserialize the object in a different node
+     # in this case, this object is not used,
+     # and we should suppress the error
+     pass

def __reduce__(self):
return (
Expand Down

0 comments on commit 98fb2f4

Please sign in to comment.