diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index 17fa5c35457c2..90c86d4e6c59d 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -104,7 +104,10 @@ def init_device(self) -> None:
         # Use persistent cache to avoid XLA recompilation.
         # NOTE(woosuk): This does not completely eliminate the recompilation
         # overhead because dynamo does not cache the compiled results.
-        xr.initialize_cache(envs.VLLM_XLA_CACHE_PATH, readonly=False)
+        # NOTE(woosuk): Set readonly=False only for the rank 0 process to avoid
+        # race conditions.
+        xr.initialize_cache(envs.VLLM_XLA_CACHE_PATH,
+                            readonly=not self.is_driver_worker)
 
     def load_model(self):
         self.model_runner.load_model()