diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 595a9256f958e..32b9341ae0b93 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -1,4 +1,5 @@ import importlib +import os import pickle import subprocess import sys @@ -423,9 +424,13 @@ def is_attention_free_model(self, architectures: Union[str, def _run_in_subprocess(fn: Callable[[], _T]) -> _T: - with tempfile.NamedTemporaryFile() as output_file: + # NOTE: We use a temporary directory instead of a temporary file to avoid + # issues like https://stackoverflow.com/questions/23212435/permission-denied-to-write-to-my-temporary-file + with tempfile.TemporaryDirectory() as tempdir: + output_filepath = os.path.join(tempdir, "registry_output.tmp") + # `cloudpickle` allows pickling lambda functions directly - input_bytes = cloudpickle.dumps((fn, output_file.name)) + input_bytes = cloudpickle.dumps((fn, output_filepath)) # cannot use `sys.executable __file__` here because the script # contains relative imports @@ -442,7 +447,7 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T: raise RuntimeError(f"Error raised in subprocess:\n" f"{returned.stderr.decode()}") from e - with open(output_file.name, "rb") as f: + with open(output_filepath, "rb") as f: return pickle.load(f)