You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Creating model 0
Loading model 0
Loading & Quantizing Model Shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [03:02<00:00, 12.18s/it]
Model created 0 0.067 GB
LoRA layers added 0 0.067 GB
Wrapping model w/ FSDP 0
Traceback (most recent call last):
File "/home/alyssa/lm_fun/fsdp_qlora/train.py", line 953, in <module>
def main(
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/fastcore/script.py", line 125, in call_parse
return _f()
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/fastcore/script.py", line 119, in _f
return tfunc(**merge(args, args_from_prog(func, xtra)))
File "/home/alyssa/lm_fun/fsdp_qlora/train.py", line 1026, in main
mp.spawn(fsdp_main,
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 241, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method="spawn")
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 197, in start_processes
while not context.join():
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 158, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 68, in _wrap
fn(i, *args)
File "/home/alyssa/lm_fun/fsdp_qlora/train.py", line 703, in fsdp_main
model = FSDP(
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 477, in __init__
_auto_wrap(
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/torch/distributed/fsdp/_wrap_utils.py", line 101, in _auto_wrap
_recursive_wrap(**recursive_wrap_kwargs, **root_kwargs) # type: ignore[arg-type]
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/torch/distributed/fsdp/wrap.py", line 543, in _recursive_wrap
wrapped_child, num_wrapped_params = _recursive_wrap(
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/torch/distributed/fsdp/wrap.py", line 543, in _recursive_wrap
wrapped_child, num_wrapped_params = _recursive_wrap(
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/torch/distributed/fsdp/wrap.py", line 543, in _recursive_wrap
wrapped_child, num_wrapped_params = _recursive_wrap(
[Previous line repeated 1 more time]
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/torch/distributed/fsdp/wrap.py", line 561, in _recursive_wrap
return _wrap(module, wrapper_cls, **kwargs), nonwrapped_numel
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/torch/distributed/fsdp/wrap.py", line 490, in _wrap
return wrapper_cls(module, **kwargs)
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 503, in __init__
_init_param_handle_from_module(
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/torch/distributed/fsdp/_init_utils.py", line 548, in _init_param_handle_from_module
_materialize_with_param_init_fn(
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/torch/distributed/fsdp/_init_utils.py", line 851, in _materialize_with_param_init_fn
param_init_fn(module)
File "/home/alyssa/lm_fun/fsdp_qlora/train.py", line 713, in <lambda>
param_init_fn=lambda module: module.to_empty(device=torch.device("cuda"), recurse=False)
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/hqq/core/quantize.py", line 485, in to_empty
return self.cuda(device)
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/hqq/core/quantize.py", line 419, in cuda
self.W_q.data, self.meta = Quantizer.cuda(self.W_q.data, self.meta, device)
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/hqq/core/quantize.py", line 220, in cuda
return Quantizer.to_inplace(W_q, meta, device=device)
File "/home/alyssa/anaconda3/envs/lm_fun/lib/python3.10/site-packages/hqq/core/quantize.py", line 181, in to_inplace
W_q = W_q.to(device).contiguous()
NotImplementedError: Cannot copy out of meta tensor; no data!
The text was updated successfully, but these errors were encountered:
Here's the command I ran:
This crashes with the stack trace shown above:
The text was updated successfully, but these errors were encountered: