You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Traceback (most recent call last):
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 210, in <module>
main()
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 206, in main
train(model, args)
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 91, in train
vllm_engine, vllm_optim, _, _ = deepspeed.initialize(
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/__init__.py", line 165, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 308, in __init__
self._configure_optimizer(optimizer, model_parameters)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1167, in _configure_optimizer
self.optimizer = self._configure_zero_optimizer(basic_optimizer)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1398, in _configure_zero_optimizer
optimizer = DeepSpeedZeroOptimizer(
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 485, in __init__
self.initialize_optimizer_states()
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 614, in initialize_optimizer_states
self.optimizer.step()
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/torch/optim/optimizer.py", line 280, in wrapper
out = func(*args, **kwargs)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py", line 151, in step
multi_tensor_applier(self.multi_tensor_adam, self._dummy_overflow_buf, [g_32, p_32, m_32, v_32],
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/ops/adam/multi_tensor_apply.py", line 17, in __call__
return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)
RuntimeError: CUDA error: an illegal memory access was encountered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
Traceback (most recent call last):
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 210, in <module>
main()
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 206, in main
train(model, args)
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 91, in train
vllm_engine, vllm_optim, _, _ = deepspeed.initialize(
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/__init__.py", line 165, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 308, in __init__
self._configure_optimizer(optimizer, model_parameters)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1167, in _configure_optimizer
Traceback (most recent call last):
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 210, in <module>
self.optimizer = self._configure_zero_optimizer(basic_optimizer)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1398, in _configure_zero_optimizer
main()
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 206, in main
optimizer = DeepSpeedZeroOptimizer(
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 485, in __init__
train(model, args)
File "/home/hpc/LAB-data/disk1/VisCPM/./finetune/ft_viscpm_chat/train_viscpm_chat.py", line 91, in train
vllm_engine, vllm_optim, _, _ = deepspeed.initialize(
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/__init__.py", line 165, in initialize
self.initialize_optimizer_states()
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 614, in initialize_optimizer_states
engine = DeepSpeedEngine(args=args,
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 308, in __init__
self.optimizer.step()
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/torch/optim/optimizer.py", line 280, in wrapper
self._configure_optimizer(optimizer, model_parameters)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1167, in _configure_optimizer
out = func(*args, **kwargs)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py", line 151, in step
multi_tensor_applier(self.multi_tensor_adam, self._dummy_overflow_buf, [g_32, p_32, m_32, v_32],
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/ops/adam/multi_tensor_apply.py", line 17, in __call__
return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)
RuntimeError: CUDA error: an illegal memory access was encountered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
self.optimizer = self._configure_zero_optimizer(basic_optimizer)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1398, in _configure_zero_optimizer
optimizer = DeepSpeedZeroOptimizer(
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 485, in __init__
self.initialize_optimizer_states()
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 614, in initialize_optimizer_states
self.optimizer.step()
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/torch/optim/optimizer.py", line 280, in wrapper
out = func(*args, **kwargs)
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py", line 151, in step
multi_tensor_applier(self.multi_tensor_adam, self._dummy_overflow_buf, [g_32, p_32, m_32, v_32],
File "/home/hpc/anaconda3/envs/viscpm/lib/python3.10/site-packages/deepspeed/ops/adam/multi_tensor_apply.py", line 17, in __call__
return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)
RuntimeError: CUDA error: an illegal memory access was encountered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
The text was updated successfully, but these errors were encountered:
我尝试在两张 A800 80G 上进行微调(CUDA 11.8),报了如下错误,请问要怎么解决呢?
The text was updated successfully, but these errors were encountered: