Hi. I tried to run the demo function with the model mistralai/Mamba-Codestral-7B-v0.1, using torch 2.4.0 and CUDA 12.4 on an Nvidia Tesla P40.
As far as I understand, Triton has a problem with bf16 on the P40. However, I was previously able to load Llama in bf16 on this GPU.
---------------------------------------------------------------------------
CalledProcessError Traceback (most recent call last)
File ~/.local/lib/python3.10/site-packages/triton/backends/nvidia/compiler.py:292, in CUDABackend.make_cubin(src, metadata, opt, capability)
291 try:
--> 292 subprocess.run(cmd, shell=True, check=True)
293 except subprocess.CalledProcessError as e:
File /usr/lib/python3.10/subprocess.py:526, in run(input, capture_output, timeout, check, *popenargs, **kwargs)
525 if check and retcode:
--> 526 raise CalledProcessError(retcode, process.args,
527 output=stdout, stderr=stderr)
528 return CompletedProcess(process.args, retcode, stdout, stderr)
CalledProcessError: Command '/home/andrew/.local/lib/python3.10/site-packages/triton/backends/nvidia/bin/ptxas -lineinfo -v --gpu-name=sm_61 /tmp/tmpxqf2tngx.ptx -o /tmp/tmpxqf2tngx.ptx.o 2> /tmp/tmpbh62k9ys.log' returned non-zero exit status 255.
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
Cell In[1], line 4
1 import torch
2 from mistral_inference.main import *
----> 4 demo('models/Mamba-Codestral-7B-v0.1')
File ~/.local/lib/python3.10/site-packages/mistral_inference/main.py:185, in demo(model_path, max_tokens, temperature, lora_path)
178 warnings.warn(
179 "Batched generation is not correctly supported at the moment and therefore might lead to worse results "
180 "as compared to non-batched generation. "
181 "See https://github.com/state-spaces/mamba/issues/66#issuecomment-1862349718 for more information."
182 )
183 encoded_prompts = pad_and_convert_to_tensor(encoded_prompts, mistral_tokenizer.instruct_tokenizer.BOS) # type: ignore[attr-defined]
--> 185 generated_tokens, _logprobs = generate_fn(
186 encoded_prompts,
187 model, # type: ignore[arg-type]
188 max_tokens=max_tokens,
189 temperature=temperature,
190 eos_id=tokenizer.eos_id,
191 )
193 generated_words = []
194 for i, x in enumerate(generated_tokens):
File ~/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File ~/.local/lib/python3.10/site-packages/mistral_inference/generate.py:21, in generate_mamba(encoded_prompts, model, max_tokens, temperature, chunk_size, eos_id)
10 @torch.inference_mode()
11 def generate_mamba(
12 encoded_prompts: List[List[int]],
(...)
18 eos_id: Optional[int] = None,
19 ) -> Tuple[List[List[int]], List[List[float]]]:
20 input_ids = torch.tensor(encoded_prompts, device=model.device)
---> 21 output = model.model.generate(
22 input_ids=input_ids,
23 max_length=input_ids.shape[-1] + max_tokens,
24 cg=True,
25 return_dict_in_generate=True,
26 output_scores=True,
27 enable_timing=False,
28 eos_token_id=eos_id,
29 temperature=temperature,
30 top_p=0.8,
31 )
32 generated_tokens = output.sequences[:, input_ids.shape[-1] :].tolist()
34 _logprobs: List[List[float]] = [[] for _ in range(len(generated_tokens))]
File ~/.local/lib/python3.10/site-packages/mamba_ssm/utils/generation.py:260, in GenerationMixin.generate(self, input_ids, max_length, top_k, top_p, min_p, temperature, return_dict_in_generate, output_scores, **kwargs)
248 def generate(
249 self,
250 input_ids,
(...)
258 **kwargs,
259 ):
--> 260 output = decode(
261 input_ids, self, max_length, top_k=top_k, top_p=top_p, min_p = min_p, temperature=temperature, **kwargs
262 )
263 if not output_scores:
264 output.scores = None
File ~/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File ~/.local/lib/python3.10/site-packages/mamba_ssm/utils/generation.py:160, in decode(input_ids, model, max_length, top_k, top_p, min_p, temperature, repetition_penalty, eos_token_id, teacher_outputs, vocab_size, cg, enable_timing, streamer)
158 if not hasattr(model, "_decoding_cache"):
159 model._decoding_cache = None
--> 160 model._decoding_cache = update_graph_cache(
161 model,
162 model._decoding_cache,
163 batch_size,
164 seqlen_og,
165 max_length,
166 )
167 inference_params = model._decoding_cache.inference_params
168 inference_params.reset(max_length, batch_size)
File ~/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File ~/.local/lib/python3.10/site-packages/mamba_ssm/utils/generation.py:321, in update_graph_cache(model, cache, batch_size, seqlen_og, max_seqlen, decoding_seqlens, dtype, n_warmups)
319 for decoding_seqlen in decoding_seqlens:
320 if (batch_size, decoding_seqlen) not in cache.callables:
--> 321 cache.callables[batch_size, decoding_seqlen] = capture_graph(
322 model,
323 cache.inference_params,
324 batch_size,
325 max_seqlen,
326 decoding_seqlen=decoding_seqlen,
327 mempool=cache.mempool,
328 n_warmups=n_warmups,
329 )
331 def dispatch(input_ids, position_ids, seqlen):
332 batch_size, decoding_seqlen = input_ids.shape[:2]
File ~/.local/lib/python3.10/site-packages/mamba_ssm/utils/generation.py:355, in capture_graph(model, inference_params, batch_size, max_seqlen, decoding_seqlen, mempool, n_warmups)
353 with torch.cuda.stream(s):
354 for _ in range(n_warmups):
--> 355 logits = model(
356 input_ids,
357 position_ids=position_ids,
358 inference_params=inference_params,
359 num_last_tokens=decoding_seqlen,
360 ).logits
361 s.synchronize()
362 # This might be needed for correctness if we run with NCCL_GRAPH_MIXING_SUPPORT=0,
363 # which requires that graph launch and non-captured launch to not overlap (I think,
364 # that's how I interpret the documentation). I'm not sure if this is required.
File ~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File ~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.local/lib/python3.10/site-packages/mamba_ssm/models/mixer_seq_simple.py:279, in MambaLMHeadModel.forward(self, input_ids, position_ids, inference_params, num_last_tokens, **mixer_kwargs)
274 def forward(self, input_ids, position_ids=None, inference_params=None, num_last_tokens=0, **mixer_kwargs):
275 """
276 "position_ids" is just to be compatible with Transformer generation. We don't use it.
277 num_last_tokens: if > 0, only return the logits for the last n tokens
278 """
--> 279 hidden_states = self.backbone(input_ids, inference_params=inference_params, **mixer_kwargs)
280 if num_last_tokens > 0:
281 hidden_states = hidden_states[:, -num_last_tokens:]
File ~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File ~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.local/lib/python3.10/site-packages/mamba_ssm/models/mixer_seq_simple.py:194, in MixerModel.forward(self, input_ids, inference_params, **mixer_kwargs)
192 residual = None
193 for layer in self.layers:
--> 194 hidden_states, residual = layer(
195 hidden_states, residual, inference_params=inference_params, **mixer_kwargs
196 )
197 if not self.fused_add_norm:
198 residual = (hidden_states + residual) if residual is not None else hidden_states
File ~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File ~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.local/lib/python3.10/site-packages/mamba_ssm/modules/block.py:57, in Block.forward(self, hidden_states, residual, inference_params, **mixer_kwargs)
55 residual = residual.to(torch.float32)
56 else:
---> 57 hidden_states, residual = layer_norm_fn(
58 hidden_states,
59 self.norm.weight,
60 self.norm.bias,
61 residual=residual,
62 prenorm=True,
63 residual_in_fp32=self.residual_in_fp32,
64 eps=self.norm.eps,
65 is_rms_norm=isinstance(self.norm, RMSNorm)
66 )
67 hidden_states = self.mixer(hidden_states, inference_params=inference_params, **mixer_kwargs)
69 if self.mlp is not None:
File ~/.local/lib/python3.10/site-packages/mamba_ssm/ops/triton/layer_norm.py:902, in layer_norm_fn(x, weight, bias, residual, x1, weight1, bias1, eps, dropout_p, rowscale, prenorm, residual_in_fp32, is_rms_norm, return_dropout_mask)
886 def layer_norm_fn(
887 x,
888 weight,
(...)
900 return_dropout_mask=False,
901 ):
--> 902 return LayerNormFn.apply(
903 x,
904 weight,
905 bias,
906 residual,
907 x1,
908 weight1,
909 bias1,
910 eps,
911 dropout_p,
912 rowscale,
913 prenorm,
914 residual_in_fp32,
915 is_rms_norm,
916 return_dropout_mask,
917 )
File ~/.local/lib/python3.10/site-packages/torch/autograd/function.py:574, in Function.apply(cls, *args, **kwargs)
571 if not torch._C._are_functorch_transforms_active():
572 # See NOTE: [functorch vjp and autograd interaction]
573 args = _functorch.utils.unwrap_dead_wrappers(args)
--> 574 return super().apply(*args, **kwargs) # type: ignore[misc]
576 if not is_setup_ctx_defined:
577 raise RuntimeError(
578 "In order to use an autograd.Function with functorch transforms "
579 "(vmap, grad, jvp, jacrev, ...), it must override the setup_context "
580 "staticmethod. For more details, please see "
581 "https://pytorch.org/docs/main/notes/extending.func.html"
582 )
File ~/.local/lib/python3.10/site-packages/mamba_ssm/ops/triton/layer_norm.py:775, in LayerNormFn.forward(ctx, x, weight, bias, residual, x1, weight1, bias1, eps, dropout_p, rowscale, prenorm, residual_in_fp32, is_rms_norm, return_dropout_mask)
769 rowscale = rowscale.reshape(-1).contiguous()
770 residual_dtype = (
771 residual.dtype
772 if residual is not None
773 else (torch.float32 if residual_in_fp32 else None)
774 )
--> 775 y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd(
776 x,
777 weight,
778 bias,
779 eps,
780 residual,
781 x1,
782 weight1,
783 bias1,
784 dropout_p=dropout_p,
785 rowscale=rowscale,
786 residual_dtype=residual_dtype,
787 is_rms_norm=is_rms_norm,
788 return_dropout_mask=return_dropout_mask,
789 )
790 ctx.save_for_backward(
791 residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd
792 )
793 ctx.x_shape_og = x_shape_og
File ~/.local/lib/python3.10/site-packages/mamba_ssm/ops/triton/layer_norm.py:369, in _layer_norm_fwd(x, weight, bias, eps, residual, x1, weight1, bias1, dropout_p, rowscale, out_dtype, residual_dtype, is_rms_norm, return_dropout_mask)
367 raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
368 with torch.cuda.device(x.device.index):
--> 369 _layer_norm_fwd_1pass_kernel[(M,)](
370 x,
371 y,
372 weight,
373 bias,
374 residual,
375 x1,
376 weight1,
377 bias1,
378 y1,
379 residual_out,
380 rowscale,
381 seeds,
382 dropout_mask,
383 mean,
384 rstd,
385 x.stride(0),
386 y.stride(0),
387 residual.stride(0) if residual is not None else 0,
388 residual_out.stride(0) if residual_out is not None else 0,
389 x1.stride(0) if x1 is not None else 0,
390 y1.stride(0) if y1 is not None else 0,
391 M,
392 N,
393 eps,
394 dropout_p,
395 is_rms_norm,
396 BLOCK_N,
397 residual is not None,
398 residual_out is not None,
399 bias is not None,
400 dropout_p > 0.0,
401 dropout_mask is not None,
402 rowscale is not None,
403 )
404 # residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0
405 if dropout_mask is not None and x1 is not None:
File ~/.local/lib/python3.10/site-packages/triton/runtime/jit.py:345, in KernelInterface.__getitem__.<locals>.<lambda>(*args, **kwargs)
339 def __getitem__(self, grid) -> T:
340 """
341 A JIT function is launched with: fn[grid](*args, **kwargs).
342 Hence JITFunction.__getitem__ returns a callable proxy that
343 memorizes the grid.
344 """
--> 345 return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
File ~/.local/lib/python3.10/site-packages/triton/runtime/autotuner.py:156, in Autotuner.run(self, *args, **kwargs)
154 pruned_configs = self.prune_configs(kwargs)
155 bench_start = time.time()
--> 156 timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
157 bench_end = time.time()
158 self.bench_time = bench_end - bench_start
File ~/.local/lib/python3.10/site-packages/triton/runtime/autotuner.py:156, in <dictcomp>(.0)
154 pruned_configs = self.prune_configs(kwargs)
155 bench_start = time.time()
--> 156 timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
157 bench_end = time.time()
158 self.bench_time = bench_end - bench_start
File ~/.local/lib/python3.10/site-packages/triton/runtime/autotuner.py:133, in Autotuner._bench(self, config, *args, **meta)
131 bench_res = do_bench_cudagraph(kernel_call, rep=self.num_reps, return_mode="median")
132 return bench_res
--> 133 return do_bench(kernel_call, warmup=self.num_warmups, rep=self.num_reps, quantiles=(0.5, 0.2, 0.8))
134 except (OutOfResources, CompileTimeAssertionFailure):
135 return float("inf") if self.use_cuda_graph else [float("inf"), float("inf"), float("inf")]
File ~/.local/lib/python3.10/site-packages/triton/testing.py:103, in do_bench(fn, warmup, rep, grad_to_none, quantiles, fast_flush, return_mode)
100 assert return_mode in ["min", "max", "mean", "median"]
101 import torch
--> 103 fn()
104 torch.cuda.synchronize()
106 # We maintain a buffer of 256 MB that we clear
107 # before each kernel call to make sure that the L2
108 # doesn't contain any input data before the run
File ~/.local/lib/python3.10/site-packages/triton/runtime/autotuner.py:114, in Autotuner._bench.<locals>.kernel_call()
112 self.pre_hook(args)
113 try:
--> 114 self.fn.run(
115 *args,
116 **current,
117 )
118 except Exception as e:
119 try:
File ~/.local/lib/python3.10/site-packages/triton/runtime/autotuner.py:338, in Heuristics.run(self, *args, **kwargs)
336 for v, heur in self.values.items():
337 kwargs[v] = heur({**dict(zip(self.arg_names, args)), **kwargs})
--> 338 return self.fn.run(*args, **kwargs)
File ~/.local/lib/python3.10/site-packages/triton/runtime/autotuner.py:338, in Heuristics.run(self, *args, **kwargs)
336 for v, heur in self.values.items():
337 kwargs[v] = heur({**dict(zip(self.arg_names, args)), **kwargs})
--> 338 return self.fn.run(*args, **kwargs)
File ~/.local/lib/python3.10/site-packages/triton/runtime/autotuner.py:338, in Heuristics.run(self, *args, **kwargs)
336 for v, heur in self.values.items():
337 kwargs[v] = heur({**dict(zip(self.arg_names, args)), **kwargs})
--> 338 return self.fn.run(*args, **kwargs)
File ~/.local/lib/python3.10/site-packages/triton/runtime/jit.py:662, in JITFunction.run(self, grid, warmup, *args, **kwargs)
660 # compile the kernel
661 src = self.ASTSource(self, signature, constants, configs[0])
--> 662 kernel = self.compile(
663 src,
664 target=target,
665 options=options.__dict__,
666 )
667 self.cache[device][key] = kernel
669 # Check that used global values have not changed.
File ~/.local/lib/python3.10/site-packages/triton/compiler/compiler.py:282, in compile(src, target, options)
280 use_ttgir_loc = os.environ.get("USE_TTGIR_LOC", "0") == "1"
281 for ext, compile_ir in list(stages.items())[first_stage:]:
--> 282 next_module = compile_ir(module, metadata)
283 ir_filename = f"{src.name}.{ext}"
284 metadata_group[ir_filename] = fn_cache_manager.put(next_module, ir_filename)
File ~/.local/lib/python3.10/site-packages/triton/backends/nvidia/compiler.py:320, in CUDABackend.add_stages.<locals>.<lambda>(src, metadata)
318 stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options, self.capability)
319 stages["ptx"] = lambda src, metadata: self.make_ptx(src, metadata, options, self.capability)
--> 320 stages["cubin"] = lambda src, metadata: self.make_cubin(src, metadata, options, self.capability)
File ~/.local/lib/python3.10/site-packages/triton/backends/nvidia/compiler.py:297, in CUDABackend.make_cubin(src, metadata, opt, capability)
295 log = log_file.read()
296 if e.returncode == 255:
--> 297 raise RuntimeError(f'Internal Triton PTX codegen error: \n{log}')
298 elif e.returncode == 128 + signal.SIGSEGV:
299 raise RuntimeError(
300 f'Please run `ptxas {fsrc.name}` to confirm that this is a bug in `ptxas`\n{log}')
RuntimeError: Internal Triton PTX codegen error:
ptxas /tmp/tmpxqf2tngx.ptx, line 579; error : Feature '.bf16' requires .target sm_80 or higher
ptxas /tmp/tmpxqf2tngx.ptx, line 579; error : Feature 'cvt with .f32.bf16' requires .target sm_80 or higher
ptxas /tmp/tmpxqf2tngx.ptx, line 582; error : Feature '.bf16' requires .target sm_80 or higher
ptxas /tmp/tmpxqf2tngx.ptx, line 582; error : Feature 'cvt with .f32.bf16' requires .target sm_80 or higher
ptxas /tmp/tmpxqf2tngx.ptx, line 585; error : Feature '.bf16' requires ......
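For reference, the ptxas errors above say that bf16 instructions require sm_80 or newer, while the P40 is targeted as sm_61. A quick way to confirm the local GPU's compute capability and bf16 support with plain PyTorch calls (nothing specific to mistral_inference):

```python
import torch

# Tesla P40 reports compute capability (6, 1), i.e. sm_61;
# bf16 kernels need sm_80 (Ampere) or newer.
major, minor = torch.cuda.get_device_capability()
print(f"compute capability: sm_{major}{minor}")
print(f"bf16 supported: {torch.cuda.is_bf16_supported()}")
```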
### Suggested Solutions
_No response_
### Python -VV
### Pip Freeze
### Reproduction Steps
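A minimal reproduction, matching the code in Cell In[1] of the traceback above (the model path is just the local directory the Mamba-Codestral weights were downloaded to):

```python
import torch
from mistral_inference.main import *

# Fails during Triton kernel compilation on the Tesla P40 (sm_61).
demo('models/Mamba-Codestral-7B-v0.1')
```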
### Expected Behavior
Just the generated output answer.
### Additional Context
Same setup as described at the top of this report: torch 2.4.0, CUDA 12.4, Nvidia Tesla P40. As far as I understand, Triton has a problem with bf16 on the P40, although I was previously able to load Llama in bf16 on this GPU.
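For illustration only, a minimal sketch of the kind of dtype check and cast that should avoid bf16 kernels on a pre-sm_80 GPU. The `model` below is a placeholder `nn.Module`, not the actual mistral_inference/mamba_ssm loading path, which may expose its own dtype option:

```python
import torch

# Pick a dtype the GPU can actually compile kernels for:
# bf16 needs sm_80+, so fall back to fp32 (or fp16) on the P40 (sm_61).
major, _ = torch.cuda.get_device_capability()
dtype = torch.bfloat16 if major >= 8 else torch.float32

# Placeholder module standing in for the real model object;
# casting with .to(dtype=...) is standard torch and works for any nn.Module.
model = torch.nn.Linear(16, 16, device="cuda")
model = model.to(dtype=dtype)
```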