When I used bitblas to tune a matmul operator on the A100, I encountered the following error:

InternalError: Check failed: func->buffer_map.size() == 0 (3 vs. 0) : This pass must be called after MakePackedAPI

Environment Configuration:
Ubuntu 22.04.2 LTS
bitblas: 2bd1dee
CUDA 12.1

The specific test script is as follows:
from bitblas.gpu.matmul_analysis import get_tensorized_func_and_tags
from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
from bitblas.base.arch import CUDA
from bitblas.base.utils import apply_and_build
import tvm
from tvm.script import tir as T
import bitblas
@tvm.script.ir_module
class FusedSingleOp:
    @T.prim_func(private=True)
    def dense1(lv11: T.Buffer((T.int64(1024), T.int64(1024)), "float16"), B: T.Buffer((T.int64(1024), T.int64(1024)), "float16"), T_matmul_NT: T.Buffer((T.int64(1024), T.int64(1024)), "float16")):
        T.func_attr({"layout_free_buffers": [1], "op_attrs": {"op_name": "nn.dense", "out_dtype": "float16", "units": None}, "op_pattern": 4, "tir.noalias": T.bool(True)})
        # with T.block("root"):
        for i0, i1, k in T.grid(T.int64(1024), T.int64(1024), T.int64(1024)):
            with T.block("T_matmul_NT"):
                v_i0, v_i1, v_k = T.axis.remap("SSR", [i0, i1, k])
                T.reads(lv11[v_i0, v_k], B[v_i1, v_k])
                T.writes(T_matmul_NT[v_i0, v_i1])
                with T.init():
                    T_matmul_NT[v_i0, v_i1] = T.float16(0)
                T_matmul_NT[v_i0, v_i1] = T_matmul_NT[v_i0, v_i1] + lv11[v_i0, v_k] * B[v_i1, v_k]
ir_module = FusedSingleOp
func = ir_module["dense1"]
target = tvm.target.Target("cuda")
arch = CUDA(target)
policy = DefaultPolicy(func=func, arch=arch)
try:
    tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target)
except Exception:
    tags = None

# Tune with Tensor Core if possible
if tags:
    policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags)

configs = policy.emit_config(topk=20)
cpresults, best = apply_and_build(func, configs, arch, parallel_build=False)
print(best.code)
When I tested the same script on the V100 with the same environment configuration, the error

InternalError: Check failed: func->buffer_map.size() == 0 (3 vs. 0) : This pass must be called after MakePackedAPI

did not occur. Instead, it resulted in:
@Cunxiao2002, thanks for reporting. Interesting bug; I can also reproduce it on my A100.
It seems to be related to @T.prim_func(private=True): if we replace @T.prim_func(private=True) with @T.prim_func (in which case private keeps its default value, False), the pipeline works.
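For reference, a minimal sketch of that variant, assuming nothing else in the kernel changes (the class name FusedSingleOpPublic is only for this sketch):

# Minimal sketch: the same dense1 kernel, declared with plain @T.prim_func so
# that `private` keeps its default value (False) and a global_symbol is kept.
import tvm
from tvm.script import tir as T

@tvm.script.ir_module
class FusedSingleOpPublic:
    @T.prim_func
    def dense1(lv11: T.Buffer((T.int64(1024), T.int64(1024)), "float16"),
               B: T.Buffer((T.int64(1024), T.int64(1024)), "float16"),
               T_matmul_NT: T.Buffer((T.int64(1024), T.int64(1024)), "float16")):
        T.func_attr({"layout_free_buffers": [1], "op_attrs": {"op_name": "nn.dense", "out_dtype": "float16", "units": None}, "op_pattern": 4, "tir.noalias": T.bool(True)})
        for i0, i1, k in T.grid(T.int64(1024), T.int64(1024), T.int64(1024)):
            with T.block("T_matmul_NT"):
                v_i0, v_i1, v_k = T.axis.remap("SSR", [i0, i1, k])
                T.reads(lv11[v_i0, v_k], B[v_i1, v_k])
                T.writes(T_matmul_NT[v_i0, v_i1])
                with T.init():
                    T_matmul_NT[v_i0, v_i1] = T.float16(0)
                T_matmul_NT[v_i0, v_i1] = T_matmul_NT[v_i0, v_i1] + lv11[v_i0, v_k] * B[v_i1, v_k]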
I think we should inspect the IR modules in those two cases before they reach the unsupported_dtype_legalize pass.
If you want to apply this to a single operator, please use specialized_function = func.with_attr("global_symbol", g_var.name_hint) to make the function non-private.
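As a concrete illustration, here is a rough sketch of that workaround applied to the script above. It reuses FusedSingleOp and the imports from the issue; obtaining g_var via IRModule.get_global_var is my assumption, not something prescribed by BitBLAS.

# Rough sketch of the suggested workaround: attach a global_symbol so the
# private PrimFunc becomes non-private before it reaches the tuning policy.
ir_module = FusedSingleOp
g_var = ir_module.get_global_var("dense1")   # GlobalVar of the kernel (assumed lookup)
func = ir_module[g_var]
specialized_function = func.with_attr("global_symbol", g_var.name_hint)

arch = CUDA(tvm.target.Target("cuda"))
policy = DefaultPolicy(func=specialized_function, arch=arch)
try:
    tensorized_func, tags = get_tensorized_func_and_tags(specialized_function, arch.target)
except Exception:
    tags = None
if tags:
    policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags)

configs = policy.emit_config(topk=20)
cpresults, best = apply_and_build(specialized_function, configs, arch, parallel_build=False)
print(best.code)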