Multinomial #141

Merged
56 commits merged on Sep 2, 2024
Changes from all commits
Commits (56)
1575860
WIP: multinomial
tongxin Jul 28, 2024
4e4879e
add Ops & UT & Bench
Jul 29, 2024
00da77e
Merge branch 'master' of https://github.com/FlagOpen/FlagGems into op…
Jul 29, 2024
22e14de
add full zero ones Ops & UT & Bench
Jul 30, 2024
5063551
split normal op
Jul 30, 2024
55987a6
Adding multinomial.
tongxin Jul 30, 2024
0f6822a
fixed one off error in binary search
tongxin Jul 31, 2024
c894eb3
Added multinomial tests without replacement.
tongxin Aug 1, 2024
521dd05
PR comment
Aug 1, 2024
f39ae12
Merge branch 'master' of https://github.com/FlagOpen/FlagGems into op…
Aug 1, 2024
732edca
split test_special_ops
Aug 1, 2024
85e93b3
updated with_replacement tests
tongxin Aug 1, 2024
145ed76
add K-S test
Aug 1, 2024
273efc8
split special perf
Aug 1, 2024
f46da90
Update to a more reliable without-replacement test
tongxin Aug 1, 2024
4a5acdd
Exponential added. (#138)
tongxin Aug 1, 2024
2f725c4
table
Aug 2, 2024
f1b27d1
resolve conflict
Aug 2, 2024
9e878c4
Use int64 indexing when needed & fix argmax (#146)
iclementine Aug 2, 2024
371f108
test for op
Aug 2, 2024
6fcc8e7
test for op
Aug 2, 2024
a7feb4b
Added multinomial perf tests.
tongxin Aug 4, 2024
369aa82
Making libentry thread safe (#136)
tongxin Aug 5, 2024
423731d
add argparse
Aug 6, 2024
0da6086
fix desc
Aug 6, 2024
de4b098
fix num
Aug 6, 2024
980c9d7
Update test_specific_ops.py
Bowen12992 Aug 6, 2024
e77db4c
Merge pull request #151 from Bowen12992/test_for_op
Bowen12992 Aug 6, 2024
22654f1
split UT files
Aug 6, 2024
78bdaf2
Merge branch 'master' of https://github.com/FlagOpen/FlagGems into op…
Aug 6, 2024
0dde1a6
fix
Aug 6, 2024
34c7522
fix
Aug 6, 2024
7042de1
Merge pull request #139 from Bowen12992/op_dev
Bowen12992 Aug 6, 2024
dc9ce0c
Merge branch 'master' into multinomial
tongxin Aug 6, 2024
ce37522
resolved conflicts with master.
tongxin Aug 6, 2024
376344c
Move multinomial hypothesis tests to test_distribution_ops, resolve c…
tongxin Aug 7, 2024
db5dd1e
fixing multinomial, working in progress.
tongxin Aug 19, 2024
6c6a612
Multinomial passes tests.
tongxin Aug 19, 2024
3d40f7a
Enhance multinomial tests and benchmarks.
tongxin Aug 19, 2024
134bef1
resolve conflicts.
tongxin Aug 19, 2024
e25e04e
Merge branch 'master' into multinomial
StrongSpoon Aug 20, 2024
e7b2c5d
[bugfix] keepdim when samples one
StrongSpoon Aug 20, 2024
47e76ad
[bugfix] fix accu test
StrongSpoon Aug 21, 2024
55c84f7
fix anomaly behavior in fused_renorm_cumsum
tongxin Aug 25, 2024
aa66384
Polish multinomial tests.
tongxin Aug 26, 2024
650981d
remove garbage files.
tongxin Aug 26, 2024
71f1860
multinomial result casted to int64 for n_samples==1
tongxin Aug 26, 2024
82e3a4d
Merge with master
tongxin Aug 26, 2024
a8f9a47
bfloat16 added for multinomial, polish without replacement test.
tongxin Aug 28, 2024
3f6ec74
merged with master.
tongxin Aug 28, 2024
afd2c0d
Enable two-pass normed cumsum.
tongxin Aug 31, 2024
bfd62e5
cumsum updated
tongxin Aug 31, 2024
572853c
normed cumsum complete.
tongxin Sep 1, 2024
85a472e
Fixed multinomial binary search boundary bug
tongxin Sep 1, 2024
eafe33d
fix normed_cumsum bugs.
tongxin Sep 1, 2024
459bb61
quick fix dim check.
tongxin Sep 1, 2024
17 changes: 17 additions & 0 deletions benchmark/test_special_perf.py
@@ -98,6 +98,23 @@ def unique_kwargs(dtype, batch, size):
    bench.run()


def test_multinomial_with_replacement():
    def multinomial_args(dtype, batch, size):
        dist = torch.rand((batch, size), dtype=dtype, device="cuda")
        n_samples = 10000
        return (dist, n_samples, True)

    bench = Benchmark(
        op_name="multinomial",
        torch_op=torch.multinomial,
        arg_func=multinomial_args,
        dtypes=(torch.float16, torch.float32),
        batch=POINTWISE_BATCH,
        sizes=SIZES,
    )
    bench.run()


def test_perf_pad():
    def padding_kwargs(dtype, batch, size):
        input = torch.randn((batch, size), device="cuda", dtype=dtype)
1 change: 1 addition & 0 deletions src/flag_gems/__init__.py
@@ -82,6 +82,7 @@ def enable(lib=aten_lib):
lib.impl("mean.dim", mean_dim, "CUDA")
lib.impl("mm", mm, "CUDA")
lib.impl("mul.Tensor", mul, "CUDA")
lib.impl("multinomial", multinomial, "CUDA")
lib.impl("mv", mv, "CUDA")
lib.impl("ne.Tensor", ne, "CUDA")
lib.impl("ne.Scalar", ne_scalar, "CUDA")
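
For context, a minimal usage sketch (not part of the diff; the shapes, dtype, and sample count below are illustrative): once enable() has registered the op, torch.multinomial on CUDA tensors dispatches to the Triton implementation added in this PR.

import torch
import flag_gems

flag_gems.enable()
# Unnormalized row weights; multinomial normalizes them internally.
weights = torch.rand(4, 1024, device="cuda", dtype=torch.float32)
# Draw 100 samples per row, with replacement.
samples = torch.multinomial(weights, num_samples=100, replacement=True)
print(samples.shape)  # torch.Size([4, 100])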
5 changes: 4 additions & 1 deletion src/flag_gems/ops/__init__.py
@@ -16,7 +16,7 @@
from .clamp import clamp, clamp_tensor
from .cos import cos
from .cross_entropy_loss import cross_entropy_loss
from .cumsum import cumsum
from .cumsum import cumsum, normed_cumsum
from .div import div_mode, floor_divide, true_divide
from .dropout import native_dropout
from .embedding import embedding
@@ -48,6 +48,7 @@
from .minimum import minimum
from .mm import mm
from .mul import mul
from .multinomial import multinomial
from .mv import mv
from .ne import ne, ne_scalar
from .neg import neg
@@ -115,6 +116,7 @@
"cos",
"pad",
"cumsum",
"normed_cumsum",
"true_divide",
"div_mode",
"floor_divide",
@@ -153,6 +155,7 @@
"mean_dim",
"mm",
"mul",
"multinomial",
"maximum",
"minimum",
"rand",
257 changes: 257 additions & 0 deletions src/flag_gems/ops/cumsum.py
@@ -76,3 +76,260 @@ def cumsum(inp, dim=1, *, dtype=None):
    with torch.cuda.device(inp.device):
        cumsum_kernel[grid](inp, out, M, N, K)
    return out


@libentry()
@triton.jit(do_not_specialize=["K"])
def normed_cumsum_kernel(inp, out, K, BLOCK: tl.constexpr):
    row_start = tl.program_id(0) * K
    row_off = tl.arange(0, BLOCK)
    x = tl.load(inp + row_start + row_off, mask=row_off < K, other=0)
    if x.dtype.is_fp16():
        x = x.to(tl.float32)
    y_sum = tl.sum(x, 0)
    y = tl.cumsum(x, 0)
    y = y / y_sum
    tl.store(out + row_start + row_off, y, mask=row_off < K)


@libentry()
@triton.jit(
    do_not_specialize=[
        "r",
        "t",
        "R",
        "K",
        "r_stride",
        "out_r_stride",
    ]
)
def block_cumsum_kernel(
    inp,
    out,
    sums,
    r,
    t,
    R,
    K,
    r_stride,
    k_stride,
    out_r_stride,
    out_k_stride,
    OUTPUT_SUMS: tl.constexpr,
    NORMALIZE: tl.constexpr,
    HAS_OUT_LAYOUT: tl.constexpr,
    TILE: tl.constexpr,
):
    # One CTA processes a (r, t*tile) chunk
    # rows = [ grid.y * r, (grid.y + 1) * r )
    # cols = [ grid.x * t * tile, (grid.x + 1) * t * tile )
    gridx = tl.program_id(0).to(tl.int64)
    gridy = tl.program_id(1).to(tl.int64)
    n_chunks = tl.num_programs(0)

    for row in range(gridy * r, min((gridy + 1) * r, R)):
        curr_cumsum = tl.zeros((1,), tl.float32)
        row_offset = row * r_stride
        cols = gridx * t * TILE + tl.arange(0, TILE)
        for ti in range(0, t):
            cols_offset = cols * k_stride
            x = tl.load(inp + row_offset + cols_offset, mask=cols < K, other=0)
            if x.dtype.is_fp16() | x.dtype.is_bf16():
                x = x.to(tl.float32)
            tile_sum = tl.sum(x, 0)[None]
            tile_cumsum = tl.cumsum(x, 0) + curr_cumsum
            curr_cumsum += tile_sum
            if HAS_OUT_LAYOUT:
                cols_offset = cols * out_k_stride
                row_offset = row * out_r_stride
            tl.store(out + row_offset + cols_offset, tile_cumsum, mask=cols < K)
            if OUTPUT_SUMS:
                tl.store(sums + row * n_chunks + gridx[None], curr_cumsum)
            cols += TILE
        if NORMALIZE:
            cols = gridx * t * TILE + tl.arange(0, TILE)
            for _ in range(0, t):
                cols_offset = cols * k_stride
                if HAS_OUT_LAYOUT:
                    cols_offset = cols * out_k_stride
                    row_offset = row * out_r_stride
                x = tl.load(out + row_offset + cols_offset, mask=cols < K, other=0)
                if x.dtype.is_fp16() | x.dtype.is_bf16():
                    x = x.to(tl.float32)
                x = x / curr_cumsum
                tl.store(out + row_offset + cols_offset, x, mask=cols < K)
                cols += TILE


@libentry()
@triton.jit(
    do_not_specialize=[
        "r",
        "t",
        "R",
        "K",
        "r_stride",
        "out_r_stride",
    ]
)
def block_update_kernel(
    inp,
    base,
    rscale_ptr,
    out,
    r,
    t,
    R,
    K,
    r_stride,
    k_stride,
    out_r_stride,
    out_k_stride,
    rscale_stride,
    HAS_OUT_LAYOUT: tl.constexpr,
    TILE: tl.constexpr,
):
    # One CTA processes a (r, t*tile) chunk
    # rows = [ grid.y, grid.y + r )
    # cols = [ grid.x * t * tile, (grid.x + 1) * t * tile )
    gridx = tl.program_id(0).to(tl.int64)
    gridy = tl.program_id(1).to(tl.int64)
    n_gridx = tl.num_programs(1)

    base += gridy * n_gridx + gridx
    rscale_ptr += gridy * rscale_stride

    for row in range(gridy, min(gridy + r, R)):
        d = tl.load(base)
        rscale = tl.load(rscale_ptr)
        base += gridx
        rscale_ptr += rscale_stride
        row_offset = row * r_stride
        cols = gridx * t * TILE + tl.arange(0, TILE)
        for _ in range(0, t):
            cols_offset = cols * k_stride
            x = tl.load(inp + row_offset + cols_offset, mask=cols < K, other=0)
            x += d
            x /= rscale
            if HAS_OUT_LAYOUT:
                cols_offset = cols * out_k_stride
                row_offset = row * out_r_stride
            tl.store(out + row_offset + cols_offset, x, mask=cols < K)
            cols += TILE
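

# Worked example of the two-pass recombination: for a row x = [1, 2, 3, 4] split
# into chunks [1, 2] | [3, 4], block_cumsum_kernel produces per-chunk cumsums
# [1, 3] | [3, 7] and chunk sums sums = [3, 7]; a second scan over the chunk sums
# gives cumsums = [3, 10]. block_update_kernel then adds the per-chunk offset
# (cumsums - sums) = [0, 3] and divides by the row total rscale = 10, yielding the
# normed cumsum [0.1, 0.3, 0.6, 1.0].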


GRID_Y_LIMIT = 65535


def normed_cumsum(inp, dim=-1):
logging.debug("GEMS NORMED_CUMSUM")
assert inp.dtype in (torch.float16, torch.bfloat16, torch.float32, torch.float64)
dim = dim % inp.ndim
N = inp.numel()
K = inp.size(dim)
# inp = inp.contiguous()
# First and last dims are easier to handle, but transpose the middle dim to the last
ranked_dims = sorted(range(inp.ndim), key=lambda i: inp.stride(i), reverse=True)
is_mid_dim = dim not in (ranked_dims[0], ranked_dims[-1])
if is_mid_dim:
inp = inp.transpose(dim, -1).contiguous()
dim = -1
out = torch.empty_like(inp)
    with torch.cuda.device(inp.device.index):
        # Pass one: scan a (batch, n_tiles * TILE) sized block within each CTA.
        num_sms = torch.cuda.get_device_properties("cuda").multi_processor_count
        TILE = 2048
        # Each row is split into n_chunks chunks, where each chunk is comprised of
        # n_tiles tiles. Different chunks are assigned to different CTAs.
        n_rows = N // K
        n_chunks = min(triton.cdiv(num_sms, n_rows), triton.cdiv(K, TILE))
        n_tiles = triton.cdiv(triton.cdiv(K, TILE), n_chunks)
        k_stride = inp.stride(dim)
        r_stride = inp.size(dim) if k_stride == 1 else 1
        if n_rows > GRID_Y_LIMIT:
            batch = triton.cdiv(n_rows, GRID_Y_LIMIT)
            n_batch = triton.cdiv(n_rows, batch)
        else:
            batch = 1
            n_batch = n_rows

        grid = (n_chunks, n_batch)
        if n_chunks == 1:
            block_cumsum_kernel[grid](
                inp,
                out,
                0,
                batch,
                n_tiles,
                n_rows,
                K,
                r_stride,
                k_stride,
                r_stride,
                k_stride,
                OUTPUT_SUMS=False,
                NORMALIZE=True,
                HAS_OUT_LAYOUT=False,
                TILE=TILE,
            )
            return out

        # Accumulate partial sums in float32, except for float64 inputs.
        acc_dtype = torch.float32 if inp.dtype != torch.float64 else torch.float64
        sums = torch.empty((n_rows, n_chunks), dtype=acc_dtype, device="cuda")
        cumsums = torch.empty_like(sums)
        block_cumsum_kernel[grid](
            inp,
            out,
            sums,
            batch,
            n_tiles,
            n_rows,
            K,
            r_stride,
            k_stride,
            r_stride,
            k_stride,
            OUTPUT_SUMS=True,
            NORMALIZE=False,
            HAS_OUT_LAYOUT=False,
            TILE=TILE,
        )
        # Pass two, scan partial cumsums
        block_cumsum_kernel[(1, n_batch)](
            sums,
            cumsums,
            0,
            batch,
            1,
            n_rows,
            n_chunks,
            n_chunks,
            1,
            n_chunks,
            1,
            OUTPUT_SUMS=False,
            NORMALIZE=False,
            HAS_OUT_LAYOUT=True,
            TILE=TILE,
        )
        # print(sums)
        rscale = cumsums[..., -1]
        block_update_kernel[grid](
            out,
            cumsums - sums,
            rscale,
            out,
            batch,
            n_tiles,
            n_rows,
            K,
            r_stride,
            k_stride,
            r_stride,
            k_stride,
            n_chunks,
            HAS_OUT_LAYOUT=False,
            TILE=TILE,
        )
    return out
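
As a quick sanity sketch for normed_cumsum (not part of the diff; it assumes the op is importable from flag_gems.ops as the export list above suggests, and the shape, dtype, and tolerances are illustrative), the result should match a plain cumsum divided by the per-row total:

import torch
from flag_gems.ops import normed_cumsum

x = torch.rand(8, 4096, device="cuda", dtype=torch.float32)
# Reference: inclusive scan normalized by the row sum.
ref = x.cumsum(dim=-1) / x.sum(dim=-1, keepdim=True)
res = normed_cumsum(x, dim=-1)
torch.testing.assert_close(res, ref, rtol=1e-4, atol=1e-4)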