[Feature] Add TIR builtins for warp-level vote and block-level predicate sync

sepcnt · sepcnt · commit 475eea1f0ce8 · 2026-02-19T01:45:57.000+08:00
diff --git a/docs/programming_guides/instructions.md b/docs/programming_guides/instructions.md
@@ -139,9 +139,32 @@ Annotation helpers
 - `T.annotate_l2_hit_ratio(buf, ratio)`: Cache behavior hint.
 
 Synchronization helpers
+- `T.sync_threads([barrier_id, arrive_count])`: Block-wide barrier (`__syncthreads()`).
+- `T.sync_warp([mask])`: Warp-wide barrier (`__syncwarp([mask])`).
+- `T.sync_grid()`: Cooperative grid barrier (requires cooperative launch).
 - `T.pdl_trigger()`: Signal programmatic launch completion for the current kernel.
 - `T.pdl_sync()`: Wait until kernel dependencies are satisfied.
 
+Warp-vote / warp-ballot (CUDA ≥ 9 / HIP)
+- `T.any_sync(mask, predicate)` → `int32`: Non-zero if ANY lane in `mask` has a non-zero predicate (`__any_sync`).
+- `T.all_sync(mask, predicate)` → `int32`: Non-zero if ALL lanes in `mask` have a non-zero predicate (`__all_sync`).
+- `T.ballot_sync(mask, predicate)` → `uint32`: Bitmask of lanes in `mask` with non-zero predicate (`__ballot_sync`).
+- `T.ballot(predicate)` → `uint32`: Full-warp ballot (mask = `0xFFFFFFFF`); equivalent to `T.ballot_sync(0xFFFFFFFF, pred)`.
+- `T.activemask()` → `uint32`: Bitmask of lanes that are active at the call site; exited lanes and lanes diverged onto another path read as 0 (`__activemask`).
+
+Block-wide predicated sync
+- `T.syncthreads_count(predicate)` → `int32`: Sync all threads; return count with non-zero predicate (`__syncthreads_count`).
+- `T.syncthreads_and(predicate)` → `int32`: Sync; non-zero iff ALL threads have non-zero predicate (`__syncthreads_and`).
+- `T.syncthreads_or(predicate)` → `int32`: Sync; non-zero iff ANY thread has non-zero predicate (`__syncthreads_or`).
+
+Warp-shuffle (intra-warp data exchange)
+- `T.shfl_sync(mask, value, src_lane[, width])`: Each lane reads `value` from lane `src_lane`; this is a broadcast when `src_lane` is the same in every lane (`__shfl_sync`).
+- `T.shfl_xor(value, offset)`: XOR-swap across lanes (`__shfl_xor_sync`, full mask).
+- `T.shfl_down(value, offset)`: Shift down by `offset` lanes (`__shfl_down_sync`, full mask).
+- `T.shfl_up(value, offset)`: Shift up by `offset` lanes (`__shfl_up_sync`, full mask).
+
+> **Note on HIP:** `any_sync`/`all_sync` ignore the mask and call `__any`/`__all` directly. `ballot_sync`, `ballot`, and `activemask` internally call `__ballot` (which returns `uint64` on a 64-thread wavefront) and cast the result to `uint32`, so on a 64-thread wavefront the bits for lanes 32–63 are discarded. `syncthreads_count`, `syncthreads_and`, and `syncthreads_or` have identical signatures on both platforms.
+
 Atomics
 - `T.atomic_add(dst, value, memory_order=None, return_prev=False, use_tma=False)`.
 - `T.atomic_addx2(dst, value, return_prev=False)`; `T.atomic_addx4(...)`.
diff --git a/testing/python/language/test_tilelang_language_warp_vote.py b/testing/python/language/test_tilelang_language_warp_vote.py
@@ -0,0 +1,319 @@
+"""Tests for warp-vote / warp-ballot / block-sync-with-predicate intrinsics.
+
+Covered intrinsics
+------------------
+T.any_sync        – __any_sync  / __any  (HIP)
+T.all_sync        – __all_sync  / __all  (HIP)
+T.ballot_sync     – __ballot_sync / __ballot cast to uint32 (HIP)
+T.ballot          – ballot with full-warp mask / __ballot (HIP)
+T.activemask      – __activemask / __ballot(1) cast to uint32 (HIP)
+T.syncthreads_count – __syncthreads_count
+T.syncthreads_and   – __syncthreads_and
+T.syncthreads_or    – __syncthreads_or
+"""
+
+import tilelang
+import tilelang.language as T
+import torch
+import tilelang.testing
+
+
+# ---------------------------------------------------------------------------
+# any_sync
+# ---------------------------------------------------------------------------
+
+
+@tilelang.jit
+def kernel_any_sync():
+    """Only lane 0 passes a non-zero predicate; any_sync must be non-zero in every lane (A is unused)."""
+
+    @T.prim_func
+    def main(
+        A: T.Tensor((1,), "int32"),
+        B: T.Tensor((32,), "int32"),
+    ):
+        with T.Kernel(1, threads=32):
+            tx = T.get_thread_binding()
+            val = T.any_sync(0xFFFFFFFF, tx == 0)
+            B[tx] = val
+
+    return main
+
+
+@tilelang.testing.requires_cuda
+def test_any_sync():
+    a = torch.zeros((1,), device="cuda", dtype=torch.int32)
+    b = torch.zeros((32,), device="cuda", dtype=torch.int32)
+    kernel = kernel_any_sync()
+    src = kernel.get_kernel_source()
+    assert "__any_sync" in src or "__any" in src, f"Expected __any_sync/__any in source:\n{src}"
+    kernel(a, b)
+    # any lane (lane 0) has predicate==1 → result must be non-zero for all lanes
+    assert torch.all(b != 0), f"Expected all non-zero, got {b}"
+
+
+# ---------------------------------------------------------------------------
+# all_sync
+# ---------------------------------------------------------------------------
+
+
+@tilelang.jit
+def kernel_all_sync():
+    """All lanes always pass predicate 1 → all_sync should return non-zero."""
+
+    @T.prim_func
+    def main(
+        B: T.Tensor((32,), "int32"),
+    ):
+        with T.Kernel(1, threads=32):
+            tx = T.get_thread_binding()
+            val = T.all_sync(0xFFFFFFFF, 1)
+            B[tx] = val
+
+    return main
+
+
+@tilelang.testing.requires_cuda
+def test_all_sync():
+    b = torch.zeros((32,), device="cuda", dtype=torch.int32)
+    kernel = kernel_all_sync()
+    src = kernel.get_kernel_source()
+    assert "__all_sync" in src or "__all" in src, f"Expected __all_sync/__all in source:\n{src}"
+    kernel(b)
+    assert torch.all(b != 0), f"Expected all non-zero, got {b}"
+
+
+# ---------------------------------------------------------------------------
+# ballot_sync
+# ---------------------------------------------------------------------------
+
+
+@tilelang.jit
+def kernel_ballot_sync():
+    """Only lane 0 has a non-zero predicate → ballot bit 0 must be set."""
+
+    @T.prim_func
+    def main(
+        B: T.Tensor((32,), "int32"),
+    ):
+        with T.Kernel(1, threads=32):
+            tx = T.get_thread_binding()
+            mask = T.ballot_sync(0xFFFFFFFF, tx == 0)
+            B[tx] = T.cast(mask, "int32")
+
+    return main
+
+
+@tilelang.testing.requires_cuda
+def test_ballot_sync():
+    b = torch.zeros((32,), device="cuda", dtype=torch.int32)
+    kernel = kernel_ballot_sync()
+    src = kernel.get_kernel_source()
+    assert "__ballot_sync" in src or "__ballot" in src, f"Expected __ballot_sync/__ballot in source:\n{src}"
+    kernel(b)
+    # Only lane 0 votes true, so every lane must read exactly 0x1 (bit 0 set, all other bits clear).
+    assert torch.all(b == 1), f"Expected ballot 0x00000001 in all lanes, got {b}"
+
+
+# ---------------------------------------------------------------------------
+# ballot  (full-warp convenience wrapper)
+# ---------------------------------------------------------------------------
+
+
+@tilelang.jit
+def kernel_ballot():
+    """All lanes pass predicate 1 → all 32 bits in ballot must be set."""
+
+    @T.prim_func
+    def main(
+        B: T.Tensor((32,), "int32"),
+    ):
+        with T.Kernel(1, threads=32):
+            tx = T.get_thread_binding()
+            mask = T.ballot(1)
+            B[tx] = T.cast(mask, "int32")
+
+    return main
+
+
+@tilelang.testing.requires_cuda
+def test_ballot():
+    b = torch.zeros((32,), device="cuda", dtype=torch.int32)
+    kernel = kernel_ballot()
+    src = kernel.get_kernel_source()
+    assert "__ballot_sync" in src or "__ballot" in src, f"Expected __ballot_sync/__ballot in source:\n{src}"
+    kernel(b)
+    # All 32 lanes vote true, so every lane must read 0xFFFFFFFF. B is int32, so that
+    # bit pattern is always observed as -1 (it can never compare equal to +0xFFFFFFFF).
+    assert torch.all(b == -1), f"Expected 0xFFFFFFFF (-1 as int32) in all lanes, got {b}"
+
+
+# ---------------------------------------------------------------------------
+# activemask
+# ---------------------------------------------------------------------------
+
+
+@tilelang.jit
+def kernel_activemask():
+    """All 32 threads are active → activemask should equal 0xFFFFFFFF."""
+
+    @T.prim_func
+    def main(
+        B: T.Tensor((32,), "int32"),
+    ):
+        with T.Kernel(1, threads=32):
+            tx = T.get_thread_binding()
+            mask = T.activemask()
+            B[tx] = T.cast(mask, "int32")
+
+    return main
+
+
+@tilelang.testing.requires_cuda
+def test_activemask():
+    b = torch.zeros((32,), device="cuda", dtype=torch.int32)
+    kernel = kernel_activemask()
+    src = kernel.get_kernel_source()
+    assert "__activemask" in src or "__ballot" in src, f"Expected __activemask/__ballot in source:\n{src}"
+    kernel(b)
+    # All 32 lanes active → 0xFFFFFFFF; as int32 this is -1.
+    assert int(b[0]) == -1 or int(b[0]) == 0xFFFFFFFF, f"Expected 0xFFFFFFFF (-1 as int32), got {int(b[0])}"
+
+
+# ---------------------------------------------------------------------------
+# syncthreads_count
+# ---------------------------------------------------------------------------
+
+
+@tilelang.jit
+def kernel_syncthreads_count():
+    """Exactly half the threads (lanes 0–15) pass predicate 1."""
+
+    @T.prim_func
+    def main(
+        B: T.Tensor((32,), "int32"),
+    ):
+        with T.Kernel(1, threads=32):
+            tx = T.get_thread_binding()
+            cnt = T.syncthreads_count(tx < 16)
+            B[tx] = cnt
+
+    return main
+
+
+@tilelang.testing.requires_cuda
+def test_syncthreads_count():
+    b = torch.zeros((32,), device="cuda", dtype=torch.int32)
+    kernel = kernel_syncthreads_count()
+    src = kernel.get_kernel_source()
+    assert "__syncthreads_count" in src, f"Expected __syncthreads_count in source:\n{src}"
+    kernel(b)
+    assert torch.all(b == 16), f"Expected all 16, got {b}"
+
+
+# ---------------------------------------------------------------------------
+# syncthreads_and
+# ---------------------------------------------------------------------------
+
+
+@tilelang.jit
+def kernel_syncthreads_and_true():
+    """All threads pass predicate 1 → syncthreads_and returns non-zero."""
+
+    @T.prim_func
+    def main(
+        B: T.Tensor((32,), "int32"),
+    ):
+        with T.Kernel(1, threads=32):
+            tx = T.get_thread_binding()
+            result = T.syncthreads_and(1)
+            B[tx] = result
+
+    return main
+
+
+@tilelang.jit
+def kernel_syncthreads_and_false():
+    """Thread 0 passes predicate 0 → syncthreads_and returns 0."""
+
+    @T.prim_func
+    def main(
+        B: T.Tensor((32,), "int32"),
+    ):
+        with T.Kernel(1, threads=32):
+            tx = T.get_thread_binding()
+            result = T.syncthreads_and(tx != 0)
+            B[tx] = result
+
+    return main
+
+
+@tilelang.testing.requires_cuda
+def test_syncthreads_and():
+    b = torch.zeros((32,), device="cuda", dtype=torch.int32)
+    kernel = kernel_syncthreads_and_true()
+    src = kernel.get_kernel_source()
+    assert "__syncthreads_and" in src, f"Expected __syncthreads_and in source:\n{src}"
+    kernel(b)
+    assert torch.all(b != 0), f"Expected all non-zero, got {b}"
+
+    b2 = torch.zeros((32,), device="cuda", dtype=torch.int32)
+    kernel2 = kernel_syncthreads_and_false()
+    kernel2(b2)
+    assert torch.all(b2 == 0), f"Expected all 0, got {b2}"
+
+
+# ---------------------------------------------------------------------------
+# syncthreads_or
+# ---------------------------------------------------------------------------
+
+
+@tilelang.jit
+def kernel_syncthreads_or_true():
+    """At least one thread (lane 0) passes predicate 1 → syncthreads_or != 0."""
+
+    @T.prim_func
+    def main(
+        B: T.Tensor((32,), "int32"),
+    ):
+        with T.Kernel(1, threads=32):
+            tx = T.get_thread_binding()
+            result = T.syncthreads_or(tx == 0)
+            B[tx] = result
+
+    return main
+
+
+@tilelang.jit
+def kernel_syncthreads_or_false():
+    """No thread passes predicate 1 → syncthreads_or returns 0."""
+
+    @T.prim_func
+    def main(
+        B: T.Tensor((32,), "int32"),
+    ):
+        with T.Kernel(1, threads=32):
+            tx = T.get_thread_binding()
+            result = T.syncthreads_or(0)
+            B[tx] = result
+
+    return main
+
+
+@tilelang.testing.requires_cuda
+def test_syncthreads_or():
+    b = torch.zeros((32,), device="cuda", dtype=torch.int32)
+    kernel = kernel_syncthreads_or_true()
+    src = kernel.get_kernel_source()
+    assert "__syncthreads_or" in src, f"Expected __syncthreads_or in source:\n{src}"
+    kernel(b)
+    assert torch.all(b != 0), f"Expected all non-zero, got {b}"
+
+    b2 = torch.zeros((32,), device="cuda", dtype=torch.int32)
+    kernel2 = kernel_syncthreads_or_false()
+    kernel2(b2)
+    assert torch.all(b2 == 0), f"Expected all 0, got {b2}"
+
+
+if __name__ == "__main__":
+    tilelang.testing.main()
diff --git a/tilelang/language/__init__.py b/tilelang/language/__init__.py
@@ -100,6 +100,14 @@
 from .builtin import stg64 as stg64  # noqa: F401
 from .builtin import stg128 as stg128  # noqa: F401
 from .builtin import stg256 as stg256  # noqa: F401
+from .builtin import any_sync as any_sync  # noqa: F401
+from .builtin import all_sync as all_sync  # noqa: F401
+from .builtin import ballot_sync as ballot_sync  # noqa: F401
+from .builtin import ballot as ballot  # noqa: F401
+from .builtin import activemask as activemask  # noqa: F401
+from .builtin import syncthreads_count as syncthreads_count  # noqa: F401
+from .builtin import syncthreads_and as syncthreads_and  # noqa: F401
+from .builtin import syncthreads_or as syncthreads_or  # noqa: F401
 
 from .utils import index_to_coordinates  # noqa: F401
 
diff --git a/tilelang/language/builtin.py b/tilelang/language/builtin.py