34 changes: 34 additions & 0 deletions comms/ctran/algos/tests/CtranAlgoDevUT.cc
@@ -309,6 +309,40 @@ TYPED_TEST(CtranAlgoDevTypedTest, localReduceSumUnaligned) {
}
}

// Exercises localReduce at counts where the tail's "designated CTA"
// (under copyUnroll-style per-block ownership = `(limitCount/numPerBlock)
// % nGroups`) is NOT block 0. Existing localReduceSumUnaligned uses
// count=1041 with blockDim=640 and gridDim=2, where for every supported
// dtype the designated CTA collapses to 0; this test covers the case
// where the tail is owned by a non-zero block.
TYPED_TEST(CtranAlgoDevTypedTest, localReduceSumUnalignedNonZeroDesignatedCta) {
std::vector<size_t> testnSrcs{2, 8};
std::vector<size_t> testnDsts{1, 2};
constexpr uint numThreadBlocks = 4;
// For T=float (sizeofT=4), blockDim=640: numPerBlock = 640 * (16/4) * 4
// = 10240. count=10241 => limitCount=10240, tail=1, designated=1.
// count=20481 => designated=2. count=30721 => designated=3. count=40961
// => designated=0 (wrap). For other dtypes the designated index shifts
// proportionally; in every case the tail is non-empty and at least one
// count puts it on a non-zero block.
std::vector<size_t> testCounts{10241, 20481, 30721, 40961};

for (auto nsrcs : testnSrcs) {
for (auto ndsts : testnDsts) {
for (auto count : testCounts) {
localReduceTest<TypeParam>(
nsrcs,
ndsts,
count,
commSum,
/*nranks=*/1,
/*subsetThreadBlocks=*/false,
numThreadBlocks);
}
}
}
}

TYPED_TEST(CtranAlgoDevTypedTest, localReduceProd) {
// test reduction kernel with 2 or 8 srcs, and 1 or 2 dsts, which are common
// cases for inter-node and intra-node collectives.
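For reference, the designated-CTA arithmetic in the comment above can be checked with a small host-side sketch. This is an illustration only, assuming the formula quoted in the comment (numPerBlock = blockDim * (16 / sizeof(T)) * unroll with unroll = 4); it does not pull the real constants from the production DevCommon.cuh.

#include <cstddef>
#include <cstdio>

int main() {
  // Mirrors the test comment: T = float (sizeofT = 4), blockDim = 640,
  // unroll = 4, nGroups = numThreadBlocks = 4.
  const size_t numPerBlock = 640 * (16 / 4) * 4; // 10240
  const size_t nGroups = 4;
  const size_t counts[] = {10241, 20481, 30721, 40961};
  for (size_t count : counts) {
    const size_t limitCount = count - count % numPerBlock;
    const size_t designated = (limitCount / numPerBlock) % nGroups;
    // Prints designated = 1, 2, 3, 0 for the four counts, matching the
    // comment: three non-zero designated CTAs plus one wrap back to 0.
    std::printf("count=%zu tail=%zu designatedCta=%zu\n",
                count, count - limitCount, designated);
  }
  return 0;
}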
123 changes: 123 additions & 0 deletions comms/ctran/algos/tests/LocalReduceTailRaceUT.cc
@@ -0,0 +1,123 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

// GPU reproducer for the per-CTA byte-ownership invariant violation
// between `copyUnroll<4, T>` (`fbcode/comms/ctran/algos/DevCommon.cuh`)
// and `localReduceVectorized` (`fbcode/comms/ctran/algos/localReduce.cuh`).
//
// D69774173 gave `copyUnroll`'s tail a single
// designated CTA so per-byte ownership is stable across calls on the
// same buffer. The same fix was never applied to `localReduce.cuh`.
//
// In ring AllReduce the bug surfaces at the RS→AG transition: RS-last-step
// writes `recvbuff` (and `tmpSendBuf`) via `localReduce` (line 121); AG
// steps touch the same buffers via `copyUnroll`-via-`ctranKernCopy*`
// (lines 127, 137, 268, 278). Inter-round sync is per-CTA only via
// `GpeKernelSync.completeFlag[blockIdx]`. When per-CTA ownership of two
// writers disagrees, a CTA in the next round can read or overwrite bytes
// a different CTA in the prior round hasn't yet committed.

#include <cuda_runtime.h>
#include <gtest/gtest.h>
#include <cstddef>
#include <cstdint>
#include <new>
#include <vector>
#include "comms/ctran/algos/common/GpeKernelSync.h"
#include "comms/ctran/algos/tests/LocalReduceTailRaceUTKernels.cuh"
#include "comms/testinfra/TestXPlatUtils.h"

class LocalReduceTailRaceTest : public ::testing::Test {
protected:
void SetUp() override {
CUDACHECK_TEST(cudaSetDevice(0));
}
};

TEST_F(
LocalReduceTailRaceTest,
DISABLED_DelayedBlockZeroExposesBlockOneStaleRead) {
// count=2200, blockDim=128, gridDim=8, 4-byte T (int32_t here):
// - copyUnroll-tail designated CTA = (2048/2048) % 8 = 1, so block 1
// reads ALL of [2048, 2200) in phase 2.
// - Pre-fix localReduceVectorized-tail: block 0 writes [2048, 2176),
// block 1 writes [2176, 2200) (grid-strided over linearThreadId).
// - Post-fix: block 1 writes [2048, 2200) (single-CTA tail mirrors
// copyUnroll's predicate).
constexpr size_t count = 2200;
constexpr int gridDim = 8;
constexpr int blockDim = 128;
// 50 ms is well above any plausible time block 1 needs to reach phase 2.
constexpr unsigned long long block0DelayNs = 50ULL * 1000ULL * 1000ULL;
using T = int32_t;
const size_t bytes = count * sizeof(T);

std::vector<T> srcHost(count);
for (size_t i = 0; i < count; ++i) {
srcHost[i] = static_cast<T>(i + 1);
}

T* srcDev{nullptr};
T* bufDev{nullptr};
T* outDev{nullptr};

CUDACHECK_TEST(cudaMalloc(&srcDev, bytes));
CUDACHECK_TEST(cudaMalloc(&bufDev, bytes));
CUDACHECK_TEST(cudaMalloc(&outDev, bytes));

// Allocate the REAL `ctran::algos::GpeKernelSync` in pinned host memory
// — same allocation pattern as production (and as D103255142). The
// kernel uses `GpeKernelSyncDev::complete` / `waitPost` (the production
// sync APIs) over `sync->completeFlag[]` / `sync->postFlag[]`. We
// pre-post step=1 for every worker BEFORE launch so the kernel's
// `waitPost` succeeds without a host poll loop — the only intentionally
// race-relevant interaction is the per-CTA release/acquire pair the
// kernel performs around the buf write/read.
void* syncPtr = nullptr;
CUDACHECK_TEST(cudaHostAlloc(
&syncPtr, sizeof(ctran::algos::GpeKernelSync), cudaHostAllocDefault));
auto* sync = new (syncPtr) ctran::algos::GpeKernelSync(gridDim);
sync->post(/*step=*/1);

CUDACHECK_TEST(
cudaMemcpy(srcDev, srcHost.data(), bytes, cudaMemcpyHostToDevice));
// Sentinel for `buf`. Stale phase-2 reads will return this value
// (broadcasts to T=int32 as 0xCDCDCDCD).
CUDACHECK_TEST(cudaMemset(bufDev, 0xCD, bytes));
CUDACHECK_TEST(cudaMemset(outDev, 0, bytes));

dim3 grid{gridDim, 1, 1};
dim3 block{blockDim, 1, 1};
size_t countLocal = count;
unsigned long long delayLocal = block0DelayNs;
void* args[] = {&bufDev, &outDev, &srcDev, &countLocal, &sync, &delayLocal};
CUDACHECK_TEST(cudaLaunchKernel(
reinterpret_cast<void*>(multiWriterTailRaceKernel<T>),
grid,
block,
args,
/*sharedMem=*/0,
/*stream=*/nullptr));
CUDACHECK_TEST(cudaDeviceSynchronize());

std::vector<T> outHost(count);
CUDACHECK_TEST(
cudaMemcpy(outHost.data(), outDev, bytes, cudaMemcpyDeviceToHost));

size_t firstStale = count;
for (size_t i = 0; i < count; ++i) {
if (outHost[i] != srcHost[i]) {
firstStale = i;
break;
}
}

EXPECT_EQ(firstStale, count)
<< "stale read at out[" << firstStale
<< "] = " << static_cast<uint32_t>(outHost[firstStale]) << " (expected "
<< static_cast<uint32_t>(srcHost[firstStale]) << ")"
<< " — block 1 read a slot block 0 had not yet written in phase 1";

CUDACHECK_TEST(cudaFree(srcDev));
CUDACHECK_TEST(cudaFree(bufDev));
CUDACHECK_TEST(cudaFree(outDev));
sync->~GpeKernelSync();
CUDACHECK_TEST(cudaFreeHost(syncPtr));
}
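To make the ownership mismatch concrete, here is an illustrative CUDA sketch of the two tail schemes contrasted in the comment at the top of this test. The names tailGridStrided and tailDesignatedCta are hypothetical, and the bodies are scalar simplifications of the vectorized production code in localReduce.cuh and DevCommon.cuh.

#include <cstddef>

// Pre-fix tail: grid-strided over the linear thread id. With count=2200,
// limitCount=2048, blockDim=128, block 0 covers [2048,2176) and block 1
// covers [2176,2200), so two CTAs share the tail bytes.
__device__ void tailGridStrided(int* dst, const int* src,
                                size_t limitCount, size_t count) {
  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  for (size_t i = limitCount + tid; i < count; i += blockDim.x * gridDim.x) {
    dst[i] = src[i];
  }
}

// Post-fix tail (copyUnroll's scheme): the single designated CTA owns the
// whole tail, so every writer agrees that block 1 owns all of [2048,2200).
__device__ void tailDesignatedCta(int* dst, const int* src, size_t numPerBlock,
                                  size_t limitCount, size_t count) {
  if (blockIdx.x != (limitCount / numPerBlock) % gridDim.x) {
    return;
  }
  for (size_t i = limitCount + threadIdx.x; i < count; i += blockDim.x) {
    dst[i] = src[i];
  }
}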
83 changes: 83 additions & 0 deletions comms/ctran/algos/tests/LocalReduceTailRaceUTKernels.cu
@@ -0,0 +1,83 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#include "comms/ctran/algos/DevCommon.cuh"
#include "comms/ctran/algos/common/GpeKernelSyncDev.cuh"
#include "comms/ctran/algos/localReduce.cuh"
#include "comms/ctran/algos/tests/LocalReduceTailRaceUTKernels.cuh"

namespace {

// Spin-sleep on block 0 only via a clock-cycle busy wait. Cross-platform
// (NVIDIA + AMD/HIP); `__nanosleep` is NVIDIA-only. Cycle-to-ns
// conversion is approximate (~2 GHz upper bound) which is fine since
// the goal is just "delay enough that block 1 reaches phase 2 first".
__device__ __forceinline__ void block0Delay(unsigned long long totalNs) {
if (blockIdx.x != 0 || threadIdx.x != 0) {
return;
}
constexpr unsigned long long kCyclesPerNs = 2;
const long long targetCycles = static_cast<long long>(totalNs * kCyclesPerNs);
const long long start = clock64();
while (clock64() - start < targetCycles) {
// busy wait
}
}

} // namespace

template <typename T>
__global__ void multiWriterTailRaceKernel(
T* buf,
T* out,
const T* src,
size_t count,
ctran::algos::GpeKernelSync* sync,
unsigned long long block0DelayNs) {
// Delay block 0 so other blocks reach phase 2 before block 0 finishes
// phase 1. The delay runs only on block 0's thread 0; the rest of
// block 0 waits at the barrier below. Other blocks pass through
// immediately and proceed to phase 1.
block0Delay(block0DelayNs);
__syncthreads();

// Phase 1: real `localReduceVectorized` writes `buf` from `src`. With
// NSrcs=1 + commSum the reduction is identity, so `buf` should equal
// `src` for every byte once all CTAs' writes have committed.
{
const T* srcs[1] = {src};
T* dsts[1] = {buf};
localReduceVectorized<T, commSum, 1, 1>(
srcs, dsts, count, blockIdx.x, gridDim.x);
}

// Per-CTA release + per-CTA acquire-on-own-flag, using the REAL
// production sync API. `complete` does `__syncthreads()` +
// `st.release.sys.global` on `sync->completeFlag[blockIdx.x]`.
// `waitPost` polls `sync->postFlag[blockIdx.x]` with `ld.acquire.sys.global`
// until it observes a value >= step. The host pre-posted step=1 for
// every worker before kernel launch, so `waitPost` returns immediately
// — but the acquire fence still fires. There is intentionally NO
// cross-CTA acquire here (no host `isComplete` between phases),
// matching the per-CTA-only sync pattern that opens the bug window.
ctran::algos::GpeKernelSyncDev::complete(sync, blockIdx.x, /*step=*/1);
ctran::algos::GpeKernelSyncDev::waitPost(sync, blockIdx.x, /*step=*/1);

// Phase 2: real `copyUnroll<4, T>` reads `buf` and writes `out`. Each
// CTA reads only the bytes it owns under copyUnroll's per-CTA
// partition. If localReduceVectorized's per-CTA partition assigned
// some of those bytes to a different CTA in phase 1 (the bug), this
// CTA's read may hit init sentinel because the writer CTA was still
// sleeping in `block0Delay` and hadn't issued its phase-1 writes yet.
copyUnroll<4, T>(out, buf, count, blockIdx.x, gridDim.x);
}

#define DECL_MULTI_WRITER_KERN(T) \
template __global__ void multiWriterTailRaceKernel<T>( \
T * buf, \
T * out, \
const T* src, \
size_t count, \
ctran::algos::GpeKernelSync* sync, \
unsigned long long block0DelayNs)

DECL_MULTI_WRITER_KERN(int32_t);
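The per-CTA release/acquire pair the kernel performs can be sketched with libcu++ system-scope atomics. This is a minimal analogue, assuming only the semantics the comment above attributes to GpeKernelSyncDev (a release store on completeFlag and an acquire poll on postFlag, i.e. the st.release.sys.global / ld.acquire.sys.global pair); MiniSync is a hypothetical stand-in sized for the test's gridDim of 8, not the production type.

#include <cuda/atomic>

struct MiniSync {
  cuda::atomic<int, cuda::thread_scope_system> completeFlag[8];
  cuda::atomic<int, cuda::thread_scope_system> postFlag[8];
};

// Release this CTA's completion: barrier so every thread's prior writes
// are issued, then a system-scope release store on the CTA's own flag.
__device__ void miniComplete(MiniSync* s, int blockId, int step) {
  __syncthreads();
  if (threadIdx.x == 0) {
    s->completeFlag[blockId].store(step, cuda::std::memory_order_release);
  }
}

// Acquire on this CTA's own post flag, then barrier so the acquired
// visibility reaches every thread in the CTA. There is no cross-CTA
// acquire anywhere in this pair; that per-CTA-only pattern is the bug
// window the test exploits.
__device__ void miniWaitPost(MiniSync* s, int blockId, int step) {
  if (threadIdx.x == 0) {
    while (s->postFlag[blockId].load(cuda::std::memory_order_acquire) < step) {
    }
  }
  __syncthreads();
}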
38 changes: 38 additions & 0 deletions comms/ctran/algos/tests/LocalReduceTailRaceUTKernels.cuh
@@ -0,0 +1,38 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#pragma once

#include <cstddef>
#include <cstdint>
#include "comms/ctran/algos/common/GpeKernelSync.h"

// Single-kernel multi-writer test that DELIBERATELY delays block 0 so
// block 1 reaches phase 2 before block 0 finishes phase 1. With the
// pre-fix tail in `localReduceVectorized`, block 0 is the writer for
// part of the byte range that block 1 reads in `copyUnroll`'s tail.
// Because block 0 hasn't issued its writes yet (it's sleeping), block 1
// reads the pre-init sentinel from L2 — `out` then carries that
// sentinel instead of `src`, and the test fails.
//
// With the fix (single-designated-CTA tail in localReduce matching
// copyUnroll's), block 1 owns its entire copyUnroll-tail range in BOTH
// writers, so block 1's phase 2 read returns block 1's own phase 1
// write — no dependency on block 0 — and the test passes.
//
// `sync` is the REAL `ctran::algos::GpeKernelSync` allocated via
// `cudaHostAlloc` and pre-posted by the host (post(1) for all workers
// before launch). The kernel uses `GpeKernelSyncDev::complete` for the
// release on `completeFlag` and `GpeKernelSyncDev::waitPost` for the
// acquire on `postFlag` — exactly the production sync code path
// (`comms/ctran/algos/common/GpeKernelSyncDev.cuh`). This means the
// test exercises the SAME visibility chain that ring AllReduce uses
// between rounds; if `GpeKernelSync`'s sync semantics ever change, the
// test moves with it.
template <typename T>
__global__ void multiWriterTailRaceKernel(
T* buf,
T* out,
const T* src,
size_t count,
ctran::algos::GpeKernelSync* sync,
unsigned long long block0DelayNs);