34 changes: 34 additions & 0 deletions comms/ctran/algos/tests/CtranAlgoDevUT.cc
@@ -309,6 +309,40 @@ TYPED_TEST(CtranAlgoDevTypedTest, localReduceSumUnaligned) {
}
}

// Exercises localReduce at counts where the tail's "designated CTA"
// (under copyUnroll-style per-block ownership = `(limitCount/numPerBlock)
// % nGroups`) is NOT block 0. Existing localReduceSumUnaligned uses
// count=1041 with blockDim=640 and gridDim=2, where for every supported
// dtype the designated CTA collapses to 0; this test covers the case
// where the tail is owned by a non-zero block.
TYPED_TEST(CtranAlgoDevTypedTest, localReduceSumUnalignedNonZeroDesignatedCta) {
std::vector<size_t> testnSrcs{2, 8};
std::vector<size_t> testnDsts{1, 2};
constexpr uint numThreadBlocks = 4;
// For T=float (sizeofT=4), blockDim=640: numPerBlock = 640 * (16/4) * 4
// = 10240. count=10241 => limitCount=10240, tail=1, designated=1.
// count=20481 => designated=2. count=30721 => designated=3. count=40961
// => designated=0 (wrap). For other dtypes the designated index shifts
// proportionally; in every case the tail is non-empty and at least one
// count puts it on a non-zero block.
std::vector<size_t> testCounts{10241, 20481, 30721, 40961};

for (auto nsrcs : testnSrcs) {
for (auto ndsts : testnDsts) {
for (auto count : testCounts) {
localReduceTest<TypeParam>(
nsrcs,
ndsts,
count,
commSum,
/*nranks=*/1,
/*subsetThreadBlocks=*/false,
numThreadBlocks);
}
}
}
}

TYPED_TEST(CtranAlgoDevTypedTest, localReduceProd) {
// test reduction kernel with 2 or 8 srcs, and 1 or 2 dsts, which are common
// cases for inter-node and intra-node collectives.
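For reference, the designated-CTA arithmetic in the comment above can be checked with a small host-side sketch. This is an illustration only, assuming the formula quoted in the comment (numPerBlock = blockDim * (16 / sizeof(T)) * unroll with unroll = 4); it does not pull the real constants from the production DevCommon.cuh.

#include <cstddef>
#include <cstdio>

int main() {
  // Mirrors the test comment: T = float (sizeofT = 4), blockDim = 640,
  // unroll = 4, nGroups = numThreadBlocks = 4.
  const size_t numPerBlock = 640 * (16 / 4) * 4; // 10240
  const size_t nGroups = 4;
  const size_t counts[] = {10241, 20481, 30721, 40961};
  for (size_t count : counts) {
    const size_t limitCount = count - count % numPerBlock;
    const size_t designated = (limitCount / numPerBlock) % nGroups;
    // Prints designated = 1, 2, 3, 0 for the four counts, matching the
    // comment: three non-zero designated CTAs plus one wrap back to 0.
    std::printf("count=%zu tail=%zu designatedCta=%zu\n",
                count, count - limitCount, designated);
  }
  return 0;
}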
123 changes: 123 additions & 0 deletions comms/ctran/algos/tests/LocalReduceTailRaceUT.cc
@@ -0,0 +1,123 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

// GPU reproducer for the per-CTA byte-ownership invariant violation
// between `copyUnroll<4, T>` (`fbcode/comms/ctran/algos/DevCommon.cuh`)
// and `localReduceVectorized` (`fbcode/comms/ctran/algos/localReduce.cuh`).
//
// D69774173 gave `copyUnroll`'s tail a single
// designated CTA so per-byte ownership is stable across calls on the
// same buffer. The same fix was never applied to `localReduce.cuh`.
//
// In ring AllReduce the bug surfaces at the RS→AG transition: RS-last-step
// writes `recvbuff` (and `tmpSendBuf`) via `localReduce` (line 121); AG
// steps touch the same buffers via `copyUnroll`-via-`ctranKernCopy*`
// (lines 127, 137, 268, 278). Inter-round sync is per-CTA only via
// `GpeKernelSync.completeFlag[blockIdx]`. When per-CTA ownership of two
// writers disagrees, a CTA in the next round can read or overwrite bytes
// a different CTA in the prior round hasn't yet committed.

#include <cuda_runtime.h>
#include <gtest/gtest.h>
#include <cstddef>
#include <cstdint>
#include <new>
#include <vector>
#include "comms/ctran/algos/common/GpeKernelSync.h"
#include "comms/ctran/algos/tests/LocalReduceTailRaceUTKernels.cuh"
#include "comms/testinfra/TestXPlatUtils.h"

class LocalReduceTailRaceTest : public ::testing::Test {
protected:
void SetUp() override {
CUDACHECK_TEST(cudaSetDevice(0));
}
};

TEST_F(
LocalReduceTailRaceTest,
DISABLED_DelayedBlockZeroExposesBlockOneStaleRead) {
// count=2200, blockDim=128, gridDim=8, 4-byte T (int32_t here):
// - copyUnroll-tail designated CTA = (2048/2048) % 8 = 1, so block 1
// reads ALL of [2048, 2200) in phase 2.
// - Pre-fix localReduceVectorized-tail: block 0 writes [2048, 2176),
// block 1 writes [2176, 2200) (grid-strided over linearThreadId).
// - Post-fix: block 1 writes [2048, 2200) (single-CTA tail mirrors
// copyUnroll's predicate).
constexpr size_t count = 2200;
constexpr int gridDim = 8;
constexpr int blockDim = 128;
// 50 ms is well above any plausible time block 1 needs to reach phase 2.
constexpr unsigned long long block0DelayNs = 50ULL * 1000ULL * 1000ULL;
using T = int32_t;
const size_t bytes = count * sizeof(T);

std::vector<T> srcHost(count);
for (size_t i = 0; i < count; ++i) {
srcHost[i] = static_cast<T>(i + 1);
}

T* srcDev{nullptr};
T* bufDev{nullptr};
T* outDev{nullptr};

CUDACHECK_TEST(cudaMalloc(&srcDev, bytes));
CUDACHECK_TEST(cudaMalloc(&bufDev, bytes));
CUDACHECK_TEST(cudaMalloc(&outDev, bytes));

// Allocate the REAL `ctran::algos::GpeKernelSync` in pinned host memory
// — same allocation pattern as production (and as D103255142). The
// kernel uses `GpeKernelSyncDev::complete` / `waitPost` (the production
// sync APIs) over `sync->completeFlag[]` / `sync->postFlag[]`. We
// pre-post step=1 for every worker BEFORE launch so the kernel's
// `waitPost` succeeds without a host poll loop — the only intentionally
// race-relevant interaction is the per-CTA release/acquire pair the
// kernel performs around the buf write/read.
void* syncPtr = nullptr;
CUDACHECK_TEST(cudaHostAlloc(
&syncPtr, sizeof(ctran::algos::GpeKernelSync), cudaHostAllocDefault));
auto* sync = new (syncPtr) ctran::algos::GpeKernelSync(gridDim);
sync->post(/*step=*/1);

CUDACHECK_TEST(
cudaMemcpy(srcDev, srcHost.data(), bytes, cudaMemcpyHostToDevice));
// Sentinel for `buf`. Stale phase-2 reads will return this value
// (broadcasts to T=int32 as 0xCDCDCDCD).
CUDACHECK_TEST(cudaMemset(bufDev, 0xCD, bytes));
CUDACHECK_TEST(cudaMemset(outDev, 0, bytes));

dim3 grid{gridDim, 1, 1};
dim3 block{blockDim, 1, 1};
size_t countLocal = count;
unsigned long long delayLocal = block0DelayNs;
void* args[] = {&bufDev, &outDev, &srcDev, &countLocal, &sync, &delayLocal};
CUDACHECK_TEST(cudaLaunchKernel(
reinterpret_cast<void*>(multiWriterTailRaceKernel<T>),
grid,
block,
args,
/*sharedMem=*/0,
/*stream=*/nullptr));
CUDACHECK_TEST(cudaDeviceSynchronize());

std::vector<T> outHost(count);
CUDACHECK_TEST(
cudaMemcpy(outHost.data(), outDev, bytes, cudaMemcpyDeviceToHost));

size_t firstStale = count;
for (size_t i = 0; i < count; ++i) {
if (outHost[i] != srcHost[i]) {
firstStale = i;
break;
}
}

EXPECT_EQ(firstStale, count)
<< "stale read at out[" << firstStale
<< "] = " << static_cast<uint32_t>(outHost[firstStale]) << " (expected "
<< static_cast<uint32_t>(srcHost[firstStale]) << ")"
<< " — block 1 read a slot block 0 had not yet written in phase 1";

CUDACHECK_TEST(cudaFree(srcDev));
CUDACHECK_TEST(cudaFree(bufDev));
CUDACHECK_TEST(cudaFree(outDev));
sync->~GpeKernelSync();
CUDACHECK_TEST(cudaFreeHost(syncPtr));
}
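To make the ownership mismatch concrete, here is an illustrative CUDA sketch of the two tail schemes contrasted in the comment at the top of this test. The names tailGridStrided and tailDesignatedCta are hypothetical, and the bodies are scalar simplifications of the vectorized production code in localReduce.cuh and DevCommon.cuh.

#include <cstddef>

// Pre-fix tail: grid-strided over the linear thread id. With count=2200,
// limitCount=2048, blockDim=128, block 0 covers [2048,2176) and block 1
// covers [2176,2200), so two CTAs share the tail bytes.
__device__ void tailGridStrided(int* dst, const int* src,
                                size_t limitCount, size_t count) {
  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  for (size_t i = limitCount + tid; i < count; i += blockDim.x * gridDim.x) {
    dst[i] = src[i];
  }
}

// Post-fix tail (copyUnroll's scheme): the single designated CTA owns the
// whole tail, so every writer agrees that block 1 owns all of [2048,2200).
__device__ void tailDesignatedCta(int* dst, const int* src, size_t numPerBlock,
                                  size_t limitCount, size_t count) {
  if (blockIdx.x != (limitCount / numPerBlock) % gridDim.x) {
    return;
  }
  for (size_t i = limitCount + threadIdx.x; i < count; i += blockDim.x) {
    dst[i] = src[i];
  }
}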
83 changes: 83 additions & 0 deletions comms/ctran/algos/tests/LocalReduceTailRaceUTKernels.cu
@@ -0,0 +1,83 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#include "comms/ctran/algos/DevCommon.cuh"
#include "comms/ctran/algos/common/GpeKernelSyncDev.cuh"
#include "comms/ctran/algos/localReduce.cuh"
#include "comms/ctran/algos/tests/LocalReduceTailRaceUTKernels.cuh"

namespace {

// Spin-sleep on block 0 only via a clock-cycle busy wait. Cross-platform
// (NVIDIA + AMD/HIP); `__nanosleep` is NVIDIA-only. Cycle-to-ns
// conversion is approximate (~2 GHz upper bound) which is fine since
// the goal is just "delay enough that block 1 reaches phase 2 first".
__device__ __forceinline__ void block0Delay(unsigned long long totalNs) {
if (blockIdx.x != 0 || threadIdx.x != 0) {
return;
}
constexpr unsigned long long kCyclesPerNs = 2;
const long long targetCycles = static_cast<long long>(totalNs * kCyclesPerNs);
const long long start = clock64();
while (clock64() - start < targetCycles) {
// busy wait
}
}

} // namespace

template <typename T>
__global__ void multiWriterTailRaceKernel(
T* buf,
T* out,
const T* src,
size_t count,
ctran::algos::GpeKernelSync* sync,
unsigned long long block0DelayNs) {
// Delay block 0 so other blocks reach phase 2 before block 0 finishes
// phase 1. The delay runs only on block 0's thread 0; the rest of
// block 0 waits at the barrier below. Other blocks pass through
// immediately and proceed to phase 1.
block0Delay(block0DelayNs);
__syncthreads();

// Phase 1: real `localReduceVectorized` writes `buf` from `src`. With
// NSrcs=1 + commSum the reduction is identity, so `buf` should equal
// `src` for every byte once all CTAs' writes have committed.
{
const T* srcs[1] = {src};
T* dsts[1] = {buf};
localReduceVectorized<T, commSum, 1, 1>(
srcs, dsts, count, blockIdx.x, gridDim.x);
}

// Per-CTA release + per-CTA acquire-on-own-flag, using the REAL
// production sync API. `complete` does `__syncthreads()` +
// `st.release.sys.global` on `sync->completeFlag[blockIdx.x]`.
// `waitPost` polls `sync->postFlag[blockIdx.x]` with `ld.acquire.sys.global`
// until it observes a value >= step. The host pre-posted step=1 for
// every worker before kernel launch, so `waitPost` returns immediately
// — but the acquire fence still fires. There is intentionally NO
// cross-CTA acquire here (no host `isComplete` between phases),
// matching the per-CTA-only sync pattern that opens the bug window.
ctran::algos::GpeKernelSyncDev::complete(sync, blockIdx.x, /*step=*/1);
ctran::algos::GpeKernelSyncDev::waitPost(sync, blockIdx.x, /*step=*/1);

// Phase 2: real `copyUnroll<4, T>` reads `buf` and writes `out`. Each
// CTA reads only the bytes it owns under copyUnroll's per-CTA
// partition. If localReduceVectorized's per-CTA partition assigned
// some of those bytes to a different CTA in phase 1 (the bug), this
// CTA's read may hit init sentinel because the writer CTA was still
// sleeping in `block0Delay` and hadn't issued its phase-1 writes yet.
copyUnroll<4, T>(out, buf, count, blockIdx.x, gridDim.x);
}

#define DECL_MULTI_WRITER_KERN(T) \
template __global__ void multiWriterTailRaceKernel<T>( \
T * buf, \
T * out, \
const T* src, \
size_t count, \
ctran::algos::GpeKernelSync* sync, \
unsigned long long block0DelayNs)

DECL_MULTI_WRITER_KERN(int32_t);
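The per-CTA release/acquire pair the kernel performs can be sketched with libcu++ system-scope atomics. This is a minimal analogue, assuming only the semantics the comment above attributes to GpeKernelSyncDev (a release store on completeFlag and an acquire poll on postFlag, i.e. the st.release.sys.global / ld.acquire.sys.global pair); MiniSync is a hypothetical stand-in sized for the test's gridDim of 8, not the production type.

#include <cuda/atomic>

struct MiniSync {
  cuda::atomic<int, cuda::thread_scope_system> completeFlag[8];
  cuda::atomic<int, cuda::thread_scope_system> postFlag[8];
};

// Release this CTA's completion: barrier so every thread's prior writes
// are issued, then a system-scope release store on the CTA's own flag.
__device__ void miniComplete(MiniSync* s, int blockId, int step) {
  __syncthreads();
  if (threadIdx.x == 0) {
    s->completeFlag[blockId].store(step, cuda::std::memory_order_release);
  }
}

// Acquire on this CTA's own post flag, then barrier so the acquired
// visibility reaches every thread in the CTA. There is no cross-CTA
// acquire anywhere in this pair; that per-CTA-only pattern is the bug
// window the test exploits.
__device__ void miniWaitPost(MiniSync* s, int blockId, int step) {
  if (threadIdx.x == 0) {
    while (s->postFlag[blockId].load(cuda::std::memory_order_acquire) < step) {
    }
  }
  __syncthreads();
}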
38 changes: 38 additions & 0 deletions comms/ctran/algos/tests/LocalReduceTailRaceUTKernels.cuh
@@ -0,0 +1,38 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#pragma once

#include <cstddef>
#include <cstdint>
#include "comms/ctran/algos/common/GpeKernelSync.h"

// Single-kernel multi-writer test that DELIBERATELY delays block 0 so
// block 1 reaches phase 2 before block 0 finishes phase 1. With the
// pre-fix tail in `localReduceVectorized`, block 0 is the writer for
// part of the byte range that block 1 reads in `copyUnroll`'s tail.
// Because block 0 hasn't issued its writes yet (it's sleeping), block 1
// reads the pre-init sentinel from L2 — `out` then carries that
// sentinel instead of `src`, and the test fails.
//
// With the fix (single-designated-CTA tail in localReduce matching
// copyUnroll's), block 1 owns its entire copyUnroll-tail range in BOTH
// writers, so block 1's phase 2 read returns block 1's own phase 1
// write — no dependency on block 0 — and the test passes.
//
// `sync` is the REAL `ctran::algos::GpeKernelSync` allocated via
// `cudaHostAlloc` and pre-posted by the host (post(1) for all workers
// before launch). The kernel uses `GpeKernelSyncDev::complete` for the
// release on `completeFlag` and `GpeKernelSyncDev::waitPost` for the
// acquire on `postFlag` — exactly the production sync code path
// (`comms/ctran/algos/common/GpeKernelSyncDev.cuh`). This means the
// test exercises the SAME visibility chain that ring AllReduce uses
// between rounds; if `GpeKernelSync`'s sync semantics ever change, the
// test moves with it.
template <typename T>
__global__ void multiWriterTailRaceKernel(
T* buf,
T* out,
const T* src,
size_t count,
ctran::algos::GpeKernelSync* sync,
unsigned long long block0DelayNs);