Skip to content

Commit 0d3bc3c

Browse files
committed
Fix TOCTOU Race Condition in registerDebouncedRun
1 parent 3bbee8c commit 0d3bc3c

File tree

2 files changed

+155
-12
lines changed

2 files changed

+155
-12
lines changed

internal-packages/run-engine/src/engine/systems/debounceSystem.ts

Lines changed: 53 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,22 @@ end
121121
return { 0, value }
122122
`,
123123
});
124+
125+
// Atomically sets runId only if current value equals expected pending claim.
126+
// This prevents the TOCTOU race condition where between GET (check claim) and SET (register),
127+
// another server could claim and register a different run, which would get overwritten.
128+
// Returns 1 if set succeeded, 0 if claim mismatch (lost the claim).
129+
this.redis.defineCommand("registerIfClaimOwned", {
130+
numberOfKeys: 1,
131+
lua: `
132+
local value = redis.call('GET', KEYS[1])
133+
if value == ARGV[1] then
134+
redis.call('SET', KEYS[1], ARGV[2], 'PX', ARGV[3])
135+
return 1
136+
end
137+
return 0
138+
`,
139+
});
124140
}
125141

126142
/**
@@ -593,32 +609,40 @@ return { 0, value }
593609
async (span) => {
594610
const redisKey = this.getDebounceRedisKey(environmentId, taskIdentifier, debounceKey);
595611

612+
// Calculate TTL: delay until + buffer
613+
const ttlMs = Math.max(
614+
delayUntil.getTime() - Date.now() + 60_000, // Add 1 minute buffer
615+
60_000
616+
);
617+
596618
if (claimId) {
597-
// Verify we still own the pending claim before overwriting
598-
const currentValue = await this.redis.get(redisKey);
599-
if (currentValue !== `pending:${claimId}`) {
619+
// Use atomic Lua script to verify claim and set runId in one operation.
620+
// This prevents the TOCTOU race where another server could claim and register
621+
// between our GET check and SET.
622+
const result = await this.redis.registerIfClaimOwned(
623+
redisKey,
624+
`pending:${claimId}`,
625+
runId,
626+
ttlMs.toString()
627+
);
628+
629+
if (result === 0) {
600630
// We lost the claim - another server took over or it expired
601631
this.$.logger.warn("registerDebouncedRun: lost claim, not registering", {
602632
runId,
603633
environmentId,
604634
taskIdentifier,
605635
debounceKey,
606636
claimId,
607-
currentValue,
608637
});
609638
span.setAttribute("claimLost", true);
610639
return false;
611640
}
641+
} else {
642+
// No claim to verify, just set directly
643+
await this.redis.set(redisKey, runId, "PX", ttlMs);
612644
}
613645

614-
// Calculate TTL: delay until + buffer
615-
const ttlMs = Math.max(
616-
delayUntil.getTime() - Date.now() + 60_000, // Add 1 minute buffer
617-
60_000
618-
);
619-
620-
await this.redis.set(redisKey, runId, "PX", ttlMs);
621-
622646
this.$.logger.debug("registerDebouncedRun: stored debounce key mapping", {
623647
runId,
624648
environmentId,
@@ -751,5 +775,22 @@ declare module "@internal/redis" {
751775
key: string,
752776
callback?: Callback<[number, string | null]>
753777
): Result<[number, string | null], Context>;
778+
779+
/**
780+
* Atomically sets runId only if current value equals expected pending claim.
781+
* Prevents TOCTOU race condition between claim verification and registration.
782+
* @param key - The Redis key
783+
* @param expectedClaim - Expected value "pending:{claimId}"
784+
* @param runId - The new value (run ID) to set
785+
* @param ttlMs - TTL in milliseconds
786+
* @returns 1 if set succeeded, 0 if claim mismatch
787+
*/
788+
registerIfClaimOwned(
789+
key: string,
790+
expectedClaim: string,
791+
runId: string,
792+
ttlMs: string,
793+
callback?: Callback<number>
794+
): Result<number, Context>;
754795
}
755796
}

internal-packages/run-engine/src/engine/tests/debounce.test.ts

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1939,5 +1939,107 @@ describe("RunEngine debounce", () => {
19391939
}
19401940
}
19411941
);
1942+
1943+
containerTest(
1944+
"registerDebouncedRun: atomic claim prevents overwrite when claim is lost",
1945+
async ({ prisma, redisOptions }) => {
1946+
// This test verifies the fix for the TOCTOU race condition in registerDebouncedRun.
1947+
// The race occurs when:
1948+
// 1. Server A claims debounce key with claimId-A
1949+
// 2. Server B claims same key with claimId-B (after A's claim expires)
1950+
// 3. Server B registers runId-B successfully
1951+
// 4. Server A attempts to register runId-A with stale claimId-A
1952+
// Without the fix, step 4 would overwrite runId-B. With the fix, it fails atomically.
1953+
1954+
const { createRedisClient } = await import("@internal/redis");
1955+
1956+
const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
1957+
1958+
const engine = new RunEngine({
1959+
prisma,
1960+
worker: {
1961+
redis: redisOptions,
1962+
workers: 1,
1963+
tasksPerWorker: 10,
1964+
pollIntervalMs: 100,
1965+
},
1966+
queue: {
1967+
redis: redisOptions,
1968+
},
1969+
runLock: {
1970+
redis: redisOptions,
1971+
},
1972+
machines: {
1973+
defaultMachine: "small-1x",
1974+
machines: {
1975+
"small-1x": {
1976+
name: "small-1x" as const,
1977+
cpu: 0.5,
1978+
memory: 0.5,
1979+
centsPerMs: 0.0001,
1980+
},
1981+
},
1982+
baseCostInCents: 0.0001,
1983+
},
1984+
debounce: {
1985+
maxDebounceDurationMs: 60_000,
1986+
},
1987+
tracer: trace.getTracer("test", "0.0.0"),
1988+
});
1989+
1990+
// Create a separate Redis client to simulate "another server" modifying keys directly
1991+
const simulatedServerRedis = createRedisClient({
1992+
...redisOptions,
1993+
keyPrefix: `${redisOptions.keyPrefix ?? ""}debounce:`,
1994+
});
1995+
1996+
try {
1997+
const taskIdentifier = "test-task";
1998+
const debounceKey = "race-test-key";
1999+
const environmentId = authenticatedEnvironment.id;
2000+
const delayUntil = new Date(Date.now() + 60_000);
2001+
2002+
await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier);
2003+
2004+
// Construct the Redis key (same format as DebounceSystem.getDebounceRedisKey)
2005+
const redisKey = `${environmentId}:${taskIdentifier}:${debounceKey}`;
2006+
2007+
// Step 1: Server A claims the key with claimId-A
2008+
const claimIdA = "claim-server-A";
2009+
await simulatedServerRedis.set(redisKey, `pending:${claimIdA}`, "PX", 60_000);
2010+
2011+
// Step 2 & 3: Simulate Server B claiming and registering (after A's claim "expires")
2012+
// In reality, this simulates the race where B's claim overwrites A's pending claim
2013+
const runIdB = "run_server_B";
2014+
await simulatedServerRedis.set(redisKey, runIdB, "PX", 60_000);
2015+
2016+
// Verify Server B's registration is in place
2017+
const valueAfterB = await simulatedServerRedis.get(redisKey);
2018+
expect(valueAfterB).toBe(runIdB);
2019+
2020+
// Step 4: Server A attempts to register with its stale claimId-A
2021+
// This should FAIL because the key no longer contains "pending:claim-server-A"
2022+
const runIdA = "run_server_A";
2023+
const registered = await engine.debounceSystem.registerDebouncedRun({
2024+
runId: runIdA,
2025+
environmentId,
2026+
taskIdentifier,
2027+
debounceKey,
2028+
delayUntil,
2029+
claimId: claimIdA, // Stale claim ID
2030+
});
2031+
2032+
// Step 5: Verify Server A's registration failed
2033+
expect(registered).toBe(false);
2034+
2035+
// Step 6: Verify Redis still contains runId-B (not overwritten by Server A)
2036+
const finalValue = await simulatedServerRedis.get(redisKey);
2037+
expect(finalValue).toBe(runIdB);
2038+
} finally {
2039+
await simulatedServerRedis.quit();
2040+
await engine.quit();
2041+
}
2042+
}
2043+
);
19422044
});
19432045

0 commit comments

Comments
 (0)