@@ -1939,5 +1939,107 @@ describe("RunEngine debounce", () => {
19391939 }
19401940 }
19411941 ) ;
1942+
1943+ containerTest (
1944+ "registerDebouncedRun: atomic claim prevents overwrite when claim is lost" ,
1945+ async ( { prisma, redisOptions } ) => {
1946+ // This test verifies the fix for the TOCTOU race condition in registerDebouncedRun.
1947+ // The race occurs when:
1948+ // 1. Server A claims debounce key with claimId-A
1949+ // 2. Server B claims same key with claimId-B (after A's claim expires)
1950+ // 3. Server B registers runId-B successfully
1951+ // 4. Server A attempts to register runId-A with stale claimId-A
1952+ // Without the fix, step 4 would overwrite runId-B. With the fix, it fails atomically.
1953+
1954+ const { createRedisClient } = await import ( "@internal/redis" ) ;
1955+
1956+ const authenticatedEnvironment = await setupAuthenticatedEnvironment ( prisma , "PRODUCTION" ) ;
1957+
1958+ const engine = new RunEngine ( {
1959+ prisma,
1960+ worker : {
1961+ redis : redisOptions ,
1962+ workers : 1 ,
1963+ tasksPerWorker : 10 ,
1964+ pollIntervalMs : 100 ,
1965+ } ,
1966+ queue : {
1967+ redis : redisOptions ,
1968+ } ,
1969+ runLock : {
1970+ redis : redisOptions ,
1971+ } ,
1972+ machines : {
1973+ defaultMachine : "small-1x" ,
1974+ machines : {
1975+ "small-1x" : {
1976+ name : "small-1x" as const ,
1977+ cpu : 0.5 ,
1978+ memory : 0.5 ,
1979+ centsPerMs : 0.0001 ,
1980+ } ,
1981+ } ,
1982+ baseCostInCents : 0.0001 ,
1983+ } ,
1984+ debounce : {
1985+ maxDebounceDurationMs : 60_000 ,
1986+ } ,
1987+ tracer : trace . getTracer ( "test" , "0.0.0" ) ,
1988+ } ) ;
1989+
1990+ // Create a separate Redis client to simulate "another server" modifying keys directly
1991+ const simulatedServerRedis = createRedisClient ( {
1992+ ...redisOptions ,
1993+ keyPrefix : `${ redisOptions . keyPrefix ?? "" } debounce:` ,
1994+ } ) ;
1995+
1996+ try {
1997+ const taskIdentifier = "test-task" ;
1998+ const debounceKey = "race-test-key" ;
1999+ const environmentId = authenticatedEnvironment . id ;
2000+ const delayUntil = new Date ( Date . now ( ) + 60_000 ) ;
2001+
2002+ await setupBackgroundWorker ( engine , authenticatedEnvironment , taskIdentifier ) ;
2003+
2004+ // Construct the Redis key (same format as DebounceSystem.getDebounceRedisKey)
2005+ const redisKey = `${ environmentId } :${ taskIdentifier } :${ debounceKey } ` ;
2006+
2007+ // Step 1: Server A claims the key with claimId-A
2008+ const claimIdA = "claim-server-A" ;
2009+ await simulatedServerRedis . set ( redisKey , `pending:${ claimIdA } ` , "PX" , 60_000 ) ;
2010+
2011+ // Step 2 & 3: Simulate Server B claiming and registering (after A's claim "expires")
2012+ // In reality, this simulates the race where B's claim overwrites A's pending claim
2013+ const runIdB = "run_server_B" ;
2014+ await simulatedServerRedis . set ( redisKey , runIdB , "PX" , 60_000 ) ;
2015+
2016+ // Verify Server B's registration is in place
2017+ const valueAfterB = await simulatedServerRedis . get ( redisKey ) ;
2018+ expect ( valueAfterB ) . toBe ( runIdB ) ;
2019+
2020+ // Step 4: Server A attempts to register with its stale claimId-A
2021+ // This should FAIL because the key no longer contains "pending:claim-server-A"
2022+ const runIdA = "run_server_A" ;
2023+ const registered = await engine . debounceSystem . registerDebouncedRun ( {
2024+ runId : runIdA ,
2025+ environmentId,
2026+ taskIdentifier,
2027+ debounceKey,
2028+ delayUntil,
2029+ claimId : claimIdA , // Stale claim ID
2030+ } ) ;
2031+
2032+ // Step 5: Verify Server A's registration failed
2033+ expect ( registered ) . toBe ( false ) ;
2034+
2035+ // Step 6: Verify Redis still contains runId-B (not overwritten by Server A)
2036+ const finalValue = await simulatedServerRedis . get ( redisKey ) ;
2037+ expect ( finalValue ) . toBe ( runIdB ) ;
2038+ } finally {
2039+ await simulatedServerRedis . quit ( ) ;
2040+ await engine . quit ( ) ;
2041+ }
2042+ }
2043+ ) ;
19422044} ) ;
19432045
0 commit comments