Skip to content

Commit 7e41820

Browse files
committed
feat(sandbox): add warm pool claim and scheduler
Introduce a warm pool system to reduce sandbox cold-start latency by pre-warming instances and claiming them on create when available. Add global and per-profile warm pool configuration, startup/shutdown lifecycle hooks, an in-process bounded warmup queue with workers, and a periodic scheduler to replenish and rotate warm instances. Update the sandbox creation flow to check idempotency first, attempt a warm claim before the normal create, and enqueue warmup work through the queue (with a background-task fallback when the queue is unavailable). Extend the sandbox model/manager with warm pool state and atomic claim logic, exclude warm pool instances from user listing and GC expiry/idle scans, and add unit tests for claim behavior, manager methods, and queue lifecycle/statistics.
1 parent c5e22e4 commit 7e41820

File tree

20 files changed

+2064
-35
lines changed

20 files changed

+2064
-35
lines changed

deploy/docker/config.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,18 @@ security:
5050
api_key: "CHANGE-ME"
5151
allow_anonymous: false
5252

53+
# Warm Pool — pre-start standby sandbox instances to reduce cold-start latency.
54+
# When a user creates a sandbox, Bay will first try to claim an available warm instance,
55+
# delivering near-instant startup instead of waiting for container boot.
56+
warm_pool:
57+
enabled: true
58+
warmup_queue_workers: 2 # Concurrent warmup workers
59+
warmup_queue_max_size: 256 # Maximum queue depth
60+
warmup_queue_drop_policy: "drop_newest"
61+
warmup_queue_drop_alert_threshold: 50
62+
interval_seconds: 30 # Pool maintenance scan interval
63+
run_on_startup: true
64+
5365
profiles:
5466
# ── Standard Python sandbox ────────────────────────
5567
- id: python-default
@@ -65,6 +77,7 @@ profiles:
6577
- shell
6678
- python
6779
idle_timeout: 1800 # 30 minutes
80+
warm_pool_size: 1 # Keep 1 pre-warmed instance ready
6881
env: {}
6982

7083
# ── Data Science sandbox (more resources) ──────────
@@ -81,6 +94,7 @@ profiles:
8194
- shell
8295
- python
8396
idle_timeout: 1800
97+
warm_pool_size: 1
8498
env: {}
8599

86100
# ── Browser + Python multi-container sandbox ───────
@@ -114,6 +128,7 @@ profiles:
114128
- browser
115129
env: {}
116130
idle_timeout: 1800
131+
warm_pool_size: 1
117132

118133
gc:
119134
# Enable automatic GC for production

pkgs/bay/app/api/v1/sandboxes.py

Lines changed: 74 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from datetime import datetime
1010

1111
import structlog
12-
from fastapi import APIRouter, BackgroundTasks, Header, Query
12+
from fastapi import APIRouter, BackgroundTasks, Header, Query, Request
1313
from fastapi.responses import JSONResponse
1414
from pydantic import BaseModel
1515

@@ -332,12 +332,13 @@ async def create_sandbox(
332332
- Lazy session creation: status may be 'idle' initially
333333
- ttl=null or ttl=0 means no expiry
334334
- Supports Idempotency-Key header for safe retries
335+
- Prioritizes claiming a warm pool sandbox if available (§6.1)
335336
"""
336337
# Serialize request body for fingerprinting
337338
request_body = request.model_dump_json()
338339
request_path = "/v1/sandboxes"
339340

340-
# 1. Check idempotency key if provided
341+
# 1. Check idempotency key if provided (must be BEFORE claim, §6.1 step 2)
341342
if idempotency_key:
342343
cached = await idempotency_svc.check(
343344
owner=owner,
@@ -348,12 +349,50 @@ async def create_sandbox(
348349
)
349350
if cached:
350351
# Return cached response with original status code
352+
# Do NOT trigger claim/warmup side effects (§11.1)
351353
return JSONResponse(
352354
content=cached.response,
353355
status_code=cached.status_code,
354356
)
355357

356-
# 2. Create sandbox
358+
# 2. Try to claim a warm sandbox (§6.1 step 3)
359+
# Skip claim when user specifies a cargo_id (warm sandbox has its own cargo)
360+
sandbox = None
361+
if request.cargo_id is None:
362+
sandbox = await sandbox_mgr.claim_warm_sandbox(
363+
owner=owner,
364+
profile_id=request.profile,
365+
ttl=request.ttl,
366+
)
367+
368+
if sandbox is not None:
369+
# Claim succeeded - return immediately (already warm/running)
370+
_log.info(
371+
"sandbox.create.warm_claim_hit",
372+
sandbox_id=sandbox.id,
373+
profile=request.profile,
374+
)
375+
response = _sandbox_to_response(sandbox)
376+
377+
# Save idempotency key if provided
378+
if idempotency_key:
379+
await idempotency_svc.save(
380+
owner=owner,
381+
key=idempotency_key,
382+
path=request_path,
383+
method="POST",
384+
body=request_body,
385+
response=response,
386+
status_code=201,
387+
)
388+
389+
return response
390+
391+
# 3. Claim miss - fall back to normal create
392+
_log.debug(
393+
"sandbox.create.warm_claim_miss",
394+
profile=request.profile,
395+
)
357396
sandbox = await sandbox_mgr.create(
358397
owner=owner,
359398
profile_id=request.profile,
@@ -362,7 +401,7 @@ async def create_sandbox(
362401
)
363402
response = _sandbox_to_response(sandbox)
364403

365-
# 3. Save idempotency key if provided
404+
# 4. Save idempotency key if provided
366405
if idempotency_key:
367406
await idempotency_svc.save(
368407
owner=owner,
@@ -374,13 +413,19 @@ async def create_sandbox(
374413
status_code=201,
375414
)
376415

377-
# 4. Enqueue warmup hook. The hook itself detaches actual warmup work,
378-
# so this does not block request completion on keep-alive connections.
379-
background_tasks.add_task(
380-
_warmup_sandbox_runtime,
381-
sandbox_id=sandbox.id,
382-
owner=owner,
383-
)
416+
# 5. Enqueue warmup via queue (§2.5.1: only enqueue, never execute directly)
417+
from app.services.warm_pool.lifecycle import get_warmup_queue
418+
419+
warmup_queue = get_warmup_queue()
420+
if warmup_queue is not None and warmup_queue.is_running:
421+
warmup_queue.enqueue(sandbox_id=sandbox.id, owner=owner)
422+
else:
423+
# Fallback: if queue not available, use background task
424+
background_tasks.add_task(
425+
_warmup_sandbox_runtime,
426+
sandbox_id=sandbox.id,
427+
owner=owner,
428+
)
384429

385430
return response
386431

@@ -537,14 +582,29 @@ async def stop_sandbox(
537582
@router.delete("/{sandbox_id}", status_code=204)
538583
async def delete_sandbox(
539584
sandbox_id: str,
585+
request: Request,
540586
sandbox_mgr: SandboxManagerDep,
541587
owner: AuthDep,
542588
) -> None:
543-
"""Delete sandbox permanently.
589+
"""Delete sandbox permanently (idempotent).
544590
545591
- Destroys all running sessions
546592
- Cascade deletes managed cargo
547593
- Does NOT cascade delete external cargo
594+
- If sandbox already soft-deleted, returns 204 (idempotent)
548595
"""
549-
sandbox = await sandbox_mgr.get(sandbox_id, owner)
550-
await sandbox_mgr.delete(sandbox)
596+
request_id = getattr(request.state, "request_id", None)
597+
_log.info(
598+
"sandbox.delete.request",
599+
sandbox_id=sandbox_id,
600+
owner=owner,
601+
request_id=request_id,
602+
delete_source="api.v1.sandboxes.delete",
603+
)
604+
await sandbox_mgr.delete_by_id(
605+
sandbox_id=sandbox_id,
606+
owner=owner,
607+
idempotent=True,
608+
delete_source="api.v1.sandboxes.delete",
609+
request_id=request_id,
610+
)

pkgs/bay/app/config.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,15 @@ class ProfileConfig(BaseModel):
180180
# ========== Shared configuration ==========
181181
idle_timeout: int = 1800 # 30 minutes
182182

183+
# ========== Warm pool configuration ==========
184+
warm_pool_size: int = 0 # Number of pre-warmed sandbox instances (0 = disabled)
185+
warm_rotate_ttl: int = 1800 # Seconds before a warm instance is rotated (>= 60)
186+
warm_claim_timeout_ms: int = 200 # Max time to attempt claim before fallback (100-3000)
187+
warmup_retry_max_attempts: int = 3 # Max retry on warmup failure (>= 0)
188+
warmup_retry_backoff_base_ms: int = 200 # Base backoff for warmup retry
189+
warmup_retry_backoff_max_ms: int = 5000 # Max backoff for warmup retry
190+
warmup_circuit_breaker_threshold: int = 10 # Consecutive failures before circuit break
191+
183192
def model_post_init(self, __context: Any) -> None:
184193
"""Normalize single-container format to multi-container format.
185194
@@ -346,6 +355,20 @@ class BrowserLearningConfig(BaseModel):
346355
error_rate_multiplier_threshold: float = 2.0
347356

348357

358+
class WarmPoolConfig(BaseModel):
359+
"""Warm pool global configuration."""
360+
361+
enabled: bool = True
362+
# Warmup queue settings (in-process bounded queue)
363+
warmup_queue_workers: int = 2 # Number of concurrent warmup workers (>= 1)
364+
warmup_queue_max_size: int = 256 # Maximum queue depth (>= 1)
365+
warmup_queue_drop_policy: Literal["drop_newest", "drop_oldest"] = "drop_newest"
366+
warmup_queue_drop_alert_threshold: int = 50 # Alert when drops exceed this count
367+
# Scheduler settings
368+
interval_seconds: int = 30 # Pool maintenance interval
369+
run_on_startup: bool = True # Whether to run pool maintenance on startup
370+
371+
349372
class SecurityConfig(BaseModel):
350373
"""Security configuration."""
351374

@@ -389,6 +412,7 @@ class Settings(BaseSettings):
389412
security: SecurityConfig = Field(default_factory=SecurityConfig)
390413
idempotency: IdempotencyConfig = Field(default_factory=IdempotencyConfig)
391414
gc: GCConfig = Field(default_factory=GCConfig)
415+
warm_pool: WarmPoolConfig = Field(default_factory=WarmPoolConfig)
392416
browser_learning: BrowserLearningConfig = Field(default_factory=BrowserLearningConfig)
393417
browser_auto_release_enabled: bool = True
394418

pkgs/bay/app/main.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
init_browser_learning_scheduler,
2020
shutdown_browser_learning_scheduler,
2121
)
22+
from app.services.warm_pool.lifecycle import init_warm_pool, shutdown_warm_pool
2223

2324
logger = structlog.get_logger()
2425

@@ -46,6 +47,9 @@ async def lifespan(app: FastAPI):
4647
await init_gc_scheduler()
4748
await init_browser_learning_scheduler()
4849

50+
# Initialize warm pool (queue + scheduler)
51+
await init_warm_pool()
52+
4953
yield
5054

5155
# Shutdown
@@ -55,6 +59,9 @@ async def lifespan(app: FastAPI):
5559
await shutdown_gc_scheduler()
5660
await shutdown_browser_learning_scheduler()
5761

62+
# Stop warm pool
63+
await shutdown_warm_pool()
64+
5865
# Close HTTP client
5966
await http_client_manager.shutdown()
6067

0 commit comments

Comments
 (0)