vllm-project
diff --git a/‎.github/workflows/ci-changes.yml‎
Lines changed: 8 additions & 0 deletions b/‎.github/workflows/ci-changes.yml‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎.github/workflows/integration-test-k8s.yml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/integration-test-k8s.yml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎deploy/kubernetes/router-replay/postgres.yaml‎
Lines changed: 54 additions & 0 deletions b/‎deploy/kubernetes/router-replay/postgres.yaml‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎docs/agent/state-taxonomy-and-inventory.md‎
Lines changed: 1 addition & 2 deletions b/‎docs/agent/state-taxonomy-and-inventory.md‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎e2e/pkg/fixtures/http.go‎
Lines changed: 5 additions & 0 deletions b/‎e2e/pkg/fixtures/http.go‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎e2e/profiles/all/imports.go‎
Lines changed: 6 additions & 0 deletions b/‎e2e/profiles/all/imports.go‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎e2e/profiles/router-replay-postgres/profile.go‎
Lines changed: 67 additions & 0 deletions b/‎e2e/profiles/router-replay-postgres/profile.go‎
Lines changed: 67 additions & 0 deletions
diff --git a/‎e2e/profiles/router-replay-postgres/values.yaml‎
Lines changed: 111 additions & 0 deletions b/‎e2e/profiles/router-replay-postgres/values.yaml‎
Lines changed: 111 additions & 0 deletions
@@ -47,6 +47,8 @@ on:
         value: ${{ jobs.filter.outputs.e2e_response_api_redis }}
       e2e_response_api_redis_cluster:
         value: ${{ jobs.filter.outputs.e2e_response_api_redis_cluster }}
+      e2e_router_replay_postgres:
+        value: ${{ jobs.filter.outputs.e2e_router_replay_postgres }}
       e2e_ml_model_selection:
         value: ${{ jobs.filter.outputs.e2e_ml_model_selection }}
       e2e_multi_endpoint:
@@ -85,6 +87,7 @@ jobs:
       e2e_response_api: ${{ steps.changes.outputs.e2e_response_api }}
       e2e_response_api_redis: ${{ steps.changes.outputs.e2e_response_api_redis }}
       e2e_response_api_redis_cluster: ${{ steps.changes.outputs.e2e_response_api_redis_cluster }}
+      e2e_router_replay_postgres: ${{ steps.changes.outputs.e2e_router_replay_postgres }}
       e2e_ml_model_selection: ${{ steps.changes.outputs.e2e_ml_model_selection }}
       e2e_multi_endpoint: ${{ steps.changes.outputs.e2e_multi_endpoint }}
       e2e_authz_rbac: ${{ steps.changes.outputs.e2e_authz_rbac }}
@@ -206,6 +209,11 @@ jobs:
             e2e_response_api_redis_cluster:
               - 'e2e/profiles/response-api-redis-cluster/**'
               - 'deploy/kubernetes/response-api/redis-cluster.yaml'
+            e2e_router_replay_postgres:
+              - 'e2e/profiles/router-replay-postgres/**'
+              - 'deploy/kubernetes/router-replay/**'
+              - 'src/semantic-router/pkg/routerreplay/**'
+              - 'src/semantic-router/pkg/extproc/router_replay_setup.go'
             e2e_ml_model_selection:
               - 'e2e/profiles/ml-model-selection/**'
               - 'src/semantic-router/pkg/modelselection/**'
 
@@ -42,7 +42,7 @@ jobs:
              [[ "${{ needs.changes.outputs.agent_exec }}" == "true" ]] || \
              [[ "${{ github.event_name }}" == "schedule" ]] || \
              [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
-            echo 'profiles=["kubernetes", "dashboard"]' >> $GITHUB_OUTPUT
+            echo 'profiles=["kubernetes", "dashboard", "router-replay-postgres"]' >> $GITHUB_OUTPUT
             echo 'should_run=true' >> $GITHUB_OUTPUT
             echo "Running default baseline profiles due to common/core changes or push/schedule/manual trigger"
             exit 0
@@ -62,6 +62,7 @@ jobs:
           [[ "${{ needs.changes.outputs.e2e_multi_endpoint }}" == "true" ]] && profiles+=("multi-endpoint")
           [[ "${{ needs.changes.outputs.e2e_authz_rbac }}" == "true" ]] && profiles+=("authz-rbac")
           [[ "${{ needs.changes.outputs.e2e_streaming }}" == "true" ]] && profiles+=("streaming")
+          [[ "${{ needs.changes.outputs.e2e_router_replay_postgres }}" == "true" ]] && profiles+=("router-replay-postgres")
 
           # Convert to JSON array
           if [ ${#profiles[@]} -eq 0 ]; then
 
@@ -0,0 +1,54 @@
+# Single-replica Postgres used as the prerequisite manifest of the
+# router-replay-postgres e2e profile.
+# The database, user, and password below must stay in sync with
+# config.global.services.router_replay.postgres in
+# e2e/profiles/router-replay-postgres/values.yaml.
+# NOTE: credentials are inline literals and no volume is declared here, so this
+# manifest is only appropriate for throwaway test clusters.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: postgres
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: postgres
+  template:
+    metadata:
+      labels:
+        app: postgres
+    spec:
+      containers:
+      - name: postgres
+        image: postgres:16-alpine
+        imagePullPolicy: IfNotPresent
+        ports:
+        - containerPort: 5432
+          name: postgres
+          protocol: TCP
+        env:
+        - name: POSTGRES_DB
+          value: vsr
+        - name: POSTGRES_USER
+          value: router
+        - name: POSTGRES_PASSWORD
+          value: router-secret
+        # The pod is marked ready once pg_isready succeeds for the configured
+        # user and database.
+        readinessProbe:
+          exec:
+            command:
+            - pg_isready
+            - -U
+            - router
+            - -d
+            - vsr
+          initialDelaySeconds: 3
+          periodSeconds: 3
+---
+# Service in front of the postgres Deployment (selects pods labelled
+# app=postgres). With name "postgres" in namespace "default" it is reachable as
+# postgres.default.svc.cluster.local:5432, which is the host/port the
+# router-replay-postgres values file points at.
+apiVersion: v1
+kind: Service
+metadata:
+  name: postgres
+  namespace: default
+spec:
+  selector:
+    app: postgres
+  ports:
+  - name: postgres
+    port: 5432
+    targetPort: 5432
+    protocol: TCP
@@ -52,7 +52,7 @@ Use it to answer three questions before adding or changing a stateful feature:
 | Surface | Primary owner | Current backend / default | Current durability class | Restart behavior today | Scale risk | Recommended direction |
 | --- | --- | --- | --- | --- | --- | --- |
 | Response API stored responses and conversations | router runtime, `src/semantic-router/pkg/responsestore/**` | Default `redis`; optional `memory` for local dev only | `shared_durable_workflow_state` | Response and conversation history survives restart when using the default Redis backend. The `memory` backend emits a startup warning and loses all data on restart. | Replica-local only when `memory` is explicitly selected; Redis backend is shared across replicas | Keep metadata and conversation chain in a durable server-owned store by default for product use. Prefer relational storage for metadata and queryability; keep large payloads in blob/object storage only if needed later. |
-| Router replay records | router runtime, `src/semantic-router/pkg/routerreplay/**`, `src/semantic-router/pkg/extproc/router_replay_setup.go` | Default `memory`; optional Redis, Postgres, Milvus | `audit_analytics_telemetry` presented as `ephemeral_request_state` by default | Restart drops replay history when default backend is used | Debuggability and audit posture degrade under restart and multi-replica routing | Prefer Postgres for durable operator-facing replay history. Keep Redis only for transient debug buffers and Milvus only when semantic replay search is explicitly needed. |
+| Router replay records | router runtime, `src/semantic-router/pkg/routerreplay/**`, `src/semantic-router/pkg/extproc/router_replay_setup.go` | Default `postgres`; optional `redis`, `milvus`, `memory` (local dev only) | `audit_analytics_telemetry` with `shared_durable_workflow_state` default | Replay history survives restart when using the default Postgres backend. The `memory` backend emits a startup warning and loses all records on restart. | Postgres provides SQL queryability for audit and compliance. Redis is available for lightweight deployments. | Keep metadata and replay records in a durable server-owned store by default. Postgres is the default for long-term audit retention and compliance. Keep Milvus only when semantic replay search is explicitly needed. |
 | Semantic cache entries | router runtime, `src/semantic-router/pkg/cache/**` | Default `memory`; optional Redis, Milvus, hybrid | `ephemeral_request_state` in local dev; shared cache in scaled deploys | Restart flushes cache; replicas do not share hot entries by default | Cold-start latency, inconsistent cache hit rates, and uneven behavior across replicas | Keep this as cache, not a database table. Prefer Redis or hybrid shared backends for scaled deployments; document memory backend as local/dev or single-node only. |
 | RAG retrieval result cache | router runtime, `src/semantic-router/pkg/extproc/req_filter_rag_cache.go` | Process-wide singleton in-memory LRU with TTL | `ephemeral_request_state` | Restart flushes cache; cache is global per process, not per tenant or replica | Hidden shared mutable state, no observability, no durability, and no multi-replica coherence | Keep as optional cache only. Move to a pluggable shared cache backend if this becomes performance-critical, or document as local process optimization. |
 | Agentic memory vectors | router runtime, `src/semantic-router/pkg/memory/**` | Disabled by default; vector content leans on Milvus config when enabled | `shared_durable_workflow_state` when enabled | Depends on backend choice; not enabled by default | Product semantics remain ambiguous between experimental memory and supported user data | Keep vector embeddings in Milvus or another vector store, but pair them with explicit metadata and lifecycle ownership in a durable server-owned contract. |
@@ -78,7 +78,6 @@ Use it to answer three questions before adding or changing a stateful feature:
 
 ## Default Memory-Backed Surfaces To Treat As High Risk
 
-- `global.services.router_replay.store_backend = memory`
 - `global.stores.semantic_cache.backend_type = memory`
 - `global.stores.vector_store.backend_type = memory` in dashboard defaults when enabled
 - RAG `cache_results` in `src/semantic-router/pkg/config/rag_plugin.go`
 
@@ -24,6 +24,11 @@ func (r *HTTPResponse) DecodeJSON(v any) error {
 	return nil
 }
 
+// DoGETRequest issues an HTTP GET to url with httpClient and returns the
+// resulting *HTTPResponse, or the error reported by doJSONRequest.
+//
+// It is a thin wrapper: the call is forwarded to doJSONRequest with
+// http.MethodGet and nil for the two trailing arguments (presumably the request
+// payload and extra headers — confirm against doJSONRequest's signature).
+func DoGETRequest(ctx context.Context, httpClient *http.Client, url string) (*HTTPResponse, error) {
+	return doJSONRequest(ctx, httpClient, http.MethodGet, url, nil, nil)
+}
+
 func doJSONRequest(
 	ctx context.Context,
 	httpClient *http.Client,
 
@@ -17,6 +17,7 @@ import (
 	responseapi "github.com/vllm-project/semantic-router/e2e/profiles/response-api"
 	responseapiredis "github.com/vllm-project/semantic-router/e2e/profiles/response-api-redis"
 	responseapirediscluster "github.com/vllm-project/semantic-router/e2e/profiles/response-api-redis-cluster"
+	routerreplaypostgres "github.com/vllm-project/semantic-router/e2e/profiles/router-replay-postgres"
 	routingstrategies "github.com/vllm-project/semantic-router/e2e/profiles/routing-strategies"
 	streaming "github.com/vllm-project/semantic-router/e2e/profiles/streaming"
 )
@@ -61,6 +62,11 @@ func init() {
 		func() framework.Profile { return responseapirediscluster.NewProfile() },
 		framework.ProfileCapabilities{LocalImages: mockVLLMLocalImages},
 	)
+	register(
+		"router-replay-postgres",
+		func() framework.Profile { return routerreplaypostgres.NewProfile() },
+		framework.ProfileCapabilities{LocalImages: mockVLLMLocalImages},
+	)
 	register("routing-strategies", func() framework.Profile { return routingstrategies.NewProfile() }, framework.ProfileCapabilities{})
 	register("streaming", func() framework.Profile { return streaming.NewProfile() }, framework.ProfileCapabilities{})
 }
 
@@ -0,0 +1,67 @@
+package routerreplaypostgres
+
+import (
+	"context"
+
+	"github.com/vllm-project/semantic-router/e2e/pkg/framework"
+	gatewaystack "github.com/vllm-project/semantic-router/e2e/pkg/stacks/gateway"
+)
+
+const (
+	// profileName is the single source of truth for this profile's identifier;
+	// it feeds both the gateway stack config and Profile.Name so the two cannot
+	// drift apart.
+	profileName      = "router-replay-postgres"
+	valuesFile       = "e2e/profiles/router-replay-postgres/values.yaml"
+	postgresManifest = "deploy/kubernetes/router-replay/postgres.yaml"
+)
+
+// resourceManifests are handed to the gateway stack as ResourceManifests.
+// Both files live under the response-api deployment directory rather than a
+// router-replay specific one.
+var resourceManifests = []string{
+	"deploy/kubernetes/response-api/mock-vllm.yaml",
+	"deploy/kubernetes/response-api/gwapi-resources.yaml",
+}
+
+// Profile implements the Router Replay Postgres test profile.
+// Setup, teardown, and service discovery are delegated to the wrapped stack.
+type Profile struct {
+	stack *gatewaystack.Stack
+}
+
+// NewProfile creates a new Router Replay Postgres profile. The underlying
+// gateway stack is configured with this profile's values file, the Postgres
+// manifest as a prerequisite, and resourceManifests.
+func NewProfile() *Profile {
+	return &Profile{
+		stack: gatewaystack.New(gatewaystack.Config{
+			Name:                     profileName,
+			SemanticRouterValuesFile: valuesFile,
+			PrerequisiteManifests:    []string{postgresManifest},
+			ResourceManifests:        resourceManifests,
+		}),
+	}
+}
+
+// Name returns the profile name, which is the same identifier given to the
+// gateway stack in NewProfile.
+func (p *Profile) Name() string {
+	return profileName
+}
+
+// Description returns a one-line, human-readable summary of what this profile
+// exercises. The string is fixed and takes no input from the profile's state.
+func (p *Profile) Description() string {
+	return "Tests Router Replay restart recovery using the default Postgres backend"
+}
+
+// Setup brings the profile's environment up by delegating to the wrapped
+// gateway stack and returning its error unchanged.
+//
+// Given the Config built in NewProfile, this presumably applies the Postgres
+// prerequisite manifest and then installs the router and gateway resources;
+// the actual ordering is owned by gatewaystack, not by this method.
+func (p *Profile) Setup(ctx context.Context, opts *framework.SetupOptions) error {
+	return p.stack.Setup(ctx, opts)
+}
+
+// Teardown removes the stack by delegating to the wrapped gateway stack and
+// returning its error unchanged.
+func (p *Profile) Teardown(ctx context.Context, opts *framework.TeardownOptions) error {
+	return p.stack.Teardown(ctx, opts)
+}
+
+// GetTestCases returns the names of the test cases for this profile.
+// A new slice is built on every call; it currently holds the single
+// restart-recovery case.
+func (p *Profile) GetTestCases() []string {
+	return []string{
+		"router-replay-restart-recovery",
+	}
+}
+
+// GetServiceConfig returns the service configuration for accessing the deployed
+// service. The value comes straight from the wrapped gateway stack.
+func (p *Profile) GetServiceConfig() framework.ServiceConfig {
+	return p.stack.ServiceConfig()
+}
@@ -0,0 +1,111 @@
+replicaCount: 1
+image:
+  repository: ghcr.io/vllm-project/semantic-router/extproc
+  tag: latest
+  pullPolicy: Never
+config:
+  version: v0.3
+  listeners: []
+  providers:
+    defaults:
+      default_model: openai/gpt-oss-20b
+    models:
+      - name: openai/gpt-oss-20b
+        backend_refs:
+          - name: test-endpoint
+            endpoint: mock-vllm.default.svc.cluster.local:8000
+            weight: 1
+  routing:
+    decisions:
+      - name: default_decision
+        description: Default catch-all decision
+        priority: 1
+        rules:
+          operator: AND
+          conditions: []
+        modelRefs:
+          - model: openai/gpt-oss-20b
+            use_reasoning: false
+        plugins:
+          - type: router_replay
+            configuration:
+              enabled: true
+              max_records: 1000
+              capture_request_body: true
+              capture_response_body: true
+              max_body_bytes: 65536
+    signals: {}
+    modelCards:
+      - name: openai/gpt-oss-20b
+  global:
+    router:
+      strategy: priority
+    services:
+      response_api:
+        enabled: true
+        store_backend: memory
+        ttl_seconds: 86400
+        max_responses: 1000
+      # Replay records go to the Postgres instance defined in
+      # deploy/kubernetes/router-replay/postgres.yaml; host, database, user, and
+      # password below must match that manifest.
+      router_replay:
+        store_backend: postgres
+        # 2592000 s = 30 days.
+        ttl_seconds: 2592000
+        # NOTE(review): presumably disabled so a record is persisted before the
+        # restart-recovery test restarts the router — confirm.
+        async_writes: false
+        postgres:
+          host: postgres.default.svc.cluster.local
+          port: 5432
+          database: vsr
+          user: router
+          password: router-secret
+          ssl_mode: disable
+          max_open_conns: 10
+          max_idle_conns: 5
+          # NOTE(review): unit is not stated here; presumably seconds — confirm.
+          conn_max_lifetime: 300
+          table_name: router_replay
+      api:
+        batch_classification:
+          max_batch_size: 100
+          concurrency_threshold: 5
+          max_concurrency: 8
+          metrics:
+            enabled: true
+            detailed_goroutine_tracking: false
+            high_resolution_timing: false
+            sample_rate: 1.0
+      observability:
+        tracing:
+          enabled: false
+    stores:
+      semantic_cache:
+        embedding_model: mmbert
+      memory:
+        embedding_model: mmbert
+      vector_store:
+        embedding_model: mmbert
+    integrations: {}
+    model_catalog:
+      kbs: []
+      modules:
+        prompt_guard:
+          enabled: false
+          model_ref: ""
+          model_id: ""
+          jailbreak_mapping_path: ""
+          use_mmbert_32k: false
+        classifier:
+          domain:
+            model_ref: ""
+            model_id: ""
+            category_mapping_path: ""
+            use_mmbert_32k: false
+          pii:
+            model_ref: ""
+            model_id: ""
+            pii_mapping_path: ""
+            use_mmbert_32k: false
+resources:
+  limits:
+    cpu: '2'
+    memory: 10Gi
+  requests:
+    cpu: 500m
+    memory: 2Gi
Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,7 @@ import (`
`17`	`17`	`responseapi "github.com/vllm-project/semantic-router/e2e/profiles/response-api"`
`18`	`18`	`responseapiredis "github.com/vllm-project/semantic-router/e2e/profiles/response-api-redis"`
`19`	`19`	`responseapirediscluster "github.com/vllm-project/semantic-router/e2e/profiles/response-api-redis-cluster"`
	`20`	`+ routerreplaypostgres "github.com/vllm-project/semantic-router/e2e/profiles/router-replay-postgres"`
`20`	`21`	`routingstrategies "github.com/vllm-project/semantic-router/e2e/profiles/routing-strategies"`
`21`	`22`	`streaming "github.com/vllm-project/semantic-router/e2e/profiles/streaming"`
`22`	`23`	`)`
`@@ -61,6 +62,11 @@ func init() {`
`61`	`62`	`func() framework.Profile { return responseapirediscluster.NewProfile() },`
`62`	`63`	`framework.ProfileCapabilities{LocalImages: mockVLLMLocalImages},`
`63`	`64`	`)`
	`65`	`+ register(`
	`66`	`+ "router-replay-postgres",`
	`67`	`+ func() framework.Profile { return routerreplaypostgres.NewProfile() },`
	`68`	`+ framework.ProfileCapabilities{LocalImages: mockVLLMLocalImages},`
	`69`	`+ )`
`64`	`70`	`register("routing-strategies", func() framework.Profile { return routingstrategies.NewProfile() }, framework.ProfileCapabilities{})`
`65`	`71`	`register("streaming", func() framework.Profile { return streaming.NewProfile() }, framework.ProfileCapabilities{})`
`66`	`72`	`}`