llm-d · NaomiEisen · Mar 2, 2026 · Mar 2, 2026 · Mar 3, 2026 · Mar 3, 2026
@@ -22,7 +22,6 @@ import (
 	"github.com/llm-d/llm-d-kv-cache/examples/testdata"
 	"github.com/llm-d/llm-d-kv-cache/pkg/kvcache/kvblock"
 	"github.com/llm-d/llm-d-kv-cache/pkg/kvevents"
-	"github.com/llm-d/llm-d-kv-cache/pkg/utils"
 	"github.com/vmihailenco/msgpack/v5"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 )
@@ -33,23 +32,27 @@ func SimulateProduceEvent(ctx context.Context, publisher *Publisher) error {
 	logger := log.FromContext(ctx)
 	logger.Info("@@@ Simulating vLLM engine publishing BlockStored events...")
 	medium := "GPU"
-	blockStoredEvent := kvevents.BlockStored{
-		BlockHashes:     utils.SliceMap(testdata.PromptHashes, func(h uint64) any { return h }),
-		ParentBlockHash: nil,
-		TokenIds:        []uint32{1, 2, 3},
-		BlockSize:       256,
-		LoraID:          nil,
-		Medium:          &medium,
-		LoraName:        nil,
+
+	// Create event in vLLM msgpack array format: [tag, hashes, parent, tokens, blockSize, loraID, medium, loraName]
+	blockStoredEvent := []any{
+		"BlockStored",         // Tag
+		testdata.PromptHashes, // BlockHashes (already []uint64)
+		nil,                   // ParentBlockHash
+		[]uint32{1, 2, 3},     // TokenIds
+		256,                   // BlockSize
+		nil,                   // LoraID
+		medium,                // Medium
+		nil,                   // LoraName
 	}
 
 	//nolint // won't fail
-	blockStoredPayload, _ := msgpack.Marshal(blockStoredEvent.ToTaggedUnion())
+	blockStoredPayload, _ := msgpack.Marshal(blockStoredEvent)
 
-	eventBatch := kvevents.EventBatch{
-		TS:               float64(time.Now().UnixNano()) / 1e9,
-		Events:           []msgpack.RawMessage{blockStoredPayload},
-		DataParallelRank: nil,
+	// Create vLLM msgpack event batch in array format: [timestamp, [event1, event2, ...], data_parallel_rank]
+	eventBatch := []any{
+		float64(time.Now().UnixNano()) / 1e9,     // Timestamp
+		[]msgpack.RawMessage{blockStoredPayload}, // Events: nested arrays (vLLM wire format)
+		nil,                                      // DataParallelRank
 	}
 
 	if err := publisher.PublishEvent(ctx, topic, eventBatch); err != nil {
@@ -70,17 +73,21 @@ func SimulateProduceEvent(ctx context.Context, publisher *Publisher) error {
 func SimulateRemoveEvent(ctx context.Context, publisher *Publisher) error {
 	logger := log.FromContext(ctx)
 	logger.Info("@@@ Simulating vLLM engine removing some blocks...")
-	blockRemovedEvent := kvevents.BlockRemoved{
-		BlockHashes: []any{testdata.PromptHashes[2], testdata.PromptHashes[3]},
+
+	// Create event in vLLM msgpack array format: [tag, hashes]
+	blockRemovedEvent := []any{
+		"BlockRemoved",
+		[]uint64{testdata.PromptHashes[2], testdata.PromptHashes[3]},
 	}
 
 	//nolint // won't fail
-	blockRemovedPayload, _ := msgpack.Marshal(blockRemovedEvent.ToTaggedUnion())
+	blockRemovedPayload, _ := msgpack.Marshal(blockRemovedEvent)
 
-	removeEventBatch := kvevents.EventBatch{
-		TS:               float64(time.Now().UnixNano()) / 1e9,
-		Events:           []msgpack.RawMessage{blockRemovedPayload},
-		DataParallelRank: nil,
+	// Create vLLM msgpack event batch in array format: [timestamp, [event1, event2, ...], data_parallel_rank]
+	removeEventBatch := []any{
+		float64(time.Now().UnixNano()) / 1e9,
+		[]msgpack.RawMessage{blockRemovedPayload},
+		nil,
 	}
 
 	if err := publisher.PublishEvent(ctx, topic, removeEventBatch); err != nil {

@@ -27,6 +27,7 @@ import (
 	"github.com/llm-d/llm-d-kv-cache/pkg/kvcache"
 	"github.com/llm-d/llm-d-kv-cache/pkg/kvcache/kvblock"
 	"github.com/llm-d/llm-d-kv-cache/pkg/kvevents"
+	"github.com/llm-d/llm-d-kv-cache/pkg/kvevents/engineadapter"
 	types "github.com/llm-d/llm-d-kv-cache/pkg/tokenization/types"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
@@ -142,7 +143,8 @@ func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePr
 	if config.KVEventsConfig.ZMQEndpoint != "" {
 		// setup local subscriber to support global socket mode
 		if err := subscribersManager.EnsureSubscriber(ctx, "local-subscriber",
-			config.KVEventsConfig.ZMQEndpoint, config.KVEventsConfig.TopicFilter, false); err != nil {
+			config.KVEventsConfig.ZMQEndpoint, config.KVEventsConfig.TopicFilter,
+			engineadapter.EngineTypeVLLM, false); err != nil {
 			return nil, fmt.Errorf("failed to create local subscriber for global socket mode: %w", err)
 		}
 	}

@@ -32,6 +32,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/predicate"
 
 	"github.com/llm-d/llm-d-kv-cache/pkg/kvevents"
+	"github.com/llm-d/llm-d-kv-cache/pkg/kvevents/engineadapter"
 	"github.com/llm-d/llm-d-kv-cache/pkg/utils/logging"
 )
 
@@ -44,8 +45,10 @@ type PodReconcilerConfig struct {
 	PodNamespace string
 	// TopicFilter is the ZMQ subscription filter (e.g., "kv@").
 	TopicFilter string
-	// SocketPort is the port where vLLM pods expose ZMQ (default: 5557).
+	// SocketPort is the port where LLM pods expose ZMQ (default: 5557).
 	SocketPort string
+	// EngineType specifies which LLM engine type this reconciler manages.
+	EngineType string
 }
 
 // NewPodReconcilerConfig creates a PodReconcilerConfig from kvevents.PodDiscoveryConfig.
@@ -71,6 +74,7 @@ func NewPodReconcilerConfig(cfg *kvevents.PodDiscoveryConfig, topicFilter string
 		PodNamespace:     cfg.PodNamespace,
 		TopicFilter:      topicFilter,
 		SocketPort:       fmt.Sprintf("%d", socketPort),
+		EngineType:       cfg.EngineType,
 	}, nil
 }
 
@@ -118,13 +122,18 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
 	podIdentifier := req.String()
 	endpoint := r.buildEndpoint(&pod)
 
+	// Get engine type from config (currently vLLM only)
+	engineType := engineadapter.EngineType(r.Config.EngineType)
+
 	debugLogger.Info("Ensuring subscriber for pod",
 		"pod", req,
 		"endpoint", endpoint,
-		"podIP", pod.Status.PodIP)
+		"podIP", pod.Status.PodIP,
+		"engineType", engineType)
 
-	if err := r.SubscriberManager.EnsureSubscriber(ctx, podIdentifier, endpoint, r.Config.TopicFilter, true); err != nil {
-		debugLogger.Error(err, "Failed to ensure subscriber for pod", "pod", req)
+	if err := r.SubscriberManager.EnsureSubscriber(ctx, podIdentifier, endpoint,
+		r.Config.TopicFilter, engineType, true); err != nil {
+		debugLogger.Error(err, "Failed to ensure subscriber for pod", "pod", req, "engineType", engineType)
 		return ctrl.Result{}, err
 	}
 

@@ -78,7 +78,7 @@ def create_llm():
         disable_hybrid_kv_cache_manager=True,
         kv_events_config=kv_events_config,
         block_size=16,
-        prefix_caching_hash_algo="sha256_cbor",
+        prefix_caching_hash_algo="sha256_cbor_64bit",
         enable_lora=True,
         max_model_len=4096,
     )

@@ -0,0 +1,87 @@
+/*
+Copyright 2026 The llm-d Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package engineadapter
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/llm-d/llm-d-kv-cache/pkg/kvevents/events"
+)
+
+// EngineType identifies an LLM inference engine; NewAdapter switches on it to pick an adapter.
+type EngineType string
+
+const (
+	// EngineTypeVLLM is the EngineType value ("vllm") that selects the vLLM adapter.
+	EngineTypeVLLM EngineType = "vllm"
+)
+
+// RawMessage holds the pre-parsed framing metadata of a received transport
+// message, with the payload kept as raw (not yet decoded) bytes.
+// It is returned by ReceiveMessage and passed to DecodeMessageToEventBatch.
+type RawMessage struct {
+	// PodID is the pod identifier parsed from the topic.
+	PodID string
+	// ModelName is the model name parsed from the topic.
+	ModelName string
+	// Sequence is the message sequence number from the transport.
+	Sequence uint64
+	// Topic is the original, unparsed transport topic string.
+	Topic string
+	// Payload is the raw msgpack-encoded event batch bytes, not yet decoded.
+	Payload []byte
+	// Adapter is the engine adapter that can decode Payload.
+	Adapter EngineAdapter
+}
+
+// NewAdapter returns the EngineAdapter for engineType, or an error if the engine type is unknown.
+func NewAdapter(engineType EngineType) (EngineAdapter, error) {
+	// One case today; new engines get a case here. %q keeps an empty type visible in the error.
+	switch engineType {
+	case EngineTypeVLLM:
+		return NewVLLMAdapter()
+	default:
+		return nil, fmt.Errorf("unknown engine type: %q", engineType)
+	}
+}
+
+// EngineAdapter is the interface implemented by engine-specific adapters.
+// Each inference engine has its own adapter implementation that handles
+// message receiving, payload decoding, and connection management; see NewAdapter.
+type EngineAdapter interface {
+	// ReceiveMessage receives a raw message and returns a RawMessage
+	// with pre-parsed framing metadata, but with the payload still in raw bytes.
+	// This is intentionally cheap: no event payload decoding happens here.
+	ReceiveMessage(ctx context.Context) (*RawMessage, error)
+
+	// DecodeMessageToEventBatch decodes the raw payload of msg into a
+	// fully populated EventBatch.
+	DecodeMessageToEventBatch(msg *RawMessage) (*events.EventBatch, error)
+
+	// Connect establishes a connection to the remote endpoint.
+	Connect(ctx context.Context, endpoint string) error
+
+	// Bind listens on the local endpoint for incoming connections.
+	Bind(ctx context.Context, endpoint string) error
+
+	// SubscribeToTopic sets the topic filter used when receiving messages.
+	SubscribeToTopic(topicFilter string) error
+
+	// Close closes the adapter and releases all resources.
+	Close() error
+}