feat: add per-model keep_alive configuration for idle eviction
Allow developers to control how long a model stays loaded in memory before being evicted, following Ollama API semantics. Supports duration strings (e.g. 5m, 1h), 0 for immediate unload, and -1 to keep the model loaded indefinitely.
Signed-off-by: Dorin Geman <dorin.geman@docker.com>
cmd.Flags().Var(NewFloat64PtrValue(&f.GPUMemoryUtilization), "gpu-memory-utilization", "fraction of GPU memory to use for the model executor (0.0-1.0) - vLLM only")
cmd.Flags().Var(NewBoolPtrValue(&f.Think), "think", "enable reasoning mode for thinking models")
cmd.Flags().StringVar(&f.KeepAlive, "keep-alive", "", "duration to keep model loaded (e.g., '5m', '1h', '0' to unload immediately, '-1' to never unload)")
}

// BuildConfigureRequest builds a scheduling.ConfigureRequest from the flags.