Add support for request-time specification of model API keys

RobGeada · RobGeada · commit 4f0d4bff25a1 · 2026-02-28T19:20:11.000Z
diff --git a/examples/configs/api_key_from_header/config.yml b/examples/configs/api_key_from_header/config.yml
@@ -0,0 +1,34 @@
+# Example configuration showing how to extract API keys from HTTP request headers
+#
+# This is useful when you want to use different API keys for different users/requests,
+# rather than a single API key for all requests.
+#
+# To use, add the api_key_header to your model config e.g.:
+# api_key_header: "X-API-Key"
+#  Then, when a request comes in with a matching header:
+#   X-API-Key: sk-abc123xyz
+#
+# The value "sk-abc123xyz" will be used as the Bearer token for the LLM API call.
+
+models:
+  - type: main
+    engine: openai
+    model: gpt-4
+    # Instead of using api_key_env_var (which loads from environment variable),
+    # use api_key_header to extract the API key from each request's headers
+    api_key_header: "X-API-Key"  # The name of the HTTP header containing the API key
+
+# You can also use this with other model types
+# - type: content_safety
+#   engine: openai
+#   model: gpt-3.5-turbo
+#   api_key_header: "X-API-Key"
+
+rails:
+  input:
+    flows:
+      - self check input
+
+  output:
+    flows:
+      - self check output
diff --git a/nemoguardrails/context.py b/nemoguardrails/context.py
@@ -18,11 +18,7 @@
 
 from nemoguardrails.logging.explain import LLMCallInfo
 from nemoguardrails.rails.llm.options import GenerationOptions
-from nemoguardrails.streaming import StreamingHandler
 
-streaming_handler_var: contextvars.ContextVar[Optional[StreamingHandler]] = contextvars.ContextVar(
-    "streaming_handler", default=None
-)
 if TYPE_CHECKING:
     from nemoguardrails.logging.explain import ExplainInfo
     from nemoguardrails.logging.stats import LLMStats
@@ -62,3 +58,7 @@
 llm_response_metadata_var: contextvars.ContextVar[Optional[dict]] = contextvars.ContextVar(
     "llm_response_metadata", default=None
 )
+
+# The HTTP request headers for API requests.
+# This is used to extract API keys or other authentication tokens from headers.
+api_request_headers: contextvars.ContextVar = contextvars.ContextVar("api_request_headers", default=None)
diff --git a/nemoguardrails/llm/models/header_api_key_wrapper.py b/nemoguardrails/llm/models/header_api_key_wrapper.py
@@ -0,0 +1,154 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Module for initializing LLM models with proper error handling and type checking."""
+
+import logging
+from typing import Any, List, Optional
+
+from langchain_core.language_models import BaseChatModel
+from langchain_core.messages import AIMessage
+from pydantic import ConfigDict, Field
+
+from nemoguardrails.context import api_request_headers
+
+log = logging.getLogger(__name__)
+
+
+class HeaderAPIKeyWrapper(BaseChatModel):
+    """Wrapper that injects API keys from request headers at runtime.
+
+    This wrapper intercepts LLM calls and reads the API key from the HTTP request
+    headers (via the api_request_headers context variable from server.api) on every request.
+
+    From testing, this adds negligible time to each LLM call (~1e-06 seconds)
+    """
+
+    wrapped_llm: BaseChatModel = Field(description="The LangChain LLM to wrap")
+    api_key_header: str = Field(description="The name of the HTTP header containing the API key")
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    def __init__(self, llm: BaseChatModel, api_key_header: str, **kwargs):
+        """Initialize the wrapper.
+
+        Args:
+            llm: The LangChain LLM to wrap (must be a BaseChatModel)
+            api_key_header: The name of the HTTP header containing the API key
+        """
+        # Initialize with the data dict for Pydantic
+        super().__init__(**{"wrapped_llm": llm, "api_key_header": api_key_header, **kwargs})
+
+    def _get_api_key_from_headers(self) -> Optional[str]:
+        """Extract API key from the current request headers."""
+        try:
+            headers = api_request_headers.get(None)
+            if headers and self.api_key_header in headers:
+                return headers[self.api_key_header]
+        except LookupError:
+            # Context variable not set (e.g., not in a server request context)
+            pass
+        return None
+
+    def _get_llm_with_api_key(self, api_key: Optional[str]) -> BaseChatModel:
+        """Get LLM instance with the specified API key.
+
+        Creates a new LLM instance if api_key is provided, otherwise returns
+        the wrapped LLM. This ensures thread-safety by avoiding shared state mutation.
+        """
+        if not api_key:
+            return self.wrapped_llm
+
+        # Try ChatOpenAI-specific approach (most common)
+        try:
+            from langchain_openai import ChatOpenAI
+
+            if isinstance(self.wrapped_llm, ChatOpenAI):
+                # Create a shallow copy with the new API key
+                # We use model_dump() to get all current settings, then override the API key
+                config = self.wrapped_llm.model_dump()
+                # Try both parameter names for compatibility
+                config["openai_api_key"] = api_key
+                config["api_key"] = api_key
+                return ChatOpenAI(**config)
+        except Exception as e:
+            log.warning(f"Failed to create ChatOpenAI with custom API key: {e}")
+
+        # Fallback: Try generic model_copy for other providers
+        if hasattr(self.wrapped_llm, "model_copy"):
+            for key_param in ["api_key", "anthropic_api_key", "cohere_api_key"]:
+                try:
+                    return self.wrapped_llm.model_copy(update={key_param: api_key})
+                except (TypeError, ValueError):
+                    continue
+
+        # If all fails, log warning and use default
+        log.warning(
+            f"Unable to create new instance for {type(self.wrapped_llm).__name__}. "
+            f"Using default API key. Multi-tenant isolation not available for this provider."
+        )
+        return self.wrapped_llm
+
+    def _generate(self, messages, stop=None, run_manager=None, **kwargs):
+        """Generate response using the wrapped LLM with runtime API key."""
+        llm = self._get_llm_with_api_key(self._get_api_key_from_headers())
+        return llm._generate(messages, stop=stop, run_manager=run_manager, **kwargs)
+
+    async def _agenerate(self, messages, stop=None, run_manager=None, **kwargs):
+        """Async generate response using the wrapped LLM with runtime API key."""
+        llm = self._get_llm_with_api_key(self._get_api_key_from_headers())
+        return await llm._agenerate(messages, stop=stop, run_manager=run_manager, **kwargs)
+
+    def invoke(
+        self,
+        input: Any,
+        config: Optional[Any] = None,
+        *,
+        stop: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> AIMessage:
+        """Invoke the LLM with runtime API key from headers."""
+        llm = self._get_llm_with_api_key(self._get_api_key_from_headers())
+        return llm.invoke(input, config=config, stop=stop, **kwargs)
+
+    async def ainvoke(
+        self,
+        input: Any,
+        config: Optional[Any] = None,
+        *,
+        stop: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> AIMessage:
+        """Async invoke the LLM with runtime API key from headers."""
+        llm = self._get_llm_with_api_key(self._get_api_key_from_headers())
+        return await llm.ainvoke(input, config=config, stop=stop, **kwargs)
+
+    def _stream(self, messages, stop=None, run_manager=None, **kwargs):
+        """Stream response using the wrapped LLM with runtime API key."""
+        llm = self._get_llm_with_api_key(self._get_api_key_from_headers())
+        yield from llm._stream(messages, stop=stop, run_manager=run_manager, **kwargs)
+
+    async def _astream(self, messages, stop=None, run_manager=None, **kwargs):
+        """Async stream response using the wrapped LLM with runtime API key."""
+        llm = self._get_llm_with_api_key(self._get_api_key_from_headers())
+        async for chunk in llm._astream(messages, stop=stop, run_manager=run_manager, **kwargs):
+            yield chunk
+
+    @property
+    def _llm_type(self) -> str:
+        """Return the LLM type."""
+        return f"header_api_key_wrapper_{self.wrapped_llm._llm_type}"
+
+
+__all__ = ["HeaderAPIKeyWrapper"]
diff --git a/nemoguardrails/rails/llm/config.py b/nemoguardrails/rails/llm/config.py
@@ -118,6 +118,10 @@ class Model(BaseModel):
         default=None,
         description='Optional environment variable with model\'s API Key. Do not include "$".',
     )
+    api_key_header: Optional[str] = Field(
+        default=None,
+        description="Optional HTTP header name from which to extract the API key. The header value will be used as a Bearer token.",
+    )
     parameters: Dict[str, Any] = Field(default_factory=dict)
 
     mode: Literal["chat", "text"] = Field(
diff --git a/nemoguardrails/rails/llm/llmrails.py b/nemoguardrails/rails/llm/llmrails.py
@@ -78,6 +78,7 @@
 )
 from nemoguardrails.kb.kb import KnowledgeBase
 from nemoguardrails.llm.cache import CacheInterface, LFUCache
+from nemoguardrails.llm.models.header_api_key_wrapper import HeaderAPIKeyWrapper
 from nemoguardrails.llm.models.initializer import (
     ModelInitializationError,
     init_llm_model,
@@ -420,6 +421,13 @@ def _init_llms(self):
                     mode="chat",
                     kwargs=kwargs,
                 )
+
+                # Wrap with header-based API key wrapper if configured
+                if main_model.api_key_header and isinstance(self.llm, BaseChatModel):
+                    log.info(
+                        f"Wrapping main LLM with header-based API key extraction from header: {main_model.api_key_header}"
+                    )
+                    self.llm = HeaderAPIKeyWrapper(self.llm, main_model.api_key_header)
                 self.runtime.register_action_param("llm", self.llm)
 
             else:
@@ -453,6 +461,13 @@ def _init_llms(self):
                     kwargs=kwargs,
                 )
 
+                # Wrap with header-based API key wrapper if configured
+                if llm_config.api_key_header and isinstance(llm_model, BaseChatModel):
+                    log.info(
+                        f"Wrapping {llm_config.type} LLM with header-based API key extraction from header: {llm_config.api_key_header}"
+                    )
+                    llm_model = HeaderAPIKeyWrapper(llm_model, llm_config.api_key_header)
+
                 # Configure the model based on its type
                 if llm_config.type == "main":
                     # If a main LLM was already injected, skip creating another
diff --git a/nemoguardrails/server/api.py b/nemoguardrails/server/api.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import asyncio
-import contextvars
 import importlib.util
 import json
 import logging
@@ -34,6 +33,7 @@
 from starlette.staticfiles import StaticFiles
 
 from nemoguardrails import LLMRails, RailsConfig, utils
+from nemoguardrails.context import api_request_headers
 from nemoguardrails.rails.llm.config import Model
 from nemoguardrails.rails.llm.options import GenerationResponse
 from nemoguardrails.server.datastore.datastore import DataStore
@@ -75,9 +75,6 @@ def __init__(self, *args, **kwargs):
 
 api_description = """Guardrails Sever API."""
 
-# The headers for each request
-api_request_headers: contextvars.ContextVar = contextvars.ContextVar("headers")
-
 # The datastore that the Server should use.
 # This is currently used only for storing threads.
 # TODO: refactor to wrap the FastAPI instance inside a RailsServer class
@@ -309,17 +306,33 @@ def _get_rails(config_ids: List[str], model_name: Optional[str] = None) -> LLMRa
         raise ValueError("No valid rails configuration found.")
 
     if model_name:
+        # Get engine from environment or use existing main model's engine
+        existing_main_model = next((m for m in full_llm_rails_config.models if m.type == "main"), None)
+
         engine = os.environ.get("MAIN_MODEL_ENGINE")
-        if not engine:
+        if not engine and existing_main_model:
+            engine = existing_main_model.engine
+        elif not engine:
             engine = "openai"
-            log.warning("MAIN_MODEL_ENGINE not set, defaulting to 'openai'. ")
+            log.warning("No main model in config and MAIN_MODEL_ENGINE not set, defaulting to 'openai'. ")
 
         parameters = {}
         base_url = os.environ.get("MAIN_MODEL_BASE_URL")
         if base_url:
             parameters["base_url"] = base_url
 
-        main_model = Model(model=model_name, type="main", engine=engine, parameters=parameters)
+        # Preserve api_key_header and api_key_env_var from existing config
+        api_key_header = existing_main_model.api_key_header if existing_main_model else None
+        api_key_env_var = existing_main_model.api_key_env_var if existing_main_model else None
+
+        main_model = Model(
+            model=model_name,
+            type="main",
+            engine=engine,
+            parameters=parameters,
+            api_key_header=api_key_header,
+            api_key_env_var=api_key_env_var,
+        )
         full_llm_rails_config = _update_models_in_config(full_llm_rails_config, main_model)
 
     llm_rails = LLMRails(config=full_llm_rails_config, verbose=True)
diff --git a/tests/server/test_header_api_key.py b/tests/server/test_header_api_key.py
diff --git a/tests/test_configs/with_api_key_header/test_config/config.yml b/tests/test_configs/with_api_key_header/test_config/config.yml