Skip to content

Commit 2231475

Browse files
authored
fix(llm): filter stop parameter for OpenAI reasoning models (#1653)
* Fix OpenAI GPT-5 and other reasoning models that do not allow stop tokens as parameters in API requests. * Ensure llm_params in llm_call is not modified; also added a test.
1 parent c9228e9 commit 2231475

File tree

5 files changed

+96
-36
lines changed

5 files changed

+96
-36
lines changed

nemoguardrails/actions/llm/utils.py

Lines changed: 44 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -144,27 +144,41 @@ def _infer_model_name(llm: BaseLanguageModel):
144144
def _filter_params_for_openai_reasoning_models(llm: BaseLanguageModel, llm_params: Optional[dict]) -> Optional[dict]:
145145
"""Filter out unsupported parameters for OpenAI reasoning models.
146146
147-
OpenAI reasoning models (o1, o3, gpt-5 excluding gpt-5-chat) only support
148-
temperature=1. When using .bind() with other temperature values, the API
149-
returns an error. This function removes the temperature parameter for these
150-
models to allow the API default to apply.
147+
OpenAI reasoning models (o1, o3, gpt-5 excluding gpt-5-chat) only allow
148+
specific parameters (e.g. temperature, which is always fixed at 1, or stop).
149+
When using .bind() with different values for these parameters, the API
150+
returns an error. This function removes the unsupported parameters for specific
151+
OpenAI reasoning models to ensure correct functionality for the API calls.
151152
152-
See: https://github.com/langchain-ai/langchain/blob/master/libs/partners/openai/langchain_openai/chat_models/base.py
153+
See also: https://github.com/langchain-ai/langchain/blob/master/libs/partners/openai/langchain_openai/chat_models/base.py
154+
155+
The stop parameter is not supported in the following models (as of Jan 26):
156+
gpt-5+ (only gpt-5-chat-latest works), o3, o3-pro (but o3-mini works), o4-mini
153157
"""
154-
if not llm_params or "temperature" not in llm_params:
158+
if not llm_params or ("temperature" not in llm_params and "stop" not in llm_params):
155159
return llm_params
156160

157161
model_name = _infer_model_name(llm).lower()
158162

159-
is_openai_reasoning_model = (
163+
# Models that do not support temperature as a param, or changing its default value
164+
is_temperature_not_supported = (
160165
model_name.startswith("o1")
161166
or model_name.startswith("o3")
162167
or (model_name.startswith("gpt-5") and "chat" not in model_name)
163168
)
169+
# Models that do not support stop as a param
170+
is_stop_not_supported = (
171+
(model_name.startswith("o3") and "o3-mini" not in model_name)
172+
or model_name.startswith("o4-mini")
173+
or (model_name.startswith("gpt-5") and "gpt-5-chat" not in model_name)
174+
)
164175

165-
if is_openai_reasoning_model:
176+
if is_temperature_not_supported or is_stop_not_supported:
166177
filtered = llm_params.copy()
167-
filtered.pop("temperature", None)
178+
if is_temperature_not_supported:
179+
filtered.pop("temperature", None)
180+
if is_stop_not_supported:
181+
filtered.pop("stop", None)
168182
return filtered
169183

170184
return llm_params
@@ -202,18 +216,25 @@ async def llm_call(
202216
raise LLMCallException(ValueError("No LLM provided to llm_call()"))
203217
_setup_llm_call_info(llm, model_name, model_provider)
204218

205-
filtered_params = _filter_params_for_openai_reasoning_models(llm, llm_params)
219+
llm_params_with_stop: Optional[dict]
220+
if stop:
221+
llm_params_with_stop = llm_params.copy() if llm_params else {}
222+
llm_params_with_stop["stop"] = stop
223+
else:
224+
llm_params_with_stop = llm_params
225+
226+
filtered_params = _filter_params_for_openai_reasoning_models(llm, llm_params_with_stop)
206227
generation_llm: Union[BaseLanguageModel, Runnable] = llm.bind(**filtered_params) if filtered_params else llm
207228

208229
if streaming_handler:
209-
return await _stream_llm_call(generation_llm, prompt, streaming_handler, stop)
230+
return await _stream_llm_call(generation_llm, prompt, streaming_handler)
210231
else:
211232
all_callbacks = _prepare_callbacks(custom_callback_handlers)
212233

213234
if isinstance(prompt, str):
214-
response = await _invoke_with_string_prompt(generation_llm, prompt, all_callbacks, stop)
235+
response = await _invoke_with_string_prompt(generation_llm, prompt, all_callbacks)
215236
else:
216-
response = await _invoke_with_message_list(generation_llm, prompt, all_callbacks, stop)
237+
response = await _invoke_with_message_list(generation_llm, prompt, all_callbacks)
217238

218239
_store_reasoning_traces(response)
219240
_store_tool_calls(response)
@@ -225,7 +246,6 @@ async def _stream_llm_call(
225246
llm: Union[BaseLanguageModel, Runnable],
226247
prompt: Union[str, List[dict]],
227248
handler: "StreamingHandler",
228-
stop: Optional[List[str]],
229249
) -> str:
230250
"""Stream LLM response using astream().
231251
@@ -237,11 +257,17 @@ async def _stream_llm_call(
237257
else:
238258
messages = prompt
239259

240-
handler.stop = stop or []
260+
stop = []
261+
if hasattr(llm, "kwargs"):
262+
current_params = getattr(llm, "kwargs", {})
263+
stop = current_params.get("stop", [])
264+
if not stop:
265+
stop = getattr(llm, "stop", [])
266+
handler.stop = stop
241267
accumulated_metadata: Dict[str, Any] = {}
242268

243269
try:
244-
async for chunk in llm.astream(messages, stop=stop, config=RunnableConfig(callbacks=logging_callbacks)):
270+
async for chunk in llm.astream(messages, config=RunnableConfig(callbacks=logging_callbacks)):
245271
if hasattr(chunk, "content"):
246272
content = chunk.content
247273
else:
@@ -351,11 +377,10 @@ async def _invoke_with_string_prompt(
351377
llm: Union[BaseLanguageModel, Runnable],
352378
prompt: str,
353379
callbacks: BaseCallbackManager,
354-
stop: Optional[List[str]],
355380
):
356381
"""Invoke LLM with string prompt."""
357382
try:
358-
return await llm.ainvoke(prompt, config=RunnableConfig(callbacks=callbacks), stop=stop)
383+
return await llm.ainvoke(prompt, config=RunnableConfig(callbacks=callbacks))
359384
except Exception as e:
360385
_raise_llm_call_exception(e, llm)
361386

@@ -364,13 +389,12 @@ async def _invoke_with_message_list(
364389
llm: Union[BaseLanguageModel, Runnable],
365390
prompt: List[dict],
366391
callbacks: BaseCallbackManager,
367-
stop: Optional[List[str]],
368392
):
369393
"""Invoke LLM with message list after converting to LangChain format."""
370394
messages = _convert_messages_to_langchain_format(prompt)
371395

372396
try:
373-
return await llm.ainvoke(messages, config=RunnableConfig(callbacks=callbacks), stop=stop)
397+
return await llm.ainvoke(messages, config=RunnableConfig(callbacks=callbacks))
374398
except Exception as e:
375399
_raise_llm_call_exception(e, llm)
376400

tests/test_actions_llm_utils.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
)
3434
from nemoguardrails.context import reasoning_trace_var, tool_calls_var
3535
from nemoguardrails.exceptions import LLMCallException
36+
from tests.utils import get_bound_llm_magic_mock
3637

3738

3839
@pytest.fixture(autouse=True)
@@ -547,18 +548,23 @@ def test_store_tool_calls_with_real_aimessage_multiple_tool_calls():
547548

548549
@pytest.mark.asyncio
549550
@pytest.mark.parametrize("llm_params", [None, {}])
550-
async def test_llm_call_stop_tokens_passed_without_llm_params(llm_params):
551-
"""Stop tokens must be passed to ainvoke even when llm_params is None or empty."""
552-
from unittest.mock import AsyncMock, MagicMock
553-
551+
@pytest.mark.parametrize("stop", [None, ["User:"]])
552+
async def test_llm_call_stop_tokens_passed_without_llm_params(llm_params, stop):
553+
"""Stop tokens must be passed to bind or ainvoke even when llm_params is None or empty."""
554554
from nemoguardrails.actions.llm.utils import llm_call
555555

556-
mock_llm = AsyncMock()
557-
mock_llm.ainvoke.return_value = MagicMock(content="response")
556+
mock_llm = get_bound_llm_magic_mock(ainvoke_return_value={"content": "response"})
558557

559-
await llm_call(mock_llm, "prompt", stop=["User:"], llm_params=llm_params)
558+
await llm_call(mock_llm, "prompt", stop=stop, llm_params=llm_params)
560559

561-
assert mock_llm.ainvoke.call_args[1]["stop"] == ["User:"]
560+
if mock_llm.bind.called:
561+
# Option A: Check if .bind() was called with the stop tokens
562+
args, kwargs = mock_llm.bind.call_args
563+
assert kwargs.get("stop", None) == stop
564+
else:
565+
# Option B: Check if it fell back to passing stop to .ainvoke
566+
args, kwargs = mock_llm.ainvoke.call_args
567+
assert kwargs.get("stop", None) == stop
562568

563569

564570
@pytest.mark.asyncio
@@ -677,6 +683,11 @@ class TestFilterParamsForOpenAIReasoningModels:
677683
("gpt-5-nano", {"temperature": 0.001}, {}),
678684
("o1-preview", {"max_tokens": 100}, {"max_tokens": 100}),
679685
("o1-preview", {}, {}),
686+
("gpt-5", {"stop": "stop"}, {}),
687+
("gpt-5-mini", {"temperature": 0.5, "max_tokens": 100, "stop": "stop"}, {"max_tokens": 100}),
688+
("o4-mini", {"stop": "stop"}, {}),
689+
("o3", {"stop": "stop"}, {}),
690+
("o3-pro", {"temperature": 0.5, "stop": "stop"}, {}),
680691
],
681692
)
682693
def test_filter_params(self, model, params, expected):
@@ -694,3 +705,10 @@ def test_does_not_modify_original_params(self):
694705
params = {"temperature": 0.5, "max_tokens": 100}
695706
_filter_params_for_openai_reasoning_models(llm, params)
696707
assert params == {"temperature": 0.5, "max_tokens": 100}
708+
709+
@pytest.mark.asyncio
710+
async def test_llm_call_does_not_mutate_llm_params(self):
711+
mock_llm = get_bound_llm_magic_mock(ainvoke_return_value={"content": "response"})
712+
original_params = {"max_tokens": 100}
713+
await llm_call(mock_llm, "prompt", stop=["User:"], llm_params=original_params)
714+
assert original_params == {"max_tokens": 100}

tests/test_llmrails.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
import os
1717
from typing import Optional
18-
from unittest.mock import MagicMock, patch
18+
from unittest.mock import patch
1919

2020
import pytest
2121
from langchain_core.language_models import BaseChatModel
@@ -24,7 +24,7 @@
2424
from nemoguardrails.logging.explain import ExplainInfo
2525
from nemoguardrails.rails.llm.config import Model
2626
from tests.conftest import REASONING_TRACE_MOCK_PATH
27-
from tests.utils import FakeLLM, clean_events, event_sequence_conforms
27+
from tests.utils import FakeLLM, clean_events, event_sequence_conforms, get_bound_llm_magic_mock
2828

2929

3030
@pytest.fixture
@@ -1059,7 +1059,7 @@ def test_explain_calls_ensure_explain_info():
10591059
"""Make sure if no `explain_info` attribute is present in LLMRails it's populated with
10601060
an empty ExplainInfo object"""
10611061

1062-
mock_llm = MagicMock(spec=BaseChatModel)
1062+
mock_llm = get_bound_llm_magic_mock(ainvoke_return_value={"spec": BaseChatModel})
10631063
config = RailsConfig.from_content(config={"models": []})
10641064
rails = LLMRails(config=config, llm=mock_llm)
10651065
rails.generate(messages=[{"role": "user", "content": "Hi!"}])

tests/test_tool_calling_passthrough_only.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,20 @@
1515

1616
"""Test that tool calling ONLY works in passthrough mode."""
1717

18-
from unittest.mock import AsyncMock, MagicMock
18+
from unittest.mock import MagicMock
1919

2020
import pytest
2121
from langchain_core.messages import AIMessage
2222

2323
from nemoguardrails import LLMRails, RailsConfig
2424
from nemoguardrails.actions.llm.generation import LLMGenerationActions
2525
from nemoguardrails.context import tool_calls_var
26+
from tests.utils import get_bound_llm_magic_mock
2627

2728

2829
@pytest.fixture
2930
def mock_llm_with_tool_calls():
3031
"""Mock LLM that returns tool calls."""
31-
llm = AsyncMock()
32-
3332
mock_response = AIMessage(
3433
content="",
3534
tool_calls=[
@@ -41,8 +40,7 @@ def mock_llm_with_tool_calls():
4140
}
4241
],
4342
)
44-
llm.ainvoke.return_value = mock_response
45-
llm.invoke.return_value = mock_response
43+
llm = get_bound_llm_magic_mock(ainvoke_return_value=mock_response)
4644
return llm
4745

4846

tests/utils.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,14 @@
1919
import sys
2020
from datetime import datetime, timedelta, timezone
2121
from typing import Any, Dict, Iterable, List, Mapping, Optional, Union
22+
from unittest.mock import AsyncMock, MagicMock
2223

2324
from langchain_core.callbacks.manager import (
2425
AsyncCallbackManagerForLLMRun,
2526
CallbackManagerForLLMRun,
2627
)
2728
from langchain_core.language_models import LLM
29+
from langchain_core.messages import AIMessage
2830

2931
from nemoguardrails import LLMRails, RailsConfig
3032
from nemoguardrails.colang import parse_colang_file
@@ -414,3 +416,21 @@ def _init_state(colang_content, yaml_content: Optional[str] = None) -> State:
414416
json.dump(state.flow_configs, sys.stdout, indent=4, cls=EnhancedJsonEncoder)
415417

416418
return state
419+
420+
421+
def get_bound_llm_magic_mock(ainvoke_return_value: Union[AIMessage, dict]) -> MagicMock:
422+
mock_llm = MagicMock()
423+
mock_llm.return_value = mock_llm
424+
425+
bound_llm_mock = AsyncMock()
426+
if isinstance(ainvoke_return_value, dict):
427+
bound_llm_mock.ainvoke.return_value = MagicMock(**ainvoke_return_value)
428+
else:
429+
bound_llm_mock.ainvoke.return_value = ainvoke_return_value
430+
431+
mock_llm.bind.return_value = bound_llm_mock
432+
if isinstance(ainvoke_return_value, dict):
433+
mock_llm.ainvoke = AsyncMock(return_value=MagicMock(**ainvoke_return_value))
434+
else:
435+
mock_llm.ainvoke = AsyncMock(return_value=ainvoke_return_value)
436+
return mock_llm

0 commit comments

Comments
 (0)