Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 8 additions & 9 deletions spoon_ai/agents/toolcall.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
# Prompt injected once the tool-call budget is exhausted: it forces the model
# to produce the final user-facing answer from tool results already in memory,
# in the format the user asked for, rather than a progress recap or a promise
# of future work.
FINAL_RESPONSE_PROMPT = (
    "You have reached the tool budget. Do not call any more tools. "
    "Using only the tool results already in memory, provide the final user-facing answer now. "
    "Follow the latest user's requested output format exactly. "
    "Do not replace it with a recap or progress summary unless the user explicitly asked for one. "
    "Do not describe future actions."
)

class ToolCallAgent(ReActAgent):
Expand Down Expand Up @@ -705,15 +707,12 @@ def _should_finish_execution(self, name: str, result: Any, **kwargs) -> bool:

def _should_terminate_on_finish_reason(self, response) -> bool:
    """Return True when the LLM response signals a terminal (final) answer.

    The normalized ``finish_reason`` is the contract spoon-core relies on.
    Native provider reasons differ across APIs (e.g. the Responses API may
    surface "completed" instead of "stop"/"end_turn"), so additionally
    requiring a legacy native value can turn a valid final answer into a
    repeated loop.

    Args:
        response: LLM response object. ``finish_reason`` is read defensively
            with ``getattr`` because not every response type carries it.

    Returns:
        bool: True iff the normalized finish_reason equals "stop".
    """
    finish_reason = getattr(response, 'finish_reason', None)
    return finish_reason == "stop"

async def _call_llm_with_middleware(
self,
Expand Down
37 changes: 37 additions & 0 deletions tests/test_agent_llm_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,32 @@ async def test_toolcall_agent_with_manager(self, mock_chatbot_manager, tool_mana
mock_chatbot_manager.ask_tool.assert_called()
assert "I'll help you with that task." in result

@pytest.mark.asyncio
async def test_toolcall_agent_terminates_on_stop_even_when_native_reason_is_completed(
    self,
    mock_chatbot_manager,
    tool_manager,
):
    """Responses API may use native status values like 'completed' for terminal answers."""
    # Simulate a terminal answer whose normalized finish_reason is "stop" but
    # whose provider-native reason is "completed" (not the legacy
    # "stop"/"end_turn" values).
    mock_chatbot_manager.ask_tool.return_value = LLMResponse(
        content="FINAL_OK",
        tool_calls=[],
        finish_reason="stop",
        native_finish_reason="completed",
    )

    agent = ToolCallAgent(
        name="test_agent",
        llm=mock_chatbot_manager,
        available_tools=tool_manager,
        max_steps=1,
    )

    result = await agent.run("Reply with exactly: FINAL_OK")

    # The agent must honor the normalized "stop" and terminate after a single
    # LLM call instead of re-invoking the model over the unfamiliar native reason.
    assert result == "FINAL_OK"
    assert mock_chatbot_manager.ask_tool.await_count == 1

@pytest.mark.asyncio
async def test_toolcall_agent_forwards_thinking_flag_to_llm(self, mock_chatbot_manager, tool_manager):
mock_chatbot_manager.ask_tool.return_value = LLMResponse(
Expand Down Expand Up @@ -446,6 +472,17 @@ async def test_toolcall_agent_uses_final_tool_free_summary_after_budget_exhausti
assert result == "Final summary after tool execution."
assert mock_chatbot_manager.ask_tool.await_count == 1
mock_chatbot_manager.ask.assert_awaited_once()
final_messages = mock_chatbot_manager.ask.await_args.kwargs["messages"]
assert any(
"Follow the latest user's requested output format exactly." in msg.content
for msg in final_messages
if isinstance(getattr(msg, "content", None), str)
)
assert any(
"Do not replace it with a recap or progress summary" in msg.content
for msg in final_messages
if isinstance(getattr(msg, "content", None), str)
)

@pytest.mark.asyncio
async def test_agent_memory_consistency(self, mock_chatbot_manager, tool_manager):
Expand Down
Loading