-
Notifications
You must be signed in to change notification settings - Fork 56
Expand file tree
/
Copy pathtest_custom_judge_llm.py
More file actions
139 lines (115 loc) · 4.49 KB
/
test_custom_judge_llm.py
File metadata and controls
139 lines (115 loc) · 4.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""
Example test demonstrating a fully custom LLM-based judge.
This example shows how to build a judge that calls an LLM directly using
litellm, with a strict JSON schema for structured output. This gives you
full control over the prompt, model, and response parsing.
"""
import json
import pytest
import litellm
import scenario
from scenario.types import AgentInput, AgentReturnTypes, ScenarioResult
scenario.configure(default_model="openai/gpt-4.1-mini")
class CustomLLMJudge(scenario.AgentAdapter):
    """Judge agent that evaluates a conversation by calling an LLM directly.

    Uses litellm with a strict JSON-schema ``response_format`` so the model's
    verdict can be parsed deterministically into a :class:`ScenarioResult`.
    Gives full control over the prompt, model, and response parsing.
    """

    role = scenario.AgentRole.JUDGE

    def __init__(self, criteria: list[str], model: str = "openai/gpt-4.1-mini"):
        # Default criteria; a judgment request may override them per call.
        self.criteria = criteria
        self.model = model

    async def call(self, input: AgentInput) -> AgentReturnTypes:
        """Evaluate the conversation against the effective criteria.

        Returns an empty message list when no judgment was requested, otherwise
        a ScenarioResult with per-criterion pass/fail breakdown.
        """
        if not input.judgment_request:
            return []

        # Per-request criteria take precedence over the defaults from __init__.
        effective_criteria = (
            input.judgment_request.criteria
            if input.judgment_request.criteria is not None
            else self.criteria
        )

        # Build a simple transcript; messages without "content" (tool calls)
        # are rendered as a placeholder.
        transcript = "\n".join(
            f"{m['role']}: {m.get('content', '[tool call]')}" for m in input.messages
        )
        criteria_numbered = "\n".join(
            f"{i + 1}. {c}" for i, c in enumerate(effective_criteria)
        )

        # temperature=0.0 for reproducible judgments; json_schema response
        # format forces the model to emit exactly the structure parsed below.
        response = litellm.completion(
            model=self.model,
            temperature=0.0,
            messages=[
                {
                    "role": "system",
                    "content": f"""Evaluate this conversation against the criteria.
Criteria:
{criteria_numbered}
Return a result for each criterion using the exact criterion text.""",
                },
                {"role": "user", "content": transcript},
            ],
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "evaluation",
                    "schema": {
                        "type": "object",
                        "properties": {
                            "pass": {"type": "boolean"},
                            "reasoning": {"type": "string"},
                            "results": {
                                "type": "array",
                                "items": {
                                    "type": "object",
                                    "properties": {
                                        "criterion": {"type": "string"},
                                        "met": {"type": "boolean"},
                                    },
                                    "required": ["criterion", "met"],
                                    "additionalProperties": False,
                                },
                            },
                        },
                        "required": ["pass", "reasoning", "results"],
                        "additionalProperties": False,
                    },
                },
            },
        )

        result = json.loads(response.choices[0].message.content)  # type: ignore[union-attr]

        # Map the model's per-criterion verdicts back onto our criteria list.
        # NOTE(review): a criterion the model omits from "results" ends up in
        # neither passed nor failed (defaults False/True below) — presumably
        # intentional "no verdict" handling; confirm if strictness is wanted.
        results_map = {r["criterion"]: r["met"] for r in result["results"]}
        passed = [c for c in effective_criteria if results_map.get(c, False)]
        failed = [c for c in effective_criteria if not results_map.get(c, True)]

        return ScenarioResult(
            success=result["pass"],
            messages=[],
            reasoning=result["reasoning"],
            passed_criteria=passed,
            failed_criteria=failed,
        )
class PoliteAgent(scenario.AgentAdapter):
    """Mock agent under test: ignores its input and always answers politely."""

    async def call(self, input: AgentInput) -> AgentReturnTypes:
        # Canned reply — the conversation state is deliberately ignored.
        reply = (
            "Hello! I'd be happy to help you with that."
            " How can I assist you today?"
        )
        return reply
@pytest.mark.agent_test
@pytest.mark.flaky(reruns=2)
@pytest.mark.asyncio
async def test_custom_llm_judge():
    """Custom LLM judge evaluates a polite agent response."""
    # Build the judge up front so the scenario wiring below stays readable.
    judge = CustomLLMJudge(
        criteria=[
            "Agent responds with a greeting",
            "Agent offers to help",
        ],
    )

    result = await scenario.run(
        name="custom LLM judge",
        description="User greets the agent",
        agents=[PoliteAgent(), scenario.UserSimulatorAgent(), judge],
        script=[
            scenario.user("Hi there!"),
            scenario.agent(),
            scenario.judge(),
        ],
    )

    # The canned polite reply should satisfy both criteria.
    assert result.success
    assert len(result.passed_criteria) == 2