-
Notifications
You must be signed in to change notification settings - Fork 56
Expand file tree
/
Copy pathtest_custom_judge_llm.py
More file actions
139 lines (115 loc) · 4.49 KB
/
test_custom_judge_llm.py
File metadata and controls
139 lines (115 loc) · 4.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""
Example test demonstrating a fully custom LLM-based judge.
This example shows how to build a judge that calls an LLM directly using
litellm, with a strict JSON schema for structured output. This gives you
full control over the prompt, model, and response parsing.
"""
import json
import pytest
import litellm
import scenario
from scenario.types import AgentInput, AgentReturnTypes, ScenarioResult
scenario.configure(default_model="openai/gpt-4.1-mini")
class CustomLLMJudge(scenario.AgentAdapter):
    """Judge agent that evaluates a conversation by calling an LLM directly.

    Uses litellm with a strict JSON-schema ``response_format`` so the model's
    verdict can be parsed deterministically into a :class:`ScenarioResult`.
    Gives full control over the prompt, model, and response parsing.
    """

    role = scenario.AgentRole.JUDGE

    def __init__(self, criteria: list[str], model: str = "openai/gpt-4.1-mini"):
        # Default criteria; a judgment request may override them per call.
        self.criteria = criteria
        self.model = model

    async def call(self, input: AgentInput) -> AgentReturnTypes:
        """Evaluate the conversation against the effective criteria.

        Returns an empty message list when no judgment was requested, otherwise
        a ScenarioResult with per-criterion pass/fail breakdown.
        """
        if not input.judgment_request:
            return []

        # Per-request criteria take precedence over the defaults from __init__.
        effective_criteria = (
            input.judgment_request.criteria
            if input.judgment_request.criteria is not None
            else self.criteria
        )

        # Build a simple transcript; messages without "content" (tool calls)
        # are rendered as a placeholder.
        transcript = "\n".join(
            f"{m['role']}: {m.get('content', '[tool call]')}" for m in input.messages
        )
        criteria_numbered = "\n".join(
            f"{i + 1}. {c}" for i, c in enumerate(effective_criteria)
        )

        # temperature=0.0 for reproducible judgments; json_schema response
        # format forces the model to emit exactly the structure parsed below.
        response = litellm.completion(
            model=self.model,
            temperature=0.0,
            messages=[
                {
                    "role": "system",
                    "content": f"""Evaluate this conversation against the criteria.
Criteria:
{criteria_numbered}
Return a result for each criterion using the exact criterion text.""",
                },
                {"role": "user", "content": transcript},
            ],
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "evaluation",
                    "schema": {
                        "type": "object",
                        "properties": {
                            "pass": {"type": "boolean"},
                            "reasoning": {"type": "string"},
                            "results": {
                                "type": "array",
                                "items": {
                                    "type": "object",
                                    "properties": {
                                        "criterion": {"type": "string"},
                                        "met": {"type": "boolean"},
                                    },
                                    "required": ["criterion", "met"],
                                    "additionalProperties": False,
                                },
                            },
                        },
                        "required": ["pass", "reasoning", "results"],
                        "additionalProperties": False,
                    },
                },
            },
        )

        result = json.loads(response.choices[0].message.content)  # type: ignore[union-attr]

        # Map the model's per-criterion verdicts back onto our criteria list.
        # NOTE(review): a criterion the model omits from "results" ends up in
        # neither passed nor failed (defaults False/True below) — presumably
        # intentional "no verdict" handling; confirm if strictness is wanted.
        results_map = {r["criterion"]: r["met"] for r in result["results"]}
        passed = [c for c in effective_criteria if results_map.get(c, False)]
        failed = [c for c in effective_criteria if not results_map.get(c, True)]

        return ScenarioResult(
            success=result["pass"],
            messages=[],
            reasoning=result["reasoning"],
            passed_criteria=passed,
            failed_criteria=failed,
        )
class PoliteAgent(scenario.AgentAdapter):
    """Mock agent under test: ignores its input and always answers politely."""

    async def call(self, input: AgentInput) -> AgentReturnTypes:
        # Canned reply — the conversation state is deliberately ignored.
        reply = (
            "Hello! I'd be happy to help you with that."
            " How can I assist you today?"
        )
        return reply
@pytest.mark.agent_test
@pytest.mark.flaky(reruns=2)
@pytest.mark.asyncio
async def test_custom_llm_judge():
    """Custom LLM judge evaluates a polite agent response."""
    # Build the judge up front so the scenario wiring below stays readable.
    judge = CustomLLMJudge(
        criteria=[
            "Agent responds with a greeting",
            "Agent offers to help",
        ],
    )

    result = await scenario.run(
        name="custom LLM judge",
        description="User greets the agent",
        agents=[PoliteAgent(), scenario.UserSimulatorAgent(), judge],
        script=[
            scenario.user("Hi there!"),
            scenario.agent(),
            scenario.judge(),
        ],
    )

    # The canned polite reply should satisfy both criteria.
    assert result.success
    assert len(result.passed_criteria) == 2