
Commit 7a7ca1e

hinnefe2 and DouweM authored
Add option to pass expected output to LLMJudge (#1853)
Co-authored-by: Douwe Maan <douwe@pydantic.dev>
1 parent 688a026 · commit 7a7ca1e

4 files changed · +365 -12 lines changed

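In short, this commit adds an `include_expected_output` flag to `LLMJudge`, so the judge model can grade each case's output against its `expected_output`. A minimal usage sketch, not part of the diff below; it assumes the existing `Case`/`Dataset` API from `pydantic_evals` and credentials for the default `openai:gpt-4o` judge model:

```python
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge


async def answer(question: str) -> str:
    # Hypothetical task under evaluation.
    return 'Cerulean'


dataset = Dataset(
    cases=[Case(name='sky', inputs='What color is the sky?', expected_output='Blue')],
    evaluators=[
        LLMJudge(
            rubric="The output is consistent with the expected output but doesn't have to match exactly",
            include_input=True,
            include_expected_output=True,  # new in this commit
        )
    ],
)

report = dataset.evaluate_sync(answer)
report.print()
```

When `include_expected_output=True`, the judge prompt gains an `<ExpectedOutput>` section alongside `<Output>` (and `<Input>` when `include_input=True`), as the diffs below show.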

pydantic_evals/pydantic_evals/evaluators/common.py

Lines changed: 22 additions & 7 deletions
@@ -194,6 +194,7 @@ class LLMJudge(Evaluator[object, object, object]):
     rubric: str
     model: models.Model | models.KnownModelName | None = None
     include_input: bool = False
+    include_expected_output: bool = False
     model_settings: ModelSettings | None = None
     score: OutputConfig | Literal[False] = False
     assertion: OutputConfig | Literal[False] = field(default_factory=lambda: OutputConfig(include_reason=True))
@@ -203,15 +204,29 @@ async def evaluate(
         ctx: EvaluatorContext[object, object, object],
     ) -> EvaluatorOutput:
         if self.include_input:
-            from .llm_as_a_judge import judge_input_output
-
-            grading_output = await judge_input_output(
-                ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
-            )
+            if self.include_expected_output:
+                from .llm_as_a_judge import judge_input_output_expected
+
+                grading_output = await judge_input_output_expected(
+                    ctx.inputs, ctx.output, ctx.expected_output, self.rubric, self.model, self.model_settings
+                )
+            else:
+                from .llm_as_a_judge import judge_input_output
+
+                grading_output = await judge_input_output(
+                    ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
+                )
         else:
-            from .llm_as_a_judge import judge_output
+            if self.include_expected_output:
+                from .llm_as_a_judge import judge_output_expected
+
+                grading_output = await judge_output_expected(
+                    ctx.output, ctx.expected_output, self.rubric, self.model, self.model_settings
+                )
+            else:
+                from .llm_as_a_judge import judge_output

-            grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)
+                grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)

         output: dict[str, EvaluationScalar | EvaluationReason] = {}
         include_both = self.score is not False and self.assertion is not False
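Taken together, the two flags now select among four judge helpers. A quick reference sketch (not from the diff) of which helper `LLMJudge.evaluate` awaits for each combination:

```python
from pydantic_evals.evaluators import LLMJudge

rubric = 'Output is consistent with the expected output'

# include_input / include_expected_output -> helper awaited by LLMJudge.evaluate:
LLMJudge(rubric=rubric)                                                      # judge_output
LLMJudge(rubric=rubric, include_input=True)                                  # judge_input_output
LLMJudge(rubric=rubric, include_expected_output=True)                        # judge_output_expected
LLMJudge(rubric=rubric, include_input=True, include_expected_output=True)    # judge_input_output_expected
```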

pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py

Lines changed: 148 additions & 3 deletions
@@ -9,7 +9,14 @@
 from pydantic_ai import Agent, models
 from pydantic_ai.settings import ModelSettings

-__all__ = ('GradingOutput', 'judge_input_output', 'judge_output', 'set_default_judge_model')
+__all__ = (
+    'GradingOutput',
+    'judge_input_output',
+    'judge_input_output_expected',
+    'judge_output',
+    'judge_output_expected',
+    'set_default_judge_model',
+)


 _default_model: models.Model | models.KnownModelName = 'openai:gpt-4o'
@@ -55,7 +62,16 @@ async def judge_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = f'<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
+    user_prompt = dedent(
+        f"""
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
     return (
         await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output
@@ -96,12 +112,141 @@ async def judge_input_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = f'<Input>\n{_stringify(inputs)}\n</Input>\n<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
+    user_prompt = dedent(
+        f"""
+        <Input>
+        {_stringify(inputs)}
+        </Input>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
     return (
         await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output


+_judge_input_output_expected_agent = Agent(
+    name='judge_input_output_expected',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input, expected output, and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <Input>What color is the sky?</Input>
+        <ExpectedOutput>Blue</ExpectedOutput>
+        <Output>Cerulean</Output>
+        <Rubric>The output is consistent with the expected output but doesn't have to match exactly</Rubric>
+        {"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
+
+        <Input>How many legs does a spider have?</Input>
+        <ExpectedOutput>8</ExpectedOutput>
+        <Output>Six</Output>
+        <Rubric>The output is factually consistent with the expected output</Rubric>
+        {"reason": "Spiders have 8 legs", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_input_output_expected(
+    inputs: Any,
+    output: Any,
+    expected_output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on the inputs and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <Input>
+        {_stringify(inputs)}
+        </Input>
+        <ExpectedOutput>
+        {_stringify(expected_output)}
+        </ExpectedOutput>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+
+    return (
+        await _judge_input_output_expected_agent.run(
+            user_prompt, model=model or _default_model, model_settings=model_settings
+        )
+    ).output
+
+
+_judge_output_expected_agent = Agent(
+    name='judge_output_expected',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided expected output and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <ExpectedOutput>Blue</ExpectedOutput>
+        <Output>Cerulean</Output>
+        <Rubric>The output should be a shade of the expected output color</Rubric>
+        {"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
+
+        <ExpectedOutput>8</ExpectedOutput>
+        <Output>Six</Output>
+        <Rubric>The output should be a number written in words which matches the number written in digits in the expected output</Rubric>
+        {"reason": "The output is 'Six' which is a different number than 8", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_output_expected(
+    output: Any,
+    expected_output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on the expected output, output, and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <ExpectedOutput>
+        {_stringify(expected_output)}
+        </ExpectedOutput>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+    return (
+        await _judge_output_expected_agent.run(
+            user_prompt, model=model or _default_model, model_settings=model_settings
+        )
+    ).output
+
+
 def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:  # pragma: no cover
     """Set the default model used for judging.
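Both new helpers are exported via `__all__` and can be called directly. A minimal sketch of `judge_output_expected`, not from the diff; it uses the default `openai:gpt-4o` judge, so OpenAI credentials are assumed:

```python
import asyncio

from pydantic_evals.evaluators.llm_as_a_judge import judge_output_expected


async def main() -> None:
    grading = await judge_output_expected(
        output='Cerulean',
        expected_output='Blue',
        rubric='The output should be a shade of the expected output color',
    )
    # GradingOutput fields, per the system prompts above: reason, pass (pass_ in Python), score.
    print(grading.reason, grading.pass_, grading.score)


if __name__ == '__main__':
    asyncio.run(main())
```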

tests/evals/test_evaluator_common.py

Lines changed: 82 additions & 2 deletions
@@ -208,11 +208,21 @@ async def test_llm_judge_evaluator(mocker: MockerFixture):
     mock_judge_input_output = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_input_output')
     mock_judge_input_output.return_value = mock_grading_output

+    # Mock the judge_input_output_expected function
+    mock_judge_input_output_expected = mocker.patch(
+        'pydantic_evals.evaluators.llm_as_a_judge.judge_input_output_expected'
+    )
+    mock_judge_input_output_expected.return_value = mock_grading_output
+
+    # Mock the judge_output_expected function
+    mock_judge_output_expected = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_output_expected')
+    mock_judge_output_expected.return_value = mock_grading_output
+
     ctx = EvaluatorContext(
         name='test',
         inputs={'prompt': 'Hello'},
         metadata=None,
-        expected_output=None,
+        expected_output='Hello',
         output='Hello world',
         duration=0.0,
         _span_tree=SpanTreeRecordingError('spans were not recorded'),
@@ -238,6 +248,29 @@ async def test_llm_judge_evaluator(mocker: MockerFixture):
         {'prompt': 'Hello'}, 'Hello world', 'Output contains input', 'openai:gpt-4o', None
     )

+    # Test with input and expected output
+    evaluator = LLMJudge(
+        rubric='Output contains input', include_input=True, include_expected_output=True, model='openai:gpt-4o'
+    )
+    assert to_jsonable_python(await evaluator.evaluate(ctx)) == snapshot(
+        {'LLMJudge': {'value': True, 'reason': 'Test passed'}}
+    )
+
+    mock_judge_input_output_expected.assert_called_once_with(
+        {'prompt': 'Hello'}, 'Hello world', 'Hello', 'Output contains input', 'openai:gpt-4o', None
+    )
+
+    # Test with output and expected output
+    evaluator = LLMJudge(
+        rubric='Output contains input', include_input=False, include_expected_output=True, model='openai:gpt-4o'
+    )
+    assert to_jsonable_python(await evaluator.evaluate(ctx)) == snapshot(
+        {'LLMJudge': {'value': True, 'reason': 'Test passed'}}
+    )
+
+    mock_judge_output_expected.assert_called_once_with(
+        'Hello world', 'Hello', 'Output contains input', 'openai:gpt-4o', None
+    )
     # Test with failing result
     mock_grading_output.score = 0.0
     mock_grading_output.pass_ = False
@@ -273,13 +306,21 @@ async def test_llm_judge_evaluator_with_model_settings(mocker: MockerFixture):
     mock_judge_input_output = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_input_output')
     mock_judge_input_output.return_value = mock_grading_output

+    mock_judge_input_output_expected = mocker.patch(
+        'pydantic_evals.evaluators.llm_as_a_judge.judge_input_output_expected'
+    )
+    mock_judge_input_output_expected.return_value = mock_grading_output
+
+    mock_judge_output_expected = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_output_expected')
+    mock_judge_output_expected.return_value = mock_grading_output
+
     custom_model_settings = ModelSettings(temperature=0.77)

     ctx = EvaluatorContext(
         name='test_custom_settings',
         inputs={'prompt': 'Hello Custom'},
         metadata=None,
-        expected_output=None,
+        expected_output='Hello',
         output='Hello world custom settings',
         duration=0.0,
         _span_tree=SpanTreeRecordingError('spans were not recorded'),
@@ -314,6 +355,45 @@ async def test_llm_judge_evaluator_with_model_settings(mocker: MockerFixture):
         custom_model_settings,
     )

+    # Test with input and expected output, with custom model_settings
+    evaluator_with_input_expected = LLMJudge(
+        rubric='Output contains input with custom settings',
+        include_input=True,
+        include_expected_output=True,
+        model='openai:gpt-3.5-turbo',
+        model_settings=custom_model_settings,
+    )
+    assert to_jsonable_python(await evaluator_with_input_expected.evaluate(ctx)) == snapshot(
+        {'LLMJudge': {'value': True, 'reason': 'Test passed with settings'}}
+    )
+    mock_judge_input_output_expected.assert_called_once_with(
+        {'prompt': 'Hello Custom'},
+        'Hello world custom settings',
+        'Hello',
+        'Output contains input with custom settings',
+        'openai:gpt-3.5-turbo',
+        custom_model_settings,
+    )
+
+    # Test with output and expected output
+    evaluator_with_output_expected = LLMJudge(
+        rubric='Output contains input with custom settings',
+        include_input=False,
+        include_expected_output=True,
+        model='openai:gpt-3.5-turbo',
+        model_settings=custom_model_settings,
+    )
+    assert to_jsonable_python(await evaluator_with_output_expected.evaluate(ctx)) == snapshot(
+        {'LLMJudge': {'value': True, 'reason': 'Test passed with settings'}}
+    )
+    mock_judge_output_expected.assert_called_once_with(
+        'Hello world custom settings',
+        'Hello',
+        'Output contains input with custom settings',
+        'openai:gpt-3.5-turbo',
+        custom_model_settings,
+    )
+

 async def test_python():
     """Test Python evaluator."""

0 commit comments
