
Commit 7a7ca1e

hinnefe2 and DouweM authored
Add option to pass expected output to LLMJudge (#1853)
Co-authored-by: Douwe Maan <douwe@pydantic.dev>
1 parent 688a026 · commit 7a7ca1e

4 files changed · +365 -12 lines changed

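In short, this commit adds an `include_expected_output` flag to `LLMJudge`, so the judge model can grade each case's output against its `expected_output`. A minimal usage sketch, not part of the diff below; it assumes the existing `Case`/`Dataset` API from `pydantic_evals` and credentials for the default `openai:gpt-4o` judge model:

```python
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge


async def answer(question: str) -> str:
    # Hypothetical task under evaluation.
    return 'Cerulean'


dataset = Dataset(
    cases=[Case(name='sky', inputs='What color is the sky?', expected_output='Blue')],
    evaluators=[
        LLMJudge(
            rubric="The output is consistent with the expected output but doesn't have to match exactly",
            include_input=True,
            include_expected_output=True,  # new in this commit
        )
    ],
)

report = dataset.evaluate_sync(answer)
report.print()
```

When `include_expected_output=True`, the judge prompt gains an `<ExpectedOutput>` section alongside `<Output>` (and `<Input>` when `include_input=True`), as the diffs below show.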

pydantic_evals/pydantic_evals/evaluators/common.py

Lines changed: 22 additions & 7 deletions
@@ -194,6 +194,7 @@ class LLMJudge(Evaluator[object, object, object]):
     rubric: str
     model: models.Model | models.KnownModelName | None = None
     include_input: bool = False
+    include_expected_output: bool = False
     model_settings: ModelSettings | None = None
     score: OutputConfig | Literal[False] = False
     assertion: OutputConfig | Literal[False] = field(default_factory=lambda: OutputConfig(include_reason=True))
@@ -203,15 +204,29 @@ async def evaluate(
         ctx: EvaluatorContext[object, object, object],
     ) -> EvaluatorOutput:
         if self.include_input:
-            from .llm_as_a_judge import judge_input_output
-
-            grading_output = await judge_input_output(
-                ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
-            )
+            if self.include_expected_output:
+                from .llm_as_a_judge import judge_input_output_expected
+
+                grading_output = await judge_input_output_expected(
+                    ctx.inputs, ctx.output, ctx.expected_output, self.rubric, self.model, self.model_settings
+                )
+            else:
+                from .llm_as_a_judge import judge_input_output
+
+                grading_output = await judge_input_output(
+                    ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
+                )
         else:
-            from .llm_as_a_judge import judge_output
+            if self.include_expected_output:
+                from .llm_as_a_judge import judge_output_expected
+
+                grading_output = await judge_output_expected(
+                    ctx.output, ctx.expected_output, self.rubric, self.model, self.model_settings
+                )
+            else:
+                from .llm_as_a_judge import judge_output

-            grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)
+                grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)

         output: dict[str, EvaluationScalar | EvaluationReason] = {}
         include_both = self.score is not False and self.assertion is not False
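Taken together, the two flags now select among four judge helpers. A quick reference sketch (not from the diff) of which helper `LLMJudge.evaluate` awaits for each combination:

```python
from pydantic_evals.evaluators import LLMJudge

rubric = 'Output is consistent with the expected output'

# include_input / include_expected_output -> helper awaited by LLMJudge.evaluate:
LLMJudge(rubric=rubric)                                                      # judge_output
LLMJudge(rubric=rubric, include_input=True)                                  # judge_input_output
LLMJudge(rubric=rubric, include_expected_output=True)                        # judge_output_expected
LLMJudge(rubric=rubric, include_input=True, include_expected_output=True)    # judge_input_output_expected
```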

pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py

Lines changed: 148 additions & 3 deletions
@@ -9,7 +9,14 @@
 from pydantic_ai import Agent, models
 from pydantic_ai.settings import ModelSettings

-__all__ = ('GradingOutput', 'judge_input_output', 'judge_output', 'set_default_judge_model')
+__all__ = (
+    'GradingOutput',
+    'judge_input_output',
+    'judge_input_output_expected',
+    'judge_output',
+    'judge_output_expected',
+    'set_default_judge_model',
+)


 _default_model: models.Model | models.KnownModelName = 'openai:gpt-4o'
@@ -55,7 +62,16 @@ async def judge_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = f'<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
+    user_prompt = dedent(
+        f"""
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
     return (
         await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output
@@ -96,12 +112,141 @@ async def judge_input_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = f'<Input>\n{_stringify(inputs)}\n</Input>\n<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
+    user_prompt = dedent(
+        f"""
+        <Input>
+        {_stringify(inputs)}
+        </Input>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
     return (
         await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output


+_judge_input_output_expected_agent = Agent(
+    name='judge_input_output_expected',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input, expected output, and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <Input>What color is the sky?</Input>
+        <ExpectedOutput>Blue</ExpectedOutput>
+        <Output>Cerulean</Output>
+        <Rubric>The output is consistent with the expected output but doesn't have to match exactly</Rubric>
+        {"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
+
+        <Input>How many legs does a spider have?</Input>
+        <ExpectedOutput>8</ExpectedOutput>
+        <Output>Six</Output>
+        <Rubric>The output is factually consistent with the expected output</Rubric>
+        {"reason": "Spiders have 8 legs", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_input_output_expected(
+    inputs: Any,
+    output: Any,
+    expected_output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on the inputs and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <Input>
+        {_stringify(inputs)}
+        </Input>
+        <ExpectedOutput>
+        {_stringify(expected_output)}
+        </ExpectedOutput>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+
+    return (
+        await _judge_input_output_expected_agent.run(
+            user_prompt, model=model or _default_model, model_settings=model_settings
+        )
+    ).output
+
+
+_judge_output_expected_agent = Agent(
+    name='judge_output_expected',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided expected output and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <ExpectedOutput>Blue</ExpectedOutput>
+        <Output>Cerulean</Output>
+        <Rubric>The output should be a shade of the expected output color</Rubric>
+        {"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
+
+        <ExpectedOutput>8</ExpectedOutput>
+        <Output>Six</Output>
+        <Rubric>The output should be a number written in words which matches the number written in digits in the expected output</Rubric>
+        {"reason": "The output is 'Six' which is a different number than 8", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_output_expected(
+    output: Any,
+    expected_output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on the expected output, output, and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <ExpectedOutput>
+        {_stringify(expected_output)}
+        </ExpectedOutput>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+    return (
+        await _judge_output_expected_agent.run(
+            user_prompt, model=model or _default_model, model_settings=model_settings
+        )
+    ).output
+
+
 def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:  # pragma: no cover
     """Set the default model used for judging.
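Both new helpers are exported via `__all__` and can be called directly. A minimal sketch of `judge_output_expected`, not from the diff; it uses the default `openai:gpt-4o` judge, so OpenAI credentials are assumed:

```python
import asyncio

from pydantic_evals.evaluators.llm_as_a_judge import judge_output_expected


async def main() -> None:
    grading = await judge_output_expected(
        output='Cerulean',
        expected_output='Blue',
        rubric='The output should be a shade of the expected output color',
    )
    # GradingOutput fields, per the system prompts above: reason, pass (pass_ in Python), score.
    print(grading.reason, grading.pass_, grading.score)


if __name__ == '__main__':
    asyncio.run(main())
```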

tests/evals/test_evaluator_common.py

Lines changed: 82 additions & 2 deletions
@@ -208,11 +208,21 @@ async def test_llm_judge_evaluator(mocker: MockerFixture):
     mock_judge_input_output = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_input_output')
     mock_judge_input_output.return_value = mock_grading_output

+    # Mock the judge_input_output_expected function
+    mock_judge_input_output_expected = mocker.patch(
+        'pydantic_evals.evaluators.llm_as_a_judge.judge_input_output_expected'
+    )
+    mock_judge_input_output_expected.return_value = mock_grading_output
+
+    # Mock the judge_output_expected function
+    mock_judge_output_expected = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_output_expected')
+    mock_judge_output_expected.return_value = mock_grading_output
+
     ctx = EvaluatorContext(
         name='test',
         inputs={'prompt': 'Hello'},
         metadata=None,
-        expected_output=None,
+        expected_output='Hello',
         output='Hello world',
         duration=0.0,
         _span_tree=SpanTreeRecordingError('spans were not recorded'),
@@ -238,6 +248,29 @@ async def test_llm_judge_evaluator(mocker: MockerFixture):
         {'prompt': 'Hello'}, 'Hello world', 'Output contains input', 'openai:gpt-4o', None
     )

+    # Test with input and expected output
+    evaluator = LLMJudge(
+        rubric='Output contains input', include_input=True, include_expected_output=True, model='openai:gpt-4o'
+    )
+    assert to_jsonable_python(await evaluator.evaluate(ctx)) == snapshot(
+        {'LLMJudge': {'value': True, 'reason': 'Test passed'}}
+    )
+
+    mock_judge_input_output_expected.assert_called_once_with(
+        {'prompt': 'Hello'}, 'Hello world', 'Hello', 'Output contains input', 'openai:gpt-4o', None
+    )
+
+    # Test with output and expected output
+    evaluator = LLMJudge(
+        rubric='Output contains input', include_input=False, include_expected_output=True, model='openai:gpt-4o'
+    )
+    assert to_jsonable_python(await evaluator.evaluate(ctx)) == snapshot(
+        {'LLMJudge': {'value': True, 'reason': 'Test passed'}}
+    )
+
+    mock_judge_output_expected.assert_called_once_with(
+        'Hello world', 'Hello', 'Output contains input', 'openai:gpt-4o', None
+    )
     # Test with failing result
     mock_grading_output.score = 0.0
     mock_grading_output.pass_ = False
@@ -273,13 +306,21 @@ async def test_llm_judge_evaluator_with_model_settings(mocker: MockerFixture):
     mock_judge_input_output = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_input_output')
     mock_judge_input_output.return_value = mock_grading_output

+    mock_judge_input_output_expected = mocker.patch(
+        'pydantic_evals.evaluators.llm_as_a_judge.judge_input_output_expected'
+    )
+    mock_judge_input_output_expected.return_value = mock_grading_output
+
+    mock_judge_output_expected = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_output_expected')
+    mock_judge_output_expected.return_value = mock_grading_output
+
     custom_model_settings = ModelSettings(temperature=0.77)

     ctx = EvaluatorContext(
         name='test_custom_settings',
         inputs={'prompt': 'Hello Custom'},
         metadata=None,
-        expected_output=None,
+        expected_output='Hello',
         output='Hello world custom settings',
         duration=0.0,
         _span_tree=SpanTreeRecordingError('spans were not recorded'),
@@ -314,6 +355,45 @@ async def test_llm_judge_evaluator_with_model_settings(mocker: MockerFixture):
         custom_model_settings,
     )

+    # Test with input and expected output, with custom model_settings
+    evaluator_with_input_expected = LLMJudge(
+        rubric='Output contains input with custom settings',
+        include_input=True,
+        include_expected_output=True,
+        model='openai:gpt-3.5-turbo',
+        model_settings=custom_model_settings,
+    )
+    assert to_jsonable_python(await evaluator_with_input_expected.evaluate(ctx)) == snapshot(
+        {'LLMJudge': {'value': True, 'reason': 'Test passed with settings'}}
+    )
+    mock_judge_input_output_expected.assert_called_once_with(
+        {'prompt': 'Hello Custom'},
+        'Hello world custom settings',
+        'Hello',
+        'Output contains input with custom settings',
+        'openai:gpt-3.5-turbo',
+        custom_model_settings,
+    )
+
+    # Test with output and expected output
+    evaluator_with_output_expected = LLMJudge(
+        rubric='Output contains input with custom settings',
+        include_input=False,
+        include_expected_output=True,
+        model='openai:gpt-3.5-turbo',
+        model_settings=custom_model_settings,
+    )
+    assert to_jsonable_python(await evaluator_with_output_expected.evaluate(ctx)) == snapshot(
+        {'LLMJudge': {'value': True, 'reason': 'Test passed with settings'}}
+    )
+    mock_judge_output_expected.assert_called_once_with(
+        'Hello world custom settings',
+        'Hello',
+        'Output contains input with custom settings',
+        'openai:gpt-3.5-turbo',
+        custom_model_settings,
+    )
+

 async def test_python():
     """Test Python evaluator."""

0 commit comments
