# Screenshots are cached so identical images are not resent each step.
#
# Upper bound on the number of cached screenshots. Every entry keeps the full
# base64 payload, so an unbounded module-level cache would keep growing for the
# lifetime of the process.
_SCREENSHOT_CACHE_MAX_ENTRIES = 64

# Maps a screenshot ID (SHA-1 hex digest of the payload) to the payload itself.
# Plain dicts preserve insertion order, which is used here to keep entries
# ordered from least to most recently used.
# NOTE(review): this cache is module-level, so it is shared by every caller in
# the process; confirm that a reference is never emitted to a consumer that has
# not already received the full image.
_SCREENSHOT_CACHE: dict[str, str] = {}


def _cache_screenshot(data: str) -> tuple[str, bool]:
    """Return an ID for the screenshot and whether it was newly cached.

    Args:
        data: The screenshot payload to register (hashed after UTF-8 encoding).

    Returns:
        A ``(image_id, is_new)`` tuple. ``image_id`` is the SHA-1 hex digest of
        ``data``. ``is_new`` is ``True`` when the payload was not in the cache
        (it is stored before returning) and ``False`` when it was already
        cached. A payload that has been evicted is reported as new again.
    """
    image_id = hashlib.sha1(data.encode()).hexdigest()

    if image_id in _SCREENSHOT_CACHE:
        # Re-insert the entry so it becomes the most recently used one.
        _SCREENSHOT_CACHE[image_id] = _SCREENSHOT_CACHE.pop(image_id)
        return image_id, False

    # Make room first: drop the least recently used entries until the new
    # payload fits within the bound.
    while len(_SCREENSHOT_CACHE) >= _SCREENSHOT_CACHE_MAX_ENTRIES:
        del _SCREENSHOT_CACHE[next(iter(_SCREENSHOT_CACHE))]

    _SCREENSHOT_CACHE[image_id] = data
    return image_id, True
@pytest.mark.asyncio
async def test_execute_reuses_cached_screenshot() -> None:
    """A repeated screenshot should return a cached reference instead of data.

    The first execution is expected to carry the full data URL together with
    the screenshot's ID; the second execution of the same action is expected to
    carry only a ``computer_screenshot_ref`` pointing at that ID.
    """
    # NOTE: the screenshot cache lives at module level, so this test relies on
    # the payload "abc" not having been cached earlier in the same process.
    computer = LoggingComputer(screenshot_return="abc")
    comptool = ComputerTool(computer=computer)
    action = ActionClick(type="click", x=0, y=0, button="left")
    tool_call = ResponseComputerToolCall(
        id="tool1",
        type="computer_call",
        action=action,
        call_id="tool1",
        pending_safety_checks=[],
        status="completed",
    )
    tool_call.call_id = "tool1"
    tool_run = ToolRunComputerAction(tool_call=tool_call, computer_tool=comptool)
    agent = Agent(name="agent", tools=[comptool])
    context_wrapper: RunContextWrapper[Any] = RunContextWrapper(context=None)
    run_hooks = LoggingRunHooks()
    # First call caches the screenshot data.
    first = await ComputerAction.execute(
        agent=agent,
        action=tool_run,
        hooks=run_hooks,
        context_wrapper=context_wrapper,
        config=RunConfig(),
    )
    # Second call should send only a reference.
    second = await ComputerAction.execute(
        agent=agent,
        action=tool_run,
        hooks=run_hooks,
        context_wrapper=context_wrapper,
        config=RunConfig(),
    )
    digest = hashlib.sha1("abc".encode()).hexdigest()

    # A newly seen screenshot is sent in full, as a data URL, not as an empty string.
    assert first.output == "data:image/png;base64,abc"
    raw_first = first.raw_item
    assert raw_first["output"]["type"] == "computer_screenshot"
    assert raw_first["output"]["image_id"] == digest

    # The repeated screenshot is replaced by a reference to the cached image.
    assert second.output == digest
    raw_second = second.raw_item
    assert raw_second["output"]["type"] == "computer_screenshot_ref"
    assert raw_second["output"]["image_id"] == digest