TGreen87 · TGreen87 · May 19, 2025 · May 19, 2025
diff --git a/src/agents/_run_impl.py b/src/agents/_run_impl.py
@@ -2,6 +2,7 @@
 
 import asyncio
 import dataclasses
+import hashlib
 import inspect
 from collections.abc import Awaitable
 from dataclasses import dataclass, field
@@ -76,6 +77,18 @@ class QueueCompleteSentinel:
 
 _NOT_FINAL_OUTPUT = ToolsToFinalOutputResult(is_final_output=False, final_output=None)
 
+# Screenshot cache (SHA-1 hex digest -> screenshot string) so identical images are not resent.
+_SCREENSHOT_CACHE: dict[str, str] = {}
+
+
+def _cache_screenshot(data: str) -> tuple[str, bool]:
+    """Hash *data* with SHA-1, remember it on first sight, and return ``(digest, is_new)``."""
+    digest = hashlib.sha1(data.encode()).hexdigest()
+    seen_before = digest in _SCREENSHOT_CACHE
+    if not seen_before:
+        _SCREENSHOT_CACHE[digest] = data
+    return digest, not seen_before
+
 
 @dataclass
 class AgentToolUseTracker:
@@ -849,17 +862,28 @@ async def execute(
             ),
         )
 
-        # TODO: don't send a screenshot every single time, use references
-        image_url = f"data:image/png;base64,{output}"
+        # Cache screenshots to avoid resending duplicate images.
+        image_id, is_new = _cache_screenshot(output)
+        if is_new:
+            image_url = f"data:image/png;base64,{output}"
+            raw_output = {
+                "type": "computer_screenshot",
+                "image_url": image_url,
+                "image_id": image_id,
+            }
+            final_output = image_url
+        else:
+            raw_output = {
+                "type": "computer_screenshot_ref",
+                "image_id": image_id,
+            }
+            final_output = image_id
         return ToolCallOutputItem(
             agent=agent,
-            output=image_url,
+            output=final_output,
             raw_item=ComputerCallOutput(
                 call_id=action.tool_call.call_id,
-                output={
-                    "type": "computer_screenshot",
-                    "image_url": image_url,
-                },
+                output=raw_output,
                 type="computer_call_output",
             ),
         )

diff --git a/tests/test_computer_action.py b/tests/test_computer_action.py
@@ -5,6 +5,7 @@
 hooks and returns the expected ToolCallOutputItem."""
 
 from typing import Any
+import hashlib
 
 import pytest
 from openai.types.responses.response_computer_tool_call import (
@@ -307,5 +308,48 @@ async def test_execute_invokes_hooks_and_returns_tool_call_output() -> None:
     assert isinstance(raw, dict)
     assert raw["type"] == "computer_call_output"
     assert raw["output"]["type"] == "computer_screenshot"
-    assert "image_url" in raw["output"]
     assert raw["output"]["image_url"].endswith("xyz")
+    assert "image_id" in raw["output"]
+
+
+@pytest.mark.asyncio
+async def test_execute_reuses_cached_screenshot() -> None:
+    """A repeated screenshot should return a cached reference instead of data."""
+    import uuid
+
+    # The screenshot cache is module-global; a unique payload guarantees the first call is a miss.
+    payload = uuid.uuid4().hex
+    computer = LoggingComputer(screenshot_return=payload)
+    comptool = ComputerTool(computer=computer)
+    action = ActionClick(type="click", x=0, y=0, button="left")
+    tool_call = ResponseComputerToolCall(
+        id="tool1",
+        type="computer_call",
+        action=action,
+        call_id="tool1",
+        pending_safety_checks=[],
+        status="completed",
+    )
+    tool_call.call_id = "tool1"
+    tool_run = ToolRunComputerAction(tool_call=tool_call, computer_tool=comptool)
+    agent = Agent(name="agent", tools=[comptool])
+    context_wrapper: RunContextWrapper[Any] = RunContextWrapper(context=None)
+    run_hooks = LoggingRunHooks()
+    # Same action twice: the first call caches the data, the second should send only a reference.
+    first, second = [
+        await ComputerAction.execute(
+            agent=agent,
+            action=tool_run,
+            hooks=run_hooks,
+            context_wrapper=context_wrapper,
+            config=RunConfig(),
+        )
+        for _ in range(2)
+    ]
+    digest = hashlib.sha1(payload.encode()).hexdigest()
+    assert first.output == f"data:image/png;base64,{payload}"
+    assert second.output == digest
+    raw_second = second.raw_item
+    assert isinstance(raw_second, dict)
+    assert raw_second["output"]["type"] == "computer_screenshot_ref"
+    assert raw_second["output"]["image_id"] == digest