Skip to content

Implement screenshot caching #2

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 31 additions & 7 deletions src/agents/_run_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import asyncio
import dataclasses
import hashlib
import inspect
from collections.abc import Awaitable
from dataclasses import dataclass, field
Expand Down Expand Up @@ -76,6 +77,18 @@ class QueueCompleteSentinel:

_NOT_FINAL_OUTPUT = ToolsToFinalOutputResult(is_final_output=False, final_output=None)

# Module-level store of screenshot payloads, keyed by the SHA-1 hex digest of
# the payload, so an identical image does not have to be resent on a later step.
_SCREENSHOT_CACHE: dict[str, str] = {}


def _cache_screenshot(data: str) -> tuple[str, bool]:
    """Register a screenshot payload in the module cache and report its ID.

    Args:
        data: The screenshot payload as a string. It is encoded with the
            default ``str.encode()`` before hashing.

    Returns:
        A ``(image_id, is_new)`` pair. ``image_id`` is the SHA-1 hex digest
        of ``data``; ``is_new`` is ``True`` when this call inserted the
        payload into the cache and ``False`` when it was already present.
    """
    digest = hashlib.sha1(data.encode()).hexdigest()
    is_new = digest not in _SCREENSHOT_CACHE
    if is_new:
        # Only the first sighting stores the payload; repeats leave it as-is.
        _SCREENSHOT_CACHE[digest] = data
    return digest, is_new


@dataclass
class AgentToolUseTracker:
Expand Down Expand Up @@ -849,17 +862,28 @@ async def execute(
),
)

# TODO: don't send a screenshot every single time, use references
image_url = f"data:image/png;base64,{output}"
# Cache screenshots to avoid resending duplicate images.
image_id, is_new = _cache_screenshot(output)
if is_new:
image_url = f"data:image/png;base64,{output}"
raw_output = {
"type": "computer_screenshot",
"image_url": image_url,
"image_id": image_id,
}
final_output = image_url
else:
raw_output = {
"type": "computer_screenshot_ref",
"image_id": image_id,
}
final_output = image_id
return ToolCallOutputItem(
agent=agent,
output=image_url,
output=final_output,
raw_item=ComputerCallOutput(
call_id=action.tool_call.call_id,
output={
"type": "computer_screenshot",
"image_url": image_url,
},
output=raw_output,
type="computer_call_output",
),
)
Expand Down
46 changes: 45 additions & 1 deletion tests/test_computer_action.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
hooks and returns the expected ToolCallOutputItem."""

from typing import Any
import hashlib

import pytest
from openai.types.responses.response_computer_tool_call import (
Expand Down Expand Up @@ -307,5 +308,48 @@ async def test_execute_invokes_hooks_and_returns_tool_call_output() -> None:
assert isinstance(raw, dict)
assert raw["type"] == "computer_call_output"
assert raw["output"]["type"] == "computer_screenshot"
assert "image_url" in raw["output"]
assert raw["output"]["image_url"].endswith("xyz")
assert "image_id" in raw["output"]


@pytest.mark.asyncio
async def test_execute_reuses_cached_screenshot() -> None:
    """A repeated screenshot should return a cached reference instead of data.

    The same computer action is executed twice against a computer that always
    returns the screenshot payload ``"abc"``. The first result is expected to
    carry the full data URL; the second is expected to carry only the SHA-1
    digest of the payload, wrapped in a ``computer_screenshot_ref`` output.
    """
    computer = LoggingComputer(screenshot_return="abc")
    comptool = ComputerTool(computer=computer)
    action = ActionClick(type="click", x=0, y=0, button="left")
    tool_call = ResponseComputerToolCall(
        id="tool1",
        type="computer_call",
        action=action,
        call_id="tool1",
        pending_safety_checks=[],
        status="completed",
    )
    tool_call.call_id = "tool1"
    tool_run = ToolRunComputerAction(tool_call=tool_call, computer_tool=comptool)
    agent = Agent(name="agent", tools=[comptool])
    context_wrapper: RunContextWrapper[Any] = RunContextWrapper(context=None)
    run_hooks = LoggingRunHooks()
    # First call caches the screenshot data.
    first = await ComputerAction.execute(
        agent=agent,
        action=tool_run,
        hooks=run_hooks,
        context_wrapper=context_wrapper,
        config=RunConfig(),
    )
    # Second call should send only a reference.
    second = await ComputerAction.execute(
        agent=agent,
        action=tool_run,
        hooks=run_hooks,
        context_wrapper=context_wrapper,
        config=RunConfig(),
    )
    digest = hashlib.sha1("abc".encode()).hexdigest()
    # A newly cached screenshot is returned as the complete data URL, not an
    # empty string.
    assert first.output == "data:image/png;base64,abc"
    assert second.output == digest
    raw_second = second.raw_item
    # Guard before subscripting, as the hook test does for its raw item.
    assert isinstance(raw_second, dict)
    assert raw_second["output"]["type"] == "computer_screenshot_ref"
    assert raw_second["output"]["image_id"] == digest