# Testing (vision_agents.testing)
The vision_agents.testing module provides a lightweight testing layer for verifying agent behavior — tool calls, arguments, responses, and intent — without spinning up audio/video infrastructure.
This framework uses familiar pytest patterns. No custom test runner required.

Installation

The testing module is included with Vision Agents:
uv add vision-agents
Configure pytest for async support in pytest.ini:
[pytest]
asyncio_mode = auto

Core concepts

| Component | Purpose |
| --- | --- |
| TestSession | Async context manager that wraps an LLM for testing |
| TestResponse | Result of a conversation turn with events and assertions |
| LLMJudge | Evaluates agent responses against target intents |
| mock_tools | Temporarily replaces tool implementations |
| mock_functions | Wraps tools in AsyncMock for call tracking |

Basic usage

Testing a greeting

from vision_agents.plugins import gemini
from vision_agents.testing import LLMJudge, TestSession

async def test_greeting():
    llm = gemini.LLM("gemini-2.0-flash")
    judge = LLMJudge(gemini.LLM("gemini-2.0-flash"))

    async with TestSession(llm=llm, instructions="Be friendly") as session:
        response = await session.simple_response("Hello")

        # Verify no tools were called
        assert response.function_calls == []

        # Judge the response intent
        verdict = await judge.evaluate(
            response.chat_messages[0],
            intent="Friendly greeting"
        )
        assert verdict.success, verdict.reason

Testing tool calls

async def test_weather():
    llm = gemini.LLM("gemini-2.0-flash")
    judge = LLMJudge(gemini.LLM("gemini-2.0-flash"))

    @llm.register_function(description="Get weather for a location")
    async def get_weather(location: str) -> dict:
        return {"temp": 72, "condition": "sunny"}

    async with TestSession(llm=llm, instructions="You can check weather") as session:
        response = await session.simple_response("Weather in Tokyo?")

        # Assert the tool was called with expected arguments
        response.assert_function_called("get_weather", arguments={"location": "Tokyo"})

        # Judge the response
        verdict = await judge.evaluate(
            response.chat_messages[0],
            intent="Reports weather for Tokyo"
        )
        assert verdict.success, verdict.reason

TestResponse assertions

TestResponse provides built-in assertion methods:

assert_function_called

Verifies a tool was called with expected arguments (partial match):
# Check function was called with specific argument
response.assert_function_called("get_weather", arguments={"location": "Tokyo"})

# Check function was called (any arguments)
response.assert_function_called("get_weather")

# Check any function was called
response.assert_function_called()

assert_function_output

Verifies tool output:
# Check exact output
response.assert_function_output("get_weather", output={"temp": 72, "condition": "sunny"})

# Check if output was an error
response.assert_function_output("get_weather", is_error=True)

Accessing events directly

# Pre-computed lists for inspection
response.function_calls  # List of FunctionCallEvent
response.chat_messages   # List of ChatMessageEvent
response.events          # All events in order
response.output          # Final assistant message text
response.duration_ms     # Response time in milliseconds

Mocking tools

mock_tools

Replace tool implementations without changing the schema visible to the LLM:
from vision_agents.testing import mock_tools

async def test_with_mock_tools():
    llm = gemini.LLM("gemini-2.0-flash")

    @llm.register_function(description="Get weather")
    async def get_weather(location: str) -> dict:
        return {"temp": 72}  # Real implementation

    async with TestSession(llm=llm, instructions="...") as session:
        with mock_tools(llm, {"get_weather": lambda location: {"temp": 55, "condition": "rainy"}}):
            response = await session.simple_response("Weather in Seattle?")
            # Tool returns mocked value

mock_functions

Wrap tools in AsyncMock for call tracking with standard unittest.mock assertions:
async def test_with_mock_functions():
    llm = gemini.LLM("gemini-2.0-flash")

    @llm.register_function(description="Get weather")
    async def get_weather(location: str) -> dict:
        return {"temp": 72}

    async with TestSession(llm=llm, instructions="...") as session:
        with session.mock_functions(
            {"get_weather": lambda **_: {"temp": 55, "condition": "rainy"}}
        ) as mocked:
            response = await session.simple_response("Weather in Berlin?")

            # unittest.mock assertions
            mocked["get_weather"].assert_called_once()
            mocked["get_weather"].assert_called_with(location="Berlin")

            # TestResponse assertion
            response.assert_function_output(
                "get_weather",
                output={"temp": 55, "condition": "rainy"}
            )

LLM-as-judge

LLMJudge uses a separate LLM instance to evaluate whether agent responses match target intents:
from vision_agents.testing import LLMJudge, JudgeVerdict

# Use a separate LLM instance for judging
judge = LLMJudge(gemini.LLM("gemini-2.0-flash"))

# Evaluate a response
verdict: JudgeVerdict = await judge.evaluate(
    response.chat_messages[0],
    intent="Provides a helpful, accurate weather report"
)

if verdict.success:
    print(f"Passed: {verdict.reason}")
else:
    print(f"Failed: {verdict.reason}")
Use a separate LLM instance for the judge to avoid polluting the agent’s conversation history.

Event types

The framework captures three event types during a conversation turn:
| Event | Description | Fields |
| --- | --- | --- |
| ChatMessageEvent | Assistant or user message | role, content |
| FunctionCallEvent | Tool invocation request | name, arguments, tool_call_id |
| FunctionCallOutputEvent | Tool execution result | name, output, is_error, execution_time_ms |

Complete example

import os
import pytest
from vision_agents.plugins import gemini
from vision_agents.testing import LLMJudge, TestSession

MODEL = "gemini-2.0-flash"
INSTRUCTIONS = """You are a helpful assistant.
You can check the weather using the get_weather tool."""


def setup_llm(model: str):
    llm = gemini.LLM(model)

    @llm.register_function(description="Get weather for a location")
    async def get_weather(location: str) -> dict:
        return {"temp_f": 72, "condition": "sunny"}

    return llm


@pytest.mark.integration
async def test_greeting():
    """Agent gives a friendly, short greeting."""
    llm = setup_llm(MODEL)
    judge = LLMJudge(gemini.LLM(MODEL))

    async with TestSession(llm=llm, instructions=INSTRUCTIONS) as session:
        response = await session.simple_response("Hey there!")
        assert response.function_calls == []
        verdict = await judge.evaluate(
            response.chat_messages[0],
            intent="Friendly, short greeting"
        )
        assert verdict.success, verdict.reason


@pytest.mark.integration
async def test_weather_tool_call():
    """Agent calls get_weather with the right location."""
    llm = setup_llm(MODEL)
    judge = LLMJudge(gemini.LLM(MODEL))

    async with TestSession(llm=llm, instructions=INSTRUCTIONS) as session:
        response = await session.simple_response("What's the weather in Berlin?")
        response.assert_function_called("get_weather", arguments={"location": "Berlin"})
        verdict = await judge.evaluate(
            response.chat_messages[0],
            intent="Reports current weather for Berlin"
        )
        assert verdict.success, verdict.reason


@pytest.mark.integration
async def test_weather_mocked():
    """Verify tool calls with mocked implementation."""
    llm = setup_llm(MODEL)
    judge = LLMJudge(gemini.LLM(MODEL))

    async with TestSession(llm=llm, instructions=INSTRUCTIONS) as session:
        with session.mock_functions(
            {"get_weather": lambda **_: {"temp_f": 55, "condition": "rainy"}}
        ) as mocked:
            response = await session.simple_response("Weather in Berlin?")

            mocked["get_weather"].assert_called_once()
            mocked["get_weather"].assert_called_with(location="Berlin")

            response.assert_function_output(
                "get_weather",
                output={"temp_f": 55, "condition": "rainy"}
            )

            verdict = await judge.evaluate(
                response.chat_messages[0],
                intent="Reports rainy weather for Berlin"
            )
            assert verdict.success, verdict.reason
Run tests:
uv run pytest tests/ -m integration

API reference

TestSession

| Parameter | Type | Description |
| --- | --- | --- |
| llm | LLM | LLM instance with tools registered |
| instructions | str | System instructions for the agent |
Methods:
  • simple_response(text: str) -> TestResponse — Send user text and capture response
  • mock_functions(mocks: dict) -> ContextManager[dict[str, AsyncMock]] — Mock tools with call tracking

TestResponse

| Property | Type | Description |
| --- | --- | --- |
| input | str | User input text |
| output | str \| None | Final assistant message |
| events | list[RunEvent] | All captured events |
| function_calls | list[FunctionCallEvent] | Tool call events |
| chat_messages | list[ChatMessageEvent] | Message events |
| duration_ms | float | Response time |

LLMJudge

| Parameter | Type | Description |
| --- | --- | --- |
| llm | LLM | Separate LLM instance for evaluation |
Methods:
  • evaluate(event: ChatMessageEvent, intent: str) -> JudgeVerdict — Evaluate response against intent

JudgeVerdict

| Property | Type | Description |
| --- | --- | --- |
| success | bool | Whether the intent was fulfilled |
| reason | str | Explanation of the verdict |

Next steps