Skip to content

Token usage is not consistent between inspect-ai and chatlas for the solver step #216

@karangattu

Description

@karangattu

This might not be a bug, but it needs some investigation, so I am creating this issue to track that work.
I am seeing inconsistent token usage when inspect-ai is used for the solver step compared to when chatlas is used for the solver step.
Also, how can cached input tokens (cache reads) be leveraged on the chatlas side?

Image
inspect-ai eval code

script_using_inspect_ai_tool_calling.py

from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import model_graded_qa
from inspect_ai.solver import generate, system_message, use_tools
from inspect_ai.tool import ToolError, tool


@tool
def get_weather():
    # Tool factory: takes no configuration and returns the async function
    # defined below.
    async def execute() -> dict[str, dict[str, int | str]]:
        """Get current weather data for various cities."""
        # NOTE(review): the docstring above is presumably what inspect-ai
        # forwards to the model as the tool description -- keep its wording
        # stable when comparing token usage between runs.
        #
        # A new mapping is built on every call, keyed by city name; each
        # record carries "temp", "condition" and "humidity".
        return {
            "New York": {"temp": 72, "condition": "Sunny", "humidity": 65},
            "London": {"temp": 58, "condition": "Rainy", "humidity": 85},
            "Tokyo": {"temp": 68, "condition": "Cloudy", "humidity": 70},
            "Sydney": {"temp": 82, "condition": "Sunny", "humidity": 60},
            "Paris": {"temp": 61, "condition": "Partly Cloudy", "humidity": 72},
        }

    return execute


@tool
def calculate_average():
    # Tool factory: returns the async function that does the averaging.
    async def execute(temperatures: list[float]) -> float:
        """Calculate the average temperature.

        Args:
            temperatures: Temperature readings in degrees Fahrenheit.
        """
        # Happy path first: a non-empty input yields its arithmetic mean.
        if temperatures:
            return sum(temperatures) / len(temperatures)
        # An empty (falsy) input is reported as a ToolError rather than
        # letting the division fail.
        raise ToolError("No temperatures provided")

    return execute


@tool
def compare_values():
    # Tool factory: returns the async function that phrases the comparison.
    async def execute(
        value1: float,
        value2: float,
        label1: str,
        label2: str,
    ) -> str:
        """Compare two numeric values.

        Args:
            value1: First numeric value to compare.
            value2: Second numeric value to compare.
            label1: Description for the first value.
            label2: Description for the second value.
        """
        # The larger value is always named first; when neither value is
        # greater than the other, the "equal" sentence is produced.
        if value1 > value2:
            verdict = f"{label1} ({value1}) is greater than {label2} ({value2})"
        elif value2 > value1:
            verdict = f"{label2} ({value2}) is greater than {label1} ({value1})"
        else:
            verdict = f"{label1} and {label2} are equal ({value1})"
        return verdict

    return execute


@task
def weather_tool_task():
    """Build the inspect-ai weather evaluation driven by tool calls.

    Three prompts are paired with reference answers. The solver chain sets a
    system message, exposes the three weather tools, and then generates; the
    output is scored with ``model_graded_qa``.
    """
    # (prompt, reference answer) pairs, one per sample.
    prompt_answer_pairs = [
        (
            (
                "Using the available tools, get the weather data, "
                "calculate the average temperature across all cities, "
                "and compare the average temperature to London's "
                "temperature. Tell me if the average is higher or "
                "lower than London's temperature."
            ),
            (
                "Average across cities is about 68.2°F. "
                "London's temperature is 58°F. "
                "Average (68.2°F) is about 10°F higher than London's "
                "value (58°F)."
            ),
        ),
        (
            (
                "Get the weather information and determine which city "
                "has the highest humidity. Then compare that city's "
                "temperature to Tokyo's temperature."
            ),
            (
                "London has the highest humidity at 85%. "
                "London's temperature is 58°F, while Tokyo's temperature "
                "is 68°F. Tokyo is warmer than London by 10 degrees."
            ),
        ),
        (
            (
                "Using the tools, find the weather data and calculate "
                "the average temperature of only the sunny cities. "
                "How does this compare to the overall average?"
            ),
            (
                "The sunny cities are New York (72°F) and Sydney (82°F), "
                "with an average of 77°F. This is higher than the overall "
                "average temperature of approximately 68.2°F across all "
                "cities."
            ),
        ),
    ]
    samples = [
        Sample(input=prompt, target=answer)
        for prompt, answer in prompt_answer_pairs
    ]

    # Solver steps run in list order: system message, tool setup, generate.
    solver_steps = [
        system_message(
            "You are a helpful assistant with access to weather tools. "
            "Use the tools systematically to answer questions accurately. "
            "Make multiple tool calls as needed."
        ),
        use_tools([get_weather(), calculate_average(), compare_values()]),
        generate(),
    ]

    return Task(
        dataset=samples,
        solver=solver_steps,
        scorer=model_graded_qa(),
        name="inspect_ai_weather",
        metadata={"tags": ["tool_calling", "weather_analysis", "inspect-ai"]},
        model="openai/gpt-5-nano-2025-08-07",
    )
chatlas eval code

script_using_chatlas_tool_calling.py

from chatlas import ChatOpenAI
from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import model_graded_qa

WEATHER_DATA: dict[str, dict[str, int | str]] = {
    "New York": {
        "temp": 72,
        "condition": "Sunny",
        "humidity": 65,
    },
    "London": {
        "temp": 58,
        "condition": "Rainy",
        "humidity": 85,
    },
    "Tokyo": {
        "temp": 68,
        "condition": "Cloudy",
        "humidity": 70,
    },
    "Sydney": {
        "temp": 82,
        "condition": "Sunny",
        "humidity": 60,
    },
    "Paris": {
        "temp": 61,
        "condition": "Partly Cloudy",
        "humidity": 72,
    },
}


def get_weather() -> dict[str, dict[str, int | str]]:
    """Get current weather data for various cities."""

    # Returns the module-level WEATHER_DATA mapping itself (no copy), so
    # every call returns the same dict object.
    # NOTE(review): the docstring is presumably used as the tool description
    # once this function is registered on the chat -- confirm before
    # rewording it, since that would change the prompt sent to the model.
    return WEATHER_DATA


def calculate_average(temperatures: list[float]) -> float:
    """Calculate the average temperature."""
    # NOTE(review): unlike the inspect-ai variant of this tool, the docstring
    # carries no per-argument description; if the docstring feeds the tool
    # schema this may contribute to differing token counts -- confirm.

    # Happy path first: a non-empty input yields its arithmetic mean.
    if temperatures:
        return sum(temperatures) / len(temperatures)
    # An empty (falsy) input is rejected explicitly instead of dividing by 0.
    raise ValueError("No temperatures provided")


def compare_values(
    value1: float,
    value2: float,
    label1: str,
    label2: str,
) -> str:
    """Compare two numeric values."""
    # NOTE(review): unlike the inspect-ai variant of this tool, the docstring
    # carries no per-argument descriptions; if the docstring feeds the tool
    # schema this may contribute to differing token counts -- confirm.

    # The larger value is always named first; when neither value is greater
    # than the other, the "equal" sentence is produced.
    if value1 > value2:
        outcome = f"{label1} ({value1}) is greater than {label2} ({value2})"
    elif value2 > value1:
        outcome = f"{label2} ({value2}) is greater than {label1} ({value1})"
    else:
        outcome = f"{label1} and {label2} are equal ({value1})"
    return outcome


# Module-level chatlas client used by the task below: one system prompt, one
# model id, and the three plain-Python functions registered as tools.
chat = ChatOpenAI(
    model="gpt-5-nano-2025-08-07",
    system_prompt=(
        "You are a helpful assistant with access to weather tools. "
        "Use the tools systematically to answer questions accurately. "
        "Make multiple tool calls as needed."
    ),
)
# Registration order is unchanged: weather lookup, averaging, comparison.
for _tool_fn in (get_weather, calculate_average, compare_values):
    chat.register_tool(_tool_fn)


@task
def weather_tool_task():
    """Build the chatlas-backed weather evaluation driven by tool calls.

    Three prompts are paired with reference answers. The solver is obtained
    from the module-level ``chat`` object via ``to_solver`` and the output is
    scored with ``model_graded_qa``.
    """
    # (prompt, reference answer) pairs, one per sample.
    prompt_answer_pairs = [
        (
            (
                "Using the available tools, get the weather data, "
                "calculate the average temperature across all cities, "
                "and compare the average temperature to London's "
                "temperature. Tell me if the average is higher or "
                "lower than London's temperature."
            ),
            (
                "Average across cities is about 68.2°F. "
                "London's temperature is 58°F. "
                "Average (68.2°F) is about 10°F higher than London's "
                "value (58°F)."
            ),
        ),
        (
            (
                "Get the weather information and determine which city "
                "has the highest humidity. Then compare that city's "
                "temperature to Tokyo's temperature."
            ),
            (
                "London has the highest humidity at 85%. "
                "London's temperature is 58°F, while Tokyo's temperature "
                "is 68°F. Tokyo is warmer than London by 10 degrees."
            ),
        ),
        (
            (
                "Using the tools, find the weather data and calculate "
                "the average temperature of only the sunny cities. "
                "How does this compare to the overall average?"
            ),
            (
                "The sunny cities are New York (72°F) and Sydney (82°F), "
                "with an average of 77°F. This is higher than the overall "
                "average temperature of approximately 68.2°F across all "
                "cities."
            ),
        ),
    ]
    samples = [
        Sample(input=prompt, target=answer)
        for prompt, answer in prompt_answer_pairs
    ]

    # NOTE(review): include_system_prompt=True presumably carries the chat's
    # system prompt into the solver -- confirm against the chatlas docs.
    chat_solver = chat.to_solver(include_system_prompt=True)

    return Task(
        dataset=samples,
        solver=chat_solver,
        scorer=model_graded_qa(),
        name="chatlas_weather",
        metadata={"tags": ["tool_calling", "weather_analysis", "chatlas"]},
        model="openai/gpt-5-nano-2025-08-07",
    )

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions