-
Notifications
You must be signed in to change notification settings - Fork 20
Open
Description
This might not be a bug, but needs some investigation hence am creating this issue to track that work.
I am seeing inconsistent token usage when inspect-ai is used for the solver step as compared to using chatlas in the solver step.
Also, how does one leverage the cache read on the chatlas side?
inspect-ai eval code
script_using_inspect_ai_tool_calling.py
from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import model_graded_qa
from inspect_ai.solver import generate, system_message, use_tools
from inspect_ai.tool import ToolError, tool
@tool
def get_weather():
    # Tool factory: inspect-ai builds the tool's schema and description
    # from the inner function's signature and docstring, so both are
    # kept exactly as before.
    async def execute() -> dict[str, dict[str, int | str]]:
        """Get current weather data for various cities."""
        # Fixed demo dataset, assembled from per-city records; the
        # resulting mapping is identical to a hand-written literal.
        records = [
            ("New York", 72, "Sunny", 65),
            ("London", 58, "Rainy", 85),
            ("Tokyo", 68, "Cloudy", 70),
            ("Sydney", 82, "Sunny", 60),
            ("Paris", 61, "Partly Cloudy", 72),
        ]
        return {
            city: {"temp": temp, "condition": condition, "humidity": humidity}
            for city, temp, condition, humidity in records
        }

    return execute
@tool
def calculate_average():
    # Tool factory: the inner docstring doubles as the tool description
    # shown to the model, so it is reproduced verbatim.
    async def execute(temperatures: list[float]) -> float:
        """Calculate the average temperature.

        Args:
            temperatures: Temperature readings in degrees Fahrenheit.
        """
        count = len(temperatures)
        if count == 0:
            # Report the problem back to the model as a tool error
            # instead of crashing the eval run.
            raise ToolError("No temperatures provided")
        return sum(temperatures) / count

    return execute
@tool
def compare_values():
    # Tool factory for a three-way numeric comparison helper.
    async def execute(
        value1: float,
        value2: float,
        label1: str,
        label2: str,
    ) -> str:
        """Compare two numeric values.

        Args:
            value1: First numeric value to compare.
            value2: Second numeric value to compare.
            label1: Description for the first value.
            label2: Description for the second value.
        """
        # Decide the larger side first, then emit one shared template;
        # when neither strict comparison holds (equal — or unordered
        # inputs such as NaN) we fall through to the equality message,
        # matching the original branch order.
        if value1 > value2:
            hi, hi_v, lo, lo_v = label1, value1, label2, value2
        elif value2 > value1:
            hi, hi_v, lo, lo_v = label2, value2, label1, value1
        else:
            return f"{label1} and {label2} are equal ({value1})"
        return f"{hi} ({hi_v}) is greater than {lo} ({lo_v})"

    return execute
@task
def weather_tool_task():
    """Task requiring multiple tool calls to analyze weather data."""
    # Three samples; each prompt requires chaining the weather tools
    # (fetch data, average, compare) before answering. Targets are
    # free-text references for the model-graded scorer below.
    dataset: list[Sample] = [
        Sample(
            input=(
                "Using the available tools, get the weather data, "
                "calculate the average temperature across all cities, "
                "and compare the average temperature to London's "
                "temperature. Tell me if the average is higher or "
                "lower than London's temperature."
            ),
            target=(
                "Average across cities is about 68.2°F. "
                "London's temperature is 58°F. "
                "Average (68.2°F) is about 10°F higher than London's "
                "value (58°F)."
            ),
        ),
        Sample(
            input=(
                "Get the weather information and determine which city "
                "has the highest humidity. Then compare that city's "
                "temperature to Tokyo's temperature."
            ),
            target=(
                "London has the highest humidity at 85%. "
                "London's temperature is 58°F, while Tokyo's temperature "
                "is 68°F. Tokyo is warmer than London by 10 degrees."
            ),
        ),
        Sample(
            input=(
                "Using the tools, find the weather data and calculate "
                "the average temperature of only the sunny cities. "
                "How does this compare to the overall average?"
            ),
            target=(
                "The sunny cities are New York (72°F) and Sydney (82°F), "
                "with an average of 77°F. This is higher than the overall "
                "average temperature of approximately 68.2°F across all "
                "cities."
            ),
        ),
    ]
    return Task(
        dataset=dataset,
        solver=[
            # Same system prompt text as the chatlas variant of this
            # eval, so the two runs are directly comparable.
            system_message(
                "You are a helpful assistant with access to weather tools. "
                "Use the tools systematically to answer questions accurately. "
                "Make multiple tool calls as needed."
            ),
            # Instantiate each @tool factory and expose it to the model.
            use_tools([get_weather(), calculate_average(), compare_values()]),
            # Model generation step.
            generate(),
        ],
        # Answers are graded by a model against the sample targets.
        scorer=model_graded_qa(),
        name="inspect_ai_weather",
        metadata={"tags": ["tool_calling", "weather_analysis", "inspect-ai"]},
        model="openai/gpt-5-nano-2025-08-07",
)

chatlas eval code
script_using_chatlas_tool_calling.py
from chatlas import ChatOpenAI
from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import model_graded_qa
# Static demo weather table shared by the tool functions below.
# Keys are city names; each value holds temp (°F, per the tool docstrings),
# condition, and humidity (%). Mirrors the inline data in the inspect-ai
# script's get_weather tool.
WEATHER_DATA: dict[str, dict[str, int | str]] = {
    "New York": {
        "temp": 72,
        "condition": "Sunny",
        "humidity": 65,
    },
    "London": {
        "temp": 58,
        "condition": "Rainy",
        "humidity": 85,
    },
    "Tokyo": {
        "temp": 68,
        "condition": "Cloudy",
        "humidity": 70,
    },
    "Sydney": {
        "temp": 82,
        "condition": "Sunny",
        "humidity": 60,
    },
    "Paris": {
        "temp": 61,
        "condition": "Partly Cloudy",
        "humidity": 72,
    },
}
def get_weather() -> dict[str, dict[str, int | str]]:
    """Get current weather data for various cities."""
    # Returns the shared module-level table directly (not a copy); the
    # docstring above is what chatlas exposes as the tool description.
    return WEATHER_DATA
def calculate_average(temperatures: list[float]) -> float:
    """Calculate the average temperature."""
    # Guard against an empty reading list before dividing.
    count = len(temperatures)
    if count == 0:
        raise ValueError("No temperatures provided")
    return sum(temperatures) / count
def compare_values(
    value1: float,
    value2: float,
    label1: str,
    label2: str,
) -> str:
    """Compare two numeric values."""
    # Pick the larger side first, then render a single shared template;
    # when neither strict comparison holds we fall through to the
    # equality message, matching the original branch order.
    if value1 > value2:
        hi, hi_v, lo, lo_v = label1, value1, label2, value2
    elif value2 > value1:
        hi, hi_v, lo, lo_v = label2, value2, label1, value1
    else:
        return f"{label1} and {label2} are equal ({value1})"
    return f"{hi} ({hi_v}) is greater than {lo} ({lo_v})"
# Module-level chatlas session. The system prompt text is identical to
# the system_message used in the inspect-ai script, so the two eval
# configurations are comparable.
chat = ChatOpenAI(
    system_prompt=(
        "You are a helpful assistant with access to weather tools. "
        "Use the tools systematically to answer questions accurately. "
        "Make multiple tool calls as needed."
    ),
    model="gpt-5-nano-2025-08-07",
)
# Register the plain functions as tools; presumably chatlas derives each
# tool's schema/description from the signature and docstring — confirm
# against the chatlas docs.
chat.register_tool(get_weather)
chat.register_tool(calculate_average)
chat.register_tool(compare_values)
@task
def weather_tool_task():
    """Task requiring multiple tool calls to analyze weather data."""
    # Same three samples as the inspect-ai script, so the two eval runs
    # can be compared like-for-like (e.g. token usage).
    dataset: list[Sample] = [
        Sample(
            input=(
                "Using the available tools, get the weather data, "
                "calculate the average temperature across all cities, "
                "and compare the average temperature to London's "
                "temperature. Tell me if the average is higher or "
                "lower than London's temperature."
            ),
            target=(
                "Average across cities is about 68.2°F. "
                "London's temperature is 58°F. "
                "Average (68.2°F) is about 10°F higher than London's "
                "value (58°F)."
            ),
        ),
        Sample(
            input=(
                "Get the weather information and determine which city "
                "has the highest humidity. Then compare that city's "
                "temperature to Tokyo's temperature."
            ),
            target=(
                "London has the highest humidity at 85%. "
                "London's temperature is 58°F, while Tokyo's temperature "
                "is 68°F. Tokyo is warmer than London by 10 degrees."
            ),
        ),
        Sample(
            input=(
                "Using the tools, find the weather data and calculate "
                "the average temperature of only the sunny cities. "
                "How does this compare to the overall average?"
            ),
            target=(
                "The sunny cities are New York (72°F) and Sydney (82°F), "
                "with an average of 77°F. This is higher than the overall "
                "average temperature of approximately 68.2°F across all "
                "cities."
            ),
        ),
    ]
    return Task(
        dataset=dataset,
        # Generation (including tool calling) is delegated to the chatlas
        # session above via its solver adapter.
        solver=chat.to_solver(include_system_prompt=True),
        scorer=model_graded_qa(),
        name="chatlas_weather",
        metadata={"tags": ["tool_calling", "weather_analysis", "chatlas"]},
        # NOTE(review): generation runs through the chatlas client, which
        # has its own model= above; confirm what this task-level model=
        # applies to (e.g. the grader) — it may matter for the reported
        # token inconsistency.
        model="openai/gpt-5-nano-2025-08-07",
)

Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels