diff --git a/START_GPU_IPC_SERVER.bat b/START_GPU_IPC_SERVER.bat
new file mode 100644
index 0000000..40b8750
--- /dev/null
+++ b/START_GPU_IPC_SERVER.bat
@@ -0,0 +1,57 @@
+@echo off
+REM Agent Arena - GPU-Accelerated IPC Server Startup Script
+REM Starts run_ipc_server_with_gpu.py from the project venv with all model layers offloaded to the GPU.
+
+echo ========================================
+echo Agent Arena - GPU IPC Server
+echo ========================================
+echo.
+REM The script directory path already ends with a backslash, so "python" is appended directly. Abort if it is missing.
+cd /d "%~dp0python" || (echo ERROR: Cannot change directory to "%~dp0python" & pause & exit /b 1)
+
+REM Check if venv exists (path is relative to the python folder entered above)
+if not exist "venv\" (
+    echo ERROR: Python virtual environment not found!
+    echo Please run: python -m venv venv
+    echo Then install dependencies: venv\Scripts\pip install -r requirements.txt
+    pause
+    exit /b 1
+)
+
+REM Activate venv so the python command below resolves to the project environment
+echo Activating Python virtual environment...
+call venv\Scripts\activate.bat
+
+REM Check that required packages import. stderr stays visible so the real cause, such as a DLL load error, is not hidden.
+python -c "import fastapi, uvicorn, llama_cpp"
+if errorlevel 1 (
+    echo.
+    echo ERROR: Required packages could not be imported - see the Python error above.
+    echo If packages are missing, install them with: pip install -r requirements.txt
+    pause
+    exit /b 1
+)
+
+echo.
+echo Starting GPU-Accelerated IPC Server...
+echo ========================================
+echo Model: Llama-2-7B Chat (Q4_K_M quantization)
+echo GPU Acceleration: ENABLED (all layers)
+echo Expected Speed: ~113 tokens/sec
+echo Server Address: http://127.0.0.1:5000
+echo.
+echo Tools Available: 15+ (movement, inventory, world query)
+echo Default Agent: gpu_agent_001
+echo ========================================
+echo.
+echo Press Ctrl+C to stop the server
+echo.
+
+python run_ipc_server_with_gpu.py --gpu-layers -1
+
+REM If server exits, pause so user can see error
+if errorlevel 1 (
+    echo.
+    echo Server exited with error!
+    pause
+)
diff --git a/TESTING_AGENT_WITH_GPU.md b/TESTING_AGENT_WITH_GPU.md
new file mode 100644
index 0000000..1155816
--- /dev/null
+++ b/TESTING_AGENT_WITH_GPU.md
@@ -0,0 +1,329 @@
+# Testing Agents with GPU-Accelerated Backend
+
+This guide shows how to run the full Godot + Python IPC setup with your GPU-accelerated llama.cpp backend.
+
+## Quick Start (Tool Execution Only)
+
+This tests that the IPC communication works without LLM agents:
+
+### Step 1: Start IPC Server
+```bash
+# From project root
+START_IPC_SERVER.bat
+```
+
+The server will start at `http://127.0.0.1:5000` and automatically register all tools (movement, inventory, world_query).
+
+### Step 2: Open Test Scene in Godot
+1. Open Godot editor
+2. Navigate to: `scenes/tests/test_tool_execution_simple.tscn`
+3. Press **F6** (Run Current Scene)
+
+### Step 3: Verify Results
+Check both consoles:
+- **Godot Console**: Shows test execution and results
+- **Python Console**: Shows tool execution logs
+
+**Expected Output (Python):**
+```
+2025-11-18 - ipc.server - INFO - Registered 15 tools
+2025-11-18 - ipc.server - INFO - Executing tool 'move_to' for agent...
+2025-11-18 - ipc.server - INFO - Tool 'move_to' executed: success=True
+```
+
+---
+
+## Full Agent Test (with GPU Backend)
+
+This tests agents making decisions with your GPU-accelerated LLM backend.
+
+### Prerequisites
+
+1. **GPU-accelerated backend working** ✅ (You already have this!)
+2. **IPC server modified to use LLM backend**
+3. **Test scene that triggers agent decisions**
+
+### Step 1: Create GPU-Enabled IPC Server Script
+
+Create `python/run_ipc_server_with_gpu.py`:
+
+```python
+"""
+IPC Server with GPU-accelerated agent backend.
+"""
+
+import argparse
+import logging
+import sys
+
+from agent_runtime.runtime import AgentRuntime
+from agent_runtime.agent import Agent
+from agent_runtime.tool_dispatcher import ToolDispatcher
+from backends import LlamaCppBackend, BackendConfig
+from ipc.server import create_server
+from tools import register_movement_tools, register_inventory_tools, register_world_query_tools
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Agent Arena IPC Server with GPU Backend")
+    parser.add_argument("--host", type=str, default="127.0.0.1")
+    parser.add_argument("--port", type=int, default=5000)
+    parser.add_argument("--workers", type=int, default=4)
+    parser.add_argument("--debug", action="store_true")
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="../models/llama-2-7b-chat.Q4_K_M.gguf",
+        help="Path to GGUF model file"
+    )
+    parser.add_argument(
+        "--gpu-layers",
+        type=int,
+        default=-1,
+        help="Number of layers to offload to GPU (-1 = all, 0 = CPU only)"
+    )
+
+    args = parser.parse_args()
+
+    if args.debug:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    logger.info("=" * 60)
+    logger.info("Agent Arena IPC Server (GPU-Accelerated)")
+    logger.info("=" * 60)
+    logger.info(f"Host: {args.host}")
+    logger.info(f"Port: {args.port}")
+    logger.info(f"Max Workers: {args.workers}")
+    logger.info(f"Model: {args.model}")
+    logger.info(f"GPU Layers: {args.gpu_layers}")
+    logger.info("=" * 60)
+
+    try:
+        # Create GPU-accelerated backend
+        backend_config = BackendConfig(
+            model_path=args.model,
+            temperature=0.7,
+            max_tokens=256,
+            n_gpu_layers=args.gpu_layers
+        )
+
+        logger.info("Loading GPU-accelerated LLM backend...")
+        backend = LlamaCppBackend(backend_config)
+        logger.info("✓ Backend loaded successfully")
+
+        # Create runtime
+        runtime = AgentRuntime(max_workers=args.workers)
+
+        # Create a test agent with GPU backend
+        tool_dispatcher = ToolDispatcher()
+        register_movement_tools(tool_dispatcher)
+        register_inventory_tools(tool_dispatcher)
+        register_world_query_tools(tool_dispatcher)
+
+        test_agent = Agent(
+            agent_id="gpu_agent_001",
+            backend=backend,
+            tools=list(tool_dispatcher.tools.keys()),
+            goals=["explore the world", "collect resources"]
+        )
+
+        runtime.register_agent(test_agent)
+        logger.info(f"✓ Registered agent '{test_agent.state.agent_id}' with GPU backend")
+
+        # Create and start server
+        server = create_server(runtime=runtime, host=args.host, port=args.port)
+        logger.info("Starting IPC server...")
+        server.run()
+
+    except KeyboardInterrupt:
+        logger.info("\nShutting down gracefully...")
+        if 'backend' in locals():
+            backend.unload()
+        sys.exit(0)
+    except Exception as e:
+        logger.error(f"Fatal error: {e}", exc_info=True)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
+```
+
+### Step 2: Create Batch File to Start GPU Server
+
+Create `START_GPU_IPC_SERVER.bat` in project root:
+
+```batch
+@echo off
+REM Agent Arena - GPU-Accelerated IPC Server Startup
+
+echo ========================================
+echo Agent Arena - GPU IPC Server
+echo ========================================
+echo.
+
+cd /d "%~dp0\python"
+
+REM Activate venv
+echo Activating Python virtual environment...
+call venv\Scripts\activate.bat
+
+echo.
+echo Starting GPU-Accelerated IPC Server...
+echo Model: Llama-2-7B (Q4_K_M)
+echo GPU Acceleration: ENABLED (all layers)
+echo Server: http://127.0.0.1:5000
+echo.
+echo Press Ctrl+C to stop the server
+echo ========================================
+echo.
+
+python run_ipc_server_with_gpu.py --gpu-layers -1
+
+if errorlevel 1 (
+    echo.
+    echo Server exited with error!
+    pause
+)
+```
+
+### Step 3: Test with Godot Scene
+
+**Option A: Use existing test scene**
+1. Start GPU server: `START_GPU_IPC_SERVER.bat`
+2. Open `scenes/tests/test_tool_execution_simple.tscn`
+3. Press F6
+4. Tools will execute (no LLM needed)
+
+**Option B: Create agent decision scene**
+
+You'll need to modify one of the benchmark scenes to:
+1. Register an agent via `/agents/register` endpoint
+2. Send observations via `/tick` endpoint
+3. Receive agent's LLM-driven action decision
+
+Example GDScript:
+```gdscript
+extends Node
+
+var http_client := HTTPRequest.new()
+var agent_id = "gpu_agent_001"
+
+func _ready():
+    add_child(http_client)
+    http_client.request_completed.connect(_on_request_completed)
+
+    # Send observation to agent
+    var observation = {
+        "tick": 0,
+        "perceptions": [{
+            "agent_id": agent_id,
+            "position": [0, 0, 0],
+            "visible_entities": [
+                {"type": "wood", "distance": 5.0}
+            ],
+            "inventory": []
+        }]
+    }
+
+    var json = JSON.stringify(observation)
+    http_client.request(
+        "http://127.0.0.1:5000/tick",
+        ["Content-Type: application/json"],
+        HTTPClient.METHOD_POST,
+        json
+    )
+
+func _on_request_completed(result, response_code, headers, body):
+    var json = JSON.parse_string(body.get_string_from_utf8())
+    print("Agent decision: ", json)
+```
+
+---
+
+## Testing Workflow
+
+### 1. Test IPC Server (No LLM)
+```bash
+START_IPC_SERVER.bat
+# Run: scenes/tests/test_tool_execution_simple.tscn
+```
+**Verifies:** Tool execution works ✓
+
+### 2. Test GPU Backend (Python Only)
+```bash
+cd python
+venv\Scripts\activate
+python test_agent_gpu.py
+```
+**Verifies:** GPU backend + agent decisions work ✓
+
+### 3. Test Full Integration (Godot + Python + GPU)
+```bash
+START_GPU_IPC_SERVER.bat
+# Run modified scene that sends /tick requests
+```
+**Verifies:** End-to-end agent pipeline works ✓
+
+---
+
+## Performance Expectations
+
+With GPU acceleration enabled:
+- **LLM Speed**: ~113 tokens/sec
+- **Decision Time**: ~1-2 seconds per action
+- **Recommended Tick Rate**: 0.5-1 Hz (one decision every 1-2 seconds)
+
+Without GPU (CPU only):
+- **LLM Speed**: ~9 tokens/sec
+- **Decision Time**: ~15-20 seconds per action
+- **Not recommended** for real-time simulation
+
+---
+
+## Troubleshooting
+
+### Server won't start
+- Check Python venv is activated
+- Verify model exists: `models/llama-2-7b-chat.Q4_K_M.gguf`
+- Check that the CUDA Toolkit `bin` directory is on `PATH` (see `docs/llama_cpp_gpu_setup.md`)
+
+### Agent not responding
+- Verify agent registered: Check server logs for "Registered agent"
+- Send observation to `/tick` endpoint
+- Check both consoles for errors
+
+### GPU not being used
+- Check server startup logs for "Offloading all layers to GPU"
+- Verify CUDA toolkit installed
+- Monitor GPU usage: `nvidia-smi`
+
+### Slow responses
+- Check GPU utilization in `nvidia-smi`
+- Verify `n_gpu_layers=-1` (all layers on GPU)
+- Reduce `max_tokens` parameter (currently 256)
+
+---
+
+## Next Steps
+
+1. **Modify existing benchmark scenes** to send `/tick` requests
+2. **Create custom test scene** for agent decision-making
+3. **Add agent registration** in scene `_ready()` function
+4. **Implement perception loop** (Godot → Python observations)
+5. **Handle action responses** (Python → Godot actions)
+
+## Current Status
+
+✅ GPU backend working (113 tok/s)
+✅ IPC server working (tool execution)
+✅ Python agent test working (all 3 scenarios)
+⏳ **TODO**: Connect agents to IPC `/tick` endpoint
+⏳ **TODO**: Modify Godot scenes to use agent decisions
diff --git a/docs/llama_cpp_gpu_setup.md b/docs/llama_cpp_gpu_setup.md
new file mode 100644
index 0000000..8714616
--- /dev/null
+++ b/docs/llama_cpp_gpu_setup.md
@@ -0,0 +1,189 @@
+# GPU Acceleration for llama.cpp Backend
+
+This guide explains how to enable GPU acceleration for the llama.cpp backend on Windows.
+
+## Current Status
+
+- ✅ RTX 3090 with 24GB VRAM detected
+- ✅ CUDA 12.9 driver installed
+- ✅ Backend code updated to support `n_gpu_layers` parameter
+- ⚠️ CUDA-enabled llama-cpp-python requires additional setup
+
+## Why GPU Acceleration?
+
+With your RTX 3090, you can expect:
+- **10-50x faster** inference compared to CPU
+- **Lower latency** for real-time agent responses
+- **Larger models** can fit in VRAM
+
+## Setup Options
+
+### Option 1: Install CUDA Toolkit (Recommended)
+
+The pre-built CUDA wheels require CUDA runtime libraries.
+
+1. **Download CUDA Toolkit 12.x**:
+   - Visit: https://developer.nvidia.com/cuda-downloads
+   - Select: Windows → x86_64 → 12.6 or 12.9
+   - Download and install (Base Installer, ~3GB)
+
+2. **Install llama-cpp-python with CUDA**:
+   ```bash
+   cd python
+   venv\Scripts\activate
+   pip uninstall llama-cpp-python
+   pip install llama-cpp-python==0.3.4 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122
+   ```
+
+3. **Test GPU acceleration**:
+   ```bash
+   python test_llama_gpu.py
+   ```
+
+### Option 2: Use vLLM (Production Alternative)
+
+For maximum GPU performance on Linux or WSL2:
+
+```bash
+# In WSL2 or Linux
+pip install vllm
+python run_vllm_server.py --model meta-llama/Llama-2-7b-chat-hf
+```
+```python
+from backends import VLLMBackend, VLLMBackendConfig  # connect from Windows
+config = VLLMBackendConfig(api_base="http://localhost:8000/v1")
+backend = VLLMBackend(config)
+```
+
+### Option 3: llama.cpp Standalone (Advanced)
+
+Build llama.cpp with CUDA support directly:
+
+```bash
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+cmake -B build -DGGML_CUDA=ON
+cmake --build build --config Release
+```
+
+Then use the compiled `llama-cli.exe` or `llama-server.exe`.
+
+## Configuration
+
+Once CUDA is set up, configure GPU layers in your code:
+
+```python
+from backends import LlamaCppBackend, BackendConfig
+
+# Full GPU offload (recommended for RTX 3090)
+config = BackendConfig(
+    model_path="../models/llama-2-7b-chat.Q4_K_M.gguf",
+    temperature=0.7,
+    max_tokens=512,
+    n_gpu_layers=-1,  # -1 = all layers to GPU
+)
+
+backend = LlamaCppBackend(config)
+```
+
+**GPU Layer Options:**
+- `n_gpu_layers=0`: CPU only (current default)
+- `n_gpu_layers=20`: Offload 20 layers to GPU (hybrid)
+- `n_gpu_layers=-1`: Offload all layers to GPU (fastest)
+
+## Expected Performance
+
+With RTX 3090 and full GPU offload:
+
+| Model | Quantization | CPU Speed | GPU Speed | Speedup |
+|-------|--------------|-----------|-----------|---------|
+| Llama-2-7B | Q4_K_M | ~9 tok/s | ~100+ tok/s | 10-15x |
+| Llama-2-13B | Q4_K_M | ~4 tok/s | ~60+ tok/s | 15-20x |
+| Llama-2-70B | Q4_K_M | N/A | ~20 tok/s | - |
+
+## Testing GPU Acceleration
+
+Use the provided test script:
+
+```bash
+cd python
+venv\Scripts\activate
+python test_llama_gpu.py
+```
+
+This will compare:
+1. CPU-only inference
+2. Partial GPU offload (20 layers)
+3. Full GPU offload (all layers)
+
+## Troubleshooting
+
+### Error: "Could not find module llama.dll"
+
+**Cause**: CUDA runtime DLLs not found in PATH.
+
+**Solution**: Install CUDA Toolkit (Option 1 above) or add CUDA bin directory to PATH:
+```
+C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.x\bin
+```
+
+### Error: "CUDA out of memory"
+
+**Cause**: Model too large for VRAM.
+
+**Solutions**:
+- Use smaller quantization (Q4_K_M instead of Q8)
+- Reduce `n_ctx` (context window)
+- Use partial GPU offload (e.g., `n_gpu_layers=20`)
+
+### GPU not being used (nvidia-smi shows 0% usage)
+
+**Cause**: `n_gpu_layers=0` (CPU-only mode).
+
+**Solution**: Set `n_gpu_layers=-1` in BackendConfig.
+
+### Slow first inference
+
+**Cause**: GPU kernel compilation on first run.
+
+**Solution**: This is normal. Subsequent inferences will be fast.
+
+## Current CPU Performance
+
+Without GPU acceleration, your current setup achieves:
+- **~9 tokens/second** with Q4_K_M quantization
+- **~110ms per token** generation time
+- Works reliably for development and testing
+
+## Verification
+
+Check if CUDA support is available:
+
+```python
+from backends import LlamaCppBackend, BackendConfig
+
+config = BackendConfig(
+    model_path="../models/llama-2-7b-chat.Q4_K_M.gguf",
+    n_gpu_layers=-1,
+)
+
+try:
+    backend = LlamaCppBackend(config)
+    print("✓ GPU acceleration is working!")
+except Exception as e:
+    print(f"✗ GPU error: {e}")
+    print("Falling back to CPU mode...")
+```
+
+## Next Steps
+
+1. **For local development**: Continue using CPU mode (works well)
+2. **For production**: Install CUDA Toolkit for GPU acceleration
+3. **For maximum performance**: Use vLLM on Linux/WSL2
+
+## Additional Resources
+
+- [llama.cpp Documentation](https://github.com/ggerganov/llama.cpp)
+- [llama-cpp-python GPU Guide](https://llama-cpp-python.readthedocs.io/en/latest/)
+- [CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads)
+- [vLLM Documentation](https://docs.vllm.ai/)
diff --git a/docs/llama_cpp_windows_setup.md b/docs/llama_cpp_windows_setup.md
new file mode 100644
index 0000000..3b090f3
--- /dev/null
+++ b/docs/llama_cpp_windows_setup.md
@@ -0,0 +1,302 @@
+# llama.cpp Backend Setup for Windows
+
+This guide shows how to set up and use the llama.cpp backend for local development on Windows.
+
+## Overview
+
+llama.cpp provides efficient CPU and GPU inference for LLaMA models using GGUF format. It's perfect for:
+- Local development on Windows
+- CPU-only inference (no CUDA required)
+- Low memory usage with quantized models
+- Quick prototyping
+
+## Prerequisites
+
+- ✅ Python 3.11 (already installed)
+- ✅ llama-cpp-python (already installed)
+- ✅ GGUF model file
+
+## Model Setup
+
+### 1. Download a GGUF Model
+
+You've already downloaded: `llama-2-7b-chat.Q4_K_M.gguf` (3.9GB)
+
+Place it in the `models/` directory:
+```
+AgentArena/
+├── models/
+│   └── llama-2-7b-chat.Q4_K_M.gguf
+```
+
+### 2. Model Quantization Levels
+
+GGUF models come in different quantization levels:
+
+| Quantization | File Size | Quality | Speed |
+|--------------|-----------|---------|-------|
+| Q2_K | ~2.5GB | Lower | Fastest |
+| Q4_K_M | ~3.9GB | Good | Fast |
+| Q5_K_M | ~4.8GB | Better | Medium |
+| Q8_0 | ~7GB | Best | Slower |
+
+**Q4_K_M** is the recommended balance for most use cases.
+
+## Configuration
+
+### Using Python Code
+
+```python
+from backends import LlamaCppBackend, BackendConfig
+
+config = BackendConfig(
+    model_path="models/llama-2-7b-chat.Q4_K_M.gguf",
+    temperature=0.7,
+    max_tokens=512,
+    top_p=0.9,
+    top_k=40,
+)
+
+backend = LlamaCppBackend(config)
+```
+
+### Using Hydra Config
+
+Edit `configs/backend/llama_cpp.yaml`:
+
+```yaml
+backend:
+  type: llama_cpp
+  model_path: "models/llama-2-7b-chat.Q4_K_M.gguf"
+
+  n_ctx: 4096  # Context window
+  n_threads: 8  # CPU threads (adjust based on your CPU)
+  n_gpu_layers: 0  # 0 = CPU only, 35 = full GPU offload
+
+  temperature: 0.7
+  max_tokens: 512
+```
+
+## Quick Start
+
+### 1. Run the Test Script
+
+```bash
+cd python
+venv\Scripts\activate
+python test_llama_backend.py
+```
+
+This will test:
+- Basic text generation
+- Function/tool calling
+- Different temperature settings
+- Multi-turn conversations
+
+### 2. Use in Your Code
+
+```python
+from backends import LlamaCppBackend, BackendConfig
+
+# Initialize
+config = BackendConfig(
+    model_path="models/llama-2-7b-chat.Q4_K_M.gguf",
+    temperature=0.7,
+    max_tokens=256,
+)
+backend = LlamaCppBackend(config)
+
+# Generate text
+result = backend.generate("Hello! My name is")
+print(result.text)
+
+# Generate with tools
+tools = [
+    {
+        "name": "move_to",
+        "description": "Move to coordinates",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "target": {
+                    "type": "array",
+                    "items": {"type": "number"}
+                }
+            }
+        }
+    }
+]
+
+result = backend.generate_with_tools(
+    "Move to position (10, 20, 5)",
+    tools
+)
+
+if "parsed_tool_call" in result.metadata:
+    print(f"Tool: {result.metadata['parsed_tool_call']}")
+```
+
+## GPU Acceleration (Optional)
+
+If you have an NVIDIA GPU with CUDA, you can offload layers to GPU:
+
+### 1. Install CUDA-enabled llama-cpp-python
+
+```bash
+# Uninstall CPU version
+pip uninstall llama-cpp-python
+
+# Install with CUDA support (requires CUDA 11.x or 12.x)
+set CMAKE_ARGS=-DGGML_CUDA=on
+pip install llama-cpp-python --force-reinstall --no-cache-dir
+```
+
+### 2. Update Configuration
+
+```python
+config = BackendConfig(
+    model_path="models/llama-2-7b-chat.Q4_K_M.gguf",
+    # ... other settings ...
+)
+
+# When creating backend, enable GPU layers
+from llama_cpp import Llama
+
+llm = Llama(
+    model_path=config.model_path,
+    n_ctx=4096,
+    n_threads=8,
+    n_gpu_layers=35,  # Offload all layers to GPU
+)
+```
+
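+Alternatively, pass `n_gpu_layers` through `BackendConfig` (for example `n_gpu_layers=-1`), which the backend supports as described in `docs/llama_cpp_gpu_setup.md`.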
+
+## Performance Tuning
+
+### CPU Threads
+
+Adjust `n_threads` based on your CPU:
+- **4-core CPU**: 4-6 threads
+- **8-core CPU**: 8-12 threads
+- **16-core CPU**: 12-16 threads
+
+```python
+# In llama_cpp_backend.py, line 30-34
+self.llm = Llama(
+    model_path=self.config.model_path,
+    n_ctx=4096,
+    n_threads=12,  # Adjust this
+    n_gpu_layers=0,
+)
+```
+
+### Context Window
+
+Reduce `n_ctx` if running out of memory:
+- **4096**: Full context (default)
+- **2048**: Half context, less memory
+- **1024**: Quarter context, minimal memory
+
+### Batch Size
+
+Adjust `n_batch` for prompt processing speed:
+- **512**: Default, good balance
+- **128**: Lower memory, slower
+- **1024**: More memory, faster
+
+## Troubleshooting
+
+### Model Loading is Slow
+
+**Expected behavior**: First load takes 10-30 seconds for Q4_K_M model.
+
+**Solutions**:
+- Use `use_mmap=True` (default) for faster loading
+- Keep the model loaded between requests
+- Use a smaller quantization (Q2_K)
+
+### Out of Memory
+
+```
+RuntimeError: Failed to allocate memory
+```
+
+**Solutions**:
+- Reduce `n_ctx` to 2048 or 1024
+- Close other applications
+- Use a smaller model or quantization
+
+### Slow Generation
+
+**Solutions**:
+- Increase `n_threads` up to your CPU core count
+- Enable GPU offload with `n_gpu_layers`
+- Reduce `max_tokens`
+- Use a smaller model
+
+### Import Error
+
+```
+ModuleNotFoundError: No module named 'llama_cpp'
+```
+
+**Solution**:
+```bash
+cd python
+venv\Scripts\activate
+pip install llama-cpp-python
+```
+
+## Example Use Cases
+
+### Agent Decision Making
+
+```python
+def get_agent_action(observation):
+    prompt = f"""You are an AI agent in a game world.
+
+Current observation: {observation}
+
+Available actions:
+- move_to(x, y, z): Move to coordinates
+- pickup_item(name): Pick up an item
+- use_item(name): Use an item from inventory
+
+What action should you take? Respond with JSON:
+{{"action": "action_name", "params": {{}}, "reasoning": "why"}}
+"""
+
+    result = backend.generate(prompt, temperature=0.5, max_tokens=200)
+    return result.text
+```
+
+### Conversation System
+
+```python
+def chat_with_agent(messages):
+    # Format conversation for Llama-2 chat format
+    prompt = ""
+    for msg in messages:
+        if msg["role"] == "user":
+            prompt += f"[INST] {msg['content']} [/INST]"
+        else:
+            prompt += f" {msg['content']}</s>"
+
+    result = backend.generate(prompt, max_tokens=300)
+    return result.text
+```
+
+## Next Steps
+
+1. ✅ Test the backend with `python test_llama_backend.py`
+2. Integrate with your agent runtime
+3. Experiment with different prompts and temperatures
+4. Consider GPU acceleration for production use
+
+## Resources
+
+- [llama.cpp GitHub](https://github.com/ggerganov/llama.cpp)
+- [llama-cpp-python Documentation](https://llama-cpp-python.readthedocs.io/)
+- [GGUF Model Download](https://huggingface.co/TheBloke)
+- [Model Quantization Guide](https://github.com/ggerganov/llama.cpp#quantization)
diff --git a/docs/vllm_backend.md b/docs/vllm_backend.md
new file mode 100644
index 0000000..ebbe3ec
--- /dev/null
+++ b/docs/vllm_backend.md
@@ -0,0 +1,311 @@
+# vLLM Backend Integration
+
+This document describes how to use the vLLM backend for high-throughput LLM inference in Agent Arena.
+
+## Overview
+
+vLLM is a high-performance inference engine optimized for serving large language models at scale. It provides:
+
+- **High throughput**: PagedAttention and continuous batching
+- **OpenAI-compatible API**: Drop-in replacement for OpenAI API
+- **Multiple model support**: Llama, Mistral, Qwen, and more
+- **Function calling**: Native support for tool/function calling
+- **GPU acceleration**: Optimized CUDA kernels
+
+## Requirements
+
+### System Requirements
+
+- **GPU**: NVIDIA GPU with CUDA support (8GB+ VRAM recommended)
+- **CUDA**: Version 11.8 or 12.1
+- **Python**: 3.8-3.11
+- **OS**: Linux (recommended) or Windows with WSL2
+
+### Installation
+
+```bash
+# Activate your virtual environment
+cd python
+venv\Scripts\activate  # Windows
+# source venv/bin/activate  # Linux/Mac
+
+# Install vLLM (requires CUDA)
+pip install vllm
+
+# For specific CUDA version (e.g., CUDA 12.1)
+pip install vllm-cuda121
+```
+
+**Note**: vLLM requires a CUDA-capable GPU. It does not support CPU-only inference.
+
+## Starting the vLLM Server
+
+### Option 1: Using the helper script
+
+```bash
+cd python
+python run_vllm_server.py --model meta-llama/Llama-2-7b-chat-hf --port 8000
+```
+
+### Option 2: Direct command
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --port 8000 \
+    --gpu-memory-utilization 0.9 \
+    --max-model-len 4096
+```
+
+### Common Arguments
+
+- `--model`: Model name from HuggingFace or local path
+- `--port`: Server port (default: 8000)
+- `--tensor-parallel-size`: Number of GPUs to use
+- `--gpu-memory-utilization`: GPU memory fraction (0.0-1.0)
+- `--max-model-len`: Maximum context length
+- `--dtype`: Data type (auto, half, float16, bfloat16, float32)
+
+## Configuration
+
+### Hydra Config File
+
+Edit `configs/backend/vllm.yaml`:
+
+```yaml
+backend:
+  type: vllm
+  model: "meta-llama/Llama-2-7b-chat-hf"
+
+  # Server settings
+  host: "localhost"
+  port: 8000
+  api_base: "http://localhost:8000/v1"
+
+  # Model parameters
+  tensor_parallel_size: 1  # Number of GPUs
+  dtype: "auto"
+  max_model_len: 4096
+  gpu_memory_utilization: 0.9
+
+  # Generation
+  temperature: 0.7
+  top_p: 0.9
+  max_tokens: 512
+
+  # Function calling
+  function_calling:
+    enabled: true
+    format: "json"
+```
+
+### Python Code
+
+```python
+from backends import VLLMBackend, VLLMBackendConfig
+
+# Create configuration
+config = VLLMBackendConfig(
+    model_path="meta-llama/Llama-2-7b-chat-hf",
+    api_base="http://localhost:8000/v1",
+    temperature=0.7,
+    max_tokens=512,
+)
+
+# Initialize backend
+backend = VLLMBackend(config)
+
+# Check availability
+if backend.is_available():
+    print("vLLM server is ready!")
+
+# Generate text
+result = backend.generate("Hello, my name is")
+print(result.text)
+
+# Generate with tools
+tools = [
+    {
+        "name": "move_to",
+        "description": "Move agent to coordinates",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "target": {
+                    "type": "array",
+                    "items": {"type": "number"},
+                    "description": "Target [x, y, z] coordinates"
+                }
+            },
+            "required": ["target"]
+        }
+    }
+]
+
+result = backend.generate_with_tools(
+    "I need to move to coordinates (10, 20, 5)",
+    tools
+)
+
+if "tool_call" in result.metadata:
+    print(f"Tool: {result.metadata['tool_call']['name']}")
+    print(f"Arguments: {result.metadata['tool_call']['arguments']}")
+```
+
+## Supported Models
+
+vLLM supports many model architectures. Popular choices:
+
+### Llama Models
+- `meta-llama/Llama-2-7b-chat-hf`
+- `meta-llama/Llama-2-13b-chat-hf`
+- `meta-llama/Meta-Llama-3-8B-Instruct`
+
+### Mistral Models
+- `mistralai/Mistral-7B-Instruct-v0.2`
+- `mistralai/Mixtral-8x7B-Instruct-v0.1`
+
+### Qwen Models
+- `Qwen/Qwen2-7B-Instruct`
+- `Qwen/Qwen2.5-7B-Instruct`
+
+### Function Calling Models
+For best function calling support, use models trained for tool use:
+- `NousResearch/Hermes-2-Pro-Llama-3-8B`
+- `gorilla-llm/gorilla-openfunctions-v2`
+
+## Performance Tuning
+
+### GPU Memory
+
+Adjust `gpu_memory_utilization` based on your VRAM:
+
+- **8GB GPU**: 0.7-0.8
+- **16GB GPU**: 0.85-0.9
+- **24GB+ GPU**: 0.9-0.95
+
+### Multi-GPU
+
+For multiple GPUs, use tensor parallelism:
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+    --model meta-llama/Llama-2-13b-chat-hf \
+    --tensor-parallel-size 2  # Use 2 GPUs
+```
+
+### Context Length
+
+Reduce `max_model_len` if running out of memory:
+
+```yaml
+max_model_len: 2048  # Instead of 4096
+```
+
+## Function Calling
+
+vLLM supports OpenAI-style function calling for compatible models.
+
+### Native Function Calling
+
+```python
+tools = [
+    {
+        "name": "get_weather",
+        "description": "Get current weather",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "location": {"type": "string"}
+            }
+        }
+    }
+]
+
+result = backend.generate_with_tools(
+    "What's the weather in Paris?",
+    tools
+)
+
+if "tool_call" in result.metadata:
+    tool_call = result.metadata["tool_call"]
+    print(f"Calling {tool_call['name']} with {tool_call['arguments']}")
+```
+
+### Fallback Method
+
+If the model doesn't support native function calling, the backend automatically falls back to prompt-based tool calling.
+
+## Troubleshooting
+
+### Server Not Starting
+
+```
+Error: CUDA out of memory
+```
+
+**Solution**: Reduce `gpu_memory_utilization` or `max_model_len`
+
+### Connection Refused
+
+```
+ConnectionError: Cannot connect to vLLM server
+```
+
+**Solution**:
+1. Check if server is running: `curl http://localhost:8000/v1/models`
+2. Verify port is correct
+3. Check firewall settings
+
+### Slow Generation
+
+**Solutions**:
+- Enable tensor parallelism for multi-GPU
+- Reduce `max_model_len`
+- Use quantized models (e.g., AWQ, GPTQ)
+- Check GPU utilization with `nvidia-smi`
+
+### Model Not Found
+
+```
+OSError: meta-llama/Llama-2-7b-chat-hf is not a local folder
+```
+
+**Solution**:
+1. Model will be downloaded from HuggingFace on first run
+2. Ensure you have a HuggingFace token for gated models
+3. Or download manually and use local path
+
+## Testing
+
+Run the vLLM backend tests:
+
+```bash
+# Start vLLM server first (run from the python/ directory, where the helper script lives)
+python run_vllm_server.py --model meta-llama/Llama-2-7b-chat-hf
+
+# In another terminal
+cd python
+venv\Scripts\activate
+pytest ../tests/test_vllm_backend.py -v
+```
+
+Tests will be skipped if the server is not available.
+
+## Comparison with llama.cpp
+
+| Feature | vLLM | llama.cpp |
+|---------|------|-----------|
+| **Performance** | High throughput, optimized for serving | Good single-request performance |
+| **Hardware** | Requires CUDA GPU | CPU + optional GPU |
+| **Memory** | Higher VRAM usage | Lower memory footprint |
+| **Batching** | Continuous batching | Manual batching |
+| **Setup** | Requires server | Direct library |
+| **Use Case** | Production serving, multiple agents | Development, single agent |
+
+## References
+
+- [vLLM Documentation](https://docs.vllm.ai/)
+- [vLLM GitHub](https://github.com/vllm-project/vllm)
+- [OpenAI API Compatibility](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html)
+- [Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)
diff --git a/godot/include/agent_arena.h b/godot/include/agent_arena.h
index 6293e51..b80fe14 100644
--- a/godot/include/agent_arena.h
+++ b/godot/include/agent_arena.h
@@ -180,12 +180,14 @@ class IPCClient : public godot::Node {
 private:
     godot::String server_url;
     godot::HTTPRequest* http_request;
+    godot::HTTPRequest* http_request_tool;  // Separate request node for tool execution
     bool is_connected;
     uint64_t current_tick;
     godot::Dictionary pending_response;
     bool response_received;
 
     void _on_request_completed(int result, int response_code, const godot::PackedStringArray& headers, const godot::PackedByteArray& body);
+    void _on_tool_request_completed(int result, int response_code, const godot::PackedStringArray& headers, const godot::PackedByteArray& body);
 
 protected:
     static void _bind_methods();
diff --git a/godot/src/agent_arena.cpp b/godot/src/agent_arena.cpp
index d281029..5aa2e7b 100644
--- a/godot/src/agent_arena.cpp
+++ b/godot/src/agent_arena.cpp
@@ -362,6 +362,7 @@ void ToolRegistry::set_ipc_client(IPCClient* client) {
 IPCClient::IPCClient()
     : server_url("http://127.0.0.1:5000"),
       http_request(nullptr),
+      http_request_tool(nullptr),
       is_connected(false),
       current_tick(0),
       response_received(false) {
@@ -371,6 +372,9 @@ IPCClient::~IPCClient() {
     if (http_request != nullptr) {
         http_request->queue_free();
     }
+    if (http_request_tool != nullptr) {
+        http_request_tool->queue_free();
+    }
 }
 
 void IPCClient::_bind_methods() {
@@ -390,6 +394,8 @@ void IPCClient::_bind_methods() {
 
     ClassDB::bind_method(D_METHOD("_on_request_completed", "result", "response_code", "headers", "body"),
                          &IPCClient::_on_request_completed);
+    ClassDB::bind_method(D_METHOD("_on_tool_request_completed", "result", "response_code", "headers", "body"),
+                         &IPCClient::_on_tool_request_completed);
 
     ADD_PROPERTY(PropertyInfo(Variant::STRING, "server_url"), "set_server_url", "get_server_url");
 
@@ -398,7 +404,7 @@ void IPCClient::_bind_methods() {
 }
 
 void IPCClient::_ready() {
-    // Create HTTPRequest node
+    // Create HTTPRequest node for general requests (health check, tick)
     http_request = memnew(HTTPRequest);
     add_child(http_request);
 
@@ -406,6 +412,14 @@ void IPCClient::_ready() {
     http_request->connect("request_completed",
                          Callable(this, "_on_request_completed"));
 
+    // Create separate HTTPRequest node for tool execution
+    http_request_tool = memnew(HTTPRequest);
+    add_child(http_request_tool);
+
+    // Connect signal for tool requests
+    http_request_tool->connect("request_completed",
+                               Callable(this, "_on_tool_request_completed"));
+
     UtilityFunctions::print("IPCClient initialized with server URL: ", server_url);
 }
 
@@ -515,6 +529,42 @@ void IPCClient::_on_request_completed(int result, int response_code,
     }
 }
 
+// Completion handler for the dedicated tool-execution HTTPRequest node. A 200
+// response holding a JSON object is logged and emitted as "response_received".
+void IPCClient::_on_tool_request_completed(int result, int response_code,
+                                           const PackedStringArray& headers,
+                                           const PackedByteArray& body) {
+    UtilityFunctions::print("[C++] Tool request callback triggered - result: ", result, ", code: ", response_code);
+
+    // Transport-level failure: there is no body worth parsing.
+    if (result != HTTPRequest::RESULT_SUCCESS) {
+        UtilityFunctions::print("Tool HTTP Request failed with result: ", result);
+        return;
+    }
+
+    // Any status other than 200 is reported and dropped.
+    if (response_code != 200) {
+        UtilityFunctions::print("Tool HTTP request returned error code: ", response_code);
+        return;
+    }
+
+    JSON parser;
+    if (parser.parse(body.get_string_from_utf8()) != OK) {
+        UtilityFunctions::print("Failed to parse tool response JSON");
+        return;
+    }
+
+    Variant payload = parser.get_data();
+    if (payload.get_type() != Variant::DICTIONARY) {
+        UtilityFunctions::print("Invalid tool response JSON format");
+        return;
+    }
+
+    Dictionary tool_response = payload;
+    UtilityFunctions::print("Tool execution response received: ", tool_response);
+    emit_signal("response_received", tool_response);
+}
+
 Dictionary IPCClient::execute_tool_sync(const String& tool_name, const Dictionary& params,
                                         const String& agent_id, uint64_t tick) {
     Dictionary result;
@@ -532,12 +582,12 @@ Dictionary IPCClient::execute_tool_sync(const String& tool_name, const Dictionar
 
     String json_str = JSON::stringify(request_dict);
 
-    // Send POST request using main http_request
+    // Send POST request using separate http_request_tool to avoid conflicts
     String url = server_url + "/tools/execute";
     PackedStringArray headers;
     headers.append("Content-Type: application/json");
 
-    Error err = http_request->request(url, headers, HTTPClient::METHOD_POST, json_str);
+    Error err = http_request_tool->request(url, headers, HTTPClient::METHOD_POST, json_str);
 
     if (err != OK) {
         UtilityFunctions::print("Error sending tool execution request: ", err);
diff --git a/python/backends/__init__.py b/python/backends/__init__.py
index f404cb2..b4c4f30 100644
--- a/python/backends/__init__.py
+++ b/python/backends/__init__.py
@@ -2,7 +2,8 @@
 LLM Backend Adapters for Agent Arena
 """
 
-from .base import BaseBackend
+from .base import BaseBackend, BackendConfig
 from .llama_cpp_backend import LlamaCppBackend
+from .vllm_backend import VLLMBackend, VLLMBackendConfig
 
-__all__ = ["BaseBackend", "LlamaCppBackend"]
+__all__ = ["BaseBackend", "BackendConfig", "LlamaCppBackend", "VLLMBackend", "VLLMBackendConfig"]
diff --git a/python/backends/base.py b/python/backends/base.py
index 8439f7e..cf55cb1 100644
--- a/python/backends/base.py
+++ b/python/backends/base.py
@@ -16,6 +16,7 @@ class BackendConfig:
     max_tokens: int = 512
     top_p: float = 0.9
     top_k: int = 40
+    n_gpu_layers: int = 0  # Number of layers to offload to GPU (0 = CPU only, -1 = all)
 
 
 @dataclass
diff --git a/python/backends/llama_cpp_backend.py b/python/backends/llama_cpp_backend.py
index 2a9dc70..3437e3a 100644
--- a/python/backends/llama_cpp_backend.py
+++ b/python/backends/llama_cpp_backend.py
@@ -27,11 +27,22 @@ def _load_model(self) -> None:
 
             logger.info(f"Loading model from {self.config.model_path}")
 
+            # Use GPU layers from config
+            n_gpu_layers = getattr(self.config, 'n_gpu_layers', 0)
+
+            if n_gpu_layers > 0:
+                logger.info(f"Offloading {n_gpu_layers} layers to GPU")
+            elif n_gpu_layers == -1:
+                logger.info("Offloading all layers to GPU")
+            else:
+                logger.info("Using CPU only (no GPU offload)")
+
             self.llm = Llama(
                 model_path=self.config.model_path,
                 n_ctx=4096,  # Context window
                 n_threads=8,  # CPU threads
-                n_gpu_layers=0,  # GPU layers (0 = CPU only)
+                n_gpu_layers=n_gpu_layers,  # GPU layers (0 = CPU only, -1 = all)
+                verbose=False,  # Reduce output noise
             )
 
             logger.info("Model loaded successfully")
diff --git a/python/backends/vllm_backend.py b/python/backends/vllm_backend.py
new file mode 100644
index 0000000..c543318
--- /dev/null
+++ b/python/backends/vllm_backend.py
@@ -0,0 +1,344 @@
+"""
+vLLM backend adapter using OpenAI-compatible API.
+
+vLLM is a high-throughput inference engine that provides an OpenAI-compatible
+REST API. This backend connects to a vLLM server instance.
+"""
+
+import json
+import logging
+from typing import Any
+
+from openai import OpenAI
+
+from .base import BackendConfig, BaseBackend, GenerationResult
+
+logger = logging.getLogger(__name__)
+
+
+class VLLMBackendConfig(BackendConfig):
+    """Backend settings plus the connection details of a vLLM server."""
+
+    def __init__(
+        self,
+        model_path: str,
+        api_base: str = "http://localhost:8000/v1",
+        api_key: str = "EMPTY",
+        temperature: float = 0.7,
+        max_tokens: int = 512,
+        top_p: float = 0.9,
+        top_k: int = 40,
+    ):
+        """
+        Build a config for talking to a vLLM server.
+
+        Args:
+            model_path: Model identifier
+                (e.g., "meta-llama/Llama-2-7b-chat-hf")
+            api_base: Base URL of the vLLM server
+            api_key: API key; defaults to the "EMPTY" placeholder
+            temperature: Sampling temperature
+            max_tokens: Maximum tokens to generate
+            top_p: Nucleus sampling parameter
+            top_k: Top-k sampling parameter
+        """
+        # Sampling settings are regular BackendConfig fields.
+        sampling = dict(
+            temperature=temperature, max_tokens=max_tokens, top_p=top_p, top_k=top_k
+        )
+        super().__init__(model_path=model_path, **sampling)
+        # Connection settings live on this subclass only.
+        self.api_base = api_base
+        self.api_key = api_key
+
+
+class VLLMBackend(BaseBackend):
+    """
+    Backend adapter for vLLM inference server.
+
+    This backend connects to a running vLLM server using the OpenAI-compatible API.
+    The vLLM server must be started separately before using this backend.
+
+    Example:
+        Start vLLM server:
+        ```bash
+        python -m vllm.entrypoints.openai.api_server \\
+            --model meta-llama/Llama-2-7b-chat-hf \\
+            --port 8000
+        ```
+
+        Then use this backend:
+        ```python
+        config = VLLMBackendConfig(
+            model_path="meta-llama/Llama-2-7b-chat-hf",
+            api_base="http://localhost:8000/v1"
+        )
+        backend = VLLMBackend(config)
+        result = backend.generate("Hello, world!")
+        ```
+    """
+
+    def __init__(self, config: VLLMBackendConfig):
+        """
+        Store the configuration and open a client for the vLLM server.
+
+        Args:
+            config: Connection and sampling settings for this backend
+        """
+        super().__init__(config)
+        self.client: OpenAI | None = None  # assigned by _connect()
+        self.config: VLLMBackendConfig = config  # re-annotated with the narrower type
+        self._connect()
+
+    def _connect(self) -> None:
+        """Create the OpenAI client for the configured server and probe it.
+
+        A failing probe is only logged as a warning; a failure while creating
+        the client is logged as an error and re-raised.
+        """
+        cfg = self.config
+        try:
+            logger.info(f"Connecting to vLLM server at {cfg.api_base}")
+            self.client = OpenAI(api_key=cfg.api_key, base_url=cfg.api_base)
+        except Exception as e:
+            logger.error(f"Failed to connect to vLLM server: {e}")
+            raise
+
+        # Listing models doubles as a connectivity probe.
+        try:
+            model_ids = [m.id for m in self.client.models.list().data]
+            logger.info(f"Connected to vLLM. Available models: {model_ids}")
+        except Exception as e:
+            logger.warning(f"Could not list models (server may not be ready): {e}")
+
+    def generate(
+        self,
+        prompt: str,
+        temperature: float | None = None,
+        max_tokens: int | None = None,
+    ) -> GenerationResult:
+        """
+        Run a plain text completion against the vLLM server.
+
+        Args:
+            prompt: Text to complete
+            temperature: Per-call temperature; None uses the configured value
+            max_tokens: Per-call token cap; None uses the configured value
+
+        Returns:
+            GenerationResult with the completion text, total token count and
+            finish reason. On any request error the result instead has empty
+            text, finish_reason "error" and the message in metadata["error"].
+
+        Raises:
+            RuntimeError: If no client is set.
+        """
+        if not self.client:
+            raise RuntimeError("vLLM client not connected")
+
+        cfg = self.config
+        # Compare against None so an explicit 0 / 0.0 override is honoured.
+        if temperature is None:
+            temperature = cfg.temperature
+        if max_tokens is None:
+            max_tokens = cfg.max_tokens
+
+        try:
+            response = self.client.completions.create(
+                model=cfg.model_path,
+                prompt=prompt,
+                temperature=temperature,
+                max_tokens=max_tokens,
+                top_p=cfg.top_p,
+                extra_body={"top_k": cfg.top_k},
+            )
+            # Only the first choice is used.
+            first = response.choices[0]
+            usage = response.usage
+            return GenerationResult(
+                text=first.text,
+                tokens_used=usage.total_tokens if usage else 0,
+                finish_reason=first.finish_reason or "stop",
+                metadata={"model": cfg.model_path, "api_base": cfg.api_base},
+            )
+        except Exception as e:
+            logger.error(f"Generation error: {e}")
+            failure = {"error": str(e)}
+            return GenerationResult(text="", tokens_used=0, finish_reason="error", metadata=failure)
+
+    def generate_with_tools(
+        self,
+        prompt: str,
+        tools: list[dict[str, Any]],
+        temperature: float | None = None,
+    ) -> GenerationResult:
+        """
+        Ask the model to answer the prompt, optionally by calling a tool.
+
+        The request goes through the chat completions API with the tools
+        attached in OpenAI function format. If anything in that path raises,
+        the call is retried through the prompt-based fallback.
+
+        Args:
+            prompt: User message sent to the model
+            tools: Tool schemas; each needs "name" and "description" and may
+                carry a "parameters" entry
+            temperature: Per-call temperature; None uses the configured value
+
+        Returns:
+            GenerationResult whose metadata holds "tool_call" (name plus
+            decoded arguments) when the model requested a tool; otherwise a
+            plain text result.
+
+        Raises:
+            RuntimeError: If no client is set.
+        """
+        if not self.client:
+            raise RuntimeError("vLLM client not connected")
+
+        if temperature is None:
+            temperature = self.config.temperature
+
+        try:
+            # Wrap every schema in the OpenAI "function" tool envelope.
+            openai_tools = [
+                {
+                    "type": "function",
+                    "function": {
+                        "name": spec["name"],
+                        "description": spec["description"],
+                        "parameters": spec.get("parameters", {}),
+                    },
+                }
+                for spec in tools
+            ]
+
+            # Tool calling goes through the chat completions API.
+            response = self.client.chat.completions.create(
+                model=self.config.model_path,
+                messages=[{"role": "user", "content": prompt}],
+                tools=openai_tools,
+                tool_choice="auto",
+                temperature=temperature,
+                max_tokens=self.config.max_tokens,
+            )
+
+            choice = response.choices[0]
+            message = choice.message
+            # Metadata always names the model; a tool call is added when present.
+            metadata: dict[str, Any] = {"model": self.config.model_path}
+
+            # Only the first requested tool call is reported.
+            if message.tool_calls:
+                requested = message.tool_calls[0].function
+                metadata["tool_call"] = {
+                    "name": requested.name,
+                    "arguments": json.loads(requested.arguments),
+                }
+
+            return GenerationResult(
+                text=message.content or "",
+                tokens_used=response.usage.total_tokens if response.usage else 0,
+                finish_reason=choice.finish_reason or "stop",
+                metadata=metadata,
+            )
+
+        except Exception as e:
+            logger.error(f"Tool generation error: {e}")
+
+            # Fallback to prompt-based tool calling
+            logger.info("Falling back to prompt-based tool calling")
+            return self._generate_with_tools_fallback(prompt, tools, temperature)
+
+    def _generate_with_tools_fallback(
+        self,
+        prompt: str,
+        tools: list[dict[str, Any]],
+        temperature: float,
+    ) -> GenerationResult:
+        """
+        Emulate tool calling with a plain completion and a JSON reply format.
+
+        The reply is parsed strictly first (minus any markdown fence); if that
+        fails, the first JSON object embedded in the reply is used, so prose
+        around the object is tolerated.
+
+        Args:
+            prompt: Input prompt
+            tools: Tool schemas ("name", "description", optional "parameters")
+            temperature: Sampling temperature
+
+        Returns:
+            Result of generate() with metadata["parsed_tool_call"] set, or
+            metadata["parse_error"] = True when no JSON could be decoded.
+        """
+        # Build a prompt that includes tool schemas
+        tool_descriptions = []
+        for tool in tools:
+            tool_desc = f"- {tool['name']}: {tool['description']}"
+            if "parameters" in tool:
+                tool_desc += f"\n  Parameters: {json.dumps(tool['parameters'])}"
+            tool_descriptions.append(tool_desc)
+        tools_text = "\n".join(tool_descriptions)
+
+        enhanced_prompt = f"""{prompt}
+
+Available tools:
+{tools_text}
+
+Respond with a JSON object in the format:
+{{"tool": "tool_name", "params": {{}}, "reasoning": "why this tool"}}
+
+Or if no tool is needed:
+{{"tool": "none", "reasoning": "explanation"}}
+"""
+
+        result = self.generate(enhanced_prompt, temperature)
+
+        text = result.text.strip()
+        # Remove markdown code blocks if present
+        if text.startswith("```json"):
+            text = text[7:]
+        elif text.startswith("```"):
+            text = text[3:]
+        text = text.removesuffix("```").strip()
+        try:
+            result.metadata["parsed_tool_call"] = json.loads(text)
+        except json.JSONDecodeError:
+            try:  # tolerate prose before/after the object
+                start = text.index("{")
+                result.metadata["parsed_tool_call"] = json.JSONDecoder().raw_decode(text, start)[0]
+            except ValueError:
+                logger.warning("Failed to parse tool call JSON from response")
+                result.metadata["parse_error"] = True
+        return result
+
+    def is_available(self) -> bool:
+        """
+        Report whether the vLLM server currently answers requests.
+
+        Returns:
+            True if a client exists and listing models succeeds, else False
+        """
+        client = self.client
+        if not client:
+            return False
+
+        try:
+            client.models.list()  # result is discarded; only success matters
+        except Exception as e:
+            logger.debug(f"vLLM availability check failed: {e}")
+            return False
+        return True
+
+    def unload(self) -> None:
+        """
+        Close the client connection; does nothing when no client is set.
+
+        The vLLM server itself keeps running and must be stopped separately.
+        """
+        if not self.client:
+            return
+        self.client.close()
+        self.client = None
+        logger.info("Disconnected from vLLM server")
diff --git a/python/run_ipc_server_with_gpu.py b/python/run_ipc_server_with_gpu.py
new file mode 100644
index 0000000..821bd02
--- /dev/null
+++ b/python/run_ipc_server_with_gpu.py
@@ -0,0 +1,157 @@
+"""
+IPC Server with GPU-accelerated agent backend.
+
+This script starts the FastAPI server with LLM-powered agents using
+GPU-accelerated llama.cpp backend.
+"""
+
+import argparse
+import logging
+import sys
+
+from agent_runtime.runtime import AgentRuntime
+from agent_runtime.agent import Agent
+from agent_runtime.tool_dispatcher import ToolDispatcher
+from backends import LlamaCppBackend, BackendConfig
+from ipc.server import create_server
+from tools import register_movement_tools, register_inventory_tools, register_world_query_tools
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """
+    Parse CLI options, load the llama.cpp backend and serve one agent over IPC.
+
+    Exits with status 0 on Ctrl+C and 1 on any other error. Once the backend
+    has been created it is unloaded on every way out, including a normal return.
+    """
+    parser = argparse.ArgumentParser(
+        description="Agent Arena IPC Server with GPU-Accelerated LLM Backend"
+    )
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="127.0.0.1",
+        help="Host address to bind to (default: 127.0.0.1)",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=5000,
+        help="Port to listen on (default: 5000)",
+    )
+    parser.add_argument(
+        "--workers",
+        type=int,
+        default=4,
+        help="Maximum number of concurrent agent workers (default: 4)",
+    )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="Enable debug logging",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="../models/llama-2-7b-chat.Q4_K_M.gguf",
+        help="Path to GGUF model file (default: ../models/llama-2-7b-chat.Q4_K_M.gguf)"
+    )
+    parser.add_argument(
+        "--gpu-layers",
+        type=int,
+        default=-1,
+        help="Number of layers to offload to GPU: -1=all, 0=CPU only (default: -1)"
+    )
+    parser.add_argument(
+        "--temperature",
+        type=float,
+        default=0.7,
+        help="LLM temperature for decision making (default: 0.7)"
+    )
+    parser.add_argument(
+        "--max-tokens",
+        type=int,
+        default=256,
+        help="Maximum tokens to generate per decision (default: 256)"
+    )
+
+    args = parser.parse_args()
+
+    if args.debug:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    logger.info("=" * 60)
+    logger.info("Agent Arena IPC Server (GPU-Accelerated)")
+    logger.info("=" * 60)
+    logger.info(f"Host: {args.host}")
+    logger.info(f"Port: {args.port}")
+    logger.info(f"Max Workers: {args.workers}")
+    logger.info(f"Model: {args.model}")
+    logger.info(f"GPU Layers: {args.gpu_layers} ({'all' if args.gpu_layers == -1 else 'CPU only' if args.gpu_layers == 0 else args.gpu_layers})")
+    logger.info(f"Temperature: {args.temperature}")
+    logger.info(f"Max Tokens: {args.max_tokens}")
+    logger.info("=" * 60)
+
+    backend = None  # stays None until loading succeeds, so cleanup can tell
+    try:
+        backend_config = BackendConfig(
+            model_path=args.model,
+            temperature=args.temperature,
+            max_tokens=args.max_tokens,
+            n_gpu_layers=args.gpu_layers
+        )
+        logger.info("Loading GPU-accelerated LLM backend...")
+        backend = LlamaCppBackend(backend_config)
+        logger.info("✓ Backend loaded successfully")
+
+        # Create the runtime, then a tool dispatcher with all tools registered
+        runtime = AgentRuntime(max_workers=args.workers)
+        tool_dispatcher = ToolDispatcher()
+        register_movement_tools(tool_dispatcher)
+        register_inventory_tools(tool_dispatcher)
+        register_world_query_tools(tool_dispatcher)
+        logger.info(f"✓ Registered {len(tool_dispatcher.tools)} tools")
+
+        # Create a test agent with GPU backend
+        test_agent = Agent(
+            agent_id="gpu_agent_001",
+            backend=backend,
+            tools=list(tool_dispatcher.tools.keys()),
+            goals=["explore the world", "collect resources", "survive"]
+        )
+
+        runtime.register_agent(test_agent)
+        logger.info(f"✓ Registered agent '{test_agent.state.agent_id}' with GPU backend and {len(test_agent.available_tools)} tools")
+
+        logger.info("=" * 60)
+        logger.info("Server ready! You can now:")
+        logger.info("  1. Run Godot test scenes")
+        logger.info("  2. Send POST requests to /tick with agent observations")
+        logger.info("  3. Execute tools via POST /tools/execute")
+        logger.info("=" * 60)
+
+        server = create_server(runtime=runtime, host=args.host, port=args.port)
+        logger.info("Starting IPC server...")
+        server.run()
+    except KeyboardInterrupt:
+        logger.info("\nShutting down gracefully...")
+        sys.exit(0)
+    except Exception as e:
+        logger.error(f"Fatal error: {e}", exc_info=True)
+        sys.exit(1)
+    finally:
+        if backend is not None:
+            logger.info("Unloading LLM backend...")
+            backend.unload()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/run_vllm_server.py b/python/run_vllm_server.py
new file mode 100644
index 0000000..3c8fa57
--- /dev/null
+++ b/python/run_vllm_server.py
@@ -0,0 +1,145 @@
+"""
+Script to start a vLLM inference server.
+
+Usage:
+    python run_vllm_server.py --model meta-llama/Llama-2-7b-chat-hf
+    python run_vllm_server.py --model meta-llama/Llama-2-7b-chat-hf --port 8000 --gpu-memory 0.9
+"""
+
+import argparse
+import logging
+import sys
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """
+    Parse CLI options and run vLLM's OpenAI-compatible server as a child process.
+
+    Exits with status 1 if vLLM cannot be imported or the server process fails.
+    Function calling is on by default; --no-enable-function-calling turns it off.
+    """
+    parser = argparse.ArgumentParser(description="Start vLLM inference server")
+
+    # Model configuration
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="meta-llama/Llama-2-7b-chat-hf",
+        help="Model name or path (e.g., meta-llama/Llama-2-7b-chat-hf)",
+    )
+
+    # Server configuration
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="localhost",
+        help="Server host (default: localhost)",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=8000,
+        help="Server port (default: 8000)",
+    )
+
+    # Performance configuration
+    parser.add_argument(
+        "--tensor-parallel-size",
+        type=int,
+        default=1,
+        help="Number of GPUs to use for tensor parallelism (default: 1)",
+    )
+    parser.add_argument(
+        "--gpu-memory",
+        type=float,
+        default=0.9,
+        help="GPU memory utilization (0.0-1.0, default: 0.9)",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=4096,
+        help="Maximum model context length (default: 4096)",
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="auto",
+        choices=["auto", "half", "float16", "bfloat16", "float32"],
+        help="Data type for model weights (default: auto)",
+    )
+
+    # Additional options
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Trust remote code when loading model",
+    )
+    parser.add_argument(
+        "--enable-function-calling",
+        action=argparse.BooleanOptionalAction,  # also adds --no-enable-function-calling
+        default=True,
+        help="Enable function calling support; on unless --no-enable-function-calling is given",
+    )
+
+    args = parser.parse_args()
+
+    try:
+        # Check if vLLM is installed
+        try:
+            import vllm
+            logger.info(f"vLLM version: {vllm.__version__}")
+        except ImportError:
+            logger.error(
+                "vLLM is not installed. Install with: pip install vllm\n"
+                "Note: vLLM requires CUDA and is not available on CPU-only systems."
+            )
+            sys.exit(1)
+
+        logger.info(f"Starting vLLM server for model: {args.model}")
+        logger.info(f"Server will be available at: http://{args.host}:{args.port}")
+        logger.info(f"GPU memory utilization: {args.gpu_memory}")
+        logger.info(f"Tensor parallel size: {args.tensor_parallel_size}")
+        logger.info(f"Max model length: {args.max_model_len}")
+        logger.info(f"Data type: {args.dtype}")
+
+        # Build command-line arguments for vLLM
+        vllm_args = [
+            "--model", args.model,
+            "--host", args.host,
+            "--port", str(args.port),
+            "--tensor-parallel-size", str(args.tensor_parallel_size),
+            "--gpu-memory-utilization", str(args.gpu_memory),
+            "--max-model-len", str(args.max_model_len),
+            "--dtype", args.dtype,
+        ]
+
+        if args.trust_remote_code:
+            vllm_args.append("--trust-remote-code")
+        if args.enable_function_calling:
+            vllm_args.extend(["--enable-auto-tool-choice", "--tool-call-parser", "hermes"])
+
+        logger.info(f"vLLM arguments: {' '.join(vllm_args)}")
+        logger.info("\nTo start the server, run:")
+        logger.info(f"python -m vllm.entrypoints.openai.api_server {' '.join(vllm_args)}")
+
+        # Launch the server with the interpreter running this script, so the
+        # vLLM install verified above is the one that actually gets used.
+        import subprocess
+        subprocess.run(
+            [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] + vllm_args,
+            check=True
+        )
+
+    except KeyboardInterrupt:
+        logger.info("\nShutting down vLLM server...")
+    except Exception as e:
+        logger.error(f"Error starting vLLM server: {e}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/test_agent_gpu.py b/python/test_agent_gpu.py
new file mode 100644
index 0000000..bbb5ce1
--- /dev/null
+++ b/python/test_agent_gpu.py
@@ -0,0 +1,491 @@
+"""
+End-to-end test of Agent with GPU-accelerated backend and tools.
+
+This test demonstrates:
+1. Creating a ToolDispatcher with sample tools
+2. Initializing an Agent with GPU-accelerated LlamaCppBackend
+3. Agent perceiving observations
+4. Agent deciding actions using tools via LLM
+5. Executing those actions through the ToolDispatcher
+"""
+
+import json
+import logging
+from backends import LlamaCppBackend, BackendConfig
+from agent_runtime.agent import Agent, Action
+from agent_runtime.tool_dispatcher import ToolDispatcher
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+# ============================================================
+# Step 1: Create Sample Tools
+# ============================================================
+
+def create_tool_dispatcher() -> ToolDispatcher:
+    """Build a ToolDispatcher pre-loaded with three mock game tools.
+
+    Registers ``move_to``, ``collect_resource`` and ``check_inventory``; each
+    returns a plain dict with a ``success`` flag.
+
+    Returns:
+        The dispatcher with all three tools registered.
+    """
+    dispatcher = ToolDispatcher()
+
+    # Tool 1: Move to position
+    def move_to(target_x: float, target_y: float, speed: float = 1.0) -> dict:
+        """Report a move to the target; distance is measured from the origin.
+
+        A non-positive ``speed`` returns a failure result, not ZeroDivisionError.
+        """
+        if speed <= 0:
+            return {
+                "success": False,
+                "message": f"Invalid speed: {speed}",
+                "error": "Speed must be greater than zero"
+            }
+        distance = ((target_x**2) + (target_y**2)) ** 0.5
+        return {
+            "success": True,
+            "message": f"Moving to ({target_x}, {target_y}) at speed {speed}",
+            "estimated_time": distance / speed,
+            "distance": distance
+        }
+
+    dispatcher.register_tool(
+        name="move_to",
+        function=move_to,
+        description="Move the agent to a target position in 2D space",
+        parameters={
+            "type": "object",
+            "properties": {
+                "target_x": {"type": "number", "description": "Target X coordinate"},
+                "target_y": {"type": "number", "description": "Target Y coordinate"},
+                "speed": {
+                    "type": "number",
+                    "description": "Movement speed (default 1.0)",
+                    "default": 1.0
+                }
+            },
+            "required": ["target_x", "target_y"]
+        },
+        returns={
+            "type": "object",
+            "properties": {
+                "success": {"type": "boolean"},
+                "message": {"type": "string"},
+                "estimated_time": {"type": "number"},
+                "distance": {"type": "number"}
+            }
+        }
+    )
+
+    # Tool 2: Collect resource
+    def collect_resource(resource_name: str) -> dict:
+        """Collect one unit of a known resource, or report an unknown one."""
+        if resource_name not in ("wood", "stone", "food"):
+            return {
+                "success": False,
+                "message": f"Unknown resource: {resource_name}",
+                "error": "Invalid resource type"
+            }
+        return {
+            "success": True,
+            "message": f"Collected {resource_name}",
+            "resource": resource_name,
+            "quantity": 1
+        }
+
+    dispatcher.register_tool(
+        name="collect_resource",
+        function=collect_resource,
+        description="Collect a resource (wood, stone, or food) from the current location",
+        parameters={
+            "type": "object",
+            "properties": {
+                "resource_name": {
+                    "type": "string",
+                    "description": "Name of resource to collect (wood, stone, or food)",
+                    "enum": ["wood", "stone", "food"]
+                }
+            },
+            "required": ["resource_name"]
+        },
+        returns={
+            "type": "object",
+            "properties": {
+                "success": {"type": "boolean"},
+                "message": {"type": "string"},
+                "resource": {"type": "string"},
+                "quantity": {"type": "number"}
+            }
+        }
+    )
+
+    # Tool 3: Check inventory
+    def check_inventory() -> dict:
+        """Check current inventory (mock data for demo)."""
+        return {
+            "success": True,
+            "inventory": {"wood": 5, "stone": 3, "food": 2},
+            "total_items": 10
+        }
+
+    dispatcher.register_tool(
+        name="check_inventory",
+        function=check_inventory,
+        description="Check the agent's current inventory",
+        parameters={"type": "object", "properties": {}},
+        returns={
+            "type": "object",
+            "properties": {
+                "success": {"type": "boolean"},
+                "inventory": {"type": "object"},
+                "total_items": {"type": "number"}
+            }
+        }
+    )
+
+    logger.info(f"Created ToolDispatcher with {len(dispatcher.tools)} tools")
+    return dispatcher
+
+
+# ============================================================
+# Step 2: Enhanced Agent with Backend Integration
+# ============================================================
+
+class EnhancedAgent(Agent):
+    """
+    Agent subclass that talks to a real LLM backend and runs tools.
+
+    Overrides the base class's ``_query_llm`` hook and adds ``execute_action``.
+    """
+
+    def __init__(self, agent_id: str, backend, tool_dispatcher: ToolDispatcher, goals: list[str] | None = None):
+        """Create the agent, advertising every tool registered on the dispatcher."""
+        super().__init__(
+            agent_id=agent_id,
+            backend=backend,
+            tools=list(tool_dispatcher.tools.keys()),
+            goals=goals
+        )
+
+        self.tool_dispatcher = tool_dispatcher
+
+    def _query_llm(self, context: str) -> str:
+        """
+        Ask the backend for the next action and return it as a JSON string.
+
+        Args:
+            context: Situation text embedded verbatim in the prompt.
+
+        Returns:
+            The model's reply after ``_extract_json`` has processed it.
+        """
+        # Get tool schemas for the prompt
+        tool_schemas = self.tool_dispatcher.export_schemas_json()
+
+        # Build the prompt with Llama-2 chat format
+        prompt = f"""[INST] You are an autonomous agent in a game world. You can use tools to interact with the environment.
+
+{context}
+
+Available tools (JSON format):
+{tool_schemas}
+
+Respond with ONLY a JSON object in this exact format:
+{{"tool": "tool_name", "params": {{"param1": value1}}, "reasoning": "why you chose this action"}}
+
+Choose the most appropriate tool based on your current observations and goals.
+Your response (JSON only): [/INST]"""
+
+        logger.debug(f"Querying LLM with prompt length: {len(prompt)} chars")
+
+        # Query backend
+        result = self.backend.generate(
+            prompt=prompt,
+            temperature=0.3,  # Lower temperature for more consistent JSON
+            max_tokens=150
+        )
+
+        response_text = result.text.strip()
+        logger.info(f"LLM Response: {response_text}")
+
+        # Try to extract JSON if model added extra text
+        return self._extract_json(response_text)
+
+    def _extract_json(self, text: str) -> str:
+        """
+        Return the JSON object embedded in ``text``, repairing it if needed.
+
+        Strategies, in order: the first object that parses as-is; the
+        outermost ``{...}`` span with missing commas inserted; a minimal
+        object rebuilt from ``"tool"`` and parameter fragments. ``text`` is
+        returned unchanged (with a warning) when all of them fail.
+        """
+        import re
+
+        start = text.find('{')
+        end = text.rfind('}')
+
+        if start != -1 and end != -1:
+            # Decode only the first object so that trailing chatter containing
+            # braces cannot invalidate an otherwise well-formed reply.
+            try:
+                _, length = json.JSONDecoder().raw_decode(text[start:])
+                return text[start:start + length]
+            except json.JSONDecodeError as e:
+                logger.debug(f"JSON parse error: {e}")
+
+            # Common issue: missing comma between fields, also after a closing
+            # brace/bracket: "value"\n"field" should be "value",\n"field"
+            json_str = text[start:end+1]
+            fixed_json = re.sub(r'([\d"}\]])\s*\n\s*("(?:reasoning|tool|params))', r'\1,\n\2', json_str)
+            try:
+                json.loads(fixed_json)
+                logger.debug("Fixed JSON by adding missing commas")
+                return fixed_json
+            except json.JSONDecodeError:
+                pass  # fall through to fragment extraction below
+
+            # Fallback: extract key-value pairs manually. The number pattern
+            # accepts a sign and at most one decimal point so float() cannot fail.
+            number = r'(-?\d+(?:\.\d+)?)'
+            tool_match = re.search(r'"tool"\s*:\s*"([^"]+)"', text)
+            resource_match = re.search(r'"resource_name"\s*:\s*"([^"]+)"', text)
+            target_x_match = re.search(r'"target_x"\s*:\s*' + number, text)
+            target_y_match = re.search(r'"target_y"\s*:\s*' + number, text)
+
+            if tool_match:
+                tool = tool_match.group(1)
+                params = {}
+                if tool == "collect_resource" and resource_match:
+                    params = {"resource_name": resource_match.group(1)}
+                elif tool == "move_to" and target_x_match and target_y_match:
+                    params = {
+                        "target_x": float(target_x_match.group(1)),
+                        "target_y": float(target_y_match.group(1))
+                    }
+
+                reconstructed = {"tool": tool, "params": params}
+                logger.debug(f"Reconstructed JSON from pattern matching: {reconstructed}")
+                return json.dumps(reconstructed)
+
+        # Fallback: return original text
+        logger.warning(f"Could not extract valid JSON from: {text[:200]}...")
+        return text
+
+    def execute_action(self, action: Action) -> dict:
+        """Run ``action`` through the tool dispatcher and return its result."""
+        if action is None:
+            return {"success": False, "error": "No action provided"}
+
+        logger.info(f"Executing action: {action.tool_name} with params {action.parameters}")
+        return self.tool_dispatcher.execute_tool(action.tool_name, action.parameters)
+
+
+# ============================================================
+# Step 3: Test Scenarios
+# ============================================================
+
+def test_scenario_1_resource_collection():
+    """
+    Scenario: Agent sees wood nearby and should collect it.
+
+    Prints the chosen action and its result; the backend is always unloaded.
+    """
+    print("\n" + "="*60)
+    print("SCENARIO 1: Resource Collection")
+    print("="*60)
+
+    dispatcher = create_tool_dispatcher()
+    config = BackendConfig(
+        model_path="../models/llama-2-7b-chat.Q4_K_M.gguf",
+        temperature=0.3,
+        max_tokens=150,
+        n_gpu_layers=-1  # Full GPU acceleration
+    )
+    backend = LlamaCppBackend(config)
+    # Release the model even if the agent raises part-way through.
+    try:
+        agent = EnhancedAgent(
+            agent_id="forager_001",
+            backend=backend,
+            tool_dispatcher=dispatcher,
+            goals=["collect resources for crafting"]
+        )
+
+        # Simulate observations
+        print("\n[Simulation] Agent observes environment...")
+        agent.perceive({
+            "position": {"x": 0, "y": 0},
+            "visible_objects": [
+                {"type": "wood", "distance": 2.5, "position": {"x": 2, "y": 1}},
+                {"type": "tree", "distance": 5.0}
+            ],
+            "inventory_count": 10
+        }, source="vision")
+
+        # Agent decides action
+        print("\n[Agent] Deciding action based on observations and goals...")
+        action = agent.decide_action()
+
+        if action:
+            print(f"\n[Agent] Decided to use: {action.tool_name}")
+            print(f"[Agent] Parameters: {action.parameters}")
+            if action.reasoning:
+                print(f"[Agent] Reasoning: {action.reasoning}")
+            # Execute the action
+            print("\n[Execution] Running tool...")
+            result = agent.execute_action(action)
+            print(f"[Result] {result}")
+        else:
+            print("\n[Agent] Failed to decide action")
+    finally:
+        backend.unload()
+    print("\n" + "="*60)
+
+
+def test_scenario_2_navigation():
+    """
+    Scenario: Agent needs to move to a target location.
+
+    Prints the chosen action and its result; the backend is always unloaded.
+    """
+    print("\n" + "="*60)
+    print("SCENARIO 2: Navigation")
+    print("="*60)
+
+    dispatcher = create_tool_dispatcher()
+    config = BackendConfig(
+        model_path="../models/llama-2-7b-chat.Q4_K_M.gguf",
+        temperature=0.3,
+        max_tokens=150,
+        n_gpu_layers=-1
+    )
+    backend = LlamaCppBackend(config)
+    # Release the model even if the agent raises part-way through.
+    try:
+        agent = EnhancedAgent(
+            agent_id="explorer_001",
+            backend=backend,
+            tool_dispatcher=dispatcher,
+            goals=["explore the map", "find the tower at (10, 15)"]
+        )
+
+        # Simulate observations
+        print("\n[Simulation] Agent receives navigation task...")
+        agent.perceive({
+            "position": {"x": 0, "y": 0},
+            "target_location": {"x": 10, "y": 15},
+            "obstacles": []
+        }, source="navigation")
+
+        # Agent decides action
+        print("\n[Agent] Deciding navigation action...")
+        action = agent.decide_action()
+
+        if action:
+            print(f"\n[Agent] Decided to use: {action.tool_name}")
+            print(f"[Agent] Parameters: {action.parameters}")
+            if action.reasoning:
+                print(f"[Agent] Reasoning: {action.reasoning}")
+            # Execute the action
+            print("\n[Execution] Running tool...")
+            result = agent.execute_action(action)
+            print(f"[Result] {result}")
+        else:
+            print("\n[Agent] Failed to decide action")
+    finally:
+        backend.unload()
+    print("\n" + "="*60)
+
+
+def test_scenario_3_inventory_check():
+    """
+    Scenario: Agent checks inventory before crafting.
+
+    Prints the chosen action and its result; the backend is always unloaded.
+    """
+    print("\n" + "="*60)
+    print("SCENARIO 3: Inventory Management")
+    print("="*60)
+
+    dispatcher = create_tool_dispatcher()
+    config = BackendConfig(
+        model_path="../models/llama-2-7b-chat.Q4_K_M.gguf",
+        temperature=0.3,
+        max_tokens=150,
+        n_gpu_layers=-1
+    )
+    backend = LlamaCppBackend(config)
+    # Release the model even if the agent raises part-way through.
+    try:
+        agent = EnhancedAgent(
+            agent_id="crafter_001",
+            backend=backend,
+            tool_dispatcher=dispatcher,
+            goals=["craft a wooden tool", "check if we have enough materials"]
+        )
+
+        # Simulate observations
+        print("\n[Simulation] Agent wants to craft something...")
+        agent.perceive({
+            "crafting_station": "workbench",
+            "recipe_requires": {"wood": 3, "stone": 1},
+            "action": "prepare_crafting"
+        }, source="crafting")
+
+        # Agent decides action
+        print("\n[Agent] Deciding what to do before crafting...")
+        action = agent.decide_action()
+
+        if action:
+            print(f"\n[Agent] Decided to use: {action.tool_name}")
+            print(f"[Agent] Parameters: {action.parameters}")
+            if action.reasoning:
+                print(f"[Agent] Reasoning: {action.reasoning}")
+            # Execute the action
+            print("\n[Execution] Running tool...")
+            result = agent.execute_action(action)
+            print(f"[Result] {result}")
+        else:
+            print("\n[Agent] Failed to decide action")
+    finally:
+        backend.unload()
+    print("\n" + "="*60)
+
+
+# ============================================================
+# Main Test Runner
+# ============================================================
+
+if __name__ == "__main__":
+    print("\n" + "="*60)
+    print("Agent + GPU Backend + Tools Integration Test")
+    print("="*60)
+    print("\nThis test demonstrates an autonomous agent using:")
+    print("  - GPU-accelerated Llama-2-7B backend (113 tok/s)")
+    print("  - ToolDispatcher with 3 sample tools")
+    print("  - Perception-Reasoning-Action loop")
+    print("\nRunning 3 scenarios...\n")
+
+    try:
+        # Scenarios run in order; the first exception aborts the remaining ones.
+        test_scenario_1_resource_collection()
+        test_scenario_2_navigation()
+        test_scenario_3_inventory_check()
+
+        print("\n" + "="*60)
+        print("All scenarios completed!")
+        print("="*60)
+    except Exception as e:
+        logger.error(f"Test failed: {e}", exc_info=True)
+        print(f"\nTest failed: {e}")
+        raise SystemExit(1)  # non-zero exit so shells/CI see the failure
diff --git a/python/test_llama_backend.py b/python/test_llama_backend.py
new file mode 100644
index 0000000..e96b86c
--- /dev/null
+++ b/python/test_llama_backend.py
@@ -0,0 +1,146 @@
+"""
+Test script for llama.cpp backend on Windows.
+
+This script demonstrates how to use the llama.cpp backend
+for local development with a GGUF model.
+"""
+
+import logging
+from backends import LlamaCppBackend, BackendConfig
+
+# Set up logging to see what's happening
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """Exercise basic generation, tool calling, temperatures and a chat-format prompt."""
+    logger.info("Starting llama.cpp backend test")
+    # Configuration for the backend
+    config = BackendConfig(
+        model_path="../models/llama-2-7b-chat.Q4_K_M.gguf",  # Relative to python/ directory
+        temperature=0.7,
+        max_tokens=256,
+        top_p=0.9,
+        top_k=40,
+    )
+
+    logger.info(f"Loading model from: {config.model_path}")
+    backend = None  # stays None if construction fails, so cleanup can tell
+    try:
+        # Initialize the backend
+        backend = LlamaCppBackend(config)
+
+        # Check if backend is available
+        if not backend.is_available():
+            logger.error("Backend is not available!")
+            return
+
+        logger.info("Backend loaded successfully!")
+
+        # Test 1: Basic text generation
+        logger.info("\n" + "="*60)
+        logger.info("Test 1: Basic Text Generation")
+        logger.info("="*60)
+
+        prompt = "Hello! My name is"
+        logger.info(f"Prompt: '{prompt}'")
+
+        result = backend.generate(prompt, max_tokens=50)
+
+        logger.info(f"Generated text: {result.text}")
+        logger.info(f"Tokens used: {result.tokens_used}")
+        logger.info(f"Finish reason: {result.finish_reason}")
+
+        # Test 2: Tool calling
+        logger.info("\n" + "="*60)
+        logger.info("Test 2: Tool Calling (Function Calling)")
+        logger.info("="*60)
+
+        tools = [
+            {
+                "name": "move_to",
+                "description": "Move agent to target coordinates",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "target": {
+                            "type": "array",
+                            "items": {"type": "number"},
+                            "description": "Target [x, y, z] coordinates",
+                        }
+                    },
+                    "required": ["target"],
+                },
+            },
+            {
+                "name": "pickup_item",
+                "description": "Pick up an item from the world",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "item_name": {
+                            "type": "string",
+                            "description": "Name of the item to pick up",
+                        }
+                    },
+                    "required": ["item_name"],
+                },
+            }
+        ]
+
+        prompt = "I need to pick up the sword and then move to coordinates (10, 20, 5)"
+        logger.info(f"Prompt: '{prompt}'")
+
+        result = backend.generate_with_tools(prompt, tools, temperature=0.5)
+
+        logger.info(f"Generated text: {result.text}")
+        logger.info(f"Tokens used: {result.tokens_used}")
+
+        if "parsed_tool_call" in result.metadata:
+            logger.info(f"Parsed tool call: {result.metadata['parsed_tool_call']}")
+        elif "parse_error" in result.metadata:
+            logger.warning("Failed to parse tool call from response")
+
+        # Test 3: Different temperatures
+        logger.info("\n" + "="*60)
+        logger.info("Test 3: Temperature Comparison")
+        logger.info("="*60)
+
+        prompt = "The capital of France is"
+
+        for temp in [0.1, 0.7, 1.0]:
+            logger.info(f"\nTemperature: {temp}")
+            result = backend.generate(prompt, temperature=temp, max_tokens=20)
+            logger.info(f"Result: {result.text.strip()}")
+
+        # Test 4: Conversation context
+        logger.info("\n" + "="*60)
+        logger.info("Test 4: Multi-turn Conversation")
+        logger.info("="*60)
+
+        conversation = """<s>[INST] You are a helpful AI assistant. [/INST] I understand. I'm here to help!</s>
+[INST] What is the weather like today? [/INST]"""
+
+        result = backend.generate(conversation, max_tokens=100)
+        logger.info(f"Assistant: {result.text}")
+
+        logger.info("\n" + "="*60)
+        logger.info("All tests completed successfully!")
+        logger.info("="*60)
+
+    except Exception as e:
+        logger.error(f"Error during testing: {e}", exc_info=True)
+    finally:
+        # Clean up, including after a failed test, so the model is released.
+        if backend is not None and backend.is_available():
+            backend.unload()
+            logger.info("Backend unloaded")
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    main()
diff --git a/python/test_llama_gpu.py b/python/test_llama_gpu.py
new file mode 100644
index 0000000..f346920
--- /dev/null
+++ b/python/test_llama_gpu.py
@@ -0,0 +1,101 @@
+"""
+Test GPU-accelerated inference with llama.cpp backend.
+
+This script compares CPU vs GPU performance.
+"""
+
+import logging
+import time
+from backends import LlamaCppBackend, BackendConfig
+
+logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+logger = logging.getLogger(__name__)
+
+
+def test_inference(config: BackendConfig, test_name: str):
+    """Load the model with ``config``, time one generation, return tokens/sec.
+
+    The backend is always unloaded. Returns 0 if no generation time elapsed.
+    """
+    print(f"\n{'='*60}")
+    print(f"{test_name}")
+    print('='*60)
+
+    # perf_counter is the monotonic, high-resolution clock meant for intervals.
+    start_time = time.perf_counter()
+    backend = LlamaCppBackend(config)
+    load_time = time.perf_counter() - start_time
+    print(f"Load time: {load_time:.2f}s\n")
+
+    prompt = "[INST] Write a short story about a robot exploring Mars in 3 sentences. [/INST]"
+    try:
+        start_time = time.perf_counter()
+        result = backend.generate(prompt, max_tokens=100)
+        gen_time = time.perf_counter() - start_time
+
+        tokens_per_sec = result.tokens_used / gen_time if gen_time > 0 else 0
+        print(f"Response: {result.text.strip()}\n")
+        print(f"Generation time: {gen_time:.2f}s")
+        print(f"Tokens: {result.tokens_used}")
+        print(f"Speed: {tokens_per_sec:.2f} tokens/sec")
+        return tokens_per_sec
+    finally:
+        backend.unload()
+
+
+def main():
+    """Benchmark CPU-only, partial and full GPU offload, then print a summary."""
+    print("\n" + "="*60)
+    print("GPU Acceleration Test for llama.cpp")
+    print("="*60)
+    model_path = "../models/llama-2-7b-chat.Q4_K_M.gguf"
+
+    # Test 1: CPU only
+    cpu_config = BackendConfig(
+        model_path=model_path,
+        temperature=0.7,
+        max_tokens=100,
+        n_gpu_layers=0,  # CPU only
+    )
+    cpu_speed = test_inference(cpu_config, "Test 1: CPU Only (0 GPU layers)")
+
+    # Test 2: Partial GPU offload
+    partial_gpu_config = BackendConfig(
+        model_path=model_path,
+        temperature=0.7,
+        max_tokens=100,
+        n_gpu_layers=20,  # Offload 20 layers to GPU
+    )
+    partial_speed = test_inference(partial_gpu_config, "Test 2: Partial GPU (20 layers)")
+
+    # Test 3: Full GPU offload
+    full_gpu_config = BackendConfig(
+        model_path=model_path,
+        temperature=0.7,
+        max_tokens=100,
+        n_gpu_layers=-1,  # Offload all layers to GPU
+    )
+    full_speed = test_inference(full_gpu_config, "Test 3: Full GPU (all layers)")
+
+    # Summary; a zero CPU baseline prints "n/a" instead of raising ZeroDivisionError.
+    def speedup(speed):
+        return f"{speed/cpu_speed:.2f}x" if cpu_speed > 0 else "n/a"
+
+    print("\n" + "="*60)
+    print("Performance Summary")
+    print("="*60)
+    print(f"CPU only:      {cpu_speed:.2f} tokens/sec (baseline)")
+    print(f"Partial GPU:   {partial_speed:.2f} tokens/sec ({speedup(partial_speed)} speedup)")
+    print(f"Full GPU:      {full_speed:.2f} tokens/sec ({speedup(full_speed)} speedup)")
+    print("="*60)
+
+    if full_speed > cpu_speed * 2:
+        print("\n✓ GPU acceleration is working! Significant speedup achieved.")
+    elif full_speed > cpu_speed:
+        print("\n⚠ GPU acceleration is working but speedup is modest.")
+    else:
+        print("\n✗ GPU acceleration may not be working properly.")
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    main()
diff --git a/python/test_llama_simple.py b/python/test_llama_simple.py
new file mode 100644
index 0000000..6e67b7b
--- /dev/null
+++ b/python/test_llama_simple.py
@@ -0,0 +1,103 @@
+"""
+Simple test script for llama.cpp backend.
+
+Demonstrates basic usage without complex tool calling.
+"""
+
+import logging
+from backends import LlamaCppBackend, BackendConfig
+
+logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """Run five smoke tests against the llama.cpp backend, printing each result."""
+    print("\n" + "="*60)
+    print("Llama.cpp Backend - Simple Test")
+    print("="*60 + "\n")
+    # Initialize backend
+    config = BackendConfig(
+        model_path="../models/llama-2-7b-chat.Q4_K_M.gguf",
+        temperature=0.7,
+        max_tokens=150,
+    )
+
+    logger.info("Loading model (this may take 10-20 seconds)...")
+    backend = LlamaCppBackend(config)
+
+    if not backend.is_available():
+        logger.error("Backend failed to load!")
+        return
+
+    logger.info("Model loaded successfully!\n")
+    try:  # ensure the model is released even if one of the tests raises
+        # Test 1: Simple completion
+        print("Test 1: Text Completion")
+        print("-" * 40)
+
+        prompt = "The three primary colors are"
+        print(f"Prompt: '{prompt}'")
+
+        result = backend.generate(prompt, max_tokens=50)
+        print(f"Response: {result.text.strip()}")
+        print(f"Tokens: {result.tokens_used}\n")
+
+        # Test 2: Question answering (using Llama-2 chat format)
+        print("Test 2: Question Answering")
+        print("-" * 40)
+
+        # Llama-2 chat format: [INST] question [/INST]
+        prompt = "[INST] What is the capital of France? Answer in one word. [/INST]"
+        print("Question: What is the capital of France?")
+
+        result = backend.generate(prompt, temperature=0.1, max_tokens=10)
+        print(f"Answer: {result.text.strip()}\n")
+
+        # Test 3: Creative writing
+        print("Test 3: Creative Writing")
+        print("-" * 40)
+
+        prompt = "[INST] Write a single sentence about a robot exploring Mars. [/INST]"
+        print("Task: Write about a robot on Mars")
+
+        result = backend.generate(prompt, temperature=0.9, max_tokens=100)
+        print(f"Story: {result.text.strip()}\n")
+
+        # Test 4: Simple tool selection
+        print("Test 4: Action Selection")
+        print("-" * 40)
+
+        prompt = """[INST] You are a game agent at position (0, 0, 0). You see a sword at position (5, 5, 0).
+
+Available actions:
+1. move_to(x, y, z) - Move to coordinates
+2. pickup_item(name) - Pick up an item
+3. wait() - Do nothing
+
+What should you do FIRST? Reply with just the action name and parameters, like: move_to(5, 5, 0) [/INST]"""
+
+        print("Scenario: Agent sees sword at (5, 5, 0)")
+
+        result = backend.generate(prompt, temperature=0.3, max_tokens=50)
+        print(f"Decision: {result.text.strip()}\n")
+
+        # Test 5: Different temperatures
+        print("Test 5: Temperature Comparison")
+        print("-" * 40)
+
+        base_prompt = "[INST] Complete this sentence in a creative way: The robot opened the door and saw [/INST]"
+
+        for temp in [0.1, 0.5, 1.0]:
+            result = backend.generate(base_prompt, temperature=temp, max_tokens=30)
+            print(f"Temp {temp}: {result.text.strip()}")
+
+        print("\n" + "="*60)
+        print("All tests completed!")
+        print("="*60)
+    finally:
+        backend.unload()
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    main()
diff --git a/python/test_quick_gpu.py b/python/test_quick_gpu.py
new file mode 100644
index 0000000..3222e0f
--- /dev/null
+++ b/python/test_quick_gpu.py
@@ -0,0 +1,49 @@
+"""
+Quick GPU acceleration test.
+"""
+
+import time
+from backends import LlamaCppBackend, BackendConfig
+
+print("\n" + "="*60)
+print("Quick GPU Test")
+print("="*60 + "\n")
+
+# Test with GPU
+config_gpu = BackendConfig(
+    model_path="../models/llama-2-7b-chat.Q4_K_M.gguf",
+    temperature=0.7,
+    max_tokens=50,
+    n_gpu_layers=-1,  # All layers to GPU
+)
+
+print("Loading model with GPU acceleration...")
+start = time.perf_counter()  # monotonic clock suited to measuring intervals
+backend = LlamaCppBackend(config_gpu)
+load_time = time.perf_counter() - start
+print(f"Load time: {load_time:.2f}s\n")
+try:  # unload the model even if generation fails
+    prompt = "[INST] What is 2+2? Answer in one sentence. [/INST]"
+
+    print("Generating response...")
+    start = time.perf_counter()
+    result = backend.generate(prompt, max_tokens=30)
+    gen_time = time.perf_counter() - start
+
+    tokens_per_sec = result.tokens_used / gen_time if gen_time > 0 else 0
+
+    print(f"\nPrompt: {prompt}")
+    print(f"Response: {result.text.strip()}\n")
+    print(f"Generation time: {gen_time:.2f}s")
+    print(f"Tokens: {result.tokens_used}")
+    print(f"Speed: {tokens_per_sec:.2f} tokens/sec")
+
+    if tokens_per_sec > 50:
+        print("\nSUCCESS: GPU acceleration is WORKING! Excellent speed!")
+    elif tokens_per_sec > 20:
+        print("\nSUCCESS: GPU acceleration appears to be working.")
+    else:
+        print("\nWARNING: Speed seems slow - GPU may not be fully utilized.")
+finally:
+    backend.unload()
+print("\n" + "="*60)
diff --git a/scripts/tests/test_tool_execution.gd b/scripts/tests/test_tool_execution.gd
index f778ca5..902d4c5 100644
--- a/scripts/tests/test_tool_execution.gd
+++ b/scripts/tests/test_tool_execution.gd
@@ -7,6 +7,9 @@ extends Node
 var ipc_client: IPCClient
 var tool_registry: ToolRegistry
 var agent: Agent
+var test_running := true  # Keep scene alive
+var wait_time := 0.0
+var tests_started := false
 
 func _ready():
 	print("=== Tool Execution Test ===")
@@ -58,9 +61,18 @@ func _ready():
 
 	ipc_client.connect_to_server("http://127.0.0.1:5000")
 
-	# Wait a moment for connection, then test tools
-	print("Waiting 3 seconds for connection...")
-	await get_tree().create_timer(3.0).timeout
+	# Skip waiting - call tests immediately after 1 frame
+	print("Starting tests after 1 frame...")
+	call_deferred("_start_tests")
+
+func _process(delta):
+	# Intentionally empty: the body is only `pass`, so nothing runs per frame.
+	# NOTE(review): presumably left over from the removed manual timer - confirm whether this override is still needed.
+	pass
+
+func _start_tests():
+	print("\nChecking connection status...")
+	print("is_server_connected() = ", ipc_client.is_server_connected())
 
 	if not ipc_client.is_server_connected():
 		print("\n[WARNING] Not connected to server!")
@@ -69,8 +81,13 @@ func _ready():
 		print("2. Server is on http://127.0.0.1:5000")
 		print("3. No firewall is blocking the connection")
 		print("\nTrying to test tools anyway...")
+	else:
+		print("[SUCCESS] Connected to IPC server!")
 
+	print("About to call test_tools()...")
+	tests_started = true
 	test_tools()
+	print("test_tools() call completed")
 
 func test_tools():
 	print("\n=== Testing Tool Execution ===")
@@ -85,7 +102,6 @@ func test_tools():
 	}
 	var move_result = agent.call_tool("move_to", move_params)
 	print("Request sent: ", move_result)
-	await get_tree().create_timer(0.5).timeout
 
 	# Test 2: Pickup item tool
 	print("\n[Test 2] Testing pickup_item tool...")
@@ -94,19 +110,16 @@ func test_tools():
 	}
 	var pickup_result = agent.call_tool("pickup_item", pickup_params)
 	print("Request sent: ", pickup_result)
-	await get_tree().create_timer(0.5).timeout
 
 	# Test 3: Stop movement tool
 	print("\n[Test 3] Testing stop_movement tool...")
 	var stop_result = agent.call_tool("stop_movement", {})
 	print("Request sent: ", stop_result)
-	await get_tree().create_timer(0.5).timeout
 
 	# Test 4: Get inventory tool
 	print("\n[Test 4] Testing get_inventory tool...")
 	var inventory_result = agent.call_tool("get_inventory", {})
 	print("Request sent: ", inventory_result)
-	await get_tree().create_timer(0.5).timeout
 
 	# Test 5: Direct ToolRegistry execution
 	print("\n[Test 5] Testing navigate_to tool...")
@@ -116,13 +129,10 @@ func test_tools():
 	print("Request sent: ", direct_result)
 
 	print("\n=== All Tool Requests Sent ===")
-	print("Waiting for responses (check 'IPC Response Received' below)...")
+	print("Waiting for async responses from Python server...")
 	print("Python server log should show tool executions")
-
-	# Wait a bit for all responses
-	await get_tree().create_timer(2.0).timeout
-	print("\n=== Test Complete ===")
-	print("If you saw response_received callbacks above, tool execution works!")
+	print("Scene will stay running - press Q to quit when done")
+	print("\nWatch for '[IPC Response Received]' messages below...")
 
 func _on_response_received(response: Dictionary):
 	print("\n[IPC Response Received]")
diff --git a/tests/test_vllm_backend.py b/tests/test_vllm_backend.py
new file mode 100644
index 0000000..b68f198
--- /dev/null
+++ b/tests/test_vllm_backend.py
@@ -0,0 +1,196 @@
+"""
+Tests for vLLM backend.
+
+Note: These tests require a running vLLM server.
+Tests that need the server are skipped via pytest.skip() when it is not available.
+"""
+
+import pytest
+from backends.vllm_backend import VLLMBackend, VLLMBackendConfig
+
+
+@pytest.fixture
+def vllm_config():
+    """Build the VLLMBackendConfig shared by these tests (Llama-2 chat model, API base localhost:8000/v1)."""
+    return VLLMBackendConfig(
+        model_path="meta-llama/Llama-2-7b-chat-hf",
+        api_base="http://localhost:8000/v1",
+        temperature=0.7,
+        max_tokens=100,
+    )
+
+
+@pytest.fixture
+def vllm_backend(vllm_config):
+    """Return a VLLMBackend built from vllm_config; skip the test if it is unavailable or setup raises."""
+    try:
+        backend = VLLMBackend(vllm_config)
+        if not backend.is_available():
+            pytest.skip("vLLM server not available")
+        return backend
+    except Exception as e:
+        pytest.skip(f"Could not connect to vLLM server: {e}")
+
+
+def test_vllm_config_creation():
+    """Check that VLLMBackendConfig exposes each constructor argument on the attribute of the same name."""
+    config = VLLMBackendConfig(
+        model_path="test-model",
+        api_base="http://test:8000/v1",
+        api_key="test-key",
+        temperature=0.5,
+        max_tokens=256,
+    )
+
+    assert config.model_path == "test-model"
+    assert config.api_base == "http://test:8000/v1"
+    assert config.api_key == "test-key"
+    assert config.temperature == 0.5
+    assert config.max_tokens == 256
+
+
+def test_vllm_backend_initialization(vllm_config):
+    """Test that VLLMBackend builds a client and keeps its config; skip only if construction raises."""
+    try:  # Only construction may skip; asserts stay outside so failures are not swallowed as skips.
+        backend = VLLMBackend(vllm_config)
+    except Exception:
+        pytest.skip("vLLM server not available")
+    assert backend.client is not None
+    assert backend.config == vllm_config
+
+
+def test_vllm_is_available(vllm_backend):
+    """Check that is_available() returns True for the backend supplied by the fixture."""
+    assert vllm_backend.is_available() is True
+
+
+def test_vllm_generate(vllm_backend):
+    """Test basic generation: non-empty text, positive tokens_used, stop/length finish, "model" in metadata."""
+    prompt = "Hello, my name is"
+    result = vllm_backend.generate(prompt, max_tokens=20)
+
+    assert result is not None
+    assert len(result.text) > 0
+    assert result.tokens_used > 0
+    assert result.finish_reason in ["stop", "length"]
+    assert "model" in result.metadata
+
+
+def test_vllm_generate_with_temperature(vllm_backend):
+    """Test generation with temperature=0.1: expects non-empty text and a stop/length finish reason."""
+    prompt = "The weather today is"
+    result = vllm_backend.generate(prompt, temperature=0.1, max_tokens=20)
+
+    assert result is not None
+    assert len(result.text) > 0
+    assert result.finish_reason in ["stop", "length"]
+
+
+def test_vllm_generate_with_tools(vllm_backend):
+    """Test generate_with_tools() when given a single move_to tool schema."""
+    prompt = "I need to move to coordinates (10, 20, 5)"
+
+    tools = [
+        {
+            "name": "move_to",
+            "description": "Move agent to target coordinates",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "target": {
+                        "type": "array",
+                        "items": {"type": "number"},
+                        "description": "Target [x, y, z] coordinates",
+                    }
+                },
+                "required": ["target"],
+            },
+        }
+    ]
+
+    result = vllm_backend.generate_with_tools(prompt, tools)
+
+    assert result is not None
+    # Pass if the result has any text, or if its metadata carries a "tool_call" entry
+    assert len(result.text) > 0 or "tool_call" in result.metadata
+
+
+def test_vllm_generate_error_handling(vllm_backend):
+    """Test that generate() on an empty prompt returns a result with a recognised finish reason."""
+    # Empty prompt should still work; an "error" finish reason is also accepted here
+    result = vllm_backend.generate("", max_tokens=10)
+    assert result is not None
+    assert result.finish_reason in ["stop", "length", "error"]
+
+
+def test_vllm_unload(vllm_config):
+    """Test that unload() clears the client and makes is_available() report False."""
+    try:  # Only construction may skip; unload() and the asserts must be able to fail the test.
+        backend = VLLMBackend(vllm_config)
+    except Exception:
+        pytest.skip("vLLM server not available")
+    backend.unload()
+    assert backend.client is None
+    assert backend.is_available() is False
+
+
+def test_vllm_multiple_generations(vllm_backend):
+    """Test three sequential generate() calls on one backend; each must return non-empty text."""
+    prompts = ["Hello", "How are you?", "What is AI?"]
+
+    for prompt in prompts:
+        result = vllm_backend.generate(prompt, max_tokens=20)
+        assert result is not None
+        assert len(result.text) > 0
+
+
+def test_vllm_generate_with_tools_fallback(vllm_backend):
+    """Test the private _generate_with_tools_fallback() method with a single pickup_item tool schema."""
+    prompt = "Pick up the sword item"
+
+    tools = [
+        {
+            "name": "pickup_item",
+            "description": "Pick up an item from the world",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "item_name": {
+                        "type": "string",
+                        "description": "Name of the item to pick up",
+                    }
+                },
+                "required": ["item_name"],
+            },
+        }
+    ]
+
+    # Call the private fallback method directly (note the leading underscore)
+    result = vllm_backend._generate_with_tools_fallback(prompt, tools, temperature=0.7)
+
+    assert result is not None
+    assert len(result.text) > 0
+
+
+@pytest.mark.parametrize("max_tokens", [10, 50, 100])
+def test_vllm_different_max_tokens(vllm_backend, max_tokens):
+    """Test that tokens_used stays within 1.5x of the requested max_tokens for limits 10, 50 and 100."""
+    prompt = "Once upon a time"
+    result = vllm_backend.generate(prompt, max_tokens=max_tokens)
+
+    assert result is not None
+    assert result.tokens_used <= max_tokens * 1.5  # Allow 50% slack over the requested limit
+
+
+@pytest.mark.parametrize("temperature", [0.1, 0.7, 1.0])
+def test_vllm_different_temperatures(vllm_backend, temperature):
+    """Test that generate() returns non-empty text at temperatures 0.1, 0.7 and 1.0."""
+    prompt = "The capital of France is"
+    result = vllm_backend.generate(prompt, temperature=temperature, max_tokens=20)
+
+    assert result is not None
+    assert len(result.text) > 0
+
+
+if __name__ == "__main__":  # Allow running this test file directly as a script
+    pytest.main([__file__, "-v", "-s"])