diff --git a/START_GPU_IPC_SERVER.bat b/START_GPU_IPC_SERVER.bat new file mode 100644 index 0000000..40b8750 --- /dev/null +++ b/START_GPU_IPC_SERVER.bat @@ -0,0 +1,57 @@ +@echo off +REM Agent Arena - GPU-Accelerated IPC Server Startup Script +REM This script starts the Python IPC server with GPU-accelerated LLM backend + +echo ======================================== +echo Agent Arena - GPU IPC Server +echo ======================================== +echo. + +cd /d "%~dp0\python" + +REM Check if venv exists +if not exist "venv\" ( + echo ERROR: Python virtual environment not found! + echo Please run: python -m venv venv + echo Then install dependencies: venv\Scripts\pip install -r requirements.txt + pause + exit /b 1 +) + +REM Activate venv +echo Activating Python virtual environment... +call venv\Scripts\activate.bat + +REM Check if required packages are installed +python -c "import fastapi, uvicorn, llama_cpp" 2>nul +if errorlevel 1 ( + echo. + echo ERROR: Required packages not installed! + echo Please install dependencies: pip install -r requirements.txt + pause + exit /b 1 +) + +echo. +echo Starting GPU-Accelerated IPC Server... +echo ======================================== +echo Model: Llama-2-7B Chat (Q4_K_M quantization) +echo GPU Acceleration: ENABLED (all layers) +echo Expected Speed: ~113 tokens/sec +echo Server Address: http://127.0.0.1:5000 +echo. +echo Tools Available: 15+ (movement, inventory, world query) +echo Default Agent: gpu_agent_001 +echo ======================================== +echo. +echo Press Ctrl+C to stop the server +echo. + +python run_ipc_server_with_gpu.py --gpu-layers -1 + +REM If server exits, pause so user can see error +if errorlevel 1 ( + echo. + echo Server exited with error! 
+ pause +) diff --git a/TESTING_AGENT_WITH_GPU.md b/TESTING_AGENT_WITH_GPU.md new file mode 100644 index 0000000..1155816 --- /dev/null +++ b/TESTING_AGENT_WITH_GPU.md @@ -0,0 +1,329 @@ +# Testing Agents with GPU-Accelerated Backend + +This guide shows how to run the full Godot + Python IPC setup with your GPU-accelerated llama.cpp backend. + +## Quick Start (Tool Execution Only) + +This tests that the IPC communication works without LLM agents: + +### Step 1: Start IPC Server +```bash +# From project root +START_IPC_SERVER.bat +``` + +The server will start at `http://127.0.0.1:5000` and automatically register all tools (movement, inventory, world_query). + +### Step 2: Open Test Scene in Godot +1. Open Godot editor +2. Navigate to: `scenes/tests/test_tool_execution_simple.tscn` +3. Press **F6** (Run Current Scene) + +### Step 3: Verify Results +Check both consoles: +- **Godot Console**: Shows test execution and results +- **Python Console**: Shows tool execution logs + +**Expected Output (Python):** +``` +2025-11-18 - ipc.server - INFO - Registered 15 tools +2025-11-18 - ipc.server - INFO - Executing tool 'move_to' for agent... +2025-11-18 - ipc.server - INFO - Tool 'move_to' executed: success=True +``` + +--- + +## Full Agent Test (with GPU Backend) + +This tests agents making decisions with your GPU-accelerated LLM backend. + +### Prerequisites + +1. **GPU-accelerated backend working** ✅ (You already have this!) +2. **IPC server modified to use LLM backend** +3. **Test scene that triggers agent decisions** + +### Step 1: Create GPU-Enabled IPC Server Script + +Create `python/run_ipc_server_with_gpu.py`: + +```python +""" +IPC Server with GPU-accelerated agent backend. 
+""" + +import argparse +import logging +import sys + +from agent_runtime.runtime import AgentRuntime +from agent_runtime.agent import Agent +from agent_runtime.tool_dispatcher import ToolDispatcher +from backends import LlamaCppBackend, BackendConfig +from ipc.server import create_server +from tools import register_movement_tools, register_inventory_tools, register_world_query_tools + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) + +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(description="Agent Arena IPC Server with GPU Backend") + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=5000) + parser.add_argument("--workers", type=int, default=4) + parser.add_argument("--debug", action="store_true") + parser.add_argument( + "--model", + type=str, + default="../models/llama-2-7b-chat.Q4_K_M.gguf", + help="Path to GGUF model file" + ) + parser.add_argument( + "--gpu-layers", + type=int, + default=-1, + help="Number of layers to offload to GPU (-1 = all, 0 = CPU only)" + ) + + args = parser.parse_args() + + if args.debug: + logging.getLogger().setLevel(logging.DEBUG) + + logger.info("=" * 60) + logger.info("Agent Arena IPC Server (GPU-Accelerated)") + logger.info("=" * 60) + logger.info(f"Host: {args.host}") + logger.info(f"Port: {args.port}") + logger.info(f"Max Workers: {args.workers}") + logger.info(f"Model: {args.model}") + logger.info(f"GPU Layers: {args.gpu_layers}") + logger.info("=" * 60) + + try: + # Create GPU-accelerated backend + backend_config = BackendConfig( + model_path=args.model, + temperature=0.7, + max_tokens=256, + n_gpu_layers=args.gpu_layers + ) + + logger.info("Loading GPU-accelerated LLM backend...") + backend = LlamaCppBackend(backend_config) + logger.info("✓ Backend loaded successfully") + + # Create runtime + runtime = AgentRuntime(max_workers=args.workers) + + # Create a 
test agent with GPU backend + tool_dispatcher = ToolDispatcher() + register_movement_tools(tool_dispatcher) + register_inventory_tools(tool_dispatcher) + register_world_query_tools(tool_dispatcher) + + test_agent = Agent( + agent_id="gpu_agent_001", + backend=backend, + tools=list(tool_dispatcher.tools.keys()), + goals=["explore the world", "collect resources"] + ) + + runtime.register_agent(test_agent) + logger.info(f"✓ Registered agent '{test_agent.state.agent_id}' with GPU backend") + + # Create and start server + server = create_server(runtime=runtime, host=args.host, port=args.port) + logger.info("Starting IPC server...") + server.run() + + except KeyboardInterrupt: + logger.info("\nShutting down gracefully...") + if 'backend' in locals(): + backend.unload() + sys.exit(0) + except Exception as e: + logger.error(f"Fatal error: {e}", exc_info=True) + sys.exit(1) + + +if __name__ == "__main__": + main() +``` + +### Step 2: Create Batch File to Start GPU Server + +Create `START_GPU_IPC_SERVER.bat` in project root: + +```batch +@echo off +REM Agent Arena - GPU-Accelerated IPC Server Startup + +echo ======================================== +echo Agent Arena - GPU IPC Server +echo ======================================== +echo. + +cd /d "%~dp0\python" + +REM Activate venv +echo Activating Python virtual environment... +call venv\Scripts\activate.bat + +echo. +echo Starting GPU-Accelerated IPC Server... +echo Model: Llama-2-7B (Q4_K_M) +echo GPU Acceleration: ENABLED (all layers) +echo Server: http://127.0.0.1:5000 +echo. +echo Press Ctrl+C to stop the server +echo ======================================== +echo. + +python run_ipc_server_with_gpu.py --gpu-layers -1 + +if errorlevel 1 ( + echo. + echo Server exited with error! + pause +) +``` + +### Step 3: Test with Godot Scene + +**Option A: Use existing test scene** +1. Start GPU server: `START_GPU_IPC_SERVER.bat` +2. Open `scenes/tests/test_tool_execution_simple.tscn` +3. Press F6 +4. 
Tools will execute (no LLM needed) + +**Option B: Create agent decision scene** + +You'll need to modify one of the benchmark scenes to: +1. Register an agent via `/agents/register` endpoint +2. Send observations via `/tick` endpoint +3. Receive agent's LLM-driven action decision + +Example GDScript: +```gdscript +extends Node + +var http_client := HTTPRequest.new() +var agent_id = "gpu_agent_001" + +func _ready(): + add_child(http_client) + http_client.request_completed.connect(_on_request_completed) + + # Send observation to agent + var observation = { + "tick": 0, + "perceptions": [{ + "agent_id": agent_id, + "position": [0, 0, 0], + "visible_entities": [ + {"type": "wood", "distance": 5.0} + ], + "inventory": [] + }] + } + + var json = JSON.stringify(observation) + http_client.request( + "http://127.0.0.1:5000/tick", + ["Content-Type: application/json"], + HTTPClient.METHOD_POST, + json + ) + +func _on_request_completed(result, response_code, headers, body): + var json = JSON.parse_string(body.get_string_from_utf8()) + print("Agent decision: ", json) +``` + +--- + +## Testing Workflow + +### 1. Test IPC Server (No LLM) +```bash +START_IPC_SERVER.bat +# Run: scenes/tests/test_tool_execution_simple.tscn +``` +**Verifies:** Tool execution works ✓ + +### 2. Test GPU Backend (Python Only) +```bash +cd python +venv\Scripts\activate +python test_agent_gpu.py +``` +**Verifies:** GPU backend + agent decisions work ✓ + +### 3. 
Test Full Integration (Godot + Python + GPU) +```bash +START_GPU_IPC_SERVER.bat +# Run modified scene that sends /tick requests +``` +**Verifies:** End-to-end agent pipeline works ✓ + +--- + +## Performance Expectations + +With GPU acceleration enabled: +- **LLM Speed**: ~113 tokens/sec +- **Decision Time**: ~1-2 seconds per action +- **Recommended Tick Rate**: 0.5-1 Hz (one decision every 1-2 seconds) + +Without GPU (CPU only): +- **LLM Speed**: ~9 tokens/sec +- **Decision Time**: ~15-20 seconds per action +- **Not recommended** for real-time simulation + +--- + +## Troubleshooting + +### Server won't start +- Check Python venv is activated +- Verify model exists: `models/llama-2-7b-chat.Q4_K_M.gguf` +- Check CUDA PATH (should be fixed now) + +### Agent not responding +- Verify agent registered: Check server logs for "Registered agent" +- Send observation to `/tick` endpoint +- Check both consoles for errors + +### GPU not being used +- Check server startup logs for "Offloading all layers to GPU" +- Verify CUDA toolkit installed +- Monitor GPU usage: `nvidia-smi` + +### Slow responses +- Check GPU utilization in `nvidia-smi` +- Verify `n_gpu_layers=-1` (all layers on GPU) +- Reduce `max_tokens` parameter (currently 256) + +--- + +## Next Steps + +1. **Modify existing benchmark scenes** to send `/tick` requests +2. **Create custom test scene** for agent decision-making +3. **Add agent registration** in scene `_ready()` function +4. **Implement perception loop** (Godot → Python observations) +5. 
**Handle action responses** (Python → Godot actions) + +## Current Status + +✅ GPU backend working (113 tok/s) +✅ IPC server working (tool execution) +✅ Python agent test working (all 3 scenarios) +⏳ **TODO**: Connect agents to IPC `/tick` endpoint +⏳ **TODO**: Modify Godot scenes to use agent decisions diff --git a/docs/llama_cpp_gpu_setup.md b/docs/llama_cpp_gpu_setup.md new file mode 100644 index 0000000..8714616 --- /dev/null +++ b/docs/llama_cpp_gpu_setup.md @@ -0,0 +1,189 @@ +# GPU Acceleration for llama.cpp Backend + +This guide explains how to enable GPU acceleration for the llama.cpp backend on Windows. + +## Current Status + +- ✅ RTX 3090 with 24GB VRAM detected +- ✅ CUDA 12.9 driver installed +- ✅ Backend code updated to support `n_gpu_layers` parameter +- ⚠️ CUDA-enabled llama-cpp-python requires additional setup + +## Why GPU Acceleration? + +With your RTX 3090, you can expect: +- **10-50x faster** inference compared to CPU +- **Lower latency** for real-time agent responses +- **Larger models** can fit in VRAM + +## Setup Options + +### Option 1: Install CUDA Toolkit (Recommended) + +The pre-built CUDA wheels require CUDA runtime libraries. + +1. **Download CUDA Toolkit 12.x**: + - Visit: https://developer.nvidia.com/cuda-downloads + - Select: Windows → x86_64 → 12.6 or 12.9 + - Download and install (Base Installer, ~3GB) + +2. **Install llama-cpp-python with CUDA**: + ```bash + cd python + venv\Scripts\activate + pip uninstall llama-cpp-python + pip install llama-cpp-python==0.3.4 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122 + ``` + +3. 
**Test GPU acceleration**: + ```bash + python test_llama_gpu.py + ``` + +### Option 2: Use vLLM (Production Alternative) + +For maximum GPU performance on Linux or WSL2: + +```bash +# In WSL2 or Linux +pip install vllm +python run_vllm_server.py --model meta-llama/Llama-2-7b-chat-hf + +# Connect from Windows +from backends import VLLMBackend, VLLMBackendConfig +config = VLLMBackendConfig(api_base="http://localhost:8000/v1") +backend = VLLMBackend(config) +``` + +### Option 3: llama.cpp Standalone (Advanced) + +Build llama.cpp with CUDA support directly: + +```bash +git clone https://github.com/ggerganov/llama.cpp +cd llama.cpp +cmake -B build -DGGML_CUDA=ON +cmake --build build --config Release +``` + +Then use the compiled `llama-cli.exe` or `llama-server.exe`. + +## Configuration + +Once CUDA is set up, configure GPU layers in your code: + +```python +from backends import LlamaCppBackend, BackendConfig + +# Full GPU offload (recommended for RTX 3090) +config = BackendConfig( + model_path="../models/llama-2-7b-chat.Q4_K_M.gguf", + temperature=0.7, + max_tokens=512, + n_gpu_layers=-1, # -1 = all layers to GPU +) + +backend = LlamaCppBackend(config) +``` + +**GPU Layer Options:** +- `n_gpu_layers=0`: CPU only (current default) +- `n_gpu_layers=20`: Offload 20 layers to GPU (hybrid) +- `n_gpu_layers=-1`: Offload all layers to GPU (fastest) + +## Expected Performance + +With RTX 3090 and full GPU offload: + +| Model | Quantization | CPU Speed | GPU Speed | Speedup | +|-------|--------------|-----------|-----------|---------| +| Llama-2-7B | Q4_K_M | ~9 tok/s | ~100+ tok/s | 10-15x | +| Llama-2-13B | Q4_K_M | ~4 tok/s | ~60+ tok/s | 15-20x | +| Llama-2-70B | Q4_K_M | N/A | ~20 tok/s | - | + +## Testing GPU Acceleration + +Use the provided test script: + +```bash +cd python +venv\Scripts\activate +python test_llama_gpu.py +``` + +This will compare: +1. CPU-only inference +2. Partial GPU offload (20 layers) +3. 
Full GPU offload (all layers) + +## Troubleshooting + +### Error: "Could not find module llama.dll" + +**Cause**: CUDA runtime DLLs not found in PATH. + +**Solution**: Install CUDA Toolkit (Option 1 above) or add CUDA bin directory to PATH: +``` +C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.x\bin +``` + +### Error: "CUDA out of memory" + +**Cause**: Model too large for VRAM. + +**Solutions**: +- Use smaller quantization (Q4_K_M instead of Q8) +- Reduce `n_ctx` (context window) +- Use partial GPU offload (e.g., `n_gpu_layers=20`) + +### GPU not being used (nvidia-smi shows 0% usage) + +**Cause**: `n_gpu_layers=0` (CPU-only mode). + +**Solution**: Set `n_gpu_layers=-1` in BackendConfig. + +### Slow first inference + +**Cause**: GPU kernel compilation on first run. + +**Solution**: This is normal. Subsequent inferences will be fast. + +## Current CPU Performance + +Without GPU acceleration, your current setup achieves: +- **~9 tokens/second** with Q4_K_M quantization +- **~110ms per token** generation time +- Works reliably for development and testing + +## Verification + +Check if CUDA support is available: + +```python +from backends import LlamaCppBackend, BackendConfig + +config = BackendConfig( + model_path="../models/llama-2-7b-chat.Q4_K_M.gguf", + n_gpu_layers=-1, +) + +try: + backend = LlamaCppBackend(config) + print("✓ GPU acceleration is working!") +except Exception as e: + print(f"✗ GPU error: {e}") + print("Falling back to CPU mode...") +``` + +## Next Steps + +1. **For local development**: Continue using CPU mode (works well) +2. **For production**: Install CUDA Toolkit for GPU acceleration +3. 
**For maximum performance**: Use vLLM on Linux/WSL2 + +## Additional Resources + +- [llama.cpp Documentation](https://github.com/ggerganov/llama.cpp) +- [llama-cpp-python GPU Guide](https://llama-cpp-python.readthedocs.io/en/latest/) +- [CUDA Toolkit Download](https://developer.nvidia.com/cuda-downloads) +- [vLLM Documentation](https://docs.vllm.ai/) diff --git a/docs/llama_cpp_windows_setup.md b/docs/llama_cpp_windows_setup.md new file mode 100644 index 0000000..3b090f3 --- /dev/null +++ b/docs/llama_cpp_windows_setup.md @@ -0,0 +1,302 @@ +# llama.cpp Backend Setup for Windows + +This guide shows how to set up and use the llama.cpp backend for local development on Windows. + +## Overview + +llama.cpp provides efficient CPU and GPU inference for LLaMA models using GGUF format. It's perfect for: +- Local development on Windows +- CPU-only inference (no CUDA required) +- Low memory usage with quantized models +- Quick prototyping + +## Prerequisites + +- ✅ Python 3.11 (already installed) +- ✅ llama-cpp-python (already installed) +- ✅ GGUF model file + +## Model Setup + +### 1. Download a GGUF Model + +You've already downloaded: `llama-2-7b-chat.Q4_K_M.gguf` (3.9GB) + +Place it in the `models/` directory: +``` +AgentArena/ +├── models/ +│ └── llama-2-7b-chat.Q4_K_M.gguf +``` + +### 2. Model Quantization Levels + +GGUF models come in different quantization levels: + +| Quantization | File Size | Quality | Speed | +|--------------|-----------|---------|-------| +| Q2_K | ~2.5GB | Lower | Fastest | +| Q4_K_M | ~3.9GB | Good | Fast | +| Q5_K_M | ~4.8GB | Better | Medium | +| Q8_0 | ~7GB | Best | Slower | + +**Q4_K_M** is the recommended balance for most use cases. 
+ +## Configuration + +### Using Python Code + +```python +from backends import LlamaCppBackend, BackendConfig + +config = BackendConfig( + model_path="models/llama-2-7b-chat.Q4_K_M.gguf", + temperature=0.7, + max_tokens=512, + top_p=0.9, + top_k=40, +) + +backend = LlamaCppBackend(config) +``` + +### Using Hydra Config + +Edit `configs/backend/llama_cpp.yaml`: + +```yaml +backend: + type: llama_cpp + model_path: "models/llama-2-7b-chat.Q4_K_M.gguf" + + n_ctx: 4096 # Context window + n_threads: 8 # CPU threads (adjust based on your CPU) + n_gpu_layers: 0 # 0 = CPU only, -1 = full GPU offload + + temperature: 0.7 + max_tokens: 512 +``` + +## Quick Start + +### 1. Run the Test Script + +```bash +cd python +venv\Scripts\activate +python test_llama_backend.py +``` + +This will test: +- Basic text generation +- Function/tool calling +- Different temperature settings +- Multi-turn conversations + +### 2. Use in Your Code + +```python +from backends import LlamaCppBackend, BackendConfig + +# Initialize +config = BackendConfig( + model_path="models/llama-2-7b-chat.Q4_K_M.gguf", + temperature=0.7, + max_tokens=256, +) +backend = LlamaCppBackend(config) + +# Generate text +result = backend.generate("Hello! My name is") +print(result.text) + +# Generate with tools +tools = [ + { + "name": "move_to", + "description": "Move to coordinates", + "parameters": { + "type": "object", + "properties": { + "target": { + "type": "array", + "items": {"type": "number"} + } + } + } + } +] + +result = backend.generate_with_tools( + "Move to position (10, 20, 5)", + tools +) + +if "parsed_tool_call" in result.metadata: + print(f"Tool: {result.metadata['parsed_tool_call']}") +``` + +## GPU Acceleration (Optional) + +If you have an NVIDIA GPU with CUDA, you can offload layers to GPU: + +### 1. 
Install CUDA-enabled llama-cpp-python + +```bash +# Uninstall CPU version +pip uninstall llama-cpp-python + +# Install with CUDA support (requires CUDA 11.x or 12.x) +set CMAKE_ARGS=-DGGML_CUDA=on +pip install llama-cpp-python --force-reinstall --no-cache-dir +``` + +### 2. Update Configuration + +```python +config = BackendConfig( + model_path="models/llama-2-7b-chat.Q4_K_M.gguf", + # ... other settings ... +) + +# When creating backend, enable GPU layers +from llama_cpp import Llama + +llm = Llama( + model_path=config.model_path, + n_ctx=4096, + n_threads=8, + n_gpu_layers=35, # Offload all layers to GPU +) +``` + +Or set `n_gpu_layers` on `BackendConfig`; the backend passes it through to `Llama`. + +## Performance Tuning + +### CPU Threads + +Adjust `n_threads` based on your CPU: +- **4-core CPU**: 4-6 threads +- **8-core CPU**: 8-12 threads +- **16-core CPU**: 12-16 threads + +```python +# In llama_cpp_backend.py, line 30-34 +self.llm = Llama( + model_path=self.config.model_path, + n_ctx=4096, + n_threads=12, # Adjust this + n_gpu_layers=0, +) +``` + +### Context Window + +Reduce `n_ctx` if running out of memory: +- **4096**: Full context (default) +- **2048**: Half context, less memory +- **1024**: Quarter context, minimal memory + +### Batch Size + +Adjust `n_batch` for prompt processing speed: +- **512**: Default, good balance +- **128**: Lower memory, slower +- **1024**: More memory, faster + +## Troubleshooting + +### Model Loading is Slow + +**Expected behavior**: First load takes 10-30 seconds for Q4_K_M model. 
+ +**Solutions**: +- Use `use_mmap=true` (default) for faster loading +- Keep the model loaded between requests +- Use a smaller quantization (Q2_K) + +### Out of Memory + +``` +RuntimeError: Failed to allocate memory +``` + +**Solutions**: +- Reduce `n_ctx` to 2048 or 1024 +- Close other applications +- Use a smaller model or quantization + +### Slow Generation + +**Solutions**: +- Increase `n_threads` up to your CPU core count +- Enable GPU offload with `n_gpu_layers` +- Reduce `max_tokens` +- Use a smaller model + +### Import Error + +``` +ModuleNotFoundError: No module named 'llama_cpp' +``` + +**Solution**: +```bash +cd python +venv\Scripts\activate +pip install llama-cpp-python +``` + +## Example Use Cases + +### Agent Decision Making + +```python +def get_agent_action(observation): + prompt = f"""You are an AI agent in a game world. + +Current observation: {observation} + +Available actions: +- move_to(x, y, z): Move to coordinates +- pickup_item(name): Pick up an item +- use_item(name): Use an item from inventory + +What action should you take? Respond with JSON: +{{"action": "action_name", "params": {{}}, "reasoning": "why"}} +""" + + result = backend.generate(prompt, temperature=0.5, max_tokens=200) + return result.text +``` + +### Conversation System + +```python +def chat_with_agent(messages): + # Format conversation for Llama-2 chat format + prompt = "" + for msg in messages: + if msg["role"] == "user": + prompt += f"[INST] {msg['content']} [/INST]" + else: + prompt += f" {msg['content']}" + + result = backend.generate(prompt, max_tokens=300) + return result.text +``` + +## Next Steps + +1. ✅ Test the backend with `python test_llama_backend.py` +2. Integrate with your agent runtime +3. Experiment with different prompts and temperatures +4. 
Consider GPU acceleration for production use + +## Resources + +- [llama.cpp GitHub](https://github.com/ggerganov/llama.cpp) +- [llama-cpp-python Documentation](https://llama-cpp-python.readthedocs.io/) +- [GGUF Model Download](https://huggingface.co/TheBloke) +- [Model Quantization Guide](https://github.com/ggerganov/llama.cpp#quantization) diff --git a/docs/vllm_backend.md b/docs/vllm_backend.md new file mode 100644 index 0000000..ebbe3ec --- /dev/null +++ b/docs/vllm_backend.md @@ -0,0 +1,311 @@ +# vLLM Backend Integration + +This document describes how to use the vLLM backend for high-throughput LLM inference in Agent Arena. + +## Overview + +vLLM is a high-performance inference engine optimized for serving large language models at scale. It provides: + +- **High throughput**: PagedAttention and continuous batching +- **OpenAI-compatible API**: Drop-in replacement for OpenAI API +- **Multiple model support**: Llama, Mistral, Qwen, and more +- **Function calling**: Native support for tool/function calling +- **GPU acceleration**: Optimized CUDA kernels + +## Requirements + +### System Requirements + +- **GPU**: NVIDIA GPU with CUDA support (8GB+ VRAM recommended) +- **CUDA**: Version 11.8 or 12.1 +- **Python**: 3.8-3.11 +- **OS**: Linux (recommended) or Windows with WSL2 + +### Installation + +```bash +# Activate your virtual environment +cd python +venv\Scripts\activate # Windows +# source venv/bin/activate # Linux/Mac + +# Install vLLM (requires CUDA) +pip install vllm + +# For specific CUDA version (e.g., CUDA 12.1) +pip install vllm-cuda121 +``` + +**Note**: vLLM requires a CUDA-capable GPU. It does not support CPU-only inference. 
+ +## Starting the vLLM Server + +### Option 1: Using the helper script + +```bash +cd python +python run_vllm_server.py --model meta-llama/Llama-2-7b-chat-hf --port 8000 +``` + +### Option 2: Direct command + +```bash +python -m vllm.entrypoints.openai.api_server \ + --model meta-llama/Llama-2-7b-chat-hf \ + --port 8000 \ + --gpu-memory-utilization 0.9 \ + --max-model-len 4096 +``` + +### Common Arguments + +- `--model`: Model name from HuggingFace or local path +- `--port`: Server port (default: 8000) +- `--tensor-parallel-size`: Number of GPUs to use +- `--gpu-memory-utilization`: GPU memory fraction (0.0-1.0) +- `--max-model-len`: Maximum context length +- `--dtype`: Data type (auto, half, float16, bfloat16, float32) + +## Configuration + +### Hydra Config File + +Edit `configs/backend/vllm.yaml`: + +```yaml +backend: + type: vllm + model: "meta-llama/Llama-2-7b-chat-hf" + + # Server settings + host: "localhost" + port: 8000 + api_base: "http://localhost:8000/v1" + + # Model parameters + tensor_parallel_size: 1 # Number of GPUs + dtype: "auto" + max_model_len: 4096 + gpu_memory_utilization: 0.9 + + # Generation + temperature: 0.7 + top_p: 0.9 + max_tokens: 512 + + # Function calling + function_calling: + enabled: true + format: "json" +``` + +### Python Code + +```python +from backends import VLLMBackend, VLLMBackendConfig + +# Create configuration +config = VLLMBackendConfig( + model_path="meta-llama/Llama-2-7b-chat-hf", + api_base="http://localhost:8000/v1", + temperature=0.7, + max_tokens=512, +) + +# Initialize backend +backend = VLLMBackend(config) + +# Check availability +if backend.is_available(): + print("vLLM server is ready!") + +# Generate text +result = backend.generate("Hello, my name is") +print(result.text) + +# Generate with tools +tools = [ + { + "name": "move_to", + "description": "Move agent to coordinates", + "parameters": { + "type": "object", + "properties": { + "target": { + "type": "array", + "items": {"type": "number"}, + "description": 
"Target [x, y, z] coordinates" + } + }, + "required": ["target"] + } + } +] + +result = backend.generate_with_tools( + "I need to move to coordinates (10, 20, 5)", + tools +) + +if "tool_call" in result.metadata: + print(f"Tool: {result.metadata['tool_call']['name']}") + print(f"Arguments: {result.metadata['tool_call']['arguments']}") +``` + +## Supported Models + +vLLM supports many model architectures. Popular choices: + +### Llama Models +- `meta-llama/Llama-2-7b-chat-hf` +- `meta-llama/Llama-2-13b-chat-hf` +- `meta-llama/Meta-Llama-3-8B-Instruct` + +### Mistral Models +- `mistralai/Mistral-7B-Instruct-v0.2` +- `mistralai/Mixtral-8x7B-Instruct-v0.1` + +### Qwen Models +- `Qwen/Qwen2-7B-Instruct` +- `Qwen/Qwen2.5-7B-Instruct` + +### Function Calling Models +For best function calling support, use models trained for tool use: +- `NousResearch/Hermes-2-Pro-Llama-3-8B` +- `gorilla-llm/gorilla-openfunctions-v2` + +## Performance Tuning + +### GPU Memory + +Adjust `gpu_memory_utilization` based on your VRAM: + +- **8GB GPU**: 0.7-0.8 +- **16GB GPU**: 0.85-0.9 +- **24GB+ GPU**: 0.9-0.95 + +### Multi-GPU + +For multiple GPUs, use tensor parallelism: + +```bash +python -m vllm.entrypoints.openai.api_server \ + --model meta-llama/Llama-2-13b-chat-hf \ + --tensor-parallel-size 2 # Use 2 GPUs +``` + +### Context Length + +Reduce `max_model_len` if running out of memory: + +```yaml +max_model_len: 2048 # Instead of 4096 +``` + +## Function Calling + +vLLM supports OpenAI-style function calling for compatible models. 
+ +### Native Function Calling + +```python +tools = [ + { + "name": "get_weather", + "description": "Get current weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + } + } + } +] + +result = backend.generate_with_tools( + "What's the weather in Paris?", + tools +) + +if "tool_call" in result.metadata: + tool_call = result.metadata["tool_call"] + print(f"Calling {tool_call['name']} with {tool_call['arguments']}") +``` + +### Fallback Method + +If the model doesn't support native function calling, the backend automatically falls back to prompt-based tool calling. + +## Troubleshooting + +### Server Not Starting + +``` +Error: CUDA out of memory +``` + +**Solution**: Reduce `gpu_memory_utilization` or `max_model_len` + +### Connection Refused + +``` +ConnectionError: Cannot connect to vLLM server +``` + +**Solution**: +1. Check if server is running: `curl http://localhost:8000/v1/models` +2. Verify port is correct +3. Check firewall settings + +### Slow Generation + +**Solutions**: +- Enable tensor parallelism for multi-GPU +- Reduce `max_model_len` +- Use quantized models (e.g., AWQ, GPTQ) +- Check GPU utilization with `nvidia-smi` + +### Model Not Found + +``` +OSError: meta-llama/Llama-2-7b-chat-hf is not a local folder +``` + +**Solution**: +1. Model will be downloaded from HuggingFace on first run +2. Ensure you have a HuggingFace token for gated models +3. Or download manually and use local path + +## Testing + +Run the vLLM backend tests: + +```bash +# Start vLLM server first +python run_vllm_server.py --model meta-llama/Llama-2-7b-chat-hf + +# In another terminal +cd python +venv\Scripts\activate +pytest ../tests/test_vllm_backend.py -v +``` + +Tests will be skipped if the server is not available. 
+ +## Comparison with llama.cpp + +| Feature | vLLM | llama.cpp | +|---------|------|-----------| +| **Performance** | High throughput, optimized for serving | Good single-request performance | +| **Hardware** | Requires CUDA GPU | CPU + optional GPU | +| **Memory** | Higher VRAM usage | Lower memory footprint | +| **Batching** | Continuous batching | Manual batching | +| **Setup** | Requires server | Direct library | +| **Use Case** | Production serving, multiple agents | Development, single agent | + +## References + +- [vLLM Documentation](https://docs.vllm.ai/) +- [vLLM GitHub](https://github.com/vllm-project/vllm) +- [OpenAI API Compatibility](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html) +- [Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html) diff --git a/godot/include/agent_arena.h b/godot/include/agent_arena.h index 6293e51..b80fe14 100644 --- a/godot/include/agent_arena.h +++ b/godot/include/agent_arena.h @@ -180,12 +180,14 @@ class IPCClient : public godot::Node { private: godot::String server_url; godot::HTTPRequest* http_request; + godot::HTTPRequest* http_request_tool; // Separate request node for tool execution bool is_connected; uint64_t current_tick; godot::Dictionary pending_response; bool response_received; void _on_request_completed(int result, int response_code, const godot::PackedStringArray& headers, const godot::PackedByteArray& body); + void _on_tool_request_completed(int result, int response_code, const godot::PackedStringArray& headers, const godot::PackedByteArray& body); protected: static void _bind_methods(); diff --git a/godot/src/agent_arena.cpp b/godot/src/agent_arena.cpp index d281029..5aa2e7b 100644 --- a/godot/src/agent_arena.cpp +++ b/godot/src/agent_arena.cpp @@ -362,6 +362,7 @@ void ToolRegistry::set_ipc_client(IPCClient* client) { IPCClient::IPCClient() : server_url("http://127.0.0.1:5000"), http_request(nullptr), + http_request_tool(nullptr), is_connected(false), 
current_tick(0), response_received(false) { @@ -371,6 +372,9 @@ IPCClient::~IPCClient() { if (http_request != nullptr) { http_request->queue_free(); } + if (http_request_tool != nullptr) { + http_request_tool->queue_free(); + } } void IPCClient::_bind_methods() { @@ -390,6 +394,8 @@ void IPCClient::_bind_methods() { ClassDB::bind_method(D_METHOD("_on_request_completed", "result", "response_code", "headers", "body"), &IPCClient::_on_request_completed); + ClassDB::bind_method(D_METHOD("_on_tool_request_completed", "result", "response_code", "headers", "body"), + &IPCClient::_on_tool_request_completed); ADD_PROPERTY(PropertyInfo(Variant::STRING, "server_url"), "set_server_url", "get_server_url"); @@ -398,7 +404,7 @@ void IPCClient::_bind_methods() { } void IPCClient::_ready() { - // Create HTTPRequest node + // Create HTTPRequest node for general requests (health check, tick) http_request = memnew(HTTPRequest); add_child(http_request); @@ -406,6 +412,14 @@ void IPCClient::_ready() { http_request->connect("request_completed", Callable(this, "_on_request_completed")); + // Create separate HTTPRequest node for tool execution + http_request_tool = memnew(HTTPRequest); + add_child(http_request_tool); + + // Connect signal for tool requests + http_request_tool->connect("request_completed", + Callable(this, "_on_tool_request_completed")); + UtilityFunctions::print("IPCClient initialized with server URL: ", server_url); } @@ -515,6 +529,42 @@ void IPCClient::_on_request_completed(int result, int response_code, } } +void IPCClient::_on_tool_request_completed(int result, int response_code, + const PackedStringArray& headers, + const PackedByteArray& body) { + UtilityFunctions::print("[C++] Tool request callback triggered - result: ", result, ", code: ", response_code); + + if (result != HTTPRequest::RESULT_SUCCESS) { + UtilityFunctions::print("Tool HTTP Request failed with result: ", result); + return; + } + + if (response_code == 200) { + // Parse JSON response + String 
body_string = body.get_string_from_utf8(); + + // Parse JSON + JSON json; + Error err = json.parse(body_string); + + if (err == OK) { + Variant data = json.get_data(); + if (data.get_type() == Variant::DICTIONARY) { + Dictionary tool_response = data; + UtilityFunctions::print("Tool execution response received: ", tool_response); + // Could emit a signal here for async handling + emit_signal("response_received", tool_response); + } else { + UtilityFunctions::print("Invalid tool response JSON format"); + } + } else { + UtilityFunctions::print("Failed to parse tool response JSON"); + } + } else { + UtilityFunctions::print("Tool HTTP request returned error code: ", response_code); + } +} + Dictionary IPCClient::execute_tool_sync(const String& tool_name, const Dictionary& params, const String& agent_id, uint64_t tick) { Dictionary result; @@ -532,12 +582,12 @@ Dictionary IPCClient::execute_tool_sync(const String& tool_name, const Dictionar String json_str = JSON::stringify(request_dict); - // Send POST request using main http_request + // Send POST request using separate http_request_tool to avoid conflicts String url = server_url + "/tools/execute"; PackedStringArray headers; headers.append("Content-Type: application/json"); - Error err = http_request->request(url, headers, HTTPClient::METHOD_POST, json_str); + Error err = http_request_tool->request(url, headers, HTTPClient::METHOD_POST, json_str); if (err != OK) { UtilityFunctions::print("Error sending tool execution request: ", err); diff --git a/python/backends/__init__.py b/python/backends/__init__.py index f404cb2..b4c4f30 100644 --- a/python/backends/__init__.py +++ b/python/backends/__init__.py @@ -2,7 +2,8 @@ LLM Backend Adapters for Agent Arena """ -from .base import BaseBackend +from .base import BaseBackend, BackendConfig from .llama_cpp_backend import LlamaCppBackend +from .vllm_backend import VLLMBackend, VLLMBackendConfig -__all__ = ["BaseBackend", "LlamaCppBackend"] +__all__ = ["BaseBackend", 
"BackendConfig", "LlamaCppBackend", "VLLMBackend", "VLLMBackendConfig"] diff --git a/python/backends/base.py b/python/backends/base.py index 8439f7e..cf55cb1 100644 --- a/python/backends/base.py +++ b/python/backends/base.py @@ -16,6 +16,7 @@ class BackendConfig: max_tokens: int = 512 top_p: float = 0.9 top_k: int = 40 + n_gpu_layers: int = 0 # Number of layers to offload to GPU (0 = CPU only, -1 = all) @dataclass diff --git a/python/backends/llama_cpp_backend.py b/python/backends/llama_cpp_backend.py index 2a9dc70..3437e3a 100644 --- a/python/backends/llama_cpp_backend.py +++ b/python/backends/llama_cpp_backend.py @@ -27,11 +27,22 @@ def _load_model(self) -> None: logger.info(f"Loading model from {self.config.model_path}") + # Use GPU layers from config + n_gpu_layers = getattr(self.config, 'n_gpu_layers', 0) + + if n_gpu_layers > 0: + logger.info(f"Offloading {n_gpu_layers} layers to GPU") + elif n_gpu_layers == -1: + logger.info("Offloading all layers to GPU") + else: + logger.info("Using CPU only (no GPU offload)") + self.llm = Llama( model_path=self.config.model_path, n_ctx=4096, # Context window n_threads=8, # CPU threads - n_gpu_layers=0, # GPU layers (0 = CPU only) + n_gpu_layers=n_gpu_layers, # GPU layers (0 = CPU only, -1 = all) + verbose=False, # Reduce output noise ) logger.info("Model loaded successfully") diff --git a/python/backends/vllm_backend.py b/python/backends/vllm_backend.py new file mode 100644 index 0000000..c543318 --- /dev/null +++ b/python/backends/vllm_backend.py @@ -0,0 +1,344 @@ +""" +vLLM backend adapter using OpenAI-compatible API. + +vLLM is a high-throughput inference engine that provides an OpenAI-compatible +REST API. This backend connects to a vLLM server instance. 
class VLLMBackendConfig(BackendConfig):
    """Backend settings plus the connection details of a vLLM server."""

    def __init__(
        self,
        model_path: str,
        api_base: str = "http://localhost:8000/v1",
        api_key: str = "EMPTY",
        temperature: float = 0.7,
        max_tokens: int = 512,
        top_p: float = 0.9,
        top_k: int = 40,
    ):
        """
        Build a config for a vLLM server reached over its OpenAI-style API.

        Args:
            model_path: Model identifier (e.g., "meta-llama/Llama-2-7b-chat-hf")
            api_base: Base URL for vLLM server
            api_key: API key (vLLM uses "EMPTY" by default)
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate
            top_p: Nucleus sampling parameter
            top_k: Top-k sampling parameter
        """
        # Sampling defaults are owned by the base config; forward them as-is.
        sampling_defaults = {
            "temperature": temperature,
            "max_tokens": max_tokens,
            "top_p": top_p,
            "top_k": top_k,
        }
        super().__init__(model_path=model_path, **sampling_defaults)

        # Connection details exist only on this subclass.
        self.api_base = api_base
        self.api_key = api_key
def generate(
    self,
    prompt: str,
    temperature: float | None = None,
    max_tokens: int | None = None,
) -> GenerationResult:
    """
    Run a plain text completion against the vLLM server.

    Args:
        prompt: Input prompt
        temperature: Per-call temperature; None uses the configured value
        max_tokens: Per-call token limit; None uses the configured value

    Returns:
        GenerationResult with the generated text and metadata. Any failure
        during the request is logged and reported as a result whose
        finish_reason is "error" and whose metadata carries the message.

    Raises:
        RuntimeError: If no client is connected.
    """
    if not self.client:
        raise RuntimeError("vLLM client not connected")

    # Explicit arguments win; None means "fall back to the config".
    effective_temperature = self.config.temperature if temperature is None else temperature
    token_limit = self.config.max_tokens if max_tokens is None else max_tokens

    try:
        completion = self.client.completions.create(
            model=self.config.model_path,
            prompt=prompt,
            temperature=effective_temperature,
            max_tokens=token_limit,
            top_p=self.config.top_p,
            # top_k is sent through extra_body rather than as a named argument.
            extra_body={"top_k": self.config.top_k},
        )

        first_choice = completion.choices[0]
        usage = completion.usage

        return GenerationResult(
            text=first_choice.text,
            tokens_used=usage.total_tokens if usage else 0,
            finish_reason=first_choice.finish_reason or "stop",
            metadata={
                "model": self.config.model_path,
                "api_base": self.config.api_base,
            },
        )

    except Exception as e:
        # Best-effort contract: callers get an "error" result, not an exception.
        logger.error(f"Generation error: {e}")
        return GenerationResult(
            text="",
            tokens_used=0,
            finish_reason="error",
            metadata={"error": str(e)},
        )
def _generate_with_tools_fallback(
    self,
    prompt: str,
    tools: list[dict[str, Any]],
    temperature: float,
) -> "GenerationResult":
    """
    Fallback method for tool calling using prompt engineering.

    Used when the model doesn't support native function calling. The tool
    schemas are appended to the prompt, the model is asked to answer with a
    JSON object, and that object is recovered from the reply.

    Args:
        prompt: Input prompt
        tools: List of available tool schemas; each needs "name" and
            "description", "parameters" is optional
        temperature: Sampling temperature

    Returns:
        The GenerationResult from ``self.generate``. On success
        ``metadata["parsed_tool_call"]`` holds the decoded JSON; otherwise
        ``metadata["parse_error"]`` is set to True.
    """
    # Build a prompt that includes tool schemas
    tool_descriptions = []
    for tool in tools:
        tool_desc = f"- {tool['name']}: {tool['description']}"
        if "parameters" in tool:
            tool_desc += f"\n Parameters: {json.dumps(tool['parameters'])}"
        tool_descriptions.append(tool_desc)

    tools_text = "\n".join(tool_descriptions)

    enhanced_prompt = f"""{prompt}

Available tools:
{tools_text}

Respond with a JSON object in the format:
{{"tool": "tool_name", "params": {{}}, "reasoning": "why this tool"}}

Or if no tool is needed:
{{"tool": "none", "reasoning": "explanation"}}
"""

    result = self.generate(enhanced_prompt, temperature)

    text = result.text.strip()
    # Remove markdown code fences when the reply is exactly one fenced block.
    if text.startswith("```json"):
        text = text[7:]
    elif text.startswith("```"):
        text = text[3:]
    if text.endswith("```"):
        text = text[:-3]
    text = text.strip()

    # First try the reply as-is. If the model wrapped the JSON in prose
    # ("Sure, here you go: {...}"), retry with the outermost {...} span.
    candidates = [text]
    first_brace = text.find("{")
    last_brace = text.rfind("}")
    if 0 <= first_brace < last_brace:
        embedded = text[first_brace:last_brace + 1]
        if embedded != text:
            candidates.append(embedded)

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        result.metadata["parsed_tool_call"] = parsed
        return result

    logger.warning("Failed to parse tool call JSON from response")
    result.metadata["parse_error"] = True
    return result
def main():
    """
    Start the Agent Arena IPC server backed by a GPU-accelerated llama.cpp model.

    Parses the command line, loads the LLM backend, registers the movement,
    inventory and world-query tools, registers one test agent and then runs
    the server until it stops.

    The backend is unloaded on every exit path, including a normal return of
    ``server.run()``. Exits with status 0 after Ctrl+C and status 1 after any
    other error.
    """
    parser = argparse.ArgumentParser(
        description="Agent Arena IPC Server with GPU-Accelerated LLM Backend"
    )
    parser.add_argument(
        "--host",
        type=str,
        default="127.0.0.1",
        help="Host address to bind to (default: 127.0.0.1)",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=5000,
        help="Port to listen on (default: 5000)",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=4,
        help="Maximum number of concurrent agent workers (default: 4)",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable debug logging",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="../models/llama-2-7b-chat.Q4_K_M.gguf",
        help="Path to GGUF model file (default: ../models/llama-2-7b-chat.Q4_K_M.gguf)"
    )
    parser.add_argument(
        "--gpu-layers",
        type=int,
        default=-1,
        help="Number of layers to offload to GPU: -1=all, 0=CPU only (default: -1)"
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.7,
        help="LLM temperature for decision making (default: 0.7)"
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=256,
        help="Maximum tokens to generate per decision (default: 256)"
    )

    args = parser.parse_args()

    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)

    logger.info("=" * 60)
    logger.info("Agent Arena IPC Server (GPU-Accelerated)")
    logger.info("=" * 60)
    logger.info(f"Host: {args.host}")
    logger.info(f"Port: {args.port}")
    logger.info(f"Max Workers: {args.workers}")
    logger.info(f"Model: {args.model}")
    logger.info(f"GPU Layers: {args.gpu_layers} ({'all' if args.gpu_layers == -1 else 'CPU only' if args.gpu_layers == 0 else args.gpu_layers})")
    logger.info(f"Temperature: {args.temperature}")
    logger.info(f"Max Tokens: {args.max_tokens}")
    logger.info("=" * 60)

    # Sentinel instead of probing locals(): stays None until the model loads.
    backend = None
    # None means "return normally"; an int means "exit with this status".
    exit_code = None

    try:
        # Create GPU-accelerated backend configuration
        backend_config = BackendConfig(
            model_path=args.model,
            temperature=args.temperature,
            max_tokens=args.max_tokens,
            n_gpu_layers=args.gpu_layers
        )

        logger.info("Loading GPU-accelerated LLM backend...")
        backend = LlamaCppBackend(backend_config)
        logger.info("✓ Backend loaded successfully")

        # Create runtime
        runtime = AgentRuntime(max_workers=args.workers)

        # Create tool dispatcher and register all tools
        tool_dispatcher = ToolDispatcher()
        register_movement_tools(tool_dispatcher)
        register_inventory_tools(tool_dispatcher)
        register_world_query_tools(tool_dispatcher)
        logger.info(f"✓ Registered {len(tool_dispatcher.tools)} tools")

        # Create a test agent with GPU backend
        test_agent = Agent(
            agent_id="gpu_agent_001",
            backend=backend,
            tools=list(tool_dispatcher.tools.keys()),
            goals=["explore the world", "collect resources", "survive"]
        )

        runtime.register_agent(test_agent)
        logger.info(f"✓ Registered agent '{test_agent.state.agent_id}' with GPU backend and {len(test_agent.available_tools)} tools")

        logger.info("=" * 60)
        logger.info("Server ready! You can now:")
        logger.info(" 1. Run Godot test scenes")
        logger.info(" 2. Send POST requests to /tick with agent observations")
        logger.info(" 3. Execute tools via POST /tools/execute")
        logger.info("=" * 60)

        # Create and start server
        server = create_server(runtime=runtime, host=args.host, port=args.port)
        logger.info("Starting IPC server...")
        server.run()

    except KeyboardInterrupt:
        logger.info("\nShutting down gracefully...")
        exit_code = 0
    except Exception as e:
        logger.error(f"Fatal error: {e}", exc_info=True)
        exit_code = 1
    finally:
        # Release the model on every path, including a clean server shutdown.
        if backend is not None:
            logger.info("Unloading LLM backend...")
            backend.unload()

    if exit_code is not None:
        sys.exit(exit_code)
def main(argv=None):
    """
    Parse the CLI options and launch vLLM's OpenAI-compatible API server.

    The server runs as a child process of the current interpreter, so it uses
    the same environment (e.g. the active virtualenv) as this script.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``; ``None``
            keeps the normal command-line behaviour.

    Exits with status 1 when vLLM cannot be imported or the server fails.
    """
    import subprocess

    parser = argparse.ArgumentParser(description="Start vLLM inference server")

    # Model configuration
    parser.add_argument(
        "--model",
        type=str,
        default="meta-llama/Llama-2-7b-chat-hf",
        help="Model name or path (e.g., meta-llama/Llama-2-7b-chat-hf)",
    )

    # Server configuration
    parser.add_argument(
        "--host",
        type=str,
        default="localhost",
        help="Server host (default: localhost)",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Server port (default: 8000)",
    )

    # Performance configuration
    parser.add_argument(
        "--tensor-parallel-size",
        type=int,
        default=1,
        help="Number of GPUs to use for tensor parallelism (default: 1)",
    )
    parser.add_argument(
        "--gpu-memory",
        type=float,
        default=0.9,
        help="GPU memory utilization (0.0-1.0, default: 0.9)",
    )
    parser.add_argument(
        "--max-model-len",
        type=int,
        default=4096,
        help="Maximum model context length (default: 4096)",
    )
    parser.add_argument(
        "--dtype",
        type=str,
        default="auto",
        choices=["auto", "half", "float16", "bfloat16", "float32"],
        help="Data type for model weights (default: auto)",
    )

    # Additional options
    parser.add_argument(
        "--trust-remote-code",
        action="store_true",
        help="Trust remote code when loading model",
    )
    # store_true with default=True could never be switched off;
    # BooleanOptionalAction also accepts --no-enable-function-calling.
    parser.add_argument(
        "--enable-function-calling",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Enable function calling support (default: %(default)s)",
    )

    args = parser.parse_args(argv)

    try:
        # Check if vLLM is installed
        try:
            import vllm
        except ImportError:
            logger.error(
                "vLLM is not installed. Install with: pip install vllm\n"
                "Note: vLLM requires CUDA and is not available on CPU-only systems."
            )
            sys.exit(1)
        logger.info(f"vLLM version: {vllm.__version__}")

        logger.info(f"Starting vLLM server for model: {args.model}")
        logger.info(f"Server will be available at: http://{args.host}:{args.port}")
        logger.info(f"GPU memory utilization: {args.gpu_memory}")
        logger.info(f"Tensor parallel size: {args.tensor_parallel_size}")
        logger.info(f"Max model length: {args.max_model_len}")
        logger.info(f"Data type: {args.dtype}")

        # Build command-line arguments for vLLM
        vllm_args = [
            "--model", args.model,
            "--host", args.host,
            "--port", str(args.port),
            "--tensor-parallel-size", str(args.tensor_parallel_size),
            "--gpu-memory-utilization", str(args.gpu_memory),
            "--max-model-len", str(args.max_model_len),
            "--dtype", args.dtype,
        ]

        if args.trust_remote_code:
            vllm_args.append("--trust-remote-code")

        if args.enable_function_calling:
            vllm_args.extend(["--enable-auto-tool-choice", "--tool-call-parser", "hermes"])

        # Use the interpreter running this script, not whatever "python" is
        # first on PATH, so the child sees the same installed packages.
        command = [sys.executable or "python", "-m", "vllm.entrypoints.openai.api_server"] + vllm_args
        logger.info(f"Launching: {' '.join(command)}")

        subprocess.run(command, check=True)

    except KeyboardInterrupt:
        logger.info("\nShutting down vLLM server...")
    except Exception as e:
        logger.error(f"Error starting vLLM server: {e}")
        sys.exit(1)
def create_tool_dispatcher() -> ToolDispatcher:
    """
    Create a ToolDispatcher with three sample game tools.

    Registers ``move_to``, ``collect_resource`` and ``check_inventory``
    together with their JSON parameter/return schemas. The tools are mocks
    that return canned dictionaries.

    Returns:
        The populated dispatcher.
    """
    import math

    dispatcher = ToolDispatcher()

    # Tool 1: Move to position
    def move_to(target_x: float, target_y: float, speed: float = 1.0) -> dict:
        """Move agent to target position (distance is measured from the origin)."""
        # A zero or negative speed would divide by zero or give a negative
        # time estimate; report it like the other tools report bad input.
        if speed <= 0:
            return {
                "success": False,
                "message": f"Invalid speed: {speed}",
                "error": "Speed must be positive"
            }
        distance = math.hypot(target_x, target_y)
        time_estimate = distance / speed
        return {
            "success": True,
            "message": f"Moving to ({target_x}, {target_y}) at speed {speed}",
            "estimated_time": time_estimate,
            "distance": distance
        }

    dispatcher.register_tool(
        name="move_to",
        function=move_to,
        description="Move the agent to a target position in 2D space",
        parameters={
            "type": "object",
            "properties": {
                "target_x": {
                    "type": "number",
                    "description": "Target X coordinate"
                },
                "target_y": {
                    "type": "number",
                    "description": "Target Y coordinate"
                },
                "speed": {
                    "type": "number",
                    "description": "Movement speed (default 1.0)",
                    "default": 1.0
                }
            },
            "required": ["target_x", "target_y"]
        },
        returns={
            "type": "object",
            "properties": {
                "success": {"type": "boolean"},
                "message": {"type": "string"},
                "estimated_time": {"type": "number"},
                "distance": {"type": "number"}
            }
        }
    )

    # Tool 2: Collect resource
    def collect_resource(resource_name: str) -> dict:
        """Collect a resource from the environment."""
        valid_resources = ["wood", "stone", "food"]
        if resource_name in valid_resources:
            return {
                "success": True,
                "message": f"Collected {resource_name}",
                "resource": resource_name,
                "quantity": 1
            }
        else:
            return {
                "success": False,
                "message": f"Unknown resource: {resource_name}",
                "error": "Invalid resource type"
            }

    dispatcher.register_tool(
        name="collect_resource",
        function=collect_resource,
        description="Collect a resource (wood, stone, or food) from the current location",
        parameters={
            "type": "object",
            "properties": {
                "resource_name": {
                    "type": "string",
                    "description": "Name of resource to collect (wood, stone, or food)",
                    "enum": ["wood", "stone", "food"]
                }
            },
            "required": ["resource_name"]
        },
        returns={
            "type": "object",
            "properties": {
                "success": {"type": "boolean"},
                "message": {"type": "string"},
                "resource": {"type": "string"},
                "quantity": {"type": "number"}
            }
        }
    )

    # Tool 3: Check inventory
    def check_inventory() -> dict:
        """Check current inventory (mock data for demo)."""
        return {
            "success": True,
            "inventory": {
                "wood": 5,
                "stone": 3,
                "food": 2
            },
            "total_items": 10
        }

    dispatcher.register_tool(
        name="check_inventory",
        function=check_inventory,
        description="Check the agent's current inventory",
        parameters={
            "type": "object",
            "properties": {}
        },
        returns={
            "type": "object",
            "properties": {
                "success": {"type": "boolean"},
                "inventory": {"type": "object"},
                "total_items": {"type": "number"}
            }
        }
    )

    logger.info(f"Created ToolDispatcher with {len(dispatcher.tools)} tools")
    return dispatcher
============================================================ + +class EnhancedAgent(Agent): + """ + Enhanced Agent that properly integrates with LLM backend and tools. + + This extends the base Agent class to implement actual backend communication. + """ + + def __init__(self, agent_id: str, backend, tool_dispatcher: ToolDispatcher, goals: list[str] | None = None): + # Get available tool names from dispatcher + available_tools = list(tool_dispatcher.tools.keys()) + + super().__init__( + agent_id=agent_id, + backend=backend, + tools=available_tools, + goals=goals + ) + + self.tool_dispatcher = tool_dispatcher + + def _query_llm(self, context: str) -> str: + """ + Query the LLM backend with context and tool information. + + This overrides the placeholder in the base Agent class. + """ + # Get tool schemas for the prompt + tool_schemas = self.tool_dispatcher.export_schemas_json() + + # Build the prompt with Llama-2 chat format + prompt = f"""[INST] You are an autonomous agent in a game world. You can use tools to interact with the environment. + +{context} + +Available tools (JSON format): +{tool_schemas} + +Respond with ONLY a JSON object in this exact format: +{{"tool": "tool_name", "params": {{"param1": value1}}, "reasoning": "why you chose this action"}} + +Choose the most appropriate tool based on your current observations and goals. 
def _extract_json(self, text: str) -> str:
    """
    Extract a JSON object from LLM output that may contain extra content.

    Recovery is attempted in three stages:
      1. parse the outermost ``{...}`` span as-is;
      2. re-parse it after inserting commas that are missing between a value
         and a following "tool"/"params"/"reasoning" key on the next line;
      3. rebuild a minimal ``{"tool": ..., "params": ...}`` object from
         regex matches. This stage also runs when the closing brace is
         missing (truncated output).

    Args:
        text: Raw model response.

    Returns:
        A JSON string when one could be recovered, otherwise ``text`` unchanged.
    """
    import re

    start = text.find('{')
    end = text.rfind('}')

    if start != -1 and end > start:
        json_str = text[start:end + 1]

        # Stage 1: the span is already valid JSON.
        try:
            json.loads(json_str)
            return json_str
        except json.JSONDecodeError as e:
            logger.debug(f"JSON parse error: {e}")

        # Stage 2: missing comma between fields.
        # Pattern: value\n"field" should be value,\n"field". The value may end
        # in a digit, a quote, or a closing brace/bracket (e.g. after "params").
        fixed_json = re.sub(
            r'([\d"}\]])\s*\n\s*("(?:reasoning|tool|params))', r'\1,\n\2', json_str
        )
        try:
            json.loads(fixed_json)
        except json.JSONDecodeError:
            pass
        else:
            logger.debug("Fixed JSON by adding missing commas")
            return fixed_json

    # Stage 3: extract key-value pairs manually.
    tool_match = re.search(r'"tool"\s*:\s*"([^"]+)"', text)
    if tool_match:
        tool = tool_match.group(1)
        params = {}

        # Signed decimal; unlike [\d.]+ it accepts negative coordinates and
        # never captures something float() rejects (e.g. "1.2.3" or ".").
        number = r'(-?(?:\d+\.?\d*|\.\d+))'

        # Extract parameters based on tool type
        if tool == "collect_resource":
            resource_match = re.search(r'"resource_name"\s*:\s*"([^"]+)"', text)
            if resource_match:
                params = {"resource_name": resource_match.group(1)}
        elif tool == "move_to":
            target_x_match = re.search(r'"target_x"\s*:\s*' + number, text)
            target_y_match = re.search(r'"target_y"\s*:\s*' + number, text)
            if target_x_match and target_y_match:
                params = {
                    "target_x": float(target_x_match.group(1)),
                    "target_y": float(target_y_match.group(1))
                }

        reconstructed = {
            "tool": tool,
            "params": params
        }

        logger.debug(f"Reconstructed JSON from pattern matching: {reconstructed}")
        return json.dumps(reconstructed)

    # Nothing recoverable: return original text
    logger.warning(f"Could not extract valid JSON from: {text[:200]}...")
    return text
def test_scenario_2_navigation():
    """
    Scenario: Agent needs to move to a target location.

    Loads the model, feeds the agent one navigation observation, lets it pick
    an action and executes it. The backend is unloaded even if the agent or
    the tool raises.
    """
    print("\n" + "="*60)
    print("SCENARIO 2: Navigation")
    print("="*60)

    # Setup
    dispatcher = create_tool_dispatcher()

    config = BackendConfig(
        model_path="../models/llama-2-7b-chat.Q4_K_M.gguf",
        temperature=0.3,
        max_tokens=150,
        n_gpu_layers=-1
    )
    backend = LlamaCppBackend(config)

    try:
        agent = EnhancedAgent(
            agent_id="explorer_001",
            backend=backend,
            tool_dispatcher=dispatcher,
            goals=["explore the map", "find the tower at (10, 15)"]
        )

        # Simulate observations
        print("\n[Simulation] Agent receives navigation task...")
        agent.perceive({
            "position": {"x": 0, "y": 0},
            "target_location": {"x": 10, "y": 15},
            "obstacles": []
        }, source="navigation")

        # Agent decides action
        print("\n[Agent] Deciding navigation action...")
        action = agent.decide_action()

        if action:
            print(f"\n[Agent] Decided to use: {action.tool_name}")
            print(f"[Agent] Parameters: {action.parameters}")
            if action.reasoning:
                print(f"[Agent] Reasoning: {action.reasoning}")

            # Execute the action
            print("\n[Execution] Running tool...")
            result = agent.execute_action(action)
            print(f"[Result] {result}")
        else:
            print("\n[Agent] Failed to decide action")
    finally:
        # Always release the model so a failing scenario does not keep it loaded.
        backend.unload()

    print("\n" + "="*60)
+ """ + print("\n" + "="*60) + print("SCENARIO 3: Inventory Management") + print("="*60) + + # Setup + dispatcher = create_tool_dispatcher() + + config = BackendConfig( + model_path="../models/llama-2-7b-chat.Q4_K_M.gguf", + temperature=0.3, + max_tokens=150, + n_gpu_layers=-1 + ) + backend = LlamaCppBackend(config) + + agent = EnhancedAgent( + agent_id="crafter_001", + backend=backend, + tool_dispatcher=dispatcher, + goals=["craft a wooden tool", "check if we have enough materials"] + ) + + # Simulate observations + print("\n[Simulation] Agent wants to craft something...") + agent.perceive({ + "crafting_station": "workbench", + "recipe_requires": {"wood": 3, "stone": 1}, + "action": "prepare_crafting" + }, source="crafting") + + # Agent decides action + print("\n[Agent] Deciding what to do before crafting...") + action = agent.decide_action() + + if action: + print(f"\n[Agent] Decided to use: {action.tool_name}") + print(f"[Agent] Parameters: {action.parameters}") + if action.reasoning: + print(f"[Agent] Reasoning: {action.reasoning}") + + # Execute the action + print("\n[Execution] Running tool...") + result = agent.execute_action(action) + print(f"[Result] {result}") + else: + print("\n[Agent] Failed to decide action") + + backend.unload() + print("\n" + "="*60) + + +# ============================================================ +# Main Test Runner +# ============================================================ + +if __name__ == "__main__": + print("\n" + "="*60) + print("Agent + GPU Backend + Tools Integration Test") + print("="*60) + print("\nThis test demonstrates an autonomous agent using:") + print(" - GPU-accelerated Llama-2-7B backend (113 tok/s)") + print(" - ToolDispatcher with 3 sample tools") + print(" - Perception-Reasoning-Action loop") + print("\nRunning 3 scenarios...\n") + + try: + # Run test scenarios + test_scenario_1_resource_collection() + test_scenario_2_navigation() + test_scenario_3_inventory_check() + + print("\n" + "="*60) + print("All 
def _log_section(title):
    """Log *title* framed above and below by a 60-character rule."""
    rule = "=" * 60
    logger.info("\n" + rule)
    logger.info(title)
    logger.info(rule)


def _run_backend_tests(backend):
    """Run the four demo checks against an already-loaded backend.

    Covers plain generation, tool calling, a temperature sweep and a
    Llama-2 style multi-turn prompt. Results are only logged; nothing is
    asserted or returned. Exceptions propagate to the caller.
    """
    # Test 1: Basic text generation
    _log_section("Test 1: Basic Text Generation")

    prompt = "Hello! My name is"
    logger.info(f"Prompt: '{prompt}'")

    result = backend.generate(prompt, max_tokens=50)

    logger.info(f"Generated text: {result.text}")
    logger.info(f"Tokens used: {result.tokens_used}")
    logger.info(f"Finish reason: {result.finish_reason}")

    # Test 2: Tool calling
    _log_section("Test 2: Tool Calling (Function Calling)")

    tools = [
        {
            "name": "move_to",
            "description": "Move agent to target coordinates",
            "parameters": {
                "type": "object",
                "properties": {
                    "target": {
                        "type": "array",
                        "items": {"type": "number"},
                        "description": "Target [x, y, z] coordinates",
                    }
                },
                "required": ["target"],
            },
        },
        {
            "name": "pickup_item",
            "description": "Pick up an item from the world",
            "parameters": {
                "type": "object",
                "properties": {
                    "item_name": {
                        "type": "string",
                        "description": "Name of the item to pick up",
                    }
                },
                "required": ["item_name"],
            },
        },
    ]

    prompt = "I need to pick up the sword and then move to coordinates (10, 20, 5)"
    logger.info(f"Prompt: '{prompt}'")

    result = backend.generate_with_tools(prompt, tools, temperature=0.5)

    logger.info(f"Generated text: {result.text}")
    logger.info(f"Tokens used: {result.tokens_used}")

    if "parsed_tool_call" in result.metadata:
        logger.info(f"Parsed tool call: {result.metadata['parsed_tool_call']}")
    elif "parse_error" in result.metadata:
        logger.warning("Failed to parse tool call from response")

    # Test 3: Different temperatures
    _log_section("Test 3: Temperature Comparison")

    prompt = "The capital of France is"

    for temp in [0.1, 0.7, 1.0]:
        logger.info(f"\nTemperature: {temp}")
        result = backend.generate(prompt, temperature=temp, max_tokens=20)
        logger.info(f"Result: {result.text.strip()}")

    # Test 4: Conversation context
    _log_section("Test 4: Multi-turn Conversation")

    conversation = """[INST] You are a helpful AI assistant. [/INST] I understand. I'm here to help!
[INST] What is the weather like today? [/INST]"""

    result = backend.generate(conversation, max_tokens=100)
    logger.info(f"Assistant: {result.text}")


def main():
    """Load the llama.cpp backend, run the demo checks and always unload.

    Any exception raised while loading or testing is logged with its
    traceback instead of propagating, so the script exits normally.
    """
    logger.info("Starting llama.cpp backend test")

    # Configuration for the backend
    config = BackendConfig(
        model_path="../models/llama-2-7b-chat.Q4_K_M.gguf",  # Relative to python/ directory
        temperature=0.7,
        max_tokens=256,
        top_p=0.9,
        top_k=40,
    )

    logger.info(f"Loading model from: {config.model_path}")

    try:
        # Initialize the backend
        backend = LlamaCppBackend(config)

        # Check if backend is available
        if not backend.is_available():
            logger.error("Backend is not available!")
            return

        logger.info("Backend loaded successfully!")

        # The model is loaded from here on: release it whether or not the
        # checks succeed, so a failing check does not leave it resident.
        try:
            _run_backend_tests(backend)
            _log_section("All tests completed successfully!")
        finally:
            backend.unload()
            logger.info("Backend unloaded")

    except Exception as e:
        logger.error(f"Error during testing: {e}", exc_info=True)


if __name__ == "__main__":
    main()
def _speedup_text(speed: float, baseline: float) -> str:
    """Describe *speed* relative to *baseline* for the summary table.

    Returns ``"<ratio>x speedup"`` with two decimals, or ``"speedup n/a"``
    when the baseline is not positive and the ratio would be undefined.
    """
    if baseline <= 0:
        return "speedup n/a"
    return f"{speed / baseline:.2f}x speedup"


def main():
    """Benchmark CPU-only, partial and full GPU offload and print a summary.

    Each configuration is run through ``test_inference``, which returns a
    tokens-per-second figure. The summary compares the GPU runs to the CPU
    baseline.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("GPU Acceleration Test for llama.cpp")
    print(rule)

    model_path = "../models/llama-2-7b-chat.Q4_K_M.gguf"

    def run(n_gpu_layers, label):
        # The three runs differ only in how many layers go to the GPU.
        config = BackendConfig(
            model_path=model_path,
            temperature=0.7,
            max_tokens=100,
            n_gpu_layers=n_gpu_layers,
        )
        return test_inference(config, label)

    # Test 1: CPU only
    cpu_speed = run(0, "Test 1: CPU Only (0 GPU layers)")

    # Test 2: Partial GPU offload (20 layers)
    partial_speed = run(20, "Test 2: Partial GPU (20 layers)")

    # Test 3: Full GPU offload (-1 offloads all layers)
    full_speed = run(-1, "Test 3: Full GPU (all layers)")

    # Summary. test_inference reports 0 when it cannot compute a rate, so the
    # ratio is formatted through a helper that tolerates a zero baseline
    # instead of raising ZeroDivisionError after all three runs completed.
    print("\n" + rule)
    print("Performance Summary")
    print(rule)
    print(f"CPU only: {cpu_speed:.2f} tokens/sec (baseline)")
    print(f"Partial GPU: {partial_speed:.2f} tokens/sec ({_speedup_text(partial_speed, cpu_speed)})")
    print(f"Full GPU: {full_speed:.2f} tokens/sec ({_speedup_text(full_speed, cpu_speed)})")
    print(rule)

    if full_speed > cpu_speed * 2:
        print("\n✓ GPU acceleration is working! Significant speedup achieved.")
    elif full_speed > cpu_speed:
        print("\n⚠ GPU acceleration is working but speedup is modest.")
    else:
        print("\n✗ GPU acceleration may not be working properly.")


if __name__ == "__main__":
    main()
# Quick smoke test: load the model with every layer on the GPU, time a single
# short generation and classify the measured tokens/sec.

print("\n" + "="*60)
print("Quick GPU Test")
print("="*60 + "\n")

# Test with GPU
config_gpu = BackendConfig(
    model_path="../models/llama-2-7b-chat.Q4_K_M.gguf",
    temperature=0.7,
    max_tokens=50,
    n_gpu_layers=-1,  # All layers to GPU
)

# perf_counter() is monotonic and high-resolution, which suits timing short
# intervals better than the wall clock.
print("Loading model with GPU acceleration...")
start = time.perf_counter()
backend = LlamaCppBackend(config_gpu)
load_time = time.perf_counter() - start
print(f"Load time: {load_time:.2f}s\n")

# From here the model is loaded: make sure it is released even if generation
# or reporting fails.
try:
    prompt = "[INST] What is 2+2? Answer in one sentence. [/INST]"

    print("Generating response...")
    start = time.perf_counter()
    result = backend.generate(prompt, max_tokens=30)
    gen_time = time.perf_counter() - start

    # Guard against a zero-length interval so the rate is always defined.
    tokens_per_sec = result.tokens_used / gen_time if gen_time > 0 else 0

    print(f"\nPrompt: {prompt}")
    print(f"Response: {result.text.strip()}\n")
    print(f"Generation time: {gen_time:.2f}s")
    print(f"Tokens: {result.tokens_used}")
    print(f"Speed: {tokens_per_sec:.2f} tokens/sec")

    if tokens_per_sec > 50:
        print("\nSUCCESS: GPU acceleration is WORKING! Excellent speed!")
    elif tokens_per_sec > 20:
        print("\nSUCCESS: GPU acceleration appears to be working.")
    else:
        print("\nWARNING: Speed seems slow - GPU may not be fully utilized.")
finally:
    backend.unload()

print("\n" + "="*60)
No firewall is blocking the connection") print("\nTrying to test tools anyway...") + else: + print("[SUCCESS] Connected to IPC server!") + print("About to call test_tools()...") + tests_started = true test_tools() + print("test_tools() call completed") func test_tools(): print("\n=== Testing Tool Execution ===") @@ -85,7 +102,6 @@ func test_tools(): } var move_result = agent.call_tool("move_to", move_params) print("Request sent: ", move_result) - await get_tree().create_timer(0.5).timeout # Test 2: Pickup item tool print("\n[Test 2] Testing pickup_item tool...") @@ -94,19 +110,16 @@ func test_tools(): } var pickup_result = agent.call_tool("pickup_item", pickup_params) print("Request sent: ", pickup_result) - await get_tree().create_timer(0.5).timeout # Test 3: Stop movement tool print("\n[Test 3] Testing stop_movement tool...") var stop_result = agent.call_tool("stop_movement", {}) print("Request sent: ", stop_result) - await get_tree().create_timer(0.5).timeout # Test 4: Get inventory tool print("\n[Test 4] Testing get_inventory tool...") var inventory_result = agent.call_tool("get_inventory", {}) print("Request sent: ", inventory_result) - await get_tree().create_timer(0.5).timeout # Test 5: Direct ToolRegistry execution print("\n[Test 5] Testing navigate_to tool...") @@ -116,13 +129,10 @@ func test_tools(): print("Request sent: ", direct_result) print("\n=== All Tool Requests Sent ===") - print("Waiting for responses (check 'IPC Response Received' below)...") + print("Waiting for async responses from Python server...") print("Python server log should show tool executions") - - # Wait a bit for all responses - await get_tree().create_timer(2.0).timeout - print("\n=== Test Complete ===") - print("If you saw response_received callbacks above, tool execution works!") + print("Scene will stay running - press Q to quit when done") + print("\nWatch for '[IPC Response Received]' messages below...") func _on_response_received(response: Dictionary): print("\n[IPC Response 
@pytest.fixture
def vllm_config():
    """Create a vLLM config for testing."""
    return VLLMBackendConfig(
        model_path="meta-llama/Llama-2-7b-chat-hf",
        api_base="http://localhost:8000/v1",
        temperature=0.7,
        max_tokens=100,
    )


@pytest.fixture
def vllm_backend(vllm_config):
    """Create a vLLM backend instance, skipping the test if none is reachable."""
    # Only the calls that can fail for connectivity reasons live in the try.
    try:
        backend = VLLMBackend(vllm_config)
        available = backend.is_available()
    except Exception as e:
        pytest.skip(f"Could not connect to vLLM server: {e}")
    if not available:
        pytest.skip("vLLM server not available")
    return backend


def test_vllm_config_creation():
    """Test vLLM config initialization."""
    config = VLLMBackendConfig(
        model_path="test-model",
        api_base="http://test:8000/v1",
        api_key="test-key",
        temperature=0.5,
        max_tokens=256,
    )

    assert config.model_path == "test-model"
    assert config.api_base == "http://test:8000/v1"
    assert config.api_key == "test-key"
    assert config.temperature == 0.5
    assert config.max_tokens == 256


def test_vllm_backend_initialization(vllm_config):
    """Test vLLM backend can be initialized."""
    # Construction failure means "no server" and skips. The assertions stay
    # outside the try: AssertionError is an Exception, so asserting inside it
    # would turn a real failure into a skip.
    try:
        backend = VLLMBackend(vllm_config)
    except Exception:
        pytest.skip("vLLM server not available")

    assert backend.client is not None
    assert backend.config == vllm_config


def test_vllm_is_available(vllm_backend):
    """Test availability check."""
    assert vllm_backend.is_available() is True


def test_vllm_generate(vllm_backend):
    """Test basic text generation."""
    prompt = "Hello, my name is"
    result = vllm_backend.generate(prompt, max_tokens=20)

    assert result is not None
    assert len(result.text) > 0
    assert result.tokens_used > 0
    assert result.finish_reason in ["stop", "length"]
    assert "model" in result.metadata


def test_vllm_generate_with_temperature(vllm_backend):
    """Test generation with custom temperature."""
    prompt = "The weather today is"
    result = vllm_backend.generate(prompt, temperature=0.1, max_tokens=20)

    assert result is not None
    assert len(result.text) > 0
    assert result.finish_reason in ["stop", "length"]


def test_vllm_generate_with_tools(vllm_backend):
    """Test tool calling generation."""
    prompt = "I need to move to coordinates (10, 20, 5)"

    tools = [
        {
            "name": "move_to",
            "description": "Move agent to target coordinates",
            "parameters": {
                "type": "object",
                "properties": {
                    "target": {
                        "type": "array",
                        "items": {"type": "number"},
                        "description": "Target [x, y, z] coordinates",
                    }
                },
                "required": ["target"],
            },
        }
    ]

    result = vllm_backend.generate_with_tools(prompt, tools)

    assert result is not None
    # Result should contain either a tool call or text response
    assert len(result.text) > 0 or "tool_call" in result.metadata


def test_vllm_generate_error_handling(vllm_backend):
    """Test error handling with invalid input."""
    # Empty prompt should still work
    result = vllm_backend.generate("", max_tokens=10)
    assert result is not None
    assert result.finish_reason in ["stop", "length", "error"]


def test_vllm_unload(vllm_config):
    """Test unloading backend."""
    # Skip only when the backend cannot be built. unload() and the assertions
    # run outside the try so a broken unload is reported as a failure.
    try:
        backend = VLLMBackend(vllm_config)
    except Exception:
        pytest.skip("vLLM server not available")

    backend.unload()
    assert backend.client is None
    assert backend.is_available() is False


def test_vllm_multiple_generations(vllm_backend):
    """Test multiple sequential generations."""
    prompts = ["Hello", "How are you?", "What is AI?"]

    for prompt in prompts:
        result = vllm_backend.generate(prompt, max_tokens=20)
        assert result is not None
        assert len(result.text) > 0


def test_vllm_generate_with_tools_fallback(vllm_backend):
    """Test fallback tool calling method."""
    prompt = "Pick up the sword item"

    tools = [
        {
            "name": "pickup_item",
            "description": "Pick up an item from the world",
            "parameters": {
                "type": "object",
                "properties": {
                    "item_name": {
                        "type": "string",
                        "description": "Name of the item to pick up",
                    }
                },
                "required": ["item_name"],
            },
        }
    ]

    # Test the fallback method directly
    result = vllm_backend._generate_with_tools_fallback(prompt, tools, temperature=0.7)

    assert result is not None
    assert len(result.text) > 0


@pytest.mark.parametrize("max_tokens", [10, 50, 100])
def test_vllm_different_max_tokens(vllm_backend, max_tokens):
    """Test generation with different max token limits."""
    prompt = "Once upon a time"
    result = vllm_backend.generate(prompt, max_tokens=max_tokens)

    assert result is not None
    assert result.tokens_used <= max_tokens * 1.5  # Some tolerance


@pytest.mark.parametrize("temperature", [0.1, 0.7, 1.0])
def test_vllm_different_temperatures(vllm_backend, temperature):
    """Test generation with different temperatures."""
    prompt = "The capital of France is"
    result = vllm_backend.generate(prompt, temperature=temperature, max_tokens=20)

    assert result is not None
    assert len(result.text) > 0


if __name__ == "__main__":
    pytest.main([__file__, "-v", "-s"])