20 changes: 15 additions & 5 deletions cloud/server.py
@@ -13,6 +13,7 @@
SerializeMessage, DeserializeMessage, CreateTimestamp,
CalculateAcceptanceRate
)
from common.config import get_cloud_model_config, get_network_config
from cloud.target_model import CloudTargetModel

# Configure logging
@@ -25,7 +26,16 @@ class CloudServer:
def __init__(self, host: str = "0.0.0.0", port: int = 8765):
self.m_host = host
self.m_port = port
self.m_target_model = CloudTargetModel()

cloud_config = get_cloud_model_config()
self.m_target_model = CloudTargetModel(model_name=cloud_config["model_name"])
self.m_max_cloud_tokens = cloud_config.get("max_tokens", 10)

# Get network configuration
network_config = get_network_config()
self.m_ping_interval = network_config.get("ping_interval", 300)
self.m_ping_timeout = network_config.get("ping_timeout", 300)

self.m_connected_clients = set()

async def Initialize(self) -> bool:
@@ -101,15 +111,15 @@ async def _ProcessSpeculativeRequest(self, websocket, data: dict):
request.prompt,
request.draft_tokens,
request.draft_probabilities, # Use the probabilities from draft model
max_new_tokens=min(10, request.max_new_tokens)
max_new_tokens=min(self.m_max_cloud_tokens, request.max_new_tokens)
)
g_logger.info(f"Used probabilistic verification with {len(request.draft_probabilities)} probabilities")
else:
# Fallback to legacy method for compatibility
verified_tokens, new_tokens, accepted_count, inference_time = self.m_target_model.VerifyAndComplete(
request.prompt,
request.draft_tokens,
max_new_tokens=min(10, request.max_new_tokens)
max_new_tokens=min(self.m_max_cloud_tokens, request.max_new_tokens)
)
g_logger.warning("Using legacy string-based verification (no probabilities provided)")

@@ -185,8 +195,8 @@ async def StartServer(self):
self.HandleClient,
self.m_host,
self.m_port,
ping_interval=300, # 5 minutes - match client settings
ping_timeout=300, # 5 minutes - match client settings
ping_interval=self.m_ping_interval,
ping_timeout=self.m_ping_timeout,
close_timeout=60 # 1 minute close timeout
)

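Note: below is a minimal sketch of how the reworked constructor might be driven end to end. The entry point is an assumption and is not part of this diff; only CloudServer, Initialize, and StartServer appear in the changes above.

```python
# Hypothetical entry point (not in this PR): run the cloud server with the
# config-driven defaults wired up in __init__ above.
import asyncio

from cloud.server import CloudServer


async def main() -> None:
    server = CloudServer(host="0.0.0.0", port=8765)
    if await server.Initialize():    # load the target model first
        await server.StartServer()   # start the websocket server


if __name__ == "__main__":
    asyncio.run(main())
```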
73 changes: 53 additions & 20 deletions common/config.py
@@ -2,6 +2,7 @@
Configuration loader for SpecECD project
"""
import toml
import os
from pathlib import Path
from typing import Dict, Any

@@ -47,29 +48,22 @@ def get_edge_model_config() -> Dict[str, Any]:
"""Get edge model configuration with device selection"""
config = load_config()

# Determine device to use
edge_device = config["devices"]["edge_device"]

# Check GPU configuration
# Read requested device and low-level backend settings
edge_device = config.get("devices", {}).get("edge_device", "cpu")
gpu_config = config.get("devices", {}).get("gpu", {})
if gpu_config.get("enabled", False) and edge_device == "cpu":
# GPU is enabled, try to use it
edge_device = "gpu"

# Check NPU configuration
npu_config = config.get("devices", {}).get("npu", {})
if npu_config.get("enabled", False) and edge_device == "cpu":
# NPU is enabled, try to use it
edge_device = "npu"


# Return raw configuration values so callers can decide how to apply them
return {
"model_name": config["models"]["edge_model"],
"model_name": config.get("models", {}).get("edge_model", "meta-llama/Llama-3.2-1B-Instruct"),
"device": edge_device,
"max_tokens": config["models"]["max_edge_tokens"],
"temperature": config["performance"]["temperature"],
"repetition_penalty": config["performance"]["repetition_penalty"],
"gpu_device_id": gpu_config.get("device_id", 0),
"npu_fallback": npu_config.get("fallback_to_cpu", True)
"max_tokens": config.get("models", {}).get("max_edge_tokens", 5),
"temperature": config.get("performance", {}).get("temperature", 0.7),
"repetition_penalty": config.get("performance", {}).get("repetition_penalty", 1.1),
"gpu_enabled": bool(gpu_config.get("enabled", False)),
"gpu_device_id": int(gpu_config.get("device_id", 0)),
"npu_enabled": bool(npu_config.get("enabled", False)),
"npu_fallback": bool(npu_config.get("fallback_to_cpu", True))
}

def get_cloud_model_config() -> Dict[str, Any]:
@@ -88,5 +82,44 @@ def get_network_config() -> Dict[str, Any]:
config = load_config()
return {
"host": config["network"]["default_host"],
"port": config["network"]["default_port"]
"port": config["network"]["default_port"],
"ping_interval": config["network"].get("ping_interval", 300),
"ping_timeout": config["network"].get("ping_timeout", 300)
}

def get_performance_config() -> Dict[str, Any]:
"""Get performance test configuration"""
config = load_config()
return {
"warmup_iterations": config["performance"].get("warmup_iterations", 2),
"test_iterations": config["performance"].get("test_iterations", 2),
"max_tokens_per_test": config["performance"].get("max_tokens_per_test", 50),
"temperature": config["performance"].get("temperature", 0.7),
"repetition_penalty": config["performance"].get("repetition_penalty", 1.1)
}

def get_fast_model_config() -> Dict[str, Any]:
"""Get fast model configuration for testing"""
config = load_config()
fast_config = config.get("models", {}).get("fast", {})
if not fast_config:
# Return None if no fast config exists
return None

return {
"cloud_model": fast_config.get("cloud_model"),
"expected_inference_time": fast_config.get("expected_inference_time")
}

def set_environment_variables():
"""Set environment variables from configuration"""
config = load_config()

# Set TORCH environment variables if specified
torch_cuda_dsa = config.get("TORCH_USE_CUDA_DSA")
if torch_cuda_dsa is not None:
os.environ["TORCH_USE_CUDA_DSA"] = str(torch_cuda_dsa)

cuda_launch_blocking = config.get("CUDA_LAUNCH_BLOCKING")
if cuda_launch_blocking is not None:
os.environ["CUDA_LAUNCH_BLOCKING"] = str(cuda_launch_blocking)
6 changes: 4 additions & 2 deletions edge/client.py
@@ -28,11 +28,13 @@ def __init__(self, cloud_host: str = "localhost", cloud_port: int = 8765, device
# Get device configuration
edge_config = get_edge_model_config()
self.m_device = device if device is not None else edge_config["device"]
self.m_gpu_device_id = edge_config.get("gpu_device_id", 0)

# Initialize draft model with configured device
# Initialize draft model with configured device and GPU index
self.m_draft_model = EdgeDraftModel(
model_name=edge_config["model_name"],
device=self.m_device
device=self.m_device,
gpu_device_id=self.m_gpu_device_id
)

self.m_websocket = None
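A short, illustrative usage of the constructor change above: the device still defaults to whatever get_edge_model_config() resolves, and an explicit argument overrides it. The calls below are assumptions for illustration; the client's connect loop is not shown in this diff.

```python
# Illustrative construction only (not from the PR).
from edge.client import EdgeClient

config_client = EdgeClient(cloud_host="localhost", cloud_port=8765)  # device taken from config.toml
gpu_client = EdgeClient(device="gpu")  # explicit override; gpu_device_id still comes from config
```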
20 changes: 13 additions & 7 deletions edge/draft_model.py
@@ -27,13 +27,14 @@
class EdgeDraftModel:
"""Draft model running on edge device with CPU, GPU, or NPU support"""

def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct", device: str = "cpu"):
def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct", device: str = "cpu", gpu_device_id: int = 0):
"""
Initialize draft model with device selection

Args:
model_name: HuggingFace model name
device: Device to use ("cpu", "gpu", or "npu")
gpu_device_id: GPU index to use when device is "gpu"
"""
self.m_model_name = model_name
self.m_device = device.lower()
@@ -42,6 +43,7 @@ def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct", device:
self.m_generation_config = None
self.m_npu_model = None
self.m_cuda_device = None
self.m_gpu_device_id = int(gpu_device_id)

g_logger.info(f"Initializing edge draft model: {model_name}")
g_logger.info(f"Target device: {self.m_device}")
@@ -53,10 +55,14 @@ def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct", device:
g_logger.info("Falling back to CPU device")
self.m_device = "cpu"
else:
# Set CUDA device
self.m_cuda_device = f"cuda:0" # Default to first GPU
g_logger.info(f"GPU available: {torch.cuda.get_device_name(0)}")
g_logger.info(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
# Set CUDA device using configured GPU index
self.m_cuda_device = f"cuda:{self.m_gpu_device_id}"
try:
g_logger.info(f"GPU available: {torch.cuda.get_device_name(self.m_gpu_device_id)}")
g_logger.info(f"GPU memory: {torch.cuda.get_device_properties(self.m_gpu_device_id).total_memory / 1e9:.1f} GB")
except Exception:
# Fall back to a generic log message if the device index lookup fails
g_logger.info(f"GPU available (index {self.m_gpu_device_id})")

elif self.m_device == "npu":
if not OPENVINO_AVAILABLE:
@@ -130,10 +136,10 @@ def _LoadGPUModel(self) -> bool:
self.m_model = AutoModelForCausalLM.from_pretrained(
self.m_model_name,
torch_dtype=torch.float16, # Use float16 for GPU memory efficiency
device_map="auto", # Automatically distribute across available GPUs
device_map=self.m_cuda_device, # Pin the model to the configured CUDA device
trust_remote_code=True,
low_cpu_mem_usage=True
).to(self.m_cuda_device)
)

# Configure generation parameters optimized for GPU
self.m_generation_config = GenerationConfig(
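Finally, a self-contained sketch of the device_map change in _LoadGPUModel: passing an explicit "cuda:<index>" string instead of "auto" places the whole model on that one GPU, which is why the trailing .to(...) call is dropped. This assumes the accelerate package is installed and the (gated) Llama weights are accessible; the model name and index are illustrative.

```python
# Minimal illustration of pinning a Hugging Face model to a single configured
# GPU via device_map (mirrors the _LoadGPUModel change above).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

gpu_device_id = 0                        # would come from get_edge_model_config()
cuda_device = f"cuda:{gpu_device_id}"

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",
    torch_dtype=torch.float16,           # half precision for GPU memory efficiency
    device_map=cuda_device,              # place the whole model on this one device
    low_cpu_mem_usage=True,
)
print(next(model.parameters()).device)   # e.g. cuda:0
```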