From e57d32deb36f2a4d81c33c77987366b0c1f4dd06 Mon Sep 17 00:00:00 2001
From: Shrikara A <97931056+sudokara@users.noreply.github.com>
Date: Wed, 17 Sep 2025 09:46:47 +0530
Subject: [PATCH 1/4] [fix] Read cloud model name from config.toml

---
 cloud/server.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cloud/server.py b/cloud/server.py
index ab250c8..a6635cf 100644
--- a/cloud/server.py
+++ b/cloud/server.py
@@ -13,6 +13,7 @@
     SerializeMessage, DeserializeMessage, CreateTimestamp, CalculateAcceptanceRate
 )
 
+from common.config import get_cloud_model_config
 from cloud.target_model import CloudTargetModel
 
 # Configure logging
@@ -25,7 +26,9 @@ class CloudServer:
     def __init__(self, host: str = "0.0.0.0", port: int = 8765):
         self.m_host = host
         self.m_port = port
-        self.m_target_model = CloudTargetModel()
+
+        cloud_config = get_cloud_model_config()
+        self.m_target_model = CloudTargetModel(model_name=cloud_config["model_name"])
         self.m_connected_clients = set()
 
     async def Initialize(self) -> bool:
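Note: a minimal smoke test for this patch, assuming only that
get_cloud_model_config() returns a dict with a "model_name" key, which is
exactly what CloudServer.__init__ now relies on:

    # Hypothetical check, run from the repo root: the cloud model name
    # should now come from config.toml rather than a hard-coded default.
    from common.config import get_cloud_model_config

    cfg = get_cloud_model_config()
    assert "model_name" in cfg
    print(f"cloud target model: {cfg['model_name']}")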
From 3c3633b943694f4de272e2b16cfc14cb999f34f6 Mon Sep 17 00:00:00 2001
From: Shrikara Arun
Date: Wed, 17 Sep 2025 09:04:30 +0000
Subject: [PATCH 2/4] [wip][fix] read edge config params

---
 common/config.py    | 31 ++++++++++++-------------------
 edge/client.py      |  6 ++++--
 edge/draft_model.py | 16 +++++++++++-----
 3 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/common/config.py b/common/config.py
index 66f2d7f..4573e32 100644
--- a/common/config.py
+++ b/common/config.py
@@ -47,29 +47,22 @@ def get_edge_model_config() -> Dict[str, Any]:
     """Get edge model configuration with device selection"""
     config = load_config()
 
-    # Determine device to use
-    edge_device = config["devices"]["edge_device"]
-
-    # Check GPU configuration
+    # Read requested device and low-level backend settings
+    edge_device = config.get("devices", {}).get("edge_device", "cpu")
     gpu_config = config.get("devices", {}).get("gpu", {})
-    if gpu_config.get("enabled", False) and edge_device == "cpu":
-        # GPU is enabled, try to use it
-        edge_device = "gpu"
-
-    # Check NPU configuration
     npu_config = config.get("devices", {}).get("npu", {})
-    if npu_config.get("enabled", False) and edge_device == "cpu":
-        # NPU is enabled, try to use it
-        edge_device = "npu"
-
+
+    # Return raw configuration values so callers can decide how to apply them
    return {
-        "model_name": config["models"]["edge_model"],
+        "model_name": config.get("models", {}).get("edge_model", "meta-llama/Llama-3.2-1B-Instruct"),
         "device": edge_device,
-        "max_tokens": config["models"]["max_edge_tokens"],
-        "temperature": config["performance"]["temperature"],
-        "repetition_penalty": config["performance"]["repetition_penalty"],
-        "gpu_device_id": gpu_config.get("device_id", 0),
-        "npu_fallback": npu_config.get("fallback_to_cpu", True)
+        "max_tokens": config.get("models", {}).get("max_edge_tokens", 5),
+        "temperature": config.get("performance", {}).get("temperature", 0.7),
+        "repetition_penalty": config.get("performance", {}).get("repetition_penalty", 1.1),
+        "gpu_enabled": bool(gpu_config.get("enabled", False)),
+        "gpu_device_id": int(gpu_config.get("device_id", 0)),
+        "npu_enabled": bool(npu_config.get("enabled", False)),
+        "npu_fallback": bool(npu_config.get("fallback_to_cpu", True))
     }
 
 def get_cloud_model_config() -> Dict[str, Any]:
diff --git a/edge/client.py b/edge/client.py
index af02bee..369112b 100644
--- a/edge/client.py
+++ b/edge/client.py
@@ -28,11 +28,13 @@ def __init__(self, cloud_host: str = "localhost", cloud_port: int = 8765, device
         # Get device configuration
         edge_config = get_edge_model_config()
         self.m_device = device if device is not None else edge_config["device"]
+        self.m_gpu_device_id = edge_config.get("gpu_device_id", 0)
 
-        # Initialize draft model with configured device
+        # Initialize draft model with configured device and GPU index
         self.m_draft_model = EdgeDraftModel(
             model_name=edge_config["model_name"],
-            device=self.m_device
+            device=self.m_device,
+            gpu_device_id=self.m_gpu_device_id
         )
 
         self.m_websocket = None
diff --git a/edge/draft_model.py b/edge/draft_model.py
index 1f1ec01..cbf6283 100644
--- a/edge/draft_model.py
+++ b/edge/draft_model.py
@@ -27,13 +27,14 @@ class EdgeDraftModel:
 
     """Draft model running on edge device with CPU, GPU, or NPU support"""
 
-    def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct", device: str = "cpu"):
+    def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct", device: str = "cpu", gpu_device_id: int = 0):
         """
         Initialize draft model with device selection
 
         Args:
             model_name: HuggingFace model name
             device: Device to use ("cpu", "gpu", or "npu")
+            gpu_device_id: GPU index to use when device is "gpu"
         """
         self.m_model_name = model_name
         self.m_device = device.lower()
@@ -42,6 +43,7 @@ def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct", device:
         self.m_generation_config = None
         self.m_npu_model = None
         self.m_cuda_device = None
+        self.m_gpu_device_id = int(gpu_device_id)
 
         g_logger.info(f"Initializing edge draft model: {model_name}")
         g_logger.info(f"Target device: {self.m_device}")
@@ -53,10 +55,14 @@ def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct", device:
                 g_logger.info("Falling back to CPU device")
                 self.m_device = "cpu"
             else:
-                # Set CUDA device
-                self.m_cuda_device = f"cuda:0"  # Default to first GPU
-                g_logger.info(f"GPU available: {torch.cuda.get_device_name(0)}")
-                g_logger.info(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
+                # Set CUDA device using configured GPU index
+                self.m_cuda_device = f"cuda:{self.m_gpu_device_id}"
+                try:
+                    g_logger.info(f"GPU available: {torch.cuda.get_device_name(self.m_gpu_device_id)}")
+                    g_logger.info(f"GPU memory: {torch.cuda.get_device_properties(self.m_gpu_device_id).total_memory / 1e9:.1f} GB")
+                except Exception:
+                    # Device property lookup failed; log just the configured index
+                    g_logger.info(f"GPU available (index {self.m_gpu_device_id})")
 
         elif self.m_device == "npu":
             if not OPENVINO_AVAILABLE:
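Note: after this patch get_edge_model_config() no longer promotes "cpu" to
"gpu" or "npu" on its own, so a caller that wants the old behaviour must
apply the returned flags itself. A sketch of that caller-side policy, with
the selection order carried over from the removed loader code:

    from common.config import get_edge_model_config

    cfg = get_edge_model_config()
    device = cfg["device"]
    if device == "cpu" and cfg["gpu_enabled"]:
        device = "gpu"   # GPU enabled in config, prefer it over CPU
    elif device == "cpu" and cfg["npu_enabled"]:
        device = "npu"   # NPU enabled and GPU not taken, prefer it over CPU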
From 414a62527ccc5357aa3fb18e6400d90112fe1496 Mon Sep 17 00:00:00 2001
From: Shrikara Arun
Date: Thu, 18 Sep 2025 04:13:17 +0000
Subject: [PATCH 3/4] [fix][wip] more config reading

---
 cloud/server.py  | 17 ++++++++++++-----
 common/config.py | 42 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/cloud/server.py b/cloud/server.py
index a6635cf..c5dbd9a 100644
--- a/cloud/server.py
+++ b/cloud/server.py
@@ -13,7 +13,7 @@
     SerializeMessage, DeserializeMessage, CreateTimestamp, CalculateAcceptanceRate
 )
 
-from common.config import get_cloud_model_config
+from common.config import get_cloud_model_config, get_network_config
 from cloud.target_model import CloudTargetModel
 
 # Configure logging
@@ -29,6 +29,13 @@ def __init__(self, host: str = "0.0.0.0", port: int = 8765):
 
         cloud_config = get_cloud_model_config()
         self.m_target_model = CloudTargetModel(model_name=cloud_config["model_name"])
+        self.m_max_cloud_tokens = cloud_config.get("max_tokens", 10)
+
+        # Get network configuration
+        network_config = get_network_config()
+        self.m_ping_interval = network_config.get("ping_interval", 300)
+        self.m_ping_timeout = network_config.get("ping_timeout", 300)
+
         self.m_connected_clients = set()
 
     async def Initialize(self) -> bool:
@@ -104,7 +111,7 @@ async def _ProcessSpeculativeRequest(self, websocket, data: dict):
                 request.prompt,
                 request.draft_tokens,
                 request.draft_probabilities,  # Use the probabilities from draft model
-                max_new_tokens=min(10, request.max_new_tokens)
+                max_new_tokens=min(self.m_max_cloud_tokens, request.max_new_tokens)
             )
             g_logger.info(f"Used probabilistic verification with {len(request.draft_probabilities)} probabilities")
         else:
@@ -112,7 +119,7 @@ async def _ProcessSpeculativeRequest(self, websocket, data: dict):
             verified_tokens, new_tokens, accepted_count, inference_time = self.m_target_model.VerifyAndComplete(
                 request.prompt,
                 request.draft_tokens,
-                max_new_tokens=min(10, request.max_new_tokens)
+                max_new_tokens=min(self.m_max_cloud_tokens, request.max_new_tokens)
             )
             g_logger.warning("Using legacy string-based verification (no probabilities provided)")
 
@@ -188,8 +195,8 @@ async def StartServer(self):
             self.HandleClient,
             self.m_host,
             self.m_port,
-            ping_interval=300,  # 5 minutes - match client settings
-            ping_timeout=300,   # 5 minutes - match client settings
+            ping_interval=self.m_ping_interval,
+            ping_timeout=self.m_ping_timeout,
             close_timeout=60  # 1 minute close timeout
         )
 
diff --git a/common/config.py b/common/config.py
index 4573e32..446d88d 100644
--- a/common/config.py
+++ b/common/config.py
@@ -2,6 +2,7 @@
 Configuration loader for SpecECD project
 """
 import toml
+import os
 from pathlib import Path
 from typing import Dict, Any
 
@@ -81,5 +82,44 @@ def get_network_config() -> Dict[str, Any]:
     config = load_config()
     return {
         "host": config["network"]["default_host"],
-        "port": config["network"]["default_port"]
+        "port": config["network"]["default_port"],
+        "ping_interval": config["network"].get("ping_interval", 300),
+        "ping_timeout": config["network"].get("ping_timeout", 300)
     }
+
+def get_performance_config() -> Dict[str, Any]:
+    """Get performance test configuration"""
+    config = load_config()
+    return {
+        "warmup_iterations": config["performance"].get("warmup_iterations", 2),
+        "test_iterations": config["performance"].get("test_iterations", 2),
+        "max_tokens_per_test": config["performance"].get("max_tokens_per_test", 50),
+        "temperature": config["performance"].get("temperature", 0.7),
+        "repetition_penalty": config["performance"].get("repetition_penalty", 1.1)
+    }
+
+def get_fast_model_config() -> Dict[str, Any]:
+    """Get fast model configuration for testing"""
+    config = load_config()
+    fast_config = config.get("models", {}).get("fast", {})
+    if not fast_config:
+        # Return None if no fast config exists
+        return None
+
+    return {
+        "cloud_model": fast_config.get("cloud_model"),
+        "expected_inference_time": fast_config.get("expected_inference_time")
+    }
+
+def set_environment_variables():
+    """Set environment variables from configuration"""
+    config = load_config()
+
+    # Set TORCH environment variables if specified
+    torch_cuda_dsa = config.get("TORCH_USE_CUDA_DSA")
+    if torch_cuda_dsa is not None:
+        os.environ["TORCH_USE_CUDA_DSA"] = str(torch_cuda_dsa)
+
+    cuda_launch_blocking = config.get("CUDA_LAUNCH_BLOCKING")
+    if cuda_launch_blocking is not None:
+        os.environ["CUDA_LAUNCH_BLOCKING"] = str(cuda_launch_blocking)
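Note: TORCH_USE_CUDA_DSA and CUDA_LAUNCH_BLOCKING are read early during
torch/CUDA initialisation, so set_environment_variables() only takes effect
if it runs before torch is imported. A usage sketch; the entry-point
placement is an assumption:

    # Apply env vars from config.toml before anything touches CUDA.
    from common.config import set_environment_variables
    set_environment_variables()

    import torch  # deliberately imported after the env vars are set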
From 0cc69ecb424b1c3870241d6c87a636a29b8f614b Mon Sep 17 00:00:00 2001
From: Shrikara Arun
Date: Thu, 18 Sep 2025 04:26:55 +0000
Subject: [PATCH 4/4] [fix] initialize draft model to correct gpu

---
 edge/draft_model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/edge/draft_model.py b/edge/draft_model.py
index cbf6283..b3a4ba0 100644
--- a/edge/draft_model.py
+++ b/edge/draft_model.py
@@ -136,10 +136,10 @@ def _LoadGPUModel(self) -> bool:
             self.m_model = AutoModelForCausalLM.from_pretrained(
                 self.m_model_name,
                 torch_dtype=torch.float16,  # Use float16 for GPU memory efficiency
-                device_map="auto",  # Automatically distribute across available GPUs
+                device_map=self.m_cuda_device,  # Pin all weights to the configured GPU
                 trust_remote_code=True,
                 low_cpu_mem_usage=True
-            ).to(self.m_cuda_device)
+            )
 
             # Configure generation parameters optimized for GPU
             self.m_generation_config = GenerationConfig(
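Note: passing a single device string as device_map pins every weight to that
device instead of sharding across GPUs, which is what makes the configured
gpu_device_id take effect. A standalone sketch of the resulting load call,
with the index hard-coded for illustration:

    import torch
    from transformers import AutoModelForCausalLM

    # "cuda:1" stands in for f"cuda:{gpu_device_id}" built from
    # [devices.gpu] device_id in config.toml.
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.2-1B-Instruct",
        torch_dtype=torch.float16,
        device_map="cuda:1",
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )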