20 changes: 15 additions & 5 deletions cloud/server.py
@@ -13,6 +13,7 @@
SerializeMessage, DeserializeMessage, CreateTimestamp,
CalculateAcceptanceRate
)
from common.config import get_cloud_model_config, get_network_config
from cloud.target_model import CloudTargetModel

# Configure logging
@@ -25,7 +26,16 @@ class CloudServer:
def __init__(self, host: str = "0.0.0.0", port: int = 8765):
self.m_host = host
self.m_port = port
self.m_target_model = CloudTargetModel()

cloud_config = get_cloud_model_config()
self.m_target_model = CloudTargetModel(model_name=cloud_config["model_name"])
self.m_max_cloud_tokens = cloud_config.get("max_tokens", 10)

# Get network configuration
network_config = get_network_config()
self.m_ping_interval = network_config.get("ping_interval", 300)
self.m_ping_timeout = network_config.get("ping_timeout", 300)

self.m_connected_clients = set()

async def Initialize(self) -> bool:
@@ -101,15 +111,15 @@ async def _ProcessSpeculativeRequest(self, websocket, data: dict):
request.prompt,
request.draft_tokens,
request.draft_probabilities, # Use the probabilities from draft model
max_new_tokens=min(10, request.max_new_tokens)
max_new_tokens=min(self.m_max_cloud_tokens, request.max_new_tokens)
)
g_logger.info(f"Used probabilistic verification with {len(request.draft_probabilities)} probabilities")
else:
# Fallback to legacy method for compatibility
verified_tokens, new_tokens, accepted_count, inference_time = self.m_target_model.VerifyAndComplete(
request.prompt,
request.draft_tokens,
max_new_tokens=min(10, request.max_new_tokens)
max_new_tokens=min(self.m_max_cloud_tokens, request.max_new_tokens)
)
g_logger.warning("Using legacy string-based verification (no probabilities provided)")

@@ -185,8 +195,8 @@ async def StartServer(self):
self.HandleClient,
self.m_host,
self.m_port,
ping_interval=300, # 5 minutes - match client settings
ping_timeout=300, # 5 minutes - match client settings
ping_interval=self.m_ping_interval,
ping_timeout=self.m_ping_timeout,
close_timeout=60 # 1 minute close timeout
)

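Note: below is a minimal sketch of how the reworked constructor might be driven end to end. The entry point is an assumption and is not part of this diff; only CloudServer, Initialize, and StartServer appear in the changes above.

```python
# Hypothetical entry point (not in this PR): run the cloud server with the
# config-driven defaults wired up in __init__ above.
import asyncio

from cloud.server import CloudServer


async def main() -> None:
    server = CloudServer(host="0.0.0.0", port=8765)
    if await server.Initialize():    # load the target model first
        await server.StartServer()   # start the websocket server


if __name__ == "__main__":
    asyncio.run(main())
```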
73 changes: 53 additions & 20 deletions common/config.py
@@ -2,6 +2,7 @@
Configuration loader for SpecECD project
"""
import toml
import os
from pathlib import Path
from typing import Dict, Any

@@ -47,29 +48,22 @@ def get_edge_model_config() -> Dict[str, Any]:
"""Get edge model configuration with device selection"""
config = load_config()

# Determine device to use
edge_device = config["devices"]["edge_device"]

# Check GPU configuration
# Read requested device and low-level backend settings
edge_device = config.get("devices", {}).get("edge_device", "cpu")
gpu_config = config.get("devices", {}).get("gpu", {})
if gpu_config.get("enabled", False) and edge_device == "cpu":
# GPU is enabled, try to use it
edge_device = "gpu"

# Check NPU configuration
npu_config = config.get("devices", {}).get("npu", {})
if npu_config.get("enabled", False) and edge_device == "cpu":
# NPU is enabled, try to use it
edge_device = "npu"


# Return raw configuration values so callers can decide how to apply them
return {
"model_name": config["models"]["edge_model"],
"model_name": config.get("models", {}).get("edge_model", "meta-llama/Llama-3.2-1B-Instruct"),
"device": edge_device,
"max_tokens": config["models"]["max_edge_tokens"],
"temperature": config["performance"]["temperature"],
"repetition_penalty": config["performance"]["repetition_penalty"],
"gpu_device_id": gpu_config.get("device_id", 0),
"npu_fallback": npu_config.get("fallback_to_cpu", True)
"max_tokens": config.get("models", {}).get("max_edge_tokens", 5),
"temperature": config.get("performance", {}).get("temperature", 0.7),
"repetition_penalty": config.get("performance", {}).get("repetition_penalty", 1.1),
"gpu_enabled": bool(gpu_config.get("enabled", False)),
"gpu_device_id": int(gpu_config.get("device_id", 0)),
"npu_enabled": bool(npu_config.get("enabled", False)),
"npu_fallback": bool(npu_config.get("fallback_to_cpu", True))
}

def get_cloud_model_config() -> Dict[str, Any]:
@@ -88,5 +82,44 @@ def get_network_config() -> Dict[str, Any]:
config = load_config()
return {
"host": config["network"]["default_host"],
"port": config["network"]["default_port"]
"port": config["network"]["default_port"],
"ping_interval": config["network"].get("ping_interval", 300),
"ping_timeout": config["network"].get("ping_timeout", 300)
}

def get_performance_config() -> Dict[str, Any]:
"""Get performance test configuration"""
config = load_config()
return {
"warmup_iterations": config["performance"].get("warmup_iterations", 2),
"test_iterations": config["performance"].get("test_iterations", 2),
"max_tokens_per_test": config["performance"].get("max_tokens_per_test", 50),
"temperature": config["performance"].get("temperature", 0.7),
"repetition_penalty": config["performance"].get("repetition_penalty", 1.1)
}

def get_fast_model_config() -> Dict[str, Any]:
"""Get fast model configuration for testing"""
config = load_config()
fast_config = config.get("models", {}).get("fast", {})
if not fast_config:
# Return None if no fast config exists
return None

return {
"cloud_model": fast_config.get("cloud_model"),
"expected_inference_time": fast_config.get("expected_inference_time")
}

def set_environment_variables():
"""Set environment variables from configuration"""
config = load_config()

# Set TORCH environment variables if specified
torch_cuda_dsa = config.get("TORCH_USE_CUDA_DSA")
if torch_cuda_dsa is not None:
os.environ["TORCH_USE_CUDA_DSA"] = str(torch_cuda_dsa)

cuda_launch_blocking = config.get("CUDA_LAUNCH_BLOCKING")
if cuda_launch_blocking is not None:
os.environ["CUDA_LAUNCH_BLOCKING"] = str(cuda_launch_blocking)
6 changes: 4 additions & 2 deletions edge/client.py
@@ -28,11 +28,13 @@ def __init__(self, cloud_host: str = "localhost", cloud_port: int = 8765, device
# Get device configuration
edge_config = get_edge_model_config()
self.m_device = device if device is not None else edge_config["device"]
self.m_gpu_device_id = edge_config.get("gpu_device_id", 0)

# Initialize draft model with configured device
# Initialize draft model with configured device and GPU index
self.m_draft_model = EdgeDraftModel(
model_name=edge_config["model_name"],
device=self.m_device
device=self.m_device,
gpu_device_id=self.m_gpu_device_id
)

self.m_websocket = None
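A short, illustrative usage of the constructor change above: the device still defaults to whatever get_edge_model_config() resolves, and an explicit argument overrides it. The calls below are assumptions for illustration; the client's connect loop is not shown in this diff.

```python
# Illustrative construction only (not from the PR).
from edge.client import EdgeClient

config_client = EdgeClient(cloud_host="localhost", cloud_port=8765)  # device taken from config.toml
gpu_client = EdgeClient(device="gpu")  # explicit override; gpu_device_id still comes from config
```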
20 changes: 13 additions & 7 deletions edge/draft_model.py
@@ -27,13 +27,14 @@
class EdgeDraftModel:
"""Draft model running on edge device with CPU, GPU, or NPU support"""

def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct", device: str = "cpu"):
def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct", device: str = "cpu", gpu_device_id: int = 0):
"""
Initialize draft model with device selection

Args:
model_name: HuggingFace model name
device: Device to use ("cpu", "gpu", or "npu")
gpu_device_id: GPU index to use when device is "gpu"
"""
self.m_model_name = model_name
self.m_device = device.lower()
@@ -42,6 +43,7 @@ def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct", device:
self.m_generation_config = None
self.m_npu_model = None
self.m_cuda_device = None
self.m_gpu_device_id = int(gpu_device_id)

g_logger.info(f"Initializing edge draft model: {model_name}")
g_logger.info(f"Target device: {self.m_device}")
@@ -53,10 +55,14 @@ def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct", device:
g_logger.info("Falling back to CPU device")
self.m_device = "cpu"
else:
# Set CUDA device
self.m_cuda_device = f"cuda:0" # Default to first GPU
g_logger.info(f"GPU available: {torch.cuda.get_device_name(0)}")
g_logger.info(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
# Set CUDA device using configured GPU index
self.m_cuda_device = f"cuda:{self.m_gpu_device_id}"
try:
g_logger.info(f"GPU available: {torch.cuda.get_device_name(self.m_gpu_device_id)}")
g_logger.info(f"GPU memory: {torch.cuda.get_device_properties(self.m_gpu_device_id).total_memory / 1e9:.1f} GB")
except Exception:
# Fall back to a generic log message if the device index lookup fails
g_logger.info(f"GPU available (index {self.m_gpu_device_id})")

elif self.m_device == "npu":
if not OPENVINO_AVAILABLE:
@@ -130,10 +136,10 @@ def _LoadGPUModel(self) -> bool:
self.m_model = AutoModelForCausalLM.from_pretrained(
self.m_model_name,
torch_dtype=torch.float16, # Use float16 for GPU memory efficiency
device_map="auto", # Automatically distribute across available GPUs
device_map=self.m_cuda_device, # Pin the model to the configured CUDA device
trust_remote_code=True,
low_cpu_mem_usage=True
).to(self.m_cuda_device)
)

# Configure generation parameters optimized for GPU
self.m_generation_config = GenerationConfig(
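Finally, a self-contained sketch of the device_map change in _LoadGPUModel: passing an explicit "cuda:<index>" string instead of "auto" places the whole model on that one GPU, which is why the trailing .to(...) call is dropped. This assumes the accelerate package is installed and the (gated) Llama weights are accessible; the model name and index are illustrative.

```python
# Minimal illustration of pinning a Hugging Face model to a single configured
# GPU via device_map (mirrors the _LoadGPUModel change above).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

gpu_device_id = 0                        # would come from get_edge_model_config()
cuda_device = f"cuda:{gpu_device_id}"

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",
    torch_dtype=torch.float16,           # half precision for GPU memory efficiency
    device_map=cuda_device,              # place the whole model on this one device
    low_cpu_mem_usage=True,
)
print(next(model.parameters()).device)   # e.g. cuda:0
```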