From 0c0c99bfda07416f6386eca6215908591e19f1b0 Mon Sep 17 00:00:00 2001 From: Stu Alexander Date: Fri, 27 Feb 2026 09:37:13 +0000 Subject: [PATCH] feat(nodes): add GPU detection and K8s node hardware info - Worker: _collect_gpu_info() queries nvidia-smi/rocm-smi for model, VRAM, CUDA/ROCm version. get_capabilities() includes gpu_count, gpu_devices, gpu_model, gpu_vram_mb. handle_info() now exposes capabilities for discovery. - Backend: GPUDevice model + UNodeCapabilities extended with GPU fields (all optional with defaults for backward compat). - K8s model: KubernetesNode gains gpu_capacity_nvidia/amd from nvidia.com/gpu and amd.com/gpu extended resources. - Frontend: KubernetesNode TS interface + kubernetesApi.listNodes(). ClusterNodeList component (lazy-loaded, expandable, GPU badges). UNode cards show GPU model, VRAM, count, CUDA/ROCm version. KubernetesClustersPage integrates ClusterNodeList per cluster card. All interactive elements have data-testid attributes. Co-Authored-By: Claude Sonnet 4.6 --- ushadow/backend/src/models/kubernetes.py | 4 + ushadow/backend/src/models/unode.py | 15 ++ .../src/services/kubernetes_manager.py | 8 + .../components/kubernetes/ClusterNodeList.tsx | 159 ++++++++++++++++++ ushadow/frontend/src/pages/ClusterPage.tsx | 48 ++++++ .../src/pages/KubernetesClustersPage.tsx | 8 + ushadow/frontend/src/services/api.ts | 27 +++ ushadow/manager/manager.py | 124 +++++++++++++- 8 files changed, 384 insertions(+), 9 deletions(-) create mode 100644 ushadow/frontend/src/components/kubernetes/ClusterNodeList.tsx diff --git a/ushadow/backend/src/models/kubernetes.py b/ushadow/backend/src/models/kubernetes.py index 3b377501..e1d599f8 100644 --- a/ushadow/backend/src/models/kubernetes.py +++ b/ushadow/backend/src/models/kubernetes.py @@ -114,6 +114,10 @@ class KubernetesNode(BaseModel): external_ip: Optional[str] = Field(None, description="External IP address") hostname: Optional[str] = Field(None, description="Hostname") + # GPU capacity (from extended resources) + gpu_capacity_nvidia: Optional[int] = Field(None, description="NVIDIA GPU count from nvidia.com/gpu") + gpu_capacity_amd: Optional[int] = Field(None, description="AMD GPU count from amd.com/gpu") + # Taints and labels taints: List[Dict[str, str]] = Field(default_factory=list, description="Node taints") labels: Dict[str, str] = Field(default_factory=dict, description="Node labels") diff --git a/ushadow/backend/src/models/unode.py b/ushadow/backend/src/models/unode.py index b6521d56..0037c5a2 100644 --- a/ushadow/backend/src/models/unode.py +++ b/ushadow/backend/src/models/unode.py @@ -36,6 +36,16 @@ class UNodeType(str, Enum): KUBERNETES = "kubernetes" # Kubernetes cluster +class GPUDevice(BaseModel): + """A single GPU device detected on the node.""" + vendor: str # "nvidia" or "amd" + index: int = 0 + model: str = "" + vram_mb: Optional[int] = None + cuda_version: Optional[str] = None + rocm_version: Optional[str] = None + + class UNodeCapabilities(BaseModel): """Capabilities of a u-node.""" can_run_docker: bool = True @@ -45,6 +55,11 @@ class UNodeCapabilities(BaseModel): available_memory_mb: int = 0 available_cpu_cores: float = 0 available_disk_gb: float = 0 + # GPU details (additive, backward-compatible defaults) + gpu_count: int = 0 + gpu_devices: List[GPUDevice] = Field(default_factory=list) + gpu_model: Optional[str] = None + gpu_vram_mb: Optional[int] = None class UNodeBase(BaseModel): diff --git a/ushadow/backend/src/services/kubernetes_manager.py b/ushadow/backend/src/services/kubernetes_manager.py index 5b3b1be7..be4e57f6 100644 --- a/ushadow/backend/src/services/kubernetes_manager.py +++ b/ushadow/backend/src/services/kubernetes_manager.py @@ -283,6 +283,12 @@ async def list_nodes(self, cluster_id: str) -> List["KubernetesNode"]: "effect": taint.effect }) + # Parse GPU extended resources + gpu_nvidia_raw = capacity.get("nvidia.com/gpu") + gpu_amd_raw = capacity.get("amd.com/gpu") + gpu_capacity_nvidia = int(gpu_nvidia_raw) if gpu_nvidia_raw else None + gpu_capacity_amd = int(gpu_amd_raw) if gpu_amd_raw else None + k8s_node = KubernetesNode( name=node.metadata.name, cluster_id=cluster_id, @@ -296,6 +302,8 @@ async def list_nodes(self, cluster_id: str) -> List["KubernetesNode"]: memory_capacity=capacity.get("memory"), cpu_allocatable=allocatable.get("cpu"), memory_allocatable=allocatable.get("memory"), + gpu_capacity_nvidia=gpu_capacity_nvidia, + gpu_capacity_amd=gpu_capacity_amd, roles=roles, internal_ip=internal_ip, external_ip=external_ip, diff --git a/ushadow/frontend/src/components/kubernetes/ClusterNodeList.tsx b/ushadow/frontend/src/components/kubernetes/ClusterNodeList.tsx new file mode 100644 index 00000000..126ac2f1 --- /dev/null +++ b/ushadow/frontend/src/components/kubernetes/ClusterNodeList.tsx @@ -0,0 +1,159 @@ +import { useState } from 'react' +import { ChevronDown, ChevronRight, Server, RefreshCw } from 'lucide-react' +import { kubernetesApi, KubernetesNode } from '../../services/api' + +interface ClusterNodeListProps { + clusterId: string + clusterStatus: string + nodeCount?: number +} + +function formatMemory(memStr?: string): string { + if (!memStr) return '?' + const ki = parseInt(memStr.replace('Ki', '')) + if (!isNaN(ki)) return `${(ki / 1024 / 1024).toFixed(1)} Gi` + if (memStr.endsWith('Gi')) return memStr + return memStr +} + +function GpuBadge({ node }: { node: KubernetesNode }) { + const nvidia = node.gpu_capacity_nvidia + const amd = node.gpu_capacity_amd + if (!nvidia && !amd) return null + + const parts: string[] = [] + if (nvidia) parts.push(`${nvidia}x NVIDIA`) + if (amd) parts.push(`${amd}x AMD`) + + return ( + + {parts.join(', ')} GPU + + ) +} + +function K8sNodeCard({ node }: { node: KubernetesNode }) { + return ( +
+
+
+ + + {node.name} + +
+
+ + + {node.status} + +
+
+ +
+ {node.roles.length > 0 && ( +
+ Roles: + {node.roles.join(', ')} +
+ )} +
+ CPU: + {node.cpu_capacity || '?'} +
+
+ Mem: + {formatMemory(node.memory_capacity)} +
+ {node.kubelet_version && ( +
+ Kubelet: + {node.kubelet_version} +
+ )} + {node.os_image && ( +
+ OS: + {node.os_image} +
+ )} +
+
+ ) +} + +export default function ClusterNodeList({ clusterId, clusterStatus, nodeCount }: ClusterNodeListProps) { + const [expanded, setExpanded] = useState(false) + const [nodes, setNodes] = useState([]) + const [loading, setLoading] = useState(false) + const [error, setError] = useState(null) + const [loaded, setLoaded] = useState(false) + + const handleToggle = async () => { + if (!expanded && !loaded) { + setLoading(true) + setError(null) + try { + const response = await kubernetesApi.listNodes(clusterId) + setNodes(response.data) + setLoaded(true) + } catch (err: any) { + setError(err.response?.data?.detail || 'Failed to load nodes') + } finally { + setLoading(false) + } + } + setExpanded(!expanded) + } + + if (clusterStatus !== 'connected') return null + + return ( +
+ + + {expanded && ( +
+ {error && ( +

{error}

+ )} + {!error && nodes.map((node) => ( + + ))} + {!error && loaded && nodes.length === 0 && ( +

No nodes found

+ )} +
+ )} +
+ ) +} diff --git a/ushadow/frontend/src/pages/ClusterPage.tsx b/ushadow/frontend/src/pages/ClusterPage.tsx index e3dc82cb..558ad19f 100644 --- a/ushadow/frontend/src/pages/ClusterPage.tsx +++ b/ushadow/frontend/src/pages/ClusterPage.tsx @@ -35,6 +35,17 @@ interface UNode { available_memory_mb: number available_cpu_cores: number available_disk_gb: number + gpu_count?: number + gpu_devices?: Array<{ + vendor: string + index: number + model: string + vram_mb?: number + cuda_version?: string + rocm_version?: string + }> + gpu_model?: string + gpu_vram_mb?: number } metadata?: { last_metrics?: { @@ -102,6 +113,9 @@ interface LeaderInfo { available_memory_mb: number available_cpu_cores: number available_disk_gb: number + gpu_count?: number + gpu_model?: string + gpu_vram_mb?: number } services?: string[] manager_version?: string @@ -793,6 +807,40 @@ export default function ClusterPage() { )} + {/* GPU Info */} + {node.capabilities?.can_run_gpu && ( +
+ + {(node.capabilities.gpu_count || 1) > 1 + ? `${node.capabilities.gpu_count}x GPU` + : 'GPU'} + + {node.capabilities.gpu_model && ( + + {node.capabilities.gpu_model} + + )} + {node.capabilities.gpu_vram_mb && ( + + {(node.capabilities.gpu_vram_mb / 1024).toFixed(0)} GB VRAM + + )} + {node.capabilities.gpu_devices?.[0]?.cuda_version && ( + + CUDA {node.capabilities.gpu_devices[0].cuda_version} + + )} + {node.capabilities.gpu_devices?.[0]?.rocm_version && ( + + ROCm {node.capabilities.gpu_devices[0].rocm_version} + + )} +
+ )} + {/* Deployed Services */} {getNodeDeployments(node.hostname).length > 0 && (
diff --git a/ushadow/frontend/src/pages/KubernetesClustersPage.tsx b/ushadow/frontend/src/pages/KubernetesClustersPage.tsx index 008cf391..fdf2460f 100644 --- a/ushadow/frontend/src/pages/KubernetesClustersPage.tsx +++ b/ushadow/frontend/src/pages/KubernetesClustersPage.tsx @@ -6,6 +6,7 @@ import Modal from '../components/Modal' import ConfirmDialog from '../components/ConfirmDialog' import DeployModal from '../components/DeployModal' import DNSManagementPanel from '../components/kubernetes/DNSManagementPanel' +import ClusterNodeList from '../components/kubernetes/ClusterNodeList' interface InfraService { found: boolean @@ -621,6 +622,13 @@ export default function KubernetesClustersPage() { )}
+ {/* Nodes */} + + {/* Actions */}
diff --git a/ushadow/frontend/src/services/api.ts b/ushadow/frontend/src/services/api.ts index a08b1901..d518bb55 100644 --- a/ushadow/frontend/src/services/api.ts +++ b/ushadow/frontend/src/services/api.ts @@ -662,6 +662,29 @@ export interface CertificateStatus { renewal_time?: string } +export interface KubernetesNode { + name: string + cluster_id: string + status: string + ready: boolean + kubelet_version?: string + os_image?: string + kernel_version?: string + container_runtime?: string + cpu_capacity?: string + memory_capacity?: string + cpu_allocatable?: string + memory_allocatable?: string + gpu_capacity_nvidia?: number + gpu_capacity_amd?: number + roles: string[] + internal_ip?: string + external_ip?: string + hostname?: string + taints: Array<{ key: string; value: string; effect: string }> + labels: Record +} + export const kubernetesApi = { addCluster: (data: { name: string; kubeconfig: string; context?: string; namespace?: string; labels?: Record }) => api.post('/api/kubernetes', data), @@ -745,6 +768,10 @@ export const kubernetesApi = { api.get<{ certificates: CertificateStatus[]; total: number }>( `/api/kubernetes/${clusterId}/dns/certificates${namespace ? `?namespace=${namespace}` : ''}` ), + + // Node operations + listNodes: (clusterId: string) => + api.get(`/api/kubernetes/${clusterId}/nodes`), } // Service Definition and Deployment types diff --git a/ushadow/manager/manager.py b/ushadow/manager/manager.py index 43df3c20..db7cf061 100644 --- a/ushadow/manager/manager.py +++ b/ushadow/manager/manager.py @@ -172,6 +172,7 @@ async def handle_info(self, request: web.Request) -> web.Response: "manager_version": MANAGER_VERSION, "platform": platform.system().lower(), "docker_available": self.docker_client is not None, + "capabilities": self.get_capabilities(), }) async def handle_upgrade(self, request: web.Request) -> web.Response: @@ -600,6 +601,109 @@ def get_running_services(self) -> List[str]: return services + def _collect_gpu_info(self) -> Dict[str, Any]: + """Collect detailed GPU information from NVIDIA and AMD tools.""" + import subprocess + + devices: List[Dict[str, Any]] = [] + cuda_version: Optional[str] = None + + # --- NVIDIA GPUs --- + try: + result = subprocess.run( + ["nvidia-smi", "--query-gpu=index,name,memory.total", "--format=csv,noheader,nounits"], + capture_output=True, text=True, timeout=10, + ) + if result.returncode == 0: + for line in result.stdout.strip().splitlines(): + parts = [p.strip() for p in line.split(",")] + if len(parts) >= 3: + devices.append({ + "vendor": "nvidia", + "index": int(parts[0]), + "model": parts[1], + "vram_mb": int(float(parts[2])), + }) + + # Get CUDA toolkit version via nvcc + try: + nvcc = subprocess.run( + ["nvcc", "--version"], + capture_output=True, text=True, timeout=5, + ) + if nvcc.returncode == 0: + import re + m = re.search(r"release (\d+\.\d+)", nvcc.stdout) + if m: + cuda_version = m.group(1) + except Exception: + pass + + # Fallback: driver version from nvidia-smi + if not cuda_version: + try: + drv = subprocess.run( + ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"], + capture_output=True, text=True, timeout=5, + ) + if drv.returncode == 0 and drv.stdout.strip(): + cuda_version = f"driver-{drv.stdout.strip().splitlines()[0].strip()}" + except Exception: + pass + + # Attach cuda_version to each nvidia device + for dev in devices: + if dev["vendor"] == "nvidia" and cuda_version: + dev["cuda_version"] = cuda_version + except Exception: + pass + + # --- AMD GPUs --- + try: + result = subprocess.run( + ["rocm-smi", "--showproductname", "--csv"], + capture_output=True, text=True, timeout=10, + ) + if result.returncode == 0: + rocm_version: Optional[str] = None + # Try to get ROCm version + try: + ri = subprocess.run( + ["rocminfo"], + capture_output=True, text=True, timeout=10, + ) + if ri.returncode == 0: + import re + m = re.search(r"ROCm Runtime Version:\s+(\S+)", ri.stdout) + if m: + rocm_version = m.group(1) + except Exception: + pass + + lines = result.stdout.strip().splitlines() + for i, line in enumerate(lines): + if i == 0: # skip header + continue + parts = [p.strip() for p in line.split(",")] + if parts: + dev = { + "vendor": "amd", + "index": i - 1, + "model": parts[0] if parts else "Unknown AMD GPU", + } + if rocm_version: + dev["rocm_version"] = rocm_version + devices.append(dev) + except Exception: + pass + + return { + "gpu_count": len(devices), + "gpu_devices": devices, + "gpu_model": devices[0]["model"] if devices else None, + "gpu_vram_mb": devices[0].get("vram_mb") if devices else None, + } + def get_capabilities(self) -> Dict[str, Any]: """Get node capabilities.""" capabilities = { @@ -609,6 +713,10 @@ def get_capabilities(self) -> Dict[str, Any]: "available_memory_mb": 0, "available_cpu_cores": 0, "available_disk_gb": 0, + "gpu_count": 0, + "gpu_devices": [], + "gpu_model": None, + "gpu_vram_mb": None, } try: @@ -621,16 +729,14 @@ def get_capabilities(self) -> Dict[str, Any]: except ImportError: pass - # Check for GPU + # Collect GPU info try: - import subprocess - result = subprocess.run( - ["nvidia-smi", "-L"], - capture_output=True, - timeout=5 - ) - if result.returncode == 0: - capabilities["can_run_gpu"] = True + gpu_info = self._collect_gpu_info() + capabilities["can_run_gpu"] = gpu_info["gpu_count"] > 0 + capabilities["gpu_count"] = gpu_info["gpu_count"] + capabilities["gpu_devices"] = gpu_info["gpu_devices"] + capabilities["gpu_model"] = gpu_info["gpu_model"] + capabilities["gpu_vram_mb"] = gpu_info["gpu_vram_mb"] except Exception: pass