Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions ushadow/backend/src/models/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,10 @@ class KubernetesNode(BaseModel):
external_ip: Optional[str] = Field(None, description="External IP address")
hostname: Optional[str] = Field(None, description="Hostname")

# GPU capacity (from extended resources)
gpu_capacity_nvidia: Optional[int] = Field(None, description="NVIDIA GPU count from nvidia.com/gpu")
gpu_capacity_amd: Optional[int] = Field(None, description="AMD GPU count from amd.com/gpu")

# Taints and labels
taints: List[Dict[str, str]] = Field(default_factory=list, description="Node taints")
labels: Dict[str, str] = Field(default_factory=dict, description="Node labels")
Expand Down
15 changes: 15 additions & 0 deletions ushadow/backend/src/models/unode.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@ class UNodeType(str, Enum):
KUBERNETES = "kubernetes" # Kubernetes cluster


class GPUDevice(BaseModel):
    """A single GPU device detected on the node.

    Only ``vendor`` is required; every other field has a default so a
    device can be described with partial information.
    """
    vendor: str  # "nvidia" or "amd"
    # Defaults to 0. Presumably the device ordinal on the node -- TODO confirm.
    index: int = 0
    # Defaults to an empty string when no model name is supplied.
    model: str = ""
    # None when not supplied. Presumably megabytes, judging by the name -- TODO confirm unit.
    vram_mb: Optional[int] = None
    # None when not supplied. NOTE(review): presumably only set for NVIDIA devices -- confirm.
    cuda_version: Optional[str] = None
    # None when not supplied. NOTE(review): presumably only set for AMD devices -- confirm.
    rocm_version: Optional[str] = None


class UNodeCapabilities(BaseModel):
"""Capabilities of a u-node."""
can_run_docker: bool = True
Expand All @@ -45,6 +55,11 @@ class UNodeCapabilities(BaseModel):
available_memory_mb: int = 0
available_cpu_cores: float = 0
available_disk_gb: float = 0
# GPU details (additive, backward-compatible defaults)
gpu_count: int = 0
gpu_devices: List[GPUDevice] = Field(default_factory=list)
gpu_model: Optional[str] = None
gpu_vram_mb: Optional[int] = None


class UNodeBase(BaseModel):
Expand Down
8 changes: 8 additions & 0 deletions ushadow/backend/src/services/kubernetes_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,12 @@ async def list_nodes(self, cluster_id: str) -> List["KubernetesNode"]:
"effect": taint.effect
})

# Parse GPU extended resources
gpu_nvidia_raw = capacity.get("nvidia.com/gpu")
gpu_amd_raw = capacity.get("amd.com/gpu")
gpu_capacity_nvidia = int(gpu_nvidia_raw) if gpu_nvidia_raw else None
gpu_capacity_amd = int(gpu_amd_raw) if gpu_amd_raw else None

k8s_node = KubernetesNode(
name=node.metadata.name,
cluster_id=cluster_id,
Expand All @@ -296,6 +302,8 @@ async def list_nodes(self, cluster_id: str) -> List["KubernetesNode"]:
memory_capacity=capacity.get("memory"),
cpu_allocatable=allocatable.get("cpu"),
memory_allocatable=allocatable.get("memory"),
gpu_capacity_nvidia=gpu_capacity_nvidia,
gpu_capacity_amd=gpu_capacity_amd,
roles=roles,
internal_ip=internal_ip,
external_ip=external_ip,
Expand Down
159 changes: 159 additions & 0 deletions ushadow/frontend/src/components/kubernetes/ClusterNodeList.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import { useState } from 'react'
import { ChevronDown, ChevronRight, Server, RefreshCw } from 'lucide-react'
import { kubernetesApi, KubernetesNode } from '../../services/api'

/** Props for the collapsible per-cluster node list. */
interface ClusterNodeListProps {
  /** Cluster whose nodes are fetched on first expand; also used in the `data-testid` values. */
  clusterId: string
  /** The component renders nothing unless this equals `'connected'`. */
  clusterStatus: string
  /** When not null/undefined, shown in parentheses next to the "Nodes" toggle label. */
  nodeCount?: number
}

/** Bytes per unit for each Kubernetes memory quantity suffix ('' means plain bytes). */
const MEMORY_SUFFIX_BYTES: Record<string, number> = {
  '': 1,
  k: 1e3,
  M: 1e6,
  G: 1e9,
  T: 1e12,
  P: 1e15,
  E: 1e18,
  Ki: 2 ** 10,
  Mi: 2 ** 20,
  Gi: 2 ** 30,
  Ti: 2 ** 40,
  Pi: 2 ** 50,
  Ei: 2 ** 60,
}

/**
 * Formats a Kubernetes memory quantity string for display in GiB.
 *
 * @param memStr - Raw quantity such as `"16384256Ki"`, `"8192Mi"` or a plain byte count.
 * @returns `'?'` when the value is missing or empty; the input unchanged when it is
 *   already expressed in `Gi` or is not a recognised quantity; otherwise the value
 *   converted to GiB with one decimal place, e.g. `"15.6 Gi"`.
 */
function formatMemory(memStr?: string): string {
  if (!memStr) return '?'

  // Already in the display unit: pass it through untouched.
  if (memStr.endsWith('Gi')) return memStr

  // The suffix must be parsed explicitly. `parseInt` only reads the leading digits,
  // so it would silently treat "8192Mi" (or a plain byte count) as KiB.
  const match = /^(\d+(?:\.\d+)?)(Ki|Mi|Ti|Pi|Ei|[kMGTPE])?$/.exec(memStr.trim())
  if (!match) return memStr

  const [, amount, suffix = ''] = match
  const bytesPerUnit = MEMORY_SUFFIX_BYTES[suffix]
  if (bytesPerUnit === undefined) return memStr

  const gib = (Number(amount) * bytesPerUnit) / 2 ** 30
  return `${gib.toFixed(1)} Gi`
}

/**
 * Pill summarising the GPU capacity advertised by a node, e.g. "2x NVIDIA, 1x AMD GPU".
 * Renders nothing when neither the NVIDIA nor the AMD count is a non-zero number.
 */
function GpuBadge({ node }: { node: KubernetesNode }) {
  const { gpu_capacity_nvidia: nvidiaCount, gpu_capacity_amd: amdCount } = node

  // One label per vendor that reports a non-zero count, NVIDIA first.
  const vendorLabels = [
    nvidiaCount ? `${nvidiaCount}x NVIDIA` : null,
    amdCount ? `${amdCount}x AMD` : null,
  ].filter((label): label is string => label !== null)

  if (vendorLabels.length === 0) return null

  return (
    <span
      className="inline-flex items-center px-2 py-0.5 text-xs font-medium rounded-full bg-accent-100 dark:bg-accent-900/30 text-accent-700 dark:text-accent-300"
      data-testid={`k8s-node-gpu-${node.name}`}
    >
      {vendorLabels.join(', ')} GPU
    </span>
  )
}

/**
 * Card for a single Kubernetes node: a header with the node name, optional GPU badge
 * and status pill, followed by a two-column grid of details (roles, CPU, memory,
 * kubelet version, OS image). Optional details are omitted when they have no value.
 */
function K8sNodeCard({ node }: { node: KubernetesNode }) {
  const { name, ready, status, roles, cpu_capacity, memory_capacity, kubelet_version, os_image } = node

  const mutedLabelClass = 'text-neutral-400 dark:text-neutral-500'

  // Green pill for ready nodes, red otherwise.
  const readinessTone = ready
    ? 'bg-success-100 dark:bg-success-900/30 text-success-700 dark:text-success-300'
    : 'bg-danger-100 dark:bg-danger-900/30 text-danger-700 dark:text-danger-300'
  const statusPillClass = `px-2 py-0.5 text-xs rounded-full font-medium ${readinessTone}`

  // Renders one "Label: value" cell of the details grid.
  const detailCell = (label: string, value: string, cellClass?: string) => (
    <div className={cellClass}>
      <span className={mutedLabelClass}>{label}</span>
      {value}
    </div>
  )

  return (
    <div
      className="p-3 bg-neutral-50 dark:bg-neutral-800/50 rounded-lg border border-neutral-200 dark:border-neutral-700"
      data-testid={`k8s-node-card-${name}`}
    >
      {/* Header: icon + name on the left, GPU badge + status on the right */}
      <div className="flex items-center justify-between mb-2">
        <div className="flex items-center gap-2">
          <Server className="h-4 w-4 text-neutral-400" />
          <span className="text-sm font-medium text-neutral-900 dark:text-neutral-100 truncate">
            {name}
          </span>
        </div>
        <div className="flex items-center gap-2">
          <GpuBadge node={node} />
          <span className={statusPillClass} data-testid={`k8s-node-status-${name}`}>
            {status}
          </span>
        </div>
      </div>

      {/* Details grid */}
      <div className="grid grid-cols-2 gap-x-4 gap-y-1 text-xs text-neutral-600 dark:text-neutral-400">
        {roles.length > 0 && detailCell('Roles: ', roles.join(', '))}
        {detailCell('CPU: ', cpu_capacity || '?')}
        {detailCell('Mem: ', formatMemory(memory_capacity))}
        {kubelet_version && detailCell('Kubelet: ', kubelet_version)}
        {os_image && detailCell('OS: ', os_image, 'col-span-2 truncate')}
      </div>
    </div>
  )
}

/**
 * Extracts a displayable message from a failed API call.
 * Returns the `response.data.detail` value only when it is a non-empty string;
 * anything else (missing, object, array) yields `null` so the caller can fall back
 * to a generic message instead of handing a non-renderable value to React.
 */
function extractErrorDetail(err: unknown): string | null {
  if (typeof err !== 'object' || err === null) return null
  const detail = (err as { response?: { data?: { detail?: unknown } } }).response?.data?.detail
  return typeof detail === 'string' && detail.length > 0 ? detail : null
}

/**
 * Collapsible list of the nodes in a Kubernetes cluster.
 *
 * Nodes are fetched lazily the first time the section is expanded and kept in state
 * afterwards. A failed fetch leaves the list marked as not loaded, so collapsing and
 * re-expanding retries the request. Renders nothing unless `clusterStatus` is
 * `'connected'`.
 */
export default function ClusterNodeList({ clusterId, clusterStatus, nodeCount }: ClusterNodeListProps) {
  const [expanded, setExpanded] = useState(false)
  const [nodes, setNodes] = useState<KubernetesNode[]>([])
  const [loading, setLoading] = useState(false)
  const [error, setError] = useState<string | null>(null)
  const [loaded, setLoaded] = useState(false)

  const handleToggle = async () => {
    // While the first fetch is in flight `expanded` and `loaded` are both still false,
    // so an extra click would otherwise start a duplicate request.
    if (loading) return

    if (!expanded && !loaded) {
      setLoading(true)
      setError(null)
      try {
        const response = await kubernetesApi.listNodes(clusterId)
        setNodes(response.data)
        setLoaded(true)
      } catch (err: unknown) {
        setError(extractErrorDetail(err) ?? 'Failed to load nodes')
      } finally {
        setLoading(false)
      }
    }
    // Functional update: do not rely on the value captured before the await.
    setExpanded((prev) => !prev)
  }

  if (clusterStatus !== 'connected') return null

  return (
    <div className="mt-4 pt-4 border-t border-neutral-200 dark:border-neutral-700">
      <button
        type="button"
        onClick={handleToggle}
        aria-expanded={expanded}
        className="flex items-center gap-2 text-sm font-medium text-neutral-700 dark:text-neutral-300 hover:text-neutral-900 dark:hover:text-neutral-100 transition-colors w-full"
        data-testid={`cluster-nodes-toggle-${clusterId}`}
      >
        {loading ? (
          <RefreshCw className="h-4 w-4 animate-spin" />
        ) : expanded ? (
          <ChevronDown className="h-4 w-4" />
        ) : (
          <ChevronRight className="h-4 w-4" />
        )}
        Nodes{nodeCount != null && ` (${nodeCount})`}
      </button>

      {expanded && (
        <div
          className="mt-3 space-y-2"
          data-testid={`cluster-nodes-list-${clusterId}`}
        >
          {error && (
            <p className="text-xs text-danger-600 dark:text-danger-400">{error}</p>
          )}
          {!error && nodes.map((node) => (
            <K8sNodeCard key={node.name} node={node} />
          ))}
          {!error && loaded && nodes.length === 0 && (
            <p className="text-xs text-neutral-500">No nodes found</p>
          )}
        </div>
      )}
    </div>
  )
}
48 changes: 48 additions & 0 deletions ushadow/frontend/src/pages/ClusterPage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,17 @@ interface UNode {
available_memory_mb: number
available_cpu_cores: number
available_disk_gb: number
gpu_count?: number
gpu_devices?: Array<{
vendor: string
index: number
model: string
vram_mb?: number
cuda_version?: string
rocm_version?: string
}>
gpu_model?: string
gpu_vram_mb?: number
}
metadata?: {
last_metrics?: {
Expand Down Expand Up @@ -102,6 +113,9 @@ interface LeaderInfo {
available_memory_mb: number
available_cpu_cores: number
available_disk_gb: number
gpu_count?: number
gpu_model?: string
gpu_vram_mb?: number
}
services?: string[]
manager_version?: string
Expand Down Expand Up @@ -793,6 +807,40 @@ export default function ClusterPage() {
)}
</div>

{/* GPU Info */}
{node.capabilities?.can_run_gpu && (
<div
className="flex flex-wrap items-center gap-2 text-xs mb-4"
data-testid={`node-gpu-info-${node.hostname}`}
>
<span className="inline-flex items-center px-2 py-0.5 rounded-full bg-accent-100 dark:bg-accent-900/30 text-accent-700 dark:text-accent-300 font-medium">
{(node.capabilities.gpu_count || 1) > 1
? `${node.capabilities.gpu_count}x GPU`
: 'GPU'}
</span>
{node.capabilities.gpu_model && (
<span className="text-neutral-600 dark:text-neutral-300">
{node.capabilities.gpu_model}
</span>
)}
{node.capabilities.gpu_vram_mb && (
<span className="text-neutral-500 dark:text-neutral-400">
{(node.capabilities.gpu_vram_mb / 1024).toFixed(0)} GB VRAM
</span>
)}
{node.capabilities.gpu_devices?.[0]?.cuda_version && (
<span className="text-neutral-500 dark:text-neutral-400">
CUDA {node.capabilities.gpu_devices[0].cuda_version}
</span>
)}
{node.capabilities.gpu_devices?.[0]?.rocm_version && (
<span className="text-neutral-500 dark:text-neutral-400">
ROCm {node.capabilities.gpu_devices[0].rocm_version}
</span>
)}
</div>
)}

{/* Deployed Services */}
{getNodeDeployments(node.hostname).length > 0 && (
<div className="mb-4 border-t border-neutral-200 dark:border-neutral-700 pt-3">
Expand Down
8 changes: 8 additions & 0 deletions ushadow/frontend/src/pages/KubernetesClustersPage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import Modal from '../components/Modal'
import ConfirmDialog from '../components/ConfirmDialog'
import DeployModal from '../components/DeployModal'
import DNSManagementPanel from '../components/kubernetes/DNSManagementPanel'
import ClusterNodeList from '../components/kubernetes/ClusterNodeList'

interface InfraService {
found: boolean
Expand Down Expand Up @@ -621,6 +622,13 @@ export default function KubernetesClustersPage() {
)}
</div>

{/* Nodes */}
<ClusterNodeList
clusterId={cluster.cluster_id}
clusterStatus={cluster.status}
nodeCount={cluster.node_count}
/>

{/* Actions */}
<div className="flex justify-between items-center pt-4 border-t border-neutral-200 dark:border-neutral-700 gap-2">
<div className="flex gap-2">
Expand Down
27 changes: 27 additions & 0 deletions ushadow/frontend/src/services/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -662,6 +662,29 @@ export interface CertificateStatus {
renewal_time?: string
}

/** A node of a Kubernetes cluster as returned by the node-listing endpoint. */
export interface KubernetesNode {
  // Identity and health
  name: string
  cluster_id: string
  status: string
  ready: boolean

  // Node software
  kubelet_version?: string
  os_image?: string
  kernel_version?: string
  container_runtime?: string

  // Resource quantities, kept as strings (e.g. a memory value with a unit suffix)
  cpu_capacity?: string
  memory_capacity?: string
  cpu_allocatable?: string
  memory_allocatable?: string

  // GPU counts per vendor; absent when the node reports none for that vendor
  gpu_capacity_nvidia?: number
  gpu_capacity_amd?: number

  // Scheduling and addressing
  roles: string[]
  internal_ip?: string
  external_ip?: string
  hostname?: string
  taints: { key: string; value: string; effect: string }[]
  labels: Record<string, string>
}

export const kubernetesApi = {
addCluster: (data: { name: string; kubeconfig: string; context?: string; namespace?: string; labels?: Record<string, string> }) =>
api.post<KubernetesCluster>('/api/kubernetes', data),
Expand Down Expand Up @@ -745,6 +768,10 @@ export const kubernetesApi = {
api.get<{ certificates: CertificateStatus[]; total: number }>(
`/api/kubernetes/${clusterId}/dns/certificates${namespace ? `?namespace=${namespace}` : ''}`
),

// Node operations
listNodes: (clusterId: string) =>
api.get<KubernetesNode[]>(`/api/kubernetes/${clusterId}/nodes`),
}

// Service Definition and Deployment types
Expand Down
Loading
Loading