diff --git a/src/madengine/core/context.py b/src/madengine/core/context.py
index b1c7c225..4627ab8d 100644
--- a/src/madengine/core/context.py
+++ b/src/madengine/core/context.py
@@ -191,7 +191,7 @@ def get_gpu_vendor(self) -> str:
         """
         # Check if the GPU vendor is NVIDIA or AMD, and if it is unable to detect the GPU vendor.
         return self.console.sh(
-            'bash -c \'if [[ -f /usr/bin/nvidia-smi ]] && $(/usr/bin/nvidia-smi > /dev/null 2>&1); then echo "NVIDIA"; elif [[ -f /opt/rocm/bin/amd-smi ]]; then echo "AMD"; elif [[ -f /usr/local/bin/amd-smi ]]; then echo "AMD"; else echo "Unable to detect GPU vendor"; fi || true\''
+            'bash -c \'if [[ -f /usr/bin/nvidia-smi ]] && $(/usr/bin/nvidia-smi > /dev/null 2>&1); then echo "NVIDIA"; elif [[ -f /opt/rocm/bin/amd-smi ]]; then echo "AMD"; elif [[ -f /usr/local/bin/amd-smi ]]; then echo "AMD"; elif [[ -f /opt/rocm/bin/rocm-smi ]]; then echo "AMD"; else echo "Unable to detect GPU vendor"; fi || true\''
         )
 
     def get_host_os(self) -> str:
@@ -322,7 +322,28 @@ def get_system_gpu_product_name(self) -> str:
         - AMD
         """
         if self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] == "AMD":
-            return self.console.sh("amd-smi static -g 0 | grep MARKET_NAME: | cut -d ':' -f 2")
+            try:
+                return self.console.sh("amd-smi static -g 0 | grep MARKET_NAME: | cut -d ':' -f 2")
+            except Exception as e:
+                # Try fallback to rocm-smi
+                try:
+                    output = self.console.sh("rocm-smi -i")
+                    # Parse output to extract product name from brackets
+                    # Example: "GPU[0] : Device Name: Arcturus GL-XL [Instinct MI100]"
+                    # Extract: "Instinct MI100"
+                    for line in output.split('\n'):
+                        if 'Device Name:' in line and 'GPU[0]' in line:
+                            # Use regex to find text within brackets
+                            match = re.search(r'\[(.*?)\]', line)
+                            if match:
+                                return match.group(1).strip()
+                    raise RuntimeError("Could not parse GPU product name from rocm-smi output")
+                except Exception as rocm_error:
+                    raise RuntimeError(
+                        f"Unable to determine AMD GPU product name. "
+                        f"Ensure amd-smi or rocm-smi is installed and GPUs are accessible. "
+                        f"amd-smi error: {e}, rocm-smi error: {rocm_error}"
+                    )
         elif self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] == "NVIDIA":
             return self.console.sh("nvidia-smi --query-gpu=name --format=csv,noheader,nounits -i 0")
         else:
@@ -414,9 +435,64 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]:
 
             kfd_renderDs = [int(line.split()[-1]) for line in kfd_properties]
 
-            # Get gpu id - renderD mapping using unique id if ROCm < 6.4.0 and node id otherwise
-            # node id is more robust but is only available from 6.4.0
-            if rocm_version < (6, 4, 0):
+            # Get gpu id - renderD mapping using unique id if ROCm < 6.4.1 and node id otherwise
+            # node id is more robust but is only available from 6.4.1
+            use_legacy_method = False
+
+            if rocm_version >= (6, 4, 1):
+                # Try modern method using node_id (ROCm >= 6.4.1)
+                try:
+                    kfd_nodeids = []
+                    for line in kfd_properties:
+                        try:
+                            match = re.search(r"\d+", line.split()[0])
+                            if match:
+                                kfd_nodeids.append(int(match.group()))
+                            else:
+                                print(f"Warning: Could not extract node ID from line: {line}")
+                        except (IndexError, ValueError) as e:
+                            print(f"Warning: Failed to parse node ID from line '{line}': {e}")
+                            continue
+
+                    if len(kfd_nodeids) != len(kfd_renderDs):
+                        raise RuntimeError(
+                            f"Mismatch between node IDs count ({len(kfd_nodeids)}) "
+                            f"and renderDs count ({len(kfd_renderDs)})"
+                        )
+
+                    # Map node ids to renderDs
+                    nodeid_renderD_map = {
+                        nodeid: renderD
+                        for nodeid, renderD in zip(kfd_nodeids, kfd_renderDs)
+                    }
+
+                    # Get list of GPUs from amd-smi
+                    output = self.console.sh("amd-smi list -e --json")
+                    if not output or output.strip() == "":
+                        raise ValueError("Failed to retrieve AMD GPU data from amd-smi")
+
+                    data = json.loads(output)
+
+                    if not data or not isinstance(data, list):
+                        raise ValueError("amd-smi returned empty or invalid data")
+
+                    # Get gpu id to node id map from amd-smi
+                    gpuid_nodeid_map = {}
+                    for item in data:
+                        gpuid_nodeid_map[item["gpu"]] = item["node_id"]
+
+                    # Sort gpu_renderDs based on gpu ids
+                    gpu_renderDs = [
+                        nodeid_renderD_map[gpuid_nodeid_map[gpuid]]
+                        for gpuid in sorted(gpuid_nodeid_map.keys())
+                    ]
+
+                except Exception as e:
+                    # Fallback to legacy method if amd-smi fails
+                    print(f"Warning: amd-smi failed on ROCm >= 6.4.1, falling back to rocm-smi: {e}")
+                    use_legacy_method = True
+
+            if rocm_version < (6, 4, 1) or use_legacy_method:
                 # Legacy method using unique_id
                 kfd_unique_output = self.console.sh("grep -r unique_id /sys/devices/virtual/kfd/kfd/topology/nodes")
                 if not kfd_unique_output:
@@ -463,61 +539,6 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]:
                         gpu_renderDs.append(uniqueid_renderD_map[unique_id])
                     except (IndexError, KeyError) as e:
                         raise RuntimeError(f"Failed to map unique ID from line '{line}': {e}")
-            else:
-                # Modern method using node_id (ROCm >= 6.4.0)
-                kfd_nodeids = []
-                for line in kfd_properties:
-                    try:
-                        match = re.search(r"\d+", line.split()[0])
-                        if match:
-                            kfd_nodeids.append(int(match.group()))
-                        else:
-                            print(f"Warning: Could not extract node ID from line: {line}")
-                    except (IndexError, ValueError) as e:
-                        print(f"Warning: Failed to parse node ID from line '{line}': {e}")
-                        continue
-
-                if len(kfd_nodeids) != len(kfd_renderDs):
-                    raise RuntimeError(
-                        f"Mismatch between node IDs count ({len(kfd_nodeids)}) "
-                        f"and renderDs count ({len(kfd_renderDs)})"
-                    )
-
-                # Map node ids to renderDs
-                nodeid_renderD_map = {
-                    nodeid: renderD
-                    for nodeid, renderD in zip(kfd_nodeids, kfd_renderDs)
-                }
-
-                # Get list of GPUs from amd-smi
-                output = self.console.sh("amd-smi list -e --json")
-                if not output or output.strip() == "":
-                    raise ValueError("Failed to retrieve AMD GPU data from amd-smi")
-
-                try:
-                    data = json.loads(output)
-                except json.JSONDecodeError as e:
-                    raise ValueError(f"Failed to parse amd-smi JSON output: {e}")
-
-                if not data or not isinstance(data, list):
-                    raise ValueError("amd-smi returned empty or invalid data")
-
-                # Get gpu id to node id map from amd-smi
-                gpuid_nodeid_map = {}
-                for item in data:
-                    try:
-                        gpuid_nodeid_map[item["gpu"]] = item["node_id"]
-                    except KeyError as e:
-                        raise KeyError(f"Failed to parse node_id from amd-smi data: {e}. Item: {item}")
-
-                # Sort gpu_renderDs based on gpu ids
-                try:
-                    gpu_renderDs = [
-                        nodeid_renderD_map[gpuid_nodeid_map[gpuid]]
-                        for gpuid in sorted(gpuid_nodeid_map.keys())
-                    ]
-                except KeyError as e:
-                    raise RuntimeError(f"Failed to map GPU IDs to renderDs: {e}")
 
         except (RuntimeError, ValueError, KeyError) as e:
             # Re-raise with context
diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index a620d96f..00078a95 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -202,7 +202,7 @@ def clean_up_docker_container(self, is_cleaned: bool = False) -> None:
         gpu_vendor = self.context.ctx["docker_env_vars"]["MAD_GPU_VENDOR"]
         # show gpu info
        if gpu_vendor.find("AMD") != -1:
-            self.console.sh("/opt/rocm/bin/amd-smi || true")
+            self.console.sh("/opt/rocm/bin/amd-smi || /opt/rocm/bin/rocm-smi || true")
         elif gpu_vendor.find("NVIDIA") != -1:
             self.console.sh("nvidia-smi -L || true")
 
@@ -726,7 +726,7 @@ def run_model_impl(
 
         # echo gpu smi info
         if gpu_vendor.find("AMD") != -1:
-            smi = model_docker.sh("/opt/rocm/bin/amd-smi || true")
+            smi = model_docker.sh("/opt/rocm/bin/amd-smi || /opt/rocm/bin/rocm-smi || true")
         elif gpu_vendor.find("NVIDIA") != -1:
             smi = model_docker.sh("/usr/bin/nvidia-smi || true")
         else: