diff --git a/src/madengine/core/console.py b/src/madengine/core/console.py index 9340924a..782ba539 100644 --- a/src/madengine/core/console.py +++ b/src/madengine/core/console.py @@ -40,7 +40,8 @@ def sh( timeout: int=60, secret: bool=False, prefix: str="", - env: typing.Optional[typing.Dict[str, str]]=None + env: typing.Optional[typing.Dict[str, str]]=None, + ignore_stderr: bool=False ) -> str: """Run shell command. @@ -51,6 +52,7 @@ def sh( secret (bool): The flag to hide the command. prefix (str): The prefix of the output. env (typing_extensions.TypedDict): The environment variables. + ignore_stderr (bool): Don't include stderr in command's output. Returns: str: The output of the shell command. @@ -67,7 +69,7 @@ def sh( command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, + stderr=None if ignore_stderr else subprocess.STDOUT, shell=True, universal_newlines=True, bufsize=1, diff --git a/src/madengine/core/context.py b/src/madengine/core/context.py index 9b94ed32..87f8ddb4 100644 --- a/src/madengine/core/context.py +++ b/src/madengine/core/context.py @@ -174,7 +174,11 @@ def get_gpu_vendor(self) -> str: """ # Check if the GPU vendor is NVIDIA or AMD, and if it is unable to detect the GPU vendor. return self.console.sh( - 'bash -c \'if [[ -f /usr/bin/nvidia-smi ]] && $(/usr/bin/nvidia-smi > /dev/null 2>&1); then echo "NVIDIA"; elif [[ -f /opt/rocm/bin/rocm-smi ]]; then echo "AMD"; elif [[ -f /usr/local/bin/rocm-smi ]]; then echo "AMD"; else echo "Unable to detect GPU vendor"; fi || true\'' + 'bash -c \'if [[ -f /usr/bin/nvidia-smi ]] && $(/usr/bin/nvidia-smi > /dev/null 2>&1); then echo "NVIDIA";' + + ' elif [[ -f /opt/rocm/bin/rocm-smi ]]; then echo "AMD";' + + ' elif [[ -f /usr/local/bin/rocm-smi ]]; then echo "AMD";' + + ' else echo "Unable to detect GPU vendor"; fi || true\'', + ignore_stderr=True ) def get_host_os(self) -> str: @@ -236,7 +240,7 @@ def get_system_ngpus(self) -> int: """ number_gpus = 0 if self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] == "AMD": - number_gpus = int(self.console.sh("rocm-smi --showid --csv | grep card | wc -l")) + number_gpus = int(self.console.sh("rocm-smi --showid --csv | grep card | wc -l", ignore_stderr=True)) elif self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] == "NVIDIA": number_gpus = int(self.console.sh("nvidia-smi -L | wc -l")) else: @@ -327,7 +331,7 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: uniqueid_renderD_map = {unique_id:renderD for unique_id, renderD in zip(kfd_unique_ids, kfd_renderDs)} # get gpu id unique id map from rocm-smi - rsmi = self.console.sh("rocm-smi --showuniqueid | grep Unique.*:").split("\n") + rsmi = self.console.sh("rocm-smi --showuniqueid | grep Unique.*:", ignore_stderr=True).split("\n") # sort gpu_renderDs based on gpu ids gpu_renderDs = [uniqueid_renderD_map[line.split()[-1]] for line in rsmi] @@ -338,7 +342,7 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: nodeid_renderD_map = {nodeid: renderD for nodeid, renderD in zip(kfd_nodeids, kfd_renderDs)} # get gpu id node id map from rocm-smi - rsmi = re.findall(r"\n\d+\s+\d+",self.console.sh("rocm-smi --showhw")) + rsmi = re.findall(r"\n\d+\s+\d+",self.console.sh("rocm-smi --showhw", ignore_stderr=True)) rsmi_gpuids = [int(s.split()[0]) for s in rsmi] rsmi_nodeids = [int(s.split()[1]) for s in rsmi] gpuid_nodeid_map = {gpuid: nodeid for gpuid, nodeid in zip(rsmi_gpuids, rsmi_nodeids)} diff --git a/tests/test_console.py b/tests/test_console.py index 6ed0cb79..533793a4 100644 --- a/tests/test_console.py +++ b/tests/test_console.py @@ -52,3 +52,7 @@ def test_sh_env(self): def test_sh_verbose(self): obj = console.Console(shellVerbose=False) assert obj.sh("echo MAD Engine") == "MAD Engine" + + def test_sh_ignore_stderr(self): + obj = console.Console(shellVerbose=False) + assert obj.sh("echo fail 1>&2 | xargs echo success", ignore_stderr=True) == "success"