job-exporter: two-stage image build; install RDC/DCGM from apt; drop Moneo install scripts

What this patch does
  * builder stage: compiles Python wheels for requirements.txt plus
    prometheus_client, psutil and filelock, so the compiler toolchain never
    reaches the final image.
  * runtime stage: installs the rdc package (amd64 only) and the pinned DCGM 4
    packages from apt, fetches nerdctl 2.2.1, then installs the prebuilt wheels
    with --no-index.
  * moneo-gpu-exporter_entrypoint.sh no longer runs update-dcgm.py.
  * build/update-dcgm.py and Moneo install/{amd,common,nvidia}.sh are deleted.

Review fixes folded into the Dockerfile hunk
  * kmod is installed again. The entrypoint selects the GPU vendor with
    `lsmod | grep ...`; the previous image installed kmod explicitly and the
    new package list had dropped it.
  * The DCGM python-bindings path rewrite that update-dcgm.py performed at
    container start (update_nvidia_exporter_path) is applied at build time
    with sed. It is a no-op when the old path is not present.
  * apt-get upgrade now runs with DEBIAN_FRONTEND=noninteractive, like the
    install that follows it.

NOTE(review): the old image exported PATH="${PATH}:/opt/rocm/bin"; this patch
  removes it. Confirm nothing started by the AMD entrypoint relies on it.
NOTE(review): leading whitespace inside hunk lines was reconstructed and the
  index hash of the Dockerfile was not regenerated; apply with
  `git apply --ignore-whitespace` if exact matching fails.

diff --git a/src/job-exporter/build/job-exporter.common.dockerfile b/src/job-exporter/build/job-exporter.common.dockerfile
index 3997304d..0fcd2dbc 100644
--- a/src/job-exporter/build/job-exporter.common.dockerfile
+++ b/src/job-exporter/build/job-exporter.common.dockerfile
@@ -16,75 +16,127 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
+############################
+# builder: only for compiling python wheels
+############################
+FROM mcr.microsoft.com/mirror/nvcr/nvidia/cuda:12.0.1-runtime-ubuntu22.04 AS builder
+
+ARG TARGETARCH
+
+RUN set -eux; \
+    apt-get update; \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        ca-certificates \
+        python3-pip \
+        python3-dev \
+        build-essential \
+        gcc; \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /w
+
+# build wheels once
+COPY requirements.txt /w/requirements.txt
+RUN python3 -m pip install --no-cache-dir -U pip wheel && \
+    python3 -m pip wheel --no-cache-dir --wheel-dir /w/wheels \
+        -r /w/requirements.txt \
+        prometheus_client psutil filelock
+
+
+############################
+# runtime: final image
+############################
 FROM mcr.microsoft.com/mirror/nvcr/nvidia/cuda:12.0.1-runtime-ubuntu22.04
 
 ARG TARGETARCH
 
-# Register the ROCM package repository, and install rocm-dev package
 ARG ROCM_VERSION=6.2.2
 ARG AMDGPU_VERSION=6.2.2
+ARG DCGM_TARGET_VERSION=1:4.4.1-1
 
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    autoconf \
-    automake \
-    bash \
-    build-essential \
-    cmake \
-    curl \
-    file \
-    g++ \
-    git \
-    gnupg \
-    ibverbs-utils \
-    kmod \
-    libc++-dev \
-    libcap-dev \
-    libelf1 \
-    libgflags-dev \
-    libgtest-dev \
-    libnuma-dev \
-    libtool \
-    numactl \
-    pkg-config \
-    python3-dev \
-    python3-pip \
-    sudo \
-    unzip && \
-    if [ "$TARGETARCH" = "amd64" ]; then \
-    printf "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" | tee /etc/apt/preferences.d/rocm-pin-600 && \
-    curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
-    echo "deb https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list && \
-    echo "deb https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list && \
-    apt-get update && \
-    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-dev; \
-    fi
-
-COPY src/Moneo /Moneo
+# --------------------------
+# base + REQUIRED apt upgrade
+# --------------------------
+# kmod supplies lsmod, which moneo-gpu-exporter_entrypoint.sh uses to detect the GPU vendor.
+RUN set -eux; \
+    apt-get update; \
+    DEBIAN_FRONTEND=noninteractive apt-get upgrade -y; \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        bash \
+        ca-certificates \
+        curl \
+        gnupg \
+        kmod \
+        wget \
+        python3 \
+        python3-pip; \
+    apt-get clean; \
+    rm -rf /var/lib/apt/lists/* /var/cache/apt/*
 
-# Install RDC
-RUN if [ "$TARGETARCH" = "amd64" ]; then sudo bash Moneo/src/worker/install/amd.sh; fi
+# --------------------------
+# ROCm (runtime only)
+# --------------------------
+RUN set -eux; \
+    if [ "$TARGETARCH" = "amd64" ]; then \
+        printf "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" \
+            > /etc/apt/preferences.d/rocm-pin-600; \
+        curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -; \
+        echo "deb https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" \
+            > /etc/apt/sources.list.d/rocm.list; \
+        echo "deb https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" \
+            > /etc/apt/sources.list.d/amdgpu.list; \
+        apt-get update; \
+        DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rdc; \
+        rm -rf /var/lib/apt/lists/*; \
+    fi
 
-# Install DCGM
-RUN sed -i 's/systemctl --now enable nvidia-dcgm/#&/' Moneo/src/worker/install/nvidia.sh && \
-    sed -i 's/systemctl start nvidia-dcgm/#&/' Moneo/src/worker/install/nvidia.sh && \
-    sudo bash Moneo/src/worker/install/nvidia.sh
+# --------------------------
+# DCGM (runtime only, same layer clean)
+# --------------------------
+RUN set -eux; \
+    apt-get update; \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        datacenter-gpu-manager-4-cuda12=${DCGM_TARGET_VERSION} \
+        datacenter-gpu-manager-4-core=${DCGM_TARGET_VERSION} \
+        datacenter-gpu-manager-4-proprietary-cuda12=${DCGM_TARGET_VERSION}; \
+    apt-get clean; \
+    rm -rf /var/lib/apt/lists/*
 
-ENV PATH="${PATH}:/opt/rocm/bin"
-COPY build/moneo-*-exporter_entrypoint.sh ./
-COPY build/update-dcgm.py .
+# --------------------------
+# nerdctl
+# --------------------------
+ENV NERDCTL_VERSION=2.2.1
+RUN set -eux; \
+    wget -O /tmp/nerdctl.tar.gz \
+        https://github.com/containerd/nerdctl/releases/download/v${NERDCTL_VERSION}/nerdctl-${NERDCTL_VERSION}-linux-${TARGETARCH}.tar.gz; \
+    mkdir -p /tmp/nerdctl; \
+    tar -xzf /tmp/nerdctl.tar.gz -C /tmp/nerdctl; \
+    mv /tmp/nerdctl/nerdctl /usr/local/bin/nerdctl; \
+    rm -rf /tmp/nerdctl* /tmp/nerdctl.tar.gz
 
-# For the job exporter
-ENV NERDCTL_VERSION=2.1.3
-RUN apt-get update && apt-get install --no-install-recommends -y wget ca-certificates
-RUN wget -O /tmp/nerdctl.tar.gz https://github.com/containerd/nerdctl/releases/download/v${NERDCTL_VERSION}/nerdctl-${NERDCTL_VERSION}-linux-${TARGETARCH}.tar.gz && \
-    mkdir -p /tmp/nerdctl && \
-    tar -xzvf /tmp/nerdctl.tar.gz -C /tmp/nerdctl && \
-    mv /tmp/nerdctl/nerdctl /usr/local/bin/nerdctl && \
-    mkdir -p /job_exporter && \
-    rm -rf /tmp/nerdctl*
+# --------------------------
+# python runtime deps (from wheels)
+# --------------------------
+
 
-COPY requirements.txt /job_exporter/
-RUN pip3 install -r /job_exporter/requirements.txt
+COPY --from=builder /w/wheels /wheels
+COPY requirements.txt /job_exporter/requirements.txt
 
-RUN apt update && apt upgrade -y && apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN python3 -m pip install --no-cache-dir -U pip && \
+    python3 -m pip install --no-cache-dir \
+        --no-index --find-links=/wheels \
+        -r /job_exporter/requirements.txt && \
+    python3 -m pip install --no-cache-dir \
+        --no-index --find-links=/wheels \
+        prometheus_client psutil filelock && \
+    rm -rf /wheels
+# --------------------------
+# app files
+# --------------------------
+COPY src/Moneo /Moneo
+# update-dcgm.py used to repoint the exporter at the DCGM 4 python bindings at container
+# start; DCGM 4 is now installed at build time, so apply the same (idempotent) rewrite here.
+RUN sed -i 's#/usr/local/dcgm/bindings/python3#/usr/share/datacenter-gpu-manager-4/bindings/python3#' \
+    /Moneo/src/worker/exporters/nvidia_exporter.py
 
 COPY src/*.py /job_exporter/
+COPY build/moneo-*-exporter_entrypoint.sh ./
diff --git a/src/job-exporter/build/moneo-gpu-exporter_entrypoint.sh b/src/job-exporter/build/moneo-gpu-exporter_entrypoint.sh
index 9798fd96..e7006354 100755
--- a/src/job-exporter/build/moneo-gpu-exporter_entrypoint.sh
+++ b/src/job-exporter/build/moneo-gpu-exporter_entrypoint.sh
@@ -9,7 +9,6 @@ if lsmod | grep -qi amdgpu; then
     echo "AMD Exporter Started!"
 elif lsmod | grep -qi nvidia; then
     echo "NVIDIA Graphics card detected."
-    python3 /update-dcgm.py
     # Launches NVIDIA DCGM Daemon
     nohup nv-hostengine &
     echo "DCGM Daemon Started!"
diff --git a/src/job-exporter/build/update-dcgm.py b/src/job-exporter/build/update-dcgm.py
deleted file mode 100644
index 8eef6d2c..00000000
--- a/src/job-exporter/build/update-dcgm.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import subprocess
-import sys
-import re
-import fileinput
-
-#!/usr/bin/env python3
-
-DCGM_TARGET_VERSION = "1:4.4.1-1"
-
-def get_dcgm_version():
-    try:
-        result = subprocess.run(
-            ["dpkg", "--list", "datacenter-gpu-manager"],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            check=True
-        )
-        for line in result.stdout.splitlines():
-            if line.startswith("ii") and "datacenter-gpu-manager" in line:
-                # Example line: ii datacenter-gpu-manager 1.2.3-1 amd64 NVIDIA datacenter GPU management tools
-                parts = re.split(r'\s+', line)
-                if len(parts) >= 3:
-                    return parts[2]
-        print("datacenter-gpu-manager is not installed.", file=sys.stderr)
-        sys.exit(0)
-    except subprocess.CalledProcessError as e:
-        print("Error running dpkg:", e, file=sys.stderr)
-        sys.exit(0)
-
-
-def get_cuda_version():
-    try:
-        result = subprocess.run(
-            ["nvidia-smi"],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            check=True
-        )
-        for line in result.stdout.splitlines():
-            match = re.search(r"CUDA Version:\s*([\d\.]+)", line)
-            if match:
-                return match.group(1)
-        print("CUDA version not found in nvidia-smi output.", file=sys.stderr)
-        sys.exit(0)
-    except subprocess.CalledProcessError as e:
-        print("Error running nvidia-smi:", e, file=sys.stderr)
-        sys.exit(0)
-
-
-def remove_dcgm():
-    try:
-        subprocess.run(
-            ["apt", "purge", "--yes", "datacenter-gpu-manager"],
-            check=True
-        )
-        subprocess.run(
-            ["apt", "purge", "--yes", "datacenter-gpu-manager-config"],
-            check=True
-        )
-        print("datacenter-gpu-manager and its config have been removed.")
-    except subprocess.CalledProcessError as e:
-        print("Error removing datacenter-gpu-manager:", e, file=sys.stderr)
-
-
-
-def install_latest_dcgm():
-    try:
-        subprocess.run(
-            ["apt", "update"],
-            check=True
-        )
-        subprocess.run(
-            [
-                "apt-get", "install", "--yes",
-                f"datacenter-gpu-manager-4-cuda12={DCGM_TARGET_VERSION}",
-                f"datacenter-gpu-manager-4-core={DCGM_TARGET_VERSION}",
-                f"datacenter-gpu-manager-4-proprietary-cuda12={DCGM_TARGET_VERSION}"
-            ],
-            check=True
-        )
-        print("Latest datacenter-gpu-manager-4-cuda12 has been installed.")
-    except subprocess.CalledProcessError as e:
-        print("Error installing datacenter-gpu-manager-4-cuda12:", e, file=sys.stderr)
-        sys.exit(1)
-
-
-def update_nvidia_exporter_path(file_path):
-
-    old_line = "sys.path.append('/usr/local/dcgm/bindings/python3')"
-    new_line = "sys.path.append('/usr/share/datacenter-gpu-manager-4/bindings/python3')"
-
-    replaced = False
-    for line in fileinput.input(file_path, inplace=True):
-        if old_line in line:
-            print(line.replace(old_line, new_line), end='')
-            replaced = True
-        else:
-            print(line, end='')
-    if replaced:
-        print(f"Updated sys.path in {file_path}")
-    else:
-        print(f"No matching sys.path line found in {file_path}")
-
-if __name__ == "__main__":
-    version = get_dcgm_version()
-    print(f"Current DCGM version: {version}")
-    cuda_version = get_cuda_version()
-    print(f"Current CUDA version: {cuda_version}")
-
-    if version.startswith("1:3.") and float(cuda_version) >= 12.8:
-        remove_dcgm()
-        install_latest_dcgm()
-        update_nvidia_exporter_path("/Moneo/src/worker/exporters/nvidia_exporter.py")
-    else:
-        print("no dcgm update")
\ No newline at end of file
diff --git a/src/job-exporter/src/Moneo/src/worker/install/amd.sh b/src/job-exporter/src/Moneo/src/worker/install/amd.sh
deleted file mode 100644
index cc39c286..00000000
--- a/src/job-exporter/src/Moneo/src/worker/install/amd.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-set -e
-
-# install dependencies
-source ./$(dirname "${BASH_SOURCE[0]}")/common.sh
-apt-get install -y automake make g++ unzip build-essential autoconf libtool pkg-config libgflags-dev libgtest-dev libc++-dev curl libcap-dev
-
-# install grpc
-export GRPC_ROOT=/opt/grpc
-
-# Check if the directory exists and is not empty
-if [ -d "$GRPC_ROOT" ] && [ "$(ls -A $GRPC_ROOT)" ]; then
-    cd "$GRPC_ROOT"
-    git pull
-else
-    git clone -b v1.61.0 https://github.com/grpc/grpc --depth=1 --shallow-submodules --recurse-submodules "$GRPC_ROOT"
-    cd "$GRPC_ROOT"
-fi
-cmake -B build \
-    -DgRPC_INSTALL=ON \
-    -DgRPC_BUILD_TESTS=OFF \
-    -DBUILD_SHARED_LIBS=ON \
-    -DCMAKE_INSTALL_PREFIX="$GRPC_ROOT" \
-    -DCMAKE_INSTALL_LIBDIR=lib \
-    -DCMAKE_BUILD_TYPE=Release
-make -C build -j $(nproc)
-make -C build install
-echo "$GRPC_ROOT" | sudo tee /etc/ld.so.conf.d/grpc.conf
-
-# install rdc
-export RDC_ROOT=/opt/rdc
-# Check if the directory exists and is not empty
-if [ -d "$RDC_ROOT" ] && [ "$(ls -A $RDC_ROOT)" ]; then
-    cd "$RDC_ROOT"
-    git pull
-else
-    git clone --depth 1 --branch rocm-6.2.2 https://github.com/RadeonOpenCompute/rdc "$RDC_ROOT"
-    cd "$RDC_ROOT"
-fi
-
-git fetch origin amd-staging
-git config user.email "Moneo@local.host"
-git config user.name "Moneo"
-git cherry-pick 660c5afaf49630781c1059ba6d30bae21743c32f
-
-# default installation location is /opt/rocm, specify with -DROCM_DIR or -DCMAKE_INSTALL_PREFIX
-cmake -B build -DGRPC_ROOT="$GRPC_ROOT" -DROCM_DIR="/opt/rocm" -DCMAKE_INSTALL_PREFIX="/opt/rocm"
-make -C build -j $(nproc)
-make -C build install
-
-# Update ldconfig
-export RDC_LIB_DIR=/opt/rocm/lib/rdc
-export GRPC_LIB_DIR=/opt/grpc/lib
-echo -e "${GRPC_LIB_DIR}\n${GRPC_LIB_DIR}64" | sudo tee /etc/ld.so.conf.d/x86_64-librdc_client.conf
-echo -e "${RDC_LIB_DIR}\n${RDC_LIB_DIR}64" | sudo tee -a /etc/ld.so.conf.d/x86_64-librdc_client.conf
-ldconfig
diff --git a/src/job-exporter/src/Moneo/src/worker/install/common.sh b/src/job-exporter/src/Moneo/src/worker/install/common.sh
deleted file mode 100644
index 938887b8..00000000
--- a/src/job-exporter/src/Moneo/src/worker/install/common.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-set -e
-
-# install dependencies
-# install DCGM
-distro=`awk -F= '/^NAME/{print $2}' /etc/os-release`
-if [[ $distro =~ "Ubuntu" ]]; then
-    apt-get install -y python3-dev
-elif [[ $distro =~ "AlmaLinux" ]]; then
-    yum install -y python3-devel
-else
-    echo "OS version is not supported"
-fi
-
-command -v pip3 >/dev/null 2>&1 || python3 <(curl -s https://bootstrap.pypa.io/get-pip.py)
-python3 -m pip -qqq install prometheus_client psutil filelock
diff --git a/src/job-exporter/src/Moneo/src/worker/install/nvidia.sh b/src/job-exporter/src/Moneo/src/worker/install/nvidia.sh
deleted file mode 100755
index 5346bb76..00000000
--- a/src/job-exporter/src/Moneo/src/worker/install/nvidia.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-set -e
-
-# install dependencies
-source $(dirname "${BASH_SOURCE[0]}")/common.sh
-
-
-distro=`awk -F= '/^NAME/{print $2}' /etc/os-release`
-echo $distro
-
-ubuntu_dcgm_install () {
-    echo "Installing Dcgm"
-    apt-get update \
-    && sudo apt-get install -y datacenter-gpu-manager
-    systemctl --now enable nvidia-dcgm
-    systemctl start nvidia-dcgm
-}
-
-alma_dcgm_install () {
-    echo "Installing Dcgm"
-    DCGM_VERSION=2.4.4
-    DCGM_URL=https://azhpcstor.blob.core.windows.net/azhpc-images-store/datacenter-gpu-manager-${DCGM_VERSION}-1-x86_64.rpm
-    wget --retry-connrefused --tries=3 --waitretry=5 $DCGM_URL
-    FILE_NAME=$(basename $DCGM_URL)
-    RLINK=$(readlink -f $FILE_NAME)
-    Check="1d8fbe97797fada8048a7832bfac4bc7d3ad661bb24163d21324965ae7e7817d"
-    checksum=`sha256sum $RLINK | awk '{print $1}'`
-    if [[ $checksum != $Check ]]
-    then
-        echo "*** Error - Checksum verification failed"
-        echo "*** Error - Checksum verification failed" > dcgm_fail.log
-        exit -1
-    fi
-    rpm -i datacenter-gpu-manager-${DCGM_VERSION}-1-x86_64.rpm
-    rm -f datacenter-gpu-manager-${DCGM_VERSION}-1-x86_64.rpm
-    systemctl --now enable nvidia-dcgm
-    systemctl start nvidia-dcgm
-}
-
-check_min_dcgm_ver(){
-    DCGM_VER=`dcgmi --version |grep version | awk -F ': ' '{print $2}'`
-    REQ_VER=$2
-    if [ "$(printf '%s\n' "$REQ_VER" "$DCGM_VER" | sort -V | head -n1)" = "$REQ_VER" ]; then
-        echo "A suitable version of Dcgm is already installed"
-    else
-        echo "removing old DCGM"
-        # remove old version
-        if [[ $distro =~ "Ubuntu" ]]; then
-            apt -y remove datacenter-gpu-manager
-        elif [[ $distro =~ "AlmaLinux" ]]; then
-            yum -y remove datacenter-gpu-manager
-        fi
-        $1
-    fi
-
-}
-
-
-# install DCGM
-if [[ $distro =~ "Ubuntu" ]]; then
-    dcgm_check=`sudo dpkg-query -l`
-    if [[ $dcgm_check =~ "datacenter-gpu-manager" ]]; then
-        check_min_dcgm_ver ubuntu_dcgm_install "3.1.6"
-    else
-        ubuntu_dcgm_install
-        systemctl --now enable nvidia-dcgm
-        systemctl start nvidia-dcgm
-    fi
-elif [[ $distro =~ "AlmaLinux" ]]; then
-    dcgm_check=`rpm -qa`
-    if [[ $dcgm_check =~ "datacenter-gpu-manager" ]]; then
-        check_min_dcgm_ver alma_dcgm_install "2.4.4"
-    else
-        alma_dcgm_install
-        systemctl --now enable nvidia-dcgm
-        systemctl start nvidia-dcgm
-    fi
-else
-    echo "OS version is not supported"
-fi
-
-exit 0