From eeea5f813223d796819b5974a17e0514754df6a2 Mon Sep 17 00:00:00 2001 From: Thomas Molinari Date: Fri, 6 Dec 2024 12:47:16 +0100 Subject: [PATCH 1/2] Fix [CROSS-PLATFORM] libcudart.so portability using libbpf-rs --- src/gpuprobe/gpuprobe_bandwidth_util.rs | 18 ++++++++++--- src/gpuprobe/gpuprobe_cudatrace.rs | 10 ++++++-- src/gpuprobe/gpuprobe_memleak.rs | 34 +++++++++++++++++++++---- 3 files changed, 52 insertions(+), 10 deletions(-) diff --git a/src/gpuprobe/gpuprobe_bandwidth_util.rs b/src/gpuprobe/gpuprobe_bandwidth_util.rs index f36fe5c..4d8936e 100644 --- a/src/gpuprobe/gpuprobe_bandwidth_util.rs +++ b/src/gpuprobe/gpuprobe_bandwidth_util.rs @@ -5,7 +5,7 @@ mod gpuprobe { )); } -use libbpf_rs::MapCore; +use libbpf_rs::{MapCore, UprobeOpts}; use super::uprobe_data::BandwidthUtilData; use super::{Gpuprobe, GpuprobeError, LIBCUDART_PATH}; @@ -14,12 +14,24 @@ impl Gpuprobe { /// attaches uprobes for the bandwidth util program, or returns an error on /// failure pub fn attach_bandwidth_util_uprobes(&mut self) -> Result<(), GpuprobeError> { + let opts_memcpy = UprobeOpts { + func_name: "cudaMemcpy".to_string(), + retprobe: false, + ..Default::default() + }; + + let opts_memcpy_ret = UprobeOpts { + func_name: "cudaMemcpy".to_string(), + retprobe: true, + ..Default::default() + }; + let cuda_memcpy_uprobe_link = self .skel .skel .progs .trace_cuda_memcpy - .attach_uprobe(false, -1, LIBCUDART_PATH, 0x000000000006f150) + .attach_uprobe_with_opts(-1, LIBCUDART_PATH, 0, opts_memcpy) .map_err(|_| GpuprobeError::AttachError)?; let cuda_memcpy_uretprobe_link = self @@ -27,7 +39,7 @@ impl Gpuprobe { .skel .progs .trace_cuda_memcpy_ret - .attach_uprobe(true, -1, LIBCUDART_PATH, 0x000000000006f150) + .attach_uprobe_with_opts(-1, LIBCUDART_PATH, 0, opts_memcpy_ret) .map_err(|_| GpuprobeError::AttachError)?; self.links.links.trace_cuda_memcpy = Some(cuda_memcpy_uprobe_link); diff --git a/src/gpuprobe/gpuprobe_cudatrace.rs b/src/gpuprobe/gpuprobe_cudatrace.rs index 
93e5abc..78e98ea 100644 --- a/src/gpuprobe/gpuprobe_cudatrace.rs +++ b/src/gpuprobe/gpuprobe_cudatrace.rs @@ -5,7 +5,7 @@ mod gpuprobe { )); } -use libbpf_rs::{MapCore, MapFlags}; +use libbpf_rs::{MapCore, MapFlags, UprobeOpts}; use super::uprobe_data::CudaTraceData; use super::{Gpuprobe, GpuprobeError, LIBCUDART_PATH}; @@ -15,12 +15,18 @@ impl Gpuprobe { /// attaches uprobes for the cudatrace program, or returns an error on /// failure pub fn attach_cudatrace_uprobes(&mut self) -> Result<(), GpuprobeError> { + let opts_launch_kernel = UprobeOpts { + func_name: "cudaLaunchKernel".to_string(), + retprobe: false, + ..Default::default() + }; + let cuda_launch_kernel_uprobe_link = self .skel .skel .progs .trace_cuda_launch_kernel - .attach_uprobe(false, -1, LIBCUDART_PATH, 0x0000000000074440) + .attach_uprobe_with_opts(-1, LIBCUDART_PATH, 0, opts_launch_kernel) .map_err(|_| GpuprobeError::AttachError)?; self.links.links.trace_cuda_launch_kernel = Some(cuda_launch_kernel_uprobe_link); diff --git a/src/gpuprobe/gpuprobe_memleak.rs b/src/gpuprobe/gpuprobe_memleak.rs index 19aedbd..b182263 100644 --- a/src/gpuprobe/gpuprobe_memleak.rs +++ b/src/gpuprobe/gpuprobe_memleak.rs @@ -1,4 +1,4 @@ -use libbpf_rs::MapCore; +use libbpf_rs::{MapCore, MapFlags, UprobeOpts}; use super::{Gpuprobe, GpuprobeError, LIBCUDART_PATH}; use std::collections::{BTreeMap, HashMap, HashSet}; @@ -8,12 +8,36 @@ impl Gpuprobe { /// attaches uprobes for the memleak program, or returns an error on /// failure pub fn attach_memleak_uprobes(&mut self) -> Result<(), GpuprobeError> { + let opts_malloc = UprobeOpts { + func_name: "cudaMalloc".to_string(), + retprobe: false, + ..Default::default() + }; + + let opts_malloc_ret = UprobeOpts { + func_name: "cudaMalloc".to_string(), + retprobe: true, + ..Default::default() + }; + + let opts_free = UprobeOpts { + func_name: "cudaFree".to_string(), + retprobe: false, + ..Default::default() + }; + + let opts_free_ret = UprobeOpts { + func_name: 
"cudaFree".to_string(), + retprobe: true, + ..Default::default() + }; + let cuda_malloc_uprobe_link = self .skel .skel .progs .memleak_cuda_malloc - .attach_uprobe(false, -1, LIBCUDART_PATH, 0x00000000000560c0) + .attach_uprobe_with_opts(-1, LIBCUDART_PATH, 0, opts_malloc) .map_err(|_| GpuprobeError::AttachError)?; let cuda_malloc_uretprobe_link = self @@ -21,7 +45,7 @@ impl Gpuprobe { .skel .progs .memleak_cuda_malloc_ret - .attach_uprobe(true, -1, LIBCUDART_PATH, 0x00000000000560c0) + .attach_uprobe_with_opts(-1, LIBCUDART_PATH, 0, opts_malloc_ret) .map_err(|_| GpuprobeError::AttachError)?; let cuda_free_uprobe_link = self @@ -29,7 +53,7 @@ impl Gpuprobe { .skel .progs .trace_cuda_free - .attach_uprobe(false, -1, LIBCUDART_PATH, 0x00000000000568c0) + .attach_uprobe_with_opts(-1, LIBCUDART_PATH, 0, opts_free) .map_err(|_| GpuprobeError::AttachError)?; let cuda_free_uretprobe_link = self @@ -37,7 +61,7 @@ impl Gpuprobe { .skel .progs .trace_cuda_free_ret - .attach_uprobe(true, -1, LIBCUDART_PATH, 0x00000000000568c0) + .attach_uprobe_with_opts(-1, LIBCUDART_PATH, 0, opts_free_ret) .map_err(|_| GpuprobeError::AttachError)?; self.links.links.memleak_cuda_malloc = Some(cuda_malloc_uprobe_link); From 586395faf5b4949f263258e04a594a6eaa9a6779 Mon Sep 17 00:00:00 2001 From: Thomas Molinari Date: Sat, 7 Dec 2024 15:05:20 +0100 Subject: [PATCH 2/2] add Dockerfile --- Dockerfile | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..66ae9b2 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,53 @@ +# ------------------------------------------------------- +# Builder stage +# ------------------------------------------------------- +FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 AS builder +# Install build dependencies; bpftool is provided by the linux-tools packages, not a standalone bpftool package +RUN apt-get update && apt-get install -y --no-install-recommends \ + clang llvm libelf-dev libbpf-dev
pkg-config git build-essential curl \ + libssl-dev ca-certificates linux-tools-generic linux-tools-common && \ + rm -rf /var/lib/apt/lists/* + +# Symlink bpftool to /usr/bin +RUN ln -s /usr/lib/linux-tools/*/bpftool /usr/bin/bpftool + +# Install Rust toolchain +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +ENV PATH=/root/.cargo/bin:$PATH + +# Set workdir +WORKDIR /app + +# Copy the entire project into the container +COPY . . + +# Prerequisite: vmlinux.h must already be generated on the host and present in src/bpf/ so that the COPY above includes it + +# Build the Rust project in release mode +RUN cargo build --release + +# ------------------------------------------------------- +# Runtime stage +# ------------------------------------------------------- +FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04 + +# Install only minimal runtime dependencies +RUN apt-get update && apt-get install -y --no-install-recommends libelf1 && \ + rm -rf /var/lib/apt/lists/* + +# Symlink libcudart.so.12 to libcudart.so +RUN ln -s /usr/local/cuda/lib64/libcudart.so.12 /usr/local/cuda/lib64/libcudart.so + + +WORKDIR /app +COPY --from=builder /app/target/release/gpu_probe /usr/local/bin/gpu_probe +COPY --from=builder /app/readme-assets /app/readme-assets +# Uncomment to copy the memleaktest binary for testing inside the container +# COPY memleaktest /app/memleaktest + +# Expose the Prometheus metrics port +EXPOSE 9000 + +# Run the GPU probe binary with some default arguments +CMD ["/usr/local/bin/gpu_probe", "--memleak", "--metrics-addr", "0.0.0.0:9000"] + \ No newline at end of file