GPUprobe · thomas0903 · Dec 6, 2024 · Dec 7, 2024 · ethangraham2001 · Dec 7, 2024
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,53 @@
+# -------------------------------------------------------
+# Builder stage
+# -------------------------------------------------------
+FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 AS builder
+# Install build dependencies; bpftool has no package of its own here and comes from the linux-tools-* packages
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    clang llvm libelf-dev libbpf-dev pkg-config git build-essential curl \
+    libssl-dev ca-certificates linux-tools-generic linux-tools-common && \
+    rm -rf /var/lib/apt/lists/*
+
+# Expose the bpftool found under /usr/lib/linux-tools/<version>/ as /usr/bin/bpftool
+RUN ln -s /usr/lib/linux-tools/*/bpftool /usr/bin/bpftool
+
+# Install Rust toolchain
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+ENV PATH=/root/.cargo/bin:$PATH
+
+# Set workdir
+WORKDIR /app
+
+# Copy the entire project into the container
+COPY . .
+
+# vmlinux.h is not generated in this image: it must already exist in src/bpf/ on the host so that the COPY above brings it in
+
+# Build the Rust project in release mode
+RUN cargo build --release
+
+# -------------------------------------------------------
+# Runtime stage
+# -------------------------------------------------------
+FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04
+
+# Install only minimal runtime dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends libelf1 && \
+    rm -rf /var/lib/apt/lists/*
+
+# Create the unversioned name libcudart.so as a symlink pointing at libcudart.so.12
+RUN ln -s /usr/local/cuda/lib64/libcudart.so.12 /usr/local/cuda/lib64/libcudart.so
+
+
+WORKDIR /app
+COPY --from=builder /app/target/release/gpu_probe /usr/local/bin/gpu_probe
+COPY --from=builder /app/readme-assets /app/readme-assets
+# Optional: uncomment the next line to copy the memleaktest binary into the image for testing inside Docker
+# COPY memleaktest /app/memleaktest
+
+# Expose the Prometheus metrics port
+EXPOSE 9000
+
+# Default command: run gpu_probe with --memleak and the metrics address bound to 0.0.0.0:9000 (the port exposed above)
+CMD ["/usr/local/bin/gpu_probe", "--memleak", "--metrics-addr", "0.0.0.0:9000"]
+
diff --git a/src/gpuprobe/gpuprobe_bandwidth_util.rs b/src/gpuprobe/gpuprobe_bandwidth_util.rs
@@ -5,7 +5,7 @@ mod gpuprobe {
     ));
 }
 
-use libbpf_rs::MapCore;
+use libbpf_rs::{MapCore, UprobeOpts};
 
 use super::uprobe_data::BandwidthUtilData;
 use super::{Gpuprobe, GpuprobeError, LIBCUDART_PATH};
@@ -14,20 +14,32 @@ impl Gpuprobe {
     /// attaches uprobes for the bandwidth util program, or returns an error on
     /// failure
     pub fn attach_bandwidth_util_uprobes(&mut self) -> Result<(), GpuprobeError> {
+        let opts_memcpy = UprobeOpts {
+            func_name: "cudaMemcpy".to_string(),
+            retprobe: false,
+            ..Default::default()
+        };
+
+        let opts_memcpy_ret = UprobeOpts {
+            func_name: "cudaMemcpy".to_string(),
+            retprobe: true,
+            ..Default::default()
+        };
+
         let cuda_memcpy_uprobe_link = self
             .skel
             .skel
             .progs
             .trace_cuda_memcpy
-            .attach_uprobe(false, -1, LIBCUDART_PATH, 0x000000000006f150)
+            .attach_uprobe_with_opts(-1, LIBCUDART_PATH, 0, opts_memcpy)
             .map_err(|_| GpuprobeError::AttachError)?;
 
         let cuda_memcpy_uretprobe_link = self
             .skel
             .skel
             .progs
             .trace_cuda_memcpy_ret
-            .attach_uprobe(true, -1, LIBCUDART_PATH, 0x000000000006f150)
+            .attach_uprobe_with_opts(-1, LIBCUDART_PATH, 0, opts_memcpy_ret)
             .map_err(|_| GpuprobeError::AttachError)?;
 
         self.links.links.trace_cuda_memcpy = Some(cuda_memcpy_uprobe_link);

diff --git a/src/gpuprobe/gpuprobe_cudatrace.rs b/src/gpuprobe/gpuprobe_cudatrace.rs
@@ -5,7 +5,7 @@ mod gpuprobe {
     ));
 }
 
-use libbpf_rs::{MapCore, MapFlags};
+use libbpf_rs::{MapCore, MapFlags, UprobeOpts};
 
 use super::uprobe_data::CudaTraceData;
 use super::{Gpuprobe, GpuprobeError, LIBCUDART_PATH};
@@ -15,12 +15,18 @@ impl Gpuprobe {
     /// attaches uprobes for the cudatrace program, or returns an error on
     /// failure
     pub fn attach_cudatrace_uprobes(&mut self) -> Result<(), GpuprobeError> {
+        let opts_launch_kernel = UprobeOpts {
+            func_name: "cudaLaunchKernel".to_string(),
+            retprobe: false,
+            ..Default::default()
+        };
+
         let cuda_launch_kernel_uprobe_link = self
             .skel
             .skel
             .progs
             .trace_cuda_launch_kernel
-            .attach_uprobe(false, -1, LIBCUDART_PATH, 0x0000000000074440)
+            .attach_uprobe_with_opts(-1, LIBCUDART_PATH, 0, opts_launch_kernel)
             .map_err(|_| GpuprobeError::AttachError)?;
 
         self.links.links.trace_cuda_launch_kernel = Some(cuda_launch_kernel_uprobe_link);

diff --git a/src/gpuprobe/gpuprobe_memleak.rs b/src/gpuprobe/gpuprobe_memleak.rs
@@ -1,4 +1,4 @@
-use libbpf_rs::MapCore;
+use libbpf_rs::{MapCore, MapFlags, UprobeOpts};
 
 use super::{Gpuprobe, GpuprobeError, LIBCUDART_PATH};
 use std::collections::{BTreeMap, HashMap, HashSet};
@@ -8,36 +8,60 @@ impl Gpuprobe {
     /// attaches uprobes for the memleak program, or returns an error on
     /// failure
     pub fn attach_memleak_uprobes(&mut self) -> Result<(), GpuprobeError> {
+        let opts_malloc = UprobeOpts {
+            func_name: "cudaMalloc".to_string(),
+            retprobe: false,
+            ..Default::default()
+        };
+
+        let opts_malloc_ret = UprobeOpts {
+            func_name: "cudaMalloc".to_string(),
+            retprobe: true,
+            ..Default::default()
+        };
+
+        let opts_free = UprobeOpts {
+            func_name: "cudaFree".to_string(),
+            retprobe: false,
+            ..Default::default()
+        };
+
+        let opts_free_ret = UprobeOpts {
+            func_name: "cudaFree".to_string(),
+            retprobe: true,
+            ..Default::default()
+        };
+
         let cuda_malloc_uprobe_link = self
             .skel
             .skel
             .progs
             .memleak_cuda_malloc
-            .attach_uprobe(false, -1, LIBCUDART_PATH, 0x00000000000560c0)
+            .attach_uprobe_with_opts(-1, LIBCUDART_PATH, 0, opts_malloc)
             .map_err(|_| GpuprobeError::AttachError)?;
 
         let cuda_malloc_uretprobe_link = self
             .skel
             .skel
             .progs
             .memleak_cuda_malloc_ret
-            .attach_uprobe(true, -1, LIBCUDART_PATH, 0x00000000000560c0)
+            .attach_uprobe_with_opts(-1, LIBCUDART_PATH, 0, opts_malloc_ret)
             .map_err(|_| GpuprobeError::AttachError)?;
 
         let cuda_free_uprobe_link = self
             .skel
             .skel
             .progs
             .trace_cuda_free
-            .attach_uprobe(false, -1, LIBCUDART_PATH, 0x00000000000568c0)
+            .attach_uprobe_with_opts(-1, LIBCUDART_PATH, 0, opts_free)
             .map_err(|_| GpuprobeError::AttachError)?;
 
         let cuda_free_uretprobe_link = self
             .skel
             .skel
             .progs
             .trace_cuda_free_ret
-            .attach_uprobe(true, -1, LIBCUDART_PATH, 0x00000000000568c0)
+            .attach_uprobe_with_opts(-1, LIBCUDART_PATH, 0, opts_free_ret)
             .map_err(|_| GpuprobeError::AttachError)?;
 
         self.links.links.memleak_cuda_malloc = Some(cuda_malloc_uprobe_link);