From fee5282e716dd6eb74c17da8d22f705decaa8b06 Mon Sep 17 00:00:00 2001 From: Weili An Date: Wed, 11 Feb 2026 12:03:44 -0500 Subject: [PATCH 1/3] Add LRC max coalesce entry size ubench --- .../ubench/lrc/lrc_max_merged/Makefile | 7 + .../lrc/lrc_max_merged/lrc_max_merged.cu | 155 ++++++++++++++++++ .../lrc_max_merged/sweep_lrc_max_merged.sh | 26 +++ .../ubench/lrc/lrc_queue_size/Makefile | 7 + .../lrc/lrc_queue_size/lrc_queue_size.cu | 38 +++++ 5 files changed, 233 insertions(+) create mode 100644 src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/Makefile create mode 100644 src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/lrc_max_merged.cu create mode 100644 src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/sweep_lrc_max_merged.sh create mode 100644 src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_queue_size/Makefile create mode 100644 src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_queue_size/lrc_queue_size.cu diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/Makefile b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/Makefile new file mode 100644 index 000000000..f877e294b --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/Makefile @@ -0,0 +1,7 @@ +SRC = lrc_max_merged.cu + +EXE = lrc_max_merged + +NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt + +include ../../../common/common.mk diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/lrc_max_merged.cu b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/lrc_max_merged.cu new file mode 100644 index 000000000..41cec5396 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/lrc_max_merged.cu @@ -0,0 +1,155 @@ +// LRC Max Merged Microbenchmark (NCU-based) +// +// Discovers the maximum number of read requests that can be coalesced +// into a single LRC (L2 Request Coalescer) entry on NVIDIA GPUs. 
+// +// Principle: Launch N blocks (1 warp each, 1 per SM) that all pointer-chase +// through the same sequence of L2 sectors (bypassing L1 with ld.global.cg). +// Blocks naturally stay roughly in lockstep since they all do identical work. +// +// Measurement: Use ncu hardware counters to compare SM-side sector requests +// (pre-LRC) with L2-side sector reads (post-LRC). The compression ratio +// reveals max_merged. +// +// Usage: +// ./lrc_max_merged [N] [THREADS_PER_BLOCK] [ITERS] [SYNC_INTERVAL] +// - N: Number of blocks (default: all SMs) +// - THREADS_PER_BLOCK: Number of threads per block (default: 256) +// - ITERS: Number of iterations (default: 4096) +// - SYNC_INTERVAL: Number of iterations between grid-wide sync (default: ITERS / 8) +// +// NCU profiling: +// ncu --metrics lrc__lts2lrc_sectors_op_read.sum.sum,lrc__xbar2gpc_sectors_op_read.sum.sum \ +// ./lrc_max_merged +// +// Compile: nvcc -Xptxas -dlcm=cg lrc_max_merged.cu -o lrc_max_merged + +#include +#include +#include +#include +#include +#include + +namespace cg = cooperative_groups; + +#include "../../../hw_def/hw_def.h" + +#define ARRAY_SIZE 4096 // uint64_t elements -> 32KB total +#define SECTOR_STRIDE 4 // 4 * sizeof(uint64_t) = 32 bytes = 1 sector +#define COMPUTE_ITERS 64 // Number of multiply-adds to insert after each load + +__global__ void lrc_max_merged_kernel(uint64_t *data, uint64_t *dsink, uint32_t ITERS, uint32_t SYNC_INTERVAL) { + // All threads in all blocks start at the same pointer (sector 0) + uint64_t ptr = (uint64_t)data; + + // Block-level sync before starting + asm volatile("bar.sync 0;"); + + // Global synchronization for all blocks in the grid + cg::grid_group grid = cg::this_grid(); + grid.sync(); + + // Pointer chase across sectors: + // - ld.global.cg bypasses L1, caches in L2 -> goes through LRC + // - Data dependency (ptr = *ptr) prevents compiler optimization + // - Each iteration: all blocks read same sector (same ptr value) + // since they started at the same 
address and follow the same chain + // - ptr advances to next sector after each load + // + // After each load we insert a chain of dependent ALU ops so that + // the loop body takes long enough for all blocks to converge on the + // same sector before any block advances to the next one. Without + // this padding the loop is too tight and blocks slip out of the + // coalescing window. + uint64_t sink = 0; + for (uint32_t i = 0; i < ITERS; i++) { + asm volatile("ld.global.cg.u64 %0, [%0];" : "+l"(ptr)::"memory"); + + // Dependent ALU padding: a chain of multiply-adds that the + // compiler cannot remove (volatile asm, data dependency on ptr). + // Each op depends on the previous result, serialising them. + uint64_t tmp = ptr; + #pragma unroll + for (uint32_t j = 0; j < COMPUTE_ITERS; j++) { + asm volatile("mad.lo.u64 %0, %0, %1, %2;" + : "+l"(tmp) : "l"((uint64_t)5), "l"((uint64_t)3) : ); + } + sink += tmp; // prevent dead-code elimination of the chain + + asm volatile("bar.sync 0;"); + // Periodic synchronization for grid-wide sync + if (SYNC_INTERVAL > 0 && (i + 1) % SYNC_INTERVAL == 0) + grid.sync(); + } + + asm volatile("bar.sync 0;"); + + // Prevent dead code elimination + dsink[blockIdx.x * blockDim.x + threadIdx.x] = ptr + sink; + asm volatile("bar.sync 0;"); +} + +int main(int argc, char *argv[]) { + initializeDeviceProp(0, argc, argv); + + unsigned sm_count = config.SM_NUMBER; + + // Number of blocks = CLI arg or all SMs + unsigned N = sm_count; + unsigned threads_per_block = config.THREADS_PER_BLOCK; + uint32_t ITERS = 4096; + uint32_t SYNC_INTERVAL = ITERS / 8; + if (argc > 1) { + N = (unsigned)atoi(argv[1]); + threads_per_block = (unsigned)atoi(argv[2]); + ITERS = (uint32_t)atoi(argv[3]); + SYNC_INTERVAL = (uint32_t)atoi(argv[4]); + } + + // Pointer chain array must fit in L2 + size_t array_bytes = ARRAY_SIZE * sizeof(uint64_t); + assert(array_bytes < config.L2_SIZE); + + unsigned num_sectors = ARRAY_SIZE / SECTOR_STRIDE; + + printf("=== LRC Max Merged 
(NCU-based) ===\n"); + printf("SM_COUNT=%u, N_BLOCKS=%u, ITERS=%d, NUM_SECTORS=%u\n", sm_count, N, + ITERS, num_sectors); + printf("Profile with ncu to measure L2 sector compression.\n"); + + // Allocate device memory + uint64_t *posArray_g, *dsink_g; + gpuErrchk(cudaMalloc(&posArray_g, array_bytes)); + gpuErrchk(cudaMalloc(&dsink_g, N * threads_per_block * sizeof(uint64_t))); + + // Initialize pointer chain on host using device pointer arithmetic + // Chain: sector 0 -> sector 1 -> ... -> sector N-1 -> sector 0 + uint64_t *init = (uint64_t *)malloc(array_bytes); + memset(init, 0, array_bytes); + for (unsigned s = 0; s < num_sectors - 1; s++) + init[s * SECTOR_STRIDE] = + (uint64_t)(posArray_g + (s + 1) * SECTOR_STRIDE); + init[(num_sectors - 1) * SECTOR_STRIDE] = + (uint64_t)(posArray_g); // cycle back + + gpuErrchk( + cudaMemcpy(posArray_g, init, array_bytes, cudaMemcpyHostToDevice)); + free(init); + + // Launch: N blocks, threads per block to enforce 2 block/SM + // Cooperative launch required for grid-wide sync + void *kernelArgs[] = {(void *)&posArray_g, (void *)&dsink_g, (void *)&ITERS, (void *)&SYNC_INTERVAL}; + gpuErrchk(cudaLaunchCooperativeKernel( + (void *)lrc_max_merged_kernel, N, threads_per_block, kernelArgs)); + gpuErrchk(cudaPeekAtLastError()); + gpuErrchk(cudaDeviceSynchronize()); + + printf("Kernel completed. 
Use ncu to analyze L2 sector counts.\n"); + + // Cleanup + cudaFree(posArray_g); + cudaFree(dsink_g); + + return 0; +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/sweep_lrc_max_merged.sh b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/sweep_lrc_max_merged.sh new file mode 100644 index 000000000..37f263c16 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/sweep_lrc_max_merged.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# A script to sweep the number of concurrent accesses to the L2 cache to test for the LRC max merged + +export CUDA_VISIBLE_DEVICES=7 +for tb_size in 32 64 128; do + echo "--------------------------------" + echo "Thread block size: $tb_size" + echo "NUM_CONCURRENT_ACCESS,NUM_BLOCKS,lts2lrc_sectors,xbar2gpc_sectors,LRC ratio(xbar2gpc/lts2lrc)" + for nb in 1 4 8 16 32 48 64 80 96 112 128; do + num_concurrent_access=$((nb * tb_size / 32)) + output=$(TMPDIR=./ ncu --metrics lrc__xbar2gpc_sectors_op_read.sum,lrc__lts2lrc_sectors_op_read.sum,lrc__average_xbar2gpc_sectors_op_read.ratio,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum ./lrc_max_merged "$nb" 128 2048 0 2>&1) + + lts2lrc=$(echo "$output" | grep 'lrc__lts2lrc_sectors_op_read.sum' | awk '{print $NF}' | tr -d ',') + xbar2gpc=$(echo "$output" | grep 'lrc__xbar2gpc_sectors_op_read.sum' | awk '{print $NF}' | tr -d ',') + + if [[ -n "$lts2lrc" && -n "$xbar2gpc" && "$xbar2gpc" != "0" ]]; then + ratio=$(awk "BEGIN {printf \"%.4f\", $xbar2gpc / $lts2lrc}") + else + ratio="N/A" + fi + + echo "${num_concurrent_access},${nb},${lts2lrc},${xbar2gpc},${ratio}" + done + echo "--------------------------------" + echo "" +done \ No newline at end of file diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_queue_size/Makefile b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_queue_size/Makefile new file mode 100644 index 000000000..da59aeb11 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_queue_size/Makefile @@ -0,0 +1,7 @@ +SRC = lrc_queue_size.cu + +EXE 
= lrc_queue_size + +NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt + +include ../../../common/common.mk diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_queue_size/lrc_queue_size.cu b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_queue_size/lrc_queue_size.cu new file mode 100644 index 000000000..5a4a280f2 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_queue_size/lrc_queue_size.cu @@ -0,0 +1,38 @@ +// LRC Queue Size Microbenchmark — POSTPONED +// +// TODO: Implement benchmark to discover LRC queue depth (max_entries) per +// L2 sub-partition on NVIDIA GPUs. +// +// Challenges that need to be resolved before implementation: +// +// 1. Address-to-sub-partition mapping: +// To measure the queue depth of a specific L2 sub-partition, we need to +// generate unique sector addresses that all map to the SAME sub-partition. +// This requires knowledge of the H100's L2 address interleaving scheme. +// A simple linear mapping (addr / sector_size % num_sub_partitions) is +// likely incorrect — NVIDIA may use XOR-based hashing or other schemes. +// +// 2. Measurement approach: +// Once the address mapping is known, launch multiple warps that each read +// from a unique sector address (all mapping to the same sub-partition). +// Sweep the number of unique sectors and use ncu to observe: +// - Pre-LRC: l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum +// - Post-LRC: lts__t_sectors_srcunit_tex_op_read.sum +// When the number of unique sectors exceeds the LRC queue depth, +// the LRC can no longer track all entries simultaneously, causing +// evictions and additional L2 lookups. +// +// 3. Alternative approach: +// Reverse-engineer the address interleaving first using a separate +// microbenchmark (e.g., by measuring latency variation across different +// address strides to identify the interleaving pattern). 
+// +// Accel-Sim config parameter: -gpgpu_lrc_max_entries + +#include + +int main() { + printf("lrc_queue_size: NOT YET IMPLEMENTED\n"); + printf("See source file for TODO details.\n"); + return 0; +} From 650220a2032d840f91bf528710c37a83b1a5620b Mon Sep 17 00:00:00 2001 From: Weili An Date: Sun, 22 Feb 2026 12:53:12 -0500 Subject: [PATCH 2/3] Update LRC max merged benchmark --- .gitignore | 10 +- .../ubench/lrc/lrc_max_merged/Makefile | 9 +- .../lrc/lrc_max_merged/lrc_max_merged.cu | 370 ++++++++++++------ .../lrc/lrc_max_merged/run_lrc_merged.sh | 10 + .../lrc_max_merged/sweep_lrc_max_merged.sh | 26 -- 5 files changed, 276 insertions(+), 149 deletions(-) create mode 100644 src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/run_lrc_merged.sh delete mode 100644 src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/sweep_lrc_max_merged.sh diff --git a/.gitignore b/.gitignore index b729af5d6..83605321e 100644 --- a/.gitignore +++ b/.gitignore @@ -24,4 +24,12 @@ tmp/ src/cuda/GPU_Microbenchmark/ubench/**/* !src/cuda/GPU_Microbenchmark/ubench/**/*/ !src/cuda/GPU_Microbenchmark/ubench/**/*.* -!src/cuda/GPU_Microbenchmark/ubench/**/Makefile \ No newline at end of file +!src/cuda/GPU_Microbenchmark/ubench/**/Makefile + +# IDE/AI tool related files +.vscode/ +.cursor/ +.cursorrules +.claude/ +CLAUDE.md +.clangd \ No newline at end of file diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/Makefile b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/Makefile index f877e294b..b3492f1d0 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/Makefile +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/Makefile @@ -2,6 +2,13 @@ SRC = lrc_max_merged.cu EXE = lrc_max_merged -NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt +# NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt +# LRC is supported on SM_90 and above +ARCH?=sm_90a sm_100a sm_101 sm_120 +# Unset the CUDA_CPPFLAGS which is set based on CUDA version +# but LRC is only supported on 
SM_90 and above +CUDA_CPPFLAGS= +# Generate code for both sm_XXX and compute_XXX (SASS and PTX) +NVCC_FLAGS := $(foreach arch,$(ARCH),-gencode=arch=compute_$(subst sm_,,$(arch)),code=$(arch) -gencode=arch=compute_$(subst sm_,,$(arch)),code=compute_$(subst sm_,,$(arch))) -std=c++17 -O0 include ../../../common/common.mk diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/lrc_max_merged.cu b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/lrc_max_merged.cu index 41cec5396..c091ed2ae 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/lrc_max_merged.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/lrc_max_merged.cu @@ -1,154 +1,282 @@ -// LRC Max Merged Microbenchmark (NCU-based) -// -// Discovers the maximum number of read requests that can be coalesced -// into a single LRC (L2 Request Coalescer) entry on NVIDIA GPUs. -// -// Principle: Launch N blocks (1 warp each, 1 per SM) that all pointer-chase -// through the same sequence of L2 sectors (bypassing L1 with ld.global.cg). -// Blocks naturally stay roughly in lockstep since they all do identical work. -// -// Measurement: Use ncu hardware counters to compare SM-side sector requests -// (pre-LRC) with L2-side sector reads (post-LRC). The compression ratio -// reveals max_merged. 
-// -// Usage: -// ./lrc_max_merged [N] [THREADS_PER_BLOCK] [ITERS] [SYNC_INTERVAL] -// - N: Number of blocks (default: all SMs) -// - THREADS_PER_BLOCK: Number of threads per block (default: 256) -// - ITERS: Number of iterations (default: 4096) -// - SYNC_INTERVAL: Number of iterations between grid-wide sync (default: ITERS / 8) -// -// NCU profiling: -// ncu --metrics lrc__lts2lrc_sectors_op_read.sum.sum,lrc__xbar2gpc_sectors_op_read.sum.sum \ -// ./lrc_max_merged -// -// Compile: nvcc -Xptxas -dlcm=cg lrc_max_merged.cu -o lrc_max_merged +// LRC Max Merged Microbenchmark +// Use mbarrier and threadblock cluster to ensure +// best synchronization among warps sending request +// to the same L2 sector #include -#include -#include +#include +#include #include #include #include +#include "../../../hw_def/common/gpuConfig.h" +#include "../../../hw_def/hw_def.h" +#include +#include +#include namespace cg = cooperative_groups; -#include "../../../hw_def/hw_def.h" +/** + * @brief Synchronize within a cluster using mbarrier + * + * @return + */ +__device__ __forceinline__ void sync_cluster(void) { + /** + * Synchronization + */ + // Initialize mbarrier + #pragma nv_diag_suppress static_var_with_dynamic_init + __shared__ uint64_t bar; + // Barrier handles + uint64_t *clusterleader_bar_handle; + uint64_t *local_bar_handle; + + // Setup remote barrier address + cg::cluster_group cluster = cg::this_cluster(); + unsigned int clusterBlockRank = cluster.block_rank(); + unsigned int clusterSize = cluster.dim_blocks().x; + + // Check if this threadblock is the cluster leader + bool isClusterLeader = clusterBlockRank == 0; + // Check if this thread is the thread leader + bool isThreadLeader = threadIdx.x == 0; + + // Only cluster leader should initialize the barrier + // other threadblocks should use remote barrier address + if (isClusterLeader) { + // Get total number of threads in the cluster + unsigned int totalThreads = cluster.num_threads(); + + // Only one thread should 
initialize the barrier + if (isThreadLeader) { + cuda::ptx::mbarrier_init(&bar, totalThreads); + } + __syncthreads(); + clusterleader_bar_handle = &bar; + local_bar_handle = &bar; + } else { + // Other threadblocks should init local barrier + // to wait for clusterleader's signal that all threadblocks + // have arrived + if (isThreadLeader) { + cuda::ptx::mbarrier_init(&bar, 1); + } + __syncthreads(); + clusterleader_bar_handle = cluster.map_shared_rank(&bar, 0); + local_bar_handle = &bar; + } + + // Here is the synchronization part + // 1. All threads in the cluster should arrive at the clusterleader's barrier + // 2. Since mbarrier only supports waiting on local barrier: + // 1. ClusterLeader will wait on its barrier that get released when all threadblocks have arrived + // 2. Other threadblocks will wait on their local barrier that get released when clusterleader release its barrier + // 3. Once clusterleader's barrier is released, all other threadblocks should launch the load to same sector address + + // First all threads arrive at the clusterleader's barrier + cuda::ptx::mbarrier_arrive(cuda::ptx::sem_release, cuda::ptx::scope_cluster, cuda::ptx::space_cluster, clusterleader_bar_handle, 1); + + // Then based on the role, we will either wait on the barrier or release the barrier + if (isClusterLeader) { + // ClusterLeader will wait on its barrier that get released when all threadblocks have arrived + while (!cuda::ptx::mbarrier_test_wait_parity(cuda::ptx::sem_acquire, cuda::ptx::scope_cluster, clusterleader_bar_handle, 0)) {} + + // Now ClusterLeader should release other threadblocks to proceed, we will execute this in a single warp + // for each threadblock, we will release its barrier + if (threadIdx.x < clusterSize && threadIdx.x != clusterBlockRank) { + uint64_t *remote_bar_handle = cluster.map_shared_rank(&bar, threadIdx.x); + cuda::ptx::mbarrier_arrive(cuda::ptx::sem_release, cuda::ptx::scope_cluster, cuda::ptx::space_cluster, remote_bar_handle, 1); + } 
+ } else { + // Other threadblocks will wait on their local barrier that gets released when the cluster leader releases its barrier + while (!cuda::ptx::mbarrier_test_wait_parity(cuda::ptx::sem_acquire, cuda::ptx::scope_cluster, local_bar_handle, 0)) {} + } + __syncthreads(); +} + +/** + * @brief Kernel to test LRC maximum merged count per entry + * + * @param data Same sector data for all warps to access for LRC merge + * @param dsink Global sink to avoid optimization + * @return __global__ + */ +__global__ void lrc_max_merged_kernel(uint64_t *data, uint64_t *dsink, unsigned active_threads_per_cluster) { + // The LRC max merged count should be reflected from NCU metrics -#define ARRAY_SIZE 4096 // uint64_t elements -> 32KB total -#define SECTOR_STRIDE 4 // 4 * sizeof(uint64_t) = 32 bytes = 1 sector -#define COMPUTE_ITERS 64 // Number of multiply-adds to insert after each load + // Shmem buffer + __shared__ volatile uint64_t smem_buffer[16]; -__global__ void lrc_max_merged_kernel(uint64_t *data, uint64_t *dsink, uint32_t ITERS, uint32_t SYNC_INTERVAL) { - // All threads in all blocks start at the same pointer (sector 0) - uint64_t ptr = (uint64_t)data; + // Get thread index within a cluster + cg::cluster_group cluster = cg::this_cluster(); + unsigned thread_within_cluster = cluster.thread_rank(); - // Block-level sync before starting - asm volatile("bar.sync 0;"); + // Synchronize within a cluster + // If threadblocks are within the same GPC, they will be relatively in sync + // anyway + // sync_cluster(); + + /** + * LRC max merged test + */ + // Now threadblocks within a cluster should be relatively in sync for testing LRC coalescing + // Load the same sector data into shared memory + // Use ld.global.cg to bypass L1 cache MSHR + // Send down one sector per warp; make this explicit even though in + // hardware, even without this "if", the loads would still be coalesced into 1 sector + if (thread_within_cluster < active_threads_per_cluster) { + // Only one sector access per warp + if 
(threadIdx.x % 32 == 0) { + uint64_t data_value; + asm volatile("{\t\n" + "ld.global.cg.u64 %0, [%1];\n\t" + "}" + : "=l"(data_value) + : "l"(&data[0])); + smem_buffer[0] = data_value; + } + } + __syncthreads(); + + // Write to global sink to prevent optimization + dsink[0] = smem_buffer[0]; +} + +__global__ void lrc_max_merged_kernel_cooperative(uint64_t *data, uint64_t *dsink) { + __shared__ volatile uint64_t smem_buffer[16]; // Global synchronization for all blocks in the grid cg::grid_group grid = cg::this_grid(); grid.sync(); - // Pointer chase across sectors: - // - ld.global.cg bypasses L1, caches in L2 -> goes through LRC - // - Data dependency (ptr = *ptr) prevents compiler optimization - // - Each iteration: all blocks read same sector (same ptr value) - // since they started at the same address and follow the same chain - // - ptr advances to next sector after each load - // - // After each load we insert a chain of dependent ALU ops so that - // the loop body takes long enough for all blocks to converge on the - // same sector before any block advances to the next one. Without - // this padding the loop is too tight and blocks slip out of the - // coalescing window. - uint64_t sink = 0; - for (uint32_t i = 0; i < ITERS; i++) { - asm volatile("ld.global.cg.u64 %0, [%0];" : "+l"(ptr)::"memory"); - - // Dependent ALU padding: a chain of multiply-adds that the - // compiler cannot remove (volatile asm, data dependency on ptr). - // Each op depends on the previous result, serialising them. 
- uint64_t tmp = ptr; - #pragma unroll - for (uint32_t j = 0; j < COMPUTE_ITERS; j++) { - asm volatile("mad.lo.u64 %0, %0, %1, %2;" - : "+l"(tmp) : "l"((uint64_t)5), "l"((uint64_t)3) : ); - } - sink += tmp; // prevent dead-code elimination of the chain - - asm volatile("bar.sync 0;"); - // Periodic synchronization for grid-wide sync - if (SYNC_INTERVAL > 0 && (i + 1) % SYNC_INTERVAL == 0) - grid.sync(); + if (threadIdx.x % 32 == 0) { + uint64_t data_value; + asm volatile("{\t\n" + "ld.global.cg.u64 %0, [%1];\n\t" + "}" + : "=l"(data_value) + : "l"(&data[0])); + smem_buffer[0] = data_value; } - - asm volatile("bar.sync 0;"); + __syncthreads(); - // Prevent dead code elimination - dsink[blockIdx.x * blockDim.x + threadIdx.x] = ptr + sink; - asm volatile("bar.sync 0;"); + // Write to global sink to prevent optimization + dsink[0] = smem_buffer[0]; } int main(int argc, char *argv[]) { initializeDeviceProp(0, argc, argv); + enum KernelLaunchMode { + NORMAL = 0, + CLUSTER = 1, + COOPERATIVE = 2, + }; + const char *launch_mode_str[] = { + "NORMAL", + "CLUSTER", + "COOPERATIVE", + }; + + // Size of threadblock cluster + const unsigned MAX_CLUSTER_SIZE = 16; + unsigned N = MAX_CLUSTER_SIZE; + unsigned cluster_size = MAX_CLUSTER_SIZE; + unsigned threads_per_block = 256; + bool provide_active_threads_per_cluster = false; + unsigned active_threads_per_cluster = cluster_size * threads_per_block; + KernelLaunchMode launch_mode = NORMAL; + const char *optstring = "N:C:T:A:m:"; + // CLI parsing + int opt; + while ((opt = getopt(argc, argv, optstring)) != -1) { + switch (opt) { + case 'N': + N = atoi(optarg); + break; + case 'C': + cluster_size = atoi(optarg); + assert(cluster_size <= MAX_CLUSTER_SIZE && "cluster_size is out of range, must be less than 16 on Hopper"); + break; + case 'T': + threads_per_block = atoi(optarg); + break; + case 'A': + active_threads_per_cluster = atoi(optarg); + provide_active_threads_per_cluster = true; + break; + case 'm': + launch_mode = 
static_cast<KernelLaunchMode>(atoi(optarg)); + assert(launch_mode == NORMAL || launch_mode == CLUSTER || launch_mode == COOPERATIVE && "launch_mode must be 0 (NORMAL), 1 (CLUSTER), or 2 (COOPERATIVE)"); + break; + default: + printf("Usage: %s -N <num_blocks> -C <cluster_size> -T <threads_per_block> -A <active_threads_per_cluster> -m <launch_mode>\n", argv[0]); + return 1; + } + } + + if (!provide_active_threads_per_cluster) { + // Default to all threads in a cluster + active_threads_per_cluster = cluster_size * threads_per_block; + } + + if (launch_mode == CLUSTER || launch_mode == NORMAL) { + assert(active_threads_per_cluster <= cluster_size * threads_per_block && "active_threads_per_cluster is out of range, must be less than cluster_size * threads_per_block"); + } - unsigned sm_count = config.SM_NUMBER; - - // Number of blocks = CLI arg or all SMs - unsigned N = sm_count; - unsigned threads_per_block = config.THREADS_PER_BLOCK; - uint32_t ITERS = 4096; - uint32_t SYNC_INTERVAL = ITERS / 8; - if (argc > 1) { - N = (unsigned)atoi(argv[1]); - threads_per_block = (unsigned)atoi(argv[2]); - ITERS = (uint32_t)atoi(argv[3]); - SYNC_INTERVAL = (uint32_t)atoi(argv[4]); + // Initialize the data and global sink with single value + uint64_t *data_g; + gpuErrchk(cudaMalloc(&data_g, sizeof(uint64_t))); + gpuErrchk(cudaMemset(data_g, 0, sizeof(uint64_t))); + uint64_t *dsink_g; + gpuErrchk(cudaMalloc(&dsink_g, sizeof(uint64_t))); + gpuErrchk(cudaMemset(dsink_g, 0, sizeof(uint64_t))); + + printf("=== LRC Max Merged ===\n"); + unsigned num_cluster = launch_mode == CLUSTER ? 
N / cluster_size : N; + unsigned max_concurrent_access = active_threads_per_cluster / 32 * num_cluster; + printf("N_BLOCKS=%u, THREADS_PER_BLOCK=%u, MAX_CONCURRENT_ACCESS=%u, LAUNCH_MODE=%s\n", N, threads_per_block, max_concurrent_access, launch_mode_str[launch_mode]); + if (launch_mode == CLUSTER) { + printf("CLUSTER_SIZE=%u\n", cluster_size); } + printf("Profile with ncu to measure LRC max merged count.\n"); + + if (launch_mode == CLUSTER) { + printf("Launching with threadblock cluster to schedule threadblocks in the same GPC...\n"); + // Launch configurable cluster shape + // Enable upto 16 cluster size + gpuErrchk(cudaFuncSetAttribute(lrc_max_merged_kernel, cudaFuncAttributeNonPortableClusterSizeAllowed, 1)); + // Kernel launch configuration + cudaLaunchConfig_t config = {0}; + config.gridDim = dim3{N}; // N blocks + config.blockDim = dim3{threads_per_block}; // threads_per_block threads per block + cudaLaunchAttribute attribute[1]; + attribute[0].id = cudaLaunchAttributeClusterDimension; + attribute[0].val.clusterDim.x = cluster_size; // Cluster size is also Nx1x1 + attribute[0].val.clusterDim.y = 1; + attribute[0].val.clusterDim.z = 1; + config.attrs = attribute; + config.numAttrs = 1; - // Pointer chain array must fit in L2 - size_t array_bytes = ARRAY_SIZE * sizeof(uint64_t); - assert(array_bytes < config.L2_SIZE); - - unsigned num_sectors = ARRAY_SIZE / SECTOR_STRIDE; - - printf("=== LRC Max Merged (NCU-based) ===\n"); - printf("SM_COUNT=%u, N_BLOCKS=%u, ITERS=%d, NUM_SECTORS=%u\n", sm_count, N, - ITERS, num_sectors); - printf("Profile with ncu to measure L2 sector compression.\n"); - - // Allocate device memory - uint64_t *posArray_g, *dsink_g; - gpuErrchk(cudaMalloc(&posArray_g, array_bytes)); - gpuErrchk(cudaMalloc(&dsink_g, N * threads_per_block * sizeof(uint64_t))); - - // Initialize pointer chain on host using device pointer arithmetic - // Chain: sector 0 -> sector 1 -> ... 
-> sector N-1 -> sector 0 - uint64_t *init = (uint64_t *)malloc(array_bytes); - memset(init, 0, array_bytes); - for (unsigned s = 0; s < num_sectors - 1; s++) - init[s * SECTOR_STRIDE] = - (uint64_t)(posArray_g + (s + 1) * SECTOR_STRIDE); - init[(num_sectors - 1) * SECTOR_STRIDE] = - (uint64_t)(posArray_g); // cycle back - - gpuErrchk( - cudaMemcpy(posArray_g, init, array_bytes, cudaMemcpyHostToDevice)); - free(init); - - // Launch: N blocks, threads per block to enforce 2 block/SM - // Cooperative launch required for grid-wide sync - void *kernelArgs[] = {(void *)&posArray_g, (void *)&dsink_g, (void *)&ITERS, (void *)&SYNC_INTERVAL}; - gpuErrchk(cudaLaunchCooperativeKernel( - (void *)lrc_max_merged_kernel, N, threads_per_block, kernelArgs)); + cudaLaunchKernelEx(&config, lrc_max_merged_kernel, data_g, dsink_g, active_threads_per_cluster); + } else if (launch_mode == NORMAL) { + printf("Launching threadblocks normally...\n"); + lrc_max_merged_kernel<<>>(data_g, dsink_g, active_threads_per_cluster); + } else if (launch_mode == COOPERATIVE) { + printf("Launching with cooperative kernel...\n"); + void *kernelArgs[] = {(void *)&data_g, (void *)&dsink_g}; + gpuErrchk(cudaLaunchCooperativeKernel( + (void *)lrc_max_merged_kernel_cooperative, N, threads_per_block, kernelArgs)); + } + gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaDeviceSynchronize()); - printf("Kernel completed. Use ncu to analyze L2 sector counts.\n"); + printf("Kernel completed. 
Use ncu to analyze LRC max merged count.\n"); // Cleanup - cudaFree(posArray_g); + cudaFree(data_g); cudaFree(dsink_g); return 0; diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/run_lrc_merged.sh b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/run_lrc_merged.sh new file mode 100644 index 000000000..b6507e225 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/run_lrc_merged.sh @@ -0,0 +1,10 @@ +#!/bin/bash +export CUDA_VISIBLE_DEVICES=7 + +# Measure maximum merge count per LRC entry +# Run with cluster launch with 8 threadblocks and 8 threadblocks per cluster +# Should see 8*128/32 = 32 sectors request issued from SM +# and 8 sectors request issued from L2 cache +TMPDIR=./ ncu \ + --metrics l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,lts__t_sectors_srcunit_tex_op_read.sum,lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum,lts__t_sectors_srcunit_tex_op_read_lookup_miss.sum \ + --replay-mode application ./lrc_max_merged -N 8 -C 8 -T 128 -m 1 diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/sweep_lrc_max_merged.sh b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/sweep_lrc_max_merged.sh deleted file mode 100644 index 37f263c16..000000000 --- a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/sweep_lrc_max_merged.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -# A script to sweep the number of concurrent accesses to the L2 cache to test for the LRC max merged - -export CUDA_VISIBLE_DEVICES=7 -for tb_size in 32 64 128; do - echo "--------------------------------" - echo "Thread block size: $tb_size" - echo "NUM_CONCURRENT_ACCESS,NUM_BLOCKS,lts2lrc_sectors,xbar2gpc_sectors,LRC ratio(xbar2gpc/lts2lrc)" - for nb in 1 4 8 16 32 48 64 80 96 112 128; do - num_concurrent_access=$((nb * tb_size / 32)) - output=$(TMPDIR=./ ncu --metrics 
lrc__xbar2gpc_sectors_op_read.sum,lrc__lts2lrc_sectors_op_read.sum,lrc__average_xbar2gpc_sectors_op_read.ratio,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum ./lrc_max_merged "$nb" 128 2048 0 2>&1) - - lts2lrc=$(echo "$output" | grep 'lrc__lts2lrc_sectors_op_read.sum' | awk '{print $NF}' | tr -d ',') - xbar2gpc=$(echo "$output" | grep 'lrc__xbar2gpc_sectors_op_read.sum' | awk '{print $NF}' | tr -d ',') - - if [[ -n "$lts2lrc" && -n "$xbar2gpc" && "$xbar2gpc" != "0" ]]; then - ratio=$(awk "BEGIN {printf \"%.4f\", $xbar2gpc / $lts2lrc}") - else - ratio="N/A" - fi - - echo "${num_concurrent_access},${nb},${lts2lrc},${xbar2gpc},${ratio}" - done - echo "--------------------------------" - echo "" -done \ No newline at end of file From 00858e5e01ae703499ba5c87f6a6f4af78220fcf Mon Sep 17 00:00:00 2001 From: Weili An Date: Tue, 3 Mar 2026 15:02:58 -0500 Subject: [PATCH 3/3] Add ubench to test LRC merge size --- .../ubench/lrc/lrc_merge_size/Makefile | 14 +++ .../lrc/lrc_merge_size/lrc_merge_size.cu | 93 +++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_merge_size/Makefile create mode 100644 src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_merge_size/lrc_merge_size.cu diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_merge_size/Makefile b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_merge_size/Makefile new file mode 100644 index 000000000..9c9d0cae3 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_merge_size/Makefile @@ -0,0 +1,14 @@ +SRC = lrc_merge_size.cu + +EXE = lrc_merge_size + +# NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt +# LRC is supported on SM_90 and above +ARCH?=sm_90a sm_100a sm_101 sm_120 +# Unset the CUDA_CPPFLAGS which is set based on CUDA version +# but LRC is only supported on SM_90 and above +CUDA_CPPFLAGS= +# Generate code for both sm_XXX and compute_XXX (SASS and PTX) +NVCC_FLAGS := $(foreach arch,$(ARCH),-gencode=arch=compute_$(subst sm_,,$(arch)),code=$(arch) 
-gencode=arch=compute_$(subst sm_,,$(arch)),code=compute_$(subst sm_,,$(arch))) -std=c++17 -O0 + +include ../../../common/common.mk diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_merge_size/lrc_merge_size.cu b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_merge_size/lrc_merge_size.cu new file mode 100644 index 000000000..eb0df06bc --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_merge_size/lrc_merge_size.cu @@ -0,0 +1,93 @@ +// LRC Merge Size Microbenchmark +// Use mbarrier and threadblock cluster to ensure +// best synchronization among warps sending request +// to the same L2 sector + +#include +#include +#include +#include +#include +#include +#include "../../../hw_def/common/gpuConfig.h" +#include "../../../hw_def/hw_def.h" + +#include +#include +#include +namespace cg = cooperative_groups; + +/** + * @brief Kernel to test whether the LRC merge granularity is a sector or a cacheline + * + * @param data Same sector data for all warps to access for LRC merge + * @param dsink Global sink to avoid optimization + * @return __global__ + */ +__global__ void __cluster_dims__(4, 1, 1) lrc_merge_size_kernel(uint8_t *data, uint8_t *dsink) { + // The LRC merge size should be reflected from NCU metrics + + // Shmem buffer + __shared__ volatile uint8_t smem_buffer[16]; + + // Get thread index within a cluster + cg::cluster_group cluster = cg::this_cluster(); + unsigned block_rank = cluster.block_rank(); + + + /** + * LRC merge size test + */ + // Based on the block rank in a cluster, each block has one warp to access data + i * 32 + // with i being the block rank + // Only one sector access per warp + if (threadIdx.x % 32 == 0) { + uint64_t data_value; + // As the sector size is 32B, we access at a 32B stride + uint8_t *ptr = data + block_rank * 32; + asm volatile("{\t\n" + "ld.global.cg.u64 %0, [%1];\n\t" + "}" + : "=l"(data_value) + : "l"(ptr)); + smem_buffer[0] = (uint8_t)data_value; + } + __syncthreads(); + + // Write to global sink to prevent optimization + dsink[0] = 
smem_buffer[0]; +} + +int main(int argc, char *argv[]) { + initializeDeviceProp(0, argc, argv); + + // Initialize the data and global sink with single value + uint8_t *data_g; + const unsigned SECTOR_SIZE = 32; + const unsigned CACHELINE_SIZE = 128; + + // cudaMalloc is aligned to 256B, so this array is in a single cacheline + gpuErrchk(cudaMalloc(&data_g, sizeof(uint8_t) * CACHELINE_SIZE)); + gpuErrchk(cudaMemset(data_g, 0, sizeof(uint8_t) * CACHELINE_SIZE)); + uint8_t *dsink_g; + gpuErrchk(cudaMalloc(&dsink_g, sizeof(uint8_t))); + gpuErrchk(cudaMemset(dsink_g, 0, sizeof(uint8_t))); + + printf("=== LRC Merge Size ===\n"); + printf("Profile with ncu to measure LRC merge size.\n"); + + printf("Launching threadblocks normally...\n"); + // 2 clusters, 8 threadblocks, each TB has 1 warp, each warp has 32 threads + lrc_merge_size_kernel<<<8, 32>>>(data_g, dsink_g); + + gpuErrchk(cudaPeekAtLastError()); + gpuErrchk(cudaDeviceSynchronize()); + + printf("Kernel completed. Use ncu to analyze LRC merge size.\n"); + + // Cleanup + cudaFree(data_g); + cudaFree(dsink_g); + + return 0; +}