diff --git a/.gitignore b/.gitignore index b729af5d6..83605321e 100644 --- a/.gitignore +++ b/.gitignore @@ -24,4 +24,12 @@ tmp/ src/cuda/GPU_Microbenchmark/ubench/**/* !src/cuda/GPU_Microbenchmark/ubench/**/*/ !src/cuda/GPU_Microbenchmark/ubench/**/*.* -!src/cuda/GPU_Microbenchmark/ubench/**/Makefile \ No newline at end of file +!src/cuda/GPU_Microbenchmark/ubench/**/Makefile + +# IDE/AI tool related files +.vscode/ +.cursor/ +.cursorrules +.claude/ +CLAUDE.md +.clangd \ No newline at end of file diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/Makefile b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/Makefile new file mode 100644 index 000000000..b3492f1d0 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/Makefile @@ -0,0 +1,14 @@ +SRC = lrc_max_merged.cu + +EXE = lrc_max_merged + +# NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt +# LRC is supported on SM_90 and above +ARCH?=sm_90a sm_100a sm_101 sm_120 +# Unset the CUDA_CPPFLAGS which is set based on CUDA version +# but LRC is only supported on SM_90 and above +CUDA_CPPFLAGS= +# Generate code for both sm_XXX and compute_XXX (SASS and PTX) +NVCC_FLAGS := $(foreach arch,$(ARCH),-gencode=arch=compute_$(subst sm_,,$(arch)),code=$(arch) -gencode=arch=compute_$(subst sm_,,$(arch)),code=compute_$(subst sm_,,$(arch))) -std=c++17 -O0 + +include ../../../common/common.mk diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/lrc_max_merged.cu b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/lrc_max_merged.cu new file mode 100644 index 000000000..c091ed2ae --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/lrc_max_merged.cu @@ -0,0 +1,283 @@ +// LRC Max Merged Microbenchmark +// Use mbarrier and threadblock cluster to ensure +// best synchronization among warps sending request +// to the same L2 sector + +#include +#include +#include +#include +#include +#include +#include "../../../hw_def/common/gpuConfig.h" +#include 
"../../../hw_def/hw_def.h" + +#include +#include +#include +namespace cg = cooperative_groups; + +/** + * @brief Synchronize within a cluster using mbarrier + * + * @return + */ +__device__ __forceinline__ void sync_cluster(void) { + /** + * Synchronization + */ + // Initialize mbarrier + #pragma nv_diag_suppress static_var_with_dynamic_init + __shared__ uint64_t bar; + // Barrier handles + uint64_t *clusterleader_bar_handle; + uint64_t *local_bar_handle; + + // Setup remote barrier address + cg::cluster_group cluster = cg::this_cluster(); + unsigned int clusterBlockRank = cluster.block_rank(); + unsigned int clusterSize = cluster.dim_blocks().x; + + // Check if this threadblock is the cluster leader + bool isClusterLeader = clusterBlockRank == 0; + // Check if this thread is the thread leader + bool isThreadLeader = threadIdx.x == 0; + + // Only cluster leader should initialize the barrier + // other threadblocks should use remote barrier address + if (isClusterLeader) { + // Get total number of threads in the cluster + unsigned int totalThreads = cluster.num_threads(); + + // Only one thread should initialize the barrier + if (isThreadLeader) { + cuda::ptx::mbarrier_init(&bar, totalThreads); + } + __syncthreads(); + clusterleader_bar_handle = &bar; + local_bar_handle = &bar; + } else { + // Other threadblocks should init local barrier + // to wait for clusterleader's signal that all threadblocks + // have arrived + if (isThreadLeader) { + cuda::ptx::mbarrier_init(&bar, 1); + } + __syncthreads(); + clusterleader_bar_handle = cluster.map_shared_rank(&bar, 0); + local_bar_handle = &bar; + } + + // Here is the synchronization part + // 1. All threads in the cluster should arrive at the clusterleader's barrier + // 2. Since mbarrier only supports waiting on local barrier: + // 1. ClusterLeader will wait on its barrier that get released when all threadblocks have arrived + // 2. 
Other threadblocks will wait on their local barrier that get released when clusterleader release its barrier + // 3. Once clusterleader's barrier is released, all other threadblocks should launch the load to same sector address + + // First all threads arrive at the clusterleader's barrier + cuda::ptx::mbarrier_arrive(cuda::ptx::sem_release, cuda::ptx::scope_cluster, cuda::ptx::space_cluster, clusterleader_bar_handle, 1); + + // Then based on the role, we will either wait on the barrier or release the barrier + if (isClusterLeader) { + // ClusterLeader will wait on its barrier that get released when all threadblocks have arrived + while (!cuda::ptx::mbarrier_test_wait_parity(cuda::ptx::sem_acquire, cuda::ptx::scope_cluster, clusterleader_bar_handle, 0)) {} + + // Now ClusterLeader should release other threadblocks to proceed, we will execute this in a single warp + // for each threadblock, we will release its barrier + if (threadIdx.x < clusterSize && threadIdx.x != clusterBlockRank) { + uint64_t *remote_bar_handle = cluster.map_shared_rank(&bar, threadIdx.x); + cuda::ptx::mbarrier_arrive(cuda::ptx::sem_release, cuda::ptx::scope_cluster, cuda::ptx::space_cluster, remote_bar_handle, 1); + } + } else { + // Other threadblocks will wait on their local barrier that get released when clusterleader release its barrier + while (!cuda::ptx::mbarrier_test_wait_parity(cuda::ptx::sem_acquire, cuda::ptx::scope_cluster, local_bar_handle, 0)) {} + } + __syncthreads(); +} + +/** + * @brief Kernel to test LRC maximum merged count per entry + * + * @param data Same sector data for all warps to access for LRC merge + * @param dsink Global sink to avoid optimization + * @return __global__ + */ +__global__ void lrc_max_merged_kernel(uint64_t *data, uint64_t *dsink, unsigned active_threads_per_cluster) { + // The LRC max merged count should be reflected from NCU metrics + + // Shmem buffer + __shared__ volatile uint64_t smem_buffer[16]; + + // Get thread index within a cluster + 
cg::cluster_group cluster = cg::this_cluster(); + unsigned thread_within_cluster = cluster.thread_rank(); + + // Synchronize within a cluster + // If threadblocks are within the same GPC, they will be relatively in sync + // anyway + // sync_cluster(); + + /** + * LRC max merged test + */ + // Now threadblocks within a cluster should be relatively in sync for testing LRC coalescing + // Load the same sector data into shared memory + // Use ld.global.cg to bypass L1 cache MSHR + // Send down one sector per warp, make this explicit despite in + // hardware with this "if", loads are still coalesced into 1 sector + if (thread_within_cluster < active_threads_per_cluster) { + // Only one sector access per warp + if (threadIdx.x % 32 == 0) { + uint64_t data_value; + asm volatile("{\t\n" + "ld.global.cg.u64 %0, [%1];\n\t" + "}" + : "=l"(data_value) + : "l"(&data[0])); + smem_buffer[0] = data_value; + } + } + __syncthreads(); + + // Write to global sink to prevent optimization + dsink[0] = smem_buffer[0]; +} + +__global__ void lrc_max_merged_kernel_cooperative(uint64_t *data, uint64_t *dsink) { + __shared__ volatile uint64_t smem_buffer[16]; + + // Global synchronization for all blocks in the grid + cg::grid_group grid = cg::this_grid(); + grid.sync(); + + if (threadIdx.x % 32 == 0) { + uint64_t data_value; + asm volatile("{\t\n" + "ld.global.cg.u64 %0, [%1];\n\t" + "}" + : "=l"(data_value) + : "l"(&data[0])); + smem_buffer[0] = data_value; + } + __syncthreads(); + + // Write to global sink to prevent optimization + dsink[0] = smem_buffer[0]; +} + +int main(int argc, char *argv[]) { + initializeDeviceProp(0, argc, argv); + enum KernelLaunchMode { + NORMAL = 0, + CLUSTER = 1, + COOPERATIVE = 2, + }; + const char *launch_mode_str[] = { + "NORMAL", + "CLUSTER", + "COOPERATIVE", + }; + + // Size of threadblock cluster + const unsigned MAX_CLUSTER_SIZE = 16; + unsigned N = MAX_CLUSTER_SIZE; + unsigned cluster_size = MAX_CLUSTER_SIZE; + unsigned threads_per_block = 256; + bool 
provide_active_threads_per_cluster = false; + unsigned active_threads_per_cluster = cluster_size * threads_per_block; + KernelLaunchMode launch_mode = NORMAL; + const char *optstring = "N:C:T:A:m:"; + // CLI parsing + int opt; + while ((opt = getopt(argc, argv, optstring)) != -1) { + switch (opt) { + case 'N': + N = atoi(optarg); + break; + case 'C': + cluster_size = atoi(optarg); + assert(cluster_size <= MAX_CLUSTER_SIZE && "cluster_size is out of range, must be less than 16 on Hopper"); + break; + case 'T': + threads_per_block = atoi(optarg); + break; + case 'A': + active_threads_per_cluster = atoi(optarg); + provide_active_threads_per_cluster = true; + break; + case 'm': + launch_mode = static_cast<KernelLaunchMode>(atoi(optarg)); + assert((launch_mode == NORMAL || launch_mode == CLUSTER || launch_mode == COOPERATIVE) && "launch_mode must be 0 (NORMAL), 1 (CLUSTER), or 2 (COOPERATIVE)"); + break; + default: + printf("Usage: %s -N <num_blocks> -C <cluster_size> -T <threads_per_block> -A <active_threads_per_cluster> -m <launch_mode>\n", argv[0]); + return 1; + } + } + + if (!provide_active_threads_per_cluster) { + // Default to all threads in a cluster + active_threads_per_cluster = cluster_size * threads_per_block; + } + + if (launch_mode == CLUSTER || launch_mode == NORMAL) { + assert(active_threads_per_cluster <= cluster_size * threads_per_block && "active_threads_per_cluster is out of range, must be less than cluster_size * threads_per_block"); + } + + // Initialize the data and global sink with single value + uint64_t *data_g; + gpuErrchk(cudaMalloc(&data_g, sizeof(uint64_t))); + gpuErrchk(cudaMemset(data_g, 0, sizeof(uint64_t))); + uint64_t *dsink_g; + gpuErrchk(cudaMalloc(&dsink_g, sizeof(uint64_t))); + gpuErrchk(cudaMemset(dsink_g, 0, sizeof(uint64_t))); + + printf("=== LRC Max Merged ===\n"); + unsigned num_cluster = launch_mode == CLUSTER ?
N / cluster_size : N; + unsigned max_concurrent_access = active_threads_per_cluster / 32 * num_cluster; + printf("N_BLOCKS=%u, THREADS_PER_BLOCK=%u, MAX_CONCURRENT_ACCESS=%u, LAUNCH_MODE=%s\n", N, threads_per_block, max_concurrent_access, launch_mode_str[launch_mode]); + if (launch_mode == CLUSTER) { + printf("CLUSTER_SIZE=%u\n", cluster_size); + } + printf("Profile with ncu to measure LRC max merged count.\n"); + + if (launch_mode == CLUSTER) { + printf("Launching with threadblock cluster to schedule threadblocks in the same GPC...\n"); + // Launch configurable cluster shape + // Enable upto 16 cluster size + gpuErrchk(cudaFuncSetAttribute(lrc_max_merged_kernel, cudaFuncAttributeNonPortableClusterSizeAllowed, 1)); + // Kernel launch configuration + cudaLaunchConfig_t config = {0}; + config.gridDim = dim3{N}; // N blocks + config.blockDim = dim3{threads_per_block}; // threads_per_block threads per block + cudaLaunchAttribute attribute[1]; + attribute[0].id = cudaLaunchAttributeClusterDimension; + attribute[0].val.clusterDim.x = cluster_size; // Cluster size is also Nx1x1 + attribute[0].val.clusterDim.y = 1; + attribute[0].val.clusterDim.z = 1; + config.attrs = attribute; + config.numAttrs = 1; + + cudaLaunchKernelEx(&config, lrc_max_merged_kernel, data_g, dsink_g, active_threads_per_cluster); + } else if (launch_mode == NORMAL) { + printf("Launching threadblocks normally...\n"); + lrc_max_merged_kernel<<<N, threads_per_block>>>(data_g, dsink_g, active_threads_per_cluster); + } else if (launch_mode == COOPERATIVE) { + printf("Launching with cooperative kernel...\n"); + void *kernelArgs[] = {(void *)&data_g, (void *)&dsink_g}; + gpuErrchk(cudaLaunchCooperativeKernel( + (void *)lrc_max_merged_kernel_cooperative, N, threads_per_block, kernelArgs)); + } + + gpuErrchk(cudaPeekAtLastError()); + gpuErrchk(cudaDeviceSynchronize()); + + printf("Kernel completed. 
Use ncu to analyze LRC max merged count.\n"); + + // Cleanup + cudaFree(data_g); + cudaFree(dsink_g); + + return 0; +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/run_lrc_merged.sh b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/run_lrc_merged.sh new file mode 100644 index 000000000..b6507e225 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_max_merged/run_lrc_merged.sh @@ -0,0 +1,10 @@ +#!/bin/bash +export CUDA_VISIBLE_DEVICES=7 + +# Measure maximum merge count per LRC entry +# Run with cluster launch with 8 threadblocks and 8 threadblocks per cluster +# Should see 8*128/32 = 32 sectors request issued from SM +# and 8 sectors request issued from L2 cache +TMPDIR=./ ncu \ + --metrics l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,lts__t_sectors_srcunit_tex_op_read.sum,lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum,lts__t_sectors_srcunit_tex_op_read_lookup_miss.sum \ + --replay-mode application ./lrc_max_merged -N 8 -C 8 -T 128 -m 1 diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_merge_size/Makefile b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_merge_size/Makefile new file mode 100644 index 000000000..9c9d0cae3 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_merge_size/Makefile @@ -0,0 +1,14 @@ +SRC = lrc_merge_size.cu + +EXE = lrc_merge_size + +# NVCC_FLGAS = -Xptxas -dlcm=cg -Xptxas -dscm=wt +# LRC is supported on SM_90 and above +ARCH?=sm_90a sm_100a sm_101 sm_120 +# Unset the CUDA_CPPFLAGS which is set based on CUDA version +# but LRC is only supported on SM_90 and above +CUDA_CPPFLAGS= +# Generate code for both sm_XXX and compute_XXX (SASS and PTX) +NVCC_FLAGS := $(foreach arch,$(ARCH),-gencode=arch=compute_$(subst sm_,,$(arch)),code=$(arch) -gencode=arch=compute_$(subst sm_,,$(arch)),code=compute_$(subst sm_,,$(arch))) -std=c++17 -O0 + +include ../../../common/common.mk diff --git 
a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_merge_size/lrc_merge_size.cu b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_merge_size/lrc_merge_size.cu new file mode 100644 index 000000000..eb0df06bc --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_merge_size/lrc_merge_size.cu @@ -0,0 +1,93 @@ +// LRC Merge Size Microbenchmark +// Use mbarrier and threadblock cluster to ensure +// best synchronization among warps sending request +// to the same L2 sector + +#include +#include +#include +#include +#include +#include +#include "../../../hw_def/common/gpuConfig.h" +#include "../../../hw_def/hw_def.h" + +#include +#include +#include +namespace cg = cooperative_groups; + +/** + * @brief Kernel to test LRC merge size being sector or cacheline + * + * @param data Same sector data for all warps to access for LRC merge + * @param dsink Global sink to avoid optimization + * @return __global__ + */ +__global__ void __cluster_dims__(4, 1, 1) lrc_merge_size_kernel(uint8_t *data, uint8_t *dsink) { + // The LRC merge size should be reflected from NCU metrics + + // Shmem buffer + __shared__ volatile uint8_t smem_buffer[16]; + + // Get thread index within a cluster + cg::cluster_group cluster = cg::this_cluster(); + unsigned block_rank = cluster.block_rank(); + + + /** + * LRC merge size test + */ + // Base on the block rank in a cluster, each block have one warp to access the data + i + // with i being the block rank + // Only one sector access per warp + if (threadIdx.x % 32 == 0) { + uint64_t data_value; + // As sector size is 32B, so we access at 32B stride + uint8_t *ptr = data + block_rank * 32; + asm volatile("{\t\n" + "ld.global.cg.u64 %0, [%1];\n\t" + "}" + : "=l"(data_value) + : "l"(ptr)); + smem_buffer[0] = (uint8_t)data_value; + } + __syncthreads(); + + // Write to global sink to prevent optimization + dsink[0] = smem_buffer[0]; +} + +int main(int argc, char *argv[]) { + initializeDeviceProp(0, argc, argv); + + // Initialize the data and global sink with 
single value + uint8_t *data_g; + const unsigned SECTOR_SIZE = 32; + const unsigned CACHELINE_SIZE = 128; + + // cudaMalloc is aligned to 256B, so this array is in a single cacheline + gpuErrchk(cudaMalloc(&data_g, sizeof(uint8_t) * CACHELINE_SIZE)); + gpuErrchk(cudaMemset(data_g, 0, sizeof(uint8_t) * CACHELINE_SIZE)); + uint8_t *dsink_g; + gpuErrchk(cudaMalloc(&dsink_g, sizeof(uint8_t))); + gpuErrchk(cudaMemset(dsink_g, 0, sizeof(uint8_t))); + + printf("=== LRC Merge Size ===\n"); + printf("Profile with ncu to measure LRC merge size.\n"); + + printf("Launching threadblocks normally...\n"); + // 2 clusters, 8 threadblocks, each TB has 1 warp, each warp has 32 threads + lrc_merge_size_kernel<<<8, 32>>>(data_g, dsink_g); + + gpuErrchk(cudaPeekAtLastError()); + gpuErrchk(cudaDeviceSynchronize()); + + printf("Kernel completed. Use ncu to analyze LRC merge size.\n"); + + // Cleanup + cudaFree(data_g); + cudaFree(dsink_g); + + return 0; +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_queue_size/Makefile b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_queue_size/Makefile new file mode 100644 index 000000000..da59aeb11 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_queue_size/Makefile @@ -0,0 +1,7 @@ +SRC = lrc_queue_size.cu + +EXE = lrc_queue_size + +NVCC_FLAGS = -Xptxas -dlcm=cg -Xptxas -dscm=wt + +include ../../../common/common.mk diff --git a/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_queue_size/lrc_queue_size.cu b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_queue_size/lrc_queue_size.cu new file mode 100644 index 000000000..5a4a280f2 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/lrc/lrc_queue_size/lrc_queue_size.cu @@ -0,0 +1,38 @@ +// LRC Queue Size Microbenchmark — POSTPONED +// +// TODO: Implement benchmark to discover LRC queue depth (max_entries) per +// L2 sub-partition on NVIDIA GPUs. +// +// Challenges that need to be resolved before implementation: +// +// 1. 
Address-to-sub-partition mapping: +// To measure the queue depth of a specific L2 sub-partition, we need to +// generate unique sector addresses that all map to the SAME sub-partition. +// This requires knowledge of the H100's L2 address interleaving scheme. +// A simple linear mapping (addr / sector_size % num_sub_partitions) is +// likely incorrect — NVIDIA may use XOR-based hashing or other schemes. +// +// 2. Measurement approach: +// Once the address mapping is known, launch multiple warps that each read +// from a unique sector address (all mapping to the same sub-partition). +// Sweep the number of unique sectors and use ncu to observe: +// - Pre-LRC: l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum +// - Post-LRC: lts__t_sectors_srcunit_tex_op_read.sum +// When the number of unique sectors exceeds the LRC queue depth, +// the LRC can no longer track all entries simultaneously, causing +// evictions and additional L2 lookups. +// +// 3. Alternative approach: +// Reverse-engineer the address interleaving first using a separate +// microbenchmark (e.g., by measuring latency variation across different +// address strides to identify the interleaving pattern). +// +// Accel-Sim config parameter: -gpgpu_lrc_max_entries + +#include <cstdio> + +int main() { + printf("lrc_queue_size: NOT YET IMPLEMENTED\n"); + printf("See source file for TODO details.\n"); + return 0; +}