diff --git a/README.md b/README.md index 0e38ddb..d68a89b 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,107 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Zixiao Wang + * [LinkedIn](https://www.linkedin.com/in/zixiao-wang-826a5a255/) +* Tested on: Windows 11, i7-14700K @ 3.40 GHz 32GB, GTX 4070TI 12GB 4K Monitor (Personal PC) -### (TODO: Your README) +### Background -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +* In this project, I implemented GPU stream compaction in CUDA, from scratch. This algorithm is widely used, and will be important for accelerating path tracer. I implemented several versions of the Scan (Prefix Sum) algorithm, a critical building block for stream compaction: + * CPU Implementation: Begin by coding a CPU version of the Scan algorithm. + * GPU Implementations: + * Naive Scan: Develop a straightforward GPU version that parallelizes the Scan operation. + * Work-Efficient Scan: Optimize GPU implementation to make better use of parallel resources, minimizing idle threads and redundant computations. + + * GPU Stream Compaction: Utilize Scan implementations to create a GPU-based stream compaction algorithm that efficiently removes zeros from an integer array. +* For more detail [GPU GEM](https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda). + +### Prefix-Sum(Scan) + +![](img/ScanSpeed.png) +The chart compares the execution time (in milliseconds) of different scan implementations (CPU, Naive, Work-Efficient, and Thrust) for various array sizes +* CPU Implementation shows a significant increase in running time as the array size increases. 
And it is the slowest among all implementations, particularly for larger array sizes.
+* The Naive scan algorithm, which is a GPU implementation, performs better than the CPU version, but its running time still increases noticeably as the array size grows. This implementation does not utilize memory optimizations, and the iterative approach of the naive scan leads to suboptimal performance, especially for larger arrays.
+* The Work-Efficient scan performs better than the Naive scan but is slightly slower than the Thrust implementation for larger array sizes. This approach is optimized for better memory access patterns and reduces the number of required operations.
+* The Thrust library implementation is the fastest across all array sizes. The execution time remains almost constant even as the array size increases, which indicates very efficient scaling and use of GPU resources.
+#### Summary
+Thrust is the clear winner for larger data sets, offering the best performance and scalability. For smaller arrays, the CPU implementation's overhead isn't as visible, but as the array size increases, the GPU's parallel nature shows a dramatic improvement in running time. The Work-Efficient scan addresses some of the Naive algorithm's computational inefficiencies by reducing the number of redundant calculations and focusing on better parallelization techniques. However, memory I/O remains a limiting factor because it relies on frequent reads and writes to global memory.
+![](img/NsightCompute.png)
+
+* Memory-bound issues: Low memory throughput across kernels may suggest suboptimal memory access patterns, possibly caused by excessive global-memory operations. Optimizations using shared memory could significantly improve performance.
+* Compute-bound issues: The low compute throughput in the kernels may indicate underutilization of GPU compute resources. This might be improved by increasing occupancy and reducing divergence between threads.
+### Extra Feature
+#### Optimized Work-Efficient Approach
+![](img/Upsweep.png)
+
+As the Up-Sweep phase picture shows, it is obvious that the number of threads needed at different depths is not fixed. Most threads become idle, as only a few elements need to be processed. For example, at level d, only N / 2^d threads are doing useful work out of the total threads launched. Adjust the number of threads and blocks launched at each level based on the workload as follows:
+```
+int step = 1 << (d + 1);
+int threads = new_n / step;
+dim3 fullBlocksPerGrid((threads + block_size - 1) / block_size);
+```
+However, once these values are no longer fixed, the index calculation in the kernel also needs to be modified as follows:
+```
+int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+int step = 1 << (d + 1);
+int index = k * step + (step - 1);
+```
+### Test Results
+Tested on size 1 << 22 and block size 128
+```bash
+****************
+** SCAN TESTS **
+****************
+ [ 21 19 39 37 37 32 3 36 8 22 22 5 49 ... 20 0 ]
+==== cpu scan, power-of-two ====
+ elapsed time: 5.5752ms (std::chrono Measured)
+ [ 0 21 40 79 116 153 185 188 224 232 254 276 281 ... 102730753 102730773 ]
+==== cpu scan, non-power-of-two ====
+ elapsed time: 5.5342ms (std::chrono Measured)
+ [ 0 21 40 79 116 153 185 188 224 232 254 276 281 ...
102730634 102730677 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.57712ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.496128ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.489792ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.549632ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 1.10934ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.57584ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 1 3 1 1 0 1 0 0 2 2 1 3 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 6.5848ms (std::chrono Measured) + [ 1 1 3 1 1 1 2 2 1 3 1 2 1 ... 2 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 6.5552ms (std::chrono Measured) + [ 1 1 3 1 1 1 2 2 1 3 1 2 1 ... 1 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 17.0166ms (std::chrono Measured) + [ 1 1 3 1 1 1 2 2 1 3 1 2 1 ... 
2 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.525504ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.521216ms (CUDA Measured) + passed +``` +### Modification to CmakeList +Add `radixsort.cu` and `radixsort.h` and add entry for them in Cmakelist diff --git a/img/NsightCompute.png b/img/NsightCompute.png new file mode 100644 index 0000000..11a8b73 Binary files /dev/null and b/img/NsightCompute.png differ diff --git a/img/ScanSpeed.png b/img/ScanSpeed.png new file mode 100644 index 0000000..b0fb0f5 Binary files /dev/null and b/img/ScanSpeed.png differ diff --git a/img/Upsweep.png b/img/Upsweep.png new file mode 100644 index 0000000..8e35648 Binary files /dev/null and b/img/Upsweep.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..a5f1c0c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1<< 22; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..b2acda0 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,12 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index < n) { + bools[index] = (idata[index] != 0) ? 
1 : 0; + } } /** @@ -33,6 +39,14 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index < n) { + if (bools[index] == 1) { + odata[indices[index]] = idata[index]; + } + } + } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..5b0117c 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -2,6 +2,7 @@ #include "cpu.h" #include "common.h" +#include namespace StreamCompaction { namespace CPU { @@ -20,6 +21,18 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + if (n == 0) return; + + + odata[0] = 0; + + + for (int i = 1; i <=n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + + + } + timer().endCpuTimer(); } @@ -31,8 +44,16 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int count = 0; + + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[count] = idata[i]; + count++; + } + } timer().endCpuTimer(); - return -1; + return count; } /** @@ -41,10 +62,43 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + int* flag = new int[n]; + int* scannedFlag = new int[n]; + + timer().startCpuTimer(); // TODO + for (int i = 0; i < n; i++) { + flag[i] = (idata[i] != 0)? 
1:0; + + } + + + + scannedFlag[0] = 0; + + + for (int i = 1; i < n; i++) { + scannedFlag[i] = scannedFlag[i - 1] + flag[i - 1]; + } + + + int count = 0; + + for (int i = 0; i < n; i++) { + if (flag[i] == 1) { + if (scannedFlag[i] < n) { + odata[scannedFlag[i]] = idata[i]; + } + + count++; + } + } + timer().endCpuTimer(); - return -1; + delete[] flag; + delete[] scannedFlag; + return count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..1824bca 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -11,15 +11,82 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } + int block_size = 128; + + __global__ void Upsweep_kernel(int n, int* data, int d) { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + int step = 1 << (d + 1); + int index = k * step + (step - 1); + + if (index < n) { + int offset = 1 << d; + data[index] += data[index - offset]; + + } + + } + __global__ void Downsweep_kernel(int n, int* data, int d) + { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + int step = 1 << (d + 1); + int index = k * step + (step - 1); + + if (index < n) { + int offset = 1 << d; + int t = data[index - offset]; + data[index - offset] = data[index]; + data[index] += t; + } + } + + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + int d = ilog2ceil(n); + int new_n = 1 << d; + + int* dev_data; + cudaMalloc((void**)&dev_data, new_n * sizeof(int)); + + cudaMemset(dev_data, 0, new_n * sizeof(int)); + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); // TODO + //Upsweep + for (int h = 0; h < ilog2ceil(new_n); h++) { + int step = 1 << (h + 1); + int threads = new_n / step; + dim3 fullBlocksPerGrid((threads + block_size - 1) / block_size); + Upsweep_kernel << > > (new_n, dev_data, h); + + } + + cudaMemset(&dev_data[new_n - 1], 0, sizeof(int)); + + for (int d = ilog2ceil(new_n) - 1; d >= 0; d--) { + int step = 1 << (d + 1); + int threads = new_n / step; + dim3 fullBlocksPerGrid((threads + block_size - 1) / block_size); + Downsweep_kernel << > > (new_n, dev_data, d); + + + } + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_data); + } + + /** * Performs stream compaction on idata, storing the result into odata. @@ -31,10 +98,57 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + + + int* dev_idata; + int* dev_odata; + int* dev_bools; + int* dev_indices; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + cudaMalloc((void**)&dev_indices, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + + + cudaMemcpy(dev_idata, idata, n*sizeof(int), cudaMemcpyHostToDevice); + dim3 fullBlocksPerGrid((n + block_size - 1) / block_size); + + + + + //timer().startGpuTimer(); // TODO - timer().endGpuTimer(); - return -1; + StreamCompaction::Common::kernMapToBoolean << > > (n, dev_bools, dev_idata); + cudaDeviceSynchronize(); + + scan(n, dev_indices, dev_bools); + cudaDeviceSynchronize(); + + StreamCompaction::Common::kernScatter << > > (n, dev_odata, dev_idata, dev_bools, dev_indices); + cudaDeviceSynchronize(); + + + + + //timer().endGpuTimer(); + + + + int last_bool; + int last_index; + cudaMemcpy(&last_bool, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&last_index, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + + + + cudaMemcpy(odata, dev_odata, (last_index + last_bool)* sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_bools); + cudaFree(dev_indices); + cudaFree(dev_odata); + return last_index + last_bool; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..d349714 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -12,14 +12,63 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void scan_kernel(int n, int* odata, const int* idata, int step) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + if (index >= step) { + odata[index] = idata[index] + idata[index - step]; + } + else { + odata[index] = idata[index]; + } + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + int* dev_buffer_A; + int* dev_buffer_B; + + cudaMalloc((void**)&dev_buffer_A, n * sizeof(int)); + cudaMalloc((void**)&dev_buffer_B, n * sizeof(int)); + + cudaMemcpy(dev_buffer_A, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + int block_size = 128; + dim3 fullBlocksPerGrid((n + block_size - 1) / block_size); + timer().startGpuTimer(); // TODO + for (int d = 1; d <= ilog2ceil(n); d++) { + int step = 1 << (d - 1); + + if ((d % 2) == 1) { + scan_kernel << > > (n, dev_buffer_B + , dev_buffer_A, step); + } + else { + scan_kernel << > > (n, dev_buffer_A + , dev_buffer_B, step); + } + + } timer().endGpuTimer(); + if (ilog2ceil(n) % 2 == 1) { + cudaMemcpy(odata, dev_buffer_B, n * sizeof(int), cudaMemcpyDeviceToHost); + } + else { + cudaMemcpy(odata, dev_buffer_A, n * sizeof(int), cudaMemcpyDeviceToHost); + } + for (int i = n - 1; i > 0; i--) { + odata[i] = odata[i - 1]; + } + odata[0] = 0; + + cudaFree(dev_buffer_A); + cudaFree(dev_buffer_B); + } } } diff --git a/stream_compaction/radixsort.cu b/stream_compaction/radixsort.cu new file mode 100644 index 0000000..ad50526 --- /dev/null +++ b/stream_compaction/radixsort.cu @@ -0,0 +1,106 @@ +#include +#include +#include "common.h" +#include "radixsort.h" +#include "efficient.h" + +namespace StreamCompaction { + namespace RadixSort { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + const int blockSize = 128; + + __global__ void kernComputeBits(int n, int bit, const int* idata, int* odata1, int* odata2) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) return; + + int mask = 1 << bit; + odata1[index] = (idata[index] & mask) ? 
1 : 0; + odata2[index] = ~odata1[index]; + } + + + + __global__ void kernComplementBits(int n, int total, int* idata, int* odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) return; + + odata[index] = index - idata[index] + total; + } + + __global__ void kernScatter(int n, int* b_array, int* t_array, int* f_array, int* odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) return; + + odata[index] = (b_array[index]) ? t_array[index] : f_array[index]; + } + + + void sort(int n, int* odata, const int* idata) { + + int* dev_idata; + int* dev_odata; + int* dev_b; + int* dev_e; + int* dev_f; + int* dev_t; + int* dev_d; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMalloc((void**)&dev_b, n * sizeof(int)); + cudaMalloc((void**)&dev_e, n * sizeof(int)); + cudaMalloc((void**)&dev_f, n * sizeof(int)); + cudaMalloc((void**)&dev_t, n * sizeof(int)); + cudaMalloc((void**)&dev_d, n * sizeof(int)); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + + for (int i = 0; i < sizeof(int); i++) { + dim3 fullBlocksPerGrid((n +blockSize - 1) / blockSize); + + kernComputeBits<<>>(n, i, dev_idata, dev_b, dev_e); + + StreamCompaction::Efficient::scan(n, dev_e, dev_f); + + int total1 = 0; + cudaMemcpy(&total1, &dev_e[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + int total2 = 0; + cudaMemcpy(&total2, &dev_f[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + + kernComplementBits<<>>(n, total1 + total2, dev_f, dev_t); + + + kernScatter<<>>(int n, dev_b, dev_t, dev_f, dev_d); + + StreamCompaction::Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_b, dev_d); + + + int* temp = dev_idata; + dev_idata = dev_odata; + dev_odata = temp; + } + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_b); + cudaFree(dev_e); + cudaFree(dev_f); + cudaFree(dev_t); + cudaFree(dev_d); + + + 
} + + + + + } +} \ No newline at end of file diff --git a/stream_compaction/radixsort.h b/stream_compaction/radixsort.h new file mode 100644 index 0000000..fe38607 --- /dev/null +++ b/stream_compaction/radixsort.h @@ -0,0 +1,12 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace RadixSort { + StreamCompaction::Common::PerformanceTimer& timer(); + + + void sort(int n, int* odata, const int* idata); + } +} \ No newline at end of file diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..df90fda 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,22 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::host_vectorh_vector_in(idata, idata + n); + thrust::host_vectorh_vector_out(odata, odata + n); + + thrust::device_vector d_vector_in = h_vector_in; + thrust::device_vector d_vector_out = h_vector_out; + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::exclusive_scan(d_vector_in.begin(), d_vector_in.end(), d_vector_out.begin()); + + timer().endGpuTimer(); + thrust::copy(d_vector_out.begin(), d_vector_out.end(), odata); } } }