diff --git a/README.md b/README.md index 0e38ddb..cf6f3e5 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,107 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Nadine Adnane + * [LinkedIn](https://www.linkedin.com/in/nadnane/) +* Tested on my personal laptop (ASUS ROG Zephyrus M16): +* **OS:** Windows 11 +* **Processor:** 12th Gen Intel(R) Core(TM) i9-12900H, 2500 Mhz, 14 Core(s), 20 Logical Processor(s) +* **GPU:** NVIDIA GeForce RTX 3070 Ti Laptop GPU -### (TODO: Your README) +Note: I used a late day on this assignment. I also need a few more hours to finish up my analysis. -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Introduction +In this project, I set out to implement a few different versions of the Scan (Prefix Sum) algorithm and GPU stream compaction in CUDA. I first implemented the algorithms on the CPU as a basis for comparison and to reinforce my understanding of the algorithm. Then, I implemented the "naive" and "work-efficient" versions of the algorithm on the GPU using CUDA. Finally, I utilized some of my earlier implementations to implement GPU stream compaction. Through this project, I analyze the trade-offs between CPU and GPU performance and explore the benefits of parallel programming. + +### Features + +1. **CPU Scan & Stream Compaction** + - **CPU Scan:** Implemented a straightforward exclusive prefix sum using a for loop for sequential processing. + - **Compaction Without Scan:** A basic method that filters out zero values without employing a scan operation. + - **Compaction With Scan:** An optimized approach that leverages the scan algorithm to enhance the efficiency of the compaction process. + +2. 
**Naive GPU Scan** + - Developed a naive GPU scan algorithm following the method outlined in *GPU Gems 3*, Section 39.2.1. This implementation utilizes global memory and alternates between input/output arrays through multiple kernel invocations. + +3. **Work-Efficient GPU Scan & Stream Compaction** + - **Work-Efficient Scan:** Implemented an optimized scan using a tree-based approach, as described in *GPU Gems 3*, Section 39.2.2, for better performance. + - **Stream Compaction with Scan:** Built upon the work-efficient scan by first mapping the input to a boolean array, scanning it, and then scattering the elements that satisfy the condition to achieve compaction. + - Efficiently handles arrays that are not sized to a power of two. + +4. **Thrust Library Integration** + - Integrated the Thrust library’s `exclusive_scan` function to perform stream compaction utilizing the GPU-accelerated primitives offered by Thrust. + +## Performance Analysis + +### Block Size Optimization + + +### Scan Implementation Comparison + + +### Stream Compaction Implementation Comparison + + + + +### Test Program Output + +The following test output was generated by running the Scan and Stream compaction algorithms on: +- an array of size 2^21 +- an array of size 2^21 - 3 + +with a block size of 128. + +```**************** +** SCAN TESTS ** +**************** + [ 47 4 13 41 40 12 10 35 21 19 24 19 8 ... 46 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 6.6257ms (std::chrono Measured) + [ 0 47 51 64 105 145 157 167 202 223 242 266 285 ... 51377593 51377639 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 6.5007ms (std::chrono Measured) + [ 0 47 51 64 105 145 157 167 202 223 242 266 285 ... 
51377490 51377518 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 1.49398ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 1.31354ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 1.48496ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 1.13254ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.857024ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.697344ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 1 3 2 1 2 1 2 3 1 0 3 1 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 9.0749ms (std::chrono Measured) + [ 2 1 3 2 1 2 1 2 3 1 3 1 3 ... 3 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 8.3302ms (std::chrono Measured) + [ 2 1 3 2 1 2 1 2 3 1 3 1 3 ... 3 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 12.1135ms (std::chrono Measured) + [ 2 1 3 2 1 2 1 2 3 1 3 1 3 ... 
3 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 1.50966ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 1.41312ms (CUDA Measured) + passed``` \ No newline at end of file diff --git a/images/graph1.png b/images/graph1.png new file mode 100644 index 0000000..f1f9204 Binary files /dev/null and b/images/graph1.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..268179f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 21; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..379cdb9 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,16 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + // DONE + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (idx >= n) + { + return; + } + + // Map non-zero values to 1, otherwise 0 + bools[idx] = idata[idx] != 0 ? 1 : 0; } /** @@ -32,7 +41,18 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + // DONE + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (idx >= n) + { + return; + } + + if (bools[idx]) + { + odata[indices[idx]] = idata[idx]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..e68c3d0 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -17,6 +17,7 @@ * Check for CUDA errors; print and exit if there was a problem. 
*/ void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1); +const int blockSize = 512; inline int ilog2(int x) { int lg = 0; diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..bffa308 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,14 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + // DONE + // Exclusive - Include the identity in the output + odata[0] = 0; + for(int k = 1; k < n; k++) + { + odata[k] = odata[k - 1] + idata[k - 1]; + } + timer().endCpuTimer(); } @@ -30,9 +37,21 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + // DONE + int j = 0; + for (int i = 0; i < n; ++i) + { + if (idata[i] != 0) + { + odata[j] = idata[i]; + ++j; + } + } + + timer().endCpuTimer(); - return -1; + return j; } /** @@ -42,9 +61,49 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + // DONE + + int numRemaining; + + // Step 1: Compute temporary array containing + // either 1 or 0, depending on if element meets criteria + int* temp = new int[n]; + for (int i = 0; i < n; ++i) + { + if (idata[i] == 0) + { + temp[i] = 0; + } + else + { + temp[i] = 1; + } + } + + + // Step 2: Run exclusive scan on the temp array + // Exclusive - Insert the identity + odata[0] = 0; + // Start at 1 since we inserted the identity and are shifting to the right + for (int k = 1; k < n; ++k) + { + odata[k] = odata[k - 1] + temp[k - 1]; + } + + // Step 3: Scatter! 
+ // Result of scan is index into the final array + numRemaining = odata[n - 1]; + for (int i = 0; i < n; ++i) + { + if (temp[i] == 1) + { + odata[odata[i]] = idata[i]; + } + } + timer().endCpuTimer(); - return -1; + + return numRemaining; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..c7f75fe 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,13 +12,115 @@ namespace StreamCompaction { return timer; } + // Helper functions + // Up-Sweep - parallel reduction + __global__ void upSweep(int n, int d, int* data) + { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + int index = idx << (d + 1); + + if (index + (1 << (d + 1)) - 1 < n) + { + // From the slides: + // x[k + 2d+1 – 1] += x[k + 2d – 1]; + data[index + (1 << (d + 1)) - 1] += data[index + (1 << d) - 1]; + } + } + + // Down-Sweep - traverse back down the tree using partial sums to build the scan in place. + __global__ void downSweep(int n, int d, int* data) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int index = idx << (d + 1); + + if (index + (1 << (d + 1)) - 1 < n) + { + // Save left child + int temp = data[index + (1 << d) - 1]; + + // Set left child to this node’s value + data[index + (1 << d) - 1] = data[index + (1 << (d + 1)) - 1]; + + // Set right child to old left value + this node’s value + data[index + (1 << (d + 1)) - 1] += temp; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + // DONE + // This approach uses balanced trees to avoid the extra factor of log2n work + // performed by the naive algorithm. + // GPU Gems 3, Chapter 39.2.2 + + int* dev_idata; + // Calculate number of levels needed for the scan + // ilog2ceil(x): computes the ceiling of log2(x), as an integer. 
+ int numLevels = ilog2ceil(n); + // Calculate the power of 2 number of levels + int numLevelsPow2 = 1 << numLevels; + + // Memory allocation + cudaMalloc((void**)&dev_idata, numLevelsPow2 * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_idata failed!"); + + int padding = numLevelsPow2 - n; + if (padding >= 0) + { + cudaMemset(&dev_idata[n], 0, padding * sizeof(int)); + checkCUDAError("cudaMemset dev_idata failed!"); + } + timer().startGpuTimer(); - // TODO + + //================================================================================ + // Part 1 - Upsweep phase + //================================================================================ + for (int offset = 0; offset < numLevels - 1; offset++) + { + // Calculate the number of blocks + int numBlocks = (numLevelsPow2 / (1 << (offset + 1)) + blockSize - 1) / blockSize; + + // Perform the upsweep phase + upSweep << > > (numLevelsPow2, offset, dev_idata); + checkCUDAError("upSweep kernel failed!"); + + // Sync before proceeding to the next iteration + cudaDeviceSynchronize(); + } + + // Need to set the last element to 0 before starting the down sweep phase + cudaMemset(dev_idata + numLevelsPow2 - 1, 0, sizeof(int)); + checkCUDAError("cudaMemset dev_idata failed!"); + + //================================================================================ + // Part 2 - Downsweep phase + //================================================================================ + for (int offset = numLevels - 1; offset >= 0; offset--) { + // Calculate the number of blocks + int numBlocks = (numLevelsPow2 / (1 << (offset + 1)) + blockSize - 1) / blockSize; + + // Perform the downsweep phase + downSweep << > > (numLevelsPow2, offset, dev_idata); + checkCUDAError("downSweep kernel failed!"); + + // Sync before proceeding to the next iteration + cudaDeviceSynchronize(); + } + timer().endGpuTimer(); + 
+ // Copy the results back to the host + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy device to host (dev_idata to odata) failed!"); + + // Free device memory + cudaFree(dev_idata); } /** @@ -31,10 +133,105 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + // This stream compaction method will remove 0s from an array of ints. + // Initialize necessary buffers + int* dev_idata; + int* dev_odata; + int* dev_booleans; + int* dev_indices; + + // ilog2ceil(x): computes the ceiling of log2(x), as an integer. + int numLevels = ilog2ceil(n); + size_t paddedSize = (size_t) 1 << numLevels; + + // Allocate memory for device arrays, copy input data to device + cudaMalloc((void**)&dev_idata, paddedSize * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed"); + cudaMalloc((void**)&dev_odata, paddedSize * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed"); + cudaMalloc((void**)&dev_booleans, paddedSize * sizeof(int)); + checkCUDAError("cudaMalloc dev_booleans failed"); + cudaMalloc((void**)&dev_indices, paddedSize * sizeof(int)); + checkCUDAError("cudaMalloc dev_indices failed"); + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_idata failed"); + + if (paddedSize > n) { + cudaMemset(dev_idata + n, 0, (paddedSize - n) * sizeof(int)); + checkCUDAError("cudaMemset dev_idata failed!"); + } + + dim3 fullBlocksPerGrid((paddedSize + blockSize - 1) / blockSize); + timer().startGpuTimer(); - // TODO + + // Map to boolean + StreamCompaction::Common::kernMapToBoolean << > > (paddedSize, dev_booleans, dev_idata); + checkCUDAError("kernMapToBoolean failed!"); + + // Perform scan on the boolean array + cudaMemcpy(dev_indices, dev_booleans, sizeof(int) * paddedSize, cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy from dev_booleans to dev_indices (Device To 
Device) failed!"); + + //================================================================================ + // Part 1 - Upsweep phase + //================================================================================ + for (int offset = 0; offset < numLevels - 1; offset++) { + + // Calculate necessary number of blocks + int numBlocks = (paddedSize / (1 << (offset + 1)) + blockSize - 1) / blockSize; + + if (numBlocks > 0) + { + upSweep << > > (paddedSize, offset, dev_indices); + checkCUDAError("upSweep kernel failed!"); + cudaDeviceSynchronize(); + } + } + + // Need to set the last element to 0 before starting the down sweep phase + cudaMemset(dev_indices + paddedSize - 1, 0, sizeof(int)); + checkCUDAError("cudaMemset failed!"); + + //================================================================================ + // Part 2 - Downsweep phase + //================================================================================ + for (int offset = numLevels - 1; offset >= 0; offset--) { + int numBlocks = (paddedSize / (1 << (offset + 1)) + blockSize - 1) / blockSize; + if (numBlocks > 0) + { + downSweep << > > (paddedSize, offset, dev_indices); + checkCUDAError("downSweep kernel failed!"); + cudaDeviceSynchronize(); + } + } + + //================================================================================ + // Part 3: Scatter + //================================================================================ + StreamCompaction::Common::kernScatter << > > (paddedSize, dev_odata, dev_idata, dev_booleans, dev_indices); + checkCUDAError("kernScatter failed!"); + timer().endGpuTimer(); - return -1; + + //================================================================================ + // Part 4: Copy results and free memory + //================================================================================ + int numRemaining; + cudaMemcpy(&numRemaining, dev_indices + paddedSize - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy numRemaining 
failed!"); + + // Copy result from device to host + cudaMemcpy(odata, dev_odata, sizeof(int) * numRemaining, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy odata failed!"); + + // Free memory of temp buffers + cudaFree(dev_booleans); + cudaFree(dev_indices); + cudaFree(dev_idata); + cudaFree(dev_odata); + + return numRemaining; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..9676797 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,5 +1,6 @@ #include #include +#include #include "common.h" #include "naive.h" @@ -11,15 +12,69 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } + // TODO: __global__ + __global__ void kernelNaiveScan(int n, int offset, int* idata, int* odata) + { + // Using the Naive algorithm from GPU Gems 3, Section 39.2.1. + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index >= n) return; + + if (index >= offset) { + odata[index] = idata[index] + idata[index - offset]; + } + else { + odata[index] = idata[index]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int* odata, const int* idata) { + int* bufferA, * bufferB; + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + cudaMalloc((void**)&bufferA, n * sizeof(int)); + checkCUDAError("cudaMalloc bufferA failed!"); + cudaMalloc((void**)&bufferB, n * sizeof(int)); + checkCUDAError("cudaMalloc bufferB failed!"); + + // Copy from host input data to device input data + cudaMemcpy(bufferA, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + // DONE + + // Calculate number of levels + int numLevels = ilog2ceil(n); + int offset; + + for(int d = 1; d <= numLevels; d++) { + + offset = 1 << (d - 1); + + kernelNaiveScan << > > (n, offset, bufferA, bufferB); + checkCUDAError("kernelNaiveScan failed!"); + + // Sync threads before swapping + cudaDeviceSynchronize(); + std::swap(bufferA, bufferB); + } + timer().endGpuTimer(); + + // Insert identity and shift right to make exclusive + // Copy result from device back to host + odata[0] = 0; + cudaMemcpy(odata + 1, bufferA, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy from bufferA to odata failed!"); + + // Free bufferA and bufferB memory + cudaFree(bufferA); + cudaFree(bufferB); } } -} +} \ No newline at end of file diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..85a9b61 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,26 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` + // DONE use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + // Copy from input pointer to host vector + thrust::host_vector host_in(idata, idata + n); + + // Copy from host vector to device vector + thrust::device_vector dev_input = host_in; + + // Allocate device output vector + thrust::device_vector dev_output(n); + + // Time the exclusive scan on the device + timer().startGpuTimer(); + thrust::exclusive_scan(dev_input.begin(), dev_input.end(), dev_output.begin()); timer().endGpuTimer(); + + // Copy the results back to the output array odata + thrust::copy(dev_output.begin(), dev_output.end(), odata); } } }