diff --git a/README.md b/README.md index 0e38ddb..c684bdb 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,73 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Li Zheng + * [LinkedIn](https://www.linkedin.com/in/li-zheng-1955ba169) +* Tested on: Windows CUDA10, i5-3600 @ 3.59GHz 16GB, RTX 2060 6GB (personal computer) -### (TODO: Your README) +This project implements different versions of scan, including CPU scan, naive scan, work-efficient scan and thrust scan. Some of these methods are used to implement stream compaction. A timer is used to measure the time cost and evaluate the performance. -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Performance Analysis +![blockSize](img/blockSize.PNG) +This diagram demonstrates the change of time with respect to block size. The block size of 128 and 256 have relatively good performance. +![powerOfTwo](img/powerOfTwo.PNG) +![nonePowerOfTwo](img/nonPowerOfTwo.PNG) +The diagrams show the change of time with array size increases. The first diagram is for power-of-two size array. The second one is for non-power-of-two size array. Their performance is almost the same. When the array size is small, the CPU implementation has a better performance. I think it is because most of the threads doesn't actually work at a deeper level, but just swap two device memory. Additionally, the GPU version algorithms use bit shifting to find offsets or intervals of each level, which takes extra time. With the array size increases, the GPU version algorithms have better performance, especially the work-efficient and thrust method. + +### Output of The Test Program +Here is the test result of an array of 2^16 and a block size of 128. More results are in img/performance analysis.xlsx. +``` +**************** +** SCAN TESTS ** +**************** + [ 49 29 24 15 46 49 46 8 35 40 38 18 44 ... 3 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.1305ms (std::chrono Measured) + [ 0 49 78 102 117 163 212 258 266 301 341 379 397 ... 1603889 1603892 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.129ms (std::chrono Measured) + [ 0 49 78 102 117 163 212 258 266 301 341 379 397 ... 1603839 1603856 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.052416ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.050752ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.11264ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.113344ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.073344ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.055296ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 3 2 1 0 1 2 2 3 0 2 2 0 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.1239ms (std::chrono Measured) + [ 3 3 2 1 1 2 2 3 2 2 1 2 2 ... 3 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.1241ms (std::chrono Measured) + [ 3 3 2 1 1 2 2 3 2 2 1 2 2 ... 3 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.3149ms (std::chrono Measured) + [ 3 3 2 1 1 2 2 3 2 2 1 2 2 ... 3 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.124928ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.198656ms (CUDA Measured) + passed +``` diff --git a/img/blockSize.PNG b/img/blockSize.PNG new file mode 100644 index 0000000..01b595d Binary files /dev/null and b/img/blockSize.PNG differ diff --git a/img/nonPowerOfTwo.PNG b/img/nonPowerOfTwo.PNG new file mode 100644 index 0000000..c2ed2ac Binary files /dev/null and b/img/nonPowerOfTwo.PNG differ diff --git a/img/performance analysis.xlsx b/img/performance analysis.xlsx new file mode 100644 index 0000000..dca8dd3 Binary files /dev/null and b/img/performance analysis.xlsx differ diff --git a/img/powerOfTwo.PNG b/img/powerOfTwo.PNG new file mode 100644 index 0000000..dd5a50d Binary files /dev/null and b/img/powerOfTwo.PNG differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..2fce82c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 16; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..2e35147 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,16 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + if (idata[index] == 0) { + bools[index] = 0; + } + else { + bools[index] = 1; + } } /** @@ -33,6 +43,13 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + if (bools[index] == 1) { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..297bb4d 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -12,6 +12,7 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define blockSize 256 /** * Check for CUDA errors; print and exit if there was a problem. diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..3d3b669 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -4,47 +4,79 @@ #include "common.h" namespace StreamCompaction { - namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } + namespace CPU { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } - /** - * CPU scan (prefix sum). - * For performance analysis, this is supposed to be a simple for loop. - * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. - */ - void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - } + /** + * CPU scan (prefix sum). + * For performance analysis, this is supposed to be a simple for loop. + * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. + */ + void scan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + // TODO + odata[0] = 0; + for (int k = 1; k < n; ++k) { + odata[k] = odata[k - 1] + idata[k - 1]; + } + timer().endCpuTimer(); + } - /** - * CPU stream compaction without using the scan function. - * - * @returns the number of elements remaining after compaction. - */ - int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; - } + /** + * CPU stream compaction without using the scan function. + * + * @returns the number of elements remaining after compaction. + */ + int compactWithoutScan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + // TODO + int ptr = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[ptr] = idata[i]; + ptr++; + } + } + timer().endCpuTimer(); + return ptr; + } - /** - * CPU stream compaction using scan and scatter, like the parallel version. - * - * @returns the number of elements remaining after compaction. - */ - int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; - } - } + /** + * CPU stream compaction using scan and scatter, like the parallel version. + * + * @returns the number of elements remaining after compaction. + */ + int compactWithScan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + // TODO + int count = 0; + int *tmp = new int[n]; + for (int i = 0; i < n; i++) { + if (idata[i] == 0) { + tmp[i] = 0; + } + else { + tmp[i] = 1; + count++; + } + } + int *tmpScan = new int[n]; + tmpScan[0] = 0; + for (int k = 1; k < n; ++k) { + tmpScan[k] = tmpScan[k - 1] + tmp[k - 1]; + } + for (int i = 0; i < n; i++) { + odata[tmpScan[i]] = idata[i]; + } + timer().endCpuTimer(); + delete tmp; + delete tmpScan; + return count; + } + } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..c97f2c0 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,15 +12,67 @@ namespace StreamCompaction { return timer; } + __global__ void kernEfficientScanUpSweep(int n, int *odata, int d) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + int interval = 1 << (d + 1); + int halfInterval = 1 << d; + if ((index + 1) % interval == 0) { + odata[index] += odata[index - halfInterval]; + } + } + + __global__ void kernEfficientScanDownSweep(int n, int *odata, int d, int topLayer) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + if (d == topLayer && index == n - 1) { + odata[index] = 0; + } + int interval = 1 << (d + 1); + int halfInterval = 1 << d; + if ((index + 1) % interval == 0) { + int tmp = odata[index - halfInterval]; + odata[index - halfInterval] = odata[index]; + odata[index] += tmp; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); // TODO - timer().endGpuTimer(); + int N = pow(2, ilog2ceil(n)); + int *dev_odata; + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + if (N > n) { + int *zeroArray = new int[N - n]; + for (int i = 0; i < N - n; i++) { + zeroArray[i] = 0; + } + cudaMemcpy(dev_odata + n, zeroArray, (N - n) * sizeof(int), cudaMemcpyHostToDevice); + } + timer().startGpuTimer(); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + int topLayer = ilog2ceil(n) - 1; + for (int d = 0; d <= topLayer; d++) { + kernEfficientScanUpSweep << > > (N, dev_odata, d); + } + + for (int d = topLayer; d >= 0; d--) { + kernEfficientScanDownSweep << > > (N, dev_odata, d, topLayer); + } + + timer().endGpuTimer(); + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_odata); } + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -31,10 +83,52 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); // TODO - timer().endGpuTimer(); - return -1; + int N = pow(2, ilog2ceil(n)); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + int topLayer = ilog2ceil(n) - 1; + int *dev_idata, *dev_odata, *dev_bools, *dev_indices; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMalloc((void**)&dev_bools, N * sizeof(int)); + cudaMalloc((void**)&dev_indices, N * sizeof(int)); + + timer().startGpuTimer(); + Common::kernMapToBoolean << > > (n, dev_bools, dev_idata); + if (N > n) { + int *zeroArray = new int[N - n]; + for (int i = 0; i < N - n; i++) { + zeroArray[i] = 0; + } + cudaMemcpy(dev_bools + n, zeroArray, (N - n) * sizeof(int), cudaMemcpyHostToDevice); + } + cudaMemcpy(dev_indices, dev_bools, N * sizeof(int), cudaMemcpyDeviceToDevice); + + int countScatter = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + countScatter++; + } + } + for (int d = 0; d <= topLayer; d++) { + kernEfficientScanUpSweep << > > (N, dev_indices, d); + } + + for (int d = topLayer; d >= 0; d--) { + kernEfficientScanDownSweep << > > (N, dev_indices, d, topLayer); + } + + Common::kernScatter << > > (n, dev_odata, + dev_idata, dev_bools, dev_indices); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, countScatter * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bools); + cudaFree(dev_indices); + return countScatter; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..b9584fe 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -4,22 +4,53 @@ #include "naive.h" namespace StreamCompaction { - namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - // TODO: __global__ + namespace Naive { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + // TODO: __global__ + __global__ void kernNaiveScan(int n, int *odata, int *tmp, int d) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + int offset = 1 << (d - 1); + if (index >= offset) { + tmp[index] = odata[index - offset] + odata[index]; + } + else { + tmp[index] = odata[index]; + } + } - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - } + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ + void scan(int n, int *odata, const int *idata) { + // TODO + int *dev_odata, *dev_tmp; + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + int *zero = 0; + cudaMemcpy(dev_odata, zero, sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata + 1, idata, (n - 1) * sizeof(int), cudaMemcpyHostToDevice); + cudaMalloc((void**)&dev_tmp, n * sizeof(int)); + + timer().startGpuTimer(); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + for (int d = 1; d <= ilog2ceil(n); d++) { + kernNaiveScan << > > (n, dev_odata, dev_tmp, d); + int *tmpPtr = dev_tmp; + dev_tmp = dev_odata; + dev_odata = tmpPtr; + } + timer().endGpuTimer(); + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_odata); + cudaFree(dev_tmp); + } + } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..3e101de 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,20 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int *dev_idata, *dev_odata; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + thrust::device_ptr dev_thrust_odata(dev_odata); + thrust::device_ptr dev_thrust_idata(dev_idata); + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_thrust_idata, dev_thrust_idata + n, dev_thrust_odata); timer().endGpuTimer(); + + cudaMemcpy(odata, thrust::raw_pointer_cast(dev_thrust_odata), n * sizeof(int), cudaMemcpyDeviceToHost); } } }