diff --git a/.gitignore b/.gitignore index a59ec56..b1f7699 100644 --- a/.gitignore +++ b/.gitignore @@ -25,7 +25,8 @@ build .LSOverride # Icon must end with two \r -Icon +Icon + # Thumbnails ._* @@ -560,3 +561,5 @@ xcuserdata *.xccheckout *.moved-aside *.xcuserstate +*.m +*.asv \ No newline at end of file diff --git a/README.md b/README.md index 0e38ddb..347b65b 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,169 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Mufeng Xu + * [LinkedIn](https://www.linkedin.com/in/mufeng-xu/) +* Tested on: Windows 11, i9-13900H @ 2.6GHz 32GB, RTX 4080 Laptop 12282MB (Personal Computer) -### (TODO: Your README) +> *GPU parallel algorithms have significantly better performance when the array sizes are large!* +> +>![](img/benchmark-power-of-2(linear-y).png) +>![](img/benchmark-compaction-power-of-2(linear-y).png) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +In this project, I implemented the following features: +- CPU Scan & Stream Compaction +- Naive GPU Scan Algorithm +- Work-Efficient GPU Scan & Stream Compaction +- GPU Scan and Compaction with `Thrust` +- [Extra Credit +5] Improved the performance of GPU scans with some index calculation tweaks. +- [Extra Credit +10] GPU **Radix Sort**, and compared it with `std::stable_sort`. +- Benchmarking the implementations, repeat tests to get more reliable results. +## Performance Benchmarks + +For each implementation, I tested different block sizes for array `SIZE = 1 << 26` and chose optimized one for them. +For naive implementation, the block size is `1024`, and for the work-efficient, it is `512`. 
+ +### Scan Algorithms + +![](img/benchmark-power-of-2.png) + +![](img/benchmark-non-power-of-2.png) + +From the benchmarks we can find that when the array size is larger than $2^{18}$, +GPU algorithms perform better than the CPU implementations. +When the array size is smaller than $2^{17}$, +the GPU algorithms' performance is bottlenecked, +possibly because of memory overhead. +As the array size increases, the benefits of massive parallelization become apparent. +Additionally, the Work-Efficient algorithm is roughly twice as fast as the Naive implementation. +Notably, the Thrust implementation outperforms my implementation by a significant margin. + +I believe this automatically meets the requirement of Part 5 extra credits, +since I avoided calling redundant threads, +making the GPU algorithms perform better than the CPU when the array size is large. + +![](img/Screenshot-nsight-eff-scan-summary.png) + +As you can see in the profiling, the grid sizes shrink at the later up-sweep passes and early down-sweep passes. +This optimizes the performance a lot. However, the memory throughput is not very high; this can be further optimized. + +### Stream Compaction + +![](img/benchmark-compaction-power-of-2.png) + +For smaller array sizes (up to $2^{17}$), the CPU without Scan implementation outperforms the GPU Work-Efficient algorithm due to lower overhead and the inefficiency of GPU parallelization for small datasets. However, as the array size increases beyond $2^{18}$, the GPU Work-Efficient method starts to leverage its parallel processing capabilities, eventually surpassing the CPU w/o. Scan around $2^{20}$. For larger arrays, the GPU algorithm demonstrates significantly better scalability, with a slower growth in elapsed time compared to the CPU, making it the superior choice for handling large datasets. + +I also implemented the stream compaction with `thrust::remove_if`. +But after benchmarking, I found it performs exactly like the CPU algorithms. 
+And my observation is supported by Nsight Profiling, which warns "No kernels were profiled", +and that means `thrust::remove_if` runs on the CPU. + +### Radix Sort + +![](img/benchmark-radix-sort.png) + +For smaller datasets, std::stable_sort provides better performance. +However, as the array size increases, Radix Sort proves to be more efficient, maintaining lower elapsed times and scaling much better with large datasets compared to std::stable_sort, which becomes significantly slower. + +The fewer bits the numbers in the array have, the faster Radix Sort will perform. In this test case, the numbers range from 0 to 50, requiring a minimum of 6 bits. If the maximum number (or its bit approximation) in an array is known, Radix Sort can be optimized proportionally, allowing for more efficient sorting. + +Radix Sort is implemented in `radix.h` and `radix.cu`. The interfaces for calling the Radix sort are +```c++ +void StreamCompaction::Radix::sort(int n, int bits, int* odata, const int* idata); +void StreamCompaction::Radix::sort(int n, int* odata, const int* idata); // bits = 31 +``` + +And it is tested in `main.cpp`, `#define BENCHMARK 1` to run the automatic benchmarks. +Here is the output for Radix Sort Benchmarks: + +![](img/screenshot-radix-sort.png) + +To verify the correctness of my Radix Sort, I used `std::stable_sort` as the *ground truth*. I then utilized the provided `printCmpResult()` to compare the sorting results. The outputs are shown in the following section, "Test Program Output", and confirm the correctness of the implementation. + +## Test Program Output + +The console output with array `SIZE = 1 << 24`: + +``` +**************** +** SCAN TESTS ** +**************** + [ 47 34 40 37 0 23 36 44 21 48 49 48 26 ... 20 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 29.0478ms (std::chrono Measured) + [ 0 47 81 121 158 158 181 217 261 282 330 379 427 ... 
410811186 410811206 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 28.7367ms (std::chrono Measured) + [ 0 47 81 121 158 158 181 217 261 282 330 379 427 ... 410811143 410811179 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 8.26326ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 8.32848ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 4.50995ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 3.71853ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 1.42131ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 1.31478ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 0 0 1 0 1 2 2 3 2 1 2 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 33.4106ms (std::chrono Measured) + [ 1 1 1 2 2 3 2 1 2 2 3 2 3 ... 2 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 34.1384ms (std::chrono Measured) + [ 1 1 1 2 2 3 2 1 2 2 3 2 3 ... 2 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 94.494ms (std::chrono Measured) + [ 1 1 1 2 2 3 2 1 2 2 3 2 3 ... 2 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 5.81114ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 5.72454ms (CUDA Measured) + passed +==== thrust::remove_if compact, power-of-two ==== + elapsed time: 35.2823ms (CUDA Measured) + passed +==== thrust::remove_if compact, non-power-of-two ==== + elapsed time: 37.1602ms (CUDA Measured) + passed + +***************************** +****** RADIX SORT TESTS ***** +***************************** + [ 1 32 4 32 15 0 49 17 14 0 1 42 15 ... 
16 0 ] +==== std::stable_sort, power-of-two, 6 bits ==== + elapsed time: 502.405ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ] +==== radix sort, power-of-two, 6 bits ==== + elapsed time: 34.2919ms (CUDA Measured) + passed +==== std::stable_sort, non-power-of-two, 6 bits ==== + elapsed time: 508.689ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ] +==== radix sort, non-power-of-two, 6 bits ==== + elapsed time: 34.576ms (CUDA Measured) + passed +``` + +## Modifications To Project Structures + +- Added `radix.h` and `radix.cu` to `stream_compaction/` +- Added the entries of the above files to `stream_compaction/CMakeLists.txt` diff --git a/img/Screenshot-nsight-eff-scan-summary.png b/img/Screenshot-nsight-eff-scan-summary.png new file mode 100644 index 0000000..6f218a3 Binary files /dev/null and b/img/Screenshot-nsight-eff-scan-summary.png differ diff --git a/img/benchmark-compaction-power-of-2(linear-y).png b/img/benchmark-compaction-power-of-2(linear-y).png new file mode 100644 index 0000000..1ac6335 Binary files /dev/null and b/img/benchmark-compaction-power-of-2(linear-y).png differ diff --git a/img/benchmark-compaction-power-of-2.png b/img/benchmark-compaction-power-of-2.png new file mode 100644 index 0000000..1360a5f Binary files /dev/null and b/img/benchmark-compaction-power-of-2.png differ diff --git a/img/benchmark-non-power-of-2.png b/img/benchmark-non-power-of-2.png new file mode 100644 index 0000000..97738ea Binary files /dev/null and b/img/benchmark-non-power-of-2.png differ diff --git a/img/benchmark-power-of-2(linear-y).png b/img/benchmark-power-of-2(linear-y).png new file mode 100644 index 0000000..6a932b8 Binary files /dev/null and b/img/benchmark-power-of-2(linear-y).png differ diff --git a/img/benchmark-power-of-2.png b/img/benchmark-power-of-2.png new file mode 100644 index 0000000..c64da83 Binary files /dev/null and b/img/benchmark-power-of-2.png differ diff --git a/img/benchmark-radix-sort.png 
b/img/benchmark-radix-sort.png new file mode 100644 index 0000000..bcba5f0 Binary files /dev/null and b/img/benchmark-radix-sort.png differ diff --git a/img/screenshot-radix-sort.png b/img/screenshot-radix-sort.png new file mode 100644 index 0000000..0995b25 Binary files /dev/null and b/img/screenshot-radix-sort.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..1887932 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -7,13 +7,241 @@ */ #include +#include #include #include #include #include +#include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +#define BENCHMARK 0 +#if BENCHMARK +void benchmark_scan_powerOfTwo(int repeat) +{ + std::cout << "***************************************\n"; + std::cout << "***** scan power-of-2 (repeat=" << std::setw(2) << repeat << ") *****\n"; + std::cout << "***************************************\n"; + std::cout << "Elapsed Time (ms) [Lower is Better]\n"; + std::cout << std::setw(2) << " p\t"; + std::cout << std::setw(12) << "CPU\t"; + std::cout << std::setw(12) << "GPU Naive\t"; + std::cout << std::setw(12) << "GPU Work-Eff\t"; + std::cout << std::setw(12) << "GPU Thrust\n"; + // Test on different sizes + for (int p = 8; p < 29; ++p) + { + const int SIZE = 1 << p; + int* a = new int[SIZE]; + int* b = new int[SIZE]; + + float elapsedTimes[4] = {0.f, 0.f, 0.f, 0.f}; + + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + + for (int i = 0; i < repeat; ++i) + { + zeroArray(SIZE, b); + StreamCompaction::CPU::scan(SIZE, b, a); + elapsedTimes[0] += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + + zeroArray(SIZE, b); + StreamCompaction::Naive::scan(SIZE, b, a); + elapsedTimes[1] += StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(); + + zeroArray(SIZE, b); + StreamCompaction::Efficient::scan(SIZE, b, a); + elapsedTimes[2] += 
StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + + zeroArray(SIZE, b); + StreamCompaction::Thrust::scan(SIZE, b, a); + elapsedTimes[3] += StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(); + } + + std::cout << std::setw(2) << p << "\t"; + std::cout << std::setw(11) << elapsedTimes[0] / repeat << "\t"; + std::cout << std::setw(11) << elapsedTimes[1] / repeat << "\t"; + std::cout << std::setw(11) << elapsedTimes[2] / repeat << "\t"; + std::cout << std::setw(11) << elapsedTimes[3] / repeat << "\n"; + + delete[] a; + delete[] b; + } +} + +void benchmark_scan_nonPowerOfTwo(int repeat) +{ + std::cout << "*******************************\n"; + std::cout << "***** scan non-power-of-2 *****\n"; + std::cout << "*******************************\n"; + std::cout << "Elapsed Time (ms) [Lower is Better]\n"; + std::cout << std::setw(2) << " p\t"; + std::cout << std::setw(12) << "CPU\t"; + std::cout << std::setw(12) << "GPU Naive\t"; + std::cout << std::setw(12) << "GPU Work-Eff\t"; + std::cout << std::setw(12) << "GPU Thrust\n"; + // Test on different sizes + for (int p = 8; p < 29; ++p) + { + const int SIZE = 1 << p; + const int NPOT = SIZE - 3; + int* a = new int[NPOT]; + int* b = new int[NPOT]; + + float elapsedTimes[4] = { 0.f, 0.f, 0.f, 0.f }; + + genArray(NPOT - 1, a, 50); // Leave a 0 at the end to test that edge case + a[NPOT - 1] = 0; + + for (int i = 0; i < repeat; ++i) + { + zeroArray(NPOT, b); + StreamCompaction::CPU::scan(NPOT, b, a); + elapsedTimes[0] += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + + zeroArray(NPOT, b); + StreamCompaction::Naive::scan(NPOT, b, a); + elapsedTimes[1] += StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(); + + zeroArray(NPOT, b); + StreamCompaction::Efficient::scan(NPOT, b, a); + elapsedTimes[2] += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + + zeroArray(NPOT, b); + 
StreamCompaction::Thrust::scan(NPOT, b, a); + elapsedTimes[3] += StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(); + } + + std::cout << std::setw(2) << p << "\t"; + std::cout << std::setw(11) << elapsedTimes[0] / repeat << "\t"; + std::cout << std::setw(11) << elapsedTimes[1] / repeat << "\t"; + std::cout << std::setw(11) << elapsedTimes[2] / repeat << "\t"; + std::cout << std::setw(11) << elapsedTimes[3] / repeat << "\n"; + + delete[] a; + delete[] b; + } +} + +void benchmark_stream_compaction(int repeat) +{ + std::cout << "********************************************\n"; + std::cout << "***** compaction power-of-2 (repeat=" << std::setw(2) << repeat << ") *****\n"; + std::cout << "********************************************\n"; + std::cout << "Elapsed Time (ms) [Lower is Better]\n"; + std::cout << std::setw(2) << " p\t"; + std::cout << std::setw(12) << "CPU Compaction w/o. Scan\t"; + std::cout << std::setw(12) << "CPU Compaction w/. Scan\t"; + std::cout << std::setw(12) << "GPU Compaction with Work-Eff Scan\t"; + std::cout << std::setw(12) << "GPU Compaction with thrust::remove_if\n"; + // Test on different sizes + for (int p = 8; p < 26; ++p) + { + const int SIZE = 1 << p; + int* a = new int[SIZE]; + int* b = new int[SIZE]; + + float elapsedTimes[4] = { 0.f, 0.f, 0.f, 0.f}; + + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + + for (int i = 0; i < repeat; ++i) + { + zeroArray(SIZE, b); + StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + elapsedTimes[0] += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + + zeroArray(SIZE, b); + StreamCompaction::CPU::compactWithScan(SIZE, b, a); + elapsedTimes[1] += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + + zeroArray(SIZE, b); + StreamCompaction::Efficient::compact(SIZE, b, a); + elapsedTimes[2] += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + + zeroArray(SIZE, 
b); + StreamCompaction::Thrust::compact(SIZE, b, a); + elapsedTimes[3] += StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(); + } + + std::cout << std::setw(2) << p << "\t"; + std::cout << std::setw(11) << elapsedTimes[0] / repeat << "\t"; + std::cout << std::setw(11) << elapsedTimes[1] / repeat << "\t"; + std::cout << std::setw(11) << elapsedTimes[2] / repeat << "\t"; + std::cout << std::setw(11) << elapsedTimes[3] / repeat << "\n"; + + delete[] a; + delete[] b; + } +} + +void benchmark_radix_sort(int repeat) +{ + std::cout << "*************************************\n"; + std::cout << "******* radix sort benchmarks *******\n"; + std::cout << "*************************************\n"; + std::cout << "Elapsed Time (ms) [Lower is Better]\n"; + std::cout << std::setw(2) << " p\t"; + std::cout << std::setw(20) << "std::stable_sort\t"; + std::cout << std::setw(20) << "Radix::sort 31-bits\t"; + std::cout << std::setw(20) << "Radix::sort 15-bits\t"; + std::cout << std::setw(20) << "Radix::sort 6-bits\n"; + // Test on different sizes + for (int p = 8; p < 29; ++p) + { + const int SIZE = 1 << p; + int* a = new int[SIZE]; + int* b = new int[SIZE]; + + float elapsedTimes[4] = { 0.f, 0.f, 0.f, 0.f}; + + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + + for (int i = 0; i < repeat; ++i) + { + zeroArray(SIZE, b); + StreamCompaction::CPU::sort(SIZE, b, a); + elapsedTimes[0] += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + + zeroArray(SIZE, b); + StreamCompaction::Radix::sort(SIZE, 31, b, a); + elapsedTimes[1] += StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(); + + zeroArray(SIZE, b); + StreamCompaction::Radix::sort(SIZE, 15, b, a); + elapsedTimes[2] += StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(); + + zeroArray(SIZE, b); + StreamCompaction::Radix::sort(SIZE, 6, b, a); + elapsedTimes[3] += 
StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(); + } + + std::cout << std::setw(2) << p << "\t"; + std::cout << std::setw(19) << elapsedTimes[0] / repeat << "\t"; + std::cout << std::setw(19) << elapsedTimes[1] / repeat << "\t"; + std::cout << std::setw(19) << elapsedTimes[2] / repeat << "\t"; + std::cout << std::setw(19) << elapsedTimes[3] / repeat << "\n"; + + delete[] a; + delete[] b; + } +} + +int main(int argc, char * argv[]) +{ + constexpr int repeat = 10; // This is very time-consuming + benchmark_scan_powerOfTwo(repeat); + benchmark_scan_nonPowerOfTwo(repeat); + benchmark_stream_compaction(repeat); + benchmark_radix_sort(repeat); + + return 0; +} +#else +const int SIZE = 1 << 24; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -147,8 +375,62 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + zeroArray(SIZE, c); + printDesc("thrust::remove_if compact, power-of-two"); + count = StreamCompaction::Thrust::compact(SIZE, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("thrust::remove_if compact, non-power-of-two"); + count = StreamCompaction::Thrust::compact(NPOT, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + // Sort tests + printf("\n"); + printf("*****************************\n"); + printf("****** RADIX SORT TESTS *****\n"); + printf("*****************************\n"); + + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + zeroArray(SIZE, b); + 
printDesc("std::stable_sort, power-of-two, 6 bits"); + StreamCompaction::CPU::sort(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("radix sort, power-of-two, 6 bits"); + StreamCompaction::Radix::sort(SIZE, 6, c, a); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, b); + printDesc("std::stable_sort, non-power-of-two, 6 bits"); + StreamCompaction::CPU::sort(NPOT, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(NPOT, b, true); + + zeroArray(SIZE, c); + printDesc("radix sort, non-power-of-two, 6 bits"); + StreamCompaction::Radix::sort(NPOT, 6, c, a); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(SIZE, b, c); + +#ifdef _WIN32 system("pause"); // stop Win32 console from closing on exit +#endif + delete[] a; delete[] b; delete[] c; } +#endif diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 19511ca..a794632 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -4,6 +4,7 @@ set(headers "naive.h" "efficient.h" "thrust.h" + "radix.h" ) set(sources @@ -12,6 +13,7 @@ set(sources "naive.cu" "efficient.cu" "thrust.cu" + "radix.cu" ) list(SORT headers) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..65c87cc 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,5 +1,10 @@ #include "common.h" +#include +#include +#include + + void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); if (cudaSuccess == err) { @@ 
-24,6 +29,10 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { return; } + + bools[index] = idata[index] != 0; } /** @@ -33,6 +42,22 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { return; } + + if (bools[index]) + { + odata[indices[index]] = idata[index]; + } + } + + __global__ void kernScatter(int n, int* odata, + const int* idata, const int* indices) { + // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { return; } + + odata[indices[index]] = idata[index]; } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..c87ff66 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -37,6 +37,9 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices); + __global__ void kernScatter(int n, int* odata, + const int* idata, const int* indices); + /** * This class is used for timing the performance * Uncopyable and unmovable diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..6ffd7aa 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,12 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + // Exclusive Prefix Sum + odata[0] = 0; + for (int i = 0; i < n - 1; ++i) + { + odata[i + 1] = odata[i] + idata[i]; + } timer().endCpuTimer(); } @@ -31,8 +37,16 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int j = 0; + for (int i = 0; i < n; ++i) + { + if (idata[i] != 0) + { + odata[j++] = idata[i]; + } + } 
timer().endCpuTimer(); - return -1; + return j; } /** @@ -41,10 +55,55 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); // TODO + // Temporary arrays + int* mask = new int[n]; + int* indices = new int[n]; + + timer().startCpuTimer(); + + for (int i = 0; i < n; ++i) + { + mask[i] = idata[i] != 0; + } + + // scan(n, indices, mask); + indices[0] = 0; + for (int i = 0; i < n - 1; ++i) + { + indices[i + 1] = indices[i] + mask[i]; + } + + // Scatter + for (int i = 0; i < n; ++i) + { + if (mask[i]) + { + odata[indices[i]] = idata[i]; + } + } + + timer().endCpuTimer(); + + // Retrieve the number of elements remaining + int elementCount = indices[n - 1] + mask[n - 1]; + + // Clean up + delete[] mask; + delete[] indices; + + return elementCount; + } + + void sort(int n, int *odata, const int *idata) + { + memcpy(odata, idata, n * sizeof(int)); + + timer().startCpuTimer(); + + std::stable_sort(odata, odata + n); + timer().endCpuTimer(); - return -1; } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c047..222b77a 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -11,5 +11,7 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata); int compactWithScan(int n, int *odata, const int *idata); + + void sort(int n, int* odata, const int* idata); } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..68e7aed 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,12 @@ #include "common.h" #include "efficient.h" +#include +#include +#include + +constexpr int blockSize = 512; // Optimized for SIZE = 1 << 26 + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,15 +18,80 @@ namespace StreamCompaction { return timer; } + __global__ void 
kernReduce(int N, int d, int *data) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { return; } + + int k = index * (1 << (d + 1)); + + data[k + (1 << (d + 1)) - 1] += data[k + (1 << d) - 1]; + } + + __global__ void kernTraverseBack(int N, int d, int *data) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { return; } + + int k = index * (1 << (d + 1)); + + int t = data[k + (1 << d) - 1]; + data[k + (1 << d) - 1] = data[k + (1 << (d + 1)) - 1]; + data[k + (1 << (d + 1)) - 1] += t; + } + + void scan_gpu(int n, int* dev_data) + { + thrust::device_ptr dev_thrust_data(dev_data); + + // Up-sweep + for (int d = 0; d < ilog2ceil(n) - 1; ++d) + { + dim3 threadsPerBlock(blockSize); + dim3 blocksPerGrid(((n >> (d + 1)) + blockSize - 1) / blockSize); + kernReduce <<>> ((n >> (d + 1)), d, dev_data); + checkCUDAError("kernReduce launch failed!"); + } + + // Down-sweep + dev_thrust_data[n - 1] = 0; + + for (int d = ilog2ceil(n) - 1; d >= 0; --d) + { + int numThreads = n >> (d + 1); + if (numThreads << (d + 1) != n) { numThreads = n >> d; } + dim3 threadsPerBlock(blockSize); + dim3 blocksPerGrid((numThreads + blockSize - 1) / blockSize); + kernTraverseBack <<>> (numThreads, d, dev_data); + checkCUDAError("kernTraverseBack launch failed!"); + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + int nCeil = 1 << ilog2ceil(n); + // Up-sweep + int *dev_data; + cudaMalloc((void**)&dev_data, nCeil * sizeof(int)); + checkCUDAError("cudaMalloc dev_data failed!"); + + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy idata->dev_data failed!"); + timer().startGpuTimer(); - // TODO + scan_gpu(nCeil, dev_data); timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dev_data->odata failed!"); + + // Clean up + cudaFree(dev_data); } + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -31,10 +102,51 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + int nCeil = 1 << ilog2ceil(n); + size_t sizeOfArrays = nCeil * sizeof(int); + + int* dev_bools; + cudaMalloc((void**)&dev_bools, sizeOfArrays); + checkCUDAError("cudaMalloc dev_bools failed!"); + + int* dev_indices; + cudaMalloc((void**)&dev_indices, sizeOfArrays); + checkCUDAError("cudaMalloc dev_indices failed!"); + + thrust::device_ptr dev_thrust_indices(dev_indices); + thrust::device_ptr dev_thrust_bools(dev_bools); + + int* dev_idata; + cudaMalloc((void**)&dev_idata, sizeOfArrays); + checkCUDAError("cudaMalloc dev_idata failed!"); + + int* dev_odata; + cudaMalloc((void**)&dev_odata, sizeOfArrays); + checkCUDAError("cudaMalloc dev_odata failed!"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy idata->dev_idata failed!"); + + dim3 threadsPerBlock(blockSize); + dim3 blocksPerGrid((n + blockSize - 1) / blockSize); + timer().startGpuTimer(); - // TODO + Common::kernMapToBoolean <<>> (nCeil, dev_bools, dev_idata); + checkCUDAError("kernMapToBoolean launch failed!"); + + cudaMemcpy(dev_indices, dev_bools, sizeOfArrays, cudaMemcpyDeviceToDevice); + 
checkCUDAError("cudaMemcpy dev_bools->dev_indices failed!"); + scan_gpu(nCeil, dev_indices); + + int numRemaining = dev_thrust_indices[n - 1] + dev_thrust_bools[n - 1]; + + Common::kernScatter <<>> (nCeil, dev_odata, dev_idata, dev_bools, dev_indices); + checkCUDAError("kernScatter launch failed!"); timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata, numRemaining * sizeof(int), cudaMemcpyDeviceToHost); + + return numRemaining; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..134ffe0 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,6 +6,7 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); + void scan_gpu(int n, int* dev_data); void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..fe83936 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,7 +1,10 @@ #include -#include #include "common.h" #include "naive.h" +#include +#include + +constexpr int blockSize = 1024; // Optimized for SIZE = 1 << 26 namespace StreamCompaction { namespace Naive { @@ -13,13 +16,56 @@ namespace StreamCompaction { } // TODO: __global__ + __global__ void kernPartialSum(int N, int d, int* odata, const int *idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { return; } + if (index >= (1 << d)) + { + odata[index] = idata[index - (1 << d)] + idata[index]; + } + else + { + odata[index] = idata[index]; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); // TODO + + // Ping-pong device data buffers + int* dev_idata; + int* dev_odata; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy idata->dev_idata failed!"); + + dim3 threadsPerBlock(blockSize); + dim3 numBlocks((n + blockSize - 1) / blockSize); + + timer().startGpuTimer(); + for (int d = 0; d < ilog2ceil(n); ++d) + { + kernPartialSum <<>> (n, d, dev_odata, dev_idata); + checkCUDAError("Kernel launch failed!"); + std::swap(dev_odata, dev_idata); + } timer().endGpuTimer(); + + // Copy to CPU and shift to right + odata[0] = 0; + cudaMemcpy(odata + 1, dev_idata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dev_idata->odata failed!"); + + cudaFree(dev_idata); + cudaFree(dev_odata); } } } diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu new file mode 100644 index 0000000..ef76561 --- /dev/null +++ b/stream_compaction/radix.cu @@ -0,0 +1,115 @@ +#include +#include +#include "common.h" +#include "efficient.h" +#include "radix.h" + +#include +#include + +#define getBit(num, k) ((num) & (1 << (k))) >> (k) + +constexpr int blockSize = 256; + +namespace StreamCompaction +{ + namespace Radix + { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + __global__ void kernGetBitMask(int N, int k, const int* data, int* b, int* e) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { return; } + + int bit = getBit(data[index], k); + b[index] = bit; + e[index] = bit ^ 1; + } + + __global__ void kernGetIdx(int N, int totalFalses, const int* b, const int* f, int* d) + { + int index = (blockIdx.x 
* blockDim.x) + threadIdx.x; + if (index >= N) { return; } + + d[index] = b[index] ? index - f[index] + totalFalses : f[index]; + } + + void sort(int n, int* odata, const int* idata) + { + sort(n, 31, odata, idata); + } + + void sort(int n, int bits, int* odata, const int* idata) + { + int nCeil = 1 << ilog2ceil(n); + + int* dev_idata; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_data failed!"); + + int* dev_odata; + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_data failed!"); + + int* dev_b; + cudaMalloc((void**)&dev_b, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_b failed!"); + + int* dev_e; + cudaMalloc((void**)&dev_e, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_e failed!"); + + int* dev_f; + cudaMalloc((void**)&dev_f, nCeil * sizeof(int)); + checkCUDAError("cudaMalloc dev_f failed!"); + + int* dev_d; + cudaMalloc((void**)&dev_d, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_d failed!"); + + thrust::device_ptr dev_thrust_f(dev_f); + thrust::device_ptr dev_thrust_e(dev_e); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy idata->dev_idata failed!"); + + timer().startGpuTimer(); + for (int k = 0; k < bits; ++k) + { + // k-th pass + dim3 threadsPerBlock(blockSize); + dim3 blocksPerGrid((n + blockSize - 1) / blockSize); + kernGetBitMask <<>> (n, k, dev_idata, dev_b, dev_e); + + cudaMemcpy(dev_f, dev_e, n * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy dev_e->dev_f failed!"); + Efficient::scan_gpu(nCeil, dev_f); + + int totalFalses = dev_thrust_e[n - 1] + dev_thrust_f[n - 1]; + + kernGetIdx <<>> (n, totalFalses, dev_b, dev_f, dev_d); + + Common::kernScatter <<>> (n, dev_odata, dev_idata, dev_d); + + std::swap(dev_odata, dev_idata); + } + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dev_idata->odata failed!"); + + 
cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_b); + cudaFree(dev_e); + cudaFree(dev_f); + cudaFree(dev_d); + } + } +} \ No newline at end of file diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h new file mode 100644 index 0000000..7dbffe3 --- /dev/null +++ b/stream_compaction/radix.h @@ -0,0 +1,14 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction +{ + namespace Radix + { + StreamCompaction::Common::PerformanceTimer& timer(); + + void sort(int n, int* odata, const int* idata); + void sort(int n, int bits, int* odata, const int* idata); + } +} \ No newline at end of file diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..210f9c5 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include "common.h" #include "thrust.h" @@ -18,11 +20,38 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + thrust::device_vector dv_idata(idata, idata + n); + thrust::device_vector dv_odata(n); + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dv_idata.begin(), dv_idata.end(), dv_odata.begin()); + timer().endGpuTimer(); + + cudaMemcpy(odata, dv_odata.data().get(), n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dv_odata->odata failed!"); + } + + int compact(int n, int *odata, const int *idata) + { + memcpy(odata, idata, n * sizeof(int)); + + struct is_zero + { + __host__ __device__ + bool operator() (const int x) const + { + return x == 0; + } + }; + + timer().startGpuTimer(); + int* new_end = thrust::remove_if(odata, odata + n, is_zero()); timer().endGpuTimer(); + + return new_end - odata; } } } diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h index fe98206..b3b5616 100644 --- a/stream_compaction/thrust.h +++ b/stream_compaction/thrust.h @@ -7,5 +7,6 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + int compact(int n, int* odata, const int* idata); } }