diff --git a/README.md b/README.md index b71c458..5e6610e 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,210 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Michael Willett +* Tested on: Windows 10, i5-4690k @ 3.50GHz 8.00GB, GTX 750-TI 2GB (Personal Computer) -### (TODO: Your README) +## Contents +1. [Introduction](#intro) +2. [Algorithms](#part1) +3. [Verification and Performance Analysis](#part2) +4. [Development Process](#part3) +5. [Build Instructions](#appendix) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) + +## Introduction: Parallel Algorithms +The goal of this project was to implement several versions of basic GPU algorithms to become familiar with +parallel programming ideas, as well as compare the performance overhead of memory management on GPU vs +traditional CPU implementations. + + + +## Section 1: Scanning, Stream Compaction, and Sorting +Three basic algorithms were implemented in this project: + +1. Scan - consecutive sum of all elements in the array prior to the current index. +2. Compaction - removal of all elements in an array that meet some boolean criteria. +3. Sort - arrange all elements in an array from low to high using a radix sort implementation. + +Two implementations of GPU based scan were implemented: a naive version where elements were added in pairs until all +sums were computed, and a work efficient version where the total number of operators is reduced at the cost of performing +two separate stages of analysis. In the implemented version, the work-efficient scan performs slightly worse than +the naive approach from the extra down-sweep phase; however, there is room for improvement with smarter array +indexing to reduce the total number of GPU kernel calls. 
Both algorithms run in O(log(N)) which is theoretically +faster than CPU implementations at O(N), but performance analysis shows that this is only significant for large +arrays due to array access overhead on the GPU level. + +Stream compaction shows similar trends in computation complexity in GPU vs. CPU performance, but catches up to CPU +faster than the scan implementations since GPU accelerated reordering of arrays is significantly faster than CPU +implementations even for relatively small arrays. + +The radix sort algorithm iterates over each bit in the input value from least significant to most significant, reordering +the array at each step. Again since the reordering step is highly efficient on a GPU, the major overhead is due to +memory access time. + + + +## Section 2: Verification and Performance Analysis +*Code Correctness for Scan, Compact, and Sort Implementations:* + +``` +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 35 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] +==== cpu scan, non-power-of-two ==== + passed +==== naive scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] + passed +==== naive scan, non-power-of-two ==== + passed +==== work-efficient scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] + passed +==== work-efficient scan, non-power-of-two ==== + passed +==== thrust scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] + passed +==== thrust scan, non-power-of-two ==== + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 
3 3 ] + passed +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed +==== work-efficient compact, power-of-two ==== + passed +==== work-efficient compact, non-power-of-two ==== + passed + +********************** +** RADIX SORT TESTS ** +********************** +==== radix sort, power-of-two ==== + [ 38 7719 21238 2437 8855 11797 8365 32285 ] + [ 38 2437 7719 8365 8855 11797 21238 32285 ] + passed +==== radix sort, non-power-of-two ==== + [ 38 7719 21238 2437 8855 11797 8365 ] + [ 38 2437 7719 8365 8855 11797 21238 ] + passed + +Sort passed 1000/1000 randomly generated verification tests. +``` + +### Benchmarks +*All performance measurements were averaged over 1000 samples on a single dataset* +![Scan Performance](imgs/scan_benchmarks.PNG) +![Compact Performance](imgs/compact_benchmarks.PNG) +![Sort Performance](imgs/sort_benchmarks.PNG) + +We can quickly see from the above charts that run time for the various GPU implementations for scan, compact, and sort all +have logarithmic growth, while CPU implementations observe polynomial growth (for scan and compaction, CPU growth is linear). +As a result, for smaller arrays the overhead of memory access and array manipulation results in significantly worse execution +time, and it isn't until arrays of about ~50,000 elements that performance is comparable. Unfortunately due to implementation +choices, the GPU implementations crash for arrays larger than 2^16, so performance analysis could not be run for these +large arrays, and estimates must be extrapolated from the current dataset. + +![Scan Kernel Calls](imgs/scan_performance.PNG) +The above figure shows the room for improvement of the efficient scan quite easily. Compared to the naive scan steps, +the up-sweep and down-sweep of the work-efficient implementation do not utilize nearly as much of the GPU per call as +the naive scan. 
Saturating the GPU here could yield significant improvements by reducing the total number of kernel +calls required. + + +For the thrust library benchmarks, it is very interesting to see what appears to be standard linear growth due to the +performance loss in large arrays. Preliminary research on the thrust sort implementation implies that for basic sorting +of primitives like integers used in this example, it should also be using a radix sort implemented on the GPU. The charts +suggest, however, that thrust is either not properly parallelizing the data, or it is running on the CPU. Documentation states +that method overloads that accept arrays located on the host will automatically handle data transfer to and from the GPU, +but that may not be the case. Additional work to instantiate the data on the GPU prior to the sort results in an Abort() +call from the thrust library, so additional investigation would be merited. + + +## Section 3: Development Process +Development was fairly straightforward algorithmic implementation. Future work could be done in the work efficient +scan implementation to better handle launching kernel functions at the high depth levels when only a couple of sums are +being calculated for the whole array. This should improve performance to be faster than the naive approach, but I +expect it to still be outperformed by the thrust implementation as well as the CPU version for arrays with fewer than +30,000 elements. + +It is worth noting there was a bug in using std::pow for calculating the array index in each kernel invocation. For some +unknown reason, it consistently produced erroneous values at 2^11, or 2048. This is odd since variables were being cast to +double precision before the computation, but the reverse cast was incorrect. This bug may be compiler specific as running +the index calculation on a separate machine resulted in accurate indexing in this range. Simply changing the code to use +bitshift operations cleared the error entirely. 
+ + + +## Appendix: Build Instructions +**CMakeLists.txt modified to include new sort class and update compute compatibility** + +* `src/` contains the source code. + +**CMake note:** Do not change any build settings or add any files to your +project directly (in Visual Studio, Nsight, etc.) Instead, edit the +`src/CMakeLists.txt` file. Any files you add must be added here. If you edit it, +just rebuild your VS/Nsight project to make it update itself. + +#### Windows + +1. In Git Bash, navigate to your cloned project directory. +2. Create a `build` directory: `mkdir build` + * (This "out-of-source" build makes it easy to delete the `build` directory + and try again if something goes wrong with the configuration.) +3. Navigate into that directory: `cd build` +4. Open the CMake GUI to configure the project: + * `cmake-gui ..` or `"C:\Program Files (x86)\cmake\bin\cmake-gui.exe" ..` + * Don't forget the `..` part! + * Make sure that the "Source" directory is like + `.../Project2-Stream-Compaction`. + * Click *Configure*. Select your version of Visual Studio, Win64. + (**NOTE:** you must use Win64, as we don't provide libraries for Win32.) + * If you see an error like `CUDA_SDK_ROOT_DIR-NOTFOUND`, + set `CUDA_SDK_ROOT_DIR` to your CUDA install path. This will be something + like: `C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v7.5` + * Click *Generate*. +5. If generation was successful, there should now be a Visual Studio solution + (`.sln`) file in the `build` directory that you just created. Open this. + (from the command line: `explorer *.sln`) +6. Build. (Note that there are Debug and Release configuration options.) +7. Run. Make sure you run the `cis565_` target (not `ALL_BUILD`) by + right-clicking it and selecting "Set as StartUp Project". + * If you have switchable graphics (NVIDIA Optimus), you may need to force + your program to run with only the NVIDIA card. 
In NVIDIA Control Panel, + under "Manage 3D Settings," set "Multi-display/Mixed GPU acceleration" + to "Single display performance mode". + +#### OS X & Linux + +It is recommended that you use Nsight. + +1. Open Nsight. Set the workspace to the one *containing* your cloned repo. +2. *File->Import...->General->Existing Projects Into Workspace*. + * Select the Project 0 repository as the *root directory*. +3. Select the *cis565-* project in the Project Explorer. From the *Project* + menu, select *Build All*. + * For later use, note that you can select various Debug and Release build + configurations under *Project->Build Configurations->Set Active...*. +4. If you see an error like `CUDA_SDK_ROOT_DIR-NOTFOUND`: + * In a terminal, navigate to the build directory, then run: `cmake-gui ..` + * Set `CUDA_SDK_ROOT_DIR` to your CUDA install path. + This will be something like: `/usr/local/cuda` + * Click *Configure*, then *Generate*. +5. Right click and *Refresh* the project. +6. From the *Run* menu, *Run*. Select "Local C/C++ Application" and the + `cis565_` binary. 
diff --git a/benchmark data.xlsx b/benchmark data.xlsx new file mode 100644 index 0000000..aa3b627 Binary files /dev/null and b/benchmark data.xlsx differ diff --git a/imgs/compact_benchmarks.PNG b/imgs/compact_benchmarks.PNG new file mode 100644 index 0000000..e759b1a Binary files /dev/null and b/imgs/compact_benchmarks.PNG differ diff --git a/imgs/scan_benchmarks.PNG b/imgs/scan_benchmarks.PNG new file mode 100644 index 0000000..6acebaa Binary files /dev/null and b/imgs/scan_benchmarks.PNG differ diff --git a/imgs/scan_performance.PNG b/imgs/scan_performance.PNG new file mode 100644 index 0000000..8a93fbe Binary files /dev/null and b/imgs/scan_performance.PNG differ diff --git a/imgs/sort_benchmarks.PNG b/imgs/sort_benchmarks.PNG new file mode 100644 index 0000000..6ef2fa2 Binary files /dev/null and b/imgs/sort_benchmarks.PNG differ diff --git a/src/main.cpp b/src/main.cpp index 675da35..5b01d79 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,21 +11,30 @@ #include #include #include +#include +#include +#include #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int POW = 16; + const int SIZE = 1 << POW; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; - // Scan tests + const int KEYSIZE = 16; + int a_sm[8], b_sm[8], c_sm[8]; + genArray(8, a_sm, 1 << KEYSIZE); + + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + // Scan tests +#if 0 printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); @@ -37,14 +46,14 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); - printArray(NPOT, b, true); + //printArray(NPOT, b, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); 
StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); @@ -53,9 +62,9 @@ int main(int argc, char* argv[]) { printCmpResult(NPOT, b, c); zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); + printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); @@ -67,7 +76,7 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); @@ -120,4 +129,112 @@ int main(int argc, char* argv[]) { count = StreamCompaction::Efficient::compact(NPOT, c, a); //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); -} + + printf("\n"); + printf("**********************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("**********************\n"); + + + zeroArray(8, b_sm); + zeroArray(8, c_sm); + printDesc("radix sort, power-of-two"); + printArray(8, a_sm, true); + memcpy(b_sm, a_sm, 8 * sizeof(int)); + StreamCompaction::Sort::radix(8, KEYSIZE, c_sm, a_sm); + thrust::sort(b_sm, b_sm + 8); + printArray(8, c_sm, true); + printCmpResult(8, b_sm, c_sm); + + int a_npot_sm[7]; + zeroArray(8, b_sm); + zeroArray(8, c_sm); + memcpy(a_npot_sm, a_sm, 7 * sizeof(int)); + printDesc("radix sort, non-power-of-two"); + printArray(7, a_npot_sm, true); + memcpy(b_sm, a_npot_sm, 7 * sizeof(int)); + StreamCompaction::Sort::radix(7, KEYSIZE, c_sm, a_npot_sm); + thrust::sort(b_sm, b_sm + 7); + printArray(7, c_sm, true); + printCmpResult(7, b_sm, c_sm); + +#endif +#if 0 + int successes = 0; + int tests = 1000; + for (int n = 0; n < tests; n++){ + zeroArray(SIZE, b); + 
zeroArray(SIZE, c); + genArray(SIZE, a, 1 << POW); // Leave a 0 at the end to test that edge case + memcpy(b, a, SIZE*sizeof(int)); + thrust::sort(b, b + SIZE); + StreamCompaction::Sort::radix(SIZE, KEYSIZE, c, a); + + if (!cmpArrays(SIZE, b, c)) successes++; + } + + printf("\nSort passed %i/%i randomly generated verification tests.\n", successes, tests); + +#endif + +#if 1 + printf("\n"); + printf("**********************\n"); + printf("** TIMING TESTS **\n"); + printf("**********************\n"); + +#define ITER 1 << i + + zeroArray(SIZE, c); + printDesc("naive scan, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::Naive::TestScan(ITER, c, a); + + zeroArray(SIZE, c); + printDesc("efficient scan, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::Efficient::TestScan(ITER, c, a); + + + zeroArray(SIZE, c); + printDesc("Thrust scan, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::Thrust::TestScan(ITER, c, a); + + + + + zeroArray(SIZE, c); + printDesc("CPU scan, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::CPU::TestScan(ITER, c, a); + + + zeroArray(SIZE, c); + printDesc("CPU compact, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::CPU::TestCompact(ITER, c, a); + + zeroArray(SIZE, c); + printDesc("CPU compact without scan, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::CPU::TestCompactWithoutScan(ITER, c, a); + + + zeroArray(SIZE, c); + printDesc("efficient compact, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::Efficient::TestCompact(ITER, c, a); + + + + zeroArray(SIZE, c); + printDesc("Thrust stable sort, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::Thrust::TestSortStable(ITER, c, a); + + + zeroArray(SIZE, c); + printDesc("Thrust unstable sort, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::Thrust::TestSortUnstable(ITER, c, a); + + + zeroArray(SIZE, c); + printDesc("Radix sort, power-of-two"); + 
for (int i = 1; i <= POW; i++) StreamCompaction::Sort::TestSort(ITER, KEYSIZE, c, a); + +#endif + +} \ No newline at end of file diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..cb99a37 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -7,11 +7,13 @@ set(SOURCE_FILES "naive.cu" "efficient.h" "efficient.cu" + "sort.h" + "sort.cu" "thrust.h" "thrust.cu" ) cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_50 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..1b5df61 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,5 +1,6 @@ #include "common.h" + void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); if (cudaSuccess == err) { @@ -23,7 +24,12 @@ namespace Common { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + bools[index] = !(idata[index] == 0); } /** @@ -32,8 +38,17 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + if (bools == NULL) odata[indices[index]] = idata[index]; + else if (bools[index] == 1) odata[indices[index]] = idata[index]; } + + } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..e516c4a 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -3,10 +3,14 @@ #include #include #include +#include +#include #define FILENAME (strrchr(__FILE__, '/') ? 
strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define blockSize 128 + /** * Check for CUDA errors; print and exit if there was a problem. */ @@ -32,4 +36,4 @@ namespace Common { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices); } -} +} \ No newline at end of file diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..0c53b61 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,4 +1,5 @@ #include +#include "common.h" #include "cpu.h" namespace StreamCompaction { @@ -8,8 +9,20 @@ namespace CPU { * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + for (int i = 0; i < n-1; i++) { + odata[i + 1] = odata[i] + idata[i]; + } +} + +/** +* CPU scatter. +*/ +void scatter(int n, int *odata, + const int *idata, const int *bools, const int *indices) { + + for (int i = 0; i < n - 1; i++) { + if (bools[i] == 1) odata[indices[i]] = idata[i]; + } } /** @@ -18,8 +31,27 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; + int r = 0; + + for (int i = 0; i < n-1; i++){ + if (idata[i] != 0) { + odata[r++] = idata[i]; + } + } + + return r; +} + +void printArray(int n, int *a, bool abridged = false) { + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... "); + } + printf("%3d ", a[i]); + } + printf("]\n"); } /** @@ -27,9 +59,86 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * * @returns the number of elements remaining after compaction. 
*/ -int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; +int compactWithScan(const int n, int *odata, const int *idata) { + + // create arrays + int *indices = new int[n]; + int *bools = new int[n]; + int rtn = -1; + indices[0] = 0; + + for (int i = 0; i < n; i++) { + bools[i] = !(idata[i] == 0); + } + + scan(n, indices, bools); + scatter(n, odata, idata, bools, indices); + rtn = indices[n - 1]; + + delete indices; + delete bools; + + return rtn; +} + +void TestScan(int n, int *odata, const int *idata) { + + double time = 0; + int samp = 1000; + + for (int i = 0; i < samp; i++) { + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + scan(n, odata, idata); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + + time += elapsed_seconds.count() * 1000 / samp; + } + printf(" %f\n", time); + +} + +void TestCompact(int n, int *odata, const int *idata) { + + double time = 0; + int samp = 1000; + + for (int i = 0; i < samp; i++) { + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + compactWithScan(n, odata, idata); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + + time += elapsed_seconds.count() * 1000 / samp; + } + printf(" %f\n", time); + +} + +void TestCompactWithoutScan(int n, int *odata, const int *idata) { + + double time = 0; + int samp = 1000; + + for (int i = 0; i < samp; i++) { + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + compactWithoutScan(n, odata, idata); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + + time += elapsed_seconds.count() * 1000 / samp; + } + printf(" %f\n", time); + } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 6348bf3..f6d8cc3 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -7,5 +7,10 @@ namespace CPU { int 
compactWithoutScan(int n, int *odata, const int *idata); int compactWithScan(int n, int *odata, const int *idata); + + + void TestScan(int n, int *odata, const int *idata); + void TestCompact(int n, int *odata, const int *idata); + void TestCompactWithoutScan(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..cff875b 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -6,14 +6,96 @@ namespace StreamCompaction { namespace Efficient { -// TODO: __global__ +void printArray(int n, int *a, bool abridged = false) { + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... "); + } + printf("%3d ", a[i]); + } + printf("]\n"); +} + +__global__ void kernUpStep(int n, int d, int *data) { + + int s = 1 << (d + 1); + int index = (threadIdx.x + (blockIdx.x * blockDim.x)) *s; + + if (index >= n) { + return; + } + + //if (fmod((double) index+1, (double)s) == 0) { + data[index + s - 1] += data[index + s / 2 - 1]; + //} +} + +__global__ void kernDownStep(int n, int d, int *data) { + + int s = 1 << (d + 1); + int index = (threadIdx.x + (blockIdx.x * blockDim.x)) * s; + + if (index >= n) { + return; + } + + + //if (fmod((double) index, (double)s) == 0) { + int t = data[index + s / 2 - 1]; + data[index + s / 2 - 1] = data[index + s - 1]; + data[index + s - 1] += t; + //} +} /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ +* Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+*/ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + // create device arrays + int *dev_out; + + cudaMalloc((void**)&dev_out, n*sizeof(int)); + cudaMemcpy(dev_out, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + scan_dev(n, dev_out); + + cudaMemcpy(odata, dev_out, n*sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_out); +} + +/** +* Performs prefix-sum (aka scan) on idata, storing the result into odata. +* For use with arrays intiialized on GPU already. +*/ +void scan_dev(int n, int *dev_in) { + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + // create device arrays to pad to power of 2 size array + //int pot = pow(2, ilog2ceil(n)); + int pot = 1 << ilog2ceil(n); + int *dev_data; + cudaMalloc((void**)&dev_data, pot*sizeof(int)); + cudaMemset(dev_data, 0, pot*sizeof(int)); + cudaMemcpy(dev_data, dev_in, n*sizeof(int), cudaMemcpyDeviceToDevice); + + + int d = 0; + for (d; d < ilog2ceil(pot); d++) { + fullBlocksPerGrid.x = ((pot / pow(2, d+1) + blockSize - 1) / blockSize); + kernUpStep << < fullBlocksPerGrid, blockSize >> >(pot, d, dev_data); + } + + cudaMemset(&dev_data[pot - 1], 0, sizeof(int)); + for (d = ilog2ceil(pot); d >= 0; d--) { + fullBlocksPerGrid.x = ((pot / pow(2, d+1) + blockSize - 1) / blockSize); + kernDownStep << < fullBlocksPerGrid, blockSize >> >(pot, d, dev_data); + } + + + cudaMemcpy(dev_in, dev_data, n*sizeof(int), cudaMemcpyDeviceToDevice); + cudaFree(dev_data); } /** @@ -26,8 +108,137 @@ void scan(int n, int *odata, const int *idata) { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { - // TODO - return -1; + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + // create device arrays + int *dev_out; + int *dev_in; + int *dev_indices; + int *dev_bools; + int rtn = -1; + + cudaMalloc((void**)&dev_out, n*sizeof(int)); + cudaMalloc((void**)&dev_in, n*sizeof(int)); + cudaMalloc((void**)&dev_indices, n*sizeof(int)); + cudaMalloc((void**)&dev_bools, n*sizeof(int)); + + + cudaMemcpy(dev_in, idata, n*sizeof(int), cudaMemcpyHostToDevice); + StreamCompaction::Common::kernMapToBoolean << < fullBlocksPerGrid, blockSize >> >(n, dev_bools, dev_in); + + // scan without wasteful device-host-device write + cudaMemcpy(dev_indices, dev_bools, n*sizeof(int), cudaMemcpyDeviceToDevice); + scan_dev(n, dev_indices); + + // scatter + StreamCompaction::Common::kernScatter << < fullBlocksPerGrid, blockSize >> >(n, dev_out, dev_in, dev_bools, dev_indices); + + cudaMemcpy(odata, dev_out, n*sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&rtn, &dev_indices[n-1], sizeof(int), cudaMemcpyDeviceToHost); + + + cudaFree(dev_out); + cudaFree(dev_in); + cudaFree(dev_bools); + cudaFree(dev_indices); + + return rtn; +} + + +int compact_dev(int n, int *dev_out, const int *dev_in) { + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + // create device arrays + int *dev_indices; + int *dev_bools; + int rtn = -1; + + cudaMalloc((void**)&dev_indices, n*sizeof(int)); + cudaMalloc((void**)&dev_bools, n*sizeof(int)); + + StreamCompaction::Common::kernMapToBoolean << < fullBlocksPerGrid, blockSize >> >(n, dev_bools, dev_in); + + // scan without wasteful device-host-device write + cudaMemcpy(dev_indices, dev_bools, n*sizeof(int), cudaMemcpyDeviceToDevice); + scan_dev(n, dev_indices); + + // scatter + StreamCompaction::Common::kernScatter << < fullBlocksPerGrid, blockSize >> >(n, dev_out, dev_in, dev_bools, dev_indices); + + cudaMemcpy(&rtn, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + + 
cudaFree(dev_bools); + cudaFree(dev_indices); + + return rtn; +} + + +void TestScan(int n, int *odata, const int *idata) { + + double time = 0; + int samp = 1000; + + int *dev_out; + + cudaMalloc((void**)&dev_out, n*sizeof(int)); + cudaMemcpy(dev_out, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + + for (int i = 0; i < samp; i++) { + cudaMemcpy(dev_out, idata, n*sizeof(int), cudaMemcpyHostToDevice); + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + scan_dev(n, dev_out); + + cudaThreadSynchronize(); // block until kernel is finished + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + + time += elapsed_seconds.count() * 1000 / samp; + } + + cudaMemcpy(odata, dev_out, n*sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_out); + printf(" %f\n", time); + +} + +void TestCompact(int n, int *odata, const int *idata) { + + double time = 0; + int samp = 1000; + + int *dev_out; + int *dev_in; + cudaMalloc((void**)&dev_out, n*sizeof(int)); + cudaMalloc((void**)&dev_in, n*sizeof(int)); + + for (int i = 0; i < samp; i++) { + cudaMemcpy(dev_in, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + compact_dev(n, odata, idata); + cudaThreadSynchronize(); // block until kernel is finished + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + + time += elapsed_seconds.count() * 1000 / samp; + } + + cudaMemcpy(odata, dev_out, n*sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_out); + cudaFree(dev_in); + + printf(" %f\n", time); + } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 395ba10..a92f1d9 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,8 +2,13 @@ namespace StreamCompaction { namespace Efficient { - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int 
*idata); + void scan_dev(int n, int *dev_data); - int compact(int n, int *odata, const int *idata); + int compact(int n, int *odata, const int *idata); + int compact_dev(int n, int *dev_out, const int *dev_in); + + void TestScan(int n, int *odata, const int *idata); + void TestCompact(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..530ebcd 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -6,15 +6,93 @@ namespace StreamCompaction { namespace Naive { -// TODO: __global__ +__global__ void kernScanStep(int n, int d, int *odata, const int *idata) { + + int s = 1 << (d - 1); + int index = threadIdx.x + (blockIdx.x * blockDim.x) + s; + + if (index >= n) { + return; + } + + + //if (index >= s) { + odata[index] = idata[index] + idata[index - s]; + //} +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + + // create device arrays + int *dev_in; + int *dev_out; + + cudaMalloc((void**)&dev_in, n*sizeof(int)); + cudaMalloc((void**)&dev_out, n*sizeof(int)); + + cudaMemcpy(dev_in, idata, n*sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_out, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + scan_dev(n, dev_out, dev_in); + + cudaMemcpy(&odata[1], dev_out, (n - 1)*sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_in); + cudaFree(dev_out); +} + +void scan_dev(int n, int *dev_out, int *dev_in) { + dim3 fullBlocksPerGrid; + + for (int d = 1; d <= ilog2ceil(n); d++) { + int s = 1 << (d - 1); + fullBlocksPerGrid = (n - s + blockSize - 1) / blockSize; + + kernScanStep << < fullBlocksPerGrid, blockSize >> >(n, d, dev_out, dev_in); + cudaMemcpy(dev_in, dev_out, n*sizeof(int), cudaMemcpyDeviceToDevice); + } +} + +void TestScan(int n, int *odata, const int *idata) { + + double time = 0; + int samp = 1000; + + // create device arrays + int *dev_in; + int *dev_out; + 
+ cudaMalloc((void**)&dev_in, n*sizeof(int)); + cudaMalloc((void**)&dev_out, n*sizeof(int)); + + cudaMemcpy(dev_in, idata, n*sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_out, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + for (int i = 0; i < samp; i++) { + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + scan_dev(n, dev_out, dev_in); + cudaThreadSynchronize(); // block until kernel is finished + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + + time += elapsed_seconds.count() * 1000 / samp; + } + + cudaMemcpy(&odata[1], dev_out, (n - 1)*sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_in); + cudaFree(dev_out); + + printf(" %f\n", time ); + } } } + diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 21152d6..4e4614f 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -2,6 +2,9 @@ namespace StreamCompaction { namespace Naive { - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata); + void scan_dev(int n, int *dev_out, int *dev_in); + + void TestScan(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/sort.cu b/stream_compaction/sort.cu new file mode 100644 index 0000000..f780d05 --- /dev/null +++ b/stream_compaction/sort.cu @@ -0,0 +1,173 @@ +#include +#include +#include "common.h" +#include "efficient.h" +#include "naive.h" +#include "sort.h" + +namespace StreamCompaction { +namespace Sort { + +void printArray(int n, int *a, bool abridged = false) { + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... 
"); + } + printf("%3d ", a[i]); + } + printf("]\n"); +} + + +__global__ void kernGetBit(int n, int d, int *bits, int *nbits, const int *data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + int s = (data[index] & (1 << d)) >> d; + + bits[index] = s; + nbits[index] = 1 - s; +} + + +__global__ void kernScanTrue(int n, int *trues, const int *falses, const int *lastBit) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + trues[index] = index - falses[index] + falses[n - 1] + lastBit[n - 1]; +} + +__global__ void kernDestination(int n, int *dest, int *trues, int *falses, int* bits) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + dest[index] = bits[index] ? trues[index] : falses[index]; +} + + +void radix(int n, const int k, int *odata, const int *idata) { + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + // create device arrays + int *dev_out; + int *dev_in; + int *dev_bits; + int *dev_nbits; + int *dev_true; + int *dev_false; + int *dev_dest; + int *dev_all; + + cudaMalloc((void**)&dev_out, n*sizeof(int)); + cudaMalloc((void**)&dev_in, n*sizeof(int)); + cudaMalloc((void**)&dev_bits, n*sizeof(int)); + cudaMalloc((void**)&dev_nbits, n*sizeof(int)); + cudaMalloc((void**)&dev_true, n*sizeof(int)); + cudaMalloc((void**)&dev_false, n*sizeof(int)); + cudaMalloc((void**)&dev_dest, n*sizeof(int)); + cudaMalloc((void**)&dev_all, n*sizeof(int)); + + cudaMemset(dev_all, 0, n); + cudaMemcpy(dev_in, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + for (int d = 0; d < k; d++) { + kernGetBit << > >(n, d, dev_bits, dev_nbits, dev_in); + cudaMemcpy(dev_false, dev_nbits, n*sizeof(int), cudaMemcpyDeviceToDevice); + StreamCompaction::Efficient::scan_dev(n, dev_false); + kernScanTrue << > >(n, dev_true, dev_false, dev_nbits); + kernDestination << > >(n, dev_dest, dev_true, dev_false, dev_bits); + StreamCompaction::Common::kernScatter << > 
>(n, dev_out, dev_in, NULL, dev_dest); + cudaMemcpy(dev_in, dev_out, n*sizeof(int), cudaMemcpyDeviceToDevice); + } + + cudaMemcpy(odata, dev_out, n*sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_out); + cudaFree(dev_in); + cudaFree(dev_bits); + cudaFree(dev_nbits); + cudaFree(dev_true); + cudaFree(dev_false); + cudaFree(dev_dest); + cudaFree(dev_all); +} + + +void radix_dev(int n, const int k, int *dev_out, int *dev_in) { + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + // create device arrays + int *dev_bits; + int *dev_nbits; + int *dev_true; + int *dev_false; + int *dev_dest; + int *dev_all; + + cudaMalloc((void**)&dev_bits, n*sizeof(int)); + cudaMalloc((void**)&dev_nbits, n*sizeof(int)); + cudaMalloc((void**)&dev_true, n*sizeof(int)); + cudaMalloc((void**)&dev_false, n*sizeof(int)); + cudaMalloc((void**)&dev_dest, n*sizeof(int)); + cudaMalloc((void**)&dev_all, n*sizeof(int)); + + cudaMemset(dev_all, 0, n); + + for (int d = 0; d < k; d++) { + kernGetBit << > >(n, d, dev_bits, dev_nbits, dev_in); + cudaMemcpy(dev_false, dev_nbits, n*sizeof(int), cudaMemcpyDeviceToDevice); + StreamCompaction::Efficient::scan_dev(n, dev_false); + kernScanTrue << > >(n, dev_true, dev_false, dev_nbits); + kernDestination << > >(n, dev_dest, dev_true, dev_false, dev_bits); + StreamCompaction::Common::kernScatter << > >(n, dev_out, dev_in, NULL, dev_dest); + cudaMemcpy(dev_in, dev_out, n*sizeof(int), cudaMemcpyDeviceToDevice); + } + + cudaFree(dev_bits); + cudaFree(dev_nbits); + cudaFree(dev_true); + cudaFree(dev_false); + cudaFree(dev_dest); + cudaFree(dev_all); +} + + +void TestSort(int n, const int k, int *odata, const int *idata) { + + double time = 0; + int samp = 1000; + + int *dev_out; + int *dev_in; + cudaMalloc((void**)&dev_out, n*sizeof(int)); + cudaMalloc((void**)&dev_in, n*sizeof(int)); + cudaMemcpy(dev_in, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + for (int i = 0; i < samp; i++) { + std::chrono::time_point start, end; + start = 
std::chrono::system_clock::now();
+
+        radix_dev(n, k, dev_out, dev_in);
+        cudaDeviceSynchronize(); // block until kernel is finished
+
+        end = std::chrono::system_clock::now();
+        std::chrono::duration<double> elapsed_seconds = end - start;
+
+        time += elapsed_seconds.count() * 1000 / samp;
+    }
+
+    cudaMemcpy(odata, dev_out, n*sizeof(int), cudaMemcpyDeviceToHost);
+    printf("   %f\n", time);
+
+    // NOTE(review): added — the original leaked both device buffers
+    cudaFree(dev_out);
+    cudaFree(dev_in);
+}
+
+}
+}
\ No newline at end of file
diff --git a/stream_compaction/sort.h b/stream_compaction/sort.h
new file mode 100644
index 0000000..ec3bd9f
--- /dev/null
+++ b/stream_compaction/sort.h
@@ -0,0 +1,10 @@
+#pragma once
+
+namespace StreamCompaction {
+    namespace Sort {
+        void radix(int n, const int k, int *odata, const int *idata);
+        void radix_dev(int n, const int k, int *dev_out, int *dev_in);
+
+        void TestSort(int n, const int k, int *odata, const int *idata);
+    }
+}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..721d425 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -3,6 +3,7 @@
 #include <cuda.h>
 #include <thrust/device_vector.h>
 #include <thrust/scan.h>
+#include <chrono>  // NOTE(review): header names were lost in formatting; inferred from usage — confirm against repo
 #include "common.h"
 #include "thrust.h"
@@ -13,9 +14,85 @@
 namespace Thrust {
 
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO use `thrust::exclusive_scan`
-    // example: for device_vectors dv_in and dv_out:
-    // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+    thrust::device_vector<int> dv_in(idata, idata + n);
+    thrust::device_vector<int> dv_out(odata, odata + n);
+
+    thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+    thrust::copy(dv_out.begin(), dv_out.end(), odata);
+}
+
+// Timing harness: averages `samp` runs of thrust::exclusive_scan (ms).
+void TestScan(int n, int *odata, const int *idata) {
+
+    double time = 0;
+    int samp = 1000;
+
+    thrust::device_vector<int> dv_in(idata, idata + n);
+    thrust::device_vector<int> dv_out(odata, odata + n);
+
+    for (int i = 0; i < samp; i++) {
+
+        std::chrono::time_point<std::chrono::system_clock> start, end;
+        start = std::chrono::system_clock::now();
+
+        thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+        cudaDeviceSynchronize(); // block until kernel is finished
+
+        end = std::chrono::system_clock::now();
+        std::chrono::duration<double> elapsed_seconds = end - start;
+
+        time += elapsed_seconds.count() * 1000 / samp;
+    }
+
+    thrust::copy(dv_out.begin(), dv_out.end(), odata);
+    printf("   %f\n", time);
+
+}
+
+// Timing harness for thrust::stable_sort on host data (ms, averaged).
+void TestSortStable(int n, int *odata, const int *idata) {
+
+    double time = 0;
+    int samp = 1000;
+
+    for (int i = 0; i < samp; i++) {
+        memcpy(odata, idata, n*sizeof(int)); // re-shuffle input each pass
+
+        std::chrono::time_point<std::chrono::system_clock> start, end;
+        start = std::chrono::system_clock::now();
+
+        thrust::stable_sort(odata, odata + n);
+        cudaDeviceSynchronize(); // block until kernel is finished
+
+        end = std::chrono::system_clock::now();
+        std::chrono::duration<double> elapsed_seconds = end - start;
+
+        time += elapsed_seconds.count() * 1000 / samp;
+    }
+    printf("   %f\n", time);
+}
+
+// Timing harness for thrust::sort on host data (ms, averaged).
+void TestSortUnstable(int n, int *odata, const int *idata) {
+
+    double time = 0;
+    int samp = 1000;
+
+    for (int i = 0; i < samp; i++) {
+        memcpy(odata, idata, n*sizeof(int)); // re-shuffle input each pass
+
+        std::chrono::time_point<std::chrono::system_clock> start, end;
+        start = std::chrono::system_clock::now();
+
+        thrust::sort(odata, odata + n);
+        cudaDeviceSynchronize(); // block until kernel is finished
+
+        end = std::chrono::system_clock::now();
+        std::chrono::duration<double> elapsed_seconds = end - start;
+
+        time += elapsed_seconds.count() * 1000 / samp;
+    }
+    printf("   %f\n", time);
+}
 
 }
 }
diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h
index 06707f3..5790b94 100644
--- a/stream_compaction/thrust.h
+++ b/stream_compaction/thrust.h
@@ -2,6 +2,11 @@
 
 namespace StreamCompaction {
 namespace Thrust {
-    void scan(int n, int *odata, const int *idata);
+    void scan(int n, int *odata, const int *idata);
+
+    void TestScan(int n, int *odata, const int *idata);
+    void TestSortStable(int n, int *odata, const int *idata);
+    void TestSortUnstable(int n, int *odata, const int *idata);
+
 }
}