diff --git a/README.md b/README.md index b71c458..d0087b1 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,100 @@ -CUDA Stream Compaction +University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2 CUDA Stream Compaction ====================== +* Ziyu Li +* Tested on: Windows 7, i7-3840QM @ 2.8GHz 16GB, Nvidia Quadro K4000M 4096MB (Personal Laptop) -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +## Performance Analysis +#### Efficient Scan without Optimization +This implementation is achieved by reduction and down-sweep in GPU. The performance of this method is much better than naive scan but actually still slow compared to the CPU. -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +For the benchmark and result, please check **Performance** section -### (TODO: Your README) +#### Efficient Scan with Optimization +To avoid the efficient scan method launching extra non-working threads, simply change the index pattern used to perform the kernels. This optimization avoids a huge number of threads performing useless operations and increases the overall performance. -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +For the benchmark and result, please check **Performance** section + +#### More Efficient Scan with Shared Memory +The optimized method stated above is still not efficient enough. Performing the operations in shared memory can maximize the performance. The whole implementation can be split into three parts. 
+ +* Scan each block separately and use an auxiliary array to store each block sum +* Scan the block sums +* Add each scanned block sum to the next scanned block + +![](img/39fig06.jpg) + +(Figure 1: Algorithm for Performing a Sum Scan on a Large Array of Values, Nvidia GPU Gems) + + +This implementation is relatively easy to achieve; however, using shared memory will sometimes suffer from bank conflicts, which could hurt the performance significantly on every memory access. To avoid these bank conflicts, we have to add padding to shared memory every certain number of elements. Those offsets can be easily implemented by a macro. + +```c++ +#define CONFLICT_FREE_OFFSET(n) \ ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS)) +``` + +For the benchmark and result, please check **Performance** section + +#### Radix Sort +One of the most significant applications of GPU scan is radix sort, which is a sorting algorithm for parallel processors. + +To use the radix sort function, please call the function below: +```c++ +StreamCompaction::Radix::sort(int n, int *odata, int *idata); +``` +* The first argument is the array size. (input) +* The second argument is the sorted array. (output) +* The third argument is the unsorted array. (input) + +For the benchmark and result, please check **Performance** section + +## Performance +#### Scan Performance Measurement and Result +The benchmark performs the scan operation with 128 threads per block for array sizes from 2^4 to 2^22. (Since there is only one grid, 2^22 is the maximum amount for a 128 block size.) + +The benchmark also makes a running time comparison between CPU, GPU Naive, GPU Efficient, GPU Efficient With Optimization, GPU Efficient With Shared Memory and GPU Thrust Scan. 
+ +![](img/scan_power_2.PNG) + +![](img/scan_power_not_2.PNG) + +![](img/scan.PNG) + +(For the detailed results, please check the data in the **benckmark** folder) + +#### Compact Performance Measurement and Result +The benchmark performs the compaction operation with 128-512 threads per block for array sizes from 2^4 to 2^24. (128 block size for array size 2^4 to 2^22, 256 block size for 2^23 and 512 block size for 2^24) + +The benchmark also makes a running time comparison between CPU without scan, CPU with scan and GPU with scan. + +![](img/compaction_2.PNG) + +![](img/compact.PNG) + +(For the detailed results, please check the data in the **benckmark** folder) + +#### Radix Sort Performance Measurement and Result +The benchmark performs the radix sort operation with 128 threads per block for array sizes from 2^4 to 2^24. + +The benchmark makes a running time comparison between CPU 3-part hybrid sort (standard sort function in STL) and GPU radix sort. + +![](img/radix_c.PNG) + +![](img/radix_result.PNG) + +## Questions +#### Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU. + +Based on large-size (larger than 2^20) benchmark results, the optimal block sizes for each implementation are: + +| Methods | Naive | Efficient | Efficient (Optimize) | Efficient (ShareMem) | Thrust | +|:----------:|-------|-----------|----------------------|----------------------|--------| +| Block Size | 1024 | 128 | 128 | 256 | 1024 | + +#### Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison + +For the Thrust implementation, the highest occupancy in the GPU is cudaMalloc and cudaMemcpy related function calls, based on the Nsight timeline. However, the three most significant functions in the Thrust scan — *accumulate_tiles*, *exclusive_scan_n* and *exclusive_downsweep* — do not really use much GPU time. 
+ +I believe the performance bottleneck is memory bandwidth for the Thrust scan. The computation time compared to the memory I/O time is trivial. As for my implementation, the efficient method wastes a huge amount of time on launching non-working threads. For efficient with optimization, the memory I/O becomes the most inefficient factor in the whole system. Using shared memory can greatly increase memory I/O efficiency and decrease memory latency to achieve maximum efficiency. + +For the benchmark and graph, please check **Performance** section \ No newline at end of file diff --git a/img/39fig06.jpg b/img/39fig06.jpg new file mode 100644 index 0000000..973e531 Binary files /dev/null and b/img/39fig06.jpg differ diff --git a/img/compact.PNG b/img/compact.PNG new file mode 100644 index 0000000..2e70da4 Binary files /dev/null and b/img/compact.PNG differ diff --git a/img/compaction_2.PNG b/img/compaction_2.PNG new file mode 100644 index 0000000..ff64de0 Binary files /dev/null and b/img/compaction_2.PNG differ diff --git a/img/radix_c.PNG b/img/radix_c.PNG new file mode 100644 index 0000000..826f895 Binary files /dev/null and b/img/radix_c.PNG differ diff --git a/img/radix_result.PNG b/img/radix_result.PNG new file mode 100644 index 0000000..a9ab741 Binary files /dev/null and b/img/radix_result.PNG differ diff --git a/img/scan.PNG b/img/scan.PNG new file mode 100644 index 0000000..a967ae5 Binary files /dev/null and b/img/scan.PNG differ diff --git a/img/scan_power_2.PNG b/img/scan_power_2.PNG new file mode 100644 index 0000000..4f4b845 Binary files /dev/null and b/img/scan_power_2.PNG differ diff --git a/img/scan_power_not_2.PNG b/img/scan_power_not_2.PNG new file mode 100644 index 0000000..0e780b7 Binary files /dev/null and b/img/scan_power_not_2.PNG differ diff --git a/src/main.cpp b/src/main.cpp index 7305641..9b930b6 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,15 +11,39 @@ #include #include #include +#include +#include #include "testing_helpers.hpp" -const int SIZE = 
1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int a[SIZE], b[SIZE], c[SIZE]; +int SIZE = 1 << 20; // feel free to change the size of array +int NPOT = SIZE - 3; // Non-Power-Of-Two -int main(int argc, char* argv[]) { - // Scan tests +StreamCompaction::Common::PerformanceTimer& timer(); + +using StreamCompaction::Common::PerformanceTimer; +PerformanceTimer& timer() +{ + static PerformanceTimer timer; + return timer; +} +int main(int argc, char* argv[]) { + int *a, *b, *c; + a = (int *)malloc(SIZE * sizeof(int)); + b = (int *)malloc(SIZE * sizeof(int)); + c = (int *)malloc(SIZE * sizeof(int)); + + // Scan tests + + if (argc == 2) { + if (atoi(argv[1]) < 0) { + printf("---------------------------------------------------"); + SIZE = 1 << (-1 * atoi(argv[1])); + //printf("---bash test: %i----\n", -1 * atoi(argv[1])); + } + } + + printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); @@ -32,6 +56,7 @@ int main(int argc, char* argv[]) { // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. // At first all cases passed because b && c are all zeroes. 
+ zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); StreamCompaction::CPU::scan(SIZE, b, a); @@ -59,20 +84,49 @@ int main(int argc, char* argv[]) { //printArray(SIZE, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("work-efficient scan, power-of-two"); + StreamCompaction::Efficient::scan0(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::scan0(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); + printDesc("work-efficient scan with optimization, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); + printDesc("work-efficient scan with optimization, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("work-efficient scan with SHARE MEMORY and optimization, power-of-two"); + StreamCompaction::Efficient::scan_s(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan with SHARE MEMORY and optimization, non-power-of-two"); + 
StreamCompaction::Efficient::scan_s(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); @@ -86,7 +140,7 @@ int main(int argc, char* argv[]) { printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - + printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); @@ -119,12 +173,20 @@ int main(int argc, char* argv[]) { printCmpLenResult(count, expectedNPOT, b, c); zeroArray(SIZE, c); - printDesc("cpu compact with scan"); + printDesc("cpu compact with scan, power-of-two"); count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + zeroArray(SIZE, c); + printDesc("cpu compact with scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithScan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + expectedNPOT = count; + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); @@ -139,5 +201,44 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + + + printf("\n"); + printf("*****************************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("*****************************\n"); + + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + 
printArray(SIZE, a, true); + + memcpy(c, a, SIZE * sizeof(int)); + timer().startCpuTimer(); + std::sort(c, c + SIZE); + timer().endCpuTimer(); + printElapsedTime(timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + + zeroArray(SIZE, b); + printDesc("radix sort, power-of-two"); + count = SIZE; + StreamCompaction::Radix::sort(SIZE, b, a); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(cuda Measured)"); + expectedCount = count; + printArray(count, b, true); + printCmpLenResult(count, expectedCount, b, c); + + memcpy(c, a, SIZE * sizeof(int)); + std::sort(c, c + NPOT); + zeroArray(SIZE, b); + printDesc("radix sort, not-power-of-two"); + StreamCompaction::Radix::sort(NPOT, b, a); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(cuda Measured)"); + printArray(NPOT, b, true); + printCmpLenResult(NPOT, NPOT, b, c); + + + free(a); + free(b); + free(c); system("pause"); // stop Win32 console from closing on exit } diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..bcc484e 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -9,6 +9,8 @@ set(SOURCE_FILES "efficient.cu" "thrust.h" "thrust.cu" + "radix.h" + "radix.cu" ) cuda_add_library(stream_compaction diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..e290a6f 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,12 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. 
*/ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + + bools[idx] = (bool)idata[idx]; } /** @@ -32,7 +37,13 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + if (bools[idx]) { + odata[indices[idx]] = idata[idx]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..c23d95a 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -12,6 +12,8 @@ namespace StreamCompaction { return timer; } + + /** * CPU scan (prefix sum). * For performance analysis, this is supposed to be a simple for loop. @@ -19,10 +21,31 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } + timer().endCpuTimer(); } + void scan_wo_timer(int n, int *odata, const int *idata) { + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } + } + + void scan_incusive(int n, int *odata, const int *idata) { + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } + } + + + /** * CPU stream compaction without using the scan function. 
* @@ -30,9 +53,18 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int nRemains = 0; + + for (int i = 0; i < n; i++) { + if (idata[i]) { + odata[nRemains++] = idata[i]; + } + } + timer().endCpuTimer(); - return -1; + return nRemains; + } /** @@ -42,9 +74,28 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; + + int *temp = new int[n]; + for (int i = 0; i < n; i++) { + temp[i] = (idata[i] != 0); + } + + int *temp_scan = new int[n]; + scan_wo_timer(n, temp_scan, temp); + + int nRemains = 0; + + for (int i = 0; i < n; i++) { + if (temp[i]) { + odata[temp_scan[i]] = idata[i]; + nRemains++; + } + } + + delete[] temp, temp_scan; + + timer().endCpuTimer(); + return nRemains; } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 236ce11..a120163 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -8,6 +8,8 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata); + void scan_incusive(int n, int *odata, const int *idata); + int compactWithoutScan(int n, int *odata, const int *idata); int compactWithScan(int n, int *odata, const int *idata); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..dfb4939 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,23 @@ #include #include "common.h" #include "efficient.h" +#include "cpu.h" +#include "naive.h" + +#define BLOCKSIZE 256 +#define SOLVE_BANK_CONFLICTS 1 + +#if BLOCKSIZE > 1024 +#error "Warning: Blocksize cannot excess 1024!" 
+#endif + +#define NUM_BANKS 16 +#define LOG_NUM_BANKS 4 +#define CONFLICT_FREE_OFFSET(n) \ + ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS)) + +typedef int var_t; +int scan_timing = 1; namespace StreamCompaction { namespace Efficient { @@ -12,29 +29,392 @@ namespace StreamCompaction { return timer; } +#pragma region PrescanEfficientWithShareMemory + __global__ void scanWithShareMem(int n, int * idata, int * odata, int * aux) { + + // Declare Share Memory + #if SOLVE_BANK_CONFLICTS + __shared__ var_t temp[BLOCKSIZE + NUM_BANKS]; + #else + __shared__ var_t temp[BLOCKSIZE]; + #endif + + // Declare necessary variables + const var_t tid = threadIdx.x; + const var_t bid = blockIdx.x; + const var_t s = bid * BLOCKSIZE; + var_t offset = 1; + + // Copy Global Memory to Share Memory + #if SOLVE_BANK_CONFLICTS + var_t ai = tid << 1; + var_t bi = ai + 1; + var_t bankOffsetA = CONFLICT_FREE_OFFSET(ai); + var_t bankOffsetB = CONFLICT_FREE_OFFSET(bi); + temp[ai + bankOffsetA] = idata[ai + s]; + temp[bi + bankOffsetB] = idata[bi + s]; + #else + temp[2 * tid] = idata[2 * tid + s]; + temp[2 * tid + 1] = idata[2 * tid + s + 1]; + #endif + + // Reduction Phase + for (var_t d = BLOCKSIZE >> 1; d > 0; d >>= 1) + { + __syncthreads(); + if (tid < d) + { + var_t ai = offset*((tid << 1) + 1) - 1; + var_t bi = offset*((tid << 1) + 2) - 1; + #if SOLVE_BANK_CONFLICTS + ai += CONFLICT_FREE_OFFSET(ai); + bi += CONFLICT_FREE_OFFSET(bi); + #endif + temp[bi] += temp[ai]; + } + offset <<= 1; + } + + // Copy Block Sum to Aux Array + __syncthreads(); + if (!tid) { + #if SOLVE_BANK_CONFLICTS + aux[bid] = temp[BLOCKSIZE - 1 + CONFLICT_FREE_OFFSET(BLOCKSIZE - 1)]; + temp[BLOCKSIZE - 1 + CONFLICT_FREE_OFFSET(BLOCKSIZE - 1)] = 0; + #else + aux[bid] = temp[BLOCKSIZE - 1]; + temp[BLOCKSIZE - 1] = 0; + #endif + } + + for (var_t d = 1; d < BLOCKSIZE; d <<= 1) + { + offset >>= 1; + __syncthreads(); + if (tid < d) + { + int ai = offset*((tid << 1) + 1) - 1; + int bi = offset*((tid << 1) + 2) - 1; + #if 
SOLVE_BANK_CONFLICTS + ai += CONFLICT_FREE_OFFSET(ai); + bi += CONFLICT_FREE_OFFSET(bi); + #endif + var_t t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + + __syncthreads(); + #if SOLVE_BANK_CONFLICTS + ai = tid << 1; + bi = ai + 1; + bankOffsetA = CONFLICT_FREE_OFFSET(ai); + bankOffsetB = CONFLICT_FREE_OFFSET(bi); + odata[ai + s] = temp[ai + bankOffsetA]; + odata[bi + s] = temp[bi + bankOffsetB]; + #else + odata[2 * tid + s] = temp[2 * tid]; + odata[2 * tid + 1 + s] = temp[2 * tid + 1]; + #endif + } + + __global__ void sumWithBlock(int n, int * idata, int * odata, int * aux) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + int bid = blockIdx.x; + if (idx >= n) { + return; + } + + if(bid > 0) { + odata[idx] = idata[idx] + aux[bid]; + } + else { + odata[idx] = idata[idx]; + } + } + + void scan_s(int n, int *odata, int *idata) { + int dSize, dLen, _n, aSize, aLen; + int *dev_idata, *dev_odata; + int *host_aux, *host_aux_sum; + int *dev_aux, *dev_aux_sum; + + _n = ilog2ceil(n); + dLen = 1 << _n; + dSize = dLen * sizeof(int); + + aLen = ((n + BLOCKSIZE - 1) / BLOCKSIZE); + aSize = aLen * sizeof(int); + + dim3 blocksPerGrid((n + BLOCKSIZE - 1) / (BLOCKSIZE)); + dim3 threadsPerBlocks(BLOCKSIZE / 2); + dim3 blocksPerGrid_aux((aLen + BLOCKSIZE - 1) / (BLOCKSIZE)); + dim3 threadsPerBlocks_aux(BLOCKSIZE / 2); + + // Alloc variables + cudaMalloc((void**)&dev_odata, dSize); + cudaMalloc((void**)&dev_idata, dSize); + cudaMalloc((void**)&dev_aux, aSize); + cudaMalloc((void**)&dev_aux_sum, aSize); + cudaMallocHost((void**)&host_aux, aSize); // Use Pin-Memory to ensure the maximum memory speed + cudaMallocHost((void**)&host_aux_sum, aSize); + + cudaMemcpy(dev_idata, idata, dSize, cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + // Prescan Each Block use GPU + scanWithShareMem << > >(n, dev_idata, dev_odata, dev_aux); + + // Prescan Auxiliary Array use CPU + cudaMemcpy(host_aux, dev_aux, aSize, cudaMemcpyDeviceToHost); + 
StreamCompaction::CPU::scan_incusive(aLen, host_aux_sum, host_aux); + + // Sum up with Blocks use GPU + cudaMemcpy(dev_aux_sum, host_aux_sum, aSize, cudaMemcpyHostToDevice); + sumWithBlock << > >(n, dev_odata, dev_idata, dev_aux_sum); + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_aux); + cudaFree(dev_aux_sum); + cudaFreeHost(host_aux); + cudaFreeHost(host_aux_sum); + } + +#pragma endregion + + +#pragma region PrescanEfficientWithOptimization + __global__ void reduction(const int _d, const int ts, int * idata) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= ts) { + return; + } + + int ai = idx * _d + _d - 1; + int bi = ai - (_d >> 1); + + idata[ai] += idata[bi]; + } + + __global__ void downSweep(const int _d, const int ts, int * idata) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= ts) { + return; + } + + int ai = idx * _d + _d - 1; + int bi = ai - (_d >> 1); + + int t = idata[bi]; + idata[bi] = idata[ai]; + idata[ai] += t; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + + int dSize, dLen, _n; + int *dev_data; + + dim3 threadsPerBlocks(BLOCKSIZE); + + _n = ilog2ceil(n); + dLen = 1 << _n; + dSize = dLen * sizeof(int); + + cudaMalloc((void**)&dev_data, dSize); + cudaMemcpy(dev_data, idata, dSize, cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + int _d, ts; + for (int d = 0; d < _n; d++) { + ts = 1 << (_n - d - 1); + _d = 1 << (d + 1); + + dim3 blocksPerGrid((ts + BLOCKSIZE - 1) / BLOCKSIZE); + reduction<<>>(_d, ts, dev_data); + } + + cudaMemset(dev_data + dLen - 1, 0, sizeof(int)); + + for (int d = _n - 1; d > -1; d--) { + ts = 1 << (_n - d - 1); + _d = 1 << (d + 1); + + dim3 blocksPerGrid((ts + BLOCKSIZE - 1) / BLOCKSIZE); + downSweep<<>>(_d, ts, dev_data); + } + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_data); } - /** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. 
- */ + void scan_dev(int len, int _n, int *data) { + int _d, ts; + dim3 threadsPerBlocks(BLOCKSIZE); + + for (int d = 0; d < _n; d++) { + ts = 1 << (_n - d - 1); + _d = 1 << (d + 1); + + dim3 blocksPerGrid((ts + BLOCKSIZE - 1) / BLOCKSIZE); + reduction << > >(_d, ts, data); + } + + cudaMemset(data + len - 1, 0, sizeof(int)); + + for (int d = _n - 1; d > -1; d--) { + ts = 1 << (_n - d - 1); + _d = 1 << (d + 1); + + dim3 blocksPerGrid((ts + BLOCKSIZE - 1) / BLOCKSIZE); + downSweep << > >(_d, ts, data); + } + } + +#pragma endregion + + +#pragma region PrescanEfficient + __global__ void reduction0(const int _d, const int n, int * idata) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n || (idx % _d)) { + return; + } + + idata[idx + _d - 1] += idata[idx + (_d >> 1) - 1]; + } + + __global__ void downSweep0(const int _d, const int n, int * idata) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n || (idx % _d)) { + return; + } + + int t = idata[idx + (_d >> 1) - 1]; + idata[idx + (_d >> 1) - 1] = idata[idx + _d - 1]; + idata[idx + _d - 1] += t; + } + + void scan0(int n, int *odata, const int *idata) { + + int dSize, dLen, _n; + int *dev_data; + + dim3 threadsPerBlocks(BLOCKSIZE); + + _n = ilog2ceil(n); + dLen = 1 << _n; + dSize = dLen * sizeof(int); + + cudaMalloc((void**)&dev_data, dSize); + cudaMemcpy(dev_data, idata, dSize, cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + int _d; + for (int d = 0; d < _n; d++) { + _d = 1 << (d + 1); + + dim3 blocksPerGrid((dLen + BLOCKSIZE - 1) / BLOCKSIZE); + reduction0 << > >(_d, n, dev_data); + } + + cudaMemset(dev_data + dLen - 1, 0, sizeof(int)); + + for (int d = _n - 1; d > -1; d--) { + _d = 1 << (d + 1); + + dim3 blocksPerGrid((dLen + BLOCKSIZE - 1) / BLOCKSIZE); + downSweep0 << > >(_d, n, dev_data); + } + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_data); + } + +#pragma endregion + + +#pragma region Compaction 
+ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO + + int _n = ilog2ceil(n); + int len = 1 << _n; + int bsize = len * sizeof(int); + int nsize = n * sizeof(int); + + int *dev_in, *dev_out, *dev_bools, *dev_indices; + + cudaMalloc((void**)&dev_in, nsize); + cudaMalloc((void**)&dev_out, nsize); + cudaMalloc((void**)&dev_bools, bsize); + cudaMalloc((void**)&dev_indices, bsize); + + cudaMemcpy(dev_in, idata, nsize, cudaMemcpyHostToDevice); + + cudaMemset(dev_bools, 0, bsize); + + dim3 blocksPerGrid((len + BLOCKSIZE - 1) / (BLOCKSIZE)); + dim3 threadsPerBlocks(BLOCKSIZE); + + scan_timing = 0; + timer().startGpuTimer(); + + // 1 + Common::kernMapToBoolean << > > (n, dev_bools, dev_in); + + // 2 + cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice); + scan_dev(len, _n, dev_indices); + + // 3 + Common::kernScatter << > >(n, dev_out, dev_in, dev_bools, dev_indices); + timer().endGpuTimer(); - return -1; + scan_timing = 1; + + + int *test; + test = (int *)malloc(n * sizeof(int)); + cudaMemcpy(test, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost); + for (int i = 0; i < n; i++) { + //printf("%i\n", test[i]); + } + + // 4 + int s; + cudaMemcpy(&s, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + s = idata[n - 1] ? 
s + 1 : s; + + cudaMemcpy(odata, dev_out, s * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_in); + cudaFree(dev_out); + cudaFree(dev_bools); + cudaFree(dev_indices); + + return s; } + +#pragma endregion + } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..480a3b5 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,8 +6,12 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); + void scan0(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata); + void scan_s(int n, int *odata, int *idata); + int compact(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..31cd82c 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,22 +3,60 @@ #include "common.h" #include "naive.h" +#define BLOCKSIZE 512 + namespace StreamCompaction { namespace Naive { + using StreamCompaction::Common::PerformanceTimer; PerformanceTimer& timer() { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + __global__ void scan(const int n, const int _d, int * idata, int * odata) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + + odata[idx] = idx >= _d ? idata[idx - _d] + idata[idx] : idata[idx]; + + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); - // TODO + + int dSize, dLen; + int *dev_idata, *dev_odata; + + dLen = 1 << ilog2ceil(n); + dSize = dLen * sizeof(int); + + dim3 blocksPerGrid((dLen + BLOCKSIZE - 1) / BLOCKSIZE); + dim3 threadsPerBlocks(BLOCKSIZE); + + cudaMalloc((void**)&dev_idata, dSize); + cudaMalloc((void**)&dev_odata, dSize); + + cudaMemcpy(dev_idata, idata, dSize, cudaMemcpyHostToDevice); + + for (int _d = 1; _d < dLen; _d <<= 1) { + scan <<>>(n, _d, dev_idata, dev_odata); + std::swap(dev_idata, dev_odata); + } + + cudaMemcpy(odata + 1, dev_idata, dSize - sizeof(int), cudaMemcpyDeviceToHost); + odata[0] = 0; + + cudaFree(dev_idata); + cudaFree(dev_odata); + timer().endGpuTimer(); } } diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu new file mode 100644 index 0000000..23c5272 --- /dev/null +++ b/stream_compaction/radix.cu @@ -0,0 +1,153 @@ + +#include +#include +#include +#include +#include +#include "stream_compaction/common.h" +#include "stream_compaction/efficient.h" +#include "stream_compaction/radix.h" + +typedef int var_t; + +#define blocksize 128 + +namespace StreamCompaction { + namespace Radix { + + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + template + T findMax(T* &arr, T n) { + T max = 0; + for (int i = 0; i < n; ++i) { + if (arr[i] > max) { + max = arr[i]; + } + } + return max; + } + + __global__ void getB(const int n, const int t, int *idata, int *odata) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + + odata[idx] = ((idata[idx] >> t) & 1); + } + + __global__ void getE(const int n, const int t, int *bdata, int *edata) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + + edata[idx] = (bdata[idx] ^ 1); + } + + __global__ void getTF(const int n, int *edata, int *fdata, int *odata) { + int idx = threadIdx.x + 
(blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + + odata[idx] = edata[idx] + fdata[idx]; + } + + __global__ void getT(const int n, const int tf, int *fdata, int *odata) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + + odata[idx] = idx - fdata[idx] + tf; + } + + __global__ void getD(const int n, int *bdata, int * tdata, int * fdata, int *odata) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + + odata[idx] = (bdata[idx] ? tdata[idx] : fdata[idx]); + } + + __global__ void refill(const int n, int * d, int * idata, int *odata) { + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + if (idx >= n) { + return; + } + + odata[d[idx]] = idata[idx]; + } + + //template + void sort(int n, int * odata, int * idata) { + + int size; + int *dev_idata, *dev_odata; + int *dev_b, *dev_e, *dev_f, *dev_t, *dev_d; + + size = n * sizeof(int); + + cudaMalloc((void**)&dev_idata, size); + cudaMalloc((void**)&dev_odata, size); + cudaMalloc((void**)&dev_b, size); + cudaMalloc((void**)&dev_e, size); + cudaMalloc((void**)&dev_f, size); + cudaMalloc((void**)&dev_t, size); + cudaMalloc((void**)&dev_d, size); + + cudaMemcpy(dev_idata, idata, size, cudaMemcpyHostToDevice); + + dim3 blocksPerGrid((n + blocksize - 1) / blocksize); + dim3 threadsPerBlock(blocksize); + + int max = findMax(idata, n); + int ndigit = ilog2ceil(max); + + timer().startGpuTimer(); + + for (int i = 0; i < ndigit; i++) { + getB << > >(n, i, dev_idata, dev_b); + + getE << > >(n, i, dev_b, dev_e); + + thrust::device_ptr dev_thrust_e(dev_e); + thrust::device_ptr dev_thrust_f(dev_f); + thrust::exclusive_scan(dev_thrust_e, dev_thrust_e + n, dev_thrust_f); + + int tf, le; + cudaMemcpy(&tf, dev_f + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&le, dev_e + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + tf += le; + + getT << > >(n, tf, dev_f, dev_t); + + getD << > >(n, dev_b, dev_t, dev_f, dev_d); + + refill << > >(n, dev_d, 
dev_idata, dev_odata); + std::swap(dev_idata, dev_odata); + } + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_idata, size, cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_e); + cudaFree(dev_f); + cudaFree(dev_b); + cudaFree(dev_t); + cudaFree(dev_d); + } + + } +} \ No newline at end of file diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h new file mode 100644 index 0000000..04b6c3e --- /dev/null +++ b/stream_compaction/radix.h @@ -0,0 +1,14 @@ +#pragma once + +#include "stream_compaction/common.h" + + +namespace StreamCompaction { + namespace Radix { + StreamCompaction::Common::PerformanceTimer& timer(); + + + void sort(int n, int * odata, int * idata); + + } +} \ No newline at end of file diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..3bfb425 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,25 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + + int *dev_data; + cudaMalloc((void**)&dev_data, n * sizeof(int)); + + thrust::device_vector dev_thrust_idata(idata, idata + n); + thrust::device_vector dev_thrust_odata(odata, odata + n); + + timer().startGpuTimer(); + + thrust::exclusive_scan(dev_thrust_idata.begin(), dev_thrust_idata.end(), dev_thrust_odata.begin()); + thrust::copy(dev_thrust_odata.begin(), dev_thrust_odata.end(), dev_data); + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); } } }