diff --git a/README.md b/README.md
index b71c458..0c3ecf9 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,151 @@ CUDA Stream Compaction
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Name: William Ho
+* Email: willho@seas.upenn.edu
+* Tested on: Windows 7 Professional, i7-6700 @ 3.40GHz, 16.0GB RAM, NVIDIA Quadro K620 (Moore 100C Lab)
 
-### (TODO: Your README)
+## Overview
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+**Stream Compaction** is a process for culling the elements of an array that do not meet a given criterion. It is widely used in compression (audio compression, for example) and has many use cases in computer graphics, such as culling unnecessary rays from a path tracer. A conceptually simple example is shown below, in which we remove `0`s from an array of `int`s:
+
+* Starting Array: {0, 2, 3, 0, 5, 2, 0, 0, 9}
+
+* Compacted Array: {2, 3, 5, 2, 9}
+
+The goal of this project was to implement a few variations of the Stream Compaction algorithm, observe that it is a parallelizable technique well suited to the GPU, and use it as a case study for understanding GPU programming. A common implementation technique for Stream Compaction involves "scanning" an array to produce a corresponding array of exclusive prefix-sums, as the worked example below shows.
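+
+As a worked illustration of how a scan drives compaction, here is the example array pushed through the full map/scan/scatter pipeline:
+
+```
+Input:          { 0, 2, 3, 0, 5, 2, 0, 0, 9 }
+Keep? (bools):  { 0, 1, 1, 0, 1, 1, 0, 0, 1 }
+Exclusive scan: { 0, 0, 1, 2, 2, 3, 4, 4, 4 }
+Scatter:        out[scan[i]] = in[i] wherever bools[i] == 1
+Output:         { 2, 3, 5, 2, 9 }
+```
+
+For every element that survives, its scan value is exactly its index in the output array, which is what makes the whole pipeline parallel-friendly.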
+
+I implement **Scan** (and, using it, Stream Compaction) in three ways: a straightforward CPU implementation, a naively parallelized CUDA implementation, and a work-efficient CUDA implementation. The latter two are adapted from GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html).
+
+#### Features Implemented:
+
+- CPU Scan
+- CPU Compaction
+- Naive GPU Scan
+- Work-Efficient GPU Scan
+- GPU Compaction
+
+#### Tests Passed:
+```
+
+****************
+** SCAN TESTS **
+****************
+    [  14  48  46  18  30  23  38  34  37  24  49  31  22 ...  10   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0.002704ms    (std::chrono Measured)
+    [   0  14  62 108 126 156 179 217 251 288 312 361 392 ... 24618 24628 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.002403ms    (std::chrono Measured)
+    [   0  14  62 108 126 156 179 217 251 288 312 361 392 ... 24581 24594 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 0.195104ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.221728ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.39984ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.375648ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 5.17347ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 1.81082ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   1   2   2   2   3   3   0   3   2   2   3   3   3 ...   0   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.005408ms    (std::chrono Measured)
+    [   1   2   2   2   3   3   3   2   2   3   3   3   2 ...   3   2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.005108ms    (std::chrono Measured)
+    [   1   2   2   2   3   3   3   2   2   3   3   3   2 ...   1   3 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 0.002403ms    (std::chrono Measured)
+    [   1   2   2   2   3   3   3   2   2   3   3   3   2 ...   3   2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0.415616ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0.442624ms    (CUDA Measured)
+    passed
+Press any key to continue . . .
+```
+
+## Additional Implementation Details
+
+The Work-Efficient Scan algorithm reduces the number of additions performed to _O(n)_. Observe that on different iterations of the Upsweep and Downsweep phases, the number of elements to process in parallel changes. One useful way to leverage this is to map thread indices to the data indices they operate on so that working threads are packed together into warps. The snippet below, from my Upsweep implementation, groups all threads that need to do work at the lowest thread indices, which minimizes the number of warps left only partially active.
+
+```
+// n is the number of threads needed for this Upsweep iteration
+int index = threadIdx.x + blockDim.x * blockIdx.x;
+if (index >= n) {
+    return;
+}
+int incrementLength = (int)powf(2.0, d);  // 2^d, the stride between the pairs being summed
+int firstIndexInIteration = incrementLength - 1;
+int firstSumIndex = firstIndexInIteration + index * incrementLength * 2;
+data[firstSumIndex + incrementLength] = data[firstSumIndex] + data[firstSumIndex + incrementLength];
+```
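+
+To make the mapping concrete, here is how consecutive thread indices land on `data` for an 8-element array:
+
+```
+d = 0: incrementLength = 1, threads 0..3 -> data[1] += data[0], data[3] += data[2],
+                                            data[5] += data[4], data[7] += data[6]
+d = 1: incrementLength = 2, threads 0..1 -> data[3] += data[1], data[7] += data[5]
+d = 2: incrementLength = 4, thread  0    -> data[7] += data[3]
+```
+
+At every depth the active threads are 0 through n-1 with no gaps, so whole warps retire together instead of idling next to working threads.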
+
+## Analysis
+
+### Scan Analysis
+
+All runtimes below are in milliseconds.
+
+| Array Size | CPU | Naive | Work-Efficient | Thrust | Thrust (2nd Run) |
+|------------|--------|-------|------|-------|-------|
+| 256        | 0.0006 | 0.048 | 0.083 | 0.081 | 0.014 |
+| 512        | 0.0009 | 0.052 | 0.096 | 0.078 | 0.013 |
+| 1024       | 0.0018 | 0.06  | 0.11  | 0.073 | 0.016 |
+| 2048       | 0.0033 | 0.06  | 0.11  | 0.09  | 0.018 |
+| 4096       | 0.0066 | 0.12  | 0.2   | 0.073 | 0.22  |
+| 8192       | 0.013  | 0.094 | 0.14  | 0.085 | 0.032 |
+| 10000      | 0.016  | 0.11  | 0.16  | 0.88  | 0.33  |
+| 16384      | 0.025  | 0.13  | 0.16  | 1.1   | 0.046 |
+| 20000      | 0.032  | 0.16  | 0.19  | 1.1   | 0.27  |
+| 32768      | 0.05   | 0.21  | 0.2   | 1.1   | 0.22  |
+| 40000      | 0.06   | 0.27  | 0.22  | 1.1   | 0.2   |
+| 50000      | 0.077  | 0.31  | 0.22  | 1.1   | 0.21  |
+| 65536      | 0.097  | 0.38  | 0.22  | 1.2   | 0.29  |
+
+#### Comparing CPU and GPU
+
+![](img/StreamProjection1.png)
+
+The first thing to notice about the GPU implementations is that they are rather slow, especially compared to the CPU implementation. This is almost certainly a result of the global memory accesses in both the Naive and Work-Efficient scans: since a global memory access is usually a ~200-cycle operation, it is drastically costly, even with clever indexing to retire threads early. A truly optimized GPU Scan would leverage shared memory, along the lines sketched below.
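+
+As a sketch of that direction (a minimal illustration rather than my submitted implementation; `kernSharedMemScan` and `BLOCK_ELEMS` are hypothetical names, and the kernel handles only a single tile that fits in one block), the upsweep and downsweep can run entirely in shared memory, touching global memory once on load and once on store:
+
+```
+#define BLOCK_ELEMS 256  // tile size; assumed to match the launch's block size
+
+__global__ void kernSharedMemScan(int n, int *odata, const int *idata) {
+    __shared__ int temp[BLOCK_ELEMS];
+    int tid = threadIdx.x;
+
+    // One global read per element; zero-pad past the end of the input
+    temp[tid] = (tid < n) ? idata[tid] : 0;
+    __syncthreads();
+
+    // Upsweep: partial sums climb the tree entirely in shared memory
+    for (int stride = 1; stride < BLOCK_ELEMS; stride *= 2) {
+        int idx = (tid + 1) * stride * 2 - 1;
+        if (idx < BLOCK_ELEMS) {
+            temp[idx] += temp[idx - stride];
+        }
+        __syncthreads();
+    }
+
+    // Zero the root, then downsweep to produce an exclusive scan
+    if (tid == 0) {
+        temp[BLOCK_ELEMS - 1] = 0;
+    }
+    __syncthreads();
+    for (int stride = BLOCK_ELEMS / 2; stride >= 1; stride /= 2) {
+        int idx = (tid + 1) * stride * 2 - 1;
+        if (idx < BLOCK_ELEMS) {
+            int t = temp[idx - stride];   // old left child
+            temp[idx - stride] = temp[idx];  // left child gets parent's value
+            temp[idx] += t;                  // right child gets parent + old left
+        }
+        __syncthreads();
+    }
+
+    // One global write per element
+    if (tid < n) {
+        odata[tid] = temp[tid];
+    }
+}
+```
+
+Extending this to arbitrary sizes means scanning each block's tile, then scanning the per-block sums and adding them back: the multi-block scheme GPU Gems 39 describes.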
+
+However, we do gain insight into the GPU's capabilities when we examine how the three implementations scale. The CPU runtime grows in an observably linear fashion, which makes sense: the implementation is a simple iteration through the array, an _O(n)_ algorithm. By contrast, the Work-Efficient runtime does not appear to grow linearly, but instead levels off. The Work-Efficient scan still performs _O(n)_ additions, but executing them in parallel makes its runtime track its _O(log n)_ step count instead. It is worth noting, though, that the Naive implementation does not scale nearly as well: it is a far less efficient algorithm that performs _O(n log n)_ additions.
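+
+The gap between the two GPU curves follows directly from the operation counts: the Naive scan makes log2(n) passes that each touch roughly n elements, while the Work-Efficient upsweep and downsweep together perform about 2(n - 1) additions. At the largest size tested:
+
+```
+n = 65536
+Naive:          ~ n * log2(n) = 65536 * 16 = 1,048,576 adds
+Work-Efficient: ~ 2 * (n - 1) =  2 * 65535 =   131,070 adds
+```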
+
+#### Comparison With Thrust
+
+![](img/StreamProjection2.png)
+
+No analysis would be complete without comparing my implementations to a standard library. The chart above shows runtime comparisons against `thrust::exclusive_scan`. It is important to note that running `thrust::exclusive_scan` more than once in a single process gives considerably different times for the same input array. `thrust` appears to do a considerable amount of one-time setup before its first scan, so subsequent runs of `exclusive_scan` likely give a more accurate picture of its steady-state runtime; one way to collect such a measurement is sketched below. At array sizes below 65536, our Work-Efficient scan seems to outperform `thrust`, but `thrust` appears to scale slightly better.
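+
+For reference, a minimal sketch of how a "2nd run" number can be collected (illustrative only, and `timeThrustSecondRun` is a hypothetical helper, not necessarily how I gathered the table above): perform one untimed warm-up call so `thrust`'s one-time setup is paid up front, then measure a second call on the same data.
+
+```
+#include <thrust/device_vector.h>
+#include <thrust/scan.h>
+
+void timeThrustSecondRun(int n, const int *idata) {
+    thrust::device_vector<int> dv_in(idata, idata + n);  // host-to-device copy
+    thrust::device_vector<int> dv_out(n);
+
+    // Warm-up: absorbs thrust's one-time setup cost; result discarded
+    thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+    // A second call, wrapped in whatever timer you use, now reflects
+    // steady-state performance
+    thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+}
+```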
+
+### Compaction Analysis
+
+All runtimes below are in milliseconds.
+
+| Array Size | CPU Compact w/o Scan | CPU Compact w/ Scan | GPU Compact (requires Scan) |
+|---|---|---|---|
+| 256   | 0.0009 | 0.0037 | 0.12 |
+| 512   | 0.0015 | 0.0066 | 0.12 |
+| 1024  | 0.003  | 0.019  | 0.16 |
+| 2048  | 0.0054 | 0.024  | 0.14 |
+| 4096  | 0.0095 | 0.022  | 0.14 |
+| 8192  | 0.02   | 0.038  | 0.16 |
+| 16384 | 0.039  | 0.075  | 0.19 |
+| 32768 | 0.11   | 0.19   | 0.22 |
+| 65536 | 0.16   | 0.42   | 0.3  |
+
+![](img/StreamProjection3.png)
+
+The important observation here is that the scan-based compaction implementations inherit the growth characteristics of the Scan functions they are built on: our GPU Compaction scales better, while our CPU Compaction grows considerably with input size. Scan is not required on the CPU, though, so the CPU implementation that skips it is predictably much faster. Here, global memory accesses hurt our GPU implementation once again.
diff --git a/img/StreamProjection1.png b/img/StreamProjection1.png
new file mode 100644
index 0000000..39f4b0f
Binary files /dev/null and b/img/StreamProjection1.png differ
diff --git a/img/StreamProjection2.png b/img/StreamProjection2.png
new file mode 100644
index 0000000..d2cb802
Binary files /dev/null and b/img/StreamProjection2.png differ
diff --git a/img/StreamProjection3.png b/img/StreamProjection3.png
new file mode 100644
index 0000000..6388002
Binary files /dev/null and b/img/StreamProjection3.png differ
diff --git a/src/main.cpp b/src/main.cpp
old mode 100644
new mode 100755
index 7305641..33c154c
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1000; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 
 int a[SIZE], b[SIZE], c[SIZE];
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
old mode 100644
new mode 100755
index 8fc0211..64507cf
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -23,7 +23,11 @@ namespace StreamCompaction {
          * which map to 0 will be removed, and elements which map to 1 will be kept.
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-            // TODO
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+            if (index >= n) {  // >= so thread index == n cannot write out of bounds
+                return;
+            }
+            bools[index] = (idata[index] == 0) ? 0 : 1;
         }
 
         /**
@@ -32,7 +36,15 @@ namespace StreamCompaction {
          */
         __global__ void kernScatter(int n, int *odata, const int *idata,
                 const int *bools, const int *indices) {
-            // TODO
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+            if (index >= n) {  // same out-of-bounds guard as above
+                return;
+            }
+
+            if (bools[index] == 1) {
+                odata[indices[index]] = idata[index];
+            }
         }
     }
 }
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
old mode 100644
new mode 100755
index 55f1b38..8014d1a
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -1,14 +1,14 @@
 #pragma once
 
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-#include <cstdio>
-#include <cstring>
-#include <cmath>
-#include <algorithm>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include <algorithm>
 #include <chrono>
-#include <stdexcept>
+#include <stdexcept>
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
@@ -37,96 +37,96 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata, const int *idata,
                 const int *bools, const int *indices);
 
-        /**
-        * This class is used for timing the performance
-        * Uncopyable and unmovable
-        *
-        * Adapted from WindyDarian(https://github.com/WindyDarian)
-        */
-        class PerformanceTimer
-        {
-        public:
-            PerformanceTimer()
-            {
-                cudaEventCreate(&event_start);
-                cudaEventCreate(&event_end);
-            }
-
-            ~PerformanceTimer()
-            {
-                cudaEventDestroy(event_start);
-                cudaEventDestroy(event_end);
-            }
-
-            void startCpuTimer()
-            {
-                if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
-                cpu_timer_started = true;
-
-                time_start_cpu = std::chrono::high_resolution_clock::now();
-            }
-
-            void endCpuTimer()
-            {
-                time_end_cpu = std::chrono::high_resolution_clock::now();
-
-                if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
-
-                std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
-                prev_elapsed_time_cpu_milliseconds =
-                    static_cast<decltype(prev_elapsed_time_cpu_milliseconds)>(duro.count());
-
-                cpu_timer_started = false;
-            }
-
-            void startGpuTimer()
-            {
-                if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
-                gpu_timer_started = true;
-
-                cudaEventRecord(event_start);
-            }
-
-            void endGpuTimer()
-            {
-                cudaEventRecord(event_end);
-                cudaEventSynchronize(event_end);
-
-                if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
-
-                cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
-                gpu_timer_started = false;
-            }
-
-            float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015
-            {
-                return prev_elapsed_time_cpu_milliseconds;
-            }
-
-            float getGpuElapsedTimeForPreviousOperation() //noexcept
-            {
-                return prev_elapsed_time_gpu_milliseconds;
-            }
-
-            // remove copy and move functions
-            PerformanceTimer(const PerformanceTimer&) = delete;
-            PerformanceTimer(PerformanceTimer&&) = delete;
-            PerformanceTimer& operator=(const PerformanceTimer&) = delete;
-            PerformanceTimer& operator=(PerformanceTimer&&) = delete;
-
-        private:
-            cudaEvent_t event_start = nullptr;
-            cudaEvent_t event_end = nullptr;
-
-            using time_point_t = std::chrono::high_resolution_clock::time_point;
-            time_point_t time_start_cpu;
-            time_point_t time_end_cpu;
-
-            bool cpu_timer_started = false;
-            bool gpu_timer_started = false;
-
-            float prev_elapsed_time_cpu_milliseconds = 0.f;
-            float prev_elapsed_time_gpu_milliseconds = 0.f;
+        /**
+        * This class is used for timing the performance
+        * Uncopyable and unmovable
+        *
+        * Adapted from WindyDarian(https://github.com/WindyDarian)
+        */
+        class PerformanceTimer
+        {
+        public:
+            PerformanceTimer()
+            {
+                cudaEventCreate(&event_start);
+                cudaEventCreate(&event_end);
+            }
+
+            ~PerformanceTimer()
+            {
+                cudaEventDestroy(event_start);
+                cudaEventDestroy(event_end);
+            }
+
+            void startCpuTimer()
+            {
+                if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
+                cpu_timer_started = true;
+
+                time_start_cpu = std::chrono::high_resolution_clock::now();
+            }
+
+            void endCpuTimer()
+            {
+                time_end_cpu = std::chrono::high_resolution_clock::now();
+
+                if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
+
+                std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
+                prev_elapsed_time_cpu_milliseconds =
+                    static_cast<decltype(prev_elapsed_time_cpu_milliseconds)>(duro.count());
+
+                cpu_timer_started = false;
+            }
+
+            void startGpuTimer()
+            {
+                if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
+                gpu_timer_started = true;
+
+                cudaEventRecord(event_start);
+            }
+
+            void endGpuTimer()
+            {
+                cudaEventRecord(event_end);
+                cudaEventSynchronize(event_end);
+
+                if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
+
+                cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
+                gpu_timer_started = false;
+            }
+
+            float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015
+            {
+                return prev_elapsed_time_cpu_milliseconds;
+            }
+
+            float getGpuElapsedTimeForPreviousOperation() //noexcept
+            {
+                return prev_elapsed_time_gpu_milliseconds;
+            }
+
+            // remove copy and move functions
+            PerformanceTimer(const PerformanceTimer&) = delete;
+            PerformanceTimer(PerformanceTimer&&) = delete;
+            PerformanceTimer& operator=(const PerformanceTimer&) = delete;
+            PerformanceTimer& operator=(PerformanceTimer&&) = delete;
+
+        private:
+            cudaEvent_t event_start = nullptr;
+            cudaEvent_t event_end = nullptr;
+
+            using time_point_t = std::chrono::high_resolution_clock::time_point;
+            time_point_t time_start_cpu;
+            time_point_t time_end_cpu;
+
+            bool cpu_timer_started = false;
+            bool gpu_timer_started = false;
+
+            float prev_elapsed_time_cpu_milliseconds = 0.f;
+            float prev_elapsed_time_gpu_milliseconds = 0.f;
         };
     }
 }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
old mode 100644
new mode 100755
index 05ce667..c2ed9f0
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,15 +1,15 @@
 #include <cstdio>
 #include "cpu.h"
-#include "common.h"
+#include "common.h"
 
 namespace StreamCompaction {
     namespace CPU {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }
 
         /**
@@ -18,9 +18,12 @@ namespace StreamCompaction {
          * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startCpuTimer();
-            // TODO
-            timer().endCpuTimer();
+            // Timer calls are left commented out so compactWithScan can reuse this
+            // function inside its own timed region without a nested-timer error.
+            //timer().startCpuTimer();
+            odata[0] = 0;
+            for (int i = 1; i < n; i++) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
+            //timer().endCpuTimer();
         }
 
         /**
@@ -30,9 +33,15 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[count] = idata[i];
+                    count++;
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return count;
         }
 
         /**
@@ -42,9 +51,26 @@ namespace StreamCompaction {
          */
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            // Mark the elements to be kept, then scan the resulting (0,1) array
+            int *tmp = new int[n];
+            int *scanTmp = new int[n];
+            for (int i = 0; i < n; i++) {
+                tmp[i] = (idata[i] == 0) ? 0 : 1;
+            }
+            scan(n, scanTmp, tmp);
+
+            // Scatter: the scan value of each kept element is its output index
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                if (tmp[i] == 1) {
+                    odata[scanTmp[i]] = idata[i];
+                    count++;
+                }
+            }
+            delete[] scanTmp;
+            delete[] tmp;
             timer().endCpuTimer();
-            return -1;
+            return count;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
old mode 100644
new mode 100755
index 36c5ef2..41c5355
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,24 +3,125 @@
 #include "common.h"
 #include "efficient.h"
 
+#define blockSize 32
+
 namespace StreamCompaction {
     namespace Efficient {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }
 
+        __global__ void kernComputeUpSweepIteration(int n, int d, int *data) {
+            // Compute the indices to be added together based on the thread index
+            // and the iteration number. The second of these indices is also the
+            // output index.
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+            if (index >= n) {
+                return;
+            }
+            int incrementLength = (int)powf(2.0, d);
+            int firstIndexInIteration = incrementLength - 1;
+            int firstSumIndex = firstIndexInIteration + index * incrementLength * 2;
+            int secondSumIndex = firstSumIndex + incrementLength;
+
+            data[secondSumIndex] = data[firstSumIndex] + data[secondSumIndex];
+        }
+
+        __global__ void kernComputeDownSweepIteration(int n, int d, int ceil, int *data) {
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+            if (index >= n) {
+                return;
+            }
+            float twoPower = 1.0f / powf(2.0f, d + 1);
+            int incrementLength = ceil * twoPower;
+            int rightIndex = (ceil - 1) - index * incrementLength * 2;
+            int leftIndex = rightIndex - incrementLength;
+
+            // Downsweep step: the left child gets the parent's value, and the
+            // right child gets the parent's value plus the old left value.
+            int tmp = data[leftIndex];
+            int dataFromRight = data[rightIndex];
+
+            data[leftIndex] = dataFromRight;
+            data[rightIndex] = dataFromRight + tmp;
+        }
+
+        __global__ void kernReplaceIndexWithZero(int n, int *data) {
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+            if (index != 0) {
+                return;
+            }
+            data[n] = 0;
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            // Allocate device data, padded up to the next power of two
+            int log = ilog2ceil(n);
+            int ceil = (int)powf(2.0f, log);
+            int *dev_data;
+            cudaMalloc((void**)&dev_data, ceil * sizeof(int));
+            checkCUDAError("cudaMalloc dev_data failed!");
+
+            cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy failed!");
+            // Zero the padding so the extra slots contribute nothing to the sums
+            cudaMemset(dev_data + n, 0, (ceil - n) * sizeof(int));
+            checkCUDAError("cudaMemset failed!");
+
             timer().startGpuTimer();
-            // TODO
+            int numThreadsToDoWork = ceil;
+            for (int d = 0; d < log; d++) {
+                numThreadsToDoWork /= 2;
+                dim3 fullBlocks((numThreadsToDoWork + blockSize - 1) / blockSize);
+                kernComputeUpSweepIteration<<<fullBlocks, blockSize>>>(numThreadsToDoWork, d, dev_data);
+            }
+
+            // Replace the last element (the total) with zero before the downsweep
+            kernReplaceIndexWithZero<<<1, 32>>>(ceil - 1, dev_data);
+
+            // Downsweep: numThreadsToDoWork is 1 here and doubles each iteration
+            for (int d = 0; d < log; d++) {
+                dim3 fullBlocks((numThreadsToDoWork + blockSize - 1) / blockSize);
+                kernComputeDownSweepIteration<<<fullBlocks, blockSize>>>(numThreadsToDoWork, d, ceil, dev_data);
+                numThreadsToDoWork *= 2;
+            }
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy failed!");
+            cudaFree(dev_data);
         }
 
+        // Same scan, but operating directly on device memory; used by compact()
+        // so the whole GPU pipeline can sit inside a single timed region.
+        void scanInPlace(int n, int *dev_data) {
+            int log = ilog2ceil(n);
+            int ceil = (int)powf(2.0f, log);
+            int numThreadsToDoWork = ceil;
+            for (int d = 0; d < log; d++) {
+                numThreadsToDoWork /= 2;
+                dim3 fullBlocks((numThreadsToDoWork + blockSize - 1) / blockSize);
+                kernComputeUpSweepIteration<<<fullBlocks, blockSize>>>(numThreadsToDoWork, d, dev_data);
+            }
+
+            kernReplaceIndexWithZero<<<1, 32>>>(ceil - 1, dev_data);
+
+            for (int d = 0; d < log; d++) {
+                dim3 fullBlocks((numThreadsToDoWork + blockSize - 1) / blockSize);
+                kernComputeDownSweepIteration<<<fullBlocks, blockSize>>>(numThreadsToDoWork, d, ceil, dev_data);
+                numThreadsToDoWork *= 2;
+            }
+        }
+
+        /**
+         * Helper kernel to calculate how many elements made it through compaction.
+         */
+        __global__ void kernComputeNumberOfValidElements(int n, const int *bools, const int *indices, int *answer) {
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+            if (index != 0) {
+                return;
+            }
+            // indices[n] is the exclusive-scan total up to the last slot; add
+            // one more if that last element is itself kept.
+            *answer = (bools[n] == 0) ? indices[n] : indices[n] + 1;
+        }
+
         /**
          * Performs stream compaction on idata, storing the result into odata.
          * All zeroes are discarded.
@@ -31,10 +132,68 @@ namespace StreamCompaction {
          * @returns The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+            int *dev_bools;
+            int *dev_indices;
+            int *dev_idata;
+            int *dev_odata;
+            int *dev_answer;
+
+            int log = ilog2ceil(n);
+            int ceil = (int)powf(2.0f, log);
+
+            cudaMalloc((void**)&dev_bools, ceil * sizeof(int));
+            checkCUDAError("cudaMalloc dev_bools failed!");
+
+            cudaMalloc((void**)&dev_indices, ceil * sizeof(int));
+            checkCUDAError("cudaMalloc dev_indices failed!");
+
+            cudaMalloc((void**)&dev_idata, ceil * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+
+            cudaMalloc((void**)&dev_odata, ceil * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed!");
+
+            cudaMalloc((void**)&dev_answer, sizeof(int));
+            checkCUDAError("cudaMalloc dev_answer failed!");
+
+            dim3 fullBlocks((ceil + blockSize - 1) / blockSize);
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy failed!");
+            // Zero the padding tail so the padded elements map to 0 (discarded)
+            cudaMemset(dev_idata + n, 0, (ceil - n) * sizeof(int));
+            checkCUDAError("cudaMemset failed!");
+
             timer().startGpuTimer();
-            // TODO
+            // Map the idata to an array of 0/1 flags
+            Common::kernMapToBoolean<<<fullBlocks, blockSize>>>(ceil, dev_bools, dev_idata);
+
+            cudaMemcpy(dev_indices, dev_bools, ceil * sizeof(int), cudaMemcpyDeviceToDevice);
+            checkCUDAError("cudaMemcpy failed!");
+
+            // Perform a scan on the bools array to get the scatter indices
+            scanInPlace(n, dev_indices);
+
+            Common::kernScatter<<<fullBlocks, blockSize>>>(ceil, dev_odata, dev_idata, dev_bools, dev_indices);
+
+            kernComputeNumberOfValidElements<<<1, 32>>>(ceil - 1, dev_bools, dev_indices, dev_answer);
             timer().endGpuTimer();
-            return -1;
+
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy failed!");
+
+            int answer;
+            cudaMemcpy(&answer, dev_answer, sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy failed!");
+
+            cudaFree(dev_bools);
+            cudaFree(dev_indices);
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+            cudaFree(dev_answer);
+
+            return answer;
         }
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
old mode 100644
new mode 100755
index 9218f8e..d38e2e2
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,23 +3,80 @@
 #include "common.h"
 #include "naive.h"
 
+#define blockSize 128
+
+dim3 threadsPerBlock(blockSize);
+
 namespace StreamCompaction {
     namespace Naive {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }
 
         // TODO: __global__
+        __global__ void kernPerformScanIteration(int n, int d, int *odata, const int *idata) {
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+            if (index >= n) {  // >= so out-of-range threads exit early
+                return;
+            }
+            int twoPower = (int)powf(2.0f, d);
+            // Each pass adds in the element 2^d slots back, building an inclusive scan
+            odata[index] = (index >= twoPower) ? idata[index - twoPower] + idata[index] : idata[index];
+        }
+
+        __global__ void kernShiftInclusiveToExclusive(int n, int *odata, const int *idata) {
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+            if (index >= n) {
+                return;
+            }
+            odata[index] = (index > 0) ? idata[index - 1] : 0;
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
+            // Define blocks
+            dim3 fullBlocks((n + blockSize - 1) / blockSize);
+            int numIterations = ilog2ceil(n);
+
+            // Allocate ping-pong buffers in global memory
+            int *dev_idata;
+            int *dev_odata;
+            int *tmpBuffer;
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc of dev_idata failed!");
+
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc of dev_odata failed!");
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy failed!");
+
+            //populate dev_odata for continuity (really for x0, so this is mostly unnecessary)
+            //cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            //checkCUDAError("cudaMemcpy failed!");
+
+            timer().startGpuTimer();
+            for (int d = 0; d < numIterations; d++) {
+                kernPerformScanIteration<<<fullBlocks, blockSize>>>(n, d, dev_odata, dev_idata);
+                // Ping-pong: the freshly written buffer becomes the next pass's input
+                tmpBuffer = dev_idata;
+                dev_idata = dev_odata;
+                dev_odata = tmpBuffer;
+            }
+            // At this point, our final inclusive scan is in dev_idata
+            kernShiftInclusiveToExclusive<<<fullBlocks, blockSize>>>(n, dev_odata, dev_idata);
+            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy failed!");
+
+            // Free buffers
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+
         }
     }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
old mode 100644
new mode 100755
index 36b732d..4206f4f
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -8,21 +8,44 @@ namespace StreamCompaction {
     namespace Thrust {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int *dev_idata;
+            int *dev_odata;
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed!");
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy failed!");
+
+            // Wrap the raw device pointers so thrust treats them as device iterators
+            thrust::device_ptr<int> dev_thrust_idata(dev_idata);
+            thrust::device_ptr<int> dev_thrust_odata(dev_odata);
+
             timer().startGpuTimer();
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dev_thrust_idata, dev_thrust_idata + n, dev_thrust_odata);
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy failed!");
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
         }
     }
 }