diff --git a/README.md b/README.md
index b71c458..33cdff8 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,138 @@ CUDA Stream Compaction
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Rishabh Shah
+* Tested on: Windows 10, i7-6700HQ @ 2.6GHz 16GB, GTX 960M 4096MB (Laptop)
-### (TODO: Your README)
+## Overview
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+In this project, I explored parallel algorithms to compute prefix sums (scans) of an array. Inclusive and Exclusive scan algorithms for CPU and GPU are implemented. These algorithms can be used in a number of applications like stream compaction, path tracing, etc. As a part of the project, I also implemented stream compaction using exclusive scan on the GPU.
+The performance of the GPU implementation is very high compared to the CPU implementation when the input array is very large.
+
+## Functionalities Implemented
+
+* Required functionalities
+ * Part 1: CPU Scan and Stream Compaction
+ * Part 2: Naive GPU Scan
+ * Part 3: Work-Efficient GPU Scan and Stream Compaction
+ * Part 4: Thrust's GPU Scan
+* Part 5: Optimizing work-efficient GPU implementation
+
+## Performance Analysis
+
+The GPU implementations are compared with a CPU implementation and Thrust's implementation. At first glance at the plots, it seems that the GPU implementations are faster than the CPU. But when we take a closer look, it becomes clear that the GPU implementations are slower up to array sizes of 2^16 and become faster for longer arrays. This must be due to the calculation overhead of the parallel algorithms, compared to simple summation on the CPU.
+
+The same trend can be seen in stream compaction. Both the CPU and GPU methods take considerably longer to perform compaction compared to scan alone, suggesting that scan is not the only heavy task performed during compaction, and the rest of the algorithm also contributes significantly to the total time. Yet, similar to scan, the GPU becomes more efficient only for array sizes greater than 2^16.
+
+###### Optimizing work-efficient GPU scan
+
+Much of the performance gain for the work-efficient GPU algorithm is due to the fact that the number of threads at each level reduces to half, so the more levels there are, the faster it is compared to the naive GPU scan and the CPU scan. Mathematically, for an array of size N, there are log2(N) levels. So, there would be N*log2(N) threads without this optimization. By not creating extra threads, we only create threads equal to the number of nodes at each level of the binary tree, which totals about 2^(log2(N)+1), i.e., roughly 2N. Thus the number of threads required to compute the result is much smaller for very large array sizes.
+
+###### Performance Bottlenecks
+
+The GPU implementations seem to be very slow compared to thrust's implementation, probably because I am not using shared memory which is way faster than global memory. For CPU implementation, memory access should not be an issue, but it just becomes increasingly slower as it is a single threaded sequential process.
+
+
+##### Exclusive Scan
+
+| Array Size (exp) | CPU | GPU (Naive) | GPU (Efficient) | Thrust |
+|---------------------:|-------:|---------------:|-------------------:|--------:|
+|8 |0.00079 |0.036256 |0.112896 |0.020512|
+|10 |0.001975 |0.043136 |0.12544 |0.021248|
+|12 |0.007507 |0.05152 |0.160096 |0.02768|
+|14 |0.030025 |0.077568 |0.235488 |0.057216|
+|16 |0.117333 |0.14128 |0.271616 |0.228|
+|18 |0.860445 |0.418816 |0.434656 |0.220896|
+|20 |2.12425 |3.16147 |1.45034 |0.389504|
+|22 |7.8724 |13.7757 |5.36339 |1.076|
+|24 |34.0334 |60.7282 |20.7226 |3.8527|
+
+*Plot: exclusive scan for entire data*
+
+
+
+*Plot: exclusive scan for 2^8 to 2^18 array sizes*
+
+
+
+*Plot: exclusive scan for 2^18 to 2^24 array sizes*
+
+
+
+##### Stream Compaction
+
+| Array Size (exp) | CPU (without scan) | CPU (with scan) | GPU (Efficient) |
+|-----------------:|-------------------:|----------------:|----------------:|
+|8 |0.00079 |0.002766 |0.162496|
+|10 |0.003161 |0.005531 |0.1424|
+|12 |0.012642 |0.033975 |0.16608|
+|14 |0.046617 |0.112593 |0.304896|
+|16 |0.181729 |0.436939 |0.318752|
+|18 |0.764839 |2.9361 |0.541664|
+|20 |3.37936 |11.3189 |1.98714|
+|22 |13.412 |49.1414 |7.40618|
+|24 |52.4042 |189.27 |28.9966|
+
+
+
+
+
+_Console output for 16777216 (2^24) elements in the input array._
+```
+****************
+** SCAN TESTS **
+****************
+ [ 7 42 17 29 31 49 9 9 40 35 8 48 15 ... 23 0 ]
+==== cpu scan, power-of-two ====
+ elapsed time: 33.9812ms (std::chrono Measured)
+ [ 0 7 49 66 95 126 175 184 193 233 268 276 324 ... 410894172 410894195 ]
+==== cpu scan, non-power-of-two ====
+ elapsed time: 36.3137ms (std::chrono Measured)
+ [ 0 7 49 66 95 126 175 184 193 233 268 276 324 ... 410894088 410894124 ]
+ passed
+==== naive scan, power-of-two ====
+ elapsed time: 60.7475ms (CUDA Measured)
+ passed
+==== naive scan, non-power-of-two ====
+ elapsed time: 60.0793ms (CUDA Measured)
+ passed
+==== work-efficient scan, power-of-two ====
+ elapsed time: 20.6467ms (CUDA Measured)
+ passed
+==== work-efficient scan, non-power-of-two ====
+ elapsed time: 20.7787ms (CUDA Measured)
+ passed
+==== thrust scan, power-of-two ====
+ elapsed time: 3.81834ms (CUDA Measured)
+ passed
+==== thrust scan, non-power-of-two ====
+ elapsed time: 3.69171ms (CUDA Measured)
+ passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 1 2 2 3 0 0 0 2 3 3 1 2 1 ... 2 0 ]
+==== cpu compact without scan, power-of-two ====
+ elapsed time: 73.5653ms (std::chrono Measured)
+ [ 1 2 2 3 2 3 3 1 2 1 3 1 1 ... 2 2 ]
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ elapsed time: 54.5126ms (std::chrono Measured)
+ [ 1 2 2 3 2 3 3 1 2 1 3 1 1 ... 3 2 ]
+ passed
+==== cpu compact with scan ====
+ elapsed time: 283.844ms (std::chrono Measured)
+ passed
+==== work-efficient compact, power-of-two ====
+ elapsed time: 29.1447ms (CUDA Measured)
+ passed
+==== work-efficient compact, non-power-of-two ====
+ elapsed time: 29.2029ms (CUDA Measured)
+ passed
+Press any key to continue . . .
+```
+
+##### Changes in CMakeLists
+OPTIONS -arch=sm_50
diff --git a/img/compaction.png b/img/compaction.png
new file mode 100644
index 0000000..bdc1c41
Binary files /dev/null and b/img/compaction.png differ
diff --git a/img/scan.png b/img/scan.png
new file mode 100644
index 0000000..a85e2ed
Binary files /dev/null and b/img/scan.png differ
diff --git a/img/scan_1.png b/img/scan_1.png
new file mode 100644
index 0000000..af20f81
Binary files /dev/null and b/img/scan_1.png differ
diff --git a/img/scan_2.png b/img/scan_2.png
new file mode 100644
index 0000000..5d70329
Binary files /dev/null and b/img/scan_2.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 7305641..4276bfc 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
#include
#include "testing_helpers.hpp"
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 16; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int a[SIZE], b[SIZE], c[SIZE];
@@ -56,7 +56,7 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, non-power-of-two");
StreamCompaction::Naive::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
+ //printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
zeroArray(SIZE, c);
@@ -122,7 +122,7 @@ int main(int argc, char* argv[]) {
printDesc("cpu compact with scan");
count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
- printArray(count, c, true);
+ //printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);
zeroArray(SIZE, c);
diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
index ae94ca6..8c3cde8 100644
--- a/src/testing_helpers.hpp
+++ b/src/testing_helpers.hpp
@@ -65,6 +65,6 @@ void printArray(int n, int *a, bool abridged = false) {
template
void printElapsedTime(T time, std::string note = "")
-{
- std::cout << " elapsed time: " << time << "ms " << note << std::endl;
+{
+ std::cout << " elapsed time: " << time << "ms " << note << std::endl;
}
\ No newline at end of file
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..c8709e7 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
cuda_add_library(stream_compaction
${SOURCE_FILES}
- OPTIONS -arch=sm_20
+ OPTIONS -arch=sm_50
)
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 8fc0211..8410082 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -23,7 +23,10 @@ namespace StreamCompaction {
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
- // TODO
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index >= n) return;
+
+ bools[index] = idata[index] != 0 ? 1 : 0;
}
/**
@@ -32,8 +35,11 @@ namespace StreamCompaction {
*/
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
- // TODO
- }
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index >= n) return;
+ if (bools[index] == 1)
+ odata[indices[index]] = idata[index];
+ }
}
}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 55f1b38..5174556 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -1,18 +1,20 @@
#pragma once
-#include
-#include
-
-#include
-#include
-#include
-#include
+#include
+#include
+
+#include
+#include
+#include
+#include
#include
-#include
+#include
#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+#define blockSize 128
+
/**
* Check for CUDA errors; print and exit if there was a problem.
*/
@@ -27,7 +29,7 @@ inline int ilog2(int x) {
}
inline int ilog2ceil(int x) {
- return ilog2(x - 1) + 1;
+ return x == 1 ? 0 : ilog2(x - 1) + 1;
}
namespace StreamCompaction {
@@ -37,96 +39,96 @@ namespace StreamCompaction {
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices);
- /**
- * This class is used for timing the performance
- * Uncopyable and unmovable
- *
- * Adapted from WindyDarian(https://github.com/WindyDarian)
- */
- class PerformanceTimer
- {
- public:
- PerformanceTimer()
- {
- cudaEventCreate(&event_start);
- cudaEventCreate(&event_end);
- }
-
- ~PerformanceTimer()
- {
- cudaEventDestroy(event_start);
- cudaEventDestroy(event_end);
- }
-
- void startCpuTimer()
- {
- if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
- cpu_timer_started = true;
-
- time_start_cpu = std::chrono::high_resolution_clock::now();
- }
-
- void endCpuTimer()
- {
- time_end_cpu = std::chrono::high_resolution_clock::now();
-
- if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
-
- std::chrono::duration duro = time_end_cpu - time_start_cpu;
- prev_elapsed_time_cpu_milliseconds =
- static_cast(duro.count());
-
- cpu_timer_started = false;
- }
-
- void startGpuTimer()
- {
- if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
- gpu_timer_started = true;
-
- cudaEventRecord(event_start);
- }
-
- void endGpuTimer()
- {
- cudaEventRecord(event_end);
- cudaEventSynchronize(event_end);
-
- if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
-
- cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
- gpu_timer_started = false;
- }
-
- float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015
- {
- return prev_elapsed_time_cpu_milliseconds;
- }
-
- float getGpuElapsedTimeForPreviousOperation() //noexcept
- {
- return prev_elapsed_time_gpu_milliseconds;
- }
-
- // remove copy and move functions
- PerformanceTimer(const PerformanceTimer&) = delete;
- PerformanceTimer(PerformanceTimer&&) = delete;
- PerformanceTimer& operator=(const PerformanceTimer&) = delete;
- PerformanceTimer& operator=(PerformanceTimer&&) = delete;
-
- private:
- cudaEvent_t event_start = nullptr;
- cudaEvent_t event_end = nullptr;
-
- using time_point_t = std::chrono::high_resolution_clock::time_point;
- time_point_t time_start_cpu;
- time_point_t time_end_cpu;
-
- bool cpu_timer_started = false;
- bool gpu_timer_started = false;
-
- float prev_elapsed_time_cpu_milliseconds = 0.f;
- float prev_elapsed_time_gpu_milliseconds = 0.f;
+ /**
+ * This class is used for timing the performance
+ * Uncopyable and unmovable
+ *
+ * Adapted from WindyDarian(https://github.com/WindyDarian)
+ */
+ class PerformanceTimer
+ {
+ public:
+ PerformanceTimer()
+ {
+ cudaEventCreate(&event_start);
+ cudaEventCreate(&event_end);
+ }
+
+ ~PerformanceTimer()
+ {
+ cudaEventDestroy(event_start);
+ cudaEventDestroy(event_end);
+ }
+
+ void startCpuTimer()
+ {
+ if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
+ cpu_timer_started = true;
+
+ time_start_cpu = std::chrono::high_resolution_clock::now();
+ }
+
+ void endCpuTimer()
+ {
+ time_end_cpu = std::chrono::high_resolution_clock::now();
+
+ if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
+
+ std::chrono::duration duro = time_end_cpu - time_start_cpu;
+ prev_elapsed_time_cpu_milliseconds =
+ static_cast(duro.count());
+
+ cpu_timer_started = false;
+ }
+
+ void startGpuTimer()
+ {
+ if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
+ gpu_timer_started = true;
+
+ cudaEventRecord(event_start);
+ }
+
+ void endGpuTimer()
+ {
+ cudaEventRecord(event_end);
+ cudaEventSynchronize(event_end);
+
+ if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
+
+ cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
+ gpu_timer_started = false;
+ }
+
+ float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015
+ {
+ return prev_elapsed_time_cpu_milliseconds;
+ }
+
+ float getGpuElapsedTimeForPreviousOperation() //noexcept
+ {
+ return prev_elapsed_time_gpu_milliseconds;
+ }
+
+ // remove copy and move functions
+ PerformanceTimer(const PerformanceTimer&) = delete;
+ PerformanceTimer(PerformanceTimer&&) = delete;
+ PerformanceTimer& operator=(const PerformanceTimer&) = delete;
+ PerformanceTimer& operator=(PerformanceTimer&&) = delete;
+
+ private:
+ cudaEvent_t event_start = nullptr;
+ cudaEvent_t event_end = nullptr;
+
+ using time_point_t = std::chrono::high_resolution_clock::time_point;
+ time_point_t time_start_cpu;
+ time_point_t time_end_cpu;
+
+ bool cpu_timer_started = false;
+ bool gpu_timer_started = false;
+
+ float prev_elapsed_time_cpu_milliseconds = 0.f;
+ float prev_elapsed_time_gpu_milliseconds = 0.f;
};
}
}
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 05ce667..59ae236 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,15 +1,15 @@
#include
#include "cpu.h"
-#include "common.h"
+#include "common.h"
namespace StreamCompaction {
namespace CPU {
- using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
- {
- static PerformanceTimer timer;
- return timer;
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
}
/**
@@ -17,9 +17,18 @@ namespace StreamCompaction {
* For performance analysis, this is supposed to be a simple for loop.
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
+ void scan_implementation(int n, int *odata, const int *idata) {
+ // your actual implementation
+ odata[0] = 0;
+ for (int i = 1; i < n; i++) {
+ odata[i] = odata[i - 1] + idata[i - 1];
+ }
+ }
+
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
- // TODO
+ // TODO
+ scan_implementation(n, odata, idata);
timer().endCpuTimer();
}
@@ -30,9 +39,16 @@ namespace StreamCompaction {
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
- // TODO
+ // TODO
+ int next = 0;
+ for (int i = 0; i < n; i++) {
+ if (idata[i] != 0) {
+ odata[next] = idata[i];
+ next++;
+ }
+ }
timer().endCpuTimer();
- return -1;
+ return next;
}
/**
@@ -43,8 +59,32 @@ namespace StreamCompaction {
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
+ int *mapped = new int[n];
+ int *scanned = new int[n];
+ int count = 0;
+ // Map
+ for (int i = 0; i < n; i++) {
+ if (idata[i] != 0) {
+ mapped[i] = 1;
+ }
+ else {
+ mapped[i] = 0;
+ }
+ }
+
+ // Scan
+ scan_implementation(n, scanned, mapped);
+
+ // Scatter
+ for (int i = 0; i < n; i++) {
+ if (mapped[i] == 1) {
+ odata[scanned[i]] = idata[i];
+ count++;
+ }
+ }
+
timer().endCpuTimer();
- return -1;
+ return count;
}
}
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 36c5ef2..14a0b15 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -5,20 +5,104 @@
namespace StreamCompaction {
namespace Efficient {
- using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
- {
- static PerformanceTimer timer;
- return timer;
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
+ }
+
+ // Kernel to pad the new array with 0s
+ __global__ void kernPadWithZeros(const int n, const int nPad, int *dev_data) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index >= nPad || index < n) return;
+
+ dev_data[index] = 0;
+ }
+
+ // Up-Sweep Kernel
+ __global__ void kernUpSweep(const int n, const int pow, const int pow1, int *dev_data) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index * pow1 >= n) return;
+
+ int idx = (index + 1) * pow1 - 1;
+ dev_data[idx] += dev_data[idx - pow];
+ }
+
+ // Down-Sweep Kernel
+ __global__ void kernDownSweep(const int n, const int pow, const int pow1, int *dev_data) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index * pow1 >= n) return;
+
+ int idx = (index + 1) * pow1 - 1;
+ int t = dev_data[idx - pow];
+ dev_data[idx - pow] = dev_data[idx];
+ dev_data[idx] += t;
+ }
+
+ void scan_implementation(const int nl, const dim3 numBlocks, const dim3 numThreads,
+ const int nPad, int*dev_data) {
+
+ for (int d = 0; d < nl; d++) {
+ int pow = 1 << (d);
+ int pow1 = 1 << (d + 1);
+ dim3 nB((nPad / pow1 + blockSize - 1) / blockSize);
+ kernUpSweep <<>> (nPad, pow, pow1, dev_data);
+ checkCUDAError("kernUpSweep failed!");
+ }
+
+ //dev_data[nPad - 1] = 0; // set last element to 0 before downsweep..
+ int zero = 0;
+ cudaMemcpy(dev_data + nPad - 1, &zero, sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAError("cudaMemcpy failed!");
+
+ for (int d = nl - 1; d >= 0; d--) {
+ int pow = 1 << (d);
+ int pow1 = 1 << (d + 1);
+ dim3 nB((nPad / pow1 + blockSize - 1) / blockSize);
+ kernDownSweep <<>> (nPad, pow, pow1, dev_data);
+ checkCUDAError("kernDownSweep failed!");
+ }
}
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
- // TODO
- timer().endGpuTimer();
+ int nSize = n * sizeof(int);
+ int nl = ilog2ceil(n);
+ int nPad = 1 << nl;
+ int nPadSize = nPad * sizeof(int);
+
+ // Compute blocks per grid and threads per block
+ dim3 numBlocks((nPad + blockSize - 1) / blockSize);
+ dim3 numThreads(blockSize);
+
+ int *dev_data;
+ cudaMalloc((void**)&dev_data, nPadSize);
+ checkCUDAError("cudaMalloc for dev_data failed!");
+
+ // Copy device arrays to device
+ cudaMemcpy(dev_data, idata, nSize, cudaMemcpyHostToDevice); // use a kernel to fill 0s for the remaining indices..
+ checkCUDAError("cudaMemcpy for dev_data failed!");
+
+ // Fill the padded part of dev_data with 0s..
+ kernPadWithZeros <<>> (n, nPad, dev_data);
+
+ timer().startGpuTimer();
+ // Work Efficient Scan - Creates exclusive scan output
+
+ scan_implementation(nl, numBlocks, numThreads, nPad, dev_data);
+
+ timer().endGpuTimer();
+
+ // Copy device arrays back to host
+ cudaMemcpy(odata, dev_data, nSize, cudaMemcpyDeviceToHost);
+ checkCUDAError("cudaMemcpy (device to host) for odata failed!");
+
+ // Free memory
+ cudaFree(dev_data);
+ checkCUDAError("cudaFree failed!");
}
/**
@@ -31,10 +115,71 @@ namespace StreamCompaction {
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
- // TODO
- timer().endGpuTimer();
- return -1;
+ int nSize = n * sizeof(int);
+ int nl = ilog2ceil(n);
+ int nPad = 1 << nl;
+ int nPadSize = nPad * sizeof(int);
+
+ int *dev_idata, *dev_odata, *dev_bools, *dev_indices;
+ cudaMalloc((void**)&dev_idata, nSize);
+ checkCUDAError("cudaMalloc for dev_idata failed!");
+
+ cudaMalloc((void**)&dev_odata, nSize);
+ checkCUDAError("cudaMalloc for dev_odata failed!");
+
+ cudaMalloc((void**)&dev_bools, nSize);
+ checkCUDAError("cudaMalloc for dev_bools failed!");
+
+ cudaMalloc((void**)&dev_indices, nPadSize);
+ checkCUDAError("cudaMalloc for dev_indices failed!");
+
+ // Copy device arrays to device
+ cudaMemcpy(dev_idata, idata, nSize, cudaMemcpyHostToDevice);
+ checkCUDAError("cudaMemcpy for dev_data failed!");
+
+ dim3 numBlocks((n + blockSize - 1) / blockSize);
+ dim3 numBlocksPadded((nPad + blockSize - 1) / blockSize);
+ dim3 numThreads(blockSize);
+
+ timer().startGpuTimer();
+
+ // Create bools array
+ StreamCompaction::Common::kernMapToBoolean <<>> (n, dev_bools, dev_idata);
+ checkCUDAError("cudaMemcpy for kernMapToBoolean failed!");
+
+ // Copy bools array to indices array - device to device
+ cudaMemcpy(dev_indices, dev_bools, nSize, cudaMemcpyDeviceToDevice);
+ checkCUDAError("cudaMemcpy for dev_indices failed!");
+ // Pad the extended array with 0s
+ kernPadWithZeros <<>> (n, nPad, dev_indices);
+ checkCUDAError("cudaMemcpy for kernPadWithZeros failed!");
+
+ // Work Efficient Scan
+ scan_implementation(nl, numBlocksPadded, numThreads, nPad, dev_indices);
+
+ // Scatter
+ StreamCompaction::Common::kernScatter <<>> (n, dev_odata, dev_idata, dev_bools, dev_indices);
+ checkCUDAError("cudaMemcpy for kernScatter failed!");
+
+ timer().endGpuTimer();
+
+ int newSize, indEnd, boolEnd;
+ cudaMemcpy(&indEnd, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ cudaMemcpy(&boolEnd, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ newSize = indEnd + boolEnd;
+ //printf("%d",newSize);
+
+ // Copy device arrays back to host
+ cudaMemcpy(odata, dev_odata, nSize, cudaMemcpyDeviceToHost);
+ checkCUDAError("cudaMemcpy (device to host) for odata failed!");
+
+ // Free memory
+ cudaFree(dev_idata);
+ cudaFree(dev_odata);
+ cudaFree(dev_bools);
+ cudaFree(dev_indices);
+ checkCUDAError("cudaFree failed!");
+ return newSize;
}
}
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 9218f8e..b9681d1 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -4,22 +4,77 @@
#include "naive.h"
namespace StreamCompaction {
- namespace Naive {
- using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
- {
- static PerformanceTimer timer;
- return timer;
- }
- // TODO: __global__
-
- /**
- * Performs prefix-sum (aka scan) on idata, storing the result into odata.
- */
- void scan(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
- // TODO
- timer().endGpuTimer();
- }
+ namespace Naive {
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
}
+ // TODO: __global__
+ __global__ void kernScan(int n, const int pow, int *odata, const int *idata) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index >= n) return;
+
+ odata[index] = (index >= pow) ? idata[index - pow] + idata[index] : idata[index];
+ }
+
+ __global__ void kernInToEx(int n, int *odata, const int *idata) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index >= n) return;
+
+ odata[index] = (index == 0) ? 0 : idata[index - 1];
+ }
+
+ /**
+ * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+ */
+ void scan(int n, int *odata, const int *idata) {
+ // Create device arrays
+ int *dev_odata, *dev_idata;
+ int nsize = n * sizeof(int);
+
+ cudaMalloc((void**)&dev_odata, nsize);
+ checkCUDAError("cudaMalloc for dev_odata failed!");
+
+ cudaMalloc((void**)&dev_idata, nsize);
+ checkCUDAError("cudaMalloc for dev_idata failed!");
+
+ // Copy device arrays to device
+ cudaMemcpy(dev_odata, odata, nsize, cudaMemcpyHostToDevice);
+ checkCUDAError("cudaMemcpy for dev_odata failed!");
+
+ cudaMemcpy(dev_idata, idata, nsize, cudaMemcpyHostToDevice);
+ checkCUDAError("cudaMemcpy for dev_idata failed!");
+
+ // Compute block per grid and thread per block
+ dim3 numBlocks((n + blockSize - 1) / blockSize);
+ dim3 numThreads(blockSize);
+
+ timer().startGpuTimer();
+ // Naive Scan - Creates inclusive scan output
+ int levels = ilog2ceil(n);
+ for (int d = 1; d <= levels; d++) {
+ int pow = 1 << (d - 1);
+ kernScan <<>> (n, pow, dev_odata, dev_idata);
+ checkCUDAError("kernScan failed for level " + levels);
+ std::swap(dev_odata, dev_idata);
+ }
+
+ // Convert inclusive scan to exclusive
+ kernInToEx <<>> (n, dev_odata, dev_idata);
+ checkCUDAError("kernInToEx failed!");
+
+ timer().endGpuTimer();
+
+ // Copy device arrays back to host
+ cudaMemcpy(odata, dev_odata, nsize, cudaMemcpyDeviceToHost);
+ checkCUDAError("cudaMemcpy (device to host) for dev_odata failed!");
+
+ // Free memory
+ cudaFree(dev_odata);
+ cudaFree(dev_idata);
+ checkCUDAError("cudaFree failed!");
+ }
+ }
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 36b732d..1d2e3dd 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -8,21 +8,24 @@
namespace StreamCompaction {
namespace Thrust {
- using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
- {
- static PerformanceTimer timer;
- return timer;
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
}
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
- // TODO use `thrust::exclusive_scan`
- // example: for device_vectors dv_in and dv_out:
- // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
- timer().endGpuTimer();
+ thrust::device_vector inDevice(idata, idata + n);
+ thrust::device_vector outDevice(odata, odata + n);
+
+ timer().startGpuTimer();
+ thrust::exclusive_scan(inDevice.begin(), inDevice.end(), outDevice.begin());
+ timer().endGpuTimer();
+
+ thrust::copy(outDevice.begin(), outDevice.end(), odata);
}
}
}