diff --git a/README.md b/README.md index b71c458..7936eec 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,96 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Mohamed Soudy +* Tested on: Windows 10 Enterprise, i7 @ 2.7 GHz 16GB, GT 650M 1024 MB -### (TODO: Your README) +### Description -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +An implementation of CPU and GPU compaction. Two versions of the Scan (Prefix Sum) algorithm were implemented for GPU compaction and compared against CPU and thrust implementations. These include naive and work-efficient implementations. +### Performance Analysis + +The following tables show comparisons between CPU, naive, work-efficient and thrust implementations with arrays of up to approximately 1 million elements. + +![](img/power2_table.png) + +![](img/non_power2_table.png) + +![](img/power2_chart.png) + +![](img/non_power2_chart.png) + +#### CPU vs Naive vs Work-Efficient + +Surprisingly, the CPU implementation is much faster than all the other implementations. The work efficient implementation should be faster but this isn't the case mainly because in both the down sweep and up sweep kernels most of the threads are not being occupied because the thread indexes are skipped by powers of 2 depending on the depth. Therefore, the threads aren't being utilized efficiently causing it to be much slower than the CPU implementation. + +#### Power of 2 vs Non Power of 2 Array Sizes + +The performance between power of 2 array sizes and non power of 2 array sizes is very similar in all implementations except with thrust. When tested with power of 2 array size the thrust implementation is significantly slower than non power of 2 array size. + +### Program Output + +``` +**************** +** SCAN TESTS ** +**************** + [ 48 5 46 40 32 43 10 49 32 34 3 41 49 ... 38 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.00076ms (std::chrono Measured) + [ 0 48 53 99 139 171 214 224 273 305 339 342 383 ... 6200 6238 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.000761ms (std::chrono Measured) + [ 0 48 53 99 139 171 214 224 273 305 339 342 383 ... 6132 6147 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.037632ms (CUDA Measured) + [ 0 48 53 99 139 171 214 224 273 305 339 342 383 ... 6200 6238 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.0376ms (CUDA Measured) + [ 0 48 53 99 139 171 214 224 273 305 339 342 383 ... 6132 6147 ] + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.142432ms (CUDA Measured) + [ 0 48 53 99 139 171 214 224 273 305 339 342 383 ... 6200 6238 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.14176ms (CUDA Measured) + [ 0 48 53 99 139 171 214 224 273 305 339 342 383 ... 6132 6147 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 2.66266ms (CUDA Measured) + [ 0 48 53 99 139 171 214 224 273 305 339 342 383 ... 6200 6238 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.020128ms (CUDA Measured) + [ 0 48 53 99 139 171 214 224 273 305 339 342 383 ... 6132 6147 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 3 2 2 0 1 0 3 2 0 3 3 3 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.00076ms (std::chrono Measured) + [ 3 2 2 1 3 2 3 3 3 1 2 3 1 ... 3 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.001141ms (std::chrono Measured) + [ 3 2 2 1 3 2 3 3 3 1 2 3 1 ... 1 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.0019ms (std::chrono Measured) + [ 3 2 2 1 3 2 3 3 3 1 2 3 1 ... 3 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.519744ms (CUDA Measured) + [ 3 2 2 1 3 2 3 3 3 1 2 3 1 ... 3 2 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.769056ms (CUDA Measured) + [ 3 2 2 1 3 2 3 3 3 1 2 3 1 ... 1 3 ] + passed +Press any key to continue . . . + +``` diff --git a/img/non_power2_chart.png b/img/non_power2_chart.png new file mode 100644 index 0000000..fc253d9 Binary files /dev/null and b/img/non_power2_chart.png differ diff --git a/img/non_power2_table.png b/img/non_power2_table.png new file mode 100644 index 0000000..89896b1 Binary files /dev/null and b/img/non_power2_table.png differ diff --git a/img/power2_chart.png b/img/power2_chart.png new file mode 100644 index 0000000..1aaf198 Binary files /dev/null and b/img/power2_chart.png differ diff --git a/img/power2_table.png b/img/power2_table.png new file mode 100644 index 0000000..8264449 Binary files /dev/null and b/img/power2_table.png differ diff --git a/src/main.cpp b/src/main.cpp index 7305641..d2479be 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -27,6 +27,10 @@ int main(int argc, char* argv[]) { genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; + //for (int i = 0; i < SIZE; i++) { + // a[i] = i; + //} + printArray(SIZE, a, true); // initialize b using StreamCompaction::CPU::scan you implement @@ -42,49 +46,57 @@ int main(int argc, char* argv[]) { printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + //printArray(SIZE, a, true); + //printArray(SIZE, b, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + //printArray(NPOT, a, true); + //printArray(NPOT, b, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + //printArray(SIZE, a, true); + //printArray(SIZE, b, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + //printArray(NPOT, a, true); + //printArray(NPOT, b, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -129,14 +141,16 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + //printArray(count, a, true); + //printArray(count, b, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..c476985 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,9 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + bools[index] = idata[index] == 0 ? 0 : 1; } /** @@ -32,7 +34,10 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + if (bools[index] == 1) + odata[indices[index]] = idata[index]; } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..2ebfa8f 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,15 +1,16 @@ #include #include "cpu.h" -#include "common.h" +#include "common.h" namespace StreamCompaction { namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + bool compactTest = false; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** @@ -18,9 +19,11 @@ namespace StreamCompaction { * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); + if (!compactTest) timer().startCpuTimer(); + odata[0] = 0; + for (size_t k = 1; k < n; ++k) + odata[k] = odata[k - 1] + idata[k-1]; + if (!compactTest) timer().endCpuTimer(); } /** @@ -30,9 +33,12 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int odataIdx = 0; + for (size_t k = 0; k < n; ++k) + if (idata[k] != 0) + odata[odataIdx++] = idata[k]; timer().endCpuTimer(); - return -1; + return odataIdx; } /** @@ -41,10 +47,33 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + compactTest = true; + int* tdata = new int[n]; + int* sdata = new int[n]; + timer().startCpuTimer(); - // TODO + + for (size_t k = 0; k < n; ++k) + if (idata[k] != 0) + tdata[k] = 1; + else + tdata[k] = 0; + + scan(n, sdata, tdata); + + int sdataLastIdx = 0; + for (size_t k = 0; k < n; ++k) { + if (tdata[k] == 1) { + odata[sdata[k]] = idata[k]; + sdataLastIdx = sdata[k]; + } + } + timer().endCpuTimer(); - return -1; + + compactTest = false; + delete[] tdata, sdata; + return sdataLastIdx+1; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..6fbf4ca 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,22 +3,66 @@ #include "common.h" #include "efficient.h" +#define BLOCK_SIZE 256 + namespace StreamCompaction { namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + using namespace Common; + bool compactTest = false; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + __global__ void kernUpSweep(int *g_odata, int n, int offset, int offsetPlus1) { + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + if (index % offsetPlus1 == 0) + g_odata[index + offsetPlus1 - 1] += g_odata[index + offset - 1]; + } + + __global__ void kernDownSweep(int *g_odata, int n, int offset, int offsetPlus1) { + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + if (index % offsetPlus1 == 0) { + int t = g_odata[index + offset - 1]; + g_odata[index + offset - 1] = g_odata[index + offsetPlus1 - 1]; + g_odata[index + offsetPlus1 - 1] += t; + } } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + int arrayLength = 1 << ilog2ceil(n); + dim3 blocksPerGrid((arrayLength + BLOCK_SIZE - 1) / BLOCK_SIZE); + int* h_idata = new int[arrayLength]; + memset(h_idata, 0, arrayLength * sizeof(int)); + memcpy(h_idata, idata, n * sizeof(int)); + int *d_idata; + cudaMalloc((void**)&d_idata, arrayLength * sizeof(int)); + cudaMemcpy(d_idata, h_idata, arrayLength * sizeof(int), cudaMemcpyHostToDevice); + + if (!compactTest) timer().startGpuTimer(); + + for (int d = 0; d <= ilog2ceil(n) - 1; d++) + kernUpSweep << > > (d_idata, arrayLength, 1 << d, 1 << d + 1); + int zero = 0; + cudaMemcpy(&d_idata[arrayLength-1], &zero, sizeof(int), cudaMemcpyHostToDevice); + for (int d = ilog2ceil(n) - 1; d >= 0; d--) + kernDownSweep << > > (d_idata, arrayLength, 1 << d, 1 << d + 1); + + if (!compactTest) timer().endGpuTimer(); + + cudaMemcpy(odata, d_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(d_idata); + delete[] h_idata; } /** @@ -31,10 +75,37 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; + dim3 blocksPerGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE); + int* d_bools, *d_idata, *d_indicies, *d_odata; + int* h_indicies = new int[n]; + int* h_bools = new int[n]; + + cudaMalloc((void**)&d_bools, n * sizeof(int)); + cudaMalloc((void**)&d_idata, n * sizeof(int)); + cudaMalloc((void**)&d_indicies, n * sizeof(int)); + cudaMalloc((void**)&d_odata, n * sizeof(int)); + cudaMemcpy(d_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + compactTest = true; + timer().startGpuTimer(); + + kernMapToBoolean<<>>(n, d_bools, d_idata); + cudaMemcpy(h_bools, d_bools, n * sizeof(int), cudaMemcpyDeviceToHost); + scan(n, h_indicies, h_bools); + cudaMemcpy(d_indicies, h_indicies, n * sizeof(int), cudaMemcpyHostToDevice); + kernScatter<<>>(n, d_odata, d_idata, d_bools, d_indicies); + + timer().endGpuTimer(); + compactTest = false; + + cudaMemcpy(odata, d_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + int count = 0; + for (int i = 0; i < n; i++) + if (odata[i] != 0) + count++; + cudaFree(d_bools); cudaFree(d_idata); cudaFree(d_indicies); cudaFree(d_odata); + delete[] h_indicies, h_bools; + return count; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..fc2963e 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,23 +3,56 @@ #include "common.h" #include "naive.h" +#define BLOCK_SIZE 256 + namespace StreamCompaction { namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + __global__ void kernNaiveScan(int *g_odata, const int *g_idata, int n, int offset) { + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + if (index >= offset) + g_odata[index] = g_idata[index - offset] + g_idata[index]; + else + g_odata[index] = g_idata[index]; + } + + __global__ void kernExclusive(int *g_odata, const int *g_idata, int n) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + g_odata[index] = index == 0 ? 0 : g_idata[index - 1]; } - // TODO: __global__ /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + dim3 blocksPerGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE); + int *d_idata, *d_odata; + cudaMalloc((void**)&d_idata, n * sizeof(int)); + cudaMalloc((void**)&d_odata, n * sizeof(int)); + cudaMemcpy(d_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + for (int d = 1; d <= ilog2ceil(n); d++) { + kernNaiveScan<<>>(d_odata, d_idata, n, 1 << d - 1); + std::swap(d_idata, d_odata); + } + kernExclusive<<>>(d_odata, d_idata, n); + + timer().endGpuTimer(); + + cudaMemcpy(odata, d_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(d_idata); + cudaFree(d_odata); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..5c68d2c 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -8,21 +8,22 @@ namespace StreamCompaction { namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); + thrust::device_vector dv_in(idata, idata + n); + thrust::device_vector dv_out(odata, odata + n); + timer().startGpuTimer(); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + timer().endGpuTimer(); + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } }