Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 87 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,93 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Name: Zhan Xiong Chin
* Tested on: Windows 7 Professional, Intel(R) Xeon(R) CPU E5-1630 v4 @ 3.70 GHz 3.70 GHz, GTX 1070 8192MB (SIG Lab)

### (TODO: Your README)
Overview
========
This implements a GPU-based scan (i.e. computes the prefix sums of an array) and stream compaction (i.e.
moves nonzero elements to the front of array). There are versions for a CPU-based scan, a naive GPU-based scan
(uses O(nlogn) additions) and a work-efficient GPU-based scan (uses O(n) additions).

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
Build Instructions
==================
[See here](https://github.com/CIS565-Fall-2016/Project0-CUDA-Getting-Started/blob/master/INSTRUCTION.md)

Performance analysis
====================
Milliseconds to calculate prefix sums for arrays of given sizes:

![](images/milliseconds.png)

Log2 of nanoseconds for same data:

![](images/log_nanoseconds.png)

The timings above did not include the time needed to copy arrays onto device.

For small arrays, the naive algorithm appears to be the fastest, though this may be due to the inability to
effectively time CPU execution for times smaller than 1 millisecond. The CPU algorithm still beats out the
naive algorithm for most of the timings, suggesting that the extra logn additions needed is slowing down the execution
of the naive algorithm significantly.

The efficient algorithm is approximately 3 times faster than the naive algorithm and twice as fast as the CPU algorithm.
This is in line with the O(n) operations needed by both of them; the GPU algorithm is making better use of its
multiple cores to achieve this speedup.

Compared to the thrust-based implementation, all algorithms beat it for small arrays, but it is significantly faster
than all other algorithms for large arrays. Based on the large number of registers used and the small grid size, it
may be using the work-efficient algorithm, but with a larger base than 2 (e.g. ternary or quarternary tree).

Test output
===========
```
S:\cis565\Project2-Stream-Compaction\build>Release\cis565_stream_compaction_test
.exe

****************
** SCAN TESTS **
****************
[ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 10 0 ]
==== cpu scan, power-of-two ====
[ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 205473618 20547362
8 ]
==== cpu scan, non-power-of-two ====
[ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 205473252 20547325
5 ]
passed
==== naive scan, power-of-two ====
[ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 205473618 20547362
8 ]
passed
==== naive scan, non-power-of-two ====
passed
==== work-efficient scan, power-of-two ====
passed
==== work-efficient scan, non-power-of-two ====
passed
==== thrust scan, power-of-two ====
passed
==== thrust scan, non-power-of-two ====
passed

*****************************
** STREAM COMPACTION TESTS **
*****************************
[ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ]
==== cpu compact without scan, power-of-two ====
[ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ]
passed
==== cpu compact without scan, non-power-of-two ====
[ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 3 ]
passed
==== cpu compact with scan ====
[ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ]
passed
==== work-efficient compact, power-of-two ====
passed
==== work-efficient compact, non-power-of-two ====
passed

S:\cis565\Project2-Stream-Compaction\build>
```
Binary file added images/log_nanoseconds.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/milliseconds.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
263 changes: 157 additions & 106 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,118 +6,169 @@
* @copyright University of Pennsylvania
*/

#include <chrono>
#include <cstdio>
#include <stream_compaction/cpu.h>
#include <stream_compaction/naive.h>
#include <stream_compaction/efficient.h>
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

#define TIMING 1

void comparePerformance(const int SIZE, const int RUNS) {
printf("Timing performance with arrays of size %d, averaged over %d runs\n", SIZE, RUNS);
int * a = new int[SIZE];
int * b = new int[SIZE];
genArray(SIZE, a, 2);

for (int i = 0; i < RUNS; i++) {
std::chrono::time_point<std::chrono::high_resolution_clock> start, end;
start = std::chrono::high_resolution_clock::now();

StreamCompaction::CPU::scan(SIZE, b, a);

end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsedSeconds = end - start;
printf("CPU scan: %lf milliseconds\n", elapsedSeconds.count() * 1000.0f);

StreamCompaction::Naive::scan(SIZE, b, a);

StreamCompaction::Efficient::scan(SIZE, b, a);

StreamCompaction::Thrust::scan(SIZE, b, a);
}

delete a;
delete b;
}

void runTests() {
const int SIZE = 1 << 23;
const int NPOT = SIZE - 17;
int * a = new int[SIZE];
int * b = new int[SIZE];
int * c = new int[SIZE];

// Scan tests

printf("\n");
printf("****************\n");
printf("** SCAN TESTS **\n");
printf("****************\n");

genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;
printArray(SIZE, a, true);

zeroArray(SIZE, b);
printDesc("cpu scan, power-of-two");
StreamCompaction::CPU::scan(SIZE, b, a);
printArray(SIZE, b, true);

zeroArray(SIZE, c);
printDesc("cpu scan, non-power-of-two");
StreamCompaction::CPU::scan(NPOT, c, a);
printArray(NPOT, b, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("naive scan, non-power-of-two");
StreamCompaction::Naive::scan(NPOT, c, a);
//printArray(SIZE, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, non-power-of-two");
StreamCompaction::Thrust::scan(NPOT, c, a);
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

printf("\n");
printf("*****************************\n");
printf("** STREAM COMPACTION TESTS **\n");
printf("*****************************\n");

// Compaction tests

genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;
printArray(SIZE, a, true);

int count, expectedCount, expectedNPOT;

zeroArray(SIZE, b);
printDesc("cpu compact without scan, power-of-two");
count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
expectedCount = count;
printArray(count, b, true);
printCmpLenResult(count, expectedCount, b, b);

zeroArray(SIZE, c);
printDesc("cpu compact without scan, non-power-of-two");
count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
expectedNPOT = count;
printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

zeroArray(SIZE, c);
printDesc("cpu compact with scan");
count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
//printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

delete a;
delete b;
delete c;
}


int main(int argc, char* argv[]) {
const int SIZE = 1 << 8;
const int NPOT = SIZE - 3;
int a[SIZE], b[SIZE], c[SIZE];

// Scan tests

printf("\n");
printf("****************\n");
printf("** SCAN TESTS **\n");
printf("****************\n");

genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;
printArray(SIZE, a, true);

zeroArray(SIZE, b);
printDesc("cpu scan, power-of-two");
StreamCompaction::CPU::scan(SIZE, b, a);
printArray(SIZE, b, true);

zeroArray(SIZE, c);
printDesc("cpu scan, non-power-of-two");
StreamCompaction::CPU::scan(NPOT, c, a);
printArray(NPOT, b, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("naive scan, non-power-of-two");
StreamCompaction::Naive::scan(NPOT, c, a);
//printArray(SIZE, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, non-power-of-two");
StreamCompaction::Thrust::scan(NPOT, c, a);
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

printf("\n");
printf("*****************************\n");
printf("** STREAM COMPACTION TESTS **\n");
printf("*****************************\n");

// Compaction tests

genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;
printArray(SIZE, a, true);

int count, expectedCount, expectedNPOT;

zeroArray(SIZE, b);
printDesc("cpu compact without scan, power-of-two");
count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
expectedCount = count;
printArray(count, b, true);
printCmpLenResult(count, expectedCount, b, b);

zeroArray(SIZE, c);
printDesc("cpu compact without scan, non-power-of-two");
count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
expectedNPOT = count;
printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

zeroArray(SIZE, c);
printDesc("cpu compact with scan");
count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
//printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);
#if TIMING == 1
comparePerformance(1 << 10, 3);
comparePerformance(1 << 13, 3);
comparePerformance(1 << 16, 3);
comparePerformance(1 << 19, 3);
comparePerformance(1 << 22, 3);
comparePerformance(1 << 25, 3);
comparePerformance(1 << 28, 3);
#else
runTests();
#endif
}
2 changes: 1 addition & 1 deletion stream_compaction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ set(SOURCE_FILES

cuda_add_library(stream_compaction
${SOURCE_FILES}
OPTIONS -arch=sm_20
OPTIONS -arch=sm_50
)
Loading