CIS565-Fall-2016 · czxcjx · Sep 19, 2016 · Sep 21, 2016 · Sep 27, 2016 · Sep 27, 2016
diff --git a/README.md b/README.md
@@ -3,11 +3,93 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Name: Zhan Xiong Chin
+* Tested on: Windows 7 Professional, Intel(R) Xeon(R) CPU E5-1630 v4 @ 3.70 GHz 3.70 GHz, GTX 1070 8192MB (SIG Lab)
 
-### (TODO: Your README)
+Overview
+========
+This implements a GPU-based scan (i.e. computes the prefix sums of an array) and stream compaction (i.e. 
+moves nonzero elements to the front of the array). There are versions for a CPU-based scan, a naive GPU-based scan 
+(uses O(n log n) additions) and a work-efficient GPU-based scan (uses O(n) additions).
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+Build Instructions
+==================
+[See here](https://github.com/CIS565-Fall-2016/Project0-CUDA-Getting-Started/blob/master/INSTRUCTION.md)
 
+Performance analysis
+====================
+Milliseconds to calculate prefix sums for arrays of given sizes:
+
+![](images/milliseconds.png)
+
+Log2 of nanoseconds for same data:
+
+![](images/log_nanoseconds.png)
+
+The timings above did not include the time needed to copy arrays onto the device. 
+
+For small arrays, the naive algorithm appears to be the fastest, though this may be due to the inability to
+effectively time CPU execution for times smaller than 1 millisecond. The CPU algorithm still beats out the
+naive algorithm for most of the timings, suggesting that the extra factor of log n additions is slowing down the
+execution of the naive algorithm significantly.
+
+The efficient algorithm is approximately 3 times faster than the naive algorithm and twice as fast as the CPU algorithm.
+This is in line with the O(n) operations needed by both of them; the GPU algorithm is making better use of its 
+multiple cores to achieve this speedup.
+
+Compared to the thrust-based implementation, all algorithms beat it for small arrays, but it is significantly faster
+than all other algorithms for large arrays. Based on the large number of registers used and the small grid size, it
+may be using the work-efficient algorithm, but with a larger base than 2 (e.g. ternary or quaternary tree). 
+
+Test output
+===========
+```
+S:\cis565\Project2-Stream-Compaction\build>Release\cis565_stream_compaction_test
+.exe
+
+****************
+** SCAN TESTS **
+****************
+    [  38  19  38  37   5  47  15  35   0  12   3   0  42 ...  10   0 ]
+==== cpu scan, power-of-two ====
+    [   0  38  57  95 132 137 184 199 234 234 246 249 249 ... 205473618 20547362
+8 ]
+==== cpu scan, non-power-of-two ====
+    [   0  38  57  95 132 137 184 199 234 234 246 249 249 ... 205473252 20547325
+5 ]
+    passed
+==== naive scan, power-of-two ====
+    [   0  38  57  95 132 137 184 199 234 234 246 249 249 ... 205473618 20547362
+8 ]
+    passed
+==== naive scan, non-power-of-two ====
+    passed
+==== work-efficient scan, power-of-two ====
+    passed
+==== work-efficient scan, non-power-of-two ====
+    passed
+==== thrust scan, power-of-two ====
+    passed
+==== thrust scan, non-power-of-two ====
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   3   2   1   3   1   1   1   2   0   1   0   2 ...   0   0 ]
+==== cpu compact without scan, power-of-two ====
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   2   2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   1   3 ]
+    passed
+==== cpu compact with scan ====
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   2   2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+    passed
+==== work-efficient compact, non-power-of-two ====
+    passed
+
+S:\cis565\Project2-Stream-Compaction\build>
+```
diff --git a/images/log_nanoseconds.png b/images/log_nanoseconds.png
diff --git a/images/milliseconds.png b/images/milliseconds.png
diff --git a/src/main.cpp b/src/main.cpp
@@ -6,118 +6,169 @@
  * @copyright University of Pennsylvania
  */
 
+#include <chrono>
 #include <cstdio>
 #include <stream_compaction/cpu.h>
 #include <stream_compaction/naive.h>
 #include <stream_compaction/efficient.h>
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
+#define TIMING 1
+// Runs every scan implementation RUNS times on a SIZE-element array filled by genArray; only the CPU scan is timed and printed here.
+void comparePerformance(const int SIZE, const int RUNS) {
+	printf("Timing performance with arrays of size %d, averaged over %d runs\n", SIZE, RUNS);
+	int * a = new int[SIZE];
+	int * b = new int[SIZE];
+	genArray(SIZE, a, 2);
+	// NOTE: each run's CPU time is printed individually below; this function does not compute an average itself.
+	for (int i = 0; i < RUNS; i++) {
+		std::chrono::time_point<std::chrono::high_resolution_clock> start, end;
+		start = std::chrono::high_resolution_clock::now();
+
+		StreamCompaction::CPU::scan(SIZE, b, a);
+
+		end = std::chrono::high_resolution_clock::now();
+		std::chrono::duration<double> elapsedSeconds = end - start;
+		printf("CPU scan: %lf milliseconds\n", elapsedSeconds.count() * 1000.0f);
+		// The GPU scans get no host-side timing here; presumably they report their own timings internally (TODO confirm in stream_compaction/).
+		StreamCompaction::Naive::scan(SIZE, b, a);
+
+		StreamCompaction::Efficient::scan(SIZE, b, a);
+
+		StreamCompaction::Thrust::scan(SIZE, b, a);
+	}
+	// Buffers were allocated with new[], so they must be released with delete[] (scalar delete is undefined behaviour).
+	delete[] a;
+	delete[] b;
+}
+// Runs the scan and stream-compaction checks: CPU results are the reference that the naive, work-efficient and thrust results are compared against.
+void runTests() {
+	const int SIZE = 1 << 23;
+	const int NPOT = SIZE - 17;
+	int * a = new int[SIZE];
+	int * b = new int[SIZE];
+	int * c = new int[SIZE];
+
+	// Scan tests
+
+	printf("\n");
+	printf("****************\n");
+	printf("** SCAN TESTS **\n");
+	printf("****************\n");
+
+	genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
+	a[SIZE - 1] = 0;
+	printArray(SIZE, a, true);
+
+	zeroArray(SIZE, b);
+	printDesc("cpu scan, power-of-two");
+	StreamCompaction::CPU::scan(SIZE, b, a);
+	printArray(SIZE, b, true);
+
+	zeroArray(SIZE, c);
+	printDesc("cpu scan, non-power-of-two");
+	StreamCompaction::CPU::scan(NPOT, c, a);
+	printArray(NPOT, c, true);  // show the NPOT result under test, not the power-of-two reference in b
+	printCmpResult(NPOT, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("naive scan, power-of-two");
+	StreamCompaction::Naive::scan(SIZE, c, a);
+	printArray(SIZE, c, true);
+	printCmpResult(SIZE, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("naive scan, non-power-of-two");
+	StreamCompaction::Naive::scan(NPOT, c, a);
+	//printArray(SIZE, c, true);
+	printCmpResult(NPOT, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("work-efficient scan, power-of-two");
+	StreamCompaction::Efficient::scan(SIZE, c, a);
+	//printArray(SIZE, c, true);
+	printCmpResult(SIZE, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("work-efficient scan, non-power-of-two");
+	StreamCompaction::Efficient::scan(NPOT, c, a);
+	//printArray(NPOT, c, true);
+	printCmpResult(NPOT, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("thrust scan, power-of-two");
+	StreamCompaction::Thrust::scan(SIZE, c, a);
+	//printArray(SIZE, c, true);
+	printCmpResult(SIZE, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("thrust scan, non-power-of-two");
+	StreamCompaction::Thrust::scan(NPOT, c, a);
+	//printArray(NPOT, c, true);
+	printCmpResult(NPOT, b, c);
+
+	printf("\n");
+	printf("*****************************\n");
+	printf("** STREAM COMPACTION TESTS **\n");
+	printf("*****************************\n");
+
+	// Compaction tests
+
+	genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
+	a[SIZE - 1] = 0;
+	printArray(SIZE, a, true);
+
+	int count, expectedCount, expectedNPOT;
+
+	zeroArray(SIZE, b);
+	printDesc("cpu compact without scan, power-of-two");
+	count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+	expectedCount = count;
+	printArray(count, b, true);
+	printCmpLenResult(count, expectedCount, b, b);
+
+	zeroArray(SIZE, c);
+	printDesc("cpu compact without scan, non-power-of-two");
+	count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
+	expectedNPOT = count;
+	printArray(count, c, true);
+	printCmpLenResult(count, expectedNPOT, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("cpu compact with scan");
+	count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+	printArray(count, c, true);
+	printCmpLenResult(count, expectedCount, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("work-efficient compact, power-of-two");
+	count = StreamCompaction::Efficient::compact(SIZE, c, a);
+	//printArray(count, c, true);
+	printCmpLenResult(count, expectedCount, b, c);
+
+	zeroArray(SIZE, c);
+	printDesc("work-efficient compact, non-power-of-two");
+	count = StreamCompaction::Efficient::compact(NPOT, c, a);
+	//printArray(count, c, true);
+	printCmpLenResult(count, expectedNPOT, b, c);
+
+	delete[] a;  // allocated with new[], so release with delete[]
+	delete[] b;
+	delete[] c;
+}
+
+
 int main(int argc, char* argv[]) {
-    const int SIZE = 1 << 8;
-    const int NPOT = SIZE - 3;
-    int a[SIZE], b[SIZE], c[SIZE];
-
-    // Scan tests
-
-    printf("\n");
-    printf("****************\n");
-    printf("** SCAN TESTS **\n");
-    printf("****************\n");
-
-    genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    zeroArray(SIZE, b);
-    printDesc("cpu scan, power-of-two");
-    StreamCompaction::CPU::scan(SIZE, b, a);
-    printArray(SIZE, b, true);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu scan, non-power-of-two");
-    StreamCompaction::CPU::scan(NPOT, c, a);
-    printArray(NPOT, b, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, power-of-two");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, non-power-of-two");
-    StreamCompaction::Naive::scan(NPOT, c, a);
-    //printArray(SIZE, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, power-of-two");
-    StreamCompaction::Efficient::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, non-power-of-two");
-    StreamCompaction::Efficient::scan(NPOT, c, a);
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, power-of-two");
-    StreamCompaction::Thrust::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, non-power-of-two");
-    StreamCompaction::Thrust::scan(NPOT, c, a);
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    printf("\n");
-    printf("*****************************\n");
-    printf("** STREAM COMPACTION TESTS **\n");
-    printf("*****************************\n");
-
-    // Compaction tests
-
-    genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    int count, expectedCount, expectedNPOT;
-
-    zeroArray(SIZE, b);
-    printDesc("cpu compact without scan, power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
-    expectedCount = count;
-    printArray(count, b, true);
-    printCmpLenResult(count, expectedCount, b, b);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact without scan, non-power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
-    expectedNPOT = count;
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact with scan");
-    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, power-of-two");
-    count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, non-power-of-two");
-    count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
+#if TIMING == 1  // benchmark mode: array sizes 2^10 through 2^28, 3 runs each
+	comparePerformance(1 << 10, 3);
+	comparePerformance(1 << 13, 3);
+	comparePerformance(1 << 16, 3);
+	comparePerformance(1 << 19, 3);
+	comparePerformance(1 << 22, 3);
+	comparePerformance(1 << 25, 3);
+	comparePerformance(1 << 28, 3);
+#else  // TIMING != 1: run the scan / compaction checks instead
+	runTests();
+#endif  // TIMING == 1
 }
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_50
     )