67 changes: 62 additions & 5 deletions README.md
@@ -3,11 +3,68 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Xincheng Zhang
* Tested on: Windows 10, i7-4702HQ @ 2.20GHz 8GB, GTX 870M 3072MB (Personal Laptop)

### (TODO: Your README)
### Output Screenshots
-------------
Block size = 128, array size = 1 << 9
![](https://github.com/XinCastle/Project2-Stream-Compaction/blob/master/img/sc1.png)
![](https://github.com/XinCastle/Project2-Stream-Compaction/blob/master/img/sc2.png)
![](https://github.com/XinCastle/Project2-Stream-Compaction/blob/master/img/sc3.png)

### Description & Features
-------------
```
1: CPU scan and stream compaction
2: Naive GPU scan
3: Work-efficient GPU scan and stream compaction
4: Thrust scan
```
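
To ground the comparison, here is a minimal sketch of the CPU baseline (feature 1): an exclusive prefix sum and a scan-based compaction. The function names (`cpuExclusiveScan`, `cpuCompact`) are illustrative assumptions, not necessarily the project's actual API.

```
// Illustrative CPU baseline (assumed names, not the project's exact code):
// an exclusive scan and a scan-based stream compaction over int arrays.
#include <cstddef>
#include <vector>

std::vector<int> cpuExclusiveScan(const std::vector<int> &in) {
    std::vector<int> out(in.size(), 0);
    for (std::size_t i = 1; i < in.size(); ++i) {
        out[i] = out[i - 1] + in[i - 1];  // each entry is the sum of everything before it
    }
    return out;
}

std::vector<int> cpuCompact(const std::vector<int> &in) {
    // 1) map to booleans, 2) exclusive-scan the booleans, 3) scatter the non-zeros
    std::vector<int> bools(in.size());
    for (std::size_t i = 0; i < in.size(); ++i) {
        bools[i] = (in[i] != 0) ? 1 : 0;
    }
    std::vector<int> indices = cpuExclusiveScan(bools);
    int count = in.empty() ? 0 : indices.back() + bools.back();
    std::vector<int> out(count);
    for (std::size_t i = 0; i < in.size(); ++i) {
        if (bools[i]) {
            out[indices[i]] = in[i];
        }
    }
    return out;
}
```

The GPU versions follow the same map → scan → scatter structure, with each stage expressed as a kernel (see `kernMapToBoolean` and `kernScatter` in `common.cu` below).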

### Blocksize Optimization
-------------
With the array size held constant at 1 << 9, the performance of each method changes with the block size, so I varied the block size to find the best value for each implementation.
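
For context, the block size only enters through the launch configuration: the grid is sized so that every element gets a thread. The snippet below is a self-contained illustration; the kernel body mirrors `kernMapToBoolean` from `common.cu`, while the host code and buffer names are assumptions rather than the project's test harness.

```
// Illustration of how blockSize determines the grid size for a kernel launch.
#include <cuda_runtime.h>

#define blockSize 128  // the knob being swept in the table below

__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
    int index = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (index < n) {
        bools[index] = (idata[index] == 0) ? 0 : 1;
    }
}

int main() {
    const int n = 1 << 9;
    int *dev_idata = nullptr, *dev_bools = nullptr;
    cudaMalloc(&dev_idata, n * sizeof(int));
    cudaMalloc(&dev_bools, n * sizeof(int));
    cudaMemset(dev_idata, 0, n * sizeof(int));

    // Round up so the last, partially filled block still covers the tail of the array.
    dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
    kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(n, dev_bools, dev_idata);
    cudaDeviceSynchronize();

    cudaFree(dev_idata);
    cudaFree(dev_bools);
    return 0;
}
```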

![](https://github.com/XinCastle/Project2-Stream-Compaction/blob/master/img/chart1.png)

**Test data for the chart above (times in ms):**
-------------
Block Size | Naive Scan | Efficient Scan | Thrust Scan | CPU Scan
---|---|---|---|---
32 | 0.3818 | 0.1598 |1.0674 |0.0019
64 | 0.0389 | 0.1575 |1.0808 |0.0018
128 | 0.0382 | 0.1373 |1.0888 |0.0019
256 | 0.0387 | 0.1542 |1.0669 |0.0018
512 | 0.0428 | 0.1398 |1.0899 |0.0019
1024 | 0.043 | 0.1532 |1.0523 |0.0018

From the data and the chart above, the block size has no effect on the CPU scan (it launches no kernels). The naive scan performs best at a block size of 128, the work-efficient scan also at 128, and the Thrust scan at 1024.


### Performance Comparison Based on Array Size
-------------
Array Size | Naive Scan (ms) | Efficient Scan (ms) | Thrust Scan (ms) | CPU Scan (ms)
---|---|---|---|---
2^8 | 0.3546 | 0.127 |1.0821 |0.0014
2^12 | 0.0531 | 0.1795 |2.3651 |0.1398
2^16 | 0.2922 | 0.6992 |8.2656 |0.265
2^20 | 3.3498 | 7.4632 |40.6058 |3.2167
2^24 | 61.6843 | 130.091 |556.343 |53.3077

The corresponding chart:
![](https://github.com/XinCastle/Project2-Stream-Compaction/blob/master/img/chart2.png)


### Questions
-------------
* Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU.
Answer: in the "blocksize optimization" above.

* Compare all of these GPU Scan implementations
Answer: in the "Performance Comparison Based on Array Size" above. I guess that thrust scan uses shared memory.

* Write a brief explanation of the phenomena you see here.
Answer: I think the GPU implementations are slower than the CPU scan because, at any given step, many of the launched threads do no useful work, so they are not efficient enough to beat the sequential CPU scan. Moreover, I think I/O is another bottleneck, because my code performs many memory-copy operations.
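
To make the idle-thread point concrete, here is a sketch of one level of a naive (Hillis-Steele style) scan; the kernel name and buffers are illustrative, not the repository's exact code. At offset `d`, every thread with index below `d` only copies its value, and the whole array is read from and written to global memory at every one of the `ilog2ceil(n)` levels, which is where the memory-traffic cost comes from.

```
// One ping-pong level of a naive scan (illustrative). The host launches this
// ilog2ceil(n) times with offset = 1, 2, 4, ..., swapping idata and odata.
__global__ void kernNaiveScanStep(int n, int offset, int *odata, const int *idata) {
    int index = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (index >= n) {
        return;
    }
    if (index >= offset) {
        odata[index] = idata[index - offset] + idata[index];  // useful addition
    } else {
        odata[index] = idata[index];  // this thread only copies its value
    }
}
```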


Binary file added img/chart1.png
Binary file added img/chart2.png
Binary file added img/data1.png
Binary file added img/data2.png
Binary file added img/sc1.png
Binary file added img/sc2.png
Binary file added img/sc3.png
2 changes: 1 addition & 1 deletion src/main.cpp
@@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 24; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
20 changes: 20 additions & 0 deletions stream_compaction/common.cu
@@ -24,6 +24,11 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
    // Map each element to 1 if it should be kept (non-zero), 0 otherwise.
    int index = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (index < n)
    {
        bools[index] = (idata[index] == 0) ? 0 : 1;
    }
}

/**
@@ -33,6 +38,21 @@
__global__ void kernScatter(int n, int *odata,
        const int *idata, const int *bools, const int *indices) {
    // Scatter each kept element to the slot given by the exclusive scan of bools.
    int index = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (index >= n)
    {
        return;
    }

    // An element is kept when its scan index differs from the next one; the
    // last element has no successor, so check its boolean flag directly.
    if (index == n - 1)
    {
        if (bools[index] == 1)
        {
            odata[indices[index]] = idata[index];
        }
    }
    else if (indices[index] != indices[index + 1])
    {
        odata[indices[index]] = idata[index];
    }
}

}
220 changes: 113 additions & 107 deletions stream_compaction/common.h
@@ -10,123 +10,129 @@
#include <chrono>
#include <stdexcept>


#include <device_launch_parameters.h>


#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)

#define blockSize 1024

/**
 * Check for CUDA errors; print and exit if there was a problem.
 */
void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);

inline int ilog2(int x) {
    int lg = 0;
    while (x >>= 1) {
        ++lg;
    }
    return lg;
}

inline int ilog2ceil(int x) {
    // ceil(log2(1)) is 0, so handle x == 1 explicitly.
    return x == 1 ? 0 : ilog2(x - 1) + 1;
}

namespace StreamCompaction {
    namespace Common {
        __global__ void kernMapToBoolean(int n, int *bools, const int *idata);

        __global__ void kernScatter(int n, int *odata,
                const int *idata, const int *bools, const int *indices);

        /**
         * This class is used for timing the performance
         * Uncopyable and unmovable
         *
         * Adapted from WindyDarian(https://github.com/WindyDarian)
         */
        class PerformanceTimer
        {
        public:
            PerformanceTimer()
            {
                cudaEventCreate(&event_start);
                cudaEventCreate(&event_end);
            }

            ~PerformanceTimer()
            {
                cudaEventDestroy(event_start);
                cudaEventDestroy(event_end);
            }

            void startCpuTimer()
            {
                if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
                cpu_timer_started = true;

                time_start_cpu = std::chrono::high_resolution_clock::now();
            }

            void endCpuTimer()
            {
                time_end_cpu = std::chrono::high_resolution_clock::now();

                if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }

                std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
                prev_elapsed_time_cpu_milliseconds =
                    static_cast<decltype(prev_elapsed_time_cpu_milliseconds)>(duro.count());

                cpu_timer_started = false;
            }

            void startGpuTimer()
            {
                if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
                gpu_timer_started = true;

                cudaEventRecord(event_start);
            }

            void endGpuTimer()
            {
                cudaEventRecord(event_end);
                cudaEventSynchronize(event_end);

                if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }

                cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
                gpu_timer_started = false;
            }

            float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015
            {
                return prev_elapsed_time_cpu_milliseconds;
            }

            float getGpuElapsedTimeForPreviousOperation() //noexcept
            {
                return prev_elapsed_time_gpu_milliseconds;
            }

            // remove copy and move functions
            PerformanceTimer(const PerformanceTimer&) = delete;
            PerformanceTimer(PerformanceTimer&&) = delete;
            PerformanceTimer& operator=(const PerformanceTimer&) = delete;
            PerformanceTimer& operator=(PerformanceTimer&&) = delete;

        private:
            cudaEvent_t event_start = nullptr;
            cudaEvent_t event_end = nullptr;

            using time_point_t = std::chrono::high_resolution_clock::time_point;
            time_point_t time_start_cpu;
            time_point_t time_end_cpu;

            bool cpu_timer_started = false;
            bool gpu_timer_started = false;

            float prev_elapsed_time_cpu_milliseconds = 0.f;
            float prev_elapsed_time_gpu_milliseconds = 0.f;
        };
    }
}