73 changes: 67 additions & 6 deletions README.md
@@ -3,12 +3,73 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Li Zheng
* [LinkedIn](https://www.linkedin.com/in/li-zheng-1955ba169)
* Tested on: Windows 10, CUDA 10, i5-3600 @ 3.59GHz 16GB, RTX 2060 6GB (personal computer)

### (TODO: Your README)
This project implements several versions of scan (prefix sum): a CPU scan, a naive GPU scan, a work-efficient GPU scan, and a scan using Thrust. Some of these methods are then used to implement stream compaction. A timer measures the elapsed time of each implementation so their performance can be compared.
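
As a quick, illustrative example (not taken from the project code), an exclusive scan and a stream compaction of a small array behave as follows; the array contents here are arbitrary:

```cpp
#include <cstdio>

// Illustration only: what exclusive scan and stream compaction compute
// on a tiny input. The project performs the same operations on the GPU.
int main() {
    const int n = 8;
    int in[n] = { 3, 1, 7, 0, 4, 1, 6, 3 };

    // Exclusive prefix sum: scan[k] = in[0] + ... + in[k - 1], scan[0] = 0.
    int scan[n];
    scan[0] = 0;
    for (int k = 1; k < n; ++k) scan[k] = scan[k - 1] + in[k - 1];
    // scan -> 0 3 4 11 11 15 16 22

    // Stream compaction: keep only the non-zero elements, preserving order.
    int out[n], count = 0;
    for (int i = 0; i < n; ++i)
        if (in[i] != 0) out[count++] = in[i];
    // out -> 3 1 7 4 1 6 3   (count = 7)

    for (int k = 0; k < n; ++k) printf("%d ", scan[k]);
    printf("\n");
    for (int i = 0; i < count; ++i) printf("%d ", out[i]);
    printf("\n");
    return 0;
}
```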

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
### Performance Analysis
![blockSize](img/blockSize.PNG)
This diagram shows how the measured time changes with respect to block size. Block sizes of 128 and 256 give relatively good performance.
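
For reference, the block size only enters through the kernel launch configuration. A minimal sketch of the pattern used throughout this change (the `blockSize` constant itself is defined in `common.h`; the kernel here is a placeholder, not one of the project's kernels):

```cuda
#include <cuda_runtime.h>

#define blockSize 256  // compile-time constant, as in common.h

// Placeholder kernel: the guard handles the extra threads in the last block.
__global__ void kernExample(int n, int *data) {
    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (index >= n) {
        return;
    }
    data[index] += 1;
}

void launchExample(int n, int *dev_data) {
    // Round up so every element gets a thread; changing blockSize only
    // changes how those threads are grouped into blocks.
    dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
    kernExample<<<fullBlocksPerGrid, blockSize>>>(n, dev_data);
}
```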

![powerOfTwo](img/powerOfTwo.PNG)
![nonePowerOfTwo](img/nonPowerOfTwo.PNG)
The diagrams show how the measured time changes as the array size increases; the first is for power-of-two array sizes and the second for non-power-of-two sizes. Their performance is almost the same. When the array size is small, the CPU implementation performs better. I think this is because, at the deeper levels of the GPU scans, most threads do no actual work and the kernels mostly shuffle values in device memory. Additionally, the GPU implementations use bit shifts to compute the offsets and intervals of each level, which takes extra time. As the array size increases, the GPU implementations perform better, especially the work-efficient and Thrust versions.
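
To make the first point concrete: in the work-efficient kernels below, a thread at depth `d` only does work when `(index + 1)` is a multiple of `2^(d+1)`, so the number of active threads halves at every level while a full grid is still launched. A small illustration (the array size here is just an example):

```cpp
#include <cstdio>

// Illustration only: how many threads do useful work at each up-sweep depth
// for a power-of-two array of N elements, even though N threads are launched.
int main() {
    const int N = 1 << 16;
    for (int d = 0; (1 << (d + 1)) <= N; ++d) {
        printf("depth %2d: %6d of %d threads do work\n", d, N >> (d + 1), N);
    }
    return 0;
}
```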

### Output of The Test Program
Here is the test output for an array of 2^16 elements with a block size of 128. More results are in img/performance analysis.xlsx.
```
****************
** SCAN TESTS **
****************
[ 49 29 24 15 46 49 46 8 35 40 38 18 44 ... 3 0 ]
==== cpu scan, power-of-two ====
elapsed time: 0.1305ms (std::chrono Measured)
[ 0 49 78 102 117 163 212 258 266 301 341 379 397 ... 1603889 1603892 ]
==== cpu scan, non-power-of-two ====
elapsed time: 0.129ms (std::chrono Measured)
[ 0 49 78 102 117 163 212 258 266 301 341 379 397 ... 1603839 1603856 ]
passed
==== naive scan, power-of-two ====
elapsed time: 0.052416ms (CUDA Measured)
passed
==== naive scan, non-power-of-two ====
elapsed time: 0.050752ms (CUDA Measured)
passed
==== work-efficient scan, power-of-two ====
elapsed time: 0.11264ms (CUDA Measured)
passed
==== work-efficient scan, non-power-of-two ====
elapsed time: 0.113344ms (CUDA Measured)
passed
==== thrust scan, power-of-two ====
elapsed time: 0.073344ms (CUDA Measured)
passed
==== thrust scan, non-power-of-two ====
elapsed time: 0.055296ms (CUDA Measured)
passed

*****************************
** STREAM COMPACTION TESTS **
*****************************
[ 3 3 2 1 0 1 2 2 3 0 2 2 0 ... 3 0 ]
==== cpu compact without scan, power-of-two ====
elapsed time: 0.1239ms (std::chrono Measured)
[ 3 3 2 1 1 2 2 3 2 2 1 2 2 ... 3 3 ]
passed
==== cpu compact without scan, non-power-of-two ====
elapsed time: 0.1241ms (std::chrono Measured)
[ 3 3 2 1 1 2 2 3 2 2 1 2 2 ... 3 2 ]
passed
==== cpu compact with scan ====
elapsed time: 0.3149ms (std::chrono Measured)
[ 3 3 2 1 1 2 2 3 2 2 1 2 2 ... 3 3 ]
passed
==== work-efficient compact, power-of-two ====
elapsed time: 0.124928ms (CUDA Measured)
passed
==== work-efficient compact, non-power-of-two ====
elapsed time: 0.198656ms (CUDA Measured)
passed
```
Binary file added img/blockSize.PNG
Binary file added img/nonPowerOfTwo.PNG
Binary file added img/performance analysis.xlsx
Binary file added img/powerOfTwo.PNG
2 changes: 1 addition & 1 deletion src/main.cpp
@@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 16; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
17 changes: 17 additions & 0 deletions stream_compaction/common.cu
@@ -24,6 +24,16 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// Map each element to a boolean: 1 if it is non-zero (kept by compaction), 0 otherwise.
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index >= n) {
return;
}
if (idata[index] == 0) {
bools[index] = 0;
}
else {
bools[index] = 1;
}
}

/**
@@ -33,6 +43,13 @@
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// Scatter: if an element is kept, write it to its scanned index in odata.
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index >= n) {
return;
}
if (bools[index] == 1) {
odata[indices[index]] = idata[index];
}
}

}
1 change: 1 addition & 0 deletions stream_compaction/common.h
@@ -12,6 +12,7 @@

#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
#define blockSize 256

/**
* Check for CUDA errors; print and exit if there was a problem.
112 changes: 72 additions & 40 deletions stream_compaction/cpu.cu
@@ -4,47 +4,79 @@
#include "common.h"

namespace StreamCompaction {
namespace CPU {
using StreamCompaction::Common::PerformanceTimer;
PerformanceTimer& timer()
{
static PerformanceTimer timer;
return timer;
}
namespace CPU {
using StreamCompaction::Common::PerformanceTimer;
PerformanceTimer& timer()
{
static PerformanceTimer timer;
return timer;
}

/**
* CPU scan (prefix sum).
* For performance analysis, this is supposed to be a simple for loop.
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
timer().endCpuTimer();
}
/**
* CPU scan (prefix sum).
* For performance analysis, this is supposed to be a simple for loop.
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// Exclusive prefix sum: odata[k] holds the sum of idata[0..k-1].
odata[0] = 0;
for (int k = 1; k < n; ++k) {
odata[k] = odata[k - 1] + idata[k - 1];
}
timer().endCpuTimer();
}

/**
* CPU stream compaction without using the scan function.
*
* @returns the number of elements remaining after compaction.
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
timer().endCpuTimer();
return -1;
}
/**
* CPU stream compaction without using the scan function.
*
* @returns the number of elements remaining after compaction.
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// Compact on the CPU: copy each non-zero element to the next free slot.
int ptr = 0;
for (int i = 0; i < n; i++) {
if (idata[i] != 0) {
odata[ptr] = idata[i];
ptr++;
}
}
timer().endCpuTimer();
return ptr;
}

/**
* CPU stream compaction using scan and scatter, like the parallel version.
*
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
timer().endCpuTimer();
return -1;
}
}
/**
* CPU stream compaction using scan and scatter, like the parallel version.
*
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// CPU emulation of the GPU pipeline: map to booleans, exclusive scan, then scatter.
int count = 0;
int *tmp = new int[n];
for (int i = 0; i < n; i++) {
if (idata[i] == 0) {
tmp[i] = 0;
}
else {
tmp[i] = 1;
count++;
}
}
int *tmpScan = new int[n];
tmpScan[0] = 0;
for (int k = 1; k < n; ++k) {
tmpScan[k] = tmpScan[k - 1] + tmp[k - 1];
}
// Scatter: only kept (non-zero) elements are written to their scanned index.
for (int i = 0; i < n; i++) {
if (tmp[i] == 1) {
odata[tmpScan[i]] = idata[i];
}
}
timer().endCpuTimer();
delete[] tmp;
delete[] tmpScan;
return count;
}
}
}
104 changes: 99 additions & 5 deletions stream_compaction/efficient.cu
@@ -12,15 +12,67 @@ namespace StreamCompaction {
return timer;
}

// Up-sweep (reduce) phase of the work-efficient scan: at depth d, every
// element whose (index + 1) is a multiple of 2^(d+1) adds in the partial
// sum located 2^d positions to its left.
__global__ void kernEfficientScanUpSweep(int n, int *odata, int d) {
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) {
return;
}
int interval = 1 << (d + 1);
int halfInterval = 1 << d;
if ((index + 1) % interval == 0) {
odata[index] += odata[index - halfInterval];
}
}

// Down-sweep phase: the root is zeroed at the top level, then each active
// element passes its value down to the position 2^d to its left and adds
// that position's old value to itself, yielding an exclusive scan.
__global__ void kernEfficientScanDownSweep(int n, int *odata, int d, int topLayer) {
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) {
return;
}
if (d == topLayer && index == n - 1) {
odata[index] = 0;
}
int interval = 1 << (d + 1);
int halfInterval = 1 << d;
if ((index + 1) % interval == 0) {
int tmp = odata[index - halfInterval];
odata[index - halfInterval] = odata[index];
odata[index] += tmp;
}
}
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
timer().startGpuTimer();
// TODO
timer().endGpuTimer();
int N = 1 << ilog2ceil(n); // pad to the next power of two
int *dev_odata;
cudaMalloc((void**)&dev_odata, N * sizeof(int));
cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
if (N > n) {
// Zero-pad the tail so the scan over the padded power-of-two array stays correct.
cudaMemset(dev_odata + n, 0, (N - n) * sizeof(int));
}
timer().startGpuTimer();
dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); // launch enough threads to cover all N padded elements
int topLayer = ilog2ceil(n) - 1;
for (int d = 0; d <= topLayer; d++) {
kernEfficientScanUpSweep << <fullBlocksPerGrid, blockSize >> > (N, dev_odata, d);
}

for (int d = topLayer; d >= 0; d--) {
kernEfficientScanDownSweep << <fullBlocksPerGrid, blockSize >> > (N, dev_odata, d, topLayer);
}

timer().endGpuTimer();
cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dev_odata);
}


/**
* Performs stream compaction on idata, storing the result into odata.
* All zeroes are discarded.
@@ -31,10 +83,52 @@
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
timer().startGpuTimer();
// TODO
timer().endGpuTimer();
return -1;
int N = 1 << ilog2ceil(n); // pad to the next power of two
dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); // cover all N padded elements
int topLayer = ilog2ceil(n) - 1;
int *dev_idata, *dev_odata, *dev_bools, *dev_indices;
cudaMalloc((void**)&dev_idata, n * sizeof(int));
cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc((void**)&dev_odata, n * sizeof(int));
cudaMalloc((void**)&dev_bools, N * sizeof(int));
cudaMalloc((void**)&dev_indices, N * sizeof(int));

timer().startGpuTimer();
Common::kernMapToBoolean << <fullBlocksPerGrid, blockSize >> > (n, dev_bools, dev_idata);
if (N > n) {
// Zero-pad the boolean array so the padded scan stays correct.
cudaMemset(dev_bools + n, 0, (N - n) * sizeof(int));
}
cudaMemcpy(dev_indices, dev_bools, N * sizeof(int), cudaMemcpyDeviceToDevice);

// Count the kept elements on the host; this is also the value returned to the caller.
int countScatter = 0;
for (int i = 0; i < n; i++) {
if (idata[i] != 0) {
countScatter++;
}
}
for (int d = 0; d <= topLayer; d++) {
kernEfficientScanUpSweep << <fullBlocksPerGrid, blockSize >> > (N, dev_indices, d);
}

for (int d = topLayer; d >= 0; d--) {
kernEfficientScanDownSweep << <fullBlocksPerGrid, blockSize >> > (N, dev_indices, d, topLayer);
}

Common::kernScatter << <fullBlocksPerGrid, blockSize >> > (n, dev_odata,
dev_idata, dev_bools, dev_indices);
timer().endGpuTimer();

cudaMemcpy(odata, dev_odata, countScatter * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dev_idata);
cudaFree(dev_odata);
cudaFree(dev_bools);
cudaFree(dev_indices);
return countScatter;
}
}
}