41 changes: 36 additions & 5 deletions README.md
@@ -3,11 +3,42 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Yash Vardhan
* Tested on: Windows 10 Pro, Intel i5-4200U (4) @ 2.600GHz 4GB, NVIDIA GeForce 840M 2048MB

### (TODO: Your README)
In this project, I implemented stream compaction and scan (prefix sum) in CUDA on both the CPU and the GPU, and compared their performance side by side. Stream compaction run in parallel on the GPU is useful in many applications such as deferred shading and path tracing, where, for example, it can cull terminated rays from the ray pool after every bounce. Scan (a.k.a. prefix sum) is itself a building block for many parallel algorithms, and the implementations here are designed specifically for the GPU architecture.

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
Scan was implemented in four variants:

- CPU
- Naive Scan on GPU
- Work-efficient Scan on GPU
- Scan using Thrust on GPU (a minimal invocation sketch follows this list).
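
The Thrust-based variant does not appear elsewhere in this diff, so the snippet below is only a sketch of how it could be invoked; it assumes `dev_in` and `dev_out` are device pointers holding `n` ints, and the wrapper name `thrustScan` is made up for illustration.

```cuda
#include <thrust/device_ptr.h>
#include <thrust/scan.h>

// Exclusive scan via Thrust, matching the semantics of the other variants:
// out[0] = 0, out[i] = in[0] + ... + in[i-1].
void thrustScan(int n, int *dev_out, const int *dev_in) {
    thrust::device_ptr<const int> in(dev_in);
    thrust::device_ptr<int> out(dev_out);
    thrust::exclusive_scan(in, in + n, out);
}
```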

Performance Analysis
-------------------------

Block sizes were varied from 16 to 1024. Performance peaked around 64, 128, and 256, so the benchmarks below use a block size of 128.
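
The project reports timings through its own `PerformanceTimer`, but the block-size sweep can also be reproduced with a small standalone harness such as the sketch below; `kernTouch` is a stand-in kernel invented for illustration and is not part of this PR.

```cuda
#include <cstdio>

// Stand-in kernel; the real work is done by the scan kernels in naive.cu / efficient.cu.
__global__ void kernTouch(int n, int *data) {
    int index = threadIdx.x + (blockIdx.x * blockDim.x);
    if (index >= n) return;
    data[index] += 1;
}

int main() {
    const int n = 1 << 20;
    int *dev;
    cudaMalloc(&dev, n * sizeof(int));
    cudaMemset(dev, 0, n * sizeof(int));

    for (int block = 16; block <= 1024; block *= 2) {
        dim3 grid((n + block - 1) / block);  // ceiling division, as in the project kernels
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start);
        kernTouch<<<grid, block>>>(n, dev);
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        printf("block size %4d: %.4f ms\n", block, ms);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    cudaFree(dev);
    return 0;
}
```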

### Time (in ms) vs. number of elements in array
----------------------------------------------

![](img/scan.jpg)

![](img/Data.png)

The array size was increased in steps of 2^4. The transition from 2^16 to 2^20 elements showed a significant divergence between the Thrust and naive GPU implementations, and a gap between the work-efficient and naive GPU implementations also became visible. The work-efficient GPU scan could be optimized considerably further by using shared memory, which a future benchmark will reflect; one possible shape of that change is sketched below.
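
The following sketch is an assumption about that future direction, not code from this PR: the kernel name `kernSharedScanTile` is invented, `blockSize` is the power-of-two constant from `common.h`, and the per-tile results would still need a second pass that scans the tile totals and adds them back across blocks.

```cuda
// Work-efficient (Blelloch) exclusive scan of one blockSize-wide tile in shared memory.
__global__ void kernSharedScanTile(int n, int *odata, const int *idata) {
    __shared__ int tile[blockSize];
    int tid = threadIdx.x;
    int index = threadIdx.x + (blockIdx.x * blockDim.x);

    // Load the tile, padding past the end of the input with zeros.
    tile[tid] = (index < n) ? idata[index] : 0;
    __syncthreads();

    // Up-sweep: build partial sums in place.
    for (int stride = 1; stride < blockSize; stride *= 2) {
        int i = (tid + 1) * stride * 2 - 1;
        if (i < blockSize) {
            tile[i] += tile[i - stride];
        }
        __syncthreads();
    }

    // Down-sweep: zero the root, then push prefix sums back down.
    if (tid == 0) {
        tile[blockSize - 1] = 0;
    }
    __syncthreads();
    for (int stride = blockSize / 2; stride > 0; stride /= 2) {
        int i = (tid + 1) * stride * 2 - 1;
        if (i < blockSize) {
            int t = tile[i - stride];
            tile[i - stride] = tile[i];
            tile[i] += t;
        }
        __syncthreads();
    }

    if (index < n) {
        odata[index] = tile[tid];
    }
}
```

All of the global-memory traffic in the current up-sweep/down-sweep passes becomes shared-memory traffic inside a tile, which is where most of the expected speedup would come from.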

### Scan Results
----------------

Tested on an array of size 512.

![](img/resScan.png)

### Stream Compaction Results
-----------------------------

Tested on an array of size 512.

![](img/resCompact.png)
Binary file added img/Data.png
Binary file added img/resCompact.png
Binary file added img/resScan.png
Binary file added img/scan.jpg
2 changes: 1 addition & 1 deletion stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES

cuda_add_library(stream_compaction
${SOURCE_FILES}
OPTIONS -arch=sm_20
OPTIONS -arch=sm_50
)
11 changes: 10 additions & 1 deletion stream_compaction/common.cu
@@ -24,6 +24,10 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) return;

bools[index] = (idata[index] != 0) ? 1 : 0;
}

/**
@@ -33,7 +37,12 @@
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) return;

int i = indices[index];
if (bools[index] == 1)
odata[i] = idata[index];
}

}
}
4 changes: 3 additions & 1 deletion stream_compaction/common.h
@@ -9,6 +9,8 @@
#include <algorithm>
#include <chrono>
#include <stdexcept>

#define blockSize 128

#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
@@ -27,7 +29,7 @@ inline int ilog2(int x) {
}

inline int ilog2ceil(int x) {
return ilog2(x - 1) + 1;
return ilog2(x - 1) + 1;
}

namespace StreamCompaction {
54 changes: 43 additions & 11 deletions stream_compaction/cpu.cu
@@ -1,25 +1,31 @@
#include <cstdio>
#include "cpu.h"

#include "common.h"
#include "common.h"

namespace StreamCompaction {
namespace CPU {
using StreamCompaction::Common::PerformanceTimer;
PerformanceTimer& timer()
{
static PerformanceTimer timer;
return timer;
using StreamCompaction::Common::PerformanceTimer;
PerformanceTimer& timer()
{
static PerformanceTimer timer;
return timer;
}

/**
* CPU scan (prefix sum).
* For performance analysis, this is supposed to be a simple for loop.
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
//TODO
int sum = 0;
for (int i = 0; i < n; i++)
{
odata[i] = sum;
sum += idata[i];
}
timer().endCpuTimer();
}

@@ -31,8 +37,17 @@ namespace StreamCompaction {
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
int numel = 0;
for (int i = 0; i < n; i++)
{
if (idata[i] != 0)
{
odata[numel] = idata[i];
numel++;
}
}
timer().endCpuTimer();
return -1;
return numel;
}

/**
@@ -43,8 +58,25 @@
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
int *oscan = (int*)malloc(n * sizeof(int));
int *iscan = (int*)malloc(n * sizeof(int));
for (int i = 0; i < n; i++)
if(idata[i]==0)
iscan[i] = 0;
else
iscan[i] = 1;
scan(n, oscan, iscan);
int numel = 0;
for (int i = 0; i < n; i++)
{
if (idata[i] != 0)
{
odata[oscan[i]] = idata[i];
numel++;
}
}
timer().endCpuTimer();
return -1;
return numel;
}
}
}
110 changes: 105 additions & 5 deletions stream_compaction/efficient.cu
@@ -12,13 +12,66 @@ namespace StreamCompaction {
return timer;
}

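// Up-sweep (reduce) phase: at stride k, the last element of each 2k-wide segment accumulates the partial sum of its left half.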
__global__ void upsweep(int n, int k, int* dev)
{
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) return;

if ((index % (2 * k) == 0) && (index + (2 * k) <= n))
dev[index + (2 * k) - 1] += dev[index + k - 1];
}

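// Down-sweep phase: after the root is zeroed, each pass swaps the left child's partial sum up and pushes prefix sums down the implicit tree.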
__global__ void downsweep(int n, int k, int* dev)
{
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) return;

if ((index % (2 * k) == 0) && (index + (2 * k) <= n))
{
int tmp = dev[index + k - 1];
dev[index + k - 1] = dev[index + (2 * k) - 1];
dev[index + (2 * k) - 1] += tmp;
}
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
timer().startGpuTimer();
// TODO
timer().endGpuTimer();
void scan(int n, int *odata, const int *idata) {

int* dev;
int potn = 1 << ilog2ceil(n);

cudaMalloc((void**)&dev, potn * sizeof(int));
checkCUDAError("Malloc for input device failed\n");

cudaMemset(dev, 0, potn * sizeof(int));

cudaMemcpy(dev, idata, n * sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy for device failed\n");

dim3 fullBlocksPerGrid((potn + blockSize - 1) / blockSize);

//timer().startGpuTimer();

for (int k = 1; k < potn; k*=2)
{
upsweep <<< fullBlocksPerGrid, blockSize >>> (potn, k, dev);
}

cudaMemset(dev + potn - 1, 0, sizeof(int));

for (int k = potn/2; k>0; k/=2)
{
downsweep <<< fullBlocksPerGrid, blockSize >>> (potn, k, dev);
}

//timer().endGpuTimer();

cudaMemcpy(odata, dev, n * sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy for output data failed\n");

cudaFree(dev);
}

/**
@@ -31,10 +84,57 @@
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
int* idev;
int* odev;
cudaMalloc((void**)&idev, n * sizeof(int));
checkCUDAError("cudaMalloc idata failed!");

cudaMalloc((void**)&odev, n * sizeof(*odev));
checkCUDAError("cudaMalloc odev failed!");

cudaMemcpy(idev, idata, n * sizeof(*idata), cudaMemcpyHostToDevice);

int potn = 1 << ilog2ceil(n);
int* boolarr;

cudaMalloc((void**)&boolarr, potn * sizeof(int));
checkCUDAError("cudaMalloc bool failed!");

cudaMemset(boolarr, 0, potn * sizeof(int));

int* indices;
cudaMalloc((void**)&indices, potn * sizeof(int));
checkCUDAError("cudaMalloc indices failed!");

cudaMemcpy(indices, boolarr, n * sizeof(int), cudaMemcpyDeviceToDevice);
checkCUDAError("cudaMemcpy from to dev_bools to dev_indices failed!");

dim3 fullBlocksPerGrid((potn + blockSize - 1) / blockSize);

timer().startGpuTimer();
// TODO
StreamCompaction::Common::kernMapToBoolean <<<fullBlocksPerGrid, blockSize >>>(n, boolarr, idev);
scan(n, indices, boolarr);
StreamCompaction::Common::kernScatter <<<fullBlocksPerGrid, blockSize >>>(n, odev, idev, boolarr, indices);

timer().endGpuTimer();
return -1;

cudaMemcpy(odata, odev, n * sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy for odev failed");

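// Compacted count = last exclusive-scan value + last boolean flag (1 if the final input element was kept).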
int numbool = 0;
cudaMemcpy(&numbool, boolarr + n - 1, sizeof(int), cudaMemcpyDeviceToHost);

int numindices = 0;
cudaMemcpy(&numindices, indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);

int total = numbool + numindices;
cudaFree(indices);
cudaFree(idev);
cudaFree(odev);
cudaFree(boolarr);

return total;
}
}
}
54 changes: 53 additions & 1 deletion stream_compaction/naive.cu
@@ -11,15 +11,67 @@ namespace StreamCompaction {
static PerformanceTimer timer;
return timer;
}

// TODO: __global__
__global__ void naivescan(int n, int k, int* idev, int* odev)
{
auto index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) {return;}

if (index >= k)
odev[index] = idev[index] + idev[index - k];
else
odev[index] = idev[index];
}


__global__ void inc2exc(int n, int* idev, int* odev)
{
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) { return; }

if (index > 0)
odev[index] = idev[index - 1];
else
odev[index] = 0;

}
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {

// TODO
int* idev;
int* odev;
cudaMalloc((void**)&idev, n * sizeof(int));
checkCUDAError("Malloc for input device failed\n");

cudaMalloc((void**)&odev, n * sizeof(int));
checkCUDAError("Malloc for output device failed\n");

cudaMemcpy(idev, idata, n * sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy for input device failed\n");

dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);

timer().startGpuTimer();
// TODO

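// Ping-pong idev/odev each pass; after the loop the inclusive scan result is in idev, and inc2exc then shifts it to an exclusive scan.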
for (int k = 1; k < n; k<<=1)
{
naivescan <<< fullBlocksPerGrid, blockSize >>> (n, k, idev, odev);
std::swap(idev, odev);
}

inc2exc <<< fullBlocksPerGrid, blockSize >>> (n, idev, odev);

timer().endGpuTimer();

cudaMemcpy(odata, odev, n * sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy for output device failed\n");

cudaFree(odev);
cudaFree(idev);
}
}
}