diff --git a/README.md b/README.md
index b71c458..8bbc3fa 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,105 @@ CUDA Stream Compaction
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Yuxin Hu
+* Tested on: Windows 10, i7-6700HQ @ 2.60GHz 8GB, GTX 960M 4096MB (Personal Laptop)
-### (TODO: Your README)
+### README
+##### Project Description
+This project implements inclusive and exclusive scan and uses them for stream compaction, computed in parallel on the GPU. Stream compaction is useful in many applications, such as terminating rays that no longer contribute during path tracing.
+
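+For example, given the input array `[1 5 0 1 2 0 3]`:
+* inclusive scan: `[1 6 6 7 9 9 12]` (each output includes the current element)
+* exclusive scan: `[0 1 6 6 7 9 9]` (each output is the sum of all preceding elements)
+* stream compaction (removing zeros): `[1 5 1 2 3]`
+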
+##### Features Implemented
+1. Exclusive Scan
+ * CPU version using a single for loop
+ * GPU version using a naive parallel scan
+ * GPU version using a work-efficient parallel scan, with an up-sweep and a down-sweep pass
+ * Thrust version using the built-in `thrust::exclusive_scan` function
+
+2. Stream Compaction
+ * CPU version using a single for loop, without exclusive scan
+ * CPU version using exclusive scan. A first loop maps the input to a boolean array: 1 if the element should be kept, 0 otherwise. The exclusive scan written above is then run on the boolean array to produce each kept element's output index. A final loop scatters the kept elements to those indices (see the sketch after this list).
+ * GPU version with work-efficient scan. The idea mirrors the CPU version, except that the boolean map, the exclusive scan, and the final scatter all run in parallel on the GPU.
+
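+The scan-then-scatter pipeline described above, as a minimal CPU sketch (illustrative only; `compactWithScanSketch` is a hypothetical name, not part of the project code, which lives in `stream_compaction/`):
+
+```cpp
+#include <vector>
+
+// Compact idata (drop zeros) via map -> exclusive scan -> scatter.
+int compactWithScanSketch(int n, int *odata, const int *idata) {
+    if (n <= 0) return 0;
+    std::vector<int> bools(n), indices(n);
+    for (int i = 0; i < n; i++) bools[i] = (idata[i] != 0); // map to booleans
+    indices[0] = 0;                                         // exclusive scan of bools
+    for (int i = 1; i < n; i++) indices[i] = indices[i - 1] + bools[i - 1];
+    for (int i = 0; i < n; i++)                             // scatter kept elements
+        if (bools[i]) odata[indices[i]] = idata[i];
+    return indices[n - 1] + bools[n - 1];                   // compacted length
+}
+```
+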
+##### Performance Analysis
+1. Block size optimization: roughly optimize the block size of each implementation for minimal runtime on the GPU.
+
+
+![BlockSize Versus Efficiency](img/BlocksizeAndEfficiency.PNG)
+
+There is little performance change across block sizes, so I set the block size to 512 for all remaining measurements.
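+
+Block size is set per file with a compile-time constant, and the grid is sized to cover the input, one thread per element:
+
+```cpp
+#define blockSize 512
+// round up to a whole number of blocks
+dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+```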
+
+2. Scan performance comparison as array size increases
+
+![Exclusive Scan Performance Analysis with Increasing Array Size](img/ScanPerformanceAnalysis.PNG)
+
++ The graph shows the performance ranking: thrust > CPU > GPU naive scan > GPU work-efficient scan. In my implementation, both GPU scans are slower than the CPU scan.
++ The GPU naive scan runs log2(n) levels with n threads each, for n*log2(n) thread launches in total, far more work than the n-1 additions the CPU performs. Moreover, the naive scan produces an inclusive result, so a right shift (another n threads) is needed to make it exclusive. Although these threads run in parallel, the thread-scheduling overhead and global-memory read latency erase the advantage over the CPU version.
++ The GPU work-efficient scan performs an up sweep and a down sweep, resulting in 2*log2(n) levels of kernel calls. Each level launches n threads, about 2*n*log2(n) in total, twice as many as the naive scan. In fact, most threads in each level do no work because their index fails the index % 2^(level+1) == 0 test. Each streaming multiprocessor executes only a small number of warps at a time, so the threads that do meet the criterion still wait for the SM to cycle through warps in which no thread has work. See the worked numbers after this list.
++ Thrust performs best of all the scan methods.
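+
+For concreteness, at the tested size n = 2^15 = 32768: the naive scan launches log2(n) = 15 levels of n threads (491,520 thread launches, plus 32,768 more for the right shift), and the work-efficient scan launches twice as many levels, while the CPU loop performs only n - 1 = 32,767 additions in sequence. At this array size the kernel-launch and memory-latency overhead dominates, which is consistent with the timings below.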
+
+
+##### Program Output at array SIZE = 2^15
+```
+****************
+** SCAN TESTS **
+****************
+ [ 44 14 14 1 0 15 26 2 38 20 24 46 10 ... 17 0 ]
+==== cpu scan, power-of-two ====
+ elapsed time: 0.061234ms (std::chrono Measured)
+ [ 0 44 58 72 73 73 88 114 116 154 174 198 244 ... 801091 801108 ]
+==== cpu scan, non-power-of-two ====
+ elapsed time: 0.06084ms (std::chrono Measured)
+ [ 0 44 58 72 73 73 88 114 116 154 174 198 244 ... 800978 801016 ]
+ passed
+==== naive scan, power-of-two ====
+ elapsed time: 0.090464ms (CUDA Measured)
+ [ 0 44 58 72 73 73 88 114 116 154 174 198 244 ... 801091 801108 ]
+ passed
+==== naive scan, non-power-of-two ====
+ elapsed time: 0.101856ms (CUDA Measured)
+ [ 0 44 58 72 73 73 88 114 116 154 174 198 244 ... 0 0 ]
+ passed
+==== work-efficient scan, power-of-two ====
+ elapsed time: 0.222976ms (CUDA Measured)
+ [ 0 44 58 72 73 73 88 114 116 154 174 198 244 ... 801091 801108 ]
+ passed
+==== work-efficient scan, non-power-of-two ====
+ elapsed time: 0.227488ms (CUDA Measured)
+ [ 0 44 58 72 73 73 88 114 116 154 174 198 244 ... 800978 801016 ]
+ passed
+==== thrust scan, power-of-two ====
+ elapsed time: 0.001184ms (CUDA Measured)
+ [ 0 44 58 72 73 73 88 114 116 154 174 198 244 ... 801091 801108 ]
+ passed
+==== thrust scan, non-power-of-two ====
+ elapsed time: 0.001152ms (CUDA Measured)
+ [ 0 44 58 72 73 73 88 114 116 154 174 198 244 ... 800978 801016 ]
+ passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 2 2 0 3 0 3 2 2 0 0 2 0 2 ... 3 0 ]
+==== cpu compact without scan, power-of-two ====
+ elapsed time: 0.12721ms (std::chrono Measured)
+ [ 2 2 3 3 2 2 2 2 1 1 2 2 1 ... 2 3 ]
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ elapsed time: 0.136691ms (std::chrono Measured)
+ [ 2 2 3 3 2 2 2 2 1 1 2 2 1 ... 2 3 ]
+ passed
+==== cpu compact with scan ====
+ elapsed time: 0.343704ms (std::chrono Measured)
+ [ 2 2 3 3 2 2 2 2 1 1 2 2 1 ... 2 3 ]
+ passed
+==== work-efficient compact, power-of-two ====
+ elapsed time: 0.403328ms (CUDA Measured)
+ [ 2 2 3 3 2 2 2 2 1 1 2 2 1 ... 2 3 ]
+ passed
+==== work-efficient compact, non-power-of-two ====
+ elapsed time: 0.392096ms (CUDA Measured)
+ [ 2 2 3 3 2 2 2 2 1 1 2 2 1 ... 2 3 ]
+ passed
+Press any key to continue . . .
+```
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
diff --git a/img/BlocksizeAndEfficiency.PNG b/img/BlocksizeAndEfficiency.PNG
new file mode 100644
index 0000000..d22b8bd
Binary files /dev/null and b/img/BlocksizeAndEfficiency.PNG differ
diff --git a/img/ScanPerformanceAnalysis.PNG b/img/ScanPerformanceAnalysis.PNG
new file mode 100644
index 0000000..4ed9e6d
Binary files /dev/null and b/img/ScanPerformanceAnalysis.PNG differ
diff --git a/src/main.cpp b/src/main.cpp
index 7305641..80609c8 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 15; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int a[SIZE], b[SIZE], c[SIZE];
@@ -49,42 +49,42 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
+ printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);
zeroArray(SIZE, c);
printDesc("naive scan, non-power-of-two");
StreamCompaction::Naive::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
+ printArray(SIZE, c, true);
printCmpResult(NPOT, b, c);
zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
+ printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);
zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(NPOT, c, true);
+ printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
zeroArray(SIZE, c);
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
+ printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);
zeroArray(SIZE, c);
printDesc("thrust scan, non-power-of-two");
StreamCompaction::Thrust::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(NPOT, c, true);
+ printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
printf("\n");
@@ -129,14 +129,14 @@ int main(int argc, char* argv[]) {
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(count, c, true);
+ printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);
zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(count, c, true);
+ printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);
system("pause"); // stop Win32 console from closing on exit
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..c8709e7 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
cuda_add_library(stream_compaction
${SOURCE_FILES}
- OPTIONS -arch=sm_20
+ OPTIONS -arch=sm_50
)
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 8fc0211..690fd14 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -24,6 +24,15 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index < n) {
+ if (idata[index] == 0) {
+ bools[index] = 0;
+ }
+ else {
+ bools[index] = 1;
+ }
+ }
}
/**
@@ -33,6 +42,12 @@ namespace StreamCompaction {
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index < n) {
+ if (bools[index] == 1) {
+ odata[indices[index]] = idata[index];
+ }
+ }
}
}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 55f1b38..8014d1a 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -1,14 +1,14 @@
#pragma once
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-#include <cstdio>
-#include <cstring>
-#include <cmath>
-#include <algorithm>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include <algorithm>
#include <chrono>
-#include <stdexcept>
+#include <stdexcept>
#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
@@ -37,96 +37,96 @@ namespace StreamCompaction {
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices);
- /**
- * This class is used for timing the performance
- * Uncopyable and unmovable
- *
- * Adapted from WindyDarian(https://github.com/WindyDarian)
- */
- class PerformanceTimer
- {
- public:
- PerformanceTimer()
- {
- cudaEventCreate(&event_start);
- cudaEventCreate(&event_end);
- }
-
- ~PerformanceTimer()
- {
- cudaEventDestroy(event_start);
- cudaEventDestroy(event_end);
- }
-
- void startCpuTimer()
- {
- if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
- cpu_timer_started = true;
-
- time_start_cpu = std::chrono::high_resolution_clock::now();
- }
-
- void endCpuTimer()
- {
- time_end_cpu = std::chrono::high_resolution_clock::now();
-
- if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
-
- std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
- prev_elapsed_time_cpu_milliseconds =
- static_cast<float>(duro.count());
-
- cpu_timer_started = false;
- }
-
- void startGpuTimer()
- {
- if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
- gpu_timer_started = true;
-
- cudaEventRecord(event_start);
- }
-
- void endGpuTimer()
- {
- cudaEventRecord(event_end);
- cudaEventSynchronize(event_end);
-
- if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
-
- cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
- gpu_timer_started = false;
- }
-
- float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015
- {
- return prev_elapsed_time_cpu_milliseconds;
- }
-
- float getGpuElapsedTimeForPreviousOperation() //noexcept
- {
- return prev_elapsed_time_gpu_milliseconds;
- }
-
- // remove copy and move functions
- PerformanceTimer(const PerformanceTimer&) = delete;
- PerformanceTimer(PerformanceTimer&&) = delete;
- PerformanceTimer& operator=(const PerformanceTimer&) = delete;
- PerformanceTimer& operator=(PerformanceTimer&&) = delete;
-
- private:
- cudaEvent_t event_start = nullptr;
- cudaEvent_t event_end = nullptr;
-
- using time_point_t = std::chrono::high_resolution_clock::time_point;
- time_point_t time_start_cpu;
- time_point_t time_end_cpu;
-
- bool cpu_timer_started = false;
- bool gpu_timer_started = false;
-
- float prev_elapsed_time_cpu_milliseconds = 0.f;
- float prev_elapsed_time_gpu_milliseconds = 0.f;
+ /**
+ * This class is used for timing the performance
+ * Uncopyable and unmovable
+ *
+ * Adapted from WindyDarian(https://github.com/WindyDarian)
+ */
+ class PerformanceTimer
+ {
+ public:
+ PerformanceTimer()
+ {
+ cudaEventCreate(&event_start);
+ cudaEventCreate(&event_end);
+ }
+
+ ~PerformanceTimer()
+ {
+ cudaEventDestroy(event_start);
+ cudaEventDestroy(event_end);
+ }
+
+ void startCpuTimer()
+ {
+ if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
+ cpu_timer_started = true;
+
+ time_start_cpu = std::chrono::high_resolution_clock::now();
+ }
+
+ void endCpuTimer()
+ {
+ time_end_cpu = std::chrono::high_resolution_clock::now();
+
+ if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
+
+ std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
+ prev_elapsed_time_cpu_milliseconds =
+ static_cast<float>(duro.count());
+
+ cpu_timer_started = false;
+ }
+
+ void startGpuTimer()
+ {
+ if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
+ gpu_timer_started = true;
+
+ cudaEventRecord(event_start);
+ }
+
+ void endGpuTimer()
+ {
+ cudaEventRecord(event_end);
+ cudaEventSynchronize(event_end);
+
+ if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
+
+ cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
+ gpu_timer_started = false;
+ }
+
+ float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015
+ {
+ return prev_elapsed_time_cpu_milliseconds;
+ }
+
+ float getGpuElapsedTimeForPreviousOperation() //noexcept
+ {
+ return prev_elapsed_time_gpu_milliseconds;
+ }
+
+ // remove copy and move functions
+ PerformanceTimer(const PerformanceTimer&) = delete;
+ PerformanceTimer(PerformanceTimer&&) = delete;
+ PerformanceTimer& operator=(const PerformanceTimer&) = delete;
+ PerformanceTimer& operator=(PerformanceTimer&&) = delete;
+
+ private:
+ cudaEvent_t event_start = nullptr;
+ cudaEvent_t event_end = nullptr;
+
+ using time_point_t = std::chrono::high_resolution_clock::time_point;
+ time_point_t time_start_cpu;
+ time_point_t time_end_cpu;
+
+ bool cpu_timer_started = false;
+ bool gpu_timer_started = false;
+
+ float prev_elapsed_time_cpu_milliseconds = 0.f;
+ float prev_elapsed_time_gpu_milliseconds = 0.f;
};
}
}
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 05ce667..7681e8d 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,28 +1,39 @@
#include <cstdio>
#include "cpu.h"
-#include "common.h"
+#include "common.h"
namespace StreamCompaction {
namespace CPU {
- using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
- {
- static PerformanceTimer timer;
- return timer;
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
}
+ // Exclusive prefix sum: odata[i] holds the sum of idata[0..i-1]
+ void scan_implement(int n, int *odata, const int *idata) {
+ odata[0] = 0;
+ for (int i = 1; i < n; i++) {
+ odata[i] = odata[i - 1] + idata[i - 1];
+ }
+ }
+
/**
* CPU scan (prefix sum).
+ * Exclusive prefix sum
* For performance analysis, this is supposed to be a simple for loop.
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
+ scan_implement(n, odata, idata);
timer().endCpuTimer();
}
+
+
/**
* CPU stream compaction without using the scan function.
*
@@ -31,8 +42,15 @@ namespace StreamCompaction {
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
+ int count = 0;
+ for (int i = 0; i < n; i++) {
+ if (idata[i] != 0) {
+ odata[count] = idata[i];
+ count++;
+ }
+ }
timer().endCpuTimer();
- return -1;
+ return count;
}
/**
@@ -43,8 +61,26 @@ namespace StreamCompaction {
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
+ int* tempdata = new int[n];
+ for (int i = 0; i < n; i++) {
+ if (idata[i] != 0) {
+ tempdata[i] = 1;
+ }
+ else {
+ tempdata[i] = 0;
+ }
+ }
+ int* tempIndexdata = new int[n];
+ scan_implement(n, tempIndexdata, tempdata);
+ int count = 0;
+ for (int i = 0; i < n; i++) {
+ if (tempdata[i]) {
+ odata[tempIndexdata[i]] = idata[i];
+ count++;
+ }
+ }
 timer().endCpuTimer();
- return -1;
+ delete[] tempdata;
+ delete[] tempIndexdata;
+ return count;
}
}
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 36c5ef2..36222b3 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,22 +3,100 @@
#include "common.h"
#include "efficient.h"
+#define blockSize 512
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
+
namespace StreamCompaction {
namespace Efficient {
- using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
- {
- static PerformanceTimer timer;
- return timer;
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
}
+ __global__ void kernEfficientUpsweep(int pow2plus1, int pow2, int N, int* idata) {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index < N) {
+ if (index % pow2plus1 == 0) {
+ idata[index + pow2plus1 - 1] += idata[index + pow2 - 1];
+ }
+ }
+ }
+
+ __global__ void kernEfficientDownsweep(int pow2plus1, int pow2, int N, int* idata) {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index < N) {
+ if (index % pow2plus1 == 0) {
+ int t = idata[index + pow2 - 1];
+ idata[index + pow2 - 1] = idata[index + pow2plus1 - 1];
+ idata[index + pow2plus1 - 1] += t;
+ }
+ }
+ }
+
+ __global__ void kernNonPowerTwoHelper(int N, int zeroStartIndex, int zeroEndIndex, int* idata) {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index < N) {
+ if (index >= zeroStartIndex && index < zeroEndIndex) {
+ idata[index] = 0;
+ }
+ }
+ }
+
+ __global__ void kernChangeElem(int *arr, int idx, int val) {
+ arr[idx] = val;
+ }
+
+ void gpuEfficientScan(int originSize, int npoweroftwo, int* dev_tempin) {
+ dim3 fullBlocksPerGrid((npoweroftwo + blockSize - 1) / blockSize);
+ //Helper kernal function to set extra elements to 0 if the size is not a power of 2.
+ if (originSize != npoweroftwo) {
+ kernNonPowerTwoHelper<<<fullBlocksPerGrid, blockSize>>>(npoweroftwo, originSize, npoweroftwo, dev_tempin);
+ }
+
+ for (int iteration = 0; iteration <= ilog2ceil(npoweroftwo) - 1; iteration++) {
+ kernEfficientUpsweep<<<fullBlocksPerGrid, blockSize>>>(pow(2, iteration + 1),
+ pow(2, iteration), npoweroftwo, dev_tempin);
+ }
+
+ /*cudaMemcpy(odata, dev_tempin, npoweroftwo * sizeof(int), cudaMemcpyDeviceToHost);
+ odata[npoweroftwo - 1] = 0;
+ cudaMemcpy(dev_tempin, odata, npoweroftwo * sizeof(int), cudaMemcpyHostToDevice);*/
+
+ kernChangeElem<<<1, 1>>>(dev_tempin, npoweroftwo - 1, 0);
+
+ for (int iteration = ilog2ceil(npoweroftwo) - 1; iteration >= 0; iteration--) {
+ kernEfficientDownsweep<<<fullBlocksPerGrid, blockSize>>>(pow(2, iteration + 1),
+ pow(2, iteration), npoweroftwo, dev_tempin);
+ }
+ }
+
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
- // TODO
- timer().endGpuTimer();
+
+ int *dev_tempin;
+ //check if n is power of 2, if not, needs to add extra zeros in order to make n a power of 2
+ bool isPowerOfTwo = (n != 0) && ((n & (n - 1)) == 0);
+ int npoweroftwo = n;
+ if (!isPowerOfTwo) {
+ npoweroftwo = pow(2, ilog2ceil(n));
+ }
+
+ cudaMalloc((void**)&dev_tempin, npoweroftwo * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_tempin failed!");
+ cudaMemcpy(dev_tempin, idata, n * sizeof(int), cudaMemcpyHostToDevice); //copy only the n valid elements; the padded tail is zeroed on the device
+ checkCUDAErrorWithLine("cuda Memcpy from idata to dev_tempin failed!");
+
+ timer().startGpuTimer();
+ gpuEfficientScan(n, npoweroftwo, dev_tempin);
+ timer().endGpuTimer();
+
+ cudaMemcpy(odata, dev_tempin, n * sizeof(int), cudaMemcpyDeviceToHost); //only the first n results are meaningful
+ checkCUDAErrorWithLine("cuda Memcpy from dev_tempin to odata failed!");
+ cudaFree(dev_tempin);
}
/**
@@ -31,10 +109,75 @@ namespace StreamCompaction {
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
+
// TODO
+ bool isPowerOfTwo = (n != 0) && ((n & (n - 1)) == 0);
+ int npoweroftwo = n;
+ if (!isPowerOfTwo) {
+ npoweroftwo = pow(2, ilog2ceil(n));
+ }
+
+ dim3 fullBlocksPerGrid((npoweroftwo + blockSize - 1) / blockSize);
+
+ //Initialize and allocate CUDA device arrays
+ int *dev_bool;
+ int *dev_idata;
+ int *dev_odata;
+ int *dev_indices;
+ cudaMalloc((void**)&dev_bool, npoweroftwo * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_bool failed!");
+ cudaMalloc((void**)&dev_idata, npoweroftwo * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_idata failed!");
+ cudaMalloc((void**)&dev_odata, npoweroftwo * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_odata failed!");
+ cudaMalloc((void**)&dev_indices, npoweroftwo * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_indices failed!");
+
+ //Copy input to device; zero the padded tail first so the boolean map sees zeros there
+ cudaMemset(dev_idata, 0, npoweroftwo * sizeof(int));
+ cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAErrorWithLine("cuda Memcpy from idata to dev_idata failed!");
+
+ //Perform a basic check in kernal, mark 1 for numbers not equal to 0, 0 for numbers equal to 0
+ timer().startGpuTimer();
+ StreamCompaction::Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(npoweroftwo, dev_bool, dev_idata);
+ cudaDeviceSynchronize();
+
+ //Copy result from device array dev_bool to device array dev_indices for excluse scan.
+ cudaMemcpy(dev_indices, dev_bool, npoweroftwo * sizeof(int), cudaMemcpyDeviceToDevice);
+
+ //Perform an exclusive sum scan on ibools to get the final indices array
+ gpuEfficientScan(n, npoweroftwo, dev_indices);
+
+ //perform the final scatter step to store the result in dev_odata
+ StreamCompaction::Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, dev_odata, dev_idata, dev_bool, dev_indices);
timer().endGpuTimer();
- return -1;
+
+ //copy result from dev_odata to odata.
+ cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAErrorWithLine("cuda Memcpy from dev_odata to odata failed!");
+
+ //The compacted length is the last scanned index plus the last boolean:
+ //odata past the compacted region is uninitialized, so counting its non-zeros is unreliable.
+ int lastBool = 0;
+ int lastIndex = 0;
+ cudaMemcpy(&lastBool, dev_bool + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ cudaMemcpy(&lastIndex, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+ cudaFree(dev_bool);
+ cudaFree(dev_idata);
+ cudaFree(dev_odata);
+ cudaFree(dev_indices);
+
+ return lastBool + lastIndex;
}
}
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 9218f8e..31015e9 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -1,25 +1,94 @@
+#define GLM_FORCE_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#include "common.h"
#include "naive.h"
+#define blockSize 512
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
namespace StreamCompaction {
namespace Naive {
- using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
- {
- static PerformanceTimer timer;
- return timer;
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
}
- // TODO: __global__
+ // TODO: __global__
+ __global__ void kernNaiveScan(int pow2minus1, int N, int* odata, const int* idata) {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index < N) {
+ if (index >= pow2minus1) {
+ int scanIndexHelper = index - pow2minus1;
+ odata[index] = idata[scanIndexHelper] + idata[index];
+ }
+ else {
+ odata[index] = idata[index];
+ }
+ }
+ }
+
+ __global__ void kernRightShift(int N, int* odata, int* idata)
+ {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index == 0) {
+ odata[index] = 0;
+ odata[index + 1] = idata[index];
+ }else if (index < N-1) {
+ odata[index + 1] = idata[index];
+ }
+ }
+
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
+
+ dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+ bool swap = false; //ping-pong flag: when true, the next kernel writes into dev_tempin; when false, into dev_tempout
+
// TODO
+ int *dev_tempin;
+ int *dev_tempout;
+ cudaMalloc((void**)&dev_tempin, n * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_tempin failed!");
+ cudaMalloc((void**)&dev_tempout, n * sizeof(int));
+ checkCUDAErrorWithLine("cudaMalloc dev_tempout failed!");
+ cudaMemcpy(dev_tempin, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAErrorWithLine("cuda Memcpy from idata to dev_tempin failed!");
+
+ timer().startGpuTimer();
+ for (int iteration = 1; iteration <= ilog2ceil(n); iteration++) {
+ if (swap) {
+ kernNaiveScan<<<fullBlocksPerGrid, blockSize>>>(pow(2, iteration - 1), n, dev_tempin, dev_tempout);
+ }
+ else {
+ kernNaiveScan<<<fullBlocksPerGrid, blockSize>>>(pow(2, iteration - 1), n, dev_tempout, dev_tempin);
+ }
+ swap = !swap;
+ }
+ swap = !swap; //point back at the buffer holding the latest result
+ if (swap) {
+ kernRightShift<<<fullBlocksPerGrid, blockSize>>>(n, dev_tempout, dev_tempin);
+ }
+ else {
+ kernRightShift<<<fullBlocksPerGrid, blockSize>>>(n, dev_tempin, dev_tempout);
+ }
timer().endGpuTimer();
+
+
+ if (swap) {
+ cudaMemcpy(odata, dev_tempout, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAErrorWithLine("cuda Memcpy from dev_tempout to odata failed!");
+ }
+ else {
+ cudaMemcpy(odata, dev_tempin, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAErrorWithLine("cuda Memcpy from dev_tempin to odata failed!");
+ }
+ cudaFree(dev_tempin);
+ cudaFree(dev_tempout);
}
}
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 36b732d..63c5d43 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -8,20 +8,24 @@
namespace StreamCompaction {
namespace Thrust {
- using StreamCompaction::Common::PerformanceTimer;
- PerformanceTimer& timer()
- {
- static PerformanceTimer timer;
- return timer;
+ using StreamCompaction::Common::PerformanceTimer;
+ PerformanceTimer& timer()
+ {
+ static PerformanceTimer timer;
+ return timer;
}
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+ //thrust::device_vector<int> in(idata, idata + n);
+ //thrust::device_vector<int> out(odata, odata + n);
timer().startGpuTimer();
// TODO use `thrust::exclusive_scan`
// example: for device_vectors dv_in and dv_out:
// thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+ thrust::exclusive_scan(idata, idata + n, odata);
+ //thrust::exclusive_scan(in.begin(), in.end(), out.begin());
timer().endGpuTimer();
}
}