diff --git a/README.md b/README.md
index b71c458..e37c172 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,27 @@
-CUDA Stream Compaction
-======================
+# University of Pennsylvania, CIS 565: GPU Programming and Architecture
+## Project 2 - Stream Compaction
+* Liang Peng
+* Tested on: Windows 10, i7-6700HQ @ 2.6GHz, 8GB, GTX 960M (Personal Computer)
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
+## Screenshots
+* Result
+
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-
-### (TODO: Your README)
-
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+## Analysis
+* Time measurement with std::chrono
+
+        high_resolution_clock::time_point t1 = high_resolution_clock::now();
+        kernel<<<..., ...>>>(...);
+        cudaDeviceSynchronize();
+        high_resolution_clock::time_point t2 = high_resolution_clock::now();
+        duration<double, std::milli> t = t2 - t1;
+        std::cout << t.count() << " ms" << std::endl;
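+
+  Note that kernel launches are asynchronous, so `cudaDeviceSynchronize()` must run before `t2` is taken; otherwise only the launch overhead would be measured.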
+
+* Array size
+
+ * _Observation_ The CPU implementation is consistently faster than the GPU implementation; the likely reason is the considerable overhead in the GPU version. My speculation is that as the scan proceeds to the next level, the number of idle threads grows. This could be optimized by launching only the active threads for each level, which requires recomputing the indices those threads use to access the array (see the sketch below).
+
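+ * A minimal sketch of that idea for the up-sweep, assuming `n` is a power of two (`upSweepCompactThreads`, `active`, and `blocks` are hypothetical names, not code from this project):
+
+```cuda
+__global__ void upSweepCompactThreads(int n, int step, int *data) {
+    // One thread per addition at this level instead of one per array element.
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    int active = n / (2 * step);          // number of additions at this level
+    if (tid >= active) {
+        return;
+    }
+    // Recompute which element this thread owns: the right child of the tid-th pair.
+    int index = (tid + 1) * 2 * step - 1;
+    data[index] += data[index - step];
+}
+
+// Host side (sketch): launch only as many threads as there is work at each level.
+// for (int step = 1; step < n; step <<= 1) {
+//     int active = n / (2 * step);
+//     dim3 blocks((active - 1) / BLOCK_SIZE + 1);
+//     upSweepCompactThreads<<<blocks, BLOCK_SIZE>>>(n, step, dev_data);
+// }
+```
+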
+* Block size
+
+ * _Observation_ As the block size increases, the time to perform the scan decreases and eventually stabilizes. The reasoning is the same as in the analysis for the last project.
diff --git a/img/Capture1.PNG b/img/Capture1.PNG
new file mode 100644
index 0000000..88ec082
Binary files /dev/null and b/img/Capture1.PNG differ
diff --git a/img/Capture2.PNG b/img/Capture2.PNG
new file mode 100644
index 0000000..1bb6135
Binary files /dev/null and b/img/Capture2.PNG differ
diff --git a/img/Capture3.PNG b/img/Capture3.PNG
new file mode 100644
index 0000000..497edb3
Binary files /dev/null and b/img/Capture3.PNG differ
diff --git a/src/main.cpp b/src/main.cpp
index 675da35..accd8f6 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,9 +14,14 @@
#include "testing_helpers.hpp"
int main(int argc, char* argv[]) {
- const int SIZE = 1 << 8;
+ const int SIZE = 1 << 23;
const int NPOT = SIZE - 3;
- int a[SIZE], b[SIZE], c[SIZE];
+ int *a, *b, *c;
+
+ // allocate memory for test data
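+    // (heap allocation: three arrays of (1 << 23) ints would overflow the stack)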
+ a = new int [SIZE];
+ b = new int [SIZE];
+ c = new int [SIZE];
// Scan tests
@@ -120,4 +125,9 @@ int main(int argc, char* argv[]) {
count = StreamCompaction::Efficient::compact(NPOT, c, a);
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);
+
+ // free memory
+ delete [] a;
+ delete [] b;
+ delete [] c;
}
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..c8709e7 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
cuda_add_library(stream_compaction
${SOURCE_FILES}
- OPTIONS -arch=sm_20
+ OPTIONS -arch=sm_50
)
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index fe872d4..56143b1 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -24,6 +24,13 @@ namespace Common {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if (index >= n) {
+ return;
+ }
+
+ bools[index] = idata[index] == 0 ? 0 : 1;
}
/**
@@ -33,6 +40,13 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if (index >= n || bools[index] == 0) {
+ return;
+ }
+
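+    // indices[index] (the exclusive scan of the boolean flags) is this
+    // element's position in the compacted output.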
+ odata[indices[index]] = idata[index];
}
}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 4f52663..abfb918 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -26,6 +26,7 @@ inline int ilog2ceil(int x) {
namespace StreamCompaction {
+ const int BLOCK_SIZE = 256;
namespace Common {
__global__ void kernMapToBoolean(int n, int *bools, const int *idata);
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e600c29..04a08b6 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,5 +1,10 @@
#include <cstdio>
+#include <chrono>
+using namespace std::chrono;
#include "cpu.h"
+#include <iostream>
+using std::cout;
+using std::endl;
namespace StreamCompaction {
namespace CPU {
@@ -9,7 +14,15 @@ namespace CPU {
*/
void scan(int n, int *odata, const int *idata) {
// TODO
- printf("TODO\n");
+ //printf("TODO\n");
+ high_resolution_clock::time_point t1 = high_resolution_clock::now();
+ odata[0] = 0;
+ for (int i = 1; i < n; ++i) {
+ odata[i] = odata[i - 1] + idata[i - 1];
+ }
+ high_resolution_clock::time_point t2 = high_resolution_clock::now();
+ duration<double, std::milli> t12 = duration_cast<duration<double, std::milli>>(t2 - t1);
+ cout << "----------Time consumed: " << t12.count() << " ms----------" << endl;
}
/**
@@ -19,7 +32,14 @@ void scan(int n, int *odata, const int *idata) {
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
// TODO
- return -1;
+ int m = 0;
+
+ for (int i = 0; i < n; ++i) {
+ if (idata[i] == 0) continue;
+ odata[m++] = idata[i];
+ }
+
+ return m;
}
/**
@@ -29,7 +49,24 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
*/
int compactWithScan(int n, int *odata, const int *idata) {
// TODO
- return -1;
+ int *nonZero = new int [n];
+ int m = 0;
+
+ for (int i = 0; i < n; ++i) {
+ nonZero[i] = idata[i] == 0 ? 0 : 1;
+ }
+
+ scan(n, odata, nonZero);
+ m = odata[n - 1] + nonZero[n - 1]; // the exclusive scan excludes the last element's flag
+
+ for (int i = 0; i < n; ++i) {
+ if (nonZero[i] == 0) continue;
+ odata[odata[i]] = idata[i];
+ }
+
+ delete [] nonZero;
+
+ return m;
}
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index b2f739b..7271860 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -1,5 +1,10 @@
#include <cuda.h>
#include <cuda_runtime.h>
+#include <chrono>
+using namespace std::chrono;
+#include <iostream>
+using std::cout;
+using std::endl;
#include "common.h"
#include "efficient.h"
@@ -7,13 +12,95 @@ namespace StreamCompaction {
namespace Efficient {
// TODO: __global__
+__global__ void upSweep(const int n, const int step, int *data) {
+
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if (index >= n) {
+ return;
+ }
+
+ int rIndex = n - 1 - index;
+ int mask = 1;
+
+ for (int i = 1; i != step; (i <<= 1), (mask = mask << 1 | 1));
+
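+    // After the loop, mask == 2 * step - 1, so (rIndex & mask) == 0 keeps only
+    // the threads whose element is a "right child" at this level, counted from
+    // the right end of the array.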
+ if (index - step >= 0 && (rIndex & mask) == 0) {
+ data[index] = data[index] + data[index - step];
+ }
+}
+
+__global__ void downSweep(const int n, const int step, int *data) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if (index >= n) {
+ return;
+ }
+
+ int rIndex = n - 1 - index;
+ int mask = 1;
+
+ for (int i = 1; i != step; (i <<= 1), (mask = mask << 1 | 1));
+
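+    // Same thread selection as in upSweep: data[index] accumulates its left
+    // partner while data[index - step] receives data[index]'s old value.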
+ if (index - step >= 0 && (rIndex & mask) == 0) {
+ auto tmp = data[index];
+ data[index] += data[index - step];
+ data[index - step] = tmp;
+ }
+}
+
+void scanOnGPU(const int n, int *dev_data) {
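+    // Active indices are aligned from the right end of the array (see the
+    // rIndex/mask test in the kernels), so n need not be a power of two.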
+ dim3 blockCount = (n - 1) / BLOCK_SIZE + 1;
+ int step;
+
+ // up-sweep
+ for (step = 1; step < n; step <<= 1) {
+        upSweep<<<blockCount, BLOCK_SIZE>>>(n, step, dev_data);
+ }
+
+ // set last element to 0
+ cudaMemset(&dev_data[n - 1], 0, sizeof(int));
+
+ // down-sweep
+ for (step >>= 1; step > 0; step >>= 1) {
+        downSweep<<<blockCount, BLOCK_SIZE>>>(n, step, dev_data);
+ }
+}
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
// TODO
- printf("TODO\n");
+ // printf("TODO\n");
+ int *dev_data;
+
+ // device memory allocation
+ cudaMalloc((void**)&dev_data, sizeof(int) * n);
+ checkCUDAError("Failed to allocate dev_data");
+
+ // copy input data to device
+ cudaMemcpy((void*)dev_data, (const void*)idata, sizeof(int) * n,
+ cudaMemcpyHostToDevice);
+
+ // tic
+ high_resolution_clock::time_point t1 = high_resolution_clock::now();
+
+ // do scan
+ scanOnGPU(n, dev_data);
+
+ // toc
+ cudaDeviceSynchronize();
+ high_resolution_clock::time_point t2 = high_resolution_clock::now();
+    duration<double, std::milli> t12 = duration_cast<duration<double, std::milli>>(t2 - t1);
+ cout << "----------Time consumed: " << t12.count() << " ms----------" << endl;
+
+ // copy result to host
+ cudaMemcpy((void*)odata, (const void*)dev_data, sizeof(int) * n,
+ cudaMemcpyDeviceToHost);
+
+ // free memory on device
+ cudaFree(dev_data);
}
/**
@@ -27,7 +114,58 @@ void scan(int n, int *odata, const int *idata) {
*/
int compact(int n, int *odata, const int *idata) {
// TODO
- return -1;
+ int count;
+ int *dev_data;
+ int *dev_dataCopy;
+ int *dev_bool;
+ int *dev_boolScan;
+
+ // device memory allocation
+ cudaMalloc((void**)&dev_data, sizeof(int) * n);
+ checkCUDAError("Failed to allocate dev_data");
+
+ cudaMalloc((void**)&dev_dataCopy, sizeof(int) * n);
+ checkCUDAError("Failed to allocate dev_dataCopy");
+
+ cudaMalloc((void**)&dev_bool, sizeof(int) * n);
+ checkCUDAError("Failed to allocate dev_bool");
+
+ cudaMalloc((void**)&dev_boolScan, sizeof(int) * n);
+ checkCUDAError("Failed to allocate dev_boolScan");
+
+ // copy input data to device
+ cudaMemcpy((void*)dev_data, (const void*)idata, sizeof(int) * n,
+ cudaMemcpyHostToDevice);
+
+ dim3 blockCount = (n - 1) / BLOCK_SIZE + 1;
+
+ // map to booleans
+    Common::kernMapToBoolean<<<blockCount, BLOCK_SIZE>>>(n, dev_bool, dev_data);
+
+ // scan booleans
+ cudaMemcpy((void*)dev_boolScan, (const void*)dev_bool, sizeof(int) * n,
+ cudaMemcpyDeviceToDevice);
+ scanOnGPU(n, dev_boolScan);
+
+ // scatter
+ cudaMemcpy((void*)dev_dataCopy, (const void*)dev_data, sizeof(int) * n,
+ cudaMemcpyDeviceToDevice);
+    Common::kernScatter<<<blockCount, BLOCK_SIZE>>>(n, dev_data, dev_dataCopy,
+ dev_bool, dev_boolScan);
+
+ // copy result to host
+ cudaMemcpy((void*)odata, (const void*)dev_data, sizeof(int) * n,
+ cudaMemcpyDeviceToHost);
+    cudaMemcpy((void*)&count, (const void*)&dev_boolScan[n - 1], sizeof(int),
+        cudaMemcpyDeviceToHost);
+    // the exclusive scan's last element does not count the last flag; add it back
+    int lastBool;
+    cudaMemcpy((void*)&lastBool, (const void*)&dev_bool[n - 1], sizeof(int),
+        cudaMemcpyDeviceToHost);
+    count += lastBool;
+
+ // free memory on device
+ cudaFree(dev_data);
+ cudaFree(dev_dataCopy);
+ cudaFree(dev_bool);
+ cudaFree(dev_boolScan);
+
+ return count;
}
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 3d86b60..929995b 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -1,5 +1,10 @@
#include <cuda.h>
#include <cuda_runtime.h>
+#include <chrono>
+using namespace std::chrono;
+#include <iostream>
+using std::cout;
+using std::endl;
#include "common.h"
#include "naive.h"
@@ -7,13 +12,87 @@ namespace StreamCompaction {
namespace Naive {
// TODO: __global__
+__global__ void accumulate(const int n, const int step, int *idata, int *odata) {
+
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if (index >= n) {
+ return;
+ }
+
+ if (index - step < 0) {
+ odata[index] = idata[index];
+ } else {
+ odata[index] = idata[index] + idata[index - step];
+ }
+}
+
+__global__ void shiftRight(const int n, int *idata, int *odata) {
+
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if (index >= n) {
+ return;
+ } else if (index == 0) {
+ odata[index] = 0;
+ return;
+ }
+
+ odata[index] = idata[index - 1];
+}
+
+// swap
+inline void swap(int &x, int &y) {
+ auto tmp = x; x = y; y = tmp;
+}
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
// TODO
- printf("TODO\n");
+ // printf("TODO\n");
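+    // Ping-pong between two device buffers: each pass reads dev_data[input]
+    // and writes dev_data[output], then the roles are swapped.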
+ int *dev_data[2];
+ int input = 1;
+ int output = 0;
+
+ // device memory allocation
+ cudaMalloc((void**)&dev_data[0], sizeof(int) * n);
+ checkCUDAError("Failed to allocate dev_data[0]");
+
+ cudaMalloc((void**)&dev_data[1], sizeof(int) * n);
+ checkCUDAError("Failed to allocate dev_data[1]");
+
+ // copy input data to device
+ cudaMemcpy((void*)dev_data[input], (const void*)idata, sizeof(int) * n,
+ cudaMemcpyHostToDevice);
+
+ // tic
+ high_resolution_clock::time_point t1 = high_resolution_clock::now();
+
+ // do scan
+ dim3 blockCount = (n - 1) / BLOCK_SIZE + 1;
+
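+    // Shift the input right by one (inserting a leading 0) so the inclusive
+    // naive scan below produces an exclusive scan of the original data.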
+    shiftRight<<<blockCount, BLOCK_SIZE>>>(n, dev_data[input], dev_data[output]);
+
+ for (int step = 1; step < n; step <<= 1) {
+ swap(input, output);
+        accumulate<<<blockCount, BLOCK_SIZE>>>(n, step, dev_data[input], dev_data[output]);
+ }
+
+ // toc
+ cudaDeviceSynchronize();
+ high_resolution_clock::time_point t2 = high_resolution_clock::now();
+    duration<double, std::milli> t12 = duration_cast<duration<double, std::milli>>(t2 - t1);
+ cout << "----------Time consumed: " << t12.count() << " ms----------" << endl;
+
+ // copy result to host
+ cudaMemcpy((void*)odata, (const void*)dev_data[output], sizeof(int) * n,
+ cudaMemcpyDeviceToHost);
+
+ // free memory on device
+ cudaFree(dev_data[0]);
+ cudaFree(dev_data[1]);
}
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..34426af 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -16,6 +16,28 @@ void scan(int n, int *odata, const int *idata) {
// TODO use `thrust::exclusive_scan`
// example: for device_vectors dv_in and dv_out:
// thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+ int *dev_data;
+
+ // device memory allocation
+ cudaMalloc((void**)&dev_data, sizeof(int) * n);
+ checkCUDAError("Failed to allocate dev_data[0]");
+
+ // copy input data to device
+ cudaMemcpy((void*)dev_data, (const void*)idata, sizeof(int) * n,
+ cudaMemcpyHostToDevice);
+
+ // cast to device ptr
+    thrust::device_ptr<int> dev_thrust_data(dev_data);
+
+ // do scan
+ thrust::exclusive_scan(dev_thrust_data, dev_thrust_data + n, dev_thrust_data);
+
+ // copy result to host
+ cudaMemcpy((void*)odata, (const void*)dev_data, sizeof(int) * n,
+ cudaMemcpyDeviceToHost);
+
+ // free memory on device
+ cudaFree(dev_data);
}
}