diff --git a/Project2-Character-Recognition/CMakeLists.txt b/Project2-Character-Recognition/CMakeLists.txt
index 09e9198..1caf239 100644
--- a/Project2-Character-Recognition/CMakeLists.txt
+++ b/Project2-Character-Recognition/CMakeLists.txt
@@ -22,6 +22,7 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
 endif()
 
 include_directories(.)
+link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
 add_subdirectory(character_recognition)
 
 cuda_add_executable(${CMAKE_PROJECT_NAME}
@@ -32,4 +33,5 @@ cuda_add_executable(${CMAKE_PROJECT_NAME}
 target_link_libraries(${CMAKE_PROJECT_NAME}
     character_recognition
     ${CORELIBS}
+    cublas
     )
diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md
index 4503fac..924de17 100644
--- a/Project2-Character-Recognition/README.md
+++ b/Project2-Character-Recognition/README.md
@@ -1,14 +1,144 @@
-CUDA Character Recognition
+# Project 2-2 CUDA Character Recognition
 ======================
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Weiqi Chen
+  * [LinkedIn](https://www.linkedin.com/in/weiqi-ricky-chen-2b04b2ab/)
+* Tested on: Windows 10, i7-8750H @ 2.20GHz, 16GB, GTX 1050 2GB
 
-### (TODO: Your README)
+## Project Goal
+This project builds a multilayer perceptron (MLP) in CUDA. Here, a 3-layer MLP is implemented and tested on two different datasets.
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+The MLP consists of one input layer, one hidden layer, and one output layer. The number of neurons in the hidden layer is specified by the user at construction time.
+
+### XOR Example Test
+To verify that the implementation is correct, the MLP is first trained on the XOR operation as a simple sanity check. Parameters:
+
+* Input neurons: 2
+* Hidden neurons: 2
+* Output neurons: 1
+* Learning rate (lambda): 0.01
+* Initial weights from input layer to hidden layer: w11 = 10.1, w12 = 0.9, w21 = 20, w22 = 0.87
+* Initial weights from hidden layer to output layer: w1 = 41, w2 = -54
+
+The plot below shows the total squared error vs. the number of epochs.
+
+![](img/xor.png)
+
+Console output:
+
+```
+--- XOR Example ---
+epoch: 0 | error: 0.00286369
+epoch: 10 | error: 0.00229358
+epoch: 20 | error: 0.00190236
+epoch: 30 | error: 0.00162444
+epoch: 40 | error: 0.00142166
+epoch: 50 | error: 0.00127057
+epoch: 60 | error: 0.0011561
+epoch: 70 | error: 0.00106817
+epoch: 80 | error: 0.000999872
+epoch: 90 | error: 0.000946314
+epoch: 100 | error: 0.000903958
+epoch: 110 | error: 0.000870197
+epoch: 120 | error: 0.000843124
+epoch: 130 | error: 0.000821268
+epoch: 140 | error: 0.000803517
+```
+
+### Character Recognition
+Parameters:
+* Input neurons: 10201
+* Hidden neurons: 256
+* Output neurons: 52
+* Learning rate (lambda): 0.01
+* Initial weights: sampled uniformly at random from [-1, 1]
+
+The dataset contains 52 images of size 101x101, each showing one uppercase or lowercase letter of the alphabet. Each image matrix is therefore flattened into a 1x10201 input array, and the corresponding label is a 1x52 one-hot array.
+
+The plot below shows the total squared error vs. the number of epochs.
+
+![](img/error.png)
+
+The plot below shows the accuracy vs. the number of epochs.
+
+![](img/acc.png)
+
+Console output:
+
+```
+--- Character Recognition ---
+epoch: 0 | error: 0.179246 | accuracy: 90.3846%
+epoch: 10 | error: 0.0394659 | accuracy: 100%
+epoch: 20 | error: 0.018194 | accuracy: 100%
+epoch: 30 | error: 0.00813415 | accuracy: 100%
+epoch: 40 | error: 0.00660377 | accuracy: 100%
+epoch: 50 | error: 0.00379195 | accuracy: 100%
+epoch: 60 | error: 0.00267373 | accuracy: 100%
+epoch: 70 | error: 0.00241532 | accuracy: 100%
+epoch: 80 | error: 0.00199531 | accuracy: 100%
+epoch: 90 | error: 0.00165342 | accuracy: 100%
+epoch: 100 | error: 0.00178789 | accuracy: 100%
+epoch: 110 | error: 0.000856958 | accuracy: 100%
+epoch: 120 | error: 0.00144631 | accuracy: 100%
+epoch: 130 | error: 0.000920435 | accuracy: 100%
+epoch: 140 | error: 0.000663891 | accuracy: 100%
+---
+Target:1, Predicted:1
+Target:2, Predicted:2
+Target:3, Predicted:3
+Target:4, Predicted:4
+Target:5, Predicted:5
+Target:6, Predicted:6
+Target:7, Predicted:7
+Target:8, Predicted:8
+Target:9, Predicted:9
+Target:10, Predicted:10
+Target:11, Predicted:11
+Target:12, Predicted:12
+Target:13, Predicted:13
+Target:14, Predicted:14
+Target:15, Predicted:15
+Target:16, Predicted:16
+Target:17, Predicted:17
+Target:18, Predicted:18
+Target:19, Predicted:19
+Target:20, Predicted:20
+Target:21, Predicted:21
+Target:22, Predicted:22
+Target:23, Predicted:23
+Target:24, Predicted:24
+Target:25, Predicted:25
+Target:26, Predicted:26
+Target:27, Predicted:27
+Target:28, Predicted:28
+Target:29, Predicted:29
+Target:30, Predicted:30
+Target:31, Predicted:31
+Target:32, Predicted:32
+Target:33, Predicted:33
+Target:34, Predicted:34
+Target:35, Predicted:35
+Target:36, Predicted:36
+Target:37, Predicted:37
+Target:38, Predicted:38
+Target:39, Predicted:39
+Target:40, Predicted:40
+Target:41, Predicted:41
+Target:42, Predicted:42
+Target:43, Predicted:43
+Target:44, Predicted:44
+Target:45, Predicted:45
+Target:46, Predicted:46
+Target:47, Predicted:47
+Target:48, Predicted:48
+Target:49, Predicted:49
+Target:50, Predicted:50
+Target:51, Predicted:51
+Target:52, Predicted:52
+```
+
+### Extra Credit
+Matrix multiplication (via cuBLAS) is used in place of element-by-element 1D vector operations to carry a whole batch from the input layer through to the output layer.
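+
+For reference, each layer's batched forward pass reduces to a single SGEMM call. The sketch below illustrates the idea (the function and parameter names are illustrative, not the project's actual wrapper); it assumes all matrices are stored column-major, as cuBLAS expects:
+
+```cpp
+#include <cublas_v2.h>
+
+// C (n x m) = A (n x k) * B (k x m), all column-major.
+// A: batch of input activations, B: layer weights, C: pre-activations.
+void layerForward(cublasHandle_t handle, const float *A, const float *B,
+                  float *C, int n, int k, int m) {
+    const float alpha = 1.0f, beta = 0.0f;
+    // The leading dimension of a column-major matrix is its row count.
+    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k,
+                &alpha, A, n, B, k, &beta, C, n);
+}
+```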
diff --git a/Project2-Character-Recognition/character_recognition/CMakeLists.txt b/Project2-Character-Recognition/character_recognition/CMakeLists.txt
index 7446175..9e834c1 100644
--- a/Project2-Character-Recognition/character_recognition/CMakeLists.txt
+++ b/Project2-Character-Recognition/character_recognition/CMakeLists.txt
@@ -7,5 +7,5 @@ set(SOURCE_FILES
 
 cuda_add_library(character_recognition
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )
diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu
index 5a3ed7f..c21f96a 100644
--- a/Project2-Character-Recognition/character_recognition/mlp.cu
+++ b/Project2-Character-Recognition/character_recognition/mlp.cu
@@ -2,26 +2,275 @@
 #include <cuda.h>
 #include "common.h"
 #include "mlp.h"
+#include <cublas_v2.h>
+#include <thrust/random.h>
+#include <cmath>
 
-namespace CharacterRecognition {
-    using Common::PerformanceTimer;
-    PerformanceTimer& timer()
-    {
-        static PerformanceTimer timer;
-        return timer;
-    }
-
-    // TODO: __global__
-
-    /**
-     * Example of use case (follow how you did it in stream compaction)
-     */
-    /*void scan(int n, int *odata, const int *idata) {
-        timer().startGpuTimer();
-        // TODO
-        timer().endGpuTimer();
-    }
-    */
-
-    // TODO: implement required elements for MLP sections 1 and 2 here
-}
+
+#define blockSize 128
+
+void gpu_blas_mmul(cublasHandle_t &handle, const float *A, const float *B, float *C, const int aRow, const int bRow, const int cRow) {
+    const float alf = 1;
+    const float bet = 0;
+
+    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, aRow, cRow, bRow, &alf, A, aRow, B, bRow, &bet, C, aRow);
+}
+
+void gpu_blas_mtrans(cublasHandle_t &handle, const float *A, float *B, const int row, const int col) {
+    float alf = 1;
+    float bet = 0;
+
+    cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, row, col, &alf, A, col, &bet, A, col, B, row);
+}
+
+namespace CharacterRecognition {
+    using Common::PerformanceTimer;
+    PerformanceTimer& timer()
+    {
+        static PerformanceTimer timer;
+        return timer;
+    }
+
+    __host__ __device__ unsigned int hash(unsigned int a) {
+        a = (a + 0x7ed55d16) + (a << 12);
+        a = (a ^ 0xc761c23c) ^ (a >> 19);
+        a = (a + 0x165667b1) + (a << 5);
+        a = (a + 0xd3a2646c) ^ (a << 9);
+        a = (a + 0xfd7046c5) + (a << 3);
+        a = (a ^ 0xb55a4f09) ^ (a >> 16);
+        return a;
+    }
+
+    __global__ void kernRandomNumber(int n, int time, float* array) {
+        int index = threadIdx.x + (blockIdx.x * blockDim.x);
+        if (index >= n) {
+            return;
+        }
+        thrust::default_random_engine rng(hash((int)(index * time)));
+        thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
+        array[index] = (float)unitDistrib(rng);
+    }
+
+    __global__ void kernSigmoid(int n, float *out, float *in) {
+        int index = threadIdx.x + (blockIdx.x * blockDim.x);
+        if (index >= n) {
+            return;
+        }
+        out[index] = 1 / (1 + exp(-in[index]));
+    }
+
+    // This kernel is always fed buffers that already hold sigmoid
+    // activations, so the derivative is simply in * (1 - in).
+    __global__ void kernSmDerivative(int n, float *out, float *in) {
+        int index = threadIdx.x + (blockIdx.x * blockDim.x);
+        if (index >= n) {
+            return;
+        }
+        out[index] = in[index] * (1 - in[index]);
+    }
+
+    __global__ void kernLoss(int n, float *out, float *a, float *b) {
+        int index = threadIdx.x + (blockIdx.x * blockDim.x);
+        if (index >= n) {
+            return;
+        }
+        float diff = a[index] - b[index];
+        out[index] = powf(diff, 2);
+    }
+
+    __global__ void kernUpdate(int n, float *weights, float *gradients, float lambda) {
+        int index = threadIdx.x + (blockIdx.x * blockDim.x);
+        if (index >= n) {
+            return;
+        }
+        weights[index] -= gradients[index] * lambda;
+    }
+    __global__ void kernSubtract(int n, float *out, float *a, float *b) {
+        int index = threadIdx.x + (blockIdx.x * blockDim.x);
+        if (index >= n) {
+            return;
+        }
+        out[index] = a[index] - b[index];
+    }
+
+    __global__ void kernDotProd(int n, float *out, float *a, float *b) {
+        int index = threadIdx.x + (blockIdx.x * blockDim.x);
+        if (index >= n) {
+            return;
+        }
+        out[index] = a[index] * b[index];
+    }
+
+    mlp::mlp(int input, int hidden, int output) {
+        input_size = input;
+        hidden_size = hidden;
+        output_size = output;
+
+        cudaMalloc((void**)&wkj, input_size * hidden_size * sizeof(float));
+        cudaMalloc((void**)&wji, hidden_size * output_size * sizeof(float));
+        cudaMalloc((void**)&gwkj, input_size * hidden_size * sizeof(float));
+        cudaMalloc((void**)&gwji, hidden_size * output_size * sizeof(float));
+
+        cublasCreate(&handle);
+    }
+
+    mlp::~mlp() {
+        cudaFree(wkj);
+        cudaFree(wji);
+        cudaFree(gwkj);
+        cudaFree(gwji);
+
+        cublasDestroy(handle);
+    }
+
+    void mlp::initRandom() {
+        // Initialize both weight matrices uniformly at random in [-1, 1].
+        int n1 = input_size * hidden_size;
+        int fullBlockPerGrid1 = (n1 + blockSize - 1) / blockSize;
+        kernRandomNumber<<<fullBlockPerGrid1, blockSize>>>(n1, 1, wkj);
+
+        int n2 = hidden_size * output_size;
+        int fullBlockPerGrid2 = (n2 + blockSize - 1) / blockSize;
+        kernRandomNumber<<<fullBlockPerGrid2, blockSize>>>(n2, 1, wji);
+    }
+
+    void mlp::initWeights(float *xwkj, float *xwji) {
+        cudaMemcpy(wkj, xwkj, input_size * hidden_size * sizeof(float), cudaMemcpyHostToDevice);
+        cudaMemcpy(wji, xwji, hidden_size * output_size * sizeof(float), cudaMemcpyHostToDevice);
+    }
+
+    void mlp::train(float *x_train, float *y_train, int n, int epoch) {
+        num = n;
+        int n1 = n * input_size;
+        int n2 = n * hidden_size;
+        int n3 = n * output_size;
+
+        cudaMalloc((void**)&dev_x, n1 * sizeof(float));
+        cudaMalloc((void**)&dev_target, n3 * sizeof(float));
+        cudaMemcpy(dev_x, x_train, n1 * sizeof(float), cudaMemcpyHostToDevice);
+        cudaMemcpy(dev_target, y_train, n3 * sizeof(float), cudaMemcpyHostToDevice);
+
+        cudaMalloc((void**)&dev_hidden, n2 * sizeof(float));
+        cudaMalloc((void**)&dev_hidden_sm, n2 * sizeof(float));
+        cudaMalloc((void**)&dev_y, n3 * sizeof(float));
+        cudaMalloc((void**)&dev_y_sm, n3 * sizeof(float));
+
+        for (int i = 0; i < epoch; i++) {
+            forward();
+            backProp();
+            loss();
+            update();
+        }
+
+        cudaFree(dev_x);
+        cudaFree(dev_target);
+        cudaFree(dev_hidden);
+        cudaFree(dev_hidden_sm);
+        cudaFree(dev_y);
+        cudaFree(dev_y_sm);
+    }
+
+    void mlp::predict(float *x, float *y, int n) {
+        num = n;
+        cudaMalloc((void**)&dev_x, n * input_size * sizeof(float));
+        cudaMalloc((void**)&dev_hidden, n * hidden_size * sizeof(float));
+        cudaMalloc((void**)&dev_hidden_sm, n * hidden_size * sizeof(float));
+        cudaMalloc((void**)&dev_y, n * output_size * sizeof(float));
+        cudaMalloc((void**)&dev_y_sm, n * output_size * sizeof(float));
+        cudaMemcpy(dev_x, x, n * input_size * sizeof(float), cudaMemcpyHostToDevice);
+
+        forward();
+        cudaMemcpy(y, dev_y_sm, n * output_size * sizeof(float), cudaMemcpyDeviceToHost);
+
+        cudaFree(dev_x);
+        cudaFree(dev_hidden);
+        cudaFree(dev_hidden_sm);
+        cudaFree(dev_y);
+        cudaFree(dev_y_sm);
+    }
+
+    void mlp::forward() {
+        gpu_blas_mmul(handle, dev_x, wkj, dev_hidden, num, input_size, hidden_size);
+        dim3 fullBlockPerGrid1((hidden_size * num + blockSize - 1) / blockSize);
+        kernSigmoid<<<fullBlockPerGrid1, blockSize>>>(hidden_size * num, dev_hidden_sm, dev_hidden);
+
+        gpu_blas_mmul(handle, dev_hidden_sm, wji, dev_y, num, hidden_size, output_size);
+        dim3 fullBlockPerGrid2((output_size * num + blockSize - 1) / blockSize);
+        kernSigmoid<<<fullBlockPerGrid2, blockSize>>>(output_size * num, dev_y_sm, dev_y);
+    }
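+
+    // Commentary on the backward pass (added note): with sigmoid activations
+    // and the squared error reported by loss(), the output delta is
+    //     psi_right = (y - t) .* y .* (1 - y)
+    // and the hidden delta is
+    //     temp2 = (psi_right * Wji^T) .* h .* (1 - h)
+    // where y, t, h are the batched output activations, targets, and hidden
+    // activations. The weight gradients are then the matrix products
+    //     gWji = Hsm^T * psi_right   and   gWkj = X^T * temp2,
+    // which is exactly the sequence of transposes, GEMMs, and element-wise
+    // kernels in backProp() below (constant factors are absorbed into the
+    // learning rate).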
+
+    void mlp::backProp() {
+        float *wji_T, *y_smDer, *temp1, *psi_right, *hidden_sm_T, *hidden_smDer, *psi_left, *temp2, *dev_x_T;
+
+        cudaMalloc((void**)&dev_x_T, num * input_size * sizeof(float));
+        cudaMalloc((void**)&y_smDer, num * output_size * sizeof(float));
+        cudaMalloc((void**)&temp1, num * output_size * sizeof(float));
+        cudaMalloc((void**)&psi_right, num * output_size * sizeof(float));
+        cudaMalloc((void**)&hidden_sm_T, num * hidden_size * sizeof(float));
+        cudaMalloc((void**)&hidden_smDer, num * hidden_size * sizeof(float));
+        cudaMalloc((void**)&psi_left, num * hidden_size * sizeof(float));
+        cudaMalloc((void**)&wji_T, output_size * hidden_size * sizeof(float));
+        cudaMalloc((void**)&temp2, num * hidden_size * sizeof(float));
+
+        int n1 = output_size * num;
+        dim3 fullBlockPerGrid1((n1 + blockSize - 1) / blockSize);
+        kernSmDerivative<<<fullBlockPerGrid1, blockSize>>>(n1, y_smDer, dev_y_sm);
+        kernSubtract<<<fullBlockPerGrid1, blockSize>>>(num * output_size, temp1, dev_y_sm, dev_target);
+        kernDotProd<<<fullBlockPerGrid1, blockSize>>>(n1, psi_right, y_smDer, temp1);
+        gpu_blas_mtrans(handle, dev_hidden_sm, hidden_sm_T, hidden_size, num);
+        gpu_blas_mmul(handle, hidden_sm_T, psi_right, gwji, hidden_size, num, output_size);
+
+        int n2 = hidden_size * num;
+        dim3 fullBlockPerGrid2((n2 + blockSize - 1) / blockSize);
+        kernSmDerivative<<<fullBlockPerGrid2, blockSize>>>(n2, hidden_smDer, dev_hidden_sm);
+        gpu_blas_mtrans(handle, wji, wji_T, output_size, hidden_size);
+        gpu_blas_mmul(handle, psi_right, wji_T, psi_left, num, output_size, hidden_size);
+        kernDotProd<<<fullBlockPerGrid2, blockSize>>>(n2, temp2, psi_left, hidden_smDer);
+
+        gpu_blas_mtrans(handle, dev_x, dev_x_T, input_size, num);
+        gpu_blas_mmul(handle, dev_x_T, temp2, gwkj, input_size, num, hidden_size);
+
+        cudaFree(y_smDer);
+        cudaFree(temp1);
+        cudaFree(psi_right);
+        cudaFree(hidden_sm_T);
+        cudaFree(hidden_smDer);
+        cudaFree(psi_left);
+        cudaFree(temp2);
+        cudaFree(dev_x_T);
+        cudaFree(wji_T);
+    }
+
+    void mlp::loss() {
+        int n1 = num * output_size;
+        float *dev_loss;
+        float *h_loss = new float[n1]();
+
+        cudaMalloc((void**)&dev_loss, sizeof(float) * n1);
+        dim3 fullBlockPerGrid = dim3((n1 + blockSize - 1) / blockSize);
+        kernLoss<<<fullBlockPerGrid, blockSize>>>(n1, dev_loss, dev_target, dev_y_sm);
+        cudaMemcpy(h_loss, dev_loss, n1 * sizeof(float), cudaMemcpyDeviceToHost);
+
+        error = 0.0;
+        for (int i = 0; i < n1; i++) {
+            error += h_loss[i];
+        }
+        error /= (2.0 * output_size);
+        cudaFree(dev_loss);
+        delete[] h_loss;
+    }
+
+    void mlp::update() {
+        float lambda = 0.01;
+
+        dim3 fullBlockPerGrid1 = dim3((hidden_size * output_size + blockSize - 1) / blockSize);
+        kernUpdate<<<fullBlockPerGrid1, blockSize>>>(hidden_size * output_size, wji, gwji, lambda);
+
+        dim3 fullBlockPerGrid2 = dim3((input_size * hidden_size + blockSize - 1) / blockSize);
+        kernUpdate<<<fullBlockPerGrid2, blockSize>>>(input_size * hidden_size, wkj, gwkj, lambda);
+    }
+}
\ No newline at end of file
diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h
index 2096228..0630411 100644
--- a/Project2-Character-Recognition/character_recognition/mlp.h
+++ b/Project2-Character-Recognition/character_recognition/mlp.h
@@ -1,9 +1,46 @@
 #pragma once
-
 #include "common.h"
+#include <cublas_v2.h>
+
 namespace CharacterRecognition {
-    Common::PerformanceTimer& timer();
+    Common::PerformanceTimer& timer();
+
+    class mlp {
+    public:
+        mlp(int n_input, int n_hidden, int n_output);
+        ~mlp();
+
+        void initWeights(float *wkj, float *wji);
+        void initRandom();
+        void train(float *x, float *y, int n, int epoch);
+        void predict(float *x, float *y, int n);
+        float getError() const {
+            return error;
+        }
error; + } + + private: + void forward(); + void backProp(); + void loss(); + void update(); + + int input_size; + int hidden_size; + int output_size; + + float *wkj, *wji; + float *gwkj, *gwji; + int num; + float* dev_x; + float* dev_hidden; + float* dev_hidden_sm; + float* dev_y; + float* dev_y_sm; + float* dev_target; + float error; - // TODO: implement required elements for MLP sections 1 and 2 here + cublasHandle_t handle; + }; } diff --git a/Project2-Character-Recognition/img/acc.png b/Project2-Character-Recognition/img/acc.png new file mode 100644 index 0000000..e4b5ab1 Binary files /dev/null and b/Project2-Character-Recognition/img/acc.png differ diff --git a/Project2-Character-Recognition/img/error.png b/Project2-Character-Recognition/img/error.png new file mode 100644 index 0000000..c5f7e6c Binary files /dev/null and b/Project2-Character-Recognition/img/error.png differ diff --git a/Project2-Character-Recognition/img/xor.png b/Project2-Character-Recognition/img/xor.png new file mode 100644 index 0000000..bbe667e Binary files /dev/null and b/Project2-Character-Recognition/img/xor.png differ diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index 11dd534..9263ad6 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -1,8 +1,8 @@ /** * @file main.cpp * @brief Stream compaction test program - * @authors Kai Ninomiya - * @date 2015 + * @authors Weiqi Chen + * @date 2019 * @copyright University of Pennsylvania */ @@ -10,143 +10,160 @@ #include #include #include "testing_helpers.hpp" +#include +#include + +using namespace std; +bool test(bool flag, CharacterRecognition::mlp &mlp, int label); +int decode(int n, float* array); + + +void xor_test() { + const int input_size = 2; + const int hidden_size = 2; + const int output_size = 1; + + CharacterRecognition::mlp xor(input_size, hidden_size, output_size); + float *wkj = new float[input_size * hidden_size]; + float *wji = new float[hidden_size * output_size]; + + wkj[0] = 10.1; + wkj[1] = 20; + wkj[2] = 0.9; + wkj[3] = 0.87; + + wji[0] = 41; + wji[1] = -54; + xor.initWeights(wkj, wji); + + const int combo = 4; + float *x = new float[combo * input_size]; + float *y = new float[combo * output_size]; + + x[0] = 0; + x[4] = 0; + y[0] = 0; + + x[1] = 0; + x[5] = 1; + y[1] = 1; + + x[2] = 1; + x[6] = 0; + y[2] = 1; + + x[3] = 1; + x[7] = 1; + y[3] = 0; + + cout << "--- XOR ---" << endl; + for (int e = 0; e < 14; e++) { + // Train + xor.train(x, y, combo, 10); + float err = xor.getError(); + + std::cout << "epoch: " << e * 10 << " | error: " << err << std::endl; + } +} + +void char_reg() { + + const int num_data = 52; + const int input_size = 10201; + const int hidden_size = 256; + const int output_size = 52; + CharacterRecognition::mlp characterRec(input_size, hidden_size, output_size); + characterRec.initRandom(); + + float *x = new float[num_data * input_size]; + float *y = new float[num_data * output_size]; + memset(y, 0, num_data * output_size * sizeof(float)); + + for (int i = 0; i < 52; i++) { + string id = to_string(i + 1); + if (id.length() == 1) { + id = "0" + id; + } + string name = "../data-set/" + id + "info.txt"; + ifstream input_stream(name); + + int a, b; + input_stream >> a; + input_stream >> b; + for (int j = 0; j < input_size; j++) { + input_stream >> x[i+ j * num_data]; + } + y[i * num_data + i] = 1.0; + } + + cout << "--- Character Recognition ---" << endl; + for (int t = 0; t < 14; t++) { + + characterRec.train(x, y, 
+void char_reg() {
+    const int num_data = 52;
+    const int input_size = 10201;
+    const int hidden_size = 256;
+    const int output_size = 52;
+    CharacterRecognition::mlp characterRec(input_size, hidden_size, output_size);
+    characterRec.initRandom();
+
+    float *x = new float[num_data * input_size];
+    float *y = new float[num_data * output_size];
+    memset(y, 0, num_data * output_size * sizeof(float));
+
+    for (int i = 0; i < 52; i++) {
+        string id = to_string(i + 1);
+        if (id.length() == 1) {
+            id = "0" + id;
+        }
+        string name = "../data-set/" + id + "info.txt";
+        ifstream input_stream(name);
+
+        int a, b;
+        input_stream >> a;
+        input_stream >> b;
+        for (int j = 0; j < input_size; j++) {
+            input_stream >> x[i + j * num_data];
+        }
+        y[i * num_data + i] = 1.0;
+    }
+
+    cout << "--- Character Recognition ---" << endl;
+    for (int t = 0; t < 14; t++) {
+        characterRec.train(x, y, num_data, 10);
+        float err = characterRec.getError();
+        int correct_cnt = 0;
+        for (int i = 1; i <= 52; i++) {
+            if (test(false, characterRec, i)) {
+                correct_cnt++;
+            }
+        }
+        std::cout << "epoch: " << t * 10 << " | error: " << err << " | accuracy: " << correct_cnt / 52.0 * 100 << "%" << std::endl;
+    }
+    cout << "---" << endl;
+    for (int i = 1; i <= 52; i++) {
+        test(true, characterRec, i);
+    }
+
+    delete[] x;
+    delete[] y;
+}
 
-const int SIZE = 1 << 8; // feel free to change the size of array
-const int NPOT = SIZE - 3; // Non-Power-Of-Two
-int *a = new int[SIZE];
-int *b = new int[SIZE];
-int *c = new int[SIZE];
+bool test(bool flag, CharacterRecognition::mlp &mlp, int label) {
+    float *x = new float[10201];
+    float *y = new float[52];
+
+    string id = to_string(label);
+    if (id.length() == 1) {
+        id = "0" + id;
+    }
+    string name = "../data-set/" + id + "info.txt";
+    ifstream input_stream(name);
+    int a, b;
+    input_stream >> a;
+    input_stream >> b;
+
+    for (int i = 0; i < 10201; i++) {
+        input_stream >> x[i];
+    }
+    mlp.predict(x, y, 1);
+    int pred = decode(52, y) + 1;
+    if (flag) {
+        cout << "Target:" << label << ", Predicted:" << pred << endl;
+    }
+
+    delete[] x;
+    delete[] y;
+    return pred == label;
 
-int main(int argc, char* argv[]) {
-    // Scan tests
-
-    printf("\n");
-    printf("****************\n");
-    printf("** SCAN TESTS **\n");
-    printf("****************\n");
-
-    genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    // initialize b using StreamCompaction::CPU::scan you implement
-    // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
-    // At first all cases passed because b && c are all zeroes.
-    zeroArray(SIZE, b);
-    printDesc("cpu scan, power-of-two");
-    StreamCompaction::CPU::scan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(SIZE, b, true);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu scan, non-power-of-two");
-    StreamCompaction::CPU::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(NPOT, b, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, power-of-two");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
-    onesArray(SIZE, c);
-    printDesc("1s array for finding bugs");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printArray(SIZE, c, true); */
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, non-power-of-two");
-    StreamCompaction::Naive::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, power-of-two");
-    StreamCompaction::Efficient::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, non-power-of-two");
-    StreamCompaction::Efficient::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, power-of-two");
-    StreamCompaction::Thrust::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, non-power-of-two");
-    StreamCompaction::Thrust::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    printf("\n");
-    printf("*****************************\n");
-    printf("** STREAM COMPACTION TESTS **\n");
-    printf("*****************************\n");
-
-    // Compaction tests
-
-    genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    int count, expectedCount, expectedNPOT;
-
-    // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
-    // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
-    zeroArray(SIZE, b);
-    printDesc("cpu compact without scan, power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedCount = count;
-    printArray(count, b, true);
-    printCmpLenResult(count, expectedCount, b, b);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact without scan, non-power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedNPOT = count;
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact with scan");
-    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, power-of-two");
-    count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, non-power-of-two");
-    count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    system("pause"); // stop Win32 console from closing on exit
-    delete[] a;
-    delete[] b;
-    delete[] c;
 }
+
+int decode(int n, float* array) {
+    // Argmax over the output activations; -FLT_MAX (rather than FLT_MIN,
+    // the smallest positive float) keeps this correct even if an
+    // activation could be non-positive.
+    float temp = -FLT_MAX;
+    int index = 0;
+    for (int i = 0; i < n; i++) {
+        if (array[i] > temp) {
+            index = i;
+            temp = array[i];
+        }
+    }
+    return index;
+}
+
+int main(int argc, char* argv[]) {
+    xor_test();
+    char_reg();
+}
\ No newline at end of file
diff --git a/Project2-Stream-Compaction/README.md b/Project2-Stream-Compaction/README.md
index 0e38ddb..f14e7c6 100644
--- a/Project2-Stream-Compaction/README.md
+++ b/Project2-Stream-Compaction/README.md
@@ -1,14 +1,93 @@
-CUDA Stream Compaction
+# Project 2-1 GPU Stream Compaction
 ======================
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Weiqi Chen
+  * [LinkedIn](https://www.linkedin.com/in/weiqi-ricky-chen-2b04b2ab/)
+* Tested on: Windows 10, i7-8750H @ 2.20GHz, 16GB, GTX 1050 2GB
 
-### (TODO: Your README)
+## Project Goal
+In this project, we implement CUDA stream compaction and use it to remove 0s (or any other unwanted elements) from arrays. Three versions of exclusive scan are implemented for computing scatter indices:
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+* CPU Scan
+* Naive GPU Scan
+* Work-Efficient GPU Scan
+
+We compare the performance of these three methods against the implementation provided by the Thrust library.
+
+## Performance Analysis
+### Block Size Optimization
+The bar plot below shows performance vs. block size. A block size of 128 performs best, and we use it for all subsequent measurements.
+
+![](img/bar.png)
+
+### GPU Scan Implementations vs. CPU Scan Implementation
+Below is a plot of performance vs. array size for all four methods. Each method is measured on both a power-of-two and a non-power-of-two array size.
+
+![](img/plot.png)
+
+* When the array size is below 2^13, the CPU scan performs best. This is likely due to the overhead of launching kernels when computation time is not the bottleneck.
+* As the array size grows, the GPU implementations begin to perform much better than the CPU method.
+* Thrust's scan is the fastest of all at large array sizes. This could be due to its use of shared memory instead of reading from global memory.
+
+### Extra Credit
+
+I did not encounter the problem of the GPU approach running slower than expected. I believe this is because, during the up-sweep and down-sweep of the work-efficient scan, I change the grid size and launch only the necessary threads at each stage of the sweep. If we instead launched as many threads as the array size throughout the computation, the inactive threads would drag down performance.
+
+Console output with a block size of 128 and an array size of 2^16:
+```
+****************
+** SCAN TESTS **
+****************
+    [  17  25  12  21  22   4  37  44  40   8  40  46  43 ...   5   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0.2863ms    (std::chrono Measured)
+    [   0  17  42  54  75  97 101 138 182 222 230 270 316 ... 3209400 3209405 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.2797ms    (std::chrono Measured)
+    [   0  17  42  54  75  97 101 138 182 222 230 270 316 ... 3209316 3209325 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 0.726016ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.727008ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.393216ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.39216ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 17.0906ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 11.7248ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   0   3   2   2   0   0   1   1   2   0   0   2 ...   2   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.5571ms    (std::chrono Measured)
+    [   2   3   2   2   1   1   2   2   1   1   1   3   1 ...   2   2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.5431ms    (std::chrono Measured)
+    [   2   3   2   2   1   1   2   2   1   1   1   3   1 ...   2   1 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 0.2748ms    (std::chrono Measured)
+    [   2   3   2   2   1   1   2   2   1   1   1   3   1 ...   2   2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0.490592ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0.49152ms    (CUDA Measured)
+    passed
+```
diff --git a/Project2-Stream-Compaction/img/1.png b/Project2-Stream-Compaction/img/1.png
new file mode 100644
index 0000000..92ee62b
Binary files /dev/null and b/Project2-Stream-Compaction/img/1.png differ
diff --git a/Project2-Stream-Compaction/img/bar.png b/Project2-Stream-Compaction/img/bar.png
new file mode 100644
index 0000000..f3f63f4
Binary files /dev/null and b/Project2-Stream-Compaction/img/bar.png differ
diff --git a/Project2-Stream-Compaction/img/plot.png b/Project2-Stream-Compaction/img/plot.png
new file mode 100644
index 0000000..7959363
Binary files /dev/null and b/Project2-Stream-Compaction/img/plot.png differ
diff --git a/Project2-Stream-Compaction/src/main.cpp b/Project2-Stream-Compaction/src/main.cpp
index d016553..49243a0 100644
--- a/Project2-Stream-Compaction/src/main.cpp
+++ b/Project2-Stream-Compaction/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 2 << 20; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
diff --git a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
index cdbef77..4bb0dc2 100644
--- a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
+++ b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )
diff --git a/Project2-Stream-Compaction/stream_compaction/common.cu b/Project2-Stream-Compaction/stream_compaction/common.cu
index 2ed6d63..b596264 100644
--- a/Project2-Stream-Compaction/stream_compaction/common.cu
+++ b/Project2-Stream-Compaction/stream_compaction/common.cu
@@ -24,6 +24,12 @@ namespace StreamCompaction {
         */
        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
            // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index >= n) {
+                return;
+            }
+            bools[index] = idata[index] == 0 ? 0 : 1;
        }
 
        /**
@@ -33,6 +39,14 @@ namespace StreamCompaction {
        __global__ void kernScatter(int n, int *odata, const int *idata,
                const int *bools, const int *indices) {
            // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index >= n) {
+                return;
+            }
+            if (bools[index]) {
+                odata[indices[index]] = idata[index];
+            }
        }
    }
}
diff --git a/Project2-Stream-Compaction/stream_compaction/common.h b/Project2-Stream-Compaction/stream_compaction/common.h
index 996997e..2938bc0 100644
--- a/Project2-Stream-Compaction/stream_compaction/common.h
+++ b/Project2-Stream-Compaction/stream_compaction/common.h
@@ -12,6 +12,7 @@
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+#define blockSize 128
 
 /**
  * Check for CUDA errors; print and exit if there was a problem.
diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu
index a2d3e6c..501803d 100644
--- a/Project2-Stream-Compaction/stream_compaction/cpu.cu
+++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu
@@ -20,6 +20,10 @@ namespace StreamCompaction {
        void scan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+            odata[0] = 0;
+            for (int i = 1; i < n; i++) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
            timer().endCpuTimer();
        }
 
@@ -31,8 +35,14 @@ namespace StreamCompaction {
        int compactWithoutScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+            int remain = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[remain++] = idata[i];
+                }
+            }
            timer().endCpuTimer();
-            return -1;
+            return remain;
        }
 
        /**
@@ -41,10 +51,33 @@ namespace StreamCompaction {
         * @returns the number of elements remaining after compaction.
         */
        int compactWithScan(int n, int *odata, const int *idata) {
+
+            int *temp = (int*)malloc(n * sizeof(int));
+            int *scanResult = (int*)malloc(n * sizeof(int));
+
            timer().startCpuTimer();
            // TODO
+            int remain = 0;
+            for (int i = 0; i < n; i++) {
+                temp[i] = idata[i] == 0 ? 0 : 1;
+            }
+
+            scanResult[0] = 0;
+            for (int i = 1; i < n; i++) {
+                scanResult[i] = scanResult[i - 1] + temp[i - 1];
+            }
+
+            for (int i = 0; i < n; i++) {
+                if (temp[i] == 1) {
+                    odata[remain++] = idata[i];
+                }
+            }
+
            timer().endCpuTimer();
-            return -1;
+
+            free(temp);
+            free(scanResult);
+            return remain;
        }
    }
}
diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu
index 2db346e..298317e 100644
--- a/Project2-Stream-Compaction/stream_compaction/efficient.cu
+++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu
@@ -3,6 +3,7 @@
 #include "common.h"
 #include "efficient.h"
 
+
 namespace StreamCompaction {
    namespace Efficient {
        using StreamCompaction::Common::PerformanceTimer;
@@ -12,13 +13,58 @@ namespace StreamCompaction {
            return timer;
        }
 
+        __global__ void kernUpSweep(int n, int offset, int *idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            idata[(index + 1) * offset * 2 - 1] += idata[offset * (2 * index + 1) - 1];
+        }
+
+        __global__ void kernDownSweep(int n, int offset, int *idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            int t = idata[offset * (2 * index + 1) - 1];
+            idata[offset * (2 * index + 1) - 1] = idata[(index + 1) * offset * 2 - 1];
+            idata[(index + 1) * offset * 2 - 1] += t;
+        }
+
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+
+            int tempSize = pow(2, ilog2ceil(n));
+            int* dev_idata;
+            cudaMalloc((void**)&dev_idata, tempSize * sizeof(int));
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("cudaMemcpyHostToDevice failed!");
+
            timer().startGpuTimer();
            // TODO
-            timer().endGpuTimer();
+            int level = ilog2ceil(n);
+            dim3 fullBlocksPerGrid;
+
+            for (int d = 0; d < level; d++) {
+                int threads = 1 << (level - 1 - d);
+                fullBlocksPerGrid = dim3((threads + blockSize - 1) / blockSize);
+                kernUpSweep<<<fullBlocksPerGrid, blockSize>>>(threads, 1 << d, dev_idata);
+            }
+
+            cudaMemset(dev_idata + tempSize - 1, 0, sizeof(int));
+
+            for (int d = level - 1; d >= 0; d--) {
+                int threads = 1 << (level - 1 - d);
+                fullBlocksPerGrid = dim3((threads + blockSize - 1) / blockSize);
+                kernDownSweep<<<fullBlocksPerGrid, blockSize>>>(threads, 1 << d, dev_idata);
+            }
+
+            timer().endGpuTimer();
+            cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("cudaMemcpyDeviceToHost failed!");
+            cudaFree(dev_idata);
        }
 
        /**
@@ -31,10 +77,59 @@ namespace StreamCompaction {
         * @returns The number of elements remaining after compaction.
         */
        int compact(int n, int *odata, const int *idata) {
+
+            int tempSize = pow(2, ilog2ceil(n));
+            int *dev_indices, *dev_bools, *dev_odata, *dev_idata;
+
+            cudaMalloc((void**)&dev_indices, tempSize * sizeof(int));
+            cudaMalloc((void**)&dev_bools, n * sizeof(int));
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
            timer().startGpuTimer();
            // TODO
+            dim3 fullBlocksPerGrid = (n + blockSize - 1) / blockSize;
+            Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(n, dev_bools, dev_idata);
+
+            dim3 fullBlocksPerGrid2;
+            int level = ilog2ceil(tempSize);
+            cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice);
+            int threads;
+
+            for (int d = 0; d < level; d++) {
+                threads = 1 << (level - 1 - d);
+                fullBlocksPerGrid2 = dim3((threads + blockSize - 1) / blockSize);
+                kernUpSweep<<<fullBlocksPerGrid2, blockSize>>>(threads, 1 << d, dev_indices);
+            }
+
+            cudaMemset(dev_indices + tempSize - 1, 0, sizeof(int));
+
+            for (int d = level - 1; d >= 0; d--) {
+                threads = 1 << (level - 1 - d);
+                fullBlocksPerGrid2 = dim3((threads + blockSize - 1) / blockSize);
+                kernDownSweep<<<fullBlocksPerGrid2, blockSize>>>(threads, 1 << d, dev_indices);
+            }
+            Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
+
            timer().endGpuTimer();
-            return -1;
+            int *bools = (int*)malloc(n * sizeof(int));
+            int *indices = (int*)malloc(n * sizeof(int));
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(bools, dev_bools, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(indices, dev_indices, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            int remain = bools[n - 1] ? indices[n - 1] + 1 : indices[n - 1];
+
+            cudaFree(dev_bools);
+            cudaFree(dev_indices);
+            cudaFree(dev_odata);
+            cudaFree(dev_idata);
+            free(bools);
+            free(indices);
+
+            return remain;
        }
    }
}
diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu
index 4308876..742ca0c 100644
--- a/Project2-Stream-Compaction/stream_compaction/naive.cu
+++ b/Project2-Stream-Compaction/stream_compaction/naive.cu
@@ -3,6 +3,8 @@
 #include "common.h"
 #include "naive.h"
 
+
+
 namespace StreamCompaction {
    namespace Naive {
        using StreamCompaction::Common::PerformanceTimer;
@@ -12,14 +14,63 @@ namespace StreamCompaction {
            return timer;
        }
        // TODO: __global__
+        __global__ void kernNaiveScan(int n, int offset, int *odata, const int *idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index >= n) {
+                return;
+            }
+
+            if (index >= offset) {
+                odata[index] = idata[index] + idata[index - offset];
+            }
+            else {
+                odata[index] = idata[index];
+            }
+        }
 
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
+
+            int* dev_data1;
+            int* dev_data2;
+            cudaMalloc((void**)&dev_data1, n * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc dev_data1 failed!");
+
+            cudaMalloc((void**)&dev_data2, n * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc dev_data2 failed!");
+
+            cudaMemcpy(dev_data1, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("cudaMemcpyHostToDevice failed!");
+
            timer().startGpuTimer();
            // TODO
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+            // Ping-pong between the two buffers so reads and writes never alias.
+            for (int d = 0; d < ilog2ceil(n); d++) {
+                if (d % 2 == 0) {
+                    kernNaiveScan<<<fullBlocksPerGrid, blockSize>>>(n, 1 << d, dev_data2, dev_data1);
+                }
+                else {
+                    kernNaiveScan<<<fullBlocksPerGrid, blockSize>>>(n, 1 << d, dev_data1, dev_data2);
+                }
+            }
+            timer().endGpuTimer();
+
+            // Which buffer holds the final inclusive scan depends on the parity
+            // of the iteration count; shift it right by one to make it exclusive.
+            if ((ilog2ceil(n) % 2) == 1) {
+                cudaMemcpy(odata + 1, dev_data2, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
+                checkCUDAErrorFn("cudaMemcpyDeviceToHost failed!");
+            }
+            else {
+                cudaMemcpy(odata + 1, dev_data1, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
+                checkCUDAErrorFn("cudaMemcpyDeviceToHost failed!");
+            }
+
+            odata[0] = 0;
+            cudaFree(dev_data1);
+            cudaFree(dev_data2);
        }
    }
}
diff --git a/Project2-Stream-Compaction/stream_compaction/thrust.cu b/Project2-Stream-Compaction/stream_compaction/thrust.cu
index 1def45e..02a4922 100644
--- a/Project2-Stream-Compaction/stream_compaction/thrust.cu
+++ b/Project2-Stream-Compaction/stream_compaction/thrust.cu
@@ -18,11 +18,19 @@ namespace StreamCompaction {
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
            // TODO use `thrust::exclusive_scan`
            // example: for device_vectors dv_in and dv_out:
            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
-            timer().endGpuTimer();
+
+            thrust::host_vector<int> host_in(idata, idata + n);
+            thrust::device_vector<int> dv_in = host_in;
+            thrust::device_vector<int> dv_out(n);
+
+            timer().startGpuTimer();
+            thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            timer().endGpuTimer();
+
+            thrust::copy(dv_out.begin(), dv_out.end(), odata);
        }
    }
}