diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md index 4503fac..c6fdc0f 100644 --- a/Project2-Character-Recognition/README.md +++ b/Project2-Character-Recognition/README.md @@ -3,12 +3,30 @@ CUDA Character Recognition **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* #### Author information + + - Tianming Xu (Mark) + - www.linkedin.com/in/tianming-xu-8bb81816a (LinkedIn) + - Tested on: Windows 10, i7-8700 @ 3.20GHz 16GB, GTX 2080 8192MB (my personal desktop) -### (TODO: Your README) +### Output Screenshots (50 Epochs and 30 Neurons) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +![](img/output.PNG) +The cost drops from 6 to a much lower value as training progresses (see the screenshot above). + +#### Features + +I implemented a file loader to load in all the training data from the data-set folder and input that to the neural network class. In the class, according to how many epochs we will train, I will loop through all the training data I have in the back propagation algorithm, applying the weight and bias deltas back. The cost of our data set drops significantly after 5 epochs. + + + +#### Performance Analysis + +Due to time constraints, I did not have time to do a full performance analysis. However, the GPU handles many matrix operations for me, so it should be more efficient than the CPU version. + + + +#### Acknowledgement + +For this homework, I especially want to thank Jiangping Xu for helping me figure out the concept of neural networks. Also, http://neuralnetworksanddeeplearning.com/ is a fantastic website that describes the concept of neural networks concisely. 
It also provides Python code for a handwritten digit recognition program, which helped me a lot when implementing my character recognition program. \ No newline at end of file diff --git a/Project2-Character-Recognition/character_recognition/CMakeLists.txt b/Project2-Character-Recognition/character_recognition/CMakeLists.txt index 7446175..c5e28b0 100644 --- a/Project2-Character-Recognition/character_recognition/CMakeLists.txt +++ b/Project2-Character-Recognition/character_recognition/CMakeLists.txt @@ -7,5 +7,5 @@ set(SOURCE_FILES cuda_add_library(character_recognition ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_75 ) diff --git a/Project2-Character-Recognition/character_recognition/common.h b/Project2-Character-Recognition/character_recognition/common.h index 6aede64..ac6a52b 100644 --- a/Project2-Character-Recognition/character_recognition/common.h +++ b/Project2-Character-Recognition/character_recognition/common.h @@ -4,15 +4,18 @@ #include #include +#include #include +#include #include +#include #include #include #include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) - +#define BLOCK_SIZE 512 /** * Check for CUDA errors; print and exit if there was a problem. */
0 : ilog2(x - 1) + 1; } +//isInput = 1 -> input, isInput = 0 -> hidden +//n is number of element in input +inline int matrix_index(int n, int row, int col) +{ + return row * n + col; +} + namespace Common { /** @@ -38,6 +48,7 @@ namespace Common { * * Adapted from WindyDarian(https://github.com/WindyDarian) */ + class PerformanceTimer { public: diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index 5a3ed7f..37a95d7 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -1,5 +1,6 @@ #include #include +#include #include "common.h" #include "mlp.h" @@ -10,18 +11,457 @@ namespace CharacterRecognition { static PerformanceTimer timer; return timer; } + + //three buffers + float* device_input; + float* device_weight_matrix; + float* device_hidden; + float* device_ji_buffer; + float* device_output; + + //directly used in class function + int INPUT_SIZE = 0; + int HIDDEN_SIZE = 0; + int OUTPUT_SIZE = 0; + + std::vector array_inputs; + std::vector array_target_outputs; + void initSimulation(int num_object, int hidden_num, int output_num, std::vector inputs, std::vector target_outputs) + { + CharacterRecognition::INPUT_SIZE = num_object; + CharacterRecognition::HIDDEN_SIZE = hidden_num; + CharacterRecognition::OUTPUT_SIZE = output_num; + + array_inputs.assign(inputs.begin(), inputs.end()); + array_target_outputs.assign(target_outputs.begin(), target_outputs.end()); + } + + float sigmoid(float z) + { + return 1 / (1 + std::pow(exp(1.0), -z)); + } + + float sigmoid_prime(float z) + { + return sigmoid(z) * (1 - sigmoid(z)); + } + + //kernel function + void cost_derivative(int n, float* target_output, float* actual_output, float* error_vec) + { + for (int i = 0; i < n; ++i) + { + error_vec[i] = actual_output[i] - target_output[i]; + } + } + + __global__ void kernCostDerivative(int n, float* target_output, 
float* actual_output, float* error_vec) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + { + return; + } + + error_vec[index] = actual_output[index] - target_output[index]; + } + // TODO: __global__ + __global__ void kernInputMultWeight(int max_bound, int n, float* idata, float* weight_matrix, float* odata) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= max_bound) + { + return; + } + int row = index; + + float sum = 0; + for (int col = 0; col < n; ++col) + { + int idx = row * n + col; + float w = weight_matrix[idx]; + float input = idata[col]; + sum += w * input; //weight's row is fixed, but col different, similarly ,the weight corresponds to what element in idata + } + + odata[row] = sum; + } + //kernel? + void argmax(int output_num, float* curr_output) + { + int max_idx = -1; + float max = 0; + for (int i = 0; i < output_num; ++i) + { + if (curr_output[i] >= max) + { + max_idx = i; + max = curr_output[i]; + } + } + + for (int i = 0; i < output_num; ++i) + { + if (i == max_idx) { + curr_output[i] = 1; + } + else curr_output[i] = 0; + } + } + + void feed_forward(int n, int hidden_num, int output_num, int idata_idx, float* odata, float* input_weight_matrix, float* hidden_weight_matrix, float* hidden_bias_vec, float* output_bias_vec) + { + float* temp_hidden = new float[hidden_num]; + float* idata = array_inputs[idata_idx]; + //input to hidden + for (int row = 0; row < hidden_num; ++row) + { + float sum = 0; + for (int col = 0; col < n; ++col) + { + int idx = row * n + col; + float w = input_weight_matrix[idx]; + float input = idata[col]; + sum += w * input; + + } + temp_hidden[row] = sigmoid(sum + hidden_bias_vec[row]); + } + + //test + //std::cout << "the 2064 of element in this file is: " << idata[2064] << std::endl; + //printFloatArray(hidden_num, temp_hidden, false); + //from hidden to output + //input to hidden + for (int row = 0; row < output_num; ++row) + { + float sum = 0; + for (int col = 0; col < 
hidden_num; ++col) + { + int idx = row * hidden_num + col; + float w = hidden_weight_matrix[idx]; + float input = temp_hidden[col]; + sum += w * input; + + } + odata[row] = sigmoid(sum + output_bias_vec[row]); + } + + delete[] temp_hidden; + } + + void back_prop(float* cost, float* input_data, float* target_output_data, float* input_weight_matrix, float* hidden_weight_matrix, float* hidden_bias_vec, float* output_bias_vec, float* updated_input_weight, float* updated_hidden_weight, float* updated_hidden_bias, float* updated_output_bias) + { + //generate intermediate hidden layer + float* hidden_layer = new float[HIDDEN_SIZE]; + float* output_layer = new float[OUTPUT_SIZE]; + float* hidden_weighted_input = new float[HIDDEN_SIZE]; + float* output_weighted_input = new float[OUTPUT_SIZE]; + float* hidden_cost_error = new float[HIDDEN_SIZE]; + float* output_cost_error = new float[OUTPUT_SIZE]; + + + float* device_hidden_layer; + float* device_output_layer; + float* device_hidden_weighted_input; + float* device_output_weighted_input; + float* device_hidden_cost_error; + float* device_output_cost_error; + float* device_input_data; + float* device_target_output_data; + float* device_input_weight_mat; + float* device_hidden_weight_mat; + + + cudaMalloc((void**)&device_hidden_layer, HIDDEN_SIZE * sizeof(float)); + checkCUDAError("cudaMalloc device_hidden_layer failed!"); + cudaMalloc((void**)&device_output_layer, OUTPUT_SIZE * sizeof(float)); + checkCUDAError("cudaMalloc device_output_layer failed!"); + cudaMalloc((void**)&device_hidden_weighted_input, HIDDEN_SIZE * sizeof(float)); + checkCUDAError("cudaMalloc device_hidden_weighted_input failed!"); + cudaMalloc((void**)&device_output_weighted_input, OUTPUT_SIZE * sizeof(float)); + checkCUDAError("cudaMalloc device_output_weighted_input failed!"); + cudaMalloc((void**)&device_hidden_cost_error, HIDDEN_SIZE * sizeof(float)); + checkCUDAError("cudaMalloc device_hidden_cost_error failed!"); + 
cudaMalloc((void**)&device_output_cost_error, OUTPUT_SIZE * sizeof(float)); + checkCUDAError("cudaMalloc device_output_cost_error failed!"); + cudaMalloc((void**)&device_input_data, INPUT_SIZE * sizeof(float)); + checkCUDAError("cudaMalloc device_input_data failed!"); + cudaMalloc((void**)&device_target_output_data, OUTPUT_SIZE * sizeof(float)); + checkCUDAError("cudaMalloc device_target_output_data failed!"); + cudaMalloc((void**)&device_input_weight_mat, HIDDEN_SIZE * INPUT_SIZE * sizeof(float)); + checkCUDAError("cudaMalloc device_hidden_weight_mat failed!"); + cudaMalloc((void**)&device_hidden_weight_mat, HIDDEN_SIZE * OUTPUT_SIZE * sizeof(float)); + checkCUDAError("cudaMalloc device_output_weight_mat failed!"); + + //memcpy input and target output + cudaMemcpy(device_input_data, input_data, INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(device_target_output_data, target_output_data, OUTPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(device_input_weight_mat, input_weight_matrix, HIDDEN_SIZE * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(device_hidden_weight_mat, hidden_weight_matrix, HIDDEN_SIZE * OUTPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice); + + cudaMemset(device_hidden_layer, 0, HIDDEN_SIZE * sizeof(float)); + cudaMemset(device_output_layer, 0, OUTPUT_SIZE * sizeof(float)); + cudaMemset(device_hidden_weighted_input, 0, HIDDEN_SIZE * sizeof(float)); + cudaMemset(device_output_weighted_input, 0, OUTPUT_SIZE * sizeof(float)); + cudaMemset(device_hidden_cost_error, 0, HIDDEN_SIZE * sizeof(float)); + cudaMemset(device_output_cost_error, 0, OUTPUT_SIZE * sizeof(float)); + + int gridSize = (INPUT_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 normalBlocksPerGrid(gridSize); + int hiddenBufferSize = (HIDDEN_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 hiddenBufferBlocksPerGrid(hiddenBufferSize); + //int jiBufferSize = (output_num * n + BLOCK_SIZE - 1) / BLOCK_SIZE; + //dim3 
jiBufferBlocksPerGrid(jiBufferSize); + int outputSize = (OUTPUT_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 outputBlocksPerGrid(outputSize); + dim3 threadsPerBlock(BLOCK_SIZE); + + //initialize + std::fill(hidden_layer, hidden_layer + HIDDEN_SIZE, 0); + std::fill(output_layer, output_layer + OUTPUT_SIZE, 0); + std::fill(hidden_weighted_input, hidden_weighted_input + HIDDEN_SIZE, 0); + std::fill(output_weighted_input, output_weighted_input + OUTPUT_SIZE, 0); + std::fill(hidden_cost_error, hidden_cost_error + HIDDEN_SIZE, 0); + std::fill(output_cost_error, output_cost_error + OUTPUT_SIZE, 0); - /** - * Example of use case (follow how you did it in stream compaction) - */ - /*void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); - // TODO + + kernInputMultWeight << < hiddenBufferBlocksPerGrid, threadsPerBlock >> > (HIDDEN_SIZE, INPUT_SIZE, device_input_data, device_input_weight_mat, device_hidden_layer); + timer().endGpuTimer(); + //need to apply the activate function on each element, currently each element is only the sum. + //copy to a temp array and copmute sequentially for now + cudaMemcpy(hidden_layer, device_hidden_layer, HIDDEN_SIZE * sizeof(float), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy device_hidden_layer to hidden_layer failed!"); + + //compute activation function + for (int i = 0; i < HIDDEN_SIZE; ++i) + { + hidden_weighted_input[i] = hidden_layer[i] + hidden_bias_vec[i]; + hidden_layer[i] = sigmoid(hidden_weighted_input[i]); + } + + //copy back to hidden + cudaMemcpy(device_hidden_layer, hidden_layer, HIDDEN_SIZE * sizeof(float), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy hidden_layer to device_hidden_layer failed!"); + + timer().startGpuTimer(); + //do we use the same weight? 
-- in XOR example, it is not + kernInputMultWeight << < outputBlocksPerGrid, threadsPerBlock >> > (OUTPUT_SIZE, HIDDEN_SIZE, device_hidden_layer, device_hidden_weight_mat, device_output_layer); + timer().endGpuTimer(); + + //how to calculate the error and affect the next weights? -- how to get expcted result? -- read in? + cudaMemcpy(output_layer, device_output_layer, OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy device_output to odata failed!"); + + for (int i = 0; i < OUTPUT_SIZE; ++i) + { + output_weighted_input[i] = output_layer[i] + output_bias_vec[i]; + output_layer[i] = sigmoid(output_weighted_input[i]); + } + + //output cost here + *cost += compute_cost(OUTPUT_SIZE, target_output_data, output_layer); + + //Get the cost derivative from the output layer result and target output data + //test + cudaMemcpy(device_output_layer, output_layer, OUTPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice); + kernCostDerivative <<< outputBlocksPerGrid, threadsPerBlock >> > (OUTPUT_SIZE, device_target_output_data, device_output_layer, device_output_cost_error); + cudaMemcpy(output_cost_error, device_output_cost_error, OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost); + //add the sigmoid prime to it + for (int row = 0; row < OUTPUT_SIZE; ++row) + { + output_cost_error[row] *= sigmoid_prime(output_weighted_input[row]); + //assign to updated weights and bias for output + updated_output_bias[row] += output_cost_error[row]; + for (int col = 0; col < HIDDEN_SIZE; ++col) + { + int mat_idx = row * HIDDEN_SIZE + col; + updated_hidden_weight[mat_idx] += hidden_layer[col] * output_cost_error[row]; + } + } + + //compute the hidden_cost_error by output_cost_error + //use transpose index + for (int row = 0; row < HIDDEN_SIZE; ++row) + { + for (int col = 0; col < OUTPUT_SIZE; ++col) + { + int mat_idx = row * OUTPUT_SIZE + col; + hidden_cost_error[row] += hidden_weight_matrix[mat_idx] * output_cost_error[col]; + } + + //apply sigmoid prime after we 
compute the derivative + hidden_cost_error[row] *= sigmoid_prime(hidden_weighted_input[row]); + //assign to updated matrix and vec + updated_hidden_bias[row] += hidden_cost_error[row]; + for (int mat_col = 0; mat_col < INPUT_SIZE; ++mat_col) + { + int mat_idx = row * INPUT_SIZE + mat_col; + updated_input_weight[mat_idx] += input_data[mat_col] * hidden_cost_error[row]; + } + } + + cudaFree(device_hidden_layer); + cudaFree(device_output_layer); + cudaFree(device_hidden_weighted_input); + cudaFree(device_output_weighted_input); + cudaFree(device_hidden_cost_error); + cudaFree(device_output_cost_error); + cudaFree(device_input_data); + cudaFree(device_target_output_data); + + delete[] hidden_layer; + delete[] output_layer; + delete[] hidden_weighted_input; + delete[] output_weighted_input; + delete[] hidden_cost_error; + delete[] output_cost_error; + } + + //SGD + //we use training data as array_input and array_target_output, same as test data -- probably all size information should be inputs + void SGD(int training_round, int hidden_size, float eta, float* input_weight_matrix, float* hidden_weight_matrix, float* hidden_bias_vec, float* output_bias_vec) + { + //init necessary buffers + float* updated_input_weight = new float[INPUT_SIZE * hidden_size]; + float* updated_hidden_weight = new float[OUTPUT_SIZE * hidden_size]; + float* updated_hidden_bias = new float[hidden_size]; + float* updated_output_bias = new float[OUTPUT_SIZE]; + + //need cuda malloc to init + + for (int round = 0; round < training_round; ++round) + { + //zero out all components for new round of change + std::fill(updated_input_weight, updated_input_weight + INPUT_SIZE * hidden_size, 0); + std::fill(updated_hidden_weight, updated_hidden_weight + OUTPUT_SIZE * hidden_size, 0); + std::fill(updated_hidden_bias, updated_hidden_bias + hidden_size, 0); + std::fill(updated_output_bias, updated_output_bias + OUTPUT_SIZE, 0); + //memcpy(updated_hidden_bias, hidden_bias_vec, HIDDEN_SIZE * sizeof(float)); + + 
std::cout << "Round " << round << ":" << std::endl; + float cost = 0; + for (int input_index = 0; input_index < OUTPUT_SIZE; ++input_index) + { + //update each element in the buffer arrays directly in back_prop + back_prop(&cost, array_inputs[input_index], array_target_outputs[input_index], input_weight_matrix, hidden_weight_matrix, hidden_bias_vec, output_bias_vec, updated_input_weight, updated_hidden_weight, updated_hidden_bias, updated_output_bias); + } + + //update the weights and bias after each round + for (int input_weight_index = 0; input_weight_index < INPUT_SIZE * hidden_size; ++input_weight_index) + { + input_weight_matrix[input_weight_index] -= (eta / OUTPUT_SIZE) * updated_input_weight[input_weight_index]; + } + + for (int hidden_weight_index = 0; hidden_weight_index < OUTPUT_SIZE * hidden_size; ++hidden_weight_index) + { + hidden_weight_matrix[hidden_weight_index] -= (eta / OUTPUT_SIZE) * updated_hidden_weight[hidden_weight_index]; + } + + for (int hidden_bias_index = 0; hidden_bias_index < hidden_size; ++hidden_bias_index) + { + hidden_bias_vec[hidden_bias_index] -= (eta / OUTPUT_SIZE) * updated_hidden_bias[hidden_bias_index]; + } + + for (int output_bias_index = 0; output_bias_index < OUTPUT_SIZE; ++output_bias_index) + { + output_bias_vec[output_bias_index] -= (eta / OUTPUT_SIZE) * updated_output_bias[output_bias_index]; + } + + //output cost + cost /= OUTPUT_SIZE; + std::cout << "The cost of current data is: " << cost << std::endl; + } + + std::cout << "we train " << training_round << " rounds" << std::endl; + Evaluate(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, nullptr, nullptr, input_weight_matrix, hidden_weight_matrix, hidden_bias_vec, output_bias_vec); + + delete[] updated_input_weight; + delete[] updated_hidden_weight; + delete[] updated_hidden_bias; + delete[] updated_output_bias; + } + + float compute_cost(int size, float* target_output_data, float* output_layer) + { + float sum = 0; + for (int i = 0; i < size; ++i) + { + sum += 
std::pow(target_output_data[i] - output_layer[i], 2); + } + + sum /= 2; + //std::cout << "The cost of current data is: " << sum << std::endl; + return sum; + } + + void Evaluate(int n, int hidden_num, int output_num, float* test_data, float* target_output, float* input_weight_matrix, float* hidden_weight_matrix, float* hidden_bias_vec, float* output_bias_vec) + { + int sum = 0; + float* curr_output = new float[output_num]; + //unused (test_data, target_output) --> will use later + for (int i = 0; i < output_num; ++i) + { + //zero out output + std::fill(curr_output, curr_output + output_num, 0); + //for (int j = 0; j < output_num; ++j) + //{ + // std::cout << "curr_output[" << j << "] is " << curr_output[j] << std::endl; + //} + + feed_forward(n, hidden_num, output_num, i, curr_output, input_weight_matrix, hidden_weight_matrix, hidden_bias_vec, output_bias_vec); + + argmax(output_num, curr_output); + bool same = true; + for (int j = 0; j < output_num; ++j) + { + if (curr_output[j] != array_target_outputs[i][j]) + { + same = false; + } + } + + for (int j = 0; j < output_num; ++j) + { + if (curr_output[j] == 1) + { + std::cout << "curr_output[" << j << "] is " << curr_output[j] << std::endl; + } + } + + if (same) sum++; + + } + + + std::cout << "Result: " << sum << " / " << output_num << std::endl; + + delete[] curr_output; + } + + //the same as inputMultWeight, except the max bound is different, here it is constrained by output_num + __global__ void kernHiddenMultWeight(int max_bound, int n, float* idata, float* weight_matrix, float* odata) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= max_bound) + { + return; + } + + int row = index; + + float sum = 0; + for (int col = 0; col < n; ++col) + { + int idx = row * n + col; + sum += weight_matrix[idx] * idata[col]; //weight's row is fixed, but col different, similarly ,the weight corresponds to what element in idata + } + + odata[row] = sum; } - */ - // TODO: implement required elements for MLP 
sections 1 and 2 here } diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h index 2096228..04e4831 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.h +++ b/Project2-Character-Recognition/character_recognition/mlp.h @@ -6,4 +6,13 @@ namespace CharacterRecognition { Common::PerformanceTimer& timer(); // TODO: implement required elements for MLP sections 1 and 2 here + void initSimulation(int num_object, int hidden_num, int output_num, std::vector inputs, std::vector target_outputs); + float sigmoid(float z); + float sigmoid_prime(float z); + void feed_forward(int n, int hidden_num, int output_num, int idata_idx, float* odata, float* input_weight_matrix, float* hidden_weight_matrix, float* hidden_bias_vec, float* output_bias_vec); + void argmax(int output_num, float* curr_output); + void SGD(int training_round, int hidden_size, float eta, float* input_weight_matrix, float* hidden_weight_matrix, float* hidden_bias_vec, float* output_bias_vec); + void back_prop(float* cost, float* input_data, float* target_output_data, float* input_weight_matrix, float* hidden_weight_matrix, float* hidden_bias_vec, float* output_bias_vec, float* updated_input_weight, float* updated_hidden_weight, float* updated_hidden_bias, float* updated_output_bias); + void Evaluate(int n, int hidden_num, int output_num, float* test_data, float* target_output, float* input_weight_matrix, float* hidden_weight_matrix, float* hidden_bias_vec, float* output_bias_vec); + float compute_cost(int size, float* target_output_data, float* output_layer); } diff --git a/Project2-Character-Recognition/img/output.PNG b/Project2-Character-Recognition/img/output.PNG new file mode 100644 index 0000000..91568f7 Binary files /dev/null and b/Project2-Character-Recognition/img/output.PNG differ diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index 11dd534..d0535ff 
100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -6,147 +6,443 @@ * @copyright University of Pennsylvania */ +#include +#include +#include #include +#include +#include +#include +#include +#include +#include #include #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; +#define LAYER 2 +#define INPUT_NUM 52 + +const int INPUT_SIZE = 10201; // feel free to change the size of array +const int OUTPUT_SIZE = 52; +const int HIDDEN_SIZE = 30; +//float* input = new float[INPUT_SIZE]; +//float* output = new float[OUTPUT_SIZE]; +std::vector array_inputs; +std::vector array_target_outputs; + +void Evaluate(int n, int hidden_num, int output_num, float* test_data, float* target_output, float* input_weight_matrix, float* hidden_weight_matrix, float* hidden_bias_vec, float* output_bias_vec); + +void read_in_inputs() +{ + std::string input_file_postfix = "info.txt"; + std::string input_file_folder = "..\\data-set\\"; + for (int i = 1; i <= OUTPUT_SIZE; ++i) + { + //init the buffer we want to store the input data + float* input = new float[INPUT_SIZE]; + float* output = new float[OUTPUT_SIZE]; + + std::string input_file_prefix = i < 10 ? "0" + std::to_string(i) : std::to_string(i); + std::string input_file = input_file_folder + input_file_prefix + input_file_postfix; + + std::ifstream myfile(input_file); + if (!myfile) + { + std::cout << "Error opening input_file" << std::endl; + std::cout << input_file << std::endl; + system("pause"); + return; + } + //std::cout << input_file_prefix << "." 
<< std::endl; + int count = 0; + while (!myfile.eof()) + { + std::string intermediate_array; + std::vector tokens; + std::getline(myfile, intermediate_array, '\n'); + if (count == 0) + { + //std::cout << intermediate_array << std::endl; + int output_result = std::stoi(intermediate_array); + for (int i = 1; i <= OUTPUT_SIZE; ++i) + { + if (i == output_result) output[i-1] = 1; + else output[i-1] = 0; + } + } + else if (count == 2) + { + std::stringstream tokens(intermediate_array); + std::string token; + // Tokenizing w.r.t. space ' ' + int array_count = 0; + //the first is a space, clean it + std::getline(tokens, token, ' '); + while (std::getline(tokens, token, ' ')) + { + input[array_count] = std::stoi(token) / 255; //normalize + array_count++; + } + if (array_count != INPUT_SIZE) + { + std::cout << "array_count != INPUT_SIZE, something wrong" << std::endl; + } + } + count++; + } + //store the input array into vector + array_inputs.push_back(input); + array_target_outputs.push_back(output); + } +} + +//first do in cpu version + +//utility + +//float sigmoid(float z) +//{ +// return 1 / (1 + std::pow(exp(1.0), -z)); +//} +// +//float sigmoid_prime(float z) +//{ +// return sigmoid(z) * (1 - sigmoid(z)); +//} +// +//void cost_derivative(int n, float* target_output, float* actual_output, float* error_vec) +//{ +// for (int i = 0; i < n; ++i) +// { +// error_vec[i] = actual_output[i] - target_output[i]; +// } +//} +// +//void argmax(int output_num, float* curr_output) +//{ +// int max_idx = -1; +// float max = 0; +// for (int i = 0; i < output_num; ++i) +// { +// if (curr_output[i] >= max) +// { +// max_idx = i; +// max = curr_output[i]; +// } +// } +// +// for (int i = 0; i < output_num; ++i) +// { +// if (i == max_idx) { +// curr_output[i] = 1; +// } +// else curr_output[i] = 0; +// } +//} +// +//float compute_cost(int size, float* target_output_data, float* output_layer) +//{ +// float sum = 0; +// for (int i = 0; i < size; ++i) +// { +// sum += 
std::pow(target_output_data[i] - output_layer[i], 2); +// } +// +// sum /= 2; +// //std::cout << "The cost of current data is: " << sum << std::endl; +// return sum; +//} +// +//void feed_forward(int n, int hidden_num, int output_num, int idata_idx, float* odata, float* input_weight_matrix, float* hidden_weight_matrix, float* hidden_bias_vec, float* output_bias_vec) +//{ +// float* temp_hidden = new float[hidden_num]; +// float* idata = array_inputs[idata_idx]; +// //input to hidden +// for (int row = 0; row < hidden_num; ++row) +// { +// float sum = 0; +// for (int col = 0; col < n; ++col) +// { +// int idx = row * n + col; +// float w = input_weight_matrix[idx]; +// float input = idata[col]; +// sum += w * input; +// +// } +// temp_hidden[row] = sigmoid(sum + hidden_bias_vec[row]); +// } +// +// //test +// //std::cout << "the 2064 of element in this file is: " << idata[2064] << std::endl; +// //printFloatArray(hidden_num, temp_hidden, false); +// //from hidden to output +// //input to hidden +// for (int row = 0; row < output_num; ++row) +// { +// float sum = 0; +// for (int col = 0; col < hidden_num; ++col) +// { +// int idx = row * hidden_num + col; +// float w = hidden_weight_matrix[idx]; +// float input = temp_hidden[col]; +// sum += w * input; +// +// } +// odata[row] = sigmoid(sum + output_bias_vec[row]); +// } +// +// delete[] temp_hidden; +//} +// +//void back_prop(int hidden_num, float* cost, float* input_data, float* target_output_data, float* input_weight_matrix, float* hidden_weight_matrix, float* hidden_bias_vec, float* output_bias_vec, float* updated_input_weight, float* updated_hidden_weight, float* updated_hidden_bias, float* updated_output_bias) +//{ +// //generate intermediate hidden layer +// float* hidden_layer = new float[hidden_num]; +// float* output_layer = new float[OUTPUT_SIZE]; +// float* hidden_weighted_input = new float[hidden_num]; +// float* output_weighted_input = new float[OUTPUT_SIZE]; +// float* output_cost_error = new 
float[OUTPUT_SIZE]; +// float* hidden_cost_error = new float[hidden_num]; +// +// //initialize +// std::fill(hidden_layer, hidden_layer + hidden_num, 0); +// std::fill(output_layer, output_layer + OUTPUT_SIZE, 0); +// std::fill(hidden_weighted_input, hidden_weighted_input + hidden_num, 0); +// std::fill(output_weighted_input, output_weighted_input + OUTPUT_SIZE, 0); +// std::fill(hidden_cost_error, hidden_cost_error + hidden_num, 0); +// std::fill(output_cost_error, output_cost_error + OUTPUT_SIZE, 0); +// +// //feedfoward +// for (int row = 0; row < hidden_num; ++row) +// { +// float sum = 0; +// for (int col = 0; col < INPUT_SIZE; ++col) +// { +// int idx = row * INPUT_SIZE + col; +// float w = input_weight_matrix[idx]; +// float input = input_data[col]; +// sum += w * input; +// +// } +// hidden_weighted_input[row] = sum + hidden_bias_vec[row]; +// hidden_layer[row] = sigmoid(sum + hidden_bias_vec[row]); +// } +// +// for (int row = 0; row < OUTPUT_SIZE; ++row) +// { +// float sum = 0; +// for (int col = 0; col < hidden_num; ++col) +// { +// int idx = row * hidden_num + col; +// float w = hidden_weight_matrix[idx]; +// float input = hidden_layer[col]; +// sum += w * input; +// +// } +// output_weighted_input[row] = sum + output_bias_vec[row]; +// output_layer[row] = sigmoid(sum + output_bias_vec[row]); +// } +// +// //output cost here +// *cost += compute_cost(OUTPUT_SIZE, target_output_data, output_layer); +// +// //Get the cost derivative from the output layer result and target output data +// cost_derivative(OUTPUT_SIZE, target_output_data, output_layer, output_cost_error); +// //add the sigmoid prime to it +// for (int row = 0; row < OUTPUT_SIZE; ++row) +// { +// output_cost_error[row] *= sigmoid_prime(output_weighted_input[row]); +// //assign to updated weights and bias for output +// updated_output_bias[row] += output_cost_error[row]; +// for (int col = 0; col < hidden_num; ++col) +// { +// int mat_idx = row * hidden_num + col; +// 
updated_hidden_weight[mat_idx] += hidden_layer[col] * output_cost_error[row]; +// } +// } +// +// //compute the hidden_cost_error by output_cost_error +// //use transpose index +// for (int row = 0; row < hidden_num; ++row) +// { +// for (int col = 0; col < OUTPUT_SIZE; ++col) +// { +// int mat_idx = row * OUTPUT_SIZE + col; +// hidden_cost_error[row] += hidden_weight_matrix[mat_idx] * output_cost_error[col]; +// } +// +// //apply sigmoid prime after we compute the derivative +// hidden_cost_error[row] *= sigmoid_prime(hidden_weighted_input[row]); +// //assign to updated matrix and vec +// updated_hidden_bias[row] += hidden_cost_error[row]; +// for (int mat_col = 0; mat_col < INPUT_SIZE; ++mat_col) +// { +// int mat_idx = row * INPUT_SIZE + mat_col; +// updated_input_weight[mat_idx] += input_data[mat_col] * hidden_cost_error[row]; +// } +// } +//} +// +////SGD +////we use training data as array_input and array_target_output, same as test data -- probably all size information should be inputs +//void SGD(int training_round, int hidden_size, float eta, float* input_weight_matrix, float* hidden_weight_matrix, float* hidden_bias_vec, float* output_bias_vec) +//{ +// //init necessary buffers +// float* updated_input_weight = new float[INPUT_SIZE * hidden_size]; +// float* updated_hidden_weight = new float[OUTPUT_SIZE * hidden_size]; +// float* updated_hidden_bias = new float[hidden_size]; +// float* updated_output_bias = new float[OUTPUT_SIZE]; +// +// for (int round = 0; round < training_round; ++round) +// { +// //zero out all components for new round of change +// std::fill(updated_input_weight, updated_input_weight + INPUT_SIZE * hidden_size, 0); +// std::fill(updated_hidden_weight, updated_hidden_weight + OUTPUT_SIZE * hidden_size, 0); +// std::fill(updated_hidden_bias, updated_hidden_bias + hidden_size, 0); +// std::fill(updated_output_bias, updated_output_bias + OUTPUT_SIZE, 0); +// //memcpy(updated_hidden_bias, hidden_bias_vec, HIDDEN_SIZE * sizeof(float)); +// 
+// std::cout << "Round " << round << ":" << std::endl; +// float cost = 0; +// for (int input_index = 0; input_index < OUTPUT_SIZE; ++input_index) +// { +// //update each element in the buffer arrays directly in back_prop +// back_prop(hidden_size,&cost, array_inputs[input_index], array_target_outputs[input_index], input_weight_matrix, hidden_weight_matrix, hidden_bias_vec, output_bias_vec, updated_input_weight, updated_hidden_weight, updated_hidden_bias, updated_output_bias); +// } +// +// //update the weights and bias after each round +// for (int input_weight_index = 0; input_weight_index < INPUT_SIZE * hidden_size; ++input_weight_index) +// { +// input_weight_matrix[input_weight_index] -= (eta / OUTPUT_SIZE) * updated_input_weight[input_weight_index]; +// } +// +// for (int hidden_weight_index = 0; hidden_weight_index < OUTPUT_SIZE * hidden_size; ++hidden_weight_index) +// { +// hidden_weight_matrix[hidden_weight_index] -= (eta / OUTPUT_SIZE) * updated_hidden_weight[hidden_weight_index]; +// } +// +// for (int hidden_bias_index = 0; hidden_bias_index < hidden_size; ++hidden_bias_index) +// { +// hidden_bias_vec[hidden_bias_index] -= (eta / OUTPUT_SIZE) * updated_hidden_bias[hidden_bias_index]; +// } +// +// for (int output_bias_index = 0; output_bias_index < OUTPUT_SIZE; ++output_bias_index) +// { +// output_bias_vec[output_bias_index] -= (eta / OUTPUT_SIZE) * updated_output_bias[output_bias_index]; +// } +// +// //output cost +// cost /= OUTPUT_SIZE; +// std::cout << "The cost of current data is: " << cost << std::endl; +// } +// +// std::cout << "we train " << training_round << " rounds" << std::endl; +// Evaluate(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, nullptr, nullptr, input_weight_matrix, hidden_weight_matrix, hidden_bias_vec, output_bias_vec); +// +// delete[] updated_input_weight; +// delete[] updated_hidden_weight; +// delete[] updated_hidden_bias; +// delete[] updated_output_bias; +//} +// +bool check_data(int* file_idx, int* array_idx) +{ + for (int i 
= 0; i < OUTPUT_SIZE - 1; ++i) + { + for (int j = 0; j < INPUT_SIZE; ++j) + { + if (array_inputs[i][j] != array_inputs[i + 1][j]) + { + *file_idx = i; + *array_idx = j; + return true; + } + } + } + return false; +} +// +//void Evaluate(int n, int hidden_num, int output_num, float* test_data, float* target_output, float* input_weight_matrix, float* hidden_weight_matrix, float* hidden_bias_vec, float* output_bias_vec) +//{ +// int sum = 0; +// float* curr_output = new float[output_num]; +// //unused (test_data, target_output) --> will use later +// for (int i = 0; i < output_num; ++i) +// { +// //zero out output +// std::fill(curr_output, curr_output + output_num, 0); +// //for (int j = 0; j < output_num; ++j) +// //{ +// // std::cout << "curr_output[" << j << "] is " << curr_output[j] << std::endl; +// //} +// +// feed_forward(n, hidden_num, output_num, i, curr_output, input_weight_matrix, hidden_weight_matrix, hidden_bias_vec, output_bias_vec); +// +// argmax(output_num, curr_output); +// bool same = true; +// for (int j = 0; j < output_num; ++j) +// { +// if (curr_output[j] != array_target_outputs[i][j]) +// { +// same = false; +// } +// } +// +// for (int j = 0; j < output_num; ++j) +// { +// if (curr_output[j] == 1) +// { +// std::cout << "curr_output[" << j << "] is " << curr_output[j] << std::endl; +// } +// } +// +// if (same) sum++; +// +// } +// +// +// std::cout << "Result: " << sum << " / " << output_num << std::endl; +// +// delete[] curr_output; +//} + + int main(int argc, char* argv[]) { - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. 
- // At first all cases passed because b && c are all zeroes. - zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, 
b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. 
- zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - system("pause"); // stop Win32 console from closing on exit - delete[] a; - delete[] b; - delete[] c; + + read_in_inputs(); + //the data will be stored in array_inputs and array_outputs. 
+ int file_idx = -1; + int array_idx = -1; + if (check_data(&file_idx, &array_idx)) + { + std::cout << "the data has different in " << file_idx << " file " << ", array index is " << array_idx << std::endl; + } + + //init input_weights and hidden weights + float *input_weight_matrix = new float[INPUT_SIZE * HIDDEN_SIZE]; + float *hidden_weight_matrix = new float[OUTPUT_SIZE * HIDDEN_SIZE]; + init_weight_matrix(INPUT_SIZE, HIDDEN_SIZE, input_weight_matrix, -0.1f , 0.1f); // Leave a 0 at the end to test that edge case + init_weight_matrix(OUTPUT_SIZE, HIDDEN_SIZE, hidden_weight_matrix, -0.1f, 0.1f); + float *hidden_bias_vec = new float[HIDDEN_SIZE]; + float *output_bias_vec = new float[OUTPUT_SIZE]; + std::fill(hidden_bias_vec, hidden_bias_vec + HIDDEN_SIZE, 0); + std::fill(output_bias_vec, output_bias_vec + OUTPUT_SIZE, 0); + + CharacterRecognition::initSimulation(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, array_inputs, array_target_outputs); + CharacterRecognition::SGD(50, HIDDEN_SIZE, 0.1f, input_weight_matrix, hidden_weight_matrix, hidden_bias_vec, output_bias_vec); + + //assume we get the correct output, we should calculate the error and apply bp algorithm back to weight + + //understand bp algorithm + //how to make it keep running? -- while loop within a certain threshold + delete[] input_weight_matrix; + delete[] hidden_weight_matrix; + delete[] hidden_bias_vec; + delete[] output_bias_vec; + + //delete the whole array + array_inputs.clear(); + array_target_outputs.clear(); } diff --git a/Project2-Character-Recognition/src/testing_helpers.hpp b/Project2-Character-Recognition/src/testing_helpers.hpp index b28a8d2..f326316 100644 --- a/Project2-Character-Recognition/src/testing_helpers.hpp +++ b/Project2-Character-Recognition/src/testing_helpers.hpp @@ -57,6 +57,41 @@ void genArray(int n, int *a, int maxval) { } } +//initialize weight matrix within a certain range -- should that be sequential? 
+void init_weight_matrix(int n, int hidden_num, float* weight_matrix, float range_min = 0, float range_max = 1) +{ + srand(time(nullptr)); + for (int i = 0; i < n * hidden_num; ++i) + { + weight_matrix[i] = range_min + (static_cast (rand()) / (static_cast (RAND_MAX))) * (range_max - range_min); + if (weight_matrix[i] > range_max) + { + std::cout << "weight matrix index " << i << " has problem, its value is " << weight_matrix[i] << std::endl; + weight_matrix[i] = range_max; + } + else if (weight_matrix[i] < range_min) + { + std::cout << "weight matrix index " << i << " has problem, its value is " << weight_matrix[i] << std::endl; + weight_matrix[i] = range_min; + } + } +} + +void init_bais_vec(int n, float *mat) +{ + for (int i = 0; i < n; ++i) + { + mat[i] = 0; + } +} + +void init_input(int n, float* input) { + for (int i = 0; i < n; ++i) + { + input[i] = i % 2 == 0 ? 1 : 0; + } +} + void printArray(int n, int *a, bool abridged = false) { printf(" [ "); for (int i = 0; i < n; i++) { @@ -69,6 +104,18 @@ void printArray(int n, int *a, bool abridged = false) { printf("]\n"); } +void printFloatArray(int n, float *a, bool abridged = false) { + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... "); + } + printf("%.3f ", a[i]); + } + printf("]\n"); +} + template void printElapsedTime(T time, std::string note = "") { diff --git a/Project2-Stream-Compaction/README.md b/Project2-Stream-Compaction/README.md index 0e38ddb..17830d0 100644 --- a/Project2-Stream-Compaction/README.md +++ b/Project2-Stream-Compaction/README.md @@ -3,12 +3,52 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. 
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* #### Author information + + - Tianming Xu (Mark) + - www.linkedin.com/in/tianming-xu-8bb81816a (LinkedIn) + - Tested on: Windows 10, i7-8700 @ 3.20GHz 16GB, GTX 2080 8192MB (my personal desktop) + +### Output Screenshots (2^14 elements) + +![](img/screen_shot_output.PNG) + + + +### Features + +The first thing I did in this project was to implement the exclusive scan with a sequential CPU approach, a naive GPU approach, and a work-efficient approach using the upsweep and downsweep algorithm from GPU Gems. Besides, I explored the exclusive scan method in the Thrust library. + +The other thing I did in my project was to implement the stream compaction method, which excludes certain elements in the array that satisfy some condition (in my project, the condition is being equal to 0). I use three approaches: sequential without scan, sequential with scan, and parallel with work-efficient scan. + +### Performance Analysis + +##### The comparison among CPU, Naive, work-efficient and Thrust in the scan algorithm + +![](img/scan_power_of_2.png) + +The performance of Thrust is very weird. It runs significantly slower than all the other three. However, we can see later in the non-power-of-two part that its speed becomes much more reasonable. I am not quite sure why. I guess there might be some very heavy pre-processing step in the Thrust library. In order to see a more detailed comparison among the rest, I have another chart without Thrust. + +![](img/scan_power_of_2_without_thrust.png) + +In this chart, surprisingly, the work-efficient one is never the best at either a low or a high number of elements. What I guess the problem might be, just like what the instructions mentioned, is the thread indexing. As we have a 2^d stride in each iteration, when the value of d becomes larger and larger, the gap between the elements we need to compute the result is larger.
Hence, as we need to access the data from global memory, we need to spend a lot of time reading the corresponding values we need. + +![](img/scan_non_power_of_two.png) + +In the non-power-of-two cases, the situation is similar. Though the Thrust library has a much more reasonable efficiency, it is still the slowest among the three. Another interesting finding is that the non-power-of-two cases have a better performance than the power-of-two cases, which is unexpected. I am not quite sure why that happens. + +##### The comparison among CPU (no scan), CPU (with scan), and work-efficient stream compaction + +![](img/compaction_power_of_two.png) + +The performance of the compaction algorithm is more as expected than that of the scan algorithm. The CPU-with-scan one takes longer when the number of elements is larger, as it takes multiple rounds of adding and swapping, which is unnecessary in the sequential algorithm. But the work-efficient algorithm is not recognizably faster than the sequential one, even when the number of elements is huge. I think it might be the same problem as in the scan algorithm. + +### Debug Database + +- The corner-case guard is very important. I used an incorrect corner-case guard (index > n) in my first project, and copied and pasted it into my Project 2. It caused a very weird problem where the last element in the output data was always incorrect. Make sure to use (index >= n), as the nth element is out of range. +- The CUDA initialization error might not be caused by the CUDA initialization part. It might be another kernel function messing up something in CUDA memory and causing the memory allocation to fail. + - ![](img/initialization_error.png) + - This error is actually because I use "cudaMemcpyDeviceToDevice" when I try to copy my temp_out intermediate array to the odata array. -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.
diff --git a/Project2-Stream-Compaction/img/compaction_power_of_two.png b/Project2-Stream-Compaction/img/compaction_power_of_two.png new file mode 100644 index 0000000..cfcb765 Binary files /dev/null and b/Project2-Stream-Compaction/img/compaction_power_of_two.png differ diff --git a/Project2-Stream-Compaction/img/initialization_error.png b/Project2-Stream-Compaction/img/initialization_error.png new file mode 100644 index 0000000..0eaf122 Binary files /dev/null and b/Project2-Stream-Compaction/img/initialization_error.png differ diff --git a/Project2-Stream-Compaction/img/scan_non_power_of_two.png b/Project2-Stream-Compaction/img/scan_non_power_of_two.png new file mode 100644 index 0000000..981644f Binary files /dev/null and b/Project2-Stream-Compaction/img/scan_non_power_of_two.png differ diff --git a/Project2-Stream-Compaction/img/scan_power_of_2.png b/Project2-Stream-Compaction/img/scan_power_of_2.png new file mode 100644 index 0000000..c3fbf9e Binary files /dev/null and b/Project2-Stream-Compaction/img/scan_power_of_2.png differ diff --git a/Project2-Stream-Compaction/img/scan_power_of_2_without_thrust.png b/Project2-Stream-Compaction/img/scan_power_of_2_without_thrust.png new file mode 100644 index 0000000..fa7893e Binary files /dev/null and b/Project2-Stream-Compaction/img/scan_power_of_2_without_thrust.png differ diff --git a/Project2-Stream-Compaction/img/screen_shot_output.PNG b/Project2-Stream-Compaction/img/screen_shot_output.PNG new file mode 100644 index 0000000..8d42d68 Binary files /dev/null and b/Project2-Stream-Compaction/img/screen_shot_output.PNG differ diff --git a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt index cdbef77..185a604 100644 --- a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt +++ b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS 
-arch=sm_20 + OPTIONS -arch=sm_75 ) diff --git a/Project2-Stream-Compaction/stream_compaction/common.cu b/Project2-Stream-Compaction/stream_compaction/common.cu index 2ed6d63..687906c 100644 --- a/Project2-Stream-Compaction/stream_compaction/common.cu +++ b/Project2-Stream-Compaction/stream_compaction/common.cu @@ -24,6 +24,20 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + { + return; + } + + if (idata[index] == 0) + { + bools[index] = 0; + } + else + { + bools[index] = 1; + } } /** @@ -33,6 +47,16 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + { + return; + } + + if (bools[index] != 0) + { + odata[indices[index]] = idata[index]; + } } } diff --git a/Project2-Stream-Compaction/stream_compaction/common.h b/Project2-Stream-Compaction/stream_compaction/common.h index 996997e..638836a 100644 --- a/Project2-Stream-Compaction/stream_compaction/common.h +++ b/Project2-Stream-Compaction/stream_compaction/common.h @@ -12,6 +12,7 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define BLOCK_SIZE 1024 /** * Check for CUDA errors; print and exit if there was a problem. 
diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu index a2d3e6c..5285fc6 100644 --- a/Project2-Stream-Compaction/stream_compaction/cpu.cu +++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu @@ -2,6 +2,7 @@ #include "cpu.h" #include "common.h" +#include namespace StreamCompaction { namespace CPU { @@ -20,6 +21,11 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + for (int idx = 0; idx < n; ++idx) + { + if (idx == 0) odata[idx] = 0; + else odata[idx] = odata[idx - 1] + idata[idx - 1]; + } timer().endCpuTimer(); } @@ -30,9 +36,19 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int counter = 0; + for (int idx = 0; idx < n; ++idx) + { + int curr_val = idata[idx]; + if (curr_val != 0) + { + odata[counter] = curr_val; + counter++; + } + + } timer().endCpuTimer(); - return -1; + return counter; } /** @@ -42,9 +58,36 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + //do we need to allocate a new array? 
+ //how to allocate in cpu + int *bool_array = new int[n](); + int *scattered_array = new int[n](); + for (int idx = 0; idx < n; ++idx) + { + if (idata[idx] == 0) bool_array[idx] = 0; + else bool_array[idx] = 1; + } + //scan -- if directly call scan will cause cpu timer error + for (int idx = 0; idx < n; ++idx) + { + if (idx == 0) scattered_array[idx] = 0; + else scattered_array[idx] = scattered_array[idx - 1] + bool_array[idx - 1]; + } + + for (int idx = 0; idx < n; ++idx) + { + int curr_odata_idx = scattered_array[idx]; + if (bool_array[idx] != 0) + { + odata[curr_odata_idx] = idata[idx]; + } + + } timer().endCpuTimer(); - return -1; + int returned_val = scattered_array[n - 1]; //the scan gives us the length of the output array + free(bool_array); + free(scattered_array); + return returned_val; } } } diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu index 2db346e..83f66af 100644 --- a/Project2-Stream-Compaction/stream_compaction/efficient.cu +++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu @@ -6,19 +6,89 @@ namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; + using StreamCompaction::Common::kernMapToBoolean; + using StreamCompaction::Common::kernScatter; + //intermediate arrays + int* temp_out; + int* temp_bool; + int* temp_scattered; + int* temp_in; + PerformanceTimer& timer() { static PerformanceTimer timer; return timer; } + __global__ void kernComputePartialUpSweep(int n, const int pow_d_plus_one, int* idata) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + { + return; + } + //no need to store the idata to temp_in, it is already there + int pow_d = pow_d_plus_one / 2; + //by 2^(d+1) means a stride of two + if (index % pow_d_plus_one == 0) idata[index + pow_d_plus_one - 1] += idata[index + pow_d - 1]; + } + __global__ void kernComputePartialDownSweep(int n, const int pow_d_plus_one, int* 
odata) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + { + return; + } + //no need to store the idata to temp_in, it is already there + int pow_d = pow_d_plus_one / 2; + if (index % pow_d_plus_one == 0) + { + int temp = odata[index + pow_d - 1]; + odata[index + pow_d - 1] = odata[index + pow_d_plus_one - 1]; + odata[index + pow_d_plus_one - 1] += temp; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + //init two new memory // TODO + //padding to the nearest 2^d + int logn = ilog2ceil(n); + int powd = std::pow(2, logn); + //init temp_out + cudaMalloc((void**)&temp_out, powd * sizeof(int)); + checkCUDAError("cudaMalloc temp_out failed!"); + //assign idata value to temp_out + cudaMemcpy(temp_out, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + int gridSize = (n + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 blocksPerGrid(gridSize); + dim3 threadsPerBlock(BLOCK_SIZE); + ////first compute the new velocity + int ceil = logn - 1; + timer().startGpuTimer(); + for (int offset = 0; offset <= ceil; ++offset) { + const int pow_d_plus_one = std::pow(2, offset + 1); + //set last element to zero in temp_out in kernel + kernComputePartialUpSweep << < blocksPerGrid, threadsPerBlock >> > (n, pow_d_plus_one, temp_out); + } + + //assign 0 to root element + int last_value = 0; + cudaMemset(temp_out + powd - 1, last_value, sizeof(int)); + checkCUDAError("cudaMemSet temp_out last value to 0 failed!"); + //debug note --- If you have wrong operation in kernel, this will mess up your device memory, and next time you want to initalize them, it will fail -- so it is not problem in initialization, it is the operation having problem + for (int offset = ceil; offset >= 0; --offset) { + const int pow_d_plus_one = std::pow(2, offset + 1); + kernComputePartialDownSweep << < blocksPerGrid, threadsPerBlock >> > (n, pow_d_plus_one, temp_out); + } 
timer().endGpuTimer(); + + cudaMemcpy(odata, temp_out, n * sizeof(int), cudaMemcpyDeviceToHost); //Using the wrong tag to copy will cause the next time initalization fail + + cudaFree(temp_out); } /** @@ -31,10 +101,65 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + int logn = ilog2ceil(n); + int powd = std::pow(2, logn); + //initialize intermediate arrays + cudaMalloc((void**)&temp_bool, powd * sizeof(int)); + checkCUDAError("cudaMalloc temp_bool failed!"); + cudaMalloc((void**)&temp_scattered, powd * sizeof(int)); + checkCUDAError("cudaMalloc temp_scattered failed!"); + cudaMalloc((void**)&temp_in, powd * sizeof(int)); + checkCUDAError("cudaMalloc temp_in failed!"); + cudaMalloc((void**)&temp_out, powd * sizeof(int)); + checkCUDAError("cudaMalloc temp_out failed!"); + cudaMemcpy(temp_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy idata to temp_in failed!"); + int gridSize = (n + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 blocksPerGrid(gridSize); + dim3 threadsPerBlock(BLOCK_SIZE); + + //map values to bool array + Common::kernMapToBoolean << < blocksPerGrid, threadsPerBlock >> > (n, temp_bool, temp_in); + + //assign bool value to temp_scattered + cudaMemcpy(temp_scattered, temp_bool, n * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy temp_scattered failed!"); + //apply scan on bool array timer().startGpuTimer(); - // TODO + int ceil = logn - 1; + for (int offset = 0; offset <= ceil; ++offset) { + const int pow_d_plus_one = std::pow(2, offset + 1); + //set last element to zero in temp_out in kernel + kernComputePartialUpSweep << < blocksPerGrid, threadsPerBlock >> > (n, pow_d_plus_one, temp_scattered); + } + + //assign 0 to root element + int last_value = 0; + cudaMemset(temp_scattered + powd - 1, last_value, sizeof(int)); + checkCUDAError("cudaMemSet temp_scattered last value to 0 failed!"); + for (int offset = 
ceil; offset >= 0; --offset) { + const int pow_d_plus_one = std::pow(2, offset + 1); + kernComputePartialDownSweep << < blocksPerGrid, threadsPerBlock >> > (n, pow_d_plus_one, temp_scattered); + } + + //got the correct indices of non-zero element in the array + //map to odata + Common::kernScatter << < blocksPerGrid, threadsPerBlock >> > (n, temp_out, temp_in, temp_bool, temp_scattered); + timer().endGpuTimer(); - return -1; + //copy from temp_out to odata + cudaMemcpy(odata, temp_out, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy temp_out to odata failed!"); + + int last_scattered = 0; + int last_bool = 0; + cudaMemcpy(&last_scattered, &temp_scattered[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&last_bool, &temp_bool[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + int odata_size = last_scattered + last_bool; + + cudaFree(temp_bool); + cudaFree(temp_scattered); + return odata_size; } } } diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu index 4308876..5d30d83 100644 --- a/Project2-Stream-Compaction/stream_compaction/naive.cu +++ b/Project2-Stream-Compaction/stream_compaction/naive.cu @@ -13,13 +13,78 @@ namespace StreamCompaction { } // TODO: __global__ + __global__ void kernComputePartialNaive(int n, int pow_d, int* temp_in, int* temp_out) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + { + return; + } + //no need to store the idata to temp_in, it is already there + if (index >= pow_d) temp_out[index] = temp_in[index - pow_d] + temp_in[index]; + else temp_out[index] = temp_in[index]; + } + + __global__ void kernShiftTempOut(int n, int* temp_in, int* temp_out) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + { + return; + } + + if (index == 0) + { + temp_out[index] = 0; + } + else + { + temp_out[index] = temp_in[index - 1]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the 
result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + int* temp_in; + int* temp_out; // TODO + //init two new memory + cudaMalloc((void**)&temp_in, n * sizeof(int)); + checkCUDAError("cudaMalloc temp_in failed!"); + + cudaMalloc((void**)&temp_out, n * sizeof(int)); + checkCUDAError("cudaMalloc temp_out failed!"); + int gridSize = (n + BLOCK_SIZE - 1) / BLOCK_SIZE; + dim3 blocksPerGrid(gridSize); + dim3 threadsPerBlock(BLOCK_SIZE); + + //copy idata to temp_in + timer().startGpuTimer(); + cudaMemcpy(temp_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); //idata is in global memory + checkCUDAError("cudaMemcpy idata to temp_in failed!"); + //temp pointer + + //first compute the new velocity + int ceil = ilog2ceil(n); + for (int offset = 1; offset <= ceil; ++offset) { + const int pow_d_minus_one = std::pow(2, offset - 1); + kernComputePartialNaive << < blocksPerGrid, threadsPerBlock >> > (n, pow_d_minus_one, temp_in, temp_out); + + int* temp = temp_in; + temp_in = temp_out; + temp_out = temp; + + } timer().endGpuTimer(); + + //shift temp_out by one to get exclusive scan from reduction + kernShiftTempOut << < blocksPerGrid, threadsPerBlock >> > (n, temp_in,temp_out); + //pass the result from temp array to result + cudaMemcpy(odata, temp_out, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy temp_out to odata failed!"); + cudaFree(temp_in); + cudaFree(temp_out); } } } diff --git a/Project2-Stream-Compaction/stream_compaction/thrust.cu b/Project2-Stream-Compaction/stream_compaction/thrust.cu index 1def45e..2963533 100644 --- a/Project2-Stream-Compaction/stream_compaction/thrust.cu +++ b/Project2-Stream-Compaction/stream_compaction/thrust.cu @@ -9,6 +9,9 @@ namespace StreamCompaction { namespace Thrust { using StreamCompaction::Common::PerformanceTimer; + //intermediate arrays + int* temp_in; + int* temp_out; PerformanceTimer& timer() { static PerformanceTimer timer; @@ -18,11 +21,24 @@ namespace 
StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO use `thrust::exclusive_scan` + cudaMalloc((void**)&temp_in, n * sizeof(int)); + checkCUDAError("cudaMalloc temp_in failed!"); + cudaMalloc((void**)&temp_out, n * sizeof(int)); + checkCUDAError("cudaMalloc temp_out failed!"); + + cudaMemcpy(temp_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); + //warp to a device ptr + thrust::device_ptr dev_in(temp_in); + thrust::device_ptr dev_out(temp_out); // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_in, dev_in + n, dev_out); timer().endGpuTimer(); + cudaMemcpy(odata, temp_out, n * sizeof(int), cudaMemcpyDeviceToHost); + } } }