diff --git a/README.md b/README.md index b71c458..a70589f 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,69 @@ CUDA Stream Compaction ====================== -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +###University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2 -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +Ju Yang -### (TODO: Your README) +### Tested on: Windows 7, i7-4710MQ @ 2.50GHz 8GB, GTX 870M 6870MB (Hasee Notebook K770E-i7) +![result](doc/1024.png) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +## TODOs finished: + ### 1. naive.cu + + __global__ void naive_sum(int n,int* odata, int* idata); + + void scan(int n, int *odata, const int *idata); + + ### 2. efficient.cu + + __global__ void prescan(int *g_odata, int *g_idata, int n, int*temp); + + void scan(int n, int *odata, const int *idata); + + int compact(int n, int *odata, const int *idata); + + ### 3 thrust.cu + + void scan(int n, int *odata, const int *idata); + + + ### 4 cpu.cu + + void scan(int n, int *odata, const int *idata); + + int compactWithoutScan(int n, int *odata, const int *idata) ; + + int compactWithScan(int n, int *odata, const int *idata); + + ### 5 common.cu + + __global__ void kernMapToBoolean(int n, int *bools, const int *idata); + + __global__ void kernScatter(int n, int *odata, + const int *idata, const int *bools, const int *indices); + + ### Modified the main.cpp a little bit for display. + +## Performance Graph + + ### Scanning + ![result](doc/image001.gif) + ### Thrust Scanning + ![result](doc/data_29123_image001.gif) + ### Compact + ![result](doc/data_6317_image001.gif) + +## Analysis + ### Thrust + As we can see, the thrust::exclusive_scan is rather time-costing compared with other methods. Even if I used device_vector to store the data, it is still the slowest. 
+ But since I did not free the device_vectors, the non-pow2 as second round's speed is much faster. + I think the reason is, when calling thrust functions, it will apply for some blocks/threads inside the GPU, and will release later on. + Although I tried my best to avoid any read/write from CPU to GPU, the scan function still cost some time to arrange for some place. + + ### Unfixed Known Bugs + #### 1. When using multiple blocks, sometimes the result is not right. I think it is because __syncthreads() doesn't sync blocks? + #### 2. Since I used only 1 block, when the SIZE is more than 1024(which is the limit), apparently the result is wrong. + #### 3. CPU performance is much better, and sometimes the calculating time doesn't always rise with the SIZE. + I think this is because the SIZE is still not large enough? diff --git a/doc/1024.png b/doc/1024.png new file mode 100644 index 0000000..c17a51f Binary files /dev/null and b/doc/1024.png differ diff --git a/doc/128.png b/doc/128.png new file mode 100644 index 0000000..4f9081a Binary files /dev/null and b/doc/128.png differ diff --git a/doc/16.png b/doc/16.png new file mode 100644 index 0000000..e4057f2 Binary files /dev/null and b/doc/16.png differ diff --git a/doc/256.png b/doc/256.png new file mode 100644 index 0000000..ca4fd76 Binary files /dev/null and b/doc/256.png differ diff --git a/doc/32.png b/doc/32.png new file mode 100644 index 0000000..ec22e71 Binary files /dev/null and b/doc/32.png differ diff --git a/doc/512.png b/doc/512.png new file mode 100644 index 0000000..9e7b183 Binary files /dev/null and b/doc/512.png differ diff --git a/doc/64.png b/doc/64.png new file mode 100644 index 0000000..5c4b56f Binary files /dev/null and b/doc/64.png differ diff --git a/doc/8.png b/doc/8.png new file mode 100644 index 0000000..eb5e5d4 Binary files /dev/null and b/doc/8.png differ diff --git a/doc/data.xls b/doc/data.xls new file mode 100644 index 0000000..8b434a8 Binary files /dev/null and b/doc/data.xls differ 
diff --git a/doc/data_29123_image001.gif b/doc/data_29123_image001.gif new file mode 100644 index 0000000..b077003 Binary files /dev/null and b/doc/data_29123_image001.gif differ diff --git a/doc/data_6317_image001.gif b/doc/data_6317_image001.gif new file mode 100644 index 0000000..04a6eac Binary files /dev/null and b/doc/data_6317_image001.gif differ diff --git a/doc/image001.gif b/doc/image001.gif new file mode 100644 index 0000000..3e6fc9d Binary files /dev/null and b/doc/image001.gif differ diff --git a/src/main.cpp b/src/main.cpp index 7305641..83808fc 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,14 +13,17 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array + + +const int SIZE = 1 << 10; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int a[SIZE], b[SIZE], c[SIZE]; int main(int argc, char* argv[]) { + // Scan tests + printf("Size= %d, Non-Pow2 Size= %d. \n", SIZE, NPOT); - printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); printf("****************\n"); @@ -42,7 +45,7 @@ int main(int argc, char* argv[]) { printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); + //printArray(NPOT, b, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -115,14 +118,14 @@ int main(int argc, char* argv[]) { count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); expectedNPOT = count; - printArray(count, c, true); + //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); zeroArray(SIZE, c); printDesc("cpu compact with scan"); count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); 
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); + //printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..93b2530 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,15 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = threadIdx.x; + if (idata[index] == 0)//If this is 0 + { + bools[index] = 0; + } + else + { + bools[index] = 1; + } } /** @@ -33,6 +42,13 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + + int index = threadIdx.x; + + if (bools[index]!=0) + { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..2b30508 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,15 +1,15 @@ #include #include "cpu.h" -#include "common.h" +#include "common.h" namespace StreamCompaction { namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** @@ -20,6 +20,14 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int i; + int count=0; + + for (i = 0; i < n; i++) + { + count += idata[i]; + odata[i] = count; + } timer().endCpuTimer(); } @@ -31,8 +39,21 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int i; + int count = 0; + for (i = 0; i < n; i++) + { + if (idata[i] != 0) + { + 
odata[i-count] = idata[i]; + } + else + { + count++; + } + } timer().endCpuTimer(); - return -1; + return n-count; } /** @@ -43,8 +64,23 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int i; + int count=0; + + for (i = 0; i < n; i++) + { + if (idata[i] != 0) + { + odata[i - count] = idata[i]; + } + else + { + count++; + } + } + timer().endCpuTimer(); - return -1; + return n- count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..0b1754b 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -5,36 +5,214 @@ namespace StreamCompaction { namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ + __global__ void fillempty(int n,int * start, int fill) + { + int index= threadIdx.x; + + start[index+n] = fill; + } + + //Refered from Nvidia + //https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_pref01.html + // + __global__ void prescan(int *g_odata, int *g_idata, int n, int*temp) + { + //extern __shared__ int temp[]; // allocated on invocation + + int thid = blockIdx.x * blockDim.x + threadIdx.x; + int offset = 1; + + temp[2 * thid] = g_idata[2 * thid]; // load input into shared memory + temp[2 * thid + 1] = g_idata[2 * thid + 1]; + + int d,ai,bi; + + // is >> faster than /=2 for int? 
+ for (d = n >> 1; d > 0; d >>= 1)// build sum in place up the tree + { + __syncthreads(); + if (thid < d) + { + ai = offset*(2 * thid + 1) - 1; + bi = offset*(2 * thid + 2) - 1; + + temp[bi] += temp[ai]; + } + offset *= 2; + } + + if (thid == 0) + { + temp[n - 1] = 0; // clear the last element + } + + int t; + + for (d = 1; d < n; d *= 2) // traverse down tree & build scan + { + offset >>= 1; + __syncthreads(); + if (thid < d) + { + ai = offset*(2 * thid + 1) - 1; + bi = offset*(2 * thid + 2) - 1; + + t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + __syncthreads(); + + g_odata[2 * thid] = temp[2 * thid]; // write results to device memory + g_odata[2 * thid + 1] = temp[2 * thid + 1]; + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + + //Deal with non pow2 arrays + int N; + int count; + count = ilog2(n); + N = 1<> > (n,buffer3, 0); + fillempty << <1, (4 + N - n) >> > (n,buffer4, 0); + fillempty << <1, (4 + N - n) >> > (n,temp, 0); + + //Start Timer + timer().startGpuTimer(); // TODO - timer().endGpuTimer(); + prescan<<<1,N/2>>>(buffer4, buffer3, N,temp); + //END Timer + timer().endGpuTimer(); + + //COPY data back to CPU + cudaMemcpy(odata, buffer4+1, N * sizeof(int), cudaMemcpyDeviceToHost); + //Should I do this???? + odata[n - 1] = odata[n - 2] + idata[n - 1]; + //Free buffers + cudaFree(buffer3); + cudaFree(buffer4); + cudaFree(temp); } + + + + __global__ void naive_sum2(int n, int* odata, int* idata) + { + int index = threadIdx.x; + int i; + int * ping; + int * pong; + int * swap; + ping = idata; + pong = odata; + __syncthreads(); + + for (i = 1; i < n; i *= 2) + { + if (index - i >= 0) + { + pong[index] = ping[index] + ping[index - i]; + } + else + { + pong[index] = ping[index]; + } + //Ping-Pong here! 
+ swap = ping; + ping = pong; + pong = swap; + __syncthreads(); + } + } + + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. + */ - /** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. - */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO - timer().endGpuTimer(); - return -1; + + int* data; + cudaMalloc((void**)&data, n * sizeof(int)); + cudaMemcpy(data, idata, n * sizeof(int), cudaMemcpyHostToDevice); //COPY from idata in CPU to data in GPU + + int* out; + cudaMalloc((void**)&out, n * sizeof(int)); + cudaMemcpy(out, idata, n * sizeof(int), cudaMemcpyHostToDevice); //COPY from idata in CPU to data in GPU + + int* bools; + cudaMalloc((void**)&bools, n * sizeof(int)); + + int* index; + cudaMalloc((void**)&index, n * sizeof(int)); + + + timer().startGpuTimer(); + + Common::kernMapToBoolean<<<1,n>>>(n, bools, data); //map the bools + + //TEST + cudaMemcpy(odata, bools, n * sizeof(int), cudaMemcpyDeviceToHost); //COPY from idata in CPU to data in GPU + int i; + int count = 0; + + + for (i = 0; i < n; i++) + { + count += odata[i]; + odata[i] = count-1; + } + + cudaMemcpy(index, odata, n * sizeof(int), cudaMemcpyHostToDevice); //COPY from idata in CPU to data in GPU + cudaMemcpy(data, idata, n * sizeof(int), cudaMemcpyHostToDevice); //COPY from idata in CPU to data in GPU + + Common::kernScatter << <1, n>> > (n, out, data, bools, index);//write to out + + timer().endGpuTimer(); + + //COPY 
data back to CPU + cudaMemcpy(odata, out, n * sizeof(int), cudaMemcpyDeviceToHost); //COPY back to CPU + + //Handle the first + if(idata[0]!=0) + { + odata[0] = idata[0]; + } + return count; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..55f7bb5 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -5,21 +5,71 @@ namespace StreamCompaction { namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } // TODO: __global__ + __global__ void naive_sum(int n,int* odata, int* idata) + { + int index = blockIdx.x *blockDim.x + threadIdx.x; + int i; + int * ping; + int * pong; + int * swap; + ping = idata; + pong = odata; + __syncthreads(); + + for (i = 1; i < n; i *= 2) + { + if (index - i >= 0) + { + pong[index] = ping[index] + ping[index - i]; + } + else + { + pong[index] = ping[index]; + } + //Ping-Pong here! + swap = ping; + ping = pong; + pong = swap; + __syncthreads(); + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO - timer().endGpuTimer(); + int count; + + + //Create 2 buffers + int* buffer1; + int* buffer2; + cudaMalloc((void**)&buffer1, n * sizeof(int)); + cudaMalloc((void**)&buffer2, n * sizeof(int)); + cudaMemcpy(buffer1, idata, n * sizeof(int), cudaMemcpyHostToDevice); //COPY from idata in CPU to buffer1 in GPU + cudaMemcpy(buffer2, idata, n * sizeof(int), cudaMemcpyHostToDevice); //COPY from idata in CPU to buffer2 in GPU + //Start timer + timer().startGpuTimer(); + //Do Naive_Sum here! 
+ naive_sum << <1,n>> > (n, buffer2, buffer1); + //End timer + timer().endGpuTimer(); + //COPY data back to CPU + cudaMemcpy(odata, buffer1, n * sizeof(int), cudaMemcpyDeviceToHost); + //Free buffers + cudaFree(buffer1); + cudaFree(buffer2); + } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..c7088b5 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -8,21 +8,42 @@ namespace StreamCompaction { namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + int* a; + cudaMalloc((void**)&a, n * sizeof(int)); + cudaMemcpy(a, idata, n * sizeof(int), cudaMemcpyHostToDevice); //COPY from CPU to GPU + + thrust::device_vector in(n+1); + thrust::device_vector out(n+1); + + thrust::device_ptr here(a); + thrust::copy(here, here + n, in.begin()); //CPOY from GPU to Thrust + + //Start Timing timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); + + thrust::exclusive_scan(in.begin(), in.end(), out.begin()); + timer().endGpuTimer(); + //End Timing + + thrust::copy(out.begin()+1, out.end(), odata); //COPY from Thrust to CPU + + cudaFree(a); + + } } }