diff --git a/README.md b/README.md index a82ea0f..b6752a3 100644 --- a/README.md +++ b/README.md @@ -3,211 +3,86 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Xinyue Zhu +* Tested on: Windows 10, i5-5200U @ 2.20GHz 8GB, GTX 960M -### (TODO: Your README) - -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) - -Instructions (delete me) -======================== - -This is due Sunday, September 13 at midnight. - -**Summary:** In this project, you'll implement GPU stream compaction in CUDA, -from scratch. This algorithm is widely used, and will be important for -accelerating your path tracer project. - -Your stream compaction implementations in this project will simply remove `0`s -from an array of `int`s. In the path tracer, you will remove terminated paths -from an array of rays. - -In addition to being useful for your path tracer, this project is meant to -reorient your algorithmic thinking to the way of the GPU. On GPUs, many -algorithms can benefit from massive parallelism and, in particular, data -parallelism: executing the same code many times simultaneously with different -data. - -You'll implement a few different versions of the *Scan* (*Prefix Sum*) -algorithm. First, you'll implement a CPU version of the algorithm to reinforce -your understanding. Then, you'll write a few GPU implementations: "naive" and -"work-efficient." Finally, you'll use some of these to implement GPU stream -compaction. - -**Algorithm overview & details:** There are two primary references for details -on the implementation of scan and stream compaction. - -* The [slides on Parallel Algorithms](https://github.com/CIS565-Fall-2015/cis565-fall-2015.github.io/raw/master/lectures/2-Parallel-Algorithms.pptx) - for Scan, Stream Compaction, and Work-Efficient Parallel Scan. -* GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html). - -Your GPU stream compaction implementation will live inside of the -`stream_compaction` subproject. This way, you will be able to easily copy it -over for use in your GPU path tracer. - - -## Part 0: The Usual - -This project (and all other CUDA projects in this course) requires an NVIDIA -graphics card with CUDA capability. Any card with Compute Capability 2.0 -(`sm_20`) or greater will work. Check your GPU on this -[compatibility table](https://developer.nvidia.com/cuda-gpus). -If you do not have a personal machine with these specs, you may use those -computers in the Moore 100B/C which have supported GPUs. - -**HOWEVER**: If you need to use the lab computer for your development, you will -not presently be able to do GPU performance profiling. This will be very -important for debugging performance bottlenecks in your program. - -### Useful existing code - -* `stream_compaction/common.h` - * `checkCUDAError` macro: checks for CUDA errors and exits if there were any. - * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer. -* `main.cpp` - * Some testing code for your implementations. - - -## Part 1: CPU Scan & Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/cpu.cu`, implement: - -* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum. 
-* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using - the `scan` function. -* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan` - function. Map the input array to an array of 0s and 1s, scan it, and use - scatter to produce the output. You will need a **CPU** scatter implementation - for this (see slides or GPU Gems chapter for an explanation). - -These implementations should only be a few lines long. - - -## Part 2: Naive GPU Scan Algorithm - -In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan` - -This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. We haven't yet -taught shared memory, and you **shouldn't use it yet**. Example 39-1 uses -shared memory, but is limited to operating on very small arrays! Instead, write -this using global memory only. As a result of this, you will have to do -`ilog2ceil(n)` separate kernel invocations. - -Beware of errors in Example 39-1 in the book; both the pseudocode and the CUDA -code in the online version of Chapter 39 are known to have a few small errors -(in superscripting, missing braces, bad indentation, etc.) - -Since the parallel scan algorithm operates on a binary tree structure, it works -best with arrays with power-of-two length. Make sure your implementation works -on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory -- your intermediate array sizes will need to be rounded to the next power of -two. - - -## Part 3: Work-Efficient GPU Scan & Stream Compaction - -### 3.1. Scan - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::scan` - -All of the text in Part 2 applies. - -* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2. -* Beware of errors in Example 39-2. -* Test non-power-of-two sized arrays. - -### 3.2. Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::compact` - -For compaction, you will also need to implement the scatter algorithm presented -in the slides and the GPU Gems chapter. - -In `stream_compaction/common.cu`, implement these for use in `compact`: - -* `StreamCompaction::Common::kernMapToBoolean` -* `StreamCompaction::Common::kernScatter` - - -## Part 4: Using Thrust's Implementation - -In `stream_compaction/thrust.cu`, implement: - -* `StreamCompaction::Thrust::scan` - -This should be a very short function which wraps a call to the Thrust library -function `thrust::exclusive_scan(first, last, result)`. - -To measure timing, be sure to exclude memory operations by passing -`exclusive_scan` a `thrust::device_vector` (which is already allocated on the -GPU). You can create a `thrust::device_vector` by creating a -`thrust::host_vector` from the given pointer, then casting it. - - -## Part 5: Radix Sort (Extra Credit) (+10) - -Add an additional module to the `stream_compaction` subproject. Implement radix -sort using one of your scan implementations. Add tests to check its correctness. - - -## Write-up - -1. Update all of the TODOs at the top of this README. -2. Add a description of this project including a list of its features. -3. Add your performance analysis (see below). - -All extra credit features must be documented in your README, explaining its -value (with performance comparison, if applicable!) and showing an example how -it works. For radix sort, show how it is called and an example of its output. 
-
-Always profile with Release mode builds and run without debugging.
+### README
+The test output below is labeled with the number of the requirement each
+result corresponds to:
+
+* 1: CPU Scan & Compaction (cpu.cu)
+* 2: Naive Scan (naive.cu)
+* 3.1: Work-Efficient Scan (efficient.cu)
+* 3.2: Work-Efficient Compaction (efficient.cu)
+* 4: Thrust Scan (thrust.cu)
+
 ### Questions
+* Block size: changing the block size made little measurable difference in run
+  time for these array sizes.
+
+* Comparison of the GPU Scan implementations (Naive, Work-Efficient, and
+  Thrust) against the serial CPU version of Scan (the comparison harness is
+  not included in this repository):
+
+  ![](graph.png)
+
+* The Thrust implementation shows up separately on the Nsight timeline; its
+  kernels appear every few seconds.
+
+* `cudaMalloc` takes over 167439 us, more than half of the total run time, so
+  memory I/O is the bottleneck. The I/O time differs between implementations
+  because some of them need to allocate more device memory. (A CUDA-event
+  timing sketch follows the test output below.)
+
+* Output:
+
+```
+****************
+** SCAN TESTS **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 26 0 ]
+==== cpu scan, power-of-two ====
+ 1.1
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ]
+==== cpu scan, non-power-of-two ====
+ 1.1
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6146 6190 ]
+    passed
+==== naive scan, power-of-two ====
+ 2.1
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ]
+    passed
+==== naive scan, non-power-of-two ====
+ 2.1
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ]
+    passed
+==== work-efficient scan, power-of-two ====
+ 3.1
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+ 3.1
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6146 6190 ]
+    passed
+==== thrust scan, power-of-two ====
+ 4.1 passed
+==== thrust scan, non-power-of-two ====
+ 4.1 passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ]
+==== cpu compact without scan, power-of-two ====
+ 1.2
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+ 1.2
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 2 ]
+    passed
+==== cpu compact with scan ====
+ 1.3
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+ 3.2
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+ 3.2
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 2 ]
+    passed
+```
-* Roughly optimize the block sizes of each of your implementations for minimal
-  run time on your GPU.
-  * (You shouldn't compare unoptimized implementations to each other!)
-
-* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and
-  Thrust) to the serial CPU version of Scan. Plot a graph of the comparison
-  (with array size on the independent axis).
-  * You should use CUDA events for timing. Be sure **not** to include any
-    explicit memory operations in your performance measurements, for
-    comparability.
-  * To guess at what might be happening inside the Thrust implementation, take
-    a look at the Nsight timeline for its execution.
-
-* Write a brief explanation of the phenomena you see here.
-  * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is
-    it different for each implementation?
-
-* Paste the output of the test program into a triple-backtick block in your
-  README.
-  * If you add your own tests (e.g. for radix sort or to test additional corner
-    cases), be sure to mention it explicitly.
-
-These questions should help guide you in performance analysis on future
-assignments, as well.
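+The work-efficient scan in `efficient.cu` brackets its kernel launches with CUDA
+events. The helper below is a minimal sketch of that pattern, with allocation
+and host/device copies kept outside the timed region; `timeKernelsOnly` and
+`runKernels` are illustrative names, not code from this repository:
+
+```cpp
+#include <cuda_runtime.h>
+
+// Returns the elapsed GPU time, in ms, of the kernel launches issued by
+// runKernels(). cudaMalloc/cudaMemcpy done outside of it are not counted.
+float timeKernelsOnly(void (*runKernels)()) {
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    cudaEventRecord(start);
+    runKernels();                   // kernel launches only
+    cudaEventRecord(stop);
+
+    cudaEventSynchronize(stop);     // wait until the recorded work finishes
+    float ms = 0.0f;
+    cudaEventElapsedTime(&ms, start, stop);
+
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+    return ms;
+}
+```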
-
-## Submit
-
-If you have modified any of the `CMakeLists.txt` files at all (aside from the
-list of `SOURCE_FILES`), you must test that your project can build in Moore
-100B/C. Beware of any build issues discussed on the Google Group.
-
-1. Open a GitHub pull request so that we can see that you have finished.
-   The title should be "Submission: YOUR NAME".
-2. Send an email to the TA (gmail: kainino1+cis565@) with:
-   * **Subject**: in the form of `[CIS565] Project 2: PENNKEY`
-   * Direct link to your pull request on GitHub
-   * In the form of a grade (0-100+) with comments, evaluate your own
-     performance on the project.
-   * Feedback on the project itself, if any.
diff --git a/graph.png b/graph.png
new file mode 100644
index 0000000..acf2667
Binary files /dev/null and b/graph.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 675da35..90ec7b6 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -26,42 +26,43 @@ int main(int argc, char* argv[]) {
     printf("****************\n");
     genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
+    a[SIZE - 1] = 0;  // indices 0 .. SIZE-1, so the array holds SIZE elements
     printArray(SIZE, a, true);
 
     zeroArray(SIZE, b);
     printDesc("cpu scan, power-of-two");
     StreamCompaction::CPU::scan(SIZE, b, a);
     printArray(SIZE, b, true);
+
     zeroArray(SIZE, c);
     printDesc("cpu scan, non-power-of-two");
-    StreamCompaction::CPU::scan(NPOT, c, a);
+    StreamCompaction::CPU::scan(NPOT, c, a);  // NPOT == 253
     printArray(NPOT, b, true);
     printCmpResult(NPOT, b, c);
-
+    /////////////////////////////////////////////////////
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
@@ -112,12 +113,13 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
+    getchar();  // keep the console window open
 }
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index fe872d4..5893e1b 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -22,17 +22,41 @@ namespace Common {
  * Maps an array to an array of 0s and 1s for stream compaction. Elements
  * which map to 0 will be removed, and elements which map to 1 will be kept.
  */
-__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
     // TODO
+    int k = threadIdx.x + blockIdx.x * blockDim.x;
+    if (k >= n) {
+        return;
+    }
+    bools[k] = (idata[k] == 0) ? 0 : 1;
 }
 
 /**
  * Performs scatter on an array. That is, for each element in idata,
  * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
  */
-__global__ void kernScatter(int n, int *odata,
-        const int *idata, const int *bools, const int *indices) {
+__global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) {
     // TODO
+    // indices holds the exclusive scan of bools; n is the number of input elements.
+    int k = threadIdx.x + blockIdx.x * blockDim.x;
+    if (k >= n) {
+        return;
+    }
+    if (bools[k] == 1) {
+        odata[indices[k]] = idata[k];
+    }
 }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e600c29..2c9454f 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,36 +1,101 @@
 #include <cstdio>
 #include "cpu.h"
-
+#include <iostream>
 namespace StreamCompaction {
-namespace CPU {
+namespace CPU {
 
-/**
- * CPU scan (prefix sum).
- */
-void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
-}
+/**
+ * CPU scan (prefix sum).
+ */
+void scan(int n, int *odata, const int *idata) {
+    // Exclusive prefix sum: odata[i] = sum of idata[0 .. i-1].
+    odata[0] = 0;
+    for (int i = 1; i < n; i++) {
+        odata[i] = odata[i - 1] + idata[i - 1];
+    }
+    std::cout << "1.1";
+}
 
-/**
- * CPU stream compaction without using the scan function.
- *
- * @returns the number of elements remaining after compaction.
- */
-int compactWithoutScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
-}
+/**
+ * CPU stream compaction without using the scan function.
+ *
+ * @returns the number of elements remaining after compaction.
+ */
+int compactWithoutScan(int n, int *odata, const int *idata) {
+    // Copy every nonzero element forward to the next free slot.
+    int count = 0;
+    for (int i = 0; i < n; i++) {
+        if (idata[i] != 0) {
+            odata[count] = idata[i];
+            count++;
+        }
+    }
+    std::cout << "1.2" << std::endl;
+    return count;
+}
 
-/**
- * CPU stream compaction using scan and scatter, like the parallel version.
- *
- * @returns the number of elements remaining after compaction.
- */
-int compactWithScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
-}
+/**
+ * CPU stream compaction using scan and scatter, like the parallel version.
+ *
+ * @returns the number of elements remaining after compaction.
+ */
+int compactWithScan(int n, int *odata, const int *idata) {
+    // Map to 0/1 flags, exclusive-scan the flags, then scatter.
+    int *bools = new int[n];
+    int *indices = new int[n];
+    int count = 0;
+    for (int i = 0; i < n; i++) {
+        bools[i] = (idata[i] == 0) ? 0 : 1;
+    }
+    indices[0] = 0;
+    for (int i = 1; i < n; i++) {
+        indices[i] = indices[i - 1] + bools[i - 1];
+    }
+    for (int i = 0; i < n; i++) {
+        if (bools[i] == 1) {
+            odata[indices[i]] = idata[i];
+            count = indices[i] + 1;
+        }
+    }
+    delete[] bools;
+    delete[] indices;
+    std::cout << "1.3" << std::endl;
+    return count;
+}
+
+}
+}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -1,7 +1,10 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
+#include <iostream>
+int *dev_A1;
+int *dev_B1;
 
 namespace StreamCompaction {
 namespace Efficient {
@@ -11,9 +14,86 @@ namespace Efficient {
 
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
+__global__ void Uscan(int n, int p1, int p2, int *od){
+    // Up-sweep (reduce): each active thread adds its left child into the parent.
+    int thid = (threadIdx.x + blockIdx.x * blockDim.x) * 2 * p1;
+    if (thid + p2 - 1 < n){
+        od[thid + p2 - 1] += od[thid + p1 - 1];
+    }
+}
+
+__global__ void put0(int *odata, int n)
+{
+    // Zero the root before the down-sweep so the result is an exclusive scan.
+    odata[n - 1] = 0;
+}
+
+__global__ void Dscan(int n, int p1, int p2, int *od){
+    // Down-sweep: swap the left child with its parent, then add.
+    int thid = (threadIdx.x + blockIdx.x * blockDim.x) * 2 * p1;
+    if (thid + p2 - 1 < n){
+        int t = od[thid + p1 - 1];
+        od[thid + p1 - 1] = od[thid + p2 - 1];
+        od[thid + p2 - 1] += t;
+    }
+}
+
+void init(int n, int num, const int *hst_A){
+    // Allocate a power-of-two sized buffer, zero it, and copy in the n inputs;
+    // the zero padding does not change the prefix sums of the first n elements.
+    int _size = num * sizeof(int);
+    cudaMalloc((void**)&dev_A1, _size);
+    cudaMemset(dev_A1, 0, _size);
+    cudaMemcpy(dev_A1, hst_A, n * sizeof(int), cudaMemcpyHostToDevice);
+}
+
 void scan(int n, int *odata, const int *idata) {
     // TODO
-    printf("TODO\n");
+    // Round the problem size up to the next power of two.
+    int num = 1 << ilog2ceil(n);
+    int p1, p2;
+    init(n, num, idata);
+
+    float ms = 0;
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start);
+
+    for (int d = 0; d <= ilog2ceil(num) - 1; d++){
+        p1 = 1 << d;
+        p2 = 1 << (d + 1);
+        Uscan<<<1, 512>>>(num, p1, p2, dev_A1);  // single block: assumes num <= 1024
+    }
+    put0<<<1, 1>>>(dev_A1, num);
+    for (int d = ilog2ceil(num) - 1; d >= 0; d--){
+        p1 = 1 << d;
+        p2 = 1 << (d + 1);
+        Dscan<<<1, 512>>>(num, p1, p2, dev_A1);
+    }
+
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&ms, start, stop);
+    printf("\t time of 3.1 efficient function1: %f ms\n", ms);
+
+    cudaMemcpy(odata, dev_A1, n * sizeof(int), cudaMemcpyDeviceToHost);
+    cudaFree(dev_A1);
+
+    printf("3.1\n");
 }
 
 /**
@@ -24,10 +104,74 @@ void scan(int n, int *odata, const int *idata) {
  * @param odata The array into which to store elements.
  * @param idata The array of elements to compact.
  * @returns The number of elements remaining after compaction.
- */
+ */
+int *dev_idata;
+int *dev_odata;
+int *dev_bool;
+int *dev_boolb;
+
 int compact(int n, int *odata, const int *idata) {
     // TODO
-    return -1;
+    // Pad to the next power of two, as in scan().
+    int num = 1 << ilog2ceil(n);
+    int _size = num * sizeof(int);
+
+    cudaMalloc((void**)&dev_bool, _size);
+    cudaMalloc((void**)&dev_boolb, _size);
+    cudaMalloc((void**)&dev_odata, _size);
+    cudaMalloc((void**)&dev_idata, _size);
+    cudaMemset(dev_bool, 0, _size);
+    cudaMemset(dev_boolb, 0, _size);
+    cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+    int p1, p2;
+    int last;
+    // Step 1: map the input to 0/1 flags. dev_boolb is scanned in place below;
+    // dev_bool keeps the unscanned flags for the scatter step.
+    Common::kernMapToBoolean<<<1, n>>>(n, dev_bool, dev_idata);
+    Common::kernMapToBoolean<<<1, n>>>(n, dev_boolb, dev_idata);
+
+    // Step 2: exclusive scan of the flags (up-sweep, zero the root, down-sweep).
+    for (int d = 0; d <= ilog2ceil(num) - 1; d++){
+        p1 = 1 << d;
+        p2 = 1 << (d + 1);
+        Uscan<<<1, num>>>(num, p1, p2, dev_boolb);
+    }
+    put0<<<1, 1>>>(dev_boolb, num);
+    for (int d = ilog2ceil(num) - 1; d >= 0; d--){
+        p1 = 1 << d;
+        p2 = 1 << (d + 1);
+        Dscan<<<1, num>>>(num, p1, p2, dev_boolb);
+    }
+
+    // Step 3: the compacted length is the last scan value plus the last flag.
+    int lastIndex = 0;
+    int lastBool = 0;
+    cudaMemcpy(&lastIndex, &dev_boolb[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&lastBool, &dev_bool[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+    last = lastIndex + lastBool;
+
+    // Step 4: scatter the kept elements to their scanned positions.
+    Common::kernScatter<<<1, n>>>(n, dev_odata, dev_idata, dev_bool, dev_boolb);
+
+    cudaMemcpy(odata, dev_odata, last * sizeof(int), cudaMemcpyDeviceToHost);
+
+    cudaFree(dev_bool);
+    cudaFree(dev_boolb);
+    cudaFree(dev_idata);
+    cudaFree(dev_odata);
+
+    printf("3.2\n");
+    return last;
 }
 
 }
 }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 395ba10..4769a7d 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -3,7 +3,7 @@ namespace StreamCompaction {
 namespace Efficient {
     void scan(int n, int *odata, const int *idata);
-
+    void init(int n, int num, const int *idata);
     int compact(int n, int *odata, const int *idata);
 }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 3d86b60..83f3742 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,7 +2,10 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
-
+#include <iostream>
+int *dev_A;
+int *dev_B;
+int *dev_C;
 namespace StreamCompaction {
 namespace Naive {
 
@@ -11,9 +14,69 @@ namespace Naive {
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
+__global__ void Nscan(int n, int logn, const int *Ain, int *Bout, int *Ctemp){
+    // Naive scan in global memory, ping-ponging between Ctemp and Bout.
+    // Assumes a single block whose thread count equals n, which is how
+    // scan() launches it.
+    int thid = threadIdx.x;
+    // Shift the input right by one so the final result is an exclusive scan.
+    Ctemp[thid] = (thid > 0) ? Ain[thid - 1] : 0;
+    __syncthreads();
+    int *in = Ctemp;
+    int *out = Bout;
+    for (int d = 1; d <= logn; d++){
+        int offset = 1 << (d - 1);
+        out[thid] = (thid >= offset) ? in[thid] + in[thid - offset] : in[thid];
+        __syncthreads();
+        int *tmp = in; in = out; out = tmp;  // swap read/write buffers
+    }
+    // After the last swap, "in" holds the finished scan.
+    Bout[thid] = in[thid];
+}
+
+void init(int *hst_A, int *hst_B, int n){
+    int _size = n * sizeof(int);
+    cudaMalloc((void**)&dev_A, _size);
+    cudaMemcpy(dev_A, hst_A, _size, cudaMemcpyHostToDevice);
+
+    cudaMalloc((void**)&dev_B, _size);
+    cudaMemset(dev_B, 0, _size);
+
+    cudaMalloc((void**)&dev_C, _size);
+}
+
 void scan(int n, int *odata, const int *idata) {
     // TODO
-    printf("TODO\n");
+    // Pad to the next power of two and zero-fill the tail.
+    int num = 1 << ilog2ceil(n);
+    int *_idata = new int[num];
+    for (int i = 0; i < num; i++){
+        _idata[i] = (i < n) ? idata[i] : 0;
+    }
+    init(_idata, odata, num);
+
+    int logn = ilog2ceil(num);
+    Nscan<<<1, num>>>(num, logn, dev_A, dev_B, dev_C);  // single block: assumes num <= 1024
+
+    cudaMemcpy(odata, dev_B, n * sizeof(int), cudaMemcpyDeviceToHost);
+    cudaFree(dev_A);
+    cudaFree(dev_B);
+    cudaFree(dev_C);
+    delete[] _idata;
+
+    printf("2.1");
 }
 
 }
 }
diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h
index 21152d6..0cdba88 100644
--- a/stream_compaction/naive.h
+++ b/stream_compaction/naive.h
@@ -3,5 +3,8 @@ namespace StreamCompaction {
 namespace Naive {
     void scan(int n, int *odata, const int *idata);
+    void init(int *hst_idata, int *hst_odata, int n);
 }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..ee4c69c 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -16,7 +16,24 @@ void scan(int n, int *odata, const int *idata) {
     // TODO use `thrust::exclusive_scan`
     // example: for device_vectors dv_in and dv_out:
     // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+    // Copy the input into device vectors, scan on the GPU, then copy back.
+    thrust::device_vector<int> dv_in, dv_out;
+    for (int i = 0; i < n; i++){
+        dv_in.push_back(idata[i]);
+        dv_out.push_back(0);
+    }
+    thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+    thrust::copy(dv_out.begin(), dv_out.end(), odata);
+
+    printf("4.1");
+}
+
+}
+}
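+
+// Note: building the device_vectors with push_back issues one host-to-device
+// copy per element. A rough sketch of a cheaper setup (same Thrust API,
+// illustrative only -- dv_in/dv_out as above) constructs them in one shot:
+//
+//     thrust::device_vector<int> dv_in(idata, idata + n);   // one bulk H2D copy
+//     thrust::device_vector<int> dv_out(n);                 // allocated on the GPU
+//     thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+//     thrust::copy(dv_out.begin(), dv_out.end(), odata);    // bulk D2H copy back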