diff --git a/README.md b/README.md
index b71c458..0a49626 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,97 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Akshay Shah
+* Tested on: Windows 10, i7-5700HQ @ 2.70GHz 16GB, GTX 970M 6GB (Personal Computer)
 
-### (TODO: Your README)
+### Stream Compaction
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+Block Size: 256
 
+Array input: vary from 2<sup>18</sup> to 2<sup>26</sup>.
+
+Max size: 2<sup>30</sup>
+
+CUDA Summary
+
+![](images/GPUCUDAstats.PNG)
+
+![](images/cudaCoreStats.PNG)
+
+Memory I/O looks to be the bottleneck over the GPU, meaning changing the variables over to shared memory might make a difference. (not checked, just speculating)
+
+
+#### Output
+```
+****************
+** SCAN TESTS **
+****************
+    [  38  19  38  37   5  47  15  35   0  12   3   0  42 ...  38   0 ]
+==== cpu scan, power-of-two ====
+215 ms total, average : 215 ms.
+==== cpu scan, non-power-of-two ====
+    passed
+==== naive scan, power-of-two ====
+total time to run naive scan, power-of-two: 187.5 in ms, aver: 187.5
+    passed
+==== naive scan, non-power-of-two ====
+    passed
+==== work-efficient scan, power-of-two ====
+total time to run efficient scan, power-of-two: 171.601 in ms, aver: 171.601
+    passed
+==== work-efficient scan, non-power-of-two ====
+    passed
+==== thrust scan, power-of-two ====
+49 ms total thrust, average : 49 ms.
+    passed
+==== thrust scan, non-power-of-two ====
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   3   2   1   3   1   1   1   2   0   1   0   2 ...   0   0 ]
+==== cpu compact without scan, power-of-two ====
+182 ms total cpu w/o scan, average : 182 ms.
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   1   1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   2   2 ]
+    passed
+==== cpu compact with scan ====
+356 ms total cpu w scan, average : 356 ms.
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   1   1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+total time to run efficient compact, power-of-two: 186.149 in ms, aver: 186.149
+    passed
+==== work-efficient compact, non-power-of-two ====
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   2   2 ]
+    passed
+
+*****************************
+** RADIX SORT TESTS **
+*****************************
+    [   3   5   0   1   0   2   0   1   6   1   1   2   1 ...   0   0 ]
+==== radix sort, power-of-two ====
+total time to run radix, power-of-two: 67.8532 in ms, aver: 67.8532
+    [   0   0   0   0   0   0   0   0   0   0   0   0   0 ...   6   6 ]
+==== thrust sort, power-of-two ====
+515 ms total thrust sort, average : 515 ms.
+```
+
+![Fig1](images/naivevsefficient.png)
+
+##### A comparison of Stream Compaction over the GPU vs CPU
+
+The time was measured for 1000 iterations
+Exclusive scan was used inside the stream compaction over GPU, which is shown (CPU vs GPU scan) in Fig. 1
+
+![Fig2](images/streamcompact.png)
+
+##### Radix sort
+Implemented Radix sort on the GPU, that uses exclusive scan. The output shows a sorted array.
+Limitation to this implementation is that the max digit that can be used for sorting is 7. To increase the digit limit, you would have to change the lsb iteration number in sort.cu to whatever bit is the maximum.
+There is a comparison between thrust's CPU sort and this GPU sort.
+
+![Fig3](images/sortcpuvgpu.png)
diff --git a/images/GPUCUDAstats.PNG b/images/GPUCUDAstats.PNG
new file mode 100644
index 0000000..47ff1b3
Binary files /dev/null and b/images/GPUCUDAstats.PNG differ
diff --git a/images/cudaCoreStats.PNG b/images/cudaCoreStats.PNG
new file mode 100644
index 0000000..5840bb7
Binary files /dev/null and b/images/cudaCoreStats.PNG differ
diff --git a/images/naivevsefficient.png b/images/naivevsefficient.png
new file mode 100644
index 0000000..5129a21
Binary files /dev/null and b/images/naivevsefficient.png differ
diff --git a/images/sortcpuvgpu.png b/images/sortcpuvgpu.png
new file mode 100644
index 0000000..22900c5
Binary files /dev/null and b/images/sortcpuvgpu.png differ
diff --git a/images/streamcompact.png b/images/streamcompact.png
new file mode 100644
index 0000000..9a4ae7d
Binary files /dev/null and b/images/streamcompact.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 675da35..7038f23 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -11,13 +11,23 @@
 #include <stream_compaction/naive.h>
 #include <stream_compaction/efficient.h>
 #include <stream_compaction/thrust.h>
+#include <stream_compaction/sort.h>
 #include "testing_helpers.hpp"
+#include <iostream>
+#include <chrono>
+
+#define ITER 1
 
 int main(int argc, char* argv[]) {
-    const int SIZE = 1 << 8;
+    const size_t SIZE = 1 << 20;
     const int NPOT = SIZE - 3;
-    int a[SIZE], b[SIZE], c[SIZE];
+    //int a[SIZE], b[SIZE], c[SIZE];
+	
+	int *a = new int[SIZE];
+	int *b = new int[SIZE];
+	int *c = new int[SIZE];
 
+	float time = 0.f, totalTime = 0.f;
     // Scan tests
 
     printf("\n");
@@ -31,42 +41,64 @@ int main(int argc, char* argv[]) {
 
     zeroArray(SIZE, b);
     printDesc("cpu scan, power-of-two");
-    StreamCompaction::CPU::scan(SIZE, b, a);
-    printArray(SIZE, b, true);
+	auto begin = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < ITER; ++i){
+		StreamCompaction::CPU::scan(SIZE, b, a);
+	}
+	auto end = std::chrono::high_resolution_clock::now();
+	auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
+	std::cout << (float)duration << " ms total, average : " << (float)duration / ITER << " ms." << std::endl;
+    //printArray(SIZE, b, true);
 
     zeroArray(SIZE, c);
     printDesc("cpu scan, non-power-of-two");
     StreamCompaction::CPU::scan(NPOT, c, a);
-    printArray(NPOT, b, true);
+    //printArray(NPOT, b, true);
     printCmpResult(NPOT, b, c);
 
+	totalTime = 0.f;
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    //printArray(SIZE, c, false);
+	for (int i = 0; i < ITER; ++i) {
+		StreamCompaction::Naive::scan(SIZE, c, a, time);
+		totalTime += time;
+	}
+	std::cout << "total time to run naive scan, power-of-two: " << totalTime << " in ms, aver: "  << totalTime / ITER << std::endl;
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
-    StreamCompaction::Naive::scan(NPOT, c, a);
+    StreamCompaction::Naive::scan(NPOT, c, a, time);
     //printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
+	totalTime = 0.f;
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
-    StreamCompaction::Efficient::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+	for (int i = 0; i < ITER; ++i) {
+		StreamCompaction::Efficient::scan(SIZE, c, a, time);
+		totalTime += time;
+	}
+	std::cout << "total time to run efficient scan, power-of-two: " << totalTime << " in ms, aver: " << totalTime / ITER << std::endl;
+    //printArray(SIZE, c, false);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
-    StreamCompaction::Efficient::scan(NPOT, c, a);
+	StreamCompaction::Efficient::scan(NPOT, c, a, time);
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
-    StreamCompaction::Thrust::scan(SIZE, c, a);
+	begin = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < ITER; ++i){
+		StreamCompaction::Thrust::scan(SIZE, c, a);
+	}
+	end = std::chrono::high_resolution_clock::now();
+	duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
+	std::cout << (float)duration << " ms total thrust, average : " << (float)duration / ITER << " ms." << std::endl;
     //printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
@@ -91,7 +123,13 @@ int main(int argc, char* argv[]) {
 
     zeroArray(SIZE, b);
     printDesc("cpu compact without scan, power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+	begin = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < ITER; ++i){
+		count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+	}
+	end = std::chrono::high_resolution_clock::now();
+	duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
+	std::cout << (float)duration << " ms total cpu w/o scan, average : " << (float)duration / ITER << " ms." << std::endl;
     expectedCount = count;
     printArray(count, b, true);
     printCmpLenResult(count, expectedCount, b, b);
@@ -105,19 +143,66 @@ int main(int argc, char* argv[]) {
 
     zeroArray(SIZE, c);
     printDesc("cpu compact with scan");
-    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+	begin = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < ITER; ++i){
+		count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+	}
+	end = std::chrono::high_resolution_clock::now();
+	duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
+	std::cout << (float)duration << " ms total cpu w scan, average : " << (float)duration / ITER << " ms." << std::endl;
     printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
+	totalTime = 0.f;
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, power-of-two");
-    count = StreamCompaction::Efficient::compact(SIZE, c, a);
+	for (int i = 0; i < ITER; ++i) {
+		count = StreamCompaction::Efficient::compact(SIZE, c, a, time);
+		totalTime += time;
+	}
+	std::cout << "total time to run efficient compact, power-of-two: " << totalTime << " in ms, aver: " << totalTime / ITER << std::endl;
     //printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
-    count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    //printArray(count, c, true);
+	count = StreamCompaction::Efficient::compact(NPOT, c, a, time);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
+
+	printf("\n");
+	printf("*****************************\n");
+	printf("** RADIX SORT TESTS **\n");
+	printf("*****************************\n");
+
+	// SORT tests
+
+	genArray(SIZE, a, 7);  // Leave a 0 at the end to test that edge case
+	a[SIZE - 1] = 0;
+	printArray(SIZE, a, true);
+
+	totalTime = 0.f;
+	zeroArray(SIZE, b);
+	printDesc("radix sort, power-of-two");
+	for (int i = 0; i < ITER; ++i) {
+		StreamCompaction::Sort::sort(SIZE, b, a, time);
+		totalTime += time;
+	}
+	std::cout << "total time to run radix, power-of-two: " << totalTime << " in ms, aver: " << totalTime / ITER << std::endl;
+	printArray(SIZE, b, true);
+
+
+	zeroArray(SIZE, b);
+	printDesc("thrust sort, power-of-two");
+	begin = std::chrono::high_resolution_clock::now();
+	for (int i = 0; i < ITER; ++i){
+		StreamCompaction::Thrust::sort(SIZE, a);
+	}
+	end = std::chrono::high_resolution_clock::now();
+	float fduration = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
+	std::cout << (float)fduration << " ms total thrust sort, average : " << (float)fduration / ITER << " ms." << std::endl;
+
+	delete[] a;
+	delete[] b;
+	delete[] c;
 }
diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
index f6b572f..ba40e91 100644
--- a/src/testing_helpers.hpp
+++ b/src/testing_helpers.hpp
@@ -47,6 +47,19 @@ void genArray(int n, int *a, int maxval) {
     }
 }
 
+void genArraySort(int n, int *a, int maxval) {
+	if (n == 8) {
+		a[0] = 4;
+		a[1] = 7;
+		a[2] = 2;
+		a[3] = 6;
+		a[4] = 3;
+		a[5] = 5;
+		a[6] = 1;
+		a[7] = 0;
+	}
+}
+
 void printArray(int n, int *a, bool abridged = false) {
     printf("    [ ");
     for (int i = 0; i < n; i++) {
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..b945583 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -9,9 +9,11 @@ set(SOURCE_FILES
     "efficient.cu"
     "thrust.h"
     "thrust.cu"
+	"sort.h"
+	"sort.cu"
     )
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_52
     )
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index fe872d4..6cfbf7a 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -1,39 +1,45 @@
 #include "common.h"
 
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
-    cudaError_t err = cudaGetLastError();
-    if (cudaSuccess == err) {
-        return;
-    }
-
-    fprintf(stderr, "CUDA error");
-    if (file) {
-        fprintf(stderr, " (%s:%d)", file, line);
-    }
-    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
+	cudaError_t err = cudaGetLastError();
+	if (cudaSuccess == err) {
+		return;
+	}
+
+	fprintf(stderr, "CUDA error");
+	if (file) {
+		fprintf(stderr, " (%s:%d)", file, line);
+	}
+	fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+	exit(EXIT_FAILURE);
 }
 
 
 namespace StreamCompaction {
-namespace Common {
-
-/**
- * Maps an array to an array of 0s and 1s for stream compaction. Elements
- * which map to 0 will be removed, and elements which map to 1 will be kept.
- */
-__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-    // TODO
-}
+	namespace Common {
 
-/**
- * Performs scatter on an array. That is, for each element in idata,
- * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
- */
-__global__ void kernScatter(int n, int *odata,
-        const int *idata, const int *bools, const int *indices) {
-    // TODO
-}
+		/**
+		* Maps an array to an array of 0s and 1s for stream compaction. Elements
+		* which map to 0 will be removed, and elements which map to 1 will be kept.
+		*/
+		__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+			int index = threadIdx.x + (blockDim.x * blockIdx.x);
 
-}
-}
+			if (index >= n) return;
+			bools[index] = idata[index] != 0 ? 1 : 0;
+		}
+
+		/**
+		* Performs scatter on an array. That is, for each element in idata,
+		* if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
+		*/
+		__global__ void kernScatter(int n, int *odata,
+			const int *idata, const int *bools, const int *indices) {
+			int index = threadIdx.x + (blockDim.x * blockIdx.x);
+
+			if (index >= n) return;
+			if (bools[index] == 1) odata[indices[index]] = idata[index];
+		}
+
+	}
+}
\ No newline at end of file
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 4f52663..73e6935 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -6,7 +6,7 @@
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
-
+#define blockSize 256
 /**
  * Check for CUDA errors; print and exit if there was a problem.
  */
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e600c29..6683342 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -8,8 +8,10 @@ namespace CPU {
  * CPU scan (prefix sum).
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+	odata[0] = 0;
+	for (int i = 1; i < n; ++i) {
+		odata[i] = idata[i - 1] + odata[i - 1];
+	}
 }
 
 /**
@@ -18,8 +20,14 @@ void scan(int n, int *odata, const int *idata) {
  * @returns the number of elements remaining after compaction.
  */
 int compactWithoutScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+	int j = 0;
+	for (int i = 0; i < n; ++i) {
+		if (idata[i] != 0) {
+			odata[j++] = idata[i];
+		}
+
+	}
+    return (n - j);
 }
 
 /**
@@ -28,8 +36,18 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
  * @returns the number of elements remaining after compaction.
  */
 int compactWithScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+	//Run the scan on temp array
+	int *tmp = new int[n];
+	int j = 0;
+	scan(n, tmp, idata);
+
+	for (int i = 0; i < n - 1; ++i) {
+		if ((tmp[i] != tmp[i + 1])) {
+			odata[j++] = idata[i];
+		}
+	}
+	delete[] tmp;
+	return n - j;
 }
 
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index b2f739b..98a22a1 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,17 +3,76 @@
 #include "common.h"
 #include "efficient.h"
 
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
 namespace StreamCompaction {
 namespace Efficient {
 
 // TODO: __global__
 
+	__global__ void kernDownSweep(int d, int n, int* idata) {
+		int index = threadIdx.x + (blockDim.x * blockIdx.x);
+
+		if (index >= n) return;
+		
+		int off_n = 1 << (d + 1);
+		int off = (1 << d);
+		if ((index % (off_n) == 0)) {
+			int temp = idata[index + (1 << d) - 1];
+			idata[index + off - 1] = idata[index + off_n - 1];
+			idata[index + off_n - 1] += temp;	
+		}
+	}
+
+	__global__ void kernUpSweep(int d, int n, int* idata) {
+		int index = threadIdx.x + (blockDim.x * blockIdx.x);
+
+		if (index >= n) return;
+		int off_n = 1 << (d + 1);
+		int off = (1 << d);
+		if (index % off_n == 0)
+			idata[index + off_n - 1] += idata[index + off - 1];
+	}
+
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
-void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+void scan(int n, int *odata, const int *idata, float& time) {
+	cudaEvent_t start, stop;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	//for non power of 2
+	int ilog = ilog2ceil(n);
+	int off_n = 1 << ilog;
+
+	dim3 fullBlocksPerGrid((off_n + blockSize - 1) / blockSize);
+	
+	int *dev_in;
+
+	cudaMalloc((void**)&dev_in, off_n * sizeof(int));
+	checkCUDAErrorWithLine("cudaMalloc dev_in failed!");
+
+	cudaMemcpy(dev_in, idata, sizeof(int) * off_n, cudaMemcpyHostToDevice);
+	float milliseconds = 0;
+	cudaEventRecord(start);
+	//up-sweep
+	for (int d = 0; d < ilog; ++d) {
+		kernUpSweep << <fullBlocksPerGrid, blockSize >> >(d, off_n, dev_in);
+	}
+	
+	//set the last value as zero
+	cudaMemset(dev_in + (off_n - 1), 0, sizeof(int));
+	
+	//down-sweep
+	for (int d = ilog - 1; d >= 0; --d) {
+		kernDownSweep << <fullBlocksPerGrid, blockSize >> >(d, off_n, dev_in);
+	}
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&milliseconds, start, stop);
+	time = milliseconds;
+	cudaMemcpy(odata, dev_in, sizeof(int) * (n), cudaMemcpyDeviceToHost);
+	
+	cudaFree(dev_in);
 }
 
 /**
@@ -25,9 +84,66 @@ void scan(int n, int *odata, const int *idata) {
  * @param idata  The array of elements to compact.
  * @returns      The number of elements remaining after compaction.
  */
-int compact(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+int compact(int n, int *odata, const int *idata, float& time) {
+	cudaEvent_t start, stop;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+	int *bools;
+	int *dev_in;
+	int *dev_out;
+	int *indices;
+	int *tmp = new int[n];
+	int *tmp_bools = new int[n];
+	cudaMalloc((void**)&dev_in, n * sizeof(int));
+	checkCUDAErrorWithLine("cudaMalloc dev_in failed!");
+	cudaMalloc((void**)&dev_out, n * sizeof(int));
+	checkCUDAErrorWithLine("cudaMalloc dev_out failed!");
+	cudaMalloc((void**)&indices, n * sizeof(int));
+	checkCUDAErrorWithLine("cudaMalloc indices failed!");
+	cudaMalloc((void**)&bools, n * sizeof(int));
+	checkCUDAErrorWithLine("cudaMalloc bools failed!");
+
+	cudaMemcpy(dev_in, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+	float milliseconds = 0, totalTime = 0.f;
+	cudaEventRecord(start);
+	//map to boolean
+	Common::kernMapToBoolean << <fullBlocksPerGrid, blockSize >> > (n, bools, dev_in);
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&milliseconds, start, stop);
+	totalTime += milliseconds;
+	int j = 0;
+	//scan
+	cudaMemcpy(odata, bools, sizeof(int) * (n), cudaMemcpyDeviceToHost);
+	float time_scan;
+	scan(n, tmp, odata, time_scan);
+	totalTime += time_scan;
+	cudaMemcpy(indices, tmp, sizeof(int) * n, cudaMemcpyHostToDevice);
+	cudaMemcpy(tmp_bools, bools, sizeof(int) * (n), cudaMemcpyDeviceToHost);
+	for (int i = 0; i < n; ++i) {
+		j = tmp_bools[i] == 1 ? j + 1 : j;
+	}
+	cudaEventRecord(start);
+	//scatter
+	Common::kernScatter << <fullBlocksPerGrid, blockSize >> > (n, dev_out, dev_in, bools, indices);
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&milliseconds, start, stop);
+	totalTime += milliseconds;
+
+	time = totalTime;
+	cudaMemcpy(odata, dev_out, sizeof(int) * (n), cudaMemcpyDeviceToHost);
+	
+	cudaFree(dev_in);
+	cudaFree(bools);
+	cudaFree(dev_out);
+	cudaFree(indices);
+
+	delete[] tmp;
+	delete[] tmp_bools;
+    return n - j;
 }
 
 }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 395ba10..ca99f23 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -2,8 +2,8 @@
 
 namespace StreamCompaction {
 namespace Efficient {
-    void scan(int n, int *odata, const int *idata);
+    void scan(int n, int *odata, const int *idata, float& time);
 
-    int compact(int n, int *odata, const int *idata);
+	int compact(int n, int *odata, const int *idata, float& time);
 }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 3d86b60..34f8268 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,19 +2,67 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
+#include <algorithm>
+
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
 
 namespace StreamCompaction {
-namespace Naive {
+	namespace Naive {
 
-// TODO: __global__
+		// Done: __global__
 
-/**
- * Performs prefix-sum (aka scan) on idata, storing the result into odata.
- */
-void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
-}
+		__global__ void kernReduce(int offset, int n, int *in, int *out) {
+			int index = threadIdx.x + (blockDim.x * blockIdx.x);
 
-}
+			if (index >= n) 
+				return;
+
+			if (index >= offset) {
+				out[index] = in[index] + in[index - offset];
+			}
+			else {
+				out[index] = in[index];
+			}
+
+		}
+
+		/**
+		* Performs prefix-sum (aka scan) on idata, storing the result into odata.
+		*/
+		void scan(int n, int *odata, const int *idata, float& time) {
+			cudaEvent_t start, stop;
+			cudaEventCreate(&start);
+			cudaEventCreate(&stop);
+			dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+			int *dev_out;
+			int *dev_in;
+
+			cudaMalloc((void**)&dev_in, n * sizeof(int));
+			checkCUDAErrorWithLine("cudaMalloc dev_in failed!");
+			cudaMalloc((void**)&dev_out, n * sizeof(int));
+			checkCUDAErrorWithLine("cudaMalloc dev_out failed!");
+
+			cudaMemcpy(dev_in, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+			
+			float milliseconds = 0;
+			cudaEventRecord(start);
+			for (int d = 1; d <= ilog2ceil(n); ++d) {
+				kernReduce << <fullBlocksPerGrid, blockSize >> >((1 << (d - 1)), n, dev_in, dev_out);
+				std::swap(dev_in, dev_out);
+			}
+			cudaEventRecord(stop);
+			std::swap(dev_in, dev_out);
+			cudaMemcpy(odata + 1, dev_out, sizeof(int) * (n-1), cudaMemcpyDeviceToHost);
+			
+			odata[0] = 0;
+			cudaEventSynchronize(stop);
+			cudaEventElapsedTime(&milliseconds, start, stop);
+			time = milliseconds;
+			//printf("%f\n", milliseconds);
+			cudaFree(dev_out);
+			cudaFree(dev_in);
+		}
+
+	}
 }
diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h
index 21152d6..04344a2 100644
--- a/stream_compaction/naive.h
+++ b/stream_compaction/naive.h
@@ -2,6 +2,6 @@
 
 namespace StreamCompaction {
 namespace Naive {
-    void scan(int n, int *odata, const int *idata);
+    void scan(int n, int *odata, const int *idata, float& time);
 }
 }
diff --git a/stream_compaction/sort.cu b/stream_compaction/sort.cu
new file mode 100644
index 0000000..eaff952
--- /dev/null
+++ b/stream_compaction/sort.cu
@@ -0,0 +1,114 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "sort.h"
+#include <algorithm>
+#include <thrust/scan.h>
+#include "thrust.h"
+
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
+
+namespace StreamCompaction {
+	namespace Sort {
+
+		// Done: __global__
+
+		__global__ void kernComputeTArray(int n, int total_falses, int* f, int *t) {
+			int index = threadIdx.x + (blockDim.x * blockIdx.x);
+
+			if (index >= n) return;
+			t[index] = index - f[index] + total_falses;
+		}
+
+		__global__ void kernComputeEArray(int n, int shift, int *e, int *in) {
+			int index = threadIdx.x + (blockDim.x * blockIdx.x);
+
+			if (index >= n) return;
+			e[index] = (in[index] >> shift) & 1 ? 0 : 1;
+		}
+
+		__global__ void kernScatter(int n, int* e, int *t, int *f, int* dev_out, int* dev_in) {
+			int index = threadIdx.x + (blockDim.x * blockIdx.x);
+
+			if (index >= n) return;
+
+			dev_out[!e[index] ? t[index] : f[index]] = dev_in[index];
+		}
+
+		/**
+		* Performs prefix-sum (aka scan) on idata, storing the result into odata.
+		*/
+		void sort(int n, int *odata, const int *idata, float& time) {
+			cudaEvent_t start, stop;
+			cudaEventCreate(&start);
+			cudaEventCreate(&stop);
+			dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+			int *dev_in;
+			int *dev_out;
+			int *e;
+			int *f;
+			int *t;
+
+			//int *t_host = new int[n];
+
+			int *e_host = new int[n];
+
+			cudaMalloc((void**)&dev_in, n * sizeof(int));
+			checkCUDAErrorWithLine("cudaMalloc dev_in failed!");
+			cudaMalloc((void**)&dev_out, n * sizeof(int));
+			checkCUDAErrorWithLine("cudaMalloc dev_out failed!");
+			cudaMalloc((void**)&e, n * sizeof(int));
+			checkCUDAErrorWithLine("cudaMalloc e failed!");
+			cudaMalloc((void**)&f, n * sizeof(int));
+			checkCUDAErrorWithLine("cudaMalloc f failed!");
+			cudaMalloc((void**)&t, n * sizeof(int));
+			checkCUDAErrorWithLine("cudaMalloc t failed!");
+
+			cudaMemcpy(dev_in, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+			
+			float milliseconds = 0, totalTime = 0.f;
+			
+			//max number allowed is ilog2ceil(n) - 1; for ex: if n == 8, max value any element can have is 7
+			for (int lsb = 0; lsb < 3; ++lsb) {
+				cudaEventRecord(start);
+				//compute e array
+				kernComputeEArray << <fullBlocksPerGrid, blockSize >> >(n, lsb, e, dev_in);
+				cudaEventRecord(stop);
+				cudaEventSynchronize(stop);
+				cudaEventElapsedTime(&milliseconds, start, stop);
+				totalTime += milliseconds;
+				//scan e
+				cudaMemcpy(e_host, e, sizeof(int) * (n), cudaMemcpyDeviceToHost);
+				int total_falses = e_host[n - 1];
+				thrust::exclusive_scan(e_host, e_host + n, e_host);
+				total_falses += e_host[n - 1];
+				cudaMemcpy(f, e_host, sizeof(int) * n, cudaMemcpyHostToDevice);
+				cudaEventRecord(start);
+				//compute t array
+				kernComputeTArray << <fullBlocksPerGrid, blockSize >> >(n, total_falses, f, t);
+
+				//scatter
+				kernScatter << <fullBlocksPerGrid, blockSize >> >(n, e, t, f, dev_out, dev_in);
+				cudaEventRecord(stop);
+				cudaEventSynchronize(stop);
+				cudaEventElapsedTime(&milliseconds, start, stop);
+				totalTime += milliseconds;
+				std::swap(dev_in, dev_out);
+			}
+			std::swap(dev_in, dev_out);
+			cudaMemcpy(odata, dev_out, sizeof(int) * (n), cudaMemcpyDeviceToHost);
+			
+			time = totalTime;
+
+			cudaFree(dev_in);
+			cudaFree(dev_out);
+			cudaFree(e);
+			cudaFree(t);
+			cudaFree(f);
+			
+			delete[] e_host;
+		}
+
+	}
+}
diff --git a/stream_compaction/sort.h b/stream_compaction/sort.h
new file mode 100644
index 0000000..4152c2d
--- /dev/null
+++ b/stream_compaction/sort.h
@@ -0,0 +1,7 @@
+#pragma once
+
+namespace StreamCompaction {
+	namespace Sort {
+		void sort(int n, int *odata, const int *idata, float& time);
+	}
+}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..e7f8e7f 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -3,9 +3,12 @@
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include <thrust/scan.h>
+#include <thrust/sort.h>
 #include "common.h"
 #include "thrust.h"
 
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
+
 namespace StreamCompaction {
 namespace Thrust {
 
@@ -13,9 +16,11 @@ namespace Thrust {
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO use `thrust::exclusive_scan`
-    // example: for device_vectors dv_in and dv_out:
-    // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+	thrust::exclusive_scan(idata, idata + n, odata);
+}
+
+void sort(int n, int *idata) {
+	thrust::sort(idata, idata + n);
 }
 
 }
diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h
index 06707f3..90889f4 100644
--- a/stream_compaction/thrust.h
+++ b/stream_compaction/thrust.h
@@ -3,5 +3,6 @@
 namespace StreamCompaction {
 namespace Thrust {
     void scan(int n, int *odata, const int *idata);
+	void sort(int n, int *idata);
 }
 }