diff --git a/README.md b/README.md index b71c458..5e6610e 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,210 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Michael Willett +* Tested on: Windows 10, i5-4690k @ 3.50GHz 8.00GB, GTX 750-TI 2GB (Personal Computer) -### (TODO: Your README) +## Contents +1. [Introduction](#intro) +2. [Algorithms](#part1) +3. [Verification and Performance Analysis](#part2) +4. [Development Process](#part3) +5. [Build Instructions](#appendix) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) + +## Introduction: Parallel Algorithms +The goal of this project was to implement several versions of basic GPU algorithms to become familiar with +parallel programming ideas, as well as compare the performance overhead of memory management on GPU vs +traditional CPU implementations. + + + +## Section 1: Scanning, Stream Compaction, and Sorting +Three basic algorithms were implemented in this project: + +1. Scan - consecutive sum of all elements in the array prior to the current index. +2. Compaction - removal of all elements in an array that meet some boolean criteria. +3. Sort - arrange all elements in an array from low to high using a radix sort implementation. + +Two implementations of GPU based scan were implemented: a naive version where elements were added in pairs until all +sums were computed, and a work efficient version where the total number of operators is reduced at the cost of performing +two separate stages of analysis. In the implemented version, the work-efficient scan performs slightly worse than +the naive approach from the extra down-sweep phase; however, there is room for improvement with smarter array +indexing to reduce the total number of GPU kernel calls. 
Both algorithms run in O(log(N)) which is theoretically +faster than CPU implementations at O(N), but performance analysis shows that this is only significant for large +arrays due to array access overhead on the GPU level. + +Stream compaction shows similar trends in computation complexity in GPU vs. CPU performance, but catches up to CPU +faster than the scan implementations since GPU accelerated reordering of arrays is significantly faster than CPU +implementations even for relatively small arrays. + +The radix sort algorithm iterates over each bit in the input value from least significant to most significant, reordering +the array at each step. Again since the reordering step is highly efficient on a GPU, the major overhead is due to +memory access time. + + + +## Section 2: Verification and Performance Analysis +*Code Correctness for Scan, Compact, and Sort Implementations:* + +``` +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 35 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] +==== cpu scan, non-power-of-two ==== + passed +==== naive scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] + passed +==== naive scan, non-power-of-two ==== + passed +==== work-efficient scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] + passed +==== work-efficient scan, non-power-of-two ==== + passed +==== thrust scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] + passed +==== thrust scan, non-power-of-two ==== + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 
3 3 ] + passed +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed +==== work-efficient compact, power-of-two ==== + passed +==== work-efficient compact, non-power-of-two ==== + passed + +********************** +** RADIX SORT TESTS ** +********************** +==== radix sort, power-of-two ==== + [ 38 7719 21238 2437 8855 11797 8365 32285 ] + [ 38 2437 7719 8365 8855 11797 21238 32285 ] + passed +==== radix sort, non-power-of-two ==== + [ 38 7719 21238 2437 8855 11797 8365 ] + [ 38 2437 7719 8365 8855 11797 21238 ] + passed + +Sort passed 1000/1000 randomly generated verification tests. +``` + +### Benchmarks +*All performance measurements were averaged over 1000 samples on a single dataset* +![Scan Performance](imgs/scan_benchmarks.PNG) +![Compact Performance](imgs/compact_benchmarks.PNG) +![Sort Performance](imgs/sort_benchmarks.PNG) + +We can quickly see from the above charts that run time for the various GPU implementations for scan, compact, and sort all +have logarithmic growth, while CPU implementations observe polynomial growth (for scan and compaction, CPU growth is linear). +As a result, for smaller arrays the overhead of memory access and array manipulation results in significantly worse execution +time, and it isn't until arrays of about ~50,000 elements that performance is comparable. Unfortunately due to implementation +choices, the GPU implementations crash for arrays larger than 2^16, so performance analysis could not be run for these +large arrays, and estimates must be extrapolated from the current dataset. + +![Scan Kernel Calls](imgs/scan_performance.PNG) +The above figure shows the room for improvement of the efficient scan quite easily. Compared to the naive scan steps, +the up-sweep and down-sweep of the work-efficient implementation do not utilize nearly as much of the GPU per call as +the naive scan. 
Saturating the GPU here could yield significant improvements by reducing the total number of kernel +calls required. + + +For the thrust library benchmarks, it is very interesting to see what appears to be standard linear growth due to the +performance loss in large arrays. Preliminary research on the thrust sort implementation implies that for basic sorting +of primitives like integers used in this example, it should also be using a radix sort implemented on the GPU. The charts +suggest, however, that thrust is either not properly parallelizing the data, or it is running on the CPU. Documentation states +that method overloads that accept arrays located on the host will automatically handle data transfer to and from the GPU, +but that may not be the case. Additional work to instantiate the data on the GPU prior to the sort results in an Abort() +call from the thrust library, so additional investigation would be merited. + + +## Section 3: Development Process +Development was fairly straightforward algorithmic implementation. Future work could be done in the work efficient +scan implementation to better handle launching kernel functions at the high depth levels when only a couple of sums are +being calculated for the whole array. This should improve performance to be faster than the naive approach, but I +expect it to still be outperformed by the thrust implementation as well as the CPU version for arrays with fewer than +30,000 elements. + +It is worth noting there was a bug in using std::pow for calculating the array index in each kernel invocation. For some +unknown reason, it consistently produced erroneous values at 2^11, or 2048. This is odd since variables were being cast to +double precision before the computation, but the reverse cast was incorrect. This bug may be compiler specific as running +the index calculation on a separate machine resulted in accurate indexing in this range. Simply changing the code to use +bitshift operations cleared the error entirely. 
+ + + +## Appendix: Build Instructions +**CMakeLists.txt modified to include new sort class and update compute compatibility** + +* `src/` contains the source code. + +**CMake note:** Do not change any build settings or add any files to your +project directly (in Visual Studio, Nsight, etc.) Instead, edit the +`src/CMakeLists.txt` file. Any files you add must be added here. If you edit it, +just rebuild your VS/Nsight project to make it update itself. + +#### Windows + +1. In Git Bash, navigate to your cloned project directory. +2. Create a `build` directory: `mkdir build` + * (This "out-of-source" build makes it easy to delete the `build` directory + and try again if something goes wrong with the configuration.) +3. Navigate into that directory: `cd build` +4. Open the CMake GUI to configure the project: + * `cmake-gui ..` or `"C:\Program Files (x86)\cmake\bin\cmake-gui.exe" ..` + * Don't forget the `..` part! + * Make sure that the "Source" directory is like + `.../Project2-Stream-Compaction`. + * Click *Configure*. Select your version of Visual Studio, Win64. + (**NOTE:** you must use Win64, as we don't provide libraries for Win32.) + * If you see an error like `CUDA_SDK_ROOT_DIR-NOTFOUND`, + set `CUDA_SDK_ROOT_DIR` to your CUDA install path. This will be something + like: `C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v7.5` + * Click *Generate*. +5. If generation was successful, there should now be a Visual Studio solution + (`.sln`) file in the `build` directory that you just created. Open this. + (from the command line: `explorer *.sln`) +6. Build. (Note that there are Debug and Release configuration options.) +7. Run. Make sure you run the `cis565_` target (not `ALL_BUILD`) by + right-clicking it and selecting "Set as StartUp Project". + * If you have switchable graphics (NVIDIA Optimus), you may need to force + your program to run with only the NVIDIA card. 
In NVIDIA Control Panel, + under "Manage 3D Settings," set "Multi-display/Mixed GPU acceleration" + to "Single display performance mode". + +#### OS X & Linux + +It is recommended that you use Nsight. + +1. Open Nsight. Set the workspace to the one *containing* your cloned repo. +2. *File->Import...->General->Existing Projects Into Workspace*. + * Select the Project 0 repository as the *root directory*. +3. Select the *cis565-* project in the Project Explorer. From the *Project* + menu, select *Build All*. + * For later use, note that you can select various Debug and Release build + configurations under *Project->Build Configurations->Set Active...*. +4. If you see an error like `CUDA_SDK_ROOT_DIR-NOTFOUND`: + * In a terminal, navigate to the build directory, then run: `cmake-gui ..` + * Set `CUDA_SDK_ROOT_DIR` to your CUDA install path. + This will be something like: `/usr/local/cuda` + * Click *Configure*, then *Generate*. +5. Right click and *Refresh* the project. +6. From the *Run* menu, *Run*. Select "Local C/C++ Application" and the + `cis565_` binary. 
diff --git a/benchmark data.xlsx b/benchmark data.xlsx new file mode 100644 index 0000000..aa3b627 Binary files /dev/null and b/benchmark data.xlsx differ diff --git a/imgs/compact_benchmarks.PNG b/imgs/compact_benchmarks.PNG new file mode 100644 index 0000000..e759b1a Binary files /dev/null and b/imgs/compact_benchmarks.PNG differ diff --git a/imgs/scan_benchmarks.PNG b/imgs/scan_benchmarks.PNG new file mode 100644 index 0000000..6acebaa Binary files /dev/null and b/imgs/scan_benchmarks.PNG differ diff --git a/imgs/scan_performance.PNG b/imgs/scan_performance.PNG new file mode 100644 index 0000000..8a93fbe Binary files /dev/null and b/imgs/scan_performance.PNG differ diff --git a/imgs/sort_benchmarks.PNG b/imgs/sort_benchmarks.PNG new file mode 100644 index 0000000..6ef2fa2 Binary files /dev/null and b/imgs/sort_benchmarks.PNG differ diff --git a/src/main.cpp b/src/main.cpp index 675da35..5b01d79 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,21 +11,30 @@ #include #include #include +#include +#include +#include #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int POW = 16; + const int SIZE = 1 << POW; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; - // Scan tests + const int KEYSIZE = 16; + int a_sm[8], b_sm[8], c_sm[8]; + genArray(8, a_sm, 1 << KEYSIZE); + + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + // Scan tests +#if 0 printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); @@ -37,14 +46,14 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); - printArray(NPOT, b, true); + //printArray(NPOT, b, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); 
StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); @@ -53,9 +62,9 @@ int main(int argc, char* argv[]) { printCmpResult(NPOT, b, c); zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); + printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); @@ -67,7 +76,7 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); @@ -120,4 +129,112 @@ int main(int argc, char* argv[]) { count = StreamCompaction::Efficient::compact(NPOT, c, a); //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); -} + + printf("\n"); + printf("**********************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("**********************\n"); + + + zeroArray(8, b_sm); + zeroArray(8, c_sm); + printDesc("radix sort, power-of-two"); + printArray(8, a_sm, true); + memcpy(b_sm, a_sm, 8 * sizeof(int)); + StreamCompaction::Sort::radix(8, KEYSIZE, c_sm, a_sm); + thrust::sort(b_sm, b_sm + 8); + printArray(8, c_sm, true); + printCmpResult(8, b_sm, c_sm); + + int a_npot_sm[7]; + zeroArray(8, b_sm); + zeroArray(8, c_sm); + memcpy(a_npot_sm, a_sm, 7 * sizeof(int)); + printDesc("radix sort, non-power-of-two"); + printArray(7, a_npot_sm, true); + memcpy(b_sm, a_npot_sm, 7 * sizeof(int)); + StreamCompaction::Sort::radix(7, KEYSIZE, c_sm, a_npot_sm); + thrust::sort(b_sm, b_sm + 7); + printArray(7, c_sm, true); + printCmpResult(7, b_sm, c_sm); + +#endif +#if 0 + int successes = 0; + int tests = 1000; + for (int n = 0; n < tests; n++){ + zeroArray(SIZE, b); + 
zeroArray(SIZE, c); + genArray(SIZE, a, 1 << POW); // Leave a 0 at the end to test that edge case + memcpy(b, a, SIZE*sizeof(int)); + thrust::sort(b, b + SIZE); + StreamCompaction::Sort::radix(SIZE, KEYSIZE, c, a); + + if (!cmpArrays(SIZE, b, c)) successes++; + } + + printf("\nSort passed %i/%i randomly generated verification tests.\n", successes, tests); + +#endif + +#if 1 + printf("\n"); + printf("**********************\n"); + printf("** TIMING TESTS **\n"); + printf("**********************\n"); + +#define ITER 1 << i + + zeroArray(SIZE, c); + printDesc("naive scan, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::Naive::TestScan(ITER, c, a); + + zeroArray(SIZE, c); + printDesc("efficient scan, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::Efficient::TestScan(ITER, c, a); + + + zeroArray(SIZE, c); + printDesc("Thrust scan, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::Thrust::TestScan(ITER, c, a); + + + + + zeroArray(SIZE, c); + printDesc("CPU scan, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::CPU::TestScan(ITER, c, a); + + + zeroArray(SIZE, c); + printDesc("CPU compact, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::CPU::TestCompact(ITER, c, a); + + zeroArray(SIZE, c); + printDesc("CPU compact without scan, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::CPU::TestCompactWithoutScan(ITER, c, a); + + + zeroArray(SIZE, c); + printDesc("efficient compact, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::Efficient::TestCompact(ITER, c, a); + + + + zeroArray(SIZE, c); + printDesc("Thrust stable sort, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::Thrust::TestSortStable(ITER, c, a); + + + zeroArray(SIZE, c); + printDesc("Thrust unstable sort, power-of-two"); + for (int i = 1; i <= POW; i++) StreamCompaction::Thrust::TestSortUnstable(ITER, c, a); + + + zeroArray(SIZE, c); + printDesc("Radix sort, power-of-two"); + 
for (int i = 1; i <= POW; i++) StreamCompaction::Sort::TestSort(ITER, KEYSIZE, c, a); + +#endif + +} \ No newline at end of file diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..cb99a37 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -7,11 +7,13 @@ set(SOURCE_FILES "naive.cu" "efficient.h" "efficient.cu" + "sort.h" + "sort.cu" "thrust.h" "thrust.cu" ) cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_50 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..1b5df61 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,5 +1,6 @@ #include "common.h" + void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); if (cudaSuccess == err) { @@ -23,7 +24,12 @@ namespace Common { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + bools[index] = !(idata[index] == 0); } /** @@ -32,8 +38,17 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + if (bools == NULL) odata[indices[index]] = idata[index]; + else if (bools[index] == 1) odata[indices[index]] = idata[index]; } + + } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..e516c4a 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -3,10 +3,14 @@ #include #include #include +#include +#include #define FILENAME (strrchr(__FILE__, '/') ? 
strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define blockSize 128 + /** * Check for CUDA errors; print and exit if there was a problem. */ @@ -32,4 +36,4 @@ namespace Common { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices); } -} +} \ No newline at end of file diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..0c53b61 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,4 +1,5 @@ #include +#include "common.h" #include "cpu.h" namespace StreamCompaction { @@ -8,8 +9,20 @@ namespace CPU { * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + for (int i = 0; i < n-1; i++) { + odata[i + 1] = odata[i] + idata[i]; + } +} + +/** +* CPU scatter. +*/ +void scatter(int n, int *odata, + const int *idata, const int *bools, const int *indices) { + + for (int i = 0; i < n - 1; i++) { + if (bools[i] == 1) odata[indices[i]] = idata[i]; + } } /** @@ -18,8 +31,27 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; + int r = 0; + + for (int i = 0; i < n-1; i++){ + if (idata[i] != 0) { + odata[r++] = idata[i]; + } + } + + return r; +} + +void printArray(int n, int *a, bool abridged = false) { + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... "); + } + printf("%3d ", a[i]); + } + printf("]\n"); } /** @@ -27,9 +59,86 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * * @returns the number of elements remaining after compaction. 
*/ -int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; +int compactWithScan(const int n, int *odata, const int *idata) { + + // create arrays + int *indices = new int[n]; + int *bools = new int[n]; + int rtn = -1; + indices[0] = 0; + + for (int i = 0; i < n; i++) { + bools[i] = !(idata[i] == 0); + } + + scan(n, indices, bools); + scatter(n, odata, idata, bools, indices); + rtn = indices[n - 1]; + + delete indices; + delete bools; + + return rtn; +} + +void TestScan(int n, int *odata, const int *idata) { + + double time = 0; + int samp = 1000; + + for (int i = 0; i < samp; i++) { + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + scan(n, odata, idata); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + + time += elapsed_seconds.count() * 1000 / samp; + } + printf(" %f\n", time); + +} + +void TestCompact(int n, int *odata, const int *idata) { + + double time = 0; + int samp = 1000; + + for (int i = 0; i < samp; i++) { + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + compactWithScan(n, odata, idata); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + + time += elapsed_seconds.count() * 1000 / samp; + } + printf(" %f\n", time); + +} + +void TestCompactWithoutScan(int n, int *odata, const int *idata) { + + double time = 0; + int samp = 1000; + + for (int i = 0; i < samp; i++) { + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + compactWithoutScan(n, odata, idata); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + + time += elapsed_seconds.count() * 1000 / samp; + } + printf(" %f\n", time); + } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 6348bf3..f6d8cc3 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -7,5 +7,10 @@ namespace CPU { int 
compactWithoutScan(int n, int *odata, const int *idata); int compactWithScan(int n, int *odata, const int *idata); + + + void TestScan(int n, int *odata, const int *idata); + void TestCompact(int n, int *odata, const int *idata); + void TestCompactWithoutScan(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..cff875b 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -6,14 +6,96 @@ namespace StreamCompaction { namespace Efficient { -// TODO: __global__ +void printArray(int n, int *a, bool abridged = false) { + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... "); + } + printf("%3d ", a[i]); + } + printf("]\n"); +} + +__global__ void kernUpStep(int n, int d, int *data) { + + int s = 1 << (d + 1); + int index = (threadIdx.x + (blockIdx.x * blockDim.x)) *s; + + if (index >= n) { + return; + } + + //if (fmod((double) index+1, (double)s) == 0) { + data[index + s - 1] += data[index + s / 2 - 1]; + //} +} + +__global__ void kernDownStep(int n, int d, int *data) { + + int s = 1 << (d + 1); + int index = (threadIdx.x + (blockIdx.x * blockDim.x)) * s; + + if (index >= n) { + return; + } + + + //if (fmod((double) index, (double)s) == 0) { + int t = data[index + s / 2 - 1]; + data[index + s / 2 - 1] = data[index + s - 1]; + data[index + s - 1] += t; + //} +} /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ +* Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+*/ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + // create device arrays + int *dev_out; + + cudaMalloc((void**)&dev_out, n*sizeof(int)); + cudaMemcpy(dev_out, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + scan_dev(n, dev_out); + + cudaMemcpy(odata, dev_out, n*sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_out); +} + +/** +* Performs prefix-sum (aka scan) on idata, storing the result into odata. +* For use with arrays intiialized on GPU already. +*/ +void scan_dev(int n, int *dev_in) { + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + // create device arrays to pad to power of 2 size array + //int pot = pow(2, ilog2ceil(n)); + int pot = 1 << ilog2ceil(n); + int *dev_data; + cudaMalloc((void**)&dev_data, pot*sizeof(int)); + cudaMemset(dev_data, 0, pot*sizeof(int)); + cudaMemcpy(dev_data, dev_in, n*sizeof(int), cudaMemcpyDeviceToDevice); + + + int d = 0; + for (d; d < ilog2ceil(pot); d++) { + fullBlocksPerGrid.x = ((pot / pow(2, d+1) + blockSize - 1) / blockSize); + kernUpStep << < fullBlocksPerGrid, blockSize >> >(pot, d, dev_data); + } + + cudaMemset(&dev_data[pot - 1], 0, sizeof(int)); + for (d = ilog2ceil(pot); d >= 0; d--) { + fullBlocksPerGrid.x = ((pot / pow(2, d+1) + blockSize - 1) / blockSize); + kernDownStep << < fullBlocksPerGrid, blockSize >> >(pot, d, dev_data); + } + + + cudaMemcpy(dev_in, dev_data, n*sizeof(int), cudaMemcpyDeviceToDevice); + cudaFree(dev_data); } /** @@ -26,8 +108,137 @@ void scan(int n, int *odata, const int *idata) { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { - // TODO - return -1; + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + // create device arrays + int *dev_out; + int *dev_in; + int *dev_indices; + int *dev_bools; + int rtn = -1; + + cudaMalloc((void**)&dev_out, n*sizeof(int)); + cudaMalloc((void**)&dev_in, n*sizeof(int)); + cudaMalloc((void**)&dev_indices, n*sizeof(int)); + cudaMalloc((void**)&dev_bools, n*sizeof(int)); + + + cudaMemcpy(dev_in, idata, n*sizeof(int), cudaMemcpyHostToDevice); + StreamCompaction::Common::kernMapToBoolean << < fullBlocksPerGrid, blockSize >> >(n, dev_bools, dev_in); + + // scan without wasteful device-host-device write + cudaMemcpy(dev_indices, dev_bools, n*sizeof(int), cudaMemcpyDeviceToDevice); + scan_dev(n, dev_indices); + + // scatter + StreamCompaction::Common::kernScatter << < fullBlocksPerGrid, blockSize >> >(n, dev_out, dev_in, dev_bools, dev_indices); + + cudaMemcpy(odata, dev_out, n*sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&rtn, &dev_indices[n-1], sizeof(int), cudaMemcpyDeviceToHost); + + + cudaFree(dev_out); + cudaFree(dev_in); + cudaFree(dev_bools); + cudaFree(dev_indices); + + return rtn; +} + + +int compact_dev(int n, int *dev_out, const int *dev_in) { + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + // create device arrays + int *dev_indices; + int *dev_bools; + int rtn = -1; + + cudaMalloc((void**)&dev_indices, n*sizeof(int)); + cudaMalloc((void**)&dev_bools, n*sizeof(int)); + + StreamCompaction::Common::kernMapToBoolean << < fullBlocksPerGrid, blockSize >> >(n, dev_bools, dev_in); + + // scan without wasteful device-host-device write + cudaMemcpy(dev_indices, dev_bools, n*sizeof(int), cudaMemcpyDeviceToDevice); + scan_dev(n, dev_indices); + + // scatter + StreamCompaction::Common::kernScatter << < fullBlocksPerGrid, blockSize >> >(n, dev_out, dev_in, dev_bools, dev_indices); + + cudaMemcpy(&rtn, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + + 
cudaFree(dev_bools); + cudaFree(dev_indices); + + return rtn; +} + + +void TestScan(int n, int *odata, const int *idata) { + + double time = 0; + int samp = 1000; + + int *dev_out; + + cudaMalloc((void**)&dev_out, n*sizeof(int)); + cudaMemcpy(dev_out, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + + for (int i = 0; i < samp; i++) { + cudaMemcpy(dev_out, idata, n*sizeof(int), cudaMemcpyHostToDevice); + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + scan_dev(n, dev_out); + + cudaThreadSynchronize(); // block until kernel is finished + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + + time += elapsed_seconds.count() * 1000 / samp; + } + + cudaMemcpy(odata, dev_out, n*sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_out); + printf(" %f\n", time); + +} + +void TestCompact(int n, int *odata, const int *idata) { + + double time = 0; + int samp = 1000; + + int *dev_out; + int *dev_in; + cudaMalloc((void**)&dev_out, n*sizeof(int)); + cudaMalloc((void**)&dev_in, n*sizeof(int)); + + for (int i = 0; i < samp; i++) { + cudaMemcpy(dev_in, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + compact_dev(n, odata, idata); + cudaThreadSynchronize(); // block until kernel is finished + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + + time += elapsed_seconds.count() * 1000 / samp; + } + + cudaMemcpy(odata, dev_out, n*sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_out); + cudaFree(dev_in); + + printf(" %f\n", time); + } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 395ba10..a92f1d9 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,8 +2,13 @@ namespace StreamCompaction { namespace Efficient { - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int 
*idata); + void scan_dev(int n, int *dev_data); - int compact(int n, int *odata, const int *idata); + int compact(int n, int *odata, const int *idata); + int compact_dev(int n, int *dev_out, const int *dev_in); + + void TestScan(int n, int *odata, const int *idata); + void TestCompact(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..530ebcd 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -6,15 +6,93 @@ namespace StreamCompaction { namespace Naive { -// TODO: __global__ +__global__ void kernScanStep(int n, int d, int *odata, const int *idata) { + + int s = 1 << (d - 1); + int index = threadIdx.x + (blockIdx.x * blockDim.x) + s; + + if (index >= n) { + return; + } + + + //if (index >= s) { + odata[index] = idata[index] + idata[index - s]; + //} +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + + // create device arrays + int *dev_in; + int *dev_out; + + cudaMalloc((void**)&dev_in, n*sizeof(int)); + cudaMalloc((void**)&dev_out, n*sizeof(int)); + + cudaMemcpy(dev_in, idata, n*sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_out, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + scan_dev(n, dev_out, dev_in); + + cudaMemcpy(&odata[1], dev_out, (n - 1)*sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_in); + cudaFree(dev_out); +} + +void scan_dev(int n, int *dev_out, int *dev_in) { + dim3 fullBlocksPerGrid; + + for (int d = 1; d <= ilog2ceil(n); d++) { + int s = 1 << (d - 1); + fullBlocksPerGrid = (n - s + blockSize - 1) / blockSize; + + kernScanStep << < fullBlocksPerGrid, blockSize >> >(n, d, dev_out, dev_in); + cudaMemcpy(dev_in, dev_out, n*sizeof(int), cudaMemcpyDeviceToDevice); + } +} + +void TestScan(int n, int *odata, const int *idata) { + + double time = 0; + int samp = 1000; + + // create device arrays + int *dev_in; + int *dev_out; + 
+ cudaMalloc((void**)&dev_in, n*sizeof(int)); + cudaMalloc((void**)&dev_out, n*sizeof(int)); + + cudaMemcpy(dev_in, idata, n*sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_out, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + for (int i = 0; i < samp; i++) { + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + scan_dev(n, dev_out, dev_in); + cudaThreadSynchronize(); // block until kernel is finished + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + + time += elapsed_seconds.count() * 1000 / samp; + } + + cudaMemcpy(&odata[1], dev_out, (n - 1)*sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_in); + cudaFree(dev_out); + + printf(" %f\n", time ); + } } } + diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 21152d6..4e4614f 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -2,6 +2,9 @@ namespace StreamCompaction { namespace Naive { - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata); + void scan_dev(int n, int *dev_out, int *dev_in); + + void TestScan(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/sort.cu b/stream_compaction/sort.cu new file mode 100644 index 0000000..f780d05 --- /dev/null +++ b/stream_compaction/sort.cu @@ -0,0 +1,173 @@ +#include +#include +#include "common.h" +#include "efficient.h" +#include "naive.h" +#include "sort.h" + +namespace StreamCompaction { +namespace Sort { + +void printArray(int n, int *a, bool abridged = false) { + printf(" [ "); + for (int i = 0; i < n; i++) { + if (abridged && i + 2 == 15 && n > 16) { + i = n - 2; + printf("... 
"); + } + printf("%3d ", a[i]); + } + printf("]\n"); +} + + +__global__ void kernGetBit(int n, int d, int *bits, int *nbits, const int *data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + int s = (data[index] & (1 << d)) >> d; + + bits[index] = s; + nbits[index] = 1 - s; +} + + +__global__ void kernScanTrue(int n, int *trues, const int *falses, const int *lastBit) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + trues[index] = index - falses[index] + falses[n - 1] + lastBit[n - 1]; +} + +__global__ void kernDestination(int n, int *dest, int *trues, int *falses, int* bits) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + dest[index] = bits[index] ? trues[index] : falses[index]; +} + + +void radix(int n, const int k, int *odata, const int *idata) { + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + // create device arrays + int *dev_out; + int *dev_in; + int *dev_bits; + int *dev_nbits; + int *dev_true; + int *dev_false; + int *dev_dest; + int *dev_all; + + cudaMalloc((void**)&dev_out, n*sizeof(int)); + cudaMalloc((void**)&dev_in, n*sizeof(int)); + cudaMalloc((void**)&dev_bits, n*sizeof(int)); + cudaMalloc((void**)&dev_nbits, n*sizeof(int)); + cudaMalloc((void**)&dev_true, n*sizeof(int)); + cudaMalloc((void**)&dev_false, n*sizeof(int)); + cudaMalloc((void**)&dev_dest, n*sizeof(int)); + cudaMalloc((void**)&dev_all, n*sizeof(int)); + + cudaMemset(dev_all, 0, n); + cudaMemcpy(dev_in, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + for (int d = 0; d < k; d++) { + kernGetBit << > >(n, d, dev_bits, dev_nbits, dev_in); + cudaMemcpy(dev_false, dev_nbits, n*sizeof(int), cudaMemcpyDeviceToDevice); + StreamCompaction::Efficient::scan_dev(n, dev_false); + kernScanTrue << > >(n, dev_true, dev_false, dev_nbits); + kernDestination << > >(n, dev_dest, dev_true, dev_false, dev_bits); + StreamCompaction::Common::kernScatter << > 
>(n, dev_out, dev_in, NULL, dev_dest); + cudaMemcpy(dev_in, dev_out, n*sizeof(int), cudaMemcpyDeviceToDevice); + } + + cudaMemcpy(odata, dev_out, n*sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_out); + cudaFree(dev_in); + cudaFree(dev_bits); + cudaFree(dev_nbits); + cudaFree(dev_true); + cudaFree(dev_false); + cudaFree(dev_dest); + cudaFree(dev_all); +} + + +void radix_dev(int n, const int k, int *dev_out, int *dev_in) { + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + // create device arrays + int *dev_bits; + int *dev_nbits; + int *dev_true; + int *dev_false; + int *dev_dest; + int *dev_all; + + cudaMalloc((void**)&dev_bits, n*sizeof(int)); + cudaMalloc((void**)&dev_nbits, n*sizeof(int)); + cudaMalloc((void**)&dev_true, n*sizeof(int)); + cudaMalloc((void**)&dev_false, n*sizeof(int)); + cudaMalloc((void**)&dev_dest, n*sizeof(int)); + cudaMalloc((void**)&dev_all, n*sizeof(int)); + + cudaMemset(dev_all, 0, n); + + for (int d = 0; d < k; d++) { + kernGetBit << > >(n, d, dev_bits, dev_nbits, dev_in); + cudaMemcpy(dev_false, dev_nbits, n*sizeof(int), cudaMemcpyDeviceToDevice); + StreamCompaction::Efficient::scan_dev(n, dev_false); + kernScanTrue << > >(n, dev_true, dev_false, dev_nbits); + kernDestination << > >(n, dev_dest, dev_true, dev_false, dev_bits); + StreamCompaction::Common::kernScatter << > >(n, dev_out, dev_in, NULL, dev_dest); + cudaMemcpy(dev_in, dev_out, n*sizeof(int), cudaMemcpyDeviceToDevice); + } + + cudaFree(dev_bits); + cudaFree(dev_nbits); + cudaFree(dev_true); + cudaFree(dev_false); + cudaFree(dev_dest); + cudaFree(dev_all); +} + + +void TestSort(int n, const int k, int *odata, const int *idata) { + + double time = 0; + int samp = 1000; + + int *dev_out; + int *dev_in; + cudaMalloc((void**)&dev_out, n*sizeof(int)); + cudaMalloc((void**)&dev_in, n*sizeof(int)); + cudaMemcpy(dev_in, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + for (int i = 0; i < samp; i++) { + std::chrono::time_point start, end; + start = 
std::chrono::system_clock::now();
+
+        radix_dev(n, k, dev_out, dev_in);
+        cudaDeviceSynchronize(); // block until kernel is finished
+
+        end = std::chrono::system_clock::now();
+        std::chrono::duration<double> elapsed_seconds = end - start;
+
+        time += elapsed_seconds.count() * 1000 / samp;
+    }
+
+    cudaMemcpy(odata, dev_out, n*sizeof(int), cudaMemcpyDeviceToHost);
+    printf("   %f\n", time);
+
+    // NOTE(review): added — the original leaked both device buffers
+    cudaFree(dev_out);
+    cudaFree(dev_in);
+}
+
+}
+}
\ No newline at end of file
diff --git a/stream_compaction/sort.h b/stream_compaction/sort.h
new file mode 100644
index 0000000..ec3bd9f
--- /dev/null
+++ b/stream_compaction/sort.h
@@ -0,0 +1,10 @@
+#pragma once
+
+namespace StreamCompaction {
+    namespace Sort {
+        void radix(int n, const int k, int *odata, const int *idata);
+        void radix_dev(int n, const int k, int *dev_out, int *dev_in);
+
+        void TestSort(int n, const int k, int *odata, const int *idata);
+    }
+}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..721d425 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -3,6 +3,7 @@
 #include <cuda.h>
 #include <thrust/device_vector.h>
 #include <thrust/scan.h>
+#include <chrono>  // NOTE(review): header names were lost in formatting; inferred from usage — confirm against repo
 #include "common.h"
 #include "thrust.h"
@@ -13,9 +14,85 @@
 namespace Thrust {
 
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO use `thrust::exclusive_scan`
-    // example: for device_vectors dv_in and dv_out:
-    // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+    thrust::device_vector<int> dv_in(idata, idata + n);
+    thrust::device_vector<int> dv_out(odata, odata + n);
+
+    thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+    thrust::copy(dv_out.begin(), dv_out.end(), odata);
+}
+
+// Timing harness: averages `samp` runs of thrust::exclusive_scan (ms).
+void TestScan(int n, int *odata, const int *idata) {
+
+    double time = 0;
+    int samp = 1000;
+
+    thrust::device_vector<int> dv_in(idata, idata + n);
+    thrust::device_vector<int> dv_out(odata, odata + n);
+
+    for (int i = 0; i < samp; i++) {
+
+        std::chrono::time_point<std::chrono::system_clock> start, end;
+        start = std::chrono::system_clock::now();
+
+        thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+        cudaDeviceSynchronize(); // block until kernel is finished
+
+        end = std::chrono::system_clock::now();
+        std::chrono::duration<double> elapsed_seconds = end - start;
+
+        time += elapsed_seconds.count() * 1000 / samp;
+    }
+
+    thrust::copy(dv_out.begin(), dv_out.end(), odata);
+    printf("   %f\n", time);
+
+}
+
+// Timing harness for thrust::stable_sort on host data (ms, averaged).
+void TestSortStable(int n, int *odata, const int *idata) {
+
+    double time = 0;
+    int samp = 1000;
+
+    for (int i = 0; i < samp; i++) {
+        memcpy(odata, idata, n*sizeof(int)); // re-shuffle input each pass
+
+        std::chrono::time_point<std::chrono::system_clock> start, end;
+        start = std::chrono::system_clock::now();
+
+        thrust::stable_sort(odata, odata + n);
+        cudaDeviceSynchronize(); // block until kernel is finished
+
+        end = std::chrono::system_clock::now();
+        std::chrono::duration<double> elapsed_seconds = end - start;
+
+        time += elapsed_seconds.count() * 1000 / samp;
+    }
+    printf("   %f\n", time);
+}
+
+// Timing harness for thrust::sort on host data (ms, averaged).
+void TestSortUnstable(int n, int *odata, const int *idata) {
+
+    double time = 0;
+    int samp = 1000;
+
+    for (int i = 0; i < samp; i++) {
+        memcpy(odata, idata, n*sizeof(int)); // re-shuffle input each pass
+
+        std::chrono::time_point<std::chrono::system_clock> start, end;
+        start = std::chrono::system_clock::now();
+
+        thrust::sort(odata, odata + n);
+        cudaDeviceSynchronize(); // block until kernel is finished
+
+        end = std::chrono::system_clock::now();
+        std::chrono::duration<double> elapsed_seconds = end - start;
+
+        time += elapsed_seconds.count() * 1000 / samp;
+    }
+    printf("   %f\n", time);
+}
 
 }
 }
diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h
index 06707f3..5790b94 100644
--- a/stream_compaction/thrust.h
+++ b/stream_compaction/thrust.h
@@ -2,6 +2,11 @@
 
 namespace StreamCompaction {
 namespace Thrust {
-    void scan(int n, int *odata, const int *idata);
+    void scan(int n, int *odata, const int *idata);
+
+    void TestScan(int n, int *odata, const int *idata);
+    void TestSortStable(int n, int *odata, const int *idata);
+    void TestSortUnstable(int n, int *odata, const int *idata);
+
 }
}