diff --git a/README.md b/README.md index b71c458..eaecfcc 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,107 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Mariano Merchante +* Tested on + * Microsoft Windows 10 Pro + * Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz, 2601 MHz, 4 Core(s), 8 Logical Processor(s) + * 32.0 GB RAM + * NVIDIA GeForce GTX 1070 (mobile version) -### (TODO: Your README) +## Details -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +This project implements and compares different algorithms for scan and stream compaction, both in GPU and CPU. In particular, it implements a serial version, a naive parallel version, a work efficient version and an implementation from the Thrust library. A simple CSV exporter for automated performance evaluation is also included. +## Note about Work Efficient implementation + +My implementation of the Work-Efficient approach tries to reduce the amount of threads dispatched to do both the up and down sweep, and thus it performs better than the naive approach. This happens because occupancy is maximized and only threads that do useful work are running. + +## Analysis + +It is interesting to analyze the results both in debug and release mode, as some interesting behaviours arise. In general, CPU < Naive < Work-Efficient, but the Thrust library implementation radically changes between the two. + +![](images/debug_pot.png) +![](images/release_pot.png) +![](images/debug_vs_release.png) + +The relative performances seem to be uniformly correlated, except for Thrust. I suspect this happens because the library does some precomputation and thus becomes faster after some iterations, as the next graph exemplifies. 
It is also probably faster than my work efficient implementation, as I didn't do any shared memory optimizations. A clear drop in execution time can be seen after the first Thrust run. + +![](images/iterations.png) + +An example of running the algorithms with an array of the same size. Note how Thrust drops and performs better with time. + +It is also useful to analyze the relationship between using power of two arrays or non power of two. Note that the Thrust pattern still emerges. Although the ratio is close to 1, the non power of two case definitely uses more memory than necessary and is going to impact the overall application. + +![](images/debug_npot.png) +![](images/release_npot.png) +![](images/pot_vs_npot.png) + + +An example run can be seen here: +``` +SIZE: 4194304 +**************** +** SCAN TESTS ** +**************** + [ 35 16 14 26 19 39 4 39 24 42 27 45 46 ... 11 0 ] +==== cpu scan, results test ==== +==== PASS: YES ==== + [ 0 1 6 6 7 9 9 ] + [ 0 1 6 6 7 9 9 ] +==== cpu scan, power-of-two ==== + elapsed time: 8.31961ms (std::chrono Measured) + [ 0 35 51 65 91 110 149 153 192 216 258 285 330 ... 102731641 102731652 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 8.75102ms (std::chrono Measured) + [ 0 35 51 65 91 110 149 153 192 216 258 285 330 ... 
102731502 102731543 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 4.20352ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 4.20454ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 2.05008ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 1.88826ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 7.67795ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 1.47558ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 3 3 0 1 1 3 2 2 0 1 3 3 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== +==== PASS: YES ==== + [ 1 5 1 2 3 ] + [ 1 5 1 2 3 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 7.46312ms (std::chrono Measured) + [ 3 3 1 1 3 2 2 1 3 3 2 3 2 ... 1 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 4.41561ms (std::chrono Measured) + [ 3 3 1 1 3 2 2 1 3 3 2 3 2 ... 3 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 25.8876ms (std::chrono Measured) + [ 3 3 1 1 3 2 2 1 3 3 2 3 2 ... 1 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 2.75443ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 2.47562ms (CUDA Measured) + passed +Press any key to continue . . . 
+ + + +``` diff --git a/images/debug_npot.png b/images/debug_npot.png new file mode 100644 index 0000000..6dcf0cf Binary files /dev/null and b/images/debug_npot.png differ diff --git a/images/debug_pot.png b/images/debug_pot.png new file mode 100644 index 0000000..c9ae31b Binary files /dev/null and b/images/debug_pot.png differ diff --git a/images/debug_vs_release.png b/images/debug_vs_release.png new file mode 100644 index 0000000..0fc1398 Binary files /dev/null and b/images/debug_vs_release.png differ diff --git a/images/iterations.png b/images/iterations.png new file mode 100644 index 0000000..6d17450 Binary files /dev/null and b/images/iterations.png differ diff --git a/images/pot_vs_npot.png b/images/pot_vs_npot.png new file mode 100644 index 0000000..36c55ff Binary files /dev/null and b/images/pot_vs_npot.png differ diff --git a/images/release_npot.png b/images/release_npot.png new file mode 100644 index 0000000..ee8bedc Binary files /dev/null and b/images/release_npot.png differ diff --git a/images/release_pot.png b/images/release_pot.png new file mode 100644 index 0000000..1573a53 Binary files /dev/null and b/images/release_pot.png differ diff --git a/src/main.cpp b/src/main.cpp index 7305641..733a594 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -6,6 +6,10 @@ * @copyright University of Pennsylvania */ +#include +#include +#include +#include #include #include #include @@ -13,13 +17,21 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two +const int SIZE = 1 << 22; // feel free to change the size of array +const int NPOT = SIZE - 5; // Non-Power-Of-Two int a[SIZE], b[SIZE], c[SIZE]; +int testInput[] = { 1, 5, 0, 1, 2, 0, 3 }; +int testOutput[] = { 0, 1, 6, 6, 7, 9, 9 }; + +int testCompactionInput[] = { 1, 5, 0, 1, 2, 0, 3 }; +int testCompactionOutput[] = { 1, 5, 1, 2, 3 }; + int main(int argc, char* argv[]) { - // Scan tests + printf("SIZE: %d", SIZE); + 
+ // Scan tests printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); @@ -29,6 +41,18 @@ int main(int argc, char* argv[]) { a[SIZE - 1] = 0; printArray(SIZE, a, true); + // Test example + zeroArray(7, b); + printDesc("cpu scan, results test"); + StreamCompaction::CPU::scan(7, b, testInput); + bool pass = true; + for (int i = 0; i < 7; ++i) + if (testOutput[i] != b[i]) + pass = false; + printDesc((std::string("PASS: ") + (pass ? "YES": "NO")).c_str()); + printArray(7, testOutput, true); + printArray(7, b, true); + // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. // At first all cases passed because b && c are all zeroes. @@ -100,6 +124,16 @@ int main(int argc, char* argv[]) { int count, expectedCount, expectedNPOT; + // Test results + zeroArray(SIZE, b); + printDesc("cpu compact without scan, power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(7, b, testCompactionInput); + pass = cmpArrays(5, testCompactionOutput, b) == 0; + printDesc((std::string("PASS: ") + (pass ? "YES" : "NO")).c_str()); + expectedCount = count; + printArray(count, b, true); + printArray(5, testCompactionOutput, true); + // initialize b using StreamCompaction::CPU::compactWithoutScan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. zeroArray(SIZE, b); @@ -139,5 +173,72 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + bool generateCSV = false; + + if (generateCSV) + { + bool useNPOT = true; + int steps = 22; + std::vector> timeData; + + for (int i = 1; i < steps + 1; ++i) + { + int size = (1 << i); + + if (useNPOT) + size = (size - 3 > 0) ? 
size - 3 : size; + + int * data = new int[size]; + int * result = new int[size]; + genArray(size, data, i * 5); + zeroArray(size, data); + + std::vector stepData; + stepData.push_back(size); + + // Run each implementation -- we don't care about the results (the previous tests cover that) + { + zeroArray(size, result); + StreamCompaction::CPU::scan(size, result, data); + stepData.push_back(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation()); + + zeroArray(size, result); + StreamCompaction::Naive::scan(size, result, data); + stepData.push_back(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation()); + + zeroArray(size, result); + StreamCompaction::Efficient::scan(size, result, data); + stepData.push_back(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation()); + + zeroArray(size, result); + StreamCompaction::Thrust::scan(size, result, data); + stepData.push_back(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation()); + } + + timeData.push_back(stepData); + + delete[] data; + delete[] result; + } + + std::ofstream fstr; + fstr.open("data.csv", std::ofstream::out); + + for (int i = 0; i < timeData.size(); ++i) + { + std::string line = ""; + + for (int j = 0; j < timeData[i].size(); ++j) + line += std::to_string(timeData[i][j]) + ", "; // Parsers remove this + + line += "\n"; + std::cout << line << std::endl; + fstr.write(line.c_str(), line.length()); + } + + fstr.close(); + } + + system("pause"); // stop Win32 console from closing on exit } diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index ae94ca6..75bb1c9 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -44,13 +44,21 @@ void zeroArray(int n, int *a) { } void genArray(int n, int *a, int maxval) { - srand(time(nullptr)); + srand(time(nullptr)); for (int i = 0; i < n; i++) { a[i] = rand() % maxval; } } +void genArrayNonZero(int n, int *a, int maxval) { + srand(time(nullptr)); + + for (int i = 0; i < n; 
i++) { + a[i] = (rand() % maxval) + 1; + } +} + void printArray(int n, int *a, bool abridged = false) { printf(" [ "); for (int i = 0; i < n; i++) { diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..e018d6a 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,4 +1,5 @@ #include "common.h" +#include void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); @@ -22,17 +23,32 @@ namespace StreamCompaction { * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. */ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + __global__ void kernMapToBoolean(int n, int *bools, const int *idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index >= n) + return; + + bools[index] = (idata[index] != 0 ? 1 : 0); } /** * Performs scatter on an array. That is, for each element in idata, * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. 
*/ - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO + __global__ void kernScatter(int n, int *odata, const int *idata, const int *indices) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index >= n) + return; + + if (idata[index] != 0) + { + int b = indices[index]; // For some reason, odata[indices[index]] was having race conditions when indices matched + odata[b] = idata[index]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 55f1b38..413a896 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -34,8 +34,7 @@ namespace StreamCompaction { namespace Common { __global__ void kernMapToBoolean(int n, int *bools, const int *idata); - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices); + __global__ void kernScatter(int n, int *odata, const int *idata, const int *indices); /** * This class is used for timing the performance diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..add3f1e 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -17,9 +17,24 @@ namespace StreamCompaction { * For performance analysis, this is supposed to be a simple for loop. * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. 
*/ - void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO + void scan_implementation(int n, int *odata, const int *idata) { + if (n == 0) + return; + + // The idea here is to be able to call scan in-place + int prev = idata[0]; + odata[0] = 0; + for (int i = 1; i < n; ++i) + { + int tmp = idata[i]; + odata[i] = prev + odata[i - 1]; + prev = tmp; + } + } + + void scan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + scan_implementation(n, odata, idata); timer().endCpuTimer(); } @@ -28,11 +43,19 @@ namespace StreamCompaction { * * @returns the number of elements remaining after compaction. */ - int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO + int compactWithoutScan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + + int sum = 0; + for (int i = 0; i < n; ++i) + { + int tmp = idata[i] != 0 ? 1 : 0; + odata[sum] = tmp * idata[i]; + sum += tmp; + } + timer().endCpuTimer(); - return -1; + return sum; } /** @@ -40,11 +63,23 @@ namespace StreamCompaction { * * @returns the number of elements remaining after compaction. */ - int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO + int compactWithScan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + + for (int i = 0; i < n; ++i) + odata[i] = (idata[i] != 0 ? 
1 : 0); + + // No malloc + scan_implementation(n, odata, odata); + + // Scatter + int sum = odata[n - 1]; + for (int i = 0; i < n; ++i) + if (idata[i] != 0) + odata[odata[i]] = idata[i]; + timer().endCpuTimer(); - return -1; + return sum; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..d97d0e7 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -1,5 +1,6 @@ #include #include +#include #include "common.h" #include "efficient.h" @@ -12,15 +13,123 @@ namespace StreamCompaction { return timer; } + #define blockSize 128 + + __global__ void kernUpSweep(int N, int stride, int halfStride, int * data) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index >= N) + return; + + index = (index + 1) * stride - 1; + data[index] += data[index - halfStride]; + } + + __global__ void kernDownSweepFirst(int N, int stride, int halfStride, int * data) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index >= N) + return; + + index = (index + 1) * stride - 1; + int tmp = data[index - halfStride]; + + // Swap + data[index - halfStride] = 0; + + // Add, replace + data[index] = tmp; + } + + __global__ void kernDownSweep(int N, int stride, int halfStride, int * data) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index >= N) + return; + + index = (index + 1) * stride - 1; + + int value = data[index]; + int tmp = data[index - halfStride]; + + // Swap + data[index - halfStride] = value; + + // Add, replace + data[index] = value + tmp; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + + void scan(int n, int *dev_data) + { + int passes = ilog2ceil(n); + for (int d = 0; d < passes; ++d) + { + int stride = pow(2, d + 1); + int halfStride = stride / 2; + int sliceElements = n / stride; + + //printf("%d, %d, %d \n", sliceElements, stride, halfStride); + dim3 blocks((sliceElements + blockSize - 1) / blockSize); + + kernUpSweep << > > (sliceElements, stride, halfStride, dev_data); + checkCUDAErrorFn("kernUpSweep failed!"); + } + + for (int d = passes - 1; d >= 0; --d) + { + int stride = pow(2, d + 1); + int halfStride = stride / 2; + int sliceElements = n / stride; + + //printf("%d, %d, %d \n", sliceElements, stride, halfStride); + dim3 blocks((sliceElements + blockSize - 1) / blockSize); + + if (d == passes - 1) + { + kernDownSweepFirst << > > (sliceElements, stride, halfStride, dev_data); + checkCUDAErrorFn("kernDownSweepFirst failed!"); + } + else + { + kernDownSweep << > > (sliceElements, stride, halfStride, dev_data); + checkCUDAErrorFn("kernDownSweep failed!"); + } + } } + void scan(int n, int *odata, const int *idata) + { + int * dev_data; + int passes = ilog2ceil(n); + int squareN = pow(2, passes); + + //printf("%d vs %d\n", n, squareN); + + cudaMalloc((void**)&dev_data, squareN * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_data failed!"); + + // calloc + cudaMemset(dev_data, 0, squareN * sizeof(int)); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAErrorFn("cudaMemcpy dev_data failed!"); + + timer().startGpuTimer(); + + scan(squareN, dev_data); + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAErrorFn("cudaMemcpy dev_data failed!"); + cudaFree(dev_data); + } + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. 
@@ -30,11 +139,54 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ - int compact(int n, int *odata, const int *idata) { + int compact(int n, int *odata, const int *idata) + { + int * dev_data; + int * dev_booleans; + int * dev_data_output; + int passes = ilog2ceil(n); + int squareN = pow(2, passes); + + cudaMalloc((void**)&dev_data_output, squareN * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_data failed!"); + + cudaMalloc((void**)&dev_booleans, squareN * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_data failed!"); + + cudaMalloc((void**)&dev_data, squareN * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_data failed!"); + + // calloc + cudaMemset(dev_data_output, 0, squareN * sizeof(int)); + cudaMemset(dev_data, 0, squareN * sizeof(int)); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAErrorFn("cudaMemcpy dev_data failed!"); + timer().startGpuTimer(); - // TODO + + dim3 blocks((squareN + blockSize - 1) / blockSize); + StreamCompaction::Common::kernMapToBoolean << > > (squareN, dev_booleans, dev_data); + checkCUDAErrorFn("kernMapToBoolean failed!"); + + scan(squareN, dev_booleans); + + int sum = 0; + cudaMemcpy(&sum, &dev_booleans[squareN-1], sizeof(int), cudaMemcpyDeviceToHost); + + // Note: I removed one of the input arrays + StreamCompaction::Common::kernScatter << > > (squareN, dev_data_output, dev_data, dev_booleans); + checkCUDAErrorFn("kernScatter failed!"); + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_data_output, sizeof(int) * sum, cudaMemcpyDeviceToHost); + checkCUDAErrorFn("cudaMemcpy dev_booleans failed!"); + + cudaFree(dev_data); + cudaFree(dev_data_output); + cudaFree(dev_booleans); + + return sum; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..3d73044 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,5 +1,6 @@ #include 
#include +#include #include "common.h" #include "naive.h" @@ -11,15 +12,93 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + #define blockSize 128 + + __global__ void kernScanShift(int N, int offset, int * odata, int * idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index >= N) + return; + + int res = idata[index]; + + if (index >= offset) + res += idata[index - offset]; + + index += 1; + + if (index < N) + odata[index] = res; + } + + __global__ void kernScan(int N, int offset, int * odata, int * idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index >= N) + return; + + int res = idata[index]; + + if (index >= offset) + res += idata[index - offset]; + + odata[index] = res; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + int * dev_ping; + int * dev_pong; + + cudaMalloc((void**)&dev_ping, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_ping failed!"); + + cudaMalloc((void**)&dev_pong, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_pong failed!"); + + // Only ping is memcpyed + cudaMemcpy(dev_ping, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAErrorFn("cudaMemcpy dev_ping failed!"); + + dim3 blocks((n + blockSize - 1) / blockSize); + timer().startGpuTimer(); - // TODO + int passes = ilog2ceil(n); + + for (int d = 0; d < passes; ++d) + { + int offset = pow(2, d); // (d-1)+1 + + if (d == passes - 1) + { + kernScanShift << > > (n, offset, dev_pong, dev_ping); + checkCUDAErrorFn("kernScanShift failed!"); + } + else + { + kernScan << > > (n, offset, dev_pong, dev_ping); + checkCUDAErrorFn("kernScan failed!"); + } + + std::swap(dev_pong, dev_ping); + } timer().endGpuTimer(); + + cudaMemcpy(odata, dev_ping, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAErrorFn("cudaMemcpy dev_ping failed!"); + + // I decided to do this so that we don't thrash the gpu 
memory access by jumping + // For very large arrays this should be more efficient than doing it on gpu + odata[0] = 0; + + cudaFree(dev_pong); + cudaFree(dev_ping); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..e45e034 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -17,12 +17,16 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata) + { + thrust::device_vector dev_thrust_input(idata, idata + n); + thrust::device_vector dev_thrust_output(n); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_thrust_input.begin(), dev_thrust_input.end(), dev_thrust_output.begin()); timer().endGpuTimer(); + + thrust::copy(dev_thrust_output.begin(), dev_thrust_output.end(), odata); } } }