diff --git a/.README.md.un~ b/.README.md.un~
new file mode 100644
index 0000000..1adcd84
Binary files /dev/null and b/.README.md.un~ differ
diff --git a/.vs/Project2-Number-Algorithms/v15/.suo b/.vs/Project2-Number-Algorithms/v15/.suo
new file mode 100644
index 0000000..c9fddab
Binary files /dev/null and b/.vs/Project2-Number-Algorithms/v15/.suo differ
diff --git a/.vs/Project2-Number-Algorithms/v15/Browse.VC.db b/.vs/Project2-Number-Algorithms/v15/Browse.VC.db
new file mode 100644
index 0000000..f530e72
Binary files /dev/null and b/.vs/Project2-Number-Algorithms/v15/Browse.VC.db differ
diff --git a/.vs/Project2-Number-Algorithms/v15/ipch/AutoPCH/69bca779ad2e221c/MAIN.ipch b/.vs/Project2-Number-Algorithms/v15/ipch/AutoPCH/69bca779ad2e221c/MAIN.ipch
new file mode 100644
index 0000000..91bf6ae
Binary files /dev/null and b/.vs/Project2-Number-Algorithms/v15/ipch/AutoPCH/69bca779ad2e221c/MAIN.ipch differ
diff --git a/.vs/Project2-Number-Algorithms/v15/ipch/AutoPCH/f38087074acb9f8b/CPU.ipch b/.vs/Project2-Number-Algorithms/v15/ipch/AutoPCH/f38087074acb9f8b/CPU.ipch
new file mode 100644
index 0000000..428bfa1
Binary files /dev/null and b/.vs/Project2-Number-Algorithms/v15/ipch/AutoPCH/f38087074acb9f8b/CPU.ipch differ
diff --git a/.vs/ProjectSettings.json b/.vs/ProjectSettings.json
new file mode 100644
index 0000000..2b7b33d
--- /dev/null
+++ b/.vs/ProjectSettings.json
@@ -0,0 +1,3 @@
+{
+  "CurrentProjectSetting": "x64-Debug (default)"
+}
\ No newline at end of file
diff --git a/.vs/VSWorkspaceState.json b/.vs/VSWorkspaceState.json
new file mode 100644
index 0000000..4fbcda8
--- /dev/null
+++ b/.vs/VSWorkspaceState.json
@@ -0,0 +1,10 @@
+{
+  "ExpandedNodes": [
+    "",
+    "\\Project2-Stream-Compaction",
+    "\\Project2-Stream-Compaction\\src",
+    "\\Project2-Stream-Compaction\\stream_compaction"
+  ],
+  "SelectedNode": "\\Project2-Stream-Compaction\\stream_compaction\\CMakeLists.txt",
+  "PreviewInSolutionExplorer": false
+}
\ No newline at end of file
diff --git a/.vs/slnx.sqlite b/.vs/slnx.sqlite
new file mode 100644
index 0000000..c33bf1f
Binary files /dev/null and b/.vs/slnx.sqlite differ
diff --git a/Project2-Character-Recognition/CMakeLists.txt b/Project2-Character-Recognition/CMakeLists.txt
index 09e9198..548fa85 100644
--- a/Project2-Character-Recognition/CMakeLists.txt
+++ b/Project2-Character-Recognition/CMakeLists.txt
@@ -22,6 +22,7 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
 endif()
 
 include_directories(.)
+link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
 add_subdirectory(character_recognition)
 
 cuda_add_executable(${CMAKE_PROJECT_NAME}
@@ -30,6 +31,8 @@ cuda_add_executable(${CMAKE_PROJECT_NAME}
     )
 
 target_link_libraries(${CMAKE_PROJECT_NAME}
+    cublas
+	curand
     character_recognition
     ${CORELIBS}
     )
diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md
index 4503fac..d97aa45 100644
--- a/Project2-Character-Recognition/README.md
+++ b/Project2-Character-Recognition/README.md
@@ -1,14 +1,4 @@
 CUDA Character Recognition
 ======================
-
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-
-### (TODO: Your README)
-
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+attempted to implement, never truly understood what I am doing and could not figure out what I am suppose to do
 
diff --git a/Project2-Character-Recognition/character_recognition/CMakeLists.txt b/Project2-Character-Recognition/character_recognition/CMakeLists.txt
index 7446175..9e834c1 100644
--- a/Project2-Character-Recognition/character_recognition/CMakeLists.txt
+++ b/Project2-Character-Recognition/character_recognition/CMakeLists.txt
@@ -7,5 +7,5 @@ set(SOURCE_FILES
 
 cuda_add_library(character_recognition
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )
diff --git a/Project2-Character-Recognition/character_recognition/common.cu b/Project2-Character-Recognition/character_recognition/common.cu
index 2a754d4..6521b75 100644
--- a/Project2-Character-Recognition/character_recognition/common.cu
+++ b/Project2-Character-Recognition/character_recognition/common.cu
@@ -1,5 +1,5 @@
 #include "common.h"
-
+#include "cublas.h"
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
     cudaError_t err = cudaGetLastError();
     if (cudaSuccess == err) {
diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu
index 5a3ed7f..4aefc20 100644
--- a/Project2-Character-Recognition/character_recognition/mlp.cu
+++ b/Project2-Character-Recognition/character_recognition/mlp.cu
@@ -2,6 +2,90 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "mlp.h"
+#include <cublas_v2.h>
+#define blockSize 32
+//https://solarianprogrammer.com/2012/05/31/matrix-multiplication-cuda-cublas-curand-thrust/
+// m =  number of rows in A
+// k = number of columns in A
+// n = number of columns in B
+void gpu_blas_mmul(const float *A, const float *B, float *C, const int m, const int k, const int n) {
+	int lda = m, ldb = k, ldc = m;
+	const float alf = 1;
+    const float bet = 0;
+    const float *alpha = &alf;
+    const float *beta = &bet;
+    // Create a handle for CUBLAS
+	 cublasHandle_t handle;
+     cublasCreate(&handle);
+	
+    // Do the actual multiplication
+    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+	
+	// Destroy the handle
+    cublasDestroy(handle);
+	
+}
+
+__global__ void activation_rule(int n, float const *idata, float* odata) {
+	int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+	if (index >= n) {
+		return;
+	}
+
+	odata[index] = 1.0 / (1.0 + std::exp(-idata[index]));
+}
+
+__global__ void activation_rule_d(int n, float const *idat, float* odata) {
+	int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+	if (index >= n) {
+		return;
+	}
+	odata[index] = idat[index] * (1 - idat[index]);
+	
+}
+__global__  void matrixSub(int n, const float* a, const float* b, float* odata) {
+
+	int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+	if (index >= n) {
+		return;
+	}
+	odata[index] = a[index] - b[index];
+}
+
+
+void train(const float *A_input, const int A_m, const int A_n, float* weights_into_layer, float* weights_out_layer, float* predicted_output, 
+	float* err, float* layer, const int layer_n, const float* expected_output, const int expected_output_n, float* post_activation_layer) {
+	float *dev_input, *dev_weights_0, *dev_weights_1, *dev_error, *dev_layer, *dev_output, *dev_activation_layer, *dev_expected_output;
+	cudaMalloc((void**)&dev_input, sizeof(float) * A_m * A_n);
+	cudaMemcpy(dev_input, A_input, A_m * A_n * sizeof(float), cudaMemcpyHostToDevice);
+	cudaMalloc((void**)&dev_weights_0, A_m * layer_n * sizeof(float));
+	cudaMemcpy(dev_weights_0, weights_into_layer, A_m * layer_n * sizeof(float), cudaMemcpyHostToDevice);
+	cudaMalloc((void**)&dev_weights_1, layer_n * sizeof(float));
+	cudaMemcpy(dev_weights_1, weights_out_layer, layer_n * sizeof(float), cudaMemcpyHostToDevice);
+	cudaMalloc((void**)&dev_layer, sizeof(float) * layer_n);
+	cudaMemcpy(dev_layer, layer, layer_n * sizeof(float), cudaMemcpyHostToDevice);
+	cudaMalloc((void**)&dev_activation_layer, sizeof(float) * layer_n);
+	cudaMalloc((void**)&dev_error, expected_output_n * sizeof(float));
+	cudaMalloc((void**)&dev_output, expected_output_n * sizeof(float));
+	cudaMalloc((void**)&dev_expected_output, expected_output_n * sizeof(float));
+	cudaMemcpy(dev_expected_output, expected_output, expected_output_n* sizeof(float), cudaMemcpyHostToDevice);
+	float* dev_subtraction, *dev_derivative, *dev_output_d;
+	cudaMalloc((void**)&dev_subtraction, expected_output_n * sizeof(float));
+	cudaMalloc((void**)&dev_derivative, expected_output_n * sizeof(float));
+	cudaMalloc((void**)&dev_output_d, expected_output_n * sizeof(float));
+	for (int i = 0; i < 50; i++) {
+		gpu_blas_mmul(dev_input, dev_weights_1, dev_layer, 1, A_n, layer_n);
+		activation_rule<<<((layer_n + blockSize - 1) / blockSize), blockSize>>>(layer_n, dev_layer, dev_activation_layer);
+		gpu_blas_mmul(dev_activation_layer, dev_weights_1, dev_output, 1, layer_n, A_n);
+		//compute derivate. derivative of sigmoid is out * expected-out
+		matrixSub << <((layer_n + blockSize - 1) / blockSize), blockSize >> > (dev_expected_output, dev_output, dev_subtraction);
+		activation_rule_d << <((layer_n + blockSize - 1) / blockSize), blockSize >> > (layer_n, dev_output, dev_derivative);
+		gpu_blas_mmul(dev_subtraction, dev_derivative, dev_output_d, 1, expected_output_n, expected_output_n);
+        
+	}
+
+}
+__global__ void runTrain(const float *A_input, const int A_m, const int A_n, float* weights_into_layer, float* weights_out_layer, const float* predicted_output, float* err, float* layer, const int layer_n, const float* expected_output, const int expected_output_n) {
 
 namespace CharacterRecognition {
     using Common::PerformanceTimer;
diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp
index 11dd534..e4e54a0 100644
--- a/Project2-Character-Recognition/src/main.cpp
+++ b/Project2-Character-Recognition/src/main.cpp
@@ -11,142 +11,5 @@
 #include <character_recognition/common.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
-const int NPOT = SIZE - 3; // Non-Power-Of-Two
-int *a = new int[SIZE];
-int *b = new int[SIZE];
-int *c = new int[SIZE];
-
 int main(int argc, char* argv[]) {
-    // Scan tests
-
-    printf("\n");
-    printf("****************\n");
-    printf("** SCAN TESTS **\n");
-    printf("****************\n");
-
-    genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    // initialize b using StreamCompaction::CPU::scan you implement
-    // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
-    // At first all cases passed because b && c are all zeroes.
-    zeroArray(SIZE, b);
-    printDesc("cpu scan, power-of-two");
-    StreamCompaction::CPU::scan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(SIZE, b, true);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu scan, non-power-of-two");
-    StreamCompaction::CPU::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(NPOT, b, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, power-of-two");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-	/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
-	onesArray(SIZE, c);
-	printDesc("1s array for finding bugs");
-	StreamCompaction::Naive::scan(SIZE, c, a);
-	printArray(SIZE, c, true); */
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, non-power-of-two");
-    StreamCompaction::Naive::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, power-of-two");
-    StreamCompaction::Efficient::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, non-power-of-two");
-    StreamCompaction::Efficient::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, power-of-two");
-    StreamCompaction::Thrust::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, non-power-of-two");
-    StreamCompaction::Thrust::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    printf("\n");
-    printf("*****************************\n");
-    printf("** STREAM COMPACTION TESTS **\n");
-    printf("*****************************\n");
-
-    // Compaction tests
-
-    genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    int count, expectedCount, expectedNPOT;
-
-    // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
-    // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
-    zeroArray(SIZE, b);
-    printDesc("cpu compact without scan, power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedCount = count;
-    printArray(count, b, true);
-    printCmpLenResult(count, expectedCount, b, b);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact without scan, non-power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedNPOT = count;
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact with scan");
-    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, power-of-two");
-    count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, non-power-of-two");
-    count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    system("pause"); // stop Win32 console from closing on exit
-	delete[] a;
-	delete[] b;
-	delete[] c;
 }
diff --git a/Project2-Stream-Compaction/img/blocksizeopt.PNG b/Project2-Stream-Compaction/img/blocksizeopt.PNG
new file mode 100644
index 0000000..72c3adb
Binary files /dev/null and b/Project2-Stream-Compaction/img/blocksizeopt.PNG differ
diff --git a/Project2-Stream-Compaction/img/outputConsole.PNG b/Project2-Stream-Compaction/img/outputConsole.PNG
new file mode 100644
index 0000000..33fe2fd
Binary files /dev/null and b/Project2-Stream-Compaction/img/outputConsole.PNG differ
diff --git a/Project2-Stream-Compaction/img/sizevstime.PNG b/Project2-Stream-Compaction/img/sizevstime.PNG
new file mode 100644
index 0000000..b25873f
Binary files /dev/null and b/Project2-Stream-Compaction/img/sizevstime.PNG differ
diff --git a/Project2-Stream-Compaction/src/main.cpp b/Project2-Stream-Compaction/src/main.cpp
index d016553..f2f6783 100644
--- a/Project2-Stream-Compaction/src/main.cpp
+++ b/Project2-Stream-Compaction/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 15; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -51,7 +51,7 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
 	/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
diff --git a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
index cdbef77..6975dd9 100644
--- a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
+++ b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
@@ -13,5 +13,4 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
-    )
+    OPTIONS -arch=sm_61    )
diff --git a/Project2-Stream-Compaction/stream_compaction/common.cu b/Project2-Stream-Compaction/stream_compaction/common.cu
index 2ed6d63..932abab 100644
--- a/Project2-Stream-Compaction/stream_compaction/common.cu
+++ b/Project2-Stream-Compaction/stream_compaction/common.cu
@@ -24,6 +24,12 @@ namespace StreamCompaction {
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
             // TODO
+			int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+			if (index >= n) {
+				return;
+			}
+
+			bools[index] = (idata[index] != 0) ? 1 : 0;
         }
 
         /**
@@ -33,6 +39,14 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata,
                 const int *idata, const int *bools, const int *indices) {
             // TODO
+			int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
+			if (idx >= n) {
+				return;
+			}
+
+			if (bools[idx]) {
+				odata[indices[idx]] = idata[idx];
+			}
         }
 
     }
diff --git a/Project2-Stream-Compaction/stream_compaction/common.h b/Project2-Stream-Compaction/stream_compaction/common.h
index 996997e..84bae7e 100644
--- a/Project2-Stream-Compaction/stream_compaction/common.h
+++ b/Project2-Stream-Compaction/stream_compaction/common.h
@@ -9,7 +9,6 @@
 #include <algorithm>
 #include <chrono>
 #include <stdexcept>
-
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
 
@@ -92,6 +91,8 @@ namespace StreamCompaction {
 			    cudaEventRecord(event_end);
 			    cudaEventSynchronize(event_end);
 
+#include "common.h"
+
 			    if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
 
 			    cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu
index a2d3e6c..5feec13 100644
--- a/Project2-Stream-Compaction/stream_compaction/cpu.cu
+++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu
@@ -20,9 +20,26 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
 	        timer().startCpuTimer();
             // TODO
+			if(n > 0){
+				int sum = 0;
+			   for (int i = 0; i < n; i++) {
+				   odata[i] = sum;
+				   sum += idata[i];
+			    }
+			}
 	        timer().endCpuTimer();
         }
 
+		void scan_notimer(int n, int *odata, const int *idata) {
+			if(n > 0){
+				int sum = 0;
+			   for (int i = 0; i < n; i++) {
+				   odata[i] = sum;
+				   sum += idata[i];
+			    }
+			}
+		}
+
         /**
          * CPU stream compaction without using the scan function.
          *
@@ -31,8 +48,14 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata) {
 	        timer().startCpuTimer();
             // TODO
+			int count = 0;
+			for (int i = 0; i < n; i++) {
+				if (idata[i] != 0) {
+					odata[count++] = idata[i];
+				}
+			}
 	        timer().endCpuTimer();
-            return -1;
+            return count;
         }
 
         /**
@@ -42,9 +65,28 @@ namespace StreamCompaction {
          */
         int compactWithScan(int n, int *odata, const int *idata) {
 	        timer().startCpuTimer();
-	        // TODO
+			if (n < 1) {
+				return 0;
+			}
+			int* tmp = new int [n];
+			 
+			for (int i = 0; i < n; i++) {
+				tmp[i] = (idata[i] == 0) ? 0 : 1;
+			}
+			int * scan_o = new int[n];
+
+			scan_notimer(n, scan_o, tmp);
+			int scan_counter = 0;
+			for (int i = 1; i < n; i++) {
+				if (scan_o[i] != scan_o[i - 1]) {
+					odata[scan_counter++] = idata[i - 1];
+				}
+			}
+
 	        timer().endCpuTimer();
-            return -1;
+			delete[] scan_o;
+			delete[] tmp;
+			return scan_counter;
         }
     }
 }
diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu
index 2db346e..694e543 100644
--- a/Project2-Stream-Compaction/stream_compaction/efficient.cu
+++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu
@@ -2,7 +2,7 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
-
+#define blockSize 512
 namespace StreamCompaction {
     namespace Efficient {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,15 +12,136 @@ namespace StreamCompaction {
             return timer;
         }
 
+		__global__ void kernUpsweep(int n, int d, int* idata) {
+			int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+			if (index >= n) {
+				return;
+			}
+			int offset  =  (1 << (d + 1)); // implementing 2^d+1 incrementing 
+			int k = index * offset;
+			if (k >= n) {
+				return;
+			}
+
+			idata[k + offset - 1] += idata[k + (1 << d) - 1];
+		}
+
+		__global__ void kernDownsweep(int n, int d, int* idata) {
+			int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+			if (index >= n) {
+				return;
+			}
+			int offset = (1 << (d + 1)); // implementing 2^d+1 incrementing 
+			int k = index * offset;
+			if (k >= n) {
+				return;
+			}
+
+			int t = idata[k + (1 << d) - 1];
+			idata[k + (1 << d) - 1] = idata[k + offset - 1];
+			idata[k + offset - 1] += t;
+		}
+
+
+		void printDeviceArr(int n, int* device_arr) {
+			int* arr = (int*)malloc(sizeof(int)*n);
+			cudaMemcpy(arr, device_arr, sizeof(int) *n, cudaMemcpyDeviceToHost);
+			printf("\n [");
+			for (int i = 0; i < n; i++) {
+				printf("%d, ", arr[i]);
+			}
+			printf("]\n");
+			free(arr);
+		}
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+
+			dim3 threadsPerBlock(blockSize);
+			int npt = 1 << ilog2ceil(n);
+
+			int * dev_idata;
+			cudaMalloc((void **)&dev_idata, npt * sizeof(int));
+			checkCUDAError("Error: Cuda Malloc for input data on device");
+			if (npt != n) {
+				int *new_cpu_arr = (int*) malloc(sizeof(int) * npt);
+				memset(new_cpu_arr, 0, sizeof(int) * npt);
+				memcpy(new_cpu_arr, idata, sizeof(int) * n);
+			    cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+				checkCUDAError("Cuda Error on non-power of two array");
+			    free(new_cpu_arr);
+			} else { 
+			    cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+				checkCUDAError("Cuda Error on power of two array");
+			}
+
             timer().startGpuTimer();
+			for (int d = 0; d < ilog2ceil(npt); d++) {
+				int updateThreadUtil = npt / (1 << (npt + 1));
+				dim3 blocksPerGrid((updateThreadUtil + blockSize - 1) / blockSize);
+				kernUpsweep << <blocksPerGrid, threadsPerBlock >> > (npt, d, dev_idata);
+			}
+
+			int z = 0;
+			cudaMemcpy(&dev_idata[npt - 1], &z, sizeof(int), cudaMemcpyHostToDevice);
+			for (int d = ilog2(npt) - 1; d >= 0; d--) {
+				int updateThreadUtil = npt / (1 << (npt + 1));
+				dim3 blocksPerGrid((updateThreadUtil + blockSize - 1) / blockSize);
+				kernDownsweep << <blocksPerGrid, threadsPerBlock >> > (npt, d, dev_idata);
+			}
             // TODO
             timer().endGpuTimer();
+			cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+			cudaFree(dev_idata);
+        }
+
+        void compactScan(int n, int *odata, const int *idata) {
+
+			dim3 threadsPerBlock(blockSize);
+			int npt = 1 << ilog2ceil(n);
+
+			int * dev_idata;
+			cudaMalloc((void **)&dev_idata, npt * sizeof(int));
+			checkCUDAError("Error: Cuda Malloc for input data on device");
+			if (npt != n) {
+				int *new_cpu_arr = (int*) malloc(sizeof(int) * npt);
+				memset(new_cpu_arr, 0, sizeof(int) * npt);
+				memcpy(new_cpu_arr, idata, sizeof(int) * n);
+			    cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+				checkCUDAError("Cuda Error on non-power of two array");
+			} else { 
+			    cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+				checkCUDAError("Cuda Error on power of two array");
+			}
+
+			for (int d = 0; d < ilog2ceil(npt); d++) {
+				int updateThreadUtil = npt / (1 << (npt + 1));
+				dim3 blocksPerGrid((updateThreadUtil + blockSize - 1) / blockSize);
+				kernUpsweep << <blocksPerGrid, threadsPerBlock >> > (npt, d, dev_idata);
+			}
+
+			int z = 0;
+			cudaMemcpy(&dev_idata[npt - 1], &z, sizeof(int), cudaMemcpyHostToDevice);
+			for (int d = ilog2(npt) - 1; d >= 0; d--) {
+				int updateThreadUtil = npt / (1 << (npt + 1));
+				dim3 blocksPerGrid((updateThreadUtil + blockSize - 1) / blockSize);
+				kernDownsweep << <blocksPerGrid, threadsPerBlock >> > (npt, d, dev_idata);
+			}
+            // TODO
+			cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+			cudaFree(dev_idata);
         }
 
+        /**
+         * Performs stream compaction on idata, storing the result into odata.
+         * All zeroes are discarded.
+         *
+         * @param n      The number of elements in idata.
+         * @param odata  The array into which to store elements.
+         * @param idata  The array of elements to compact.
+         * @returns      The number of elements remaining after compaction.
+
         /**
          * Performs stream compaction on idata, storing the result into odata.
          * All zeroes are discarded.
@@ -31,10 +152,53 @@ namespace StreamCompaction {
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+			dim3 threadsPerBlock(blockSize);
+			dim3 blocksPerGrid((n + blockSize - 1) / blockSize);
+
+			int *dev_idata, *dev_odata, *dev_bool;
+			cudaMalloc((void**)&dev_idata, sizeof(int) * n);
+			checkCUDAError("Malloc Error dev_idata");
+			cudaMalloc((void**)&dev_odata, sizeof(int) * n);
+			checkCUDAError("Malloc Error dev_odata");
+			cudaMalloc((void**)&dev_bool,  sizeof(int) * n);
+			checkCUDAError("Malloc Error dev_bool");
+			
+			cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+			checkCUDAError("memcpy error dev_idata", __LINE__);
+
+
             timer().startGpuTimer();
+			Common::kernMapToBoolean << <blocksPerGrid, threadsPerBlock >> > (n, dev_bool, dev_idata);
+			checkCUDAError("map to boolean error", __LINE__);
+			int* host_bool = new int[n];
+			cudaMemcpy(host_bool, dev_bool, n * sizeof(int), cudaMemcpyDeviceToHost);
+			checkCUDAError("memcpy erroyr host_bool", __LINE__);
+			
+			int* indices = new int[n];
+
+			compactScan(n, indices, host_bool);
+			int* dev_indices;
+			cudaMalloc((void**)&dev_indices, sizeof(int) * n);
+			checkCUDAError("malloc error dev_indices", __LINE__);
+			cudaMemcpy(dev_indices, indices, sizeof(int)*n, cudaMemcpyHostToDevice);
+			checkCUDAError("memcpy error dev_indices", __LINE__);
+
+			Common::kernScatter << <blocksPerGrid, threadsPerBlock >> > (n, dev_odata, dev_idata, dev_bool, dev_indices);
+			checkCUDAError("kernScatter Error", __LINE__);
+
+			cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost);
+			checkCUDAError("memcpy error dev_odata to odata", __LINE__);
+
             // TODO
             timer().endGpuTimer();
-            return -1;
+			int count = (host_bool[n - 1]) ? indices[n - 1] + 1 : indices[n-1];
+			delete[] host_bool;
+			delete[] indices;
+			cudaFree(dev_bool);
+			cudaFree(dev_idata);
+			cudaFree(dev_indices);
+			cudaFree(dev_odata);
+            return count;
         }
     }
 }
diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu
index 4308876..1ec3875 100644
--- a/Project2-Stream-Compaction/stream_compaction/naive.cu
+++ b/Project2-Stream-Compaction/stream_compaction/naive.cu
@@ -2,7 +2,7 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
-
+#define blockSize 128
 namespace StreamCompaction {
     namespace Naive {
         using StreamCompaction::Common::PerformanceTimer;
@@ -11,15 +11,75 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
-        // TODO: __global__
+		__global__ void kernNaiveScan(int n, int offset, int *odata, const int *idata) {
+			int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+			if (index >= n) {
+				return;
+			}
+
+		/*	if (index < offset) {
+				odata[index] = idata[index];
+			}
+			else {
+				odata[index] = idata[index - offset] + idata[index];
+			}*/
+
+			odata[index] = (index < offset) ? idata[index] : idata[index - offset] + idata[index];
+		}
+
+
+		__global__ void kernMakeExclusive(int n, int *odata, const int *idata) {
+			int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+			if (index >= n) {
+				return;
+			}
+			odata[index] = (index == 0) ? 0 : idata[index - 1];
+		}
 
+	    void printDeviceArr(int n, int* device_arr) {
+		    int* arr = (int*)malloc(sizeof(int)*n);
+			cudaMemcpy(arr, device_arr, sizeof(int) *n, cudaMemcpyDeviceToHost);
+			printf("\n [");
+			for (int i = -1; i < n; i++) {
+				printf("%d, ", arr[i]);
+			}
+			printf("]\n");
+			free(arr);
+		}
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+			dim3 threadsPerBlock(blockSize);
+			dim3 blocksPerGrid((n + blockSize - 1) / blockSize);
+
+			int* A;
+			int* B;
+
+			cudaMalloc((void**)&A, sizeof(int) * n);
+			checkCUDAError("cuda Error Allocating A");
+			cudaMalloc((void**)&B, sizeof(int) * n);
+			checkCUDAError("cuda Error Allocating B");
+			cudaMemcpy(A, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+			checkCUDAError("Error Copying data to A");
+
             timer().startGpuTimer();
-            // TODO
+			for (int offset = 1; offset < n; offset *= 2) {
+				kernNaiveScan<<<blocksPerGrid,threadsPerBlock>>>(n, offset, B, A);
+                
+				cudaMemcpy(A, B, n * sizeof(int), cudaMemcpyDeviceToDevice);
+			    checkCUDAError("Error Copying data to A");
+				
+			}
+			kernMakeExclusive << <blocksPerGrid, threadsPerBlock >> > (n, B, A);
+			    checkCUDAError("ERror in kernMakeExclusive");
+		    cudaMemcpy(odata, B, n * sizeof(int), cudaMemcpyDeviceToHost);
+			    checkCUDAError("ERror in Copying back to host");
+             // TODO
             timer().endGpuTimer();
+
+			cudaFree(A);
+			cudaFree(B);
         }
     }
 }
diff --git a/Project2-Stream-Compaction/stream_compaction/naive.h b/Project2-Stream-Compaction/stream_compaction/naive.h
index 37dcb06..1d82526 100644
--- a/Project2-Stream-Compaction/stream_compaction/naive.h
+++ b/Project2-Stream-Compaction/stream_compaction/naive.h
@@ -1,7 +1,6 @@
 #pragma once
 
 #include "common.h"
-
 namespace StreamCompaction {
     namespace Naive {
         StreamCompaction::Common::PerformanceTimer& timer();
diff --git a/Project2-Stream-Compaction/stream_compaction/thrust.cu b/Project2-Stream-Compaction/stream_compaction/thrust.cu
index 1def45e..f6b386b 100644
--- a/Project2-Stream-Compaction/stream_compaction/thrust.cu
+++ b/Project2-Stream-Compaction/stream_compaction/thrust.cu
@@ -18,11 +18,21 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+
+			thrust::host_vector<int> dv_in(idata, idata + n);
+			thrust::host_vector<int> dv_out(odata, odata + n);
+
+			thrust::device_vector<int> dev_in(dv_in);
+			thrust::device_vector<int> dev_out(dv_out);
+
             timer().startGpuTimer();
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin());
             timer().endGpuTimer();
+			thrust::copy(dev_out.begin(), dev_out.end(), dv_out.begin());
+			cudaMemcpy(odata, &dv_out[0], sizeof(int) * n, cudaMemcpyHostToHost);
+			checkCUDAError("memcopy error in thryust scan");
         }
     }
 }
diff --git a/README.md b/README.md
index 3a0b2fe..363d1f0 100644
--- a/README.md
+++ b/README.md
@@ -3,14 +3,27 @@ CUDA Number Algorithms
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+Davis Polito 
+*  [https://github.com/davispolito/Project0-Getting-Started/blob/master]()
+* Tested on: Windows 10, i7-8750H @ 2.20GHz 16GB, GTX 1060       
+#Optimizing Block Size Per Algorithm
 
-### (TODO: Your README)
+First we must optimize block size per algorithm
+![block optimization graph](/Project2-Stream-Compaction/img/blocksizeopt.PNG)
+After creating this graph I chose blocksize to be 128 and 512 for naive and work efficient scan
+
+#Scan comparison
+![size vs. time graph](/Project2-Stream-Compaction/img/sizevstime.PNG)
+
+
+##Questions
+#Explanations for each result
+
+######cpu This method has O(n) runtime and is only affected by size of the aray
+######naivei this method has O(nlogn) runtime
+######work-efficient This has a possible runtime of O(n) but do to memory access and non ideal uses of threads and warps (i.e. warp branching) We see a slower runtime than cpu
+#####thrust 
+![Console Output From Steam compaction](/Project2-Stream-Compaction/img/outputConsole.PNG)
 
-Link to the readmes of the other two subprojects.
 
-Add anything else you think is relevant up to this point.
-(Remember, this is public, so don't put anything here that you don't want to share with the world.)
 
diff --git a/README.md~ b/README.md~
new file mode 100644
index 0000000..676fcb0
--- /dev/null
+++ b/README.md~
@@ -0,0 +1,26 @@
+CUDA Number Algorithms
+======================
+
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
+
+Davis Polito 
+*  [https://github.com/davispolito/Project0-Getting-Started/blob/master]()
+* Tested on: Windows 10, i7-8750H @ 2.20GHz 16GB, GTX 1060       
+#Optimizing Block Size Per Algorithm
+
+First we must optimize block size per algorithm
+![block optimization graph](/Project2-Character-Recognition/img/blocksizeopt.png)
+After creating this graph I chose blocksize to be 128 and 512 for naive and work efficient scan
+
+#Scan comparison
+![size vs. time graph](/Project2-Character-Recognition/img/sizevstime.png)
+
+
+##Questions
+#Explanations for each result
+
+######cpu
+######naive
+######work-efficient
+#####thrust
+