From ccfeebc0ff1662cf4c047c24a70b601b290f2d44 Mon Sep 17 00:00:00 2001 From: Austin Eng <213reeses@gmail.com> Date: Mon, 19 Sep 2016 02:16:56 -0400 Subject: [PATCH 01/11] set computability level --- stream_compaction/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..cee1b43 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_35 ) From 3284422ce556554771d9df373e5b5b1c22e5d514 Mon Sep 17 00:00:00 2001 From: Austin Eng <213reeses@gmail.com> Date: Mon, 19 Sep 2016 08:20:36 -0400 Subject: [PATCH 02/11] part 1: cpu implementation --- stream_compaction/cpu.cu | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..579fbfc 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -8,8 +8,12 @@ namespace CPU { * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + int total = 0; + for (int i = 0; i < n; ++i) { + int val = idata[i]; + odata[i] = total; + total += val; + } } /** @@ -18,8 +22,11 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; + int idx = 0; + for (int i = 0; i < n; ++i) { + if (idata[i] != 0) odata[idx++] = idata[i]; + } + return idx; } /** @@ -28,8 +35,16 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; + for (int i = 0; i < n; ++i) { + odata[i] = idata[i] != 0 ? 
1 : 0; + } + int last = odata[n - 1]; + scan(n, odata, odata); + int count = odata[n - 1] + last; + for (int i = 0; i < n; ++i) { + odata[odata[i]] = idata[i]; + } + return count; } } From a8f22ade6efcd51266d3a810fa7eae4e584da317 Mon Sep 17 00:00:00 2001 From: Austin Eng <213reeses@gmail.com> Date: Mon, 19 Sep 2016 09:44:48 -0400 Subject: [PATCH 03/11] naive scan implementation --- src/main.cpp | 4 ++-- stream_compaction/naive.cu | 39 +++++++++++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 675da35..dcf9328 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -43,13 +43,13 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..b49153d 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -6,14 +6,47 @@ namespace StreamCompaction { namespace Naive { -// TODO: __global__ +/* + * Performs one iteration of a naive scan of N elements. pow2d = 2^depth + */ +__global__ void naiveScanIteration(int N, int pow2d, int* odata, int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + if (index >= pow2d) { + odata[index] = idata[index] + idata[index - pow2d]; + } + else { + // we've already processed these elements. just copy them + odata[index] = idata[index]; + } +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + int* dev_data; + cudaMalloc((void**)&dev_data, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_data failed!"); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy from idata to dev_data failed!"); + + const int blockSize = 128; + const int nBlocks = (n + blockSize - 1) / blockSize; // n/blockSize, rounded up + + for (int d = 0; d < ilog2ceil(n); ++d) { + naiveScanIteration << < nBlocks, blockSize >> >(n, pow(2, d), dev_data, dev_data); + } + + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy from dev_data to odata failed!"); + cudaFree(dev_data); + checkCUDAError("cudaFree dev_data failed!"); + + for (int i = n-1; i > 0; --i) { + odata[i] = odata[i - 1]; + } + odata[0] = 0; } } From 731c1a5ef03f834fe18b51ae0b329da55b199750 Mon Sep 17 00:00:00 2001 From: Austin Eng <213reeses@gmail.com> Date: Tue, 20 Sep 2016 09:17:00 -0400 Subject: [PATCH 04/11] efficient scan --- src/main.cpp | 4 +- stream_compaction/efficient.cu | 82 +++++++++++++++++++++++++++++++--- 2 files changed, 79 insertions(+), 7 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index dcf9328..bfdd660 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -55,13 +55,13 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..21ce713 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -6,14 +6,69 @@ 
namespace StreamCompaction { namespace Efficient { -// TODO: __global__ +__global__ void scan_up(int n, int pow2d, int *odata, const int *idata) { + int index = 2 * pow2d * (blockIdx.x * blockDim.x + threadIdx.x + 1) - 1; + if (index >= n) return; + odata[index] = idata[index - pow2d] + idata[index]; +} + +__global__ void scan_down(int n, int pow2d, int *odata, const int *idata) { + int index = n - 2 * pow2d * (blockIdx.x * blockDim.x + threadIdx.x) - 1; + if (index < 0) return; + + int temp = idata[index - pow2d]; + odata[index - pow2d] = idata[index]; + odata[index] = temp + idata[index]; +} + +__global__ void zero(int n, int *odata) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < 0) return; + odata[index] = 0; +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + int* dev_data; + int sizePow2 = pow(2, ilog2ceil(n)); + + const int blockSize = 128; + int nBlocks; + + cudaMalloc((void**)&dev_data, sizePow2 * sizeof(int)); + checkCUDAError("cudaMalloc dev_data failed!"); + + // fill with 0 + nBlocks = (sizePow2 + blockSize - 1) / blockSize; + zero << < nBlocks, blockSize >> >(sizePow2, dev_data); + + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy from idata to dev_data failed!"); + + // scan up + for (int pow2d = 1, int threads = sizePow2; pow2d < sizePow2 / 2; pow2d *= 2, threads /= 2) { + nBlocks = (threads + blockSize - 1) / blockSize; // threads / blockSize, rounded up + scan_up << < nBlocks, threads >> >(n, pow2d, dev_data, dev_data); + } + + // set last item to 0 + int zero = 0; + cudaMemcpy(&dev_data[sizePow2 - 1], &zero, sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy copy zero failed!"); + + // scan down + for (int pow2d = pow(2, ilog2ceil(sizePow2) - 1), int threads = 1; pow2d >= 1; pow2d /= 2, threads *= 2) { + nBlocks = (threads + blockSize - 1) / blockSize; // threads /
blockSize, rounded up + scan_down << < nBlocks, threads >> >(sizePow2, pow2d, dev_data, dev_data); + } + + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy from dev_data to odata failed!"); + + cudaFree(dev_data); + checkCUDAError("cudaFree dev_data failed!"); } /** @@ -26,8 +81,25 @@ void scan(int n, int *odata, const int *idata) { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - // TODO - return -1; + int* dev_data; + + const int blockSize = 128; + int nBlocks; + + cudaMalloc((void**)&dev_data, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_data failed!"); + + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy from idata to dev_data failed!"); + + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy from dev_data to odata failed!"); + + cudaFree(dev_data); + checkCUDAError("cudaFree dev_data failed!"); + + // TODO + return -1; } } From 8dd30dc0f8ff8b43be26a712626a4d3f29cc6ce0 Mon Sep 17 00:00:00 2001 From: Austin Eng <213reeses@gmail.com> Date: Tue, 20 Sep 2016 10:38:44 -0400 Subject: [PATCH 05/11] efficient stream compact --- src/main.cpp | 4 +- stream_compaction/efficient.cu | 101 +++++++++++++++++++++++++++++---- 2 files changed, 93 insertions(+), 12 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index bfdd660..b20ffd0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -112,12 +112,12 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); - //printArray(count, c, true); + printArray(count, c, 
true); printCmpLenResult(count, expectedNPOT, b, c); } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 21ce713..503d239 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,14 @@ #include "common.h" #include "efficient.h" +__global__ void printArr(int n, const int* data) { + printf(" [ "); + for (int i = 0; i < n; ++i) { + printf("%3d ", data[i]); + } + printf("]\n"); +} + namespace StreamCompaction { namespace Efficient { @@ -71,6 +79,21 @@ void scan(int n, int *odata, const int *idata) { checkCUDAError("cudaFree dev_data failed!"); } + +__global__ void filter(int n, int *odata, const int* idata) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + odata[index] = idata[index] != 0; +} + +__global__ void scatter(int n, int *odata, const int* idata, const int* mask, const int* indicies) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + if (mask[index]) { + odata[indicies[index]] = idata[index]; + } +} + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -81,25 +104,83 @@ void scan(int n, int *odata, const int *idata) { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { - int* dev_data; + int* dev_idata; + int* dev_odata; + int* dev_mask; + int sizePow2 = pow(2, ilog2ceil(n)); const int blockSize = 128; int nBlocks; - cudaMalloc((void**)&dev_data, n * sizeof(int)); - checkCUDAError("cudaMalloc dev_data failed!"); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); - cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + + cudaMalloc((void**)&dev_mask, sizePow2 * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); checkCUDAError("cudaMemcpy from idata to dev_data failed!"); - cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); - checkCUDAError("cudaMemcpy from dev_data to odata failed!"); + // printArr << <1, 1 >> >(n, dev_idata); - cudaFree(dev_data); - checkCUDAError("cudaFree dev_data failed!"); + // create mask + nBlocks = (n + blockSize - 1) / blockSize; + filter << < nBlocks, blockSize >> >(n, dev_mask, dev_idata); + + // save the mask here for usage later + cudaMemcpy(dev_odata, dev_mask, sizeof(int) * n, cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy from dev_mask to dev_odata failed!"); + + // printArr << <1, 1 >> >(n, dev_mask); + + int endsWith1; + cudaMemcpy(&endsWith1, &dev_mask[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy copy to endswith1 failed!"); + + // scan up + for (int pow2d = 1, int threads = sizePow2; pow2d < sizePow2 / 2; pow2d *= 2, threads /= 2) { + nBlocks = (threads + blockSize - 1) / blockSize; // threads / blockSize, rounded up + scan_up << < nBlocks, threads >> >(n, pow2d, dev_mask, dev_mask); + } + + // set last item to 0 + int last = 0; + cudaMemcpy(&dev_mask[sizePow2 - 1], &last, sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy copy zero
failed!"); + + // scan down + for (int pow2d = pow(2, ilog2ceil(sizePow2) - 1), int threads = 1; pow2d >= 1; pow2d /= 2, threads *= 2) { + nBlocks = (threads + blockSize - 1) / blockSize; // threads / blockSize, rounded up + scan_down << < nBlocks, threads >> >(sizePow2, pow2d, dev_mask, dev_mask); + } + + // copy back last val so we know how many elements + cudaMemcpy(&last, &dev_mask[sizePow2 - 1], sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy copy last failed!"); + last += endsWith1; // increment if we should include the very last element + + // printArr << <1, 1 >> >(n, dev_mask); + + // scatter + nBlocks = (n + blockSize - 1) / blockSize; + scatter << < nBlocks, blockSize >> >(n , dev_odata, dev_idata, dev_odata, dev_mask); + + cudaMemcpy(odata, dev_odata, sizeof(int) * last, cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy from dev_odata to odata failed!"); + + cudaFree(dev_idata); + checkCUDAError("cudaFree dev_idata failed!"); + + cudaFree(dev_odata); + checkCUDAError("cudaFree dev_odata failed!"); + + cudaFree(dev_mask); + checkCUDAError("cudaFree dev_mask failed!"); - // TODO - return -1; + return last; } } From b365f2e653069ef95751e30325ccb86a2861b223 Mon Sep 17 00:00:00 2001 From: Austin Eng <213reeses@gmail.com> Date: Tue, 20 Sep 2016 10:45:53 -0400 Subject: [PATCH 06/11] move kernels to common --- stream_compaction/common.cu | 10 ++++++++-- stream_compaction/efficient.cu | 19 ++----------------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..8b4fbad 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,9 @@ namespace Common { * which map to 0 will be removed, and elements which map to 1 will be kept.
*/ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + bools[index] = idata[index] != 0; } /** @@ -32,7 +34,11 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + if (bools[index]) { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 503d239..fbf446d 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -79,21 +79,6 @@ void scan(int n, int *odata, const int *idata) { checkCUDAError("cudaFree dev_data failed!"); } - -__global__ void filter(int n, int *odata, const int* idata) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index >= n) return; - odata[index] = idata[index] != 0; -} - -__global__ void scatter(int n, int *odata, const int* idata, const int* mask, const int* indicies) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index >= n) return; - if (mask[index]) { - odata[indicies[index]] = idata[index]; - } -} - /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. 
@@ -128,7 +113,7 @@ int compact(int n, int *odata, const int *idata) { // create mask nBlocks = (n + blockSize - 1) / blockSize; - filter << < nBlocks, blockSize >> >(n, dev_mask, dev_idata); + StreamCompaction::Common::kernMapToBoolean << < nBlocks, blockSize >> >(n, dev_mask, dev_idata); // save the mask here for usage later cudaMemcpy(dev_odata, dev_mask, sizeof(int) * n, cudaMemcpyDeviceToDevice); @@ -166,7 +151,7 @@ int compact(int n, int *odata, const int *idata) { // scatter nBlocks = (n + blockSize - 1) / blockSize; - scatter << < nBlocks, blockSize >> >(n , dev_odata, dev_idata, dev_odata, dev_mask); + StreamCompaction::Common::kernScatter << < nBlocks, blockSize >> >(n, dev_odata, dev_idata, dev_odata, dev_mask); cudaMemcpy(odata, dev_odata, sizeof(int) * last, cudaMemcpyDeviceToHost); checkCUDAError("cudaMemcpy from dev_odata to odata failed!"); From 4aea147788b3abf6335cc79e984e06a9e45a605b Mon Sep 17 00:00:00 2001 From: Austin Eng <213reeses@gmail.com> Date: Tue, 20 Sep 2016 11:08:21 -0400 Subject: [PATCH 07/11] thrust scan --- src/main.cpp | 4 ++-- stream_compaction/thrust.cu | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index b20ffd0..10673c8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -67,13 +67,13 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..06ef696 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -13,9 +13,10 @@ namespace Thrust { * Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/ void scan(int n, int *odata, const int *idata) { - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::device_vector dv_in(idata, idata + n); + thrust::device_vector dv_out(odata, odata + n); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } From a15fc76abfc5a8e79bd16d9b72bdfb2c625ca19e Mon Sep 17 00:00:00 2001 From: Austin Eng <213reeses@gmail.com> Date: Thu, 22 Sep 2016 10:51:27 -0400 Subject: [PATCH 08/11] fix naive scan to double buffer --- src/main.cpp | 10 +++++-- stream_compaction/efficient.cu | 8 ++--- stream_compaction/naive.cu | 54 ++++++++++++++++++++++++---------- 3 files changed, 51 insertions(+), 21 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 10673c8..43f6c16 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,9 +14,11 @@ #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int SIZE = 1 << 24; const int NPOT = SIZE - 3; - int a[SIZE], b[SIZE], c[SIZE]; + int* a = new int[SIZE]; + int* b = new int[SIZE]; + int* c = new int[SIZE]; // Scan tests @@ -120,4 +122,8 @@ int main(int argc, char* argv[]) { count = StreamCompaction::Efficient::compact(NPOT, c, a); printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + + delete a; + delete b; + delete c; } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index fbf446d..fc06461 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -58,7 +58,7 @@ void scan(int n, int *odata, const int *idata) { // scan up for (int pow2d = 1, int threads = sizePow2; pow2d < sizePow2 / 2; pow2d *= 2, threads /= 2) { nBlocks = (threads + blockSize - 1) / blockSize; // threads / blockSize, rounded up - scan_up << < nBlocks, threads >> >(n, pow2d, dev_data, dev_data); + scan_up << < nBlocks, blockSize 
>> >(n, pow2d, dev_data, dev_data); } // set last item to 0 @@ -69,7 +69,7 @@ void scan(int n, int *odata, const int *idata) { // scan down for (int pow2d = pow(2, ilog2ceil(sizePow2) - 1), int threads = 1; pow2d >= 1; pow2d /= 2, threads *= 2) { nBlocks = (threads + blockSize - 1) / blockSize; // threads / blockSize, rounded up - scan_down << < nBlocks, threads >> >(sizePow2, pow2d, dev_data, dev_data); + scan_down << < nBlocks, blockSize >> >(sizePow2, pow2d, dev_data, dev_data); } cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); @@ -128,7 +128,7 @@ int compact(int n, int *odata, const int *idata) { // scan up for (int pow2d = 1, int threads = sizePow2; pow2d < sizePow2 / 2; pow2d *= 2, threads /= 2) { nBlocks = (threads + blockSize - 1) / blockSize; // threads / blockSize, rounded up - scan_up << < nBlocks, threads >> >(n, pow2d, dev_mask, dev_mask); + scan_up << < nBlocks, blockSize >> >(n, pow2d, dev_mask, dev_mask); } // set last item to 0 @@ -139,7 +139,7 @@ int compact(int n, int *odata, const int *idata) { // scan down for (int pow2d = pow(2, ilog2ceil(sizePow2) - 1), int threads = 1; pow2d >= 1; pow2d /= 2, threads *= 2) { nBlocks = (threads + blockSize - 1) / blockSize; // threads / blockSize, rounded up - scan_down << < nBlocks, threads >> >(sizePow2, pow2d, dev_mask, dev_mask); + scan_down << < nBlocks, blockSize >> >(sizePow2, pow2d, dev_mask, dev_mask); } // copy back last val so we know how many elements diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index b49153d..6aa424c 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,16 +1,25 @@ #include #include +#include #include "common.h" #include "naive.h" namespace StreamCompaction { namespace Naive { +__global__ void printArr(int n, const int* data) { + printf(" [ "); + for (int i = 0; i < n; ++i) { + printf("%3d ", data[i]); + } + printf("]\n"); +} + /* * Performs one iteration of a naive scan of N elements. 
pow2d = 2^depth */ -__global__ void naiveScanIteration(int N, int pow2d, int* odata, int* idata) { - int index = (blockIdx.x * blockDim.x) + threadIdx.x; +__global__ void naiveScanIteration(int N, int pow2d, int* odata, const int* idata) { + int index = blockIdx.x * blockDim.x + threadIdx.x; if (index >= N) return; if (index >= pow2d) { odata[index] = idata[index] + idata[index - pow2d]; @@ -21,32 +30,47 @@ __global__ void naiveScanIteration(int N, int pow2d, int* odata, const int* idata) { } } +__global__ void rshift(int n, int* odata, const int* idata) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + odata[index] = index == 0 ? 0 : idata[index - 1]; +} + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - int* dev_data; - cudaMalloc((void**)&dev_data, n * sizeof(int)); - checkCUDAError("cudaMalloc dev_data failed!"); - cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + int* dev_idata; + int* dev_odata; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); checkCUDAError("cudaMemcpy from idata to dev_data failed!"); const int blockSize = 128; const int nBlocks = (n + blockSize - 1) / blockSize; // n/blockSize, rounded up - for (int d = 0; d < ilog2ceil(n); ++d) { - naiveScanIteration << < nBlocks, blockSize >> >(n, pow(2, d), dev_data, dev_data); + for (int pow2d = 1; pow2d < n; pow2d *= 2) { + naiveScanIteration << < nBlocks, blockSize >> >(n, pow2d, dev_odata, dev_idata); + std::swap(dev_idata, dev_odata); } - cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + // convert to exclusive scan + rshift << < nBlocks, blockSize >> >(n, dev_odata, dev_idata); + + // we use dev_idata here because we swapped buffers +
cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); checkCUDAError("cudaMemcpy from dev_data to odata failed!"); - cudaFree(dev_data); - checkCUDAError("cudaFree dev_data failed!"); - for (int i = n-1; i > 0; --i) { - odata[i] = odata[i - 1]; - } - odata[0] = 0; + cudaFree(dev_idata); + checkCUDAError("cudaFree dev_idata failed!"); + + cudaFree(dev_odata); + checkCUDAError("cudaFree dev_odata failed!"); } } From 3978305a62096d74589cd78f3cf86283648621d3 Mon Sep 17 00:00:00 2001 From: Austin Eng <213reeses@gmail.com> Date: Fri, 23 Sep 2016 00:44:28 -0400 Subject: [PATCH 09/11] add output to README.md --- README.md | 71 +++++++++- calc_stats.js | 39 ++++++ src/main.cpp | 237 +++++++++++++++++---------------- stream_compaction/common.h | 15 +++ stream_compaction/cpu.cu | 11 ++ stream_compaction/efficient.cu | 34 ++--- stream_compaction/naive.cu | 5 + stream_compaction/thrust.cu | 2 + 8 files changed, 281 insertions(+), 133 deletions(-) create mode 100644 calc_stats.js diff --git a/README.md b/README.md index b71c458..89205c7 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,70 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Austin Eng +* Tested on: Windows 10, i7-4770K @ 3.50GHz 16GB, GTX 780 3072MB (Personal Computer) -### (TODO: Your README) - -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +## Output +``` +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 10 0 ] +==== cpu scan, power-of-two ==== +Elapsed: 10.0046ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 205473618 205473628 ] +==== cpu scan, non-power-of-two ==== +Elapsed: 9.0066ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 
205473524 205473568 ] + passed +==== naive scan, power-of-two ==== +Elapsed: 9.708448ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 205473618 205473628 ] + passed +==== naive scan, non-power-of-two ==== +Elapsed: 9.713088ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== +Elapsed: 4.019968ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 205473618 205473628 ] + passed +==== work-efficient scan, non-power-of-two ==== +Elapsed: 3.999136ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 205473524 205473568 ] + passed +==== thrust scan, power-of-two ==== +Elapsed: 0.906560ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 205473618 205473628 ] + passed +==== thrust scan, non-power-of-two ==== +Elapsed: 1.042912ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 205473524 205473568 ] + passed +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== +Elapsed: 17.0074ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== +Elapsed: 17.0071ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== cpu compact with scan ==== +Elapsed: 6.0037ms +Elapsed: 31.0118ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== work-efficient compact, power-of-two ==== +Elapsed: 5.496416ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== work-efficient compact, non-power-of-two ==== +Elapsed: 5.449856ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +``` \ No newline at end of file diff --git a/calc_stats.js b/calc_stats.js new file mode 100644 index 0000000..0110a97 --- /dev/null +++ b/calc_stats.js @@ -0,0 +1,39 @@ + +var fs = require("fs") + +fs.readFile(process.argv[2], function (err, data) { + + var stats = {} + + var output = data.toString(); + var re = /\n==== ([\s\S]+?) 
====[^=]+Elapsed: ([\.\d]+)ms/g + outputs = output.split("SIZE: "); + + for (var i = 1; i < outputs.length; ++i) { + var out = outputs[i]; + var size = parseFloat(out.match(/\d+/)[0]) + var match = re.exec(out) + while (match != null) { + + if (!(size in stats)) { + console.log('initing', size) + stats[size] = new Object() + } + if (!(match[1] in stats[size])) { + console.log('initing', size, match[1]) + stats[size][match[1]] = [0, 0] + } + stats[size][match[1]][0] += 1 + stats[size][match[1]][1] += parseFloat(match[2]) + + match = re.exec(out) + } + } + for (var i in stats) { + for (var j in stats[i]) { + stats[i][j] = stats[i][j][1] / stats[i][j][0] + } + } + console.log(stats) + +}) \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 43f6c16..e651f0a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,117 +13,130 @@ #include #include "testing_helpers.hpp" +void test(int SIZE) { + const int NPOT = SIZE - 3; + int* a = new int[SIZE]; + int* b = new int[SIZE]; + int* c = new int[SIZE]; + + // Scan tests + + printf("\n"); + printf("****************\n"); + printf("** SCAN TESTS **\n"); + printf("****************\n"); + + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + zeroArray(SIZE, b); + printDesc("cpu scan, power-of-two"); + StreamCompaction::CPU::scan(SIZE, b, a); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("cpu scan, non-power-of-two"); + StreamCompaction::CPU::scan(NPOT, c, a); + printArray(NPOT, b, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan, power-of-two"); + StreamCompaction::Naive::scan(SIZE, c, a); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan, non-power-of-two"); + StreamCompaction::Naive::scan(NPOT, c, a); + printArray(SIZE, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, 
power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("thrust scan, power-of-two"); + StreamCompaction::Thrust::scan(SIZE, c, a); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("thrust scan, non-power-of-two"); + StreamCompaction::Thrust::scan(NPOT, c, a); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + printf("\n"); + printf("*****************************\n"); + printf("** STREAM COMPACTION TESTS **\n"); + printf("*****************************\n"); + + // Compaction tests + + genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + int count, expectedCount, expectedNPOT; + + zeroArray(SIZE, b); + printDesc("cpu compact without scan, power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + expectedCount = count; + printArray(count, b, true); + printCmpLenResult(count, expectedCount, b, b); + + zeroArray(SIZE, c); + printDesc("cpu compact without scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + expectedNPOT = count; + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + zeroArray(SIZE, c); + printDesc("cpu compact with scan"); + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, power-of-two"); + count = StreamCompaction::Efficient::compact(SIZE, c, a); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, non-power-of-two"); 
+ count = StreamCompaction::Efficient::compact(NPOT, c, a); + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + delete a; + delete b; + delete c; +} + int main(int argc, char* argv[]) { - const int SIZE = 1 << 24; - const int NPOT = SIZE - 3; - int* a = new int[SIZE]; - int* b = new int[SIZE]; - int* c = new int[SIZE]; - - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - 
printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - delete a; - delete b; - delete c; + int SIZE = 1 << 23; + test(SIZE); + /*for (int pow = 4; pow < 24; ++pow) { + int SIZE = 1 << pow; + printf("====== SIZE: %d ======\n", SIZE); + + for (int i = 0; i < 100; ++i) { + test(SIZE); + } + }*/ + } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..91f358d 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -33,3 +33,18 @@ namespace Common { const int *idata, const int *bools, const int *indices); } } + +#define 
START_CUDA_TIMER() \ + cudaEvent_t start, stop; \ + cudaEventCreate(&start); \ + cudaEventCreate(&stop); \ + cudaEventRecord(start); + +#define STOP_CUDA_TIMER() \ + cudaEventRecord(stop); \ + cudaEventSynchronize(stop); \ + float milliseconds = 0; \ + cudaEventElapsedTime(&milliseconds, start, stop); \ + cudaEventDestroy(start); \ + cudaEventDestroy(stop); \ + printf("Elapsed: %fms\n", milliseconds); diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 579fbfc..f71b0f7 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,4 +1,6 @@ #include +#include +#include #include "cpu.h" namespace StreamCompaction { @@ -8,12 +10,15 @@ namespace CPU { * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { + auto begin = std::chrono::high_resolution_clock::now(); int total = 0; for (int i = 0; i < n; ++i) { int val = idata[i]; odata[i] = total; total += val; } + auto end = std::chrono::high_resolution_clock::now(); + std::cout << "Elapsed: " << std::chrono::duration_cast(end - begin).count() / 1000000.f << "ms" << std::endl; } /** @@ -22,10 +27,13 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { + auto begin = std::chrono::high_resolution_clock::now(); int idx = 0; for (int i = 0; i < n; ++i) { if (idata[i] != 0) odata[idx++] = idata[i]; } + auto end = std::chrono::high_resolution_clock::now(); + std::cout << "Elapsed: " << std::chrono::duration_cast(end - begin).count() / 1000000.f << "ms" << std::endl; return idx; } @@ -35,6 +43,7 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + auto begin = std::chrono::high_resolution_clock::now(); for (int i = 0; i < n; ++i) { odata[i] = idata[i] != 0 ? 
1 : 0; } @@ -44,6 +53,8 @@ int compactWithScan(int n, int *odata, const int *idata) { for (int i = 0; i < n; ++i) { odata[odata[i]] = idata[i]; } + auto end = std::chrono::high_resolution_clock::now(); + std::cout << "Elapsed: " << std::chrono::duration_cast(end - begin).count() / 1000000.f << "ms" << std::endl; return count; } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index fc06461..02627cc 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -17,7 +17,13 @@ namespace Efficient { __global__ void scan_up(int n, int pow2d, int *odata, const int *idata) { int index = 2 * pow2d * (blockIdx.x * blockDim.x + threadIdx.x + 1) - 1; if (index >= n) return; - odata[index] = idata[index - pow2d] + idata[index]; + + // set last value to 0 here to avoid cudaMemcpy + if (index == n - 1) { + odata[index] = 0; + } else { + odata[index] = idata[index - pow2d] + idata[index]; + } } __global__ void scan_down(int n, int pow2d, int *odata, const int *idata) { @@ -31,7 +37,7 @@ __global__ void scan_down(int n, int pow2d, int *odata, const int *idata) { __global__ void zero(int n, int *odata) { int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < 0) return; + if (index >= n) return; odata[index] = 0; } @@ -42,12 +48,13 @@ void scan(int n, int *odata, const int *idata) { int* dev_data; int sizePow2 = pow(2, ilog2ceil(n)); - const int blockSize = 128; + int blockSize = 128; int nBlocks; cudaMalloc((void**)&dev_data, sizePow2 * sizeof(int)); checkCUDAError("cudaMalloc dev_data failed!"); + blockSize = 32; // fill with 0 nBlocks = (sizePow2 + blockSize - 1) / blockSize; zero << > >(sizePow2, dev_data); @@ -55,22 +62,21 @@ void scan(int n, int *odata, const int *idata) { cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); checkCUDAError("cudaMemcpy from idata to dev_data failed!"); + blockSize = 128; + START_CUDA_TIMER() // scan up for (int pow2d = 1, int threads = sizePow2; pow2d < sizePow2 / 2; 
pow2d *= 2, threads /= 2) { nBlocks = (threads + blockSize - 1) / blockSize; // threads / blockSize, rounded up - scan_up << < nBlocks, blockSize >> >(n, pow2d, dev_data, dev_data); + scan_up << < nBlocks, blockSize >> >(sizePow2, pow2d, dev_data, dev_data); } - // set last item to 0 - int zero = 0; - cudaMemcpy(&dev_data[sizePow2 - 1], &zero, sizeof(int), cudaMemcpyHostToDevice); - checkCUDAError("cudaMemcpy copy zero failed!"); - + blockSize = 128; // scan down for (int pow2d = pow(2, ilog2ceil(sizePow2) - 1), int threads = 1; pow2d >= 1; pow2d /= 2, threads *= 2) { nBlocks = (threads + blockSize - 1) / blockSize; // threads / blockSize, rounded up scan_down << < nBlocks, blockSize >> >(sizePow2, pow2d, dev_data, dev_data); } + STOP_CUDA_TIMER() cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); checkCUDAError("cudaMemcpy from dev_data to odata failed!"); @@ -110,7 +116,7 @@ int compact(int n, int *odata, const int *idata) { checkCUDAError("cudaMemcpy from idata to dev_data failed!"); // printArr << <1, 1 >> >(n, dev_idata); - + START_CUDA_TIMER() // create mask nBlocks = (n + blockSize - 1) / blockSize; StreamCompaction::Common::kernMapToBoolean << > >(n, dev_mask, dev_idata); @@ -131,17 +137,13 @@ int compact(int n, int *odata, const int *idata) { scan_up << < nBlocks, blockSize >> >(n, pow2d, dev_mask, dev_mask); } - // set last item to 0 - int last = 0; - cudaMemcpy(&dev_mask[sizePow2 - 1], &last, sizeof(int), cudaMemcpyHostToDevice); - checkCUDAError("cudaMemcpy copy zero failed!"); - // scan down for (int pow2d = pow(2, ilog2ceil(sizePow2) - 1), int threads = 1; pow2d >= 1; pow2d /= 2, threads *= 2) { nBlocks = (threads + blockSize - 1) / blockSize; // threads / blockSize, rounded up scan_down << < nBlocks, blockSize >> >(sizePow2, pow2d, dev_mask, dev_mask); } + int last; // copy back last val so we know how many elements cudaMemcpy(&last, &dev_mask[sizePow2 - 1], sizeof(int), cudaMemcpyDeviceToHost); checkCUDAError("cudaMemcpy copy last 
failed!"); @@ -153,6 +155,8 @@ int compact(int n, int *odata, const int *idata) { nBlocks = (n + blockSize - 1) / blockSize; StreamCompaction::Common::kernScatter << > >(n, dev_odata, dev_idata, dev_odata, dev_mask); + STOP_CUDA_TIMER() + cudaMemcpy(odata, dev_odata, sizeof(int) * last, cudaMemcpyDeviceToHost); checkCUDAError("cudaMemcpy from dev_odata to odata failed!"); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 6aa424c..2b4f2a9 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -54,6 +54,8 @@ void scan(int n, int *odata, const int *idata) { const int blockSize = 128; const int nBlocks = (n + blockSize - 1) / blockSize; // n/blockSize, rounded up + START_CUDA_TIMER() + for (int pow2d = 1; pow2d < n; pow2d *= 2) { naiveScanIteration << < nBlocks, blockSize >> >(n, pow2d, dev_odata, dev_idata); std::swap(dev_idata, dev_odata); @@ -62,6 +64,9 @@ void scan(int n, int *odata, const int *idata) { // convert to exclusive scan rshift << > >(n, dev_odata, dev_idata); + STOP_CUDA_TIMER() + + // we use dev_idata here because we swapped buffers cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); checkCUDAError("cudaMemcpy from dev_data to odata failed!"); diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 06ef696..50d3c8c 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -15,7 +15,9 @@ namespace Thrust { void scan(int n, int *odata, const int *idata) { thrust::device_vector dv_in(idata, idata + n); thrust::device_vector dv_out(odata, odata + n); + START_CUDA_TIMER() thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + STOP_CUDA_TIMER() thrust::copy(dv_out.begin(), dv_out.end(), odata); } From 9cc656597649ebe66459a8b3e26e0cc3f664c95e Mon Sep 17 00:00:00 2001 From: Austin Eng <213reeses@gmail.com> Date: Fri, 23 Sep 2016 02:48:36 -0400 Subject: [PATCH 10/11] update readme --- README.md | 24 +++++++++++++++++++++++- 1 file changed, 23 
insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 89205c7..701eba8 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,29 @@ CUDA Stream Compaction * Austin Eng * Tested on: Windows 10, i7-4770K @ 3.50GHz 16GB, GTX 780 3072MB (Personal Computer) -## Output +## Analysis + +**Note: Reported graphs are the result of 100 trials, averaged. Also note that input sizes are at increasing powers of two. Furthermore, since the algorithm is exponential in growth, both axes are displayed at a log scale** + +![Scan Analysis](https://docs.google.com/spreadsheets/d/1x1MppbyAceIIrwhDLsmV7unUYS2RYU_I20_wK0ReORY/pubchart?oid=175703576&format=image) + +![Compaction Analysis](https://docs.google.com/spreadsheets/d/1x1MppbyAceIIrwhDLsmV7unUYS2RYU_I20_wK0ReORY/pubchart?oid=477396612&format=image) + +## Analysis + +For smaller input sizes, the CPU implementation for both Scan and Stream Compaction is much, much faster than the GPU implementation. When dealing with contiguous buffers of memory, the CPU reaps large benefits from cache which makes it very fast. However, at around 2^19 in input size, the more efficient GPU implementations begin to outperform the CPU. With only a single core, CPU performance becomes worse as the number of computations required increases exponentially. + +Meanwhile, on the GPU, the exponent of this algorithmic growth is divided by the number of cores so there is much slower growth. However, there is a larger cost from memory access so the GPU implementations are much slower for lower input sizes because of this memory overhead. Memory usage, however, increases linearly not exponentially, so for larger sets of data, the GPU wins with performance. + +In comparing the Naive and Efficient GPU implementations, we see that for smaller datasets, the Naive implementation is faster. This is probably because fewer kernel invocations are made. Even though there are an exponential number of additions, this still takes less time. 
However, as the input size increases, the much more computationally-efficient method performs better. + +I did not see much difference between power-of-two input sizes and non-power-of-two data sizes. This is likely because my implementation just increases the size of non-power-of-two inputs to be power-of-two inputs. + +### Why is Thrust So Fast? + +It seems like the Thrust implementation receives a big performance boost from using shared memory. From the names of the function calls: `accumulate_tiles, exclusive_scan, exclusive_downsweep` it seems like Thrust is doing the same thing as the Efficient implementation except the `accumulate_tiles` calls have 32 and 4064 static and dynamic bytes of shared memory, respectively. `exclusive_scan`: 48 and 12240. `exclusive_downsweep`: 32 and 6880. This probably allows for much more efficient memory access in the kernel. Analysis also shows that each of the kernels is called twice, notably wrapped in `cuda_task` and `parallel_group`. This is probably done because the computation needs to be split into multiple pieces since shared memory can only be so large. + +## Test Output ``` **************** ** SCAN TESTS ** From 4d1a7314b7dafcfecdaf3112121971d5c2cab2df Mon Sep 17 00:00:00 2001 From: Austin Eng <213reeses@gmail.com> Date: Mon, 26 Sep 2016 22:37:10 -0400 Subject: [PATCH 11/11] update Readme --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 701eba8..a3860b3 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,6 @@ CUDA Stream Compaction ![Compaction Analysis](https://docs.google.com/spreadsheets/d/1x1MppbyAceIIrwhDLsmV7unUYS2RYU_I20_wK0ReORY/pubchart?oid=477396612&format=image) -## Analysis - For smaller input sizes, the CPU implementation for both Scan and Stream Compaction is much, much faster than the GPU implementation. When dealing with contiguous buffers of memory, the CPU reaps large benefits from cache which makes it very fast. 
However, at around 2^19 in input size, the more efficient GPU implementations begin to outperform the CPU. With only a single core, CPU performance becomes worse as the number of computations required increases exponentially. Meanwhile, on the GPU, the exponent of this algorithmic growth is divided by the number of cores so there is much slower growth. However, there is a larger cost from memory access so the GPU implementations are much slower for lower input sizes because of this memory overhead. Memory usage, however, increases linearly not exponentially, so for larger sets of data, the GPU wins with performance. @@ -24,6 +22,11 @@ In comparing the Naive and Efficient GPU implementations, we see that for smalle I did not see much difference between power-of-two input sizes and non-power-of-two data sizes. This is likely because my implementation just increases the size of non-power-of-two inputs to be power-of-two inputs. +### More Efficient `Efficient` Scan + +It turns out that my initial implementation of the Efficient scan was the extra credit implementation. Instead of launching the same number of threads for the upsweep and downsweep, we decrease the number to avoid wasted threads and increase occupancy. Why? For the upsweep, after every iteration we need half as many threads. The others don't do anything. For the downsweep, our first iteration uses just 1 thread and each subsequent iteration doubles this number. A more efficient way to implement Efficient scan is to launch only the number of threads needed and have your calculated thread index jump by a power of two. So: `index = 2^d * (blockIdx.x * blockDim.x + threadIdx.x)`. Now our indices will jump 2 -- 4 -- 6 -- 8 or 16 -- 32 -- 48 -- 64, etc. We can launch only the needed number of threads instead of launching `n` threads and using far, far fewer than half of them. + + ### Why is Thrust So Fast? It seems like the Thrust implementation receives a big performance boost from using shared memory. 
From the names of the function calls: `accumulate_tiles, exclusive_scan, exclusive_downsweep` it seems like Thrust is doing the same thing as the Efficient implementation except the `accumulate_tiles` calls have 32 and 4064 static and dynamic bytes of shared memory, respectively. `exclusive_scan`: 48 and 12240. `exclusive_downsweep`: 32 and 6880. This probably allows for much more efficient memory access in the kernel. Analysis also shows that each of the kernels is called twice, notably wrapped in `cuda_task` and `parallel_group`. This is probably done because the computation needs to be split into multiple pieces since shared memory can only be so large.