From a505563a3dddd9755fc08493589108978417a344 Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Thu, 25 Sep 2014 16:42:16 -0400 Subject: [PATCH 01/18] naive --- src/main.cpp | 45 ++++++++ src/streamCompaction.cu | 227 ++++++++++++++++++++++++++++++++++++++++ src/streamCompaction.h | 64 +++++++++++ 3 files changed, 336 insertions(+) create mode 100644 src/main.cpp create mode 100644 src/streamCompaction.cu create mode 100755 src/streamCompaction.h diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000..726232d --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,45 @@ +#include +#include + +#include "streamCompaction.h" + +using namespace std; + +void naive(){ + int numElements = 25; + + dataPacket * ints = new dataPacket[numElements]; + for (int i=0; i 0){ + int toKill = rand() % ds.numAlive(); + ds.kill(toKill); + ds.compact (); + + cout<<"killing "< +#include +#include +#include +#include + +#include "streamCompaction.h" + +__global__ void sum(int* in, int* out, int n, int d1){ + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (k=d1){ + out[k] = in[k-d1] + ink; + } + else{ + out[k] = ink; + } + } +} + +__global__ void shift(int* in, int* out, int n){ + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + + out[0] = 0; + if (k0){ + out[k] = in[k-1]; + } +} + +__global__ void streamCompaction(dataPacket* inRays, int* indices, dataPacket* outRays, int numElements){ + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (k>>(testin, testout, numElements, int(pow(2.0f,d-1))); + cudaThreadSynchronize(); + cudaMemcpy(cputest, testout, numElements*sizeof(int), cudaMemcpyDeviceToHost); + + + int* temp = testin; + testin=testout; + testout=temp; + } + //Compact + streamCompaction<<>>(cudaArrayA, testin, cudaArrayB, numElements); + cudaArrayA = cudaArrayB; + cudaThreadSynchronize(); + + cudaMemcpy(&numElements, &testin[numElements-1], 1*sizeof(int), cudaMemcpyDeviceToHost); + + std::cout<<"number of rays left: "<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive, powf(2.0f, d-1)); + int* temp = cudaIndicesA; + cudaIndicesA = cudaIndicesB; + cudaIndicesB = temp; + } + + //Stream compation from A into B, then save back into A + streamCompaction<<>>(cudaDataA, cudaIndicesA, cudaDataB, m_numElementsAlive); + dataPacket * temp = cudaDataA; + cudaDataA = cudaDataB; + cudaDataB = temp; + + // update numrays + cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); +} + +bool DataStream::getData(int index, dataPacket& data){ + + if (index > m_numElements) return false; + + data = m_data[index]; + return true; +} + +int DataStream::numAlive(){ + return m_numElementsAlive; +} + +void DataStream::fetchDataFromGPU(){ + cudaMemcpy(m_data, cudaDataA, m_numElementsAlive*sizeof(dataPacket), cudaMemcpyDeviceToHost); +} + +void DataStream::kill(int index){ + if (index > m_numElementsAlive) return; + + dim3 threadsPerBlockL(64); + dim3 fullBlocksPerGridL(int(ceil(float(m_numElementsAlive)/64.0f))); + + killStream<<>>(index, cudaDataA, cudaIndicesA, m_numElementsAlive); +} \ No newline at end of file diff --git a/src/streamCompaction.h b/src/streamCompaction.h new file mode 100755 index 0000000..4353c58 --- /dev/null +++ b/src/streamCompaction.h @@ -0,0 +1,64 @@ +// CIS565 CUDA Raytracer: A parallel raytracer for Patrick Cozzi's CIS565: GPU Computing at the University of Pennsylvania +// Written by Yining Karl Li, Copyright (c) 2012 University of Pennsylvania +// This file includes code from: +// Rob Farber for CUDA-GL interop, from CUDA 
Supercomputing For The Masses: http://www.drdobbs.com/architecture-and-design/cuda-supercomputing-for-the-masses-part/222600097 +// Peter Kutz and Yining Karl Li's GPU Pathtracer: http://gpupathtracer.blogspot.com/ +// Yining Karl Li's TAKUA Render, a massively parallel pathtracing renderer: http://www.yiningkarlli.com + +#ifndef STREAM_COMPACTION_H +#define STREAM_COMPACTION_H + +#include +#include +#include +#include +#include +#include + +struct dataPacket{ + int index; + bool alive; + dataPacket(){ + index = -1; + alive = true; + } + dataPacket(int i){ + index = i; + alive = true; + } +}; + +class DataStream{ +private: + + dataPacket * m_data; + + int m_numElementsAlive, m_numElements; + + dataPacket * cudaDataA; + dataPacket * cudaDataB; + + int * cudaIndicesA; + int * cudaIndicesB; + +public: + int * m_indices; + + DataStream(int numElements, dataPacket * data); + ~DataStream(); + + void compact(); + bool getData(int index, dataPacket& data); + int numAlive(); + void kill(int index); + void fetchDataFromGPU(); + +}; + +void cudaVectorSum(int * indicesA, int * indicesB, int numElements, float k); + +void cudaInit (dataPacket * a, dataPacket * b, int * ia, int * ib); + +void testStreamCompaction(); + +#endif From b173957c0e9773f52d6963c8acd723883263ef9c Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Fri, 26 Sep 2014 16:55:32 -0400 Subject: [PATCH 02/18] second --- src/main.cpp | 28 +++++++++++ src/streamCompaction.cu | 104 ++++++++++++++++++++++++++++++---------- 2 files changed, 108 insertions(+), 24 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 726232d..435f5b1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -38,8 +38,36 @@ void naive(){ } } +void test(){ + int numElements = 16; + + dataPacket * ints = new dataPacket[numElements]; + for (int i=0; i0){ - out[k] = in[k-1]; +__global__ void test(int* in, int* out, int n){ + + extern __shared__ float temp[]; + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + int offset = 1; + + if (2*index+1<=n){ + temp[2*index] = in[2*index]; + temp[2*index+1] = in[2*index+1]; + + for (int d = n>>1; d>0; d >>= 1){ + //for (int d=0; d>= 1; + __syncthreads(); + if (index < d){ + + int ai = offset * (2*index+1) - 1; + int bi = offset * (2*index+2) - 1; + + if (ai < n && bi < n){ + float t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + } + __syncthreads(); + + out[2*index] = temp[2*index]; + out[2*index+1] = temp[2*index+1]; } + } __global__ void streamCompaction(dataPacket* inRays, int* indices, dataPacket* outRays, int numElements){ @@ -180,25 +225,36 @@ DataStream::~DataStream(){ } void DataStream::compact(){ - dim3 threadsPerBlockL(64); - dim3 fullBlocksPerGridL(int(ceil(float(m_numElementsAlive)/64.0f))); - - // scan algorithm - for (int d=1; d<=ceil(log(m_numElementsAlive)/log(2)); d++){ - sum<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive, powf(2.0f, d-1)); - int* temp = cudaIndicesA; - cudaIndicesA = cudaIndicesB; - cudaIndicesB = temp; - } - - //Stream compation from A into B, then save back into A - streamCompaction<<>>(cudaDataA, cudaIndicesA, cudaDataB, m_numElementsAlive); - dataPacket * temp = cudaDataA; - cudaDataA = cudaDataB; - cudaDataB = temp; - // update numrays - cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); + int numElements = m_numElementsAlive; + int threadsPerBlock = 64; + int procsPefBlock = threadsPerBlock*2; + + dim3 initialScanThreadsPerBlock(threadsPerBlock/2); + dim3 initialScanBlocksPerGrid(numElements/threadsPerBlock); + + 
dim3 threadsPerBlockL(threadsPerBlock); + dim3 fullBlocksPerGridL(int(ceil(float(m_numElementsAlive)/float(threadsPerBlock)))); + + // // scan algorithm + // for (int d=1; d<=ceil(log(m_numElementsAlive)/log(2)); d++){ + // sum<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive, powf(2.0f, d-1)); + // int* temp = cudaIndicesA; + // cudaIndicesA = cudaIndicesB; + // cudaIndicesB = temp; + // } + + test<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive); + checkCUDAError("kernel failed!"); + cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); + // //Stream compation from A into B, then save back into A + // streamCompaction<<>>(cudaDataA, cudaIndicesA, cudaDataB, m_numElementsAlive); + // dataPacket * temp = cudaDataA; + // cudaDataA = cudaDataB; + // cudaDataB = temp; + + // // update numrays + // cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); } bool DataStream::getData(int index, dataPacket& data){ From 12f97c14aa54ba54610021de1b4d8e7fef3c28dd Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Fri, 26 Sep 2014 20:00:25 -0400 Subject: [PATCH 03/18] sad --- src/main.cpp | 16 ++++-- src/streamCompaction.cu | 124 +++++++++++++++++++++++++++++++++++++--- src/streamCompaction.h | 6 ++ 3 files changed, 133 insertions(+), 13 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 435f5b1..7a70986 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -39,7 +39,7 @@ void naive(){ } void test(){ - int numElements = 16; + int numElements = 32; dataPacket * ints = new dataPacket[numElements]; for (int i=0; i>1; d>0; d >>= 1){ + __syncthreads(); + if (index < d){ + int ai = offset * (2*index+1) - 1; + int bi = offset * (2*index+2) - 1; + + temp[bi] += temp[ai]; + } + offset *= 2; + } + + if (index == 0){ + if (sums) sums[blockIdx.x] = temp[16-1]; + temp[n - 1] = 0; + } + + for (int d = 1; d>= 1; + __syncthreads(); + if (index < d){ + + int ai = offset * (2*index+1) - 1; + int bi = offset * (2*index+2) - 1; + + if (ai < n && bi < n){ + float t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + } + __syncthreads(); + + out[2*realIndex] = temp[2*index]; + out[2*realIndex+1] = temp[2*index+1]; + +} + +__global__ void addIncs(int* cudaAuxIncs, int* cudaIndicesB, int n){ + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + + // if (index < n){ + // cudaIndicesB[index] = blockIdx.x; //cudaAuxIncs[blockIdx.x]; + cudaIndicesB[index] += cudaAuxIncs[blockIdx.x]; + // } +} + __global__ void streamCompaction(dataPacket* inRays, int* indices, dataPacket* outRays, int numElements){ int k = (blockIdx.x * blockDim.x) + threadIdx.x; @@ -195,11 +258,32 @@ void testStreamCompaction(){ DataStream::DataStream(int numElements, dataPacket * data){ m_data = data; m_numElementsAlive = numElements; + + // if (numElements % THREADS_PER_BLOCK*2 != 0){ + // int counter = 1; + // while (THREADS_PER_BLOCK*2*counter < numElements){ + // counter += 1; + // } + // numElements = THREADS_PER_BLOCK*2*counter; + // } + + // cout<>>(cudaIndicesA, cudaIndicesB, procsPefBlock, cudaAuxSums); + checkCUDAError("kernel failed!"); + + test2<<>>(cudaAuxSums, cudaAuxIncs, sumSize); + checkCUDAError("kernel failed!"); + + addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); + checkCUDAError("kernel failed!"); + + cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(m_auxSums, cudaAuxIncs, m_numElements/(THREADS_PER_BLOCK*2)*sizeof(int), cudaMemcpyDeviceToHost); + // // scan algorithm // for 
(int d=1; d<=ceil(log(m_numElementsAlive)/log(2)); d++){ // sum<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive, powf(2.0f, d-1)); @@ -244,9 +352,9 @@ void DataStream::compact(){ // cudaIndicesB = temp; // } - test<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive); - checkCUDAError("kernel failed!"); - cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); + // test<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive); + // checkCUDAError("kernel failed!"); + // cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); // //Stream compation from A into B, then save back into A // streamCompaction<<>>(cudaDataA, cudaIndicesA, cudaDataB, m_numElementsAlive); // dataPacket * temp = cudaDataA; diff --git a/src/streamCompaction.h b/src/streamCompaction.h index 4353c58..6f6ae2f 100755 --- a/src/streamCompaction.h +++ b/src/streamCompaction.h @@ -15,6 +15,8 @@ #include #include +#define THREADS_PER_BLOCK 8 + struct dataPacket{ int index; bool alive; @@ -40,9 +42,13 @@ class DataStream{ int * cudaIndicesA; int * cudaIndicesB; + + int * cudaAuxSums; + int * cudaAuxIncs; public: int * m_indices; + int * m_auxSums; DataStream(int numElements, dataPacket * data); ~DataStream(); From 7dae4e04d967fa88315e5643f77e3dbfdf97373a Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Fri, 26 Sep 2014 20:07:04 -0400 Subject: [PATCH 04/18] omg the multi block version is done --- src/main.cpp | 2 +- src/streamCompaction.cu | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 7a70986..af146cf 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -39,7 +39,7 @@ void naive(){ } void test(){ - int numElements = 32; + int numElements = 2048; dataPacket * ints = new dataPacket[numElements]; for (int i=0; i>>(cudaAuxSums, cudaAuxIncs, sumSize); checkCUDAError("kernel failed!"); - addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); + addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); checkCUDAError("kernel failed!"); cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); From c954ddf22e2d581789781a78c9b82d2ca78a820d Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Sat, 27 Sep 2014 15:46:13 -0400 Subject: [PATCH 05/18] all sums complete --- src/main.cpp | 79 ++++++++++++++++-- src/streamCompaction.cu | 180 ++++++++++++++++++++++++++++++++++++---- src/streamCompaction.h | 7 +- 3 files changed, 243 insertions(+), 23 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index af146cf..e0c0de1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -21,7 +21,7 @@ void naive(){ while(ds.numAlive () > 0){ int toKill = rand() % ds.numAlive(); ds.kill(toKill); - ds.compact (); + ds.compactWorkEfficientArbitrary (); cout<<"killing "<= n) return; + + for (int offset = 1; offset < n; offset *= 2){ + int* temp = in; + in = out; + out = temp; + + if (index >= offset){ + out[index] = in[index-offset] + in[index]; + } + else{ + out[index] = in[index]; + } + __syncthreads(); + } + + if (index>0) in[index] = out[index-1]; + else in[index] = 0; + +} + +__global__ void naiveSumSharedSingleBlock(int* in, int* out, int n){ + + int index = threadIdx.x; + + if (index >= n) return; + + extern __shared__ int shared[]; + int *tempIn = &shared[0]; + int *tempOut = &shared[n]; + + tempOut[index] = (index > 0) ? 
in[index-1] : 0; + + __syncthreads(); + + for (int offset = 1; offset < n; offset *= 2){ + int* temp = tempIn; + tempIn = tempOut; + tempOut = temp; + + if (index >= offset){ + tempOut[index] = tempIn[index-offset] + tempIn[index]; + } + else{ + tempOut[index] = tempIn[index]; + } + __syncthreads(); + } + out[index] = tempOut[index]; +} + +__global__ void naiveSumSharedArbitrary(int* in, int* out, int n, int* sums=0){ + + int localIndex = threadIdx.x; + int globalIndex = (blockIdx.x * blockDim.x) + threadIdx.x; + + // if (globalIndex >= n) return; + + // out[k] = index; return; + + extern __shared__ int shared[]; + int *tempIn = &shared[0]; + int *tempOut = &shared[n]; + + // tempOut[localIndex] = (localIndex > 0) ? in[localIndex-1] : 0; + tempOut[localIndex] = in[globalIndex]; + + __syncthreads(); + + for (int offset = 1; offset < n; offset *= 2){ + int* temp = tempIn; + tempIn = tempOut; + tempOut = temp; + + if (localIndex >= offset){ + tempOut[localIndex] = tempIn[localIndex-offset] + tempIn[localIndex]; + } + else{ + tempOut[localIndex] = tempIn[localIndex]; + } + __syncthreads(); + } + + if (sums) sums[blockIdx.x] = tempOut[n-1]; + if (localIndex>0) out[globalIndex] = tempOut[localIndex-1]; + else out[globalIndex] = 0; +} + +__global__ void workEfficientSumSingleBlock(int* in, int* out, int n){ extern __shared__ float temp[]; int index = (blockIdx.x * blockDim.x) + threadIdx.x; @@ -40,7 +132,6 @@ __global__ void test(int* in, int* out, int n){ temp[2*index+1] = in[2*index+1]; for (int d = n>>1; d>0; d >>= 1){ - //for (int d=0; d>>(cudaIndicesA, cudaIndicesB, procsPefBlock, cudaAuxSums); + workEfficientArbitrary<<>>(cudaIndicesA, cudaIndicesB, procsPefBlock, cudaAuxSums); checkCUDAError("kernel failed!"); - test2<<>>(cudaAuxSums, cudaAuxIncs, sumSize); + workEfficientArbitrary<<>>(cudaAuxSums, cudaAuxIncs, sumSize); checkCUDAError("kernel failed!"); addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); @@ -368,6 +457,65 @@ void DataStream::compact(){ // cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); } +void DataStream::compactNaiveSumGlobal(){ + + int threadsPerBlock = THREADS_PER_BLOCK; + + dim3 threadsPerBlockL(threadsPerBlock); + dim3 fullBlocksPerGridL(int(ceil(float(m_numElements)/float(threadsPerBlock)))); + + naiveSumGlobal<<>>(cudaIndicesA, cudaIndicesB, m_numElements); + checkCUDAError("kernel failed!"); + + cudaMemcpy(m_indices, cudaIndicesA, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); + +} + +void DataStream::compactNaiveSumSharedSingleBlock(){ + + int threadsPerBlock = THREADS_PER_BLOCK; + + dim3 threadsPerBlockL(threadsPerBlock); + dim3 fullBlocksPerGridL(int(ceil(float(m_numElementsAlive)/float(threadsPerBlock)))); + + naiveSumSharedSingleBlock<<>>(cudaIndicesA, cudaIndicesB, m_numElements); + checkCUDAError("kernel failed!"); + + cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); + +} + +void DataStream::compactNaiveSumSharedArbitrary(){ + + int threadsPerBlock = THREADS_PER_BLOCK; + + dim3 threadsPerBlockL(threadsPerBlock*2); + dim3 fullBlocksPerGridL(m_numElements/(threadsPerBlock*2)); + + naiveSumSharedArbitrary<<>>(cudaIndicesA, cudaIndicesB, threadsPerBlock*2, cudaAuxSums); + checkCUDAError("kernel failed 1 !"); + + int sumSize = m_numElements/(THREADS_PER_BLOCK*2); + dim3 initialScanThreadsPerBlock2(threadsPerBlock); + dim3 initialScanBlocksPerGrid2(sumSize/threadsPerBlock+1); + + dim3 threadsPerBlockOld(threadsPerBlock); + dim3 
fullBlocksPerGridOld(int(ceil(float(sumSize)/float(threadsPerBlock)))); + + cudaMemcpy(cudaAuxIncs, cudaAuxSums, m_numElements/(THREADS_PER_BLOCK*2)*sizeof(int), cudaMemcpyDeviceToDevice); + + naiveSumGlobal<<>>(cudaAuxSums, cudaAuxIncs, sumSize); + checkCUDAError("kernel failed 2 !"); + + addIncs<<>>(cudaAuxSums, cudaIndicesB, m_numElements); + checkCUDAError("kernel failed!"); + + cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("kernel failed 3 !"); + cudaMemcpy(m_auxSums, cudaAuxSums, m_numElements/(THREADS_PER_BLOCK*2)*sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("kernel failed 4 !"); +} + bool DataStream::getData(int index, dataPacket& data){ if (index > m_numElements) return false; diff --git a/src/streamCompaction.h b/src/streamCompaction.h index 6f6ae2f..a5af67e 100755 --- a/src/streamCompaction.h +++ b/src/streamCompaction.h @@ -15,7 +15,7 @@ #include #include -#define THREADS_PER_BLOCK 8 +#define THREADS_PER_BLOCK 64 struct dataPacket{ int index; @@ -53,7 +53,10 @@ class DataStream{ DataStream(int numElements, dataPacket * data); ~DataStream(); - void compact(); + void compactWorkEfficientArbitrary(); + void compactNaiveSumGlobal(); + void compactNaiveSumSharedSingleBlock(); + void compactNaiveSumSharedArbitrary(); bool getData(int index, dataPacket& data); int numAlive(); void kill(int index); From 5749a514c11b6b21a2b95deba6b34e4b926a7eac Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Sat, 27 Sep 2014 18:57:40 -0400 Subject: [PATCH 06/18] compact for naive and shared --- src/main.cpp | 138 ++++++++++++++++++++++++++++++++++++---- src/streamCompaction.cu | 130 ++++++++++++++++++++++++++++--------- src/streamCompaction.h | 11 +++- 3 files changed, 234 insertions(+), 45 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index e0c0de1..1fd08d9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -5,6 +5,27 @@ using namespace std; +void serialSum(){ + int numElements = 256; + + dataPacket * ints = new dataPacket[numElements]; + for (int i=0; i 0 && bound < 1){ + int toKill = rand() % ds.numAlive(); + toKill = 10; + ds.kill(toKill); + ds.compactNaiveSumGlobal (); + + dataPacket cur; + ds.getData(toKill, cur); + cout<<"killing "< 0 && bound < 10){ + int toKill = rand() % ds.numAlive(); + // toKill = 10; + ds.kill(toKill); + ds.compactNaiveSumSharedArbitrary (); + + dataPacket cur; + ds.getData(toKill, cur); + cout<<"killing "<=d1){ @@ -29,17 +28,24 @@ __global__ void sum(int* in, int* out, int n, int d1){ } } +__global__ void shift(int* in, int* out, int n){ + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + + out[0] = 0; + if (k0){ + out[k] = in[k-1]; + } +} + __global__ void naiveSumGlobal(int* in, int* out, int n){ int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index >= n) return; - - for (int offset = 1; offset < n; offset *= 2){ - int* temp = in; - in = out; - out = temp; - + int logn = ceil(log(float(n))/log(2.0f)); + for (int d=1; d<=logn; d++){ + + int offset = powf(2.0f, d-1); + if (index >= offset){ out[index] = in[index-offset] + in[index]; } @@ -47,11 +53,11 @@ __global__ void naiveSumGlobal(int* in, int* out, int n){ out[index] = in[index]; } __syncthreads(); - } - - if (index>0) in[index] = out[index-1]; - else in[index] = 0; + int* temp = in; + in = out; + out = temp; + } } __global__ void naiveSumSharedSingleBlock(int* in, int* out, int n){ @@ -68,7 +74,7 @@ __global__ void naiveSumSharedSingleBlock(int* in, int* out, int n){ __syncthreads(); - for (int offset = 1; offset < n; offset *= 
2){ + for (int offset = 1; offset <= n; offset *= 2){ int* temp = tempIn; tempIn = tempOut; tempOut = temp; @@ -97,7 +103,6 @@ __global__ void naiveSumSharedArbitrary(int* in, int* out, int n, int* sums=0){ int *tempIn = &shared[0]; int *tempOut = &shared[n]; - // tempOut[localIndex] = (localIndex > 0) ? in[localIndex-1] : 0; tempOut[localIndex] = in[globalIndex]; __syncthreads(); @@ -117,8 +122,7 @@ __global__ void naiveSumSharedArbitrary(int* in, int* out, int n, int* sums=0){ } if (sums) sums[blockIdx.x] = tempOut[n-1]; - if (localIndex>0) out[globalIndex] = tempOut[localIndex-1]; - else out[globalIndex] = 0; + out[globalIndex] = tempOut[localIndex]; } __global__ void workEfficientSumSingleBlock(int* in, int* out, int n){ @@ -234,7 +238,7 @@ __global__ void streamCompaction(dataPacket* inRays, int* indices, dataPacket* o if (k>>(in, out, m_numElementsAlive, powf(2.0f, d-1)); + cudaThreadSynchronize(); + int* temp = in; + in = out; + out = temp; + } + shift<<>>(in, out, m_numElementsAlive); +} + void DataStream::compactWorkEfficientArbitrary(){ int numElements = m_numElementsAlive; @@ -462,12 +489,26 @@ void DataStream::compactNaiveSumGlobal(){ int threadsPerBlock = THREADS_PER_BLOCK; dim3 threadsPerBlockL(threadsPerBlock); - dim3 fullBlocksPerGridL(int(ceil(float(m_numElements)/float(threadsPerBlock)))); + dim3 fullBlocksPerGridL(m_numElements/threadsPerBlock); - naiveSumGlobal<<>>(cudaIndicesA, cudaIndicesB, m_numElements); - checkCUDAError("kernel failed!"); + for (int d=1; d<=ceil(log(m_numElementsAlive)/log(2)); d++){ + sum<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive, powf(2.0f, d-1)); + cudaThreadSynchronize(); + int* temp = cudaIndicesA; + cudaIndicesA = cudaIndicesB; + cudaIndicesB = temp; + } + shift<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive); - cudaMemcpy(m_indices, cudaIndicesA, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); + //Stream compation from A into B, then save back into A + streamCompaction<<>>(cudaDataA, cudaIndicesB, cudaDataB, m_numElementsAlive); + dataPacket * temp = cudaDataA; + cudaDataA = cudaDataB; + cudaDataB = temp; + + // update numrays + cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); + cout<>>(cudaIndicesA, cudaIndicesB, threadsPerBlock*2, cudaAuxSums); checkCUDAError("kernel failed 1 !"); + //////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////////////// int sumSize = m_numElements/(THREADS_PER_BLOCK*2); dim3 initialScanThreadsPerBlock2(threadsPerBlock); dim3 initialScanBlocksPerGrid2(sumSize/threadsPerBlock+1); @@ -504,16 +548,38 @@ void DataStream::compactNaiveSumSharedArbitrary(){ cudaMemcpy(cudaAuxIncs, cudaAuxSums, m_numElements/(THREADS_PER_BLOCK*2)*sizeof(int), cudaMemcpyDeviceToDevice); - naiveSumGlobal<<>>(cudaAuxSums, cudaAuxIncs, sumSize); - checkCUDAError("kernel failed 2 !"); - - addIncs<<>>(cudaAuxSums, cudaIndicesB, m_numElements); - checkCUDAError("kernel failed!"); - - cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); - checkCUDAError("kernel failed 3 !"); - cudaMemcpy(m_auxSums, cudaAuxSums, m_numElements/(THREADS_PER_BLOCK*2)*sizeof(int), cudaMemcpyDeviceToHost); - checkCUDAError("kernel failed 4 !"); + for (int d=1; d<=ceil(log(sumSize)/log(2)); d++){ + sum<<>>(cudaAuxSums, cudaAuxIncs, sumSize, powf(2.0f, d-1)); + cudaThreadSynchronize(); + int* temp = cudaAuxSums; + cudaAuxSums = cudaAuxIncs; + 
cudaAuxIncs = temp; + } + shift<<>>(cudaAuxSums, cudaAuxIncs, m_numElementsAlive); + + addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); + + shift<<>>(cudaIndicesB, cudaIndicesA, m_numElementsAlive); + int * temp = cudaIndicesA; + cudaIndicesA = cudaIndicesB; + cudaIndicesB = temp; + //////////////////////////////////////////////////////////////////////////////////////// + + //////////////////////////////////////////////////////////////////////////////////////// + dim3 threadsPerBlockLL(threadsPerBlock); + dim3 fullBlocksPerGridLL(m_numElements/threadsPerBlock); + + //Stream compation from A into B, then save back into A + streamCompaction<<>>(cudaDataA, cudaIndicesB, cudaDataB, m_numElementsAlive); + dataPacket * tempDP = cudaDataA; + cudaDataA = cudaDataB; + cudaDataB = tempDP; + + // // update numrays + //////////////////////////////////////////////////////////////////////////////////////// + cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); + cout<>>(index, cudaDataA, cudaIndicesA, m_numElementsAlive); + + cudaMemcpy(m_indices, cudaIndicesA, m_numElementsAlive*sizeof(int), cudaMemcpyDeviceToHost); } \ No newline at end of file diff --git a/src/streamCompaction.h b/src/streamCompaction.h index a5af67e..2dd3872 100755 --- a/src/streamCompaction.h +++ b/src/streamCompaction.h @@ -15,7 +15,7 @@ #include #include -#define THREADS_PER_BLOCK 64 +#define THREADS_PER_BLOCK 8 struct dataPacket{ int index; @@ -35,7 +35,7 @@ class DataStream{ dataPacket * m_data; - int m_numElementsAlive, m_numElements; + dataPacket * cudaDataA; dataPacket * cudaDataB; @@ -46,13 +46,20 @@ class DataStream{ int * cudaAuxSums; int * cudaAuxIncs; + void globalSum(int* in, int* out, int n); + public: int * m_indices; int * m_auxSums; + int m_numElementsAlive, m_numElements; + DataStream(int numElements, dataPacket * data); ~DataStream(); + void serialScan(); + void serialScatter(); + void compactWorkEfficientArbitrary(); void compactNaiveSumGlobal(); void compactNaiveSumSharedSingleBlock(); From 7b101112de4cd7c0007bafd534409ba1615c7d95 Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Sat, 27 Sep 2014 18:58:18 -0400 Subject: [PATCH 07/18] compact for naive and shared --- src/main.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 1fd08d9..00f2e8d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -232,12 +232,6 @@ void workEfficientArbitrary(){ cout<<"starting with "< Date: Sat, 27 Sep 2014 19:18:51 -0400 Subject: [PATCH 08/18] all compacts working --- src/main.cpp | 57 ++++++++++++++++++++++------ src/streamCompaction.cu | 82 +++++++++++++++++++++++++++-------------- 2 files changed, 101 insertions(+), 38 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 00f2e8d..7935650 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -186,7 +186,7 @@ void naiveSumSharedArbitrary(){ } void naiveCompactSharedArbitrary(){ - int numElements = 33; + int numElements = 32; dataPacket * ints = new dataPacket[numElements]; for (int i=0; i 0 && bound < 20){ + int toKill = rand() % ds.numAlive(); + // toKill = 10; + ds.kill(toKill); + ds.compactWorkEfficientArbitrary (); + + dataPacket cur; + ds.getData(toKill, cur); + cout<<"killing "<>>(cudaIndicesA, cudaIndicesB, procsPefBlock, cudaAuxSums); checkCUDAError("kernel failed!"); - workEfficientArbitrary<<>>(cudaAuxSums, cudaAuxIncs, sumSize); - checkCUDAError("kernel failed!"); + // cudaMemcpy(m_indices, cudaIndicesA, m_numElements*sizeof(int), 
cudaMemcpyDeviceToHost); + // for (int i=0; i>>(cudaAuxSums, cudaAuxIncs, sumSize, powf(2.0f, d-1)); + cudaThreadSynchronize(); + int* temp = cudaAuxSums; + cudaAuxSums = cudaAuxIncs; + cudaAuxIncs = temp; + } + shift<<>>(cudaAuxSums, cudaAuxIncs, m_numElementsAlive); addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); checkCUDAError("kernel failed!"); - cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); - cudaMemcpy(m_auxSums, cudaAuxIncs, m_numElements/(THREADS_PER_BLOCK*2)*sizeof(int), cudaMemcpyDeviceToHost); - - // // scan algorithm - // for (int d=1; d<=ceil(log(m_numElementsAlive)/log(2)); d++){ - // sum<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive, powf(2.0f, d-1)); - // int* temp = cudaIndicesA; - // cudaIndicesA = cudaIndicesB; - // cudaIndicesB = temp; - // } - - // test<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive); - // checkCUDAError("kernel failed!"); - // cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); - // //Stream compation from A into B, then save back into A - // streamCompaction<<>>(cudaDataA, cudaIndicesA, cudaDataB, m_numElementsAlive); - // dataPacket * temp = cudaDataA; - // cudaDataA = cudaDataB; - // cudaDataB = temp; - - // // update numrays - // cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); + // cudaMemcpy(m_indices, cudaIndicesA, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); + // for (int i=0; i>>(cudaDataA, cudaIndicesB, cudaDataB, m_numElementsAlive); + dataPacket * temp = cudaDataA; + cudaDataA = cudaDataB; + cudaDataB = temp; + + // update numrays + cudaMemcpy(&m_numElementsAlive, &cudaIndicesB[m_numElementsAlive], sizeof(int), cudaMemcpyDeviceToHost); } void DataStream::compactNaiveSumGlobal(){ @@ -523,7 +552,6 @@ void DataStream::compactNaiveSumSharedSingleBlock(){ checkCUDAError("kernel failed!"); cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); - } void DataStream::compactNaiveSumSharedArbitrary(){ From 2ca861f804d375c5e713758e3fb5597609169cb7 Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Sat, 27 Sep 2014 19:24:59 -0400 Subject: [PATCH 09/18] kill stream error --- src/main.cpp | 17 ++++++++++++----- src/streamCompaction.cu | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 7935650..b65f062 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -199,14 +199,21 @@ void naiveCompactSharedArbitrary(){ int bound = 0; while(ds.numAlive () > 0 && bound < 10){ + int toKill = rand() % ds.numAlive(); - // toKill = 10; ds.kill(toKill); - ds.compactNaiveSumSharedArbitrary (); - dataPacket cur; ds.getData(toKill, cur); - cout<<"killing "<>>(cudaDataA, cudaIndicesA, m_numElementsAlive); } void DataStream::compactNaiveSumGlobal(){ @@ -539,6 +548,7 @@ void DataStream::compactNaiveSumGlobal(){ cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); cout<>>(cudaDataA, cudaIndicesA, m_numElementsAlive); } void DataStream::compactNaiveSumSharedSingleBlock(){ @@ -608,6 +618,8 @@ void DataStream::compactNaiveSumSharedArbitrary(){ cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); cout<>>(cudaDataA, cudaIndicesA, m_numElementsAlive); } bool DataStream::getData(int index, dataPacket& data){ From 0717b73061d7aabc8781cd526b18132780d2660a Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Sun, 28 Sep 2014 11:07:24 
-0400 Subject: [PATCH 10/18] shared mem typo --- src/main.cpp | 171 +++++++++++++++++++++++++++------------- src/streamCompaction.cu | 61 ++++++++++++-- src/streamCompaction.h | 6 +- 3 files changed, 175 insertions(+), 63 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index b65f062..f2f90c8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -87,7 +87,7 @@ void naiveSumGlobal(){ } void naiveCompactGlobal(){ - int numElements = 33; + int numElements = 100000; dataPacket * ints = new dataPacket[numElements]; for (int i=0; i 0 && bound < 1){ - int toKill = rand() % ds.numAlive(); - toKill = 10; - ds.kill(toKill); + while(ds.numAlive () > 0){ + for (int i=0; i 0){ + for (int i=0; i 0){ + for (int i=0; i + +// ... +// struct is_even +// { +// __host__ __device__ +// bool operator()(const int x) +// { +// return (x % 2) == 0; +// } +// }; +// ... +// int N = 6; +// int data[N] = { 0, 1, 2, 3, 4, 5}; +// int stencil[N] = {-2, 0, -1, 0, 1, 2}; +// int result[4]; +// thrust::copy_if(data, data + N, stencil, result, is_even()); +// // data remains = { 0, 1, 2, 3, 4, 5}; +// // stencil remains = {-2, 0, -1, 0, 1, 2}; +// // result is now { 0, 1, 3, 5} void checkCUDAError(const char *msg) { cudaError_t err = cudaGetLastError(); @@ -242,6 +262,33 @@ __global__ void streamCompaction(dataPacket* inRays, int* indices, dataPacket* o } } +struct isAlive +{ + __host__ __device__ + bool operator()(const dataPacket& dp) + { + return dp.alive; + } +}; + +struct isEven +{ + __host__ __device__ + bool operator()(const int x) + { + return (x%2 == 0); + } +}; + +struct isOne +{ + __host__ __device__ + bool operator()(const int x) + { + return (x == 1); + } +}; + __global__ void killStream(int index, dataPacket* inRays, int* indices, int numElements){ int k = (blockIdx.x * blockDim.x) + threadIdx.x; @@ -440,6 +487,10 @@ void DataStream::globalSum(int* in, int* out, int n){ shift<<>>(in, out, m_numElementsAlive); } +void DataStream::thrustStreamCompact(){ + thrust::copy_if (m_data, m_data+m_numElements, m_indices, m_data, isOne()); +} + void DataStream::compactWorkEfficientArbitrary(){ int numElements = m_numElements; @@ -460,8 +511,8 @@ void DataStream::compactWorkEfficientArbitrary(){ dim3 threadsPerBlockL(threadsPerBlock); dim3 fullBlocksPerGridL(int(ceil(float(m_numElementsAlive)/float(threadsPerBlock)))); - workEfficientArbitrary<<>>(cudaIndicesA, cudaIndicesB, procsPefBlock, cudaAuxSums); - checkCUDAError("kernel failed!"); + workEfficientArbitrary<<>>(cudaIndicesA, cudaIndicesB, procsPefBlock, cudaAuxSums); + checkCUDAError("kernel failed1!"); // cudaMemcpy(m_indices, cudaIndicesA, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); // for (int i=0; i>>(cudaAuxSums, cudaAuxIncs, m_numElementsAlive); addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); - checkCUDAError("kernel failed!"); + checkCUDAError("kernel failed2!"); // cudaMemcpy(m_indices, cudaIndicesA, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); // for (int i=0; i>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive); - //Stream compation from A into B, then save back into A + // Stream compation from A into B, then save back into A streamCompaction<<>>(cudaDataA, cudaIndicesB, cudaDataB, m_numElementsAlive); dataPacket * temp = cudaDataA; cudaDataA = cudaDataB; @@ -572,7 +623,7 @@ void DataStream::compactNaiveSumSharedArbitrary(){ dim3 threadsPerBlockL(threadsPerBlock*2); dim3 fullBlocksPerGridL(m_numElements/(threadsPerBlock*2)); - naiveSumSharedArbitrary<<>>(cudaIndicesA, cudaIndicesB, threadsPerBlock*2, cudaAuxSums); + 
naiveSumSharedArbitrary<<>>(cudaIndicesA, cudaIndicesB, threadsPerBlock*2, cudaAuxSums); checkCUDAError("kernel failed 1 !"); //////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/streamCompaction.h b/src/streamCompaction.h index 2dd3872..f8e8712 100755 --- a/src/streamCompaction.h +++ b/src/streamCompaction.h @@ -14,8 +14,9 @@ #include #include #include +#include -#define THREADS_PER_BLOCK 8 +#define THREADS_PER_BLOCK 64 struct dataPacket{ int index; @@ -35,8 +36,6 @@ class DataStream{ dataPacket * m_data; - - dataPacket * cudaDataA; dataPacket * cudaDataB; @@ -47,6 +46,7 @@ class DataStream{ int * cudaAuxIncs; void globalSum(int* in, int* out, int n); + void thrustStreamCompact(); public: int * m_indices; From 0b88a21737f568e11ba898d63f1a8f42d18abe9e Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Sun, 28 Sep 2014 13:51:30 -0400 Subject: [PATCH 11/18] omg I think Im done? --- src/main.cpp | 129 ++++++++++++---------------------------- src/streamCompaction.cu | 112 ++++++++++++++++------------------ src/streamCompaction.h | 3 +- 3 files changed, 91 insertions(+), 153 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index f2f90c8..fcaa421 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,5 +1,4 @@ #include -#include #include "streamCompaction.h" @@ -19,11 +18,11 @@ void serialSum(){ ds.serialScan(); - for (int i=0; i 0){ - for (int i=0; i +#define NUM_BANKS 16 +#define LOG_NUM_BANKS 4 +#define CONFLICT_FREE_OFFSET(n) \ + ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS)) + // ... // struct is_even // { @@ -115,10 +120,6 @@ __global__ void naiveSumSharedArbitrary(int* in, int* out, int n, int* sums=0){ int localIndex = threadIdx.x; int globalIndex = (blockIdx.x * blockDim.x) + threadIdx.x; - // if (globalIndex >= n) return; - - // out[k] = index; return; - extern __shared__ int shared[]; int *tempIn = &shared[0]; int *tempOut = &shared[n]; @@ -195,13 +196,15 @@ __global__ void workEfficientArbitrary(int* in, int* out, int n, int* sums=0){ extern __shared__ float temp[]; - int realIndex = (blockIdx.x * blockDim.x) + threadIdx.x; - int offset = 1; int index = threadIdx.x; - temp[2*index] = in[2*realIndex]; - temp[2*index+1] = in[2*realIndex+1]; + int indexA = index; + int indexB = index + (n/2); + int bankOffsetA = CONFLICT_FREE_OFFSET(indexA); + int bankOffsetB = CONFLICT_FREE_OFFSET(indexB); + temp[indexA + bankOffsetA] = in[indexA]; + temp[indexB + bankOffsetB] = in[indexB]; for (int d = n>>1; d>0; d >>= 1){ __syncthreads(); @@ -209,14 +212,17 @@ __global__ void workEfficientArbitrary(int* in, int* out, int n, int* sums=0){ int ai = offset * (2*index+1) - 1; int bi = offset * (2*index+2) - 1; + ai += CONFLICT_FREE_OFFSET(ai); + bi += CONFLICT_FREE_OFFSET(bi); + temp[bi] += temp[ai]; } offset *= 2; } if (index == 0){ - if (sums) sums[blockIdx.x] = temp[n-1]; - temp[n - 1] = 0; + if (sums) sums[blockIdx.x] = temp[n - 1 + CONFLICT_FREE_OFFSET(n - 1)]; + temp[n - 1 + CONFLICT_FREE_OFFSET(n - 1)] = 0; } for (int d = 1; d>>(cudaIndicesA, cudaIndicesB, procsPefBlock, cudaAuxSums); - checkCUDAError("kernel failed1!"); - - // cudaMemcpy(m_indices, cudaIndicesA, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); - // for (int i=0; i>>(cudaAuxSums, cudaAuxIncs, sumSize, powf(2.0f, d-1)); @@ -535,31 +535,8 @@ void DataStream::compactWorkEfficientArbitrary(){ cudaAuxSums = cudaAuxIncs; cudaAuxIncs = temp; } - shift<<>>(cudaAuxSums, cudaAuxIncs, m_numElementsAlive); - + shift<<>>(cudaAuxSums, cudaAuxIncs, m_numElements/(THREADS_PER_BLOCK*2)); 
addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); - checkCUDAError("kernel failed2!"); - - // cudaMemcpy(m_indices, cudaIndicesA, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); - // for (int i=0; i>>(cudaDataA, cudaIndicesB, cudaDataB, m_numElementsAlive); @@ -580,14 +557,30 @@ void DataStream::compactNaiveSumGlobal(){ dim3 threadsPerBlockL(threadsPerBlock); dim3 fullBlocksPerGridL(m_numElements/threadsPerBlock); + clock_t t = clock(); for (int d=1; d<=ceil(log(m_numElementsAlive)/log(2)); d++){ sum<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive, powf(2.0f, d-1)); + checkCUDAError("kernel failed 1 !"); cudaThreadSynchronize(); int* temp = cudaIndicesA; cudaIndicesA = cudaIndicesB; cudaIndicesB = temp; } shift<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive); + checkCUDAError("kernel failed 1 !"); + t = clock() - t; + + cudaMemcpy(m_indices, cudaIndicesB, m_numElementsAlive*sizeof(int), cudaMemcpyDeviceToHost); + + cout<>>(cudaDataA, cudaIndicesB, cudaDataB, m_numElementsAlive); @@ -597,8 +590,6 @@ void DataStream::compactNaiveSumGlobal(){ // update numrays cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); - cout<>>(cudaDataA, cudaIndicesA, m_numElementsAlive); } @@ -635,7 +626,7 @@ void DataStream::compactNaiveSumSharedArbitrary(){ dim3 threadsPerBlockOld(threadsPerBlock); dim3 fullBlocksPerGridOld(int(ceil(float(sumSize)/float(threadsPerBlock)))); - cudaMemcpy(cudaAuxIncs, cudaAuxSums, m_numElements/(THREADS_PER_BLOCK*2)*sizeof(int), cudaMemcpyDeviceToDevice); + // cudaMemcpy(cudaAuxIncs, cudaAuxSums, m_numElements/(THREADS_PER_BLOCK*2)*sizeof(int), cudaMemcpyDeviceToDevice); for (int d=1; d<=ceil(log(sumSize)/log(2)); d++){ sum<<>>(cudaAuxSums, cudaAuxIncs, sumSize, powf(2.0f, d-1)); @@ -644,7 +635,7 @@ void DataStream::compactNaiveSumSharedArbitrary(){ cudaAuxSums = cudaAuxIncs; cudaAuxIncs = temp; } - shift<<>>(cudaAuxSums, cudaAuxIncs, m_numElementsAlive); + shift<<>>(cudaAuxSums, cudaAuxIncs, m_numElements/(THREADS_PER_BLOCK*2)); addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); @@ -652,24 +643,21 @@ void DataStream::compactNaiveSumSharedArbitrary(){ int * temp = cudaIndicesA; cudaIndicesA = cudaIndicesB; cudaIndicesB = temp; - //////////////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////////////// dim3 threadsPerBlockLL(threadsPerBlock); dim3 fullBlocksPerGridLL(m_numElements/threadsPerBlock); + clock_t t = clock(); //Stream compation from A into B, then save back into A streamCompaction<<>>(cudaDataA, cudaIndicesB, cudaDataB, m_numElementsAlive); dataPacket * tempDP = cudaDataA; cudaDataA = cudaDataB; cudaDataB = tempDP; + t = clock() - t; + cout<<(float)t / CLOCKS_PER_SEC<>>(cudaDataA, cudaIndicesA, m_numElementsAlive); } diff --git a/src/streamCompaction.h b/src/streamCompaction.h index f8e8712..5585e70 100755 --- a/src/streamCompaction.h +++ b/src/streamCompaction.h @@ -15,8 +15,9 @@ #include #include #include +#include -#define THREADS_PER_BLOCK 64 +#define THREADS_PER_BLOCK 256 struct dataPacket{ int index; From 25959a19af56ed683015fa961439c956dcbcd6b2 Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Sun, 28 Sep 2014 14:01:00 -0400 Subject: [PATCH 12/18] testing readme --- README.md | 129 +++--------------------------------------------------- 1 file changed, 6 insertions(+), 123 deletions(-) diff --git a/README.md b/README.md index 6e02afa..9ec505d 100644 --- a/README.md +++ b/README.md @@ 
-1,133 +1,16 @@ -Project-2 -========= - A Study in Parallel Algorithms : Stream Compaction -# INTRODUCTION -Many of the algorithms you have learned thus far in your career have typically -been developed from a serial standpoint. When it comes to GPUs, we are mainly -looking at massively parallel work. Thus, it is necessary to reorient our -thinking. In this project, we will be implementing a couple different versions -of prefix sum. We will start with a simple single thread serial CPU version, -and then move to a naive GPU version. Each part of this homework is meant to -follow the logic of the previous parts, so please do not do this homework out of -order. - -This project will serve as a stream compaction library that you may use (and -will want to use) in your -future projects. For that reason, we suggest you create proper header and CUDA -files so that you can reuse this code later. You may want to create a separate -cpp file that contains your main function so that you can test the code you -write. - -# OVERVIEW -Stream compaction is broken down into two parts: (1) scan, and (2) scatter. - -## SCAN -Scan or prefix sum is the summation of the elements in an array such that the -resulting array is the summation of the terms before it. Prefix sum can either -be inclusive, meaning the current term is a summation of all the elements before -it and itself, or exclusive, meaning the current term is a summation of all -elements before it excluding itself. - -Inclusive: - -In : [ 3 4 6 7 9 10 ] - -Out : [ 3 7 13 20 29 39 ] - -Exclusive - -In : [ 3 4 6 7 9 10 ] - -Out : [ 0 3 7 13 20 29 ] - -Note that the resulting prefix sum will always be n + 1 elements if the input -array is of length n. Similarly, the first element of the exclusive prefix sum -will always be 0. In the following sections, all references to prefix sum will -be to the exclusive version of prefix sum. - -## SCATTER -The scatter section of stream compaction takes the results of the previous scan -in order to reorder the elements to form a compact array. - -For example, let's say we have the following array: -[ 0 0 3 4 0 6 6 7 0 1 ] - -We would only like to consider the non-zero elements in this zero, so we would -like to compact it into the following array: -[ 3 4 6 6 7 1 ] - -We can perform a transform on input array to transform it into a boolean array: - -In : [ 0 0 3 4 0 6 6 7 0 1 ] - -Out : [ 0 0 1 1 0 1 1 1 0 1 ] - -Performing a scan on the output, we get the following array : - -In : [ 0 0 1 1 0 1 1 1 0 1 ] - -Out : [ 0 0 0 1 2 2 3 4 5 5 ] - -Notice that the output array produces a corresponding index array that we can -use to create the resulting array for stream compaction. - -# PART 1 : REVIEW OF PREFIX SUM -Given the definition of exclusive prefix sum, please write a serial CPU version -of prefix sum. You may write this in the cpp file to separate this from the -CUDA code you will be writing in your .cu file. - -# PART 2 : NAIVE PREFIX SUM -We will now parallelize this the previous section's code. Recall from lecture -that we can parallelize this using a series of kernel calls. In this portion, -you are NOT allowed to use shared memory. - -### Questions -* Compare this version to the serial version of exclusive prefix scan. Please - include a table of how the runtimes compare on different lengths of arrays. -* Plot a graph of the comparison and write a short explanation of the phenomenon you - see here. - -# PART 3 : OPTIMIZING PREFIX SUM -In the previous section we did not take into account shared memory. 
In the
-previous section, we kept everything in global memory, which is much slower than
-shared memory.
-
-## PART 3a : Write prefix sum for a single block
-Shared memory is accessible to threads of a block. Please write a version of
-prefix sum that works on a single block.
-
-## PART 3b : Generalizing to arrays of any length.
-Taking the previous portion, please write a version that generalizes prefix sum
-to arbitrary length arrays, this includes arrays that will not fit on one block.
+There are two main components of stream compaction: scan and scatter.
-### Questions
-* Compare this version to the parallel prefix sum using global memory.
-* Plot a graph of the comparison and write a short explanation of the phenomenon
-  you see here.
+Here is a comparison of the various methods I used to scan:
-# PART 4 : ADDING SCATTER
-First create a serial version of scatter by expanding the serial version of
-prefix sum. Then create a GPU version of scatter. Combine the function call
-such that, given an array, you can call stream compact and it will compact the
-array for you. Finally, write a version using thrust.
+![](https://drive.google.com/file/d/0BzqFSVys9HdcV0t2eE43YXgydDQ/edit?usp=sharing)
-### Questions
-* Compare your version of stream compact to your version using thrust. How do
-  they compare? How might you optimize yours more, or how might thrust's stream
-  compact be optimized.
+And here is a comparison of my scatter implementation and thrust's. I think I'm using a slow
-# EXTRA CREDIT (+10)
-For extra credit, please optimize your prefix sum for work parallelism and to
-deal with bank conflicts. Information on this can be found in the GPU Gems
-chapter listed in the references.
+thrust version of this, because I don't think my basic version in CUDA should be as fast as thrust.
-# SUBMISSION
-Please answer all the questions in each of the subsections above and write your
-answers in the README by overwriting the README file. In future projects, we
-expect your analysis to be similar to the one we have led you through in this
-project. Like other projects, please open a pull request and email Harmony.
+![](https://drive.google.com/file/d/0BzqFSVys9HdcV0t2eE43YXgydDQ/edit?usp=sharing)
 # REFERENCES
 "Parallel Prefix Sum (Scan) with CUDA." GPU Gems 3.

From ac93f6f33f642cd57446174ebd497c5f1682d3be Mon Sep 17 00:00:00 2001
From: Jeremy Newlin
Date: Sun, 28 Sep 2014 14:02:00 -0400
Subject: [PATCH 13/18] testing readme again

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 9ec505d..9a135f2 100644
--- a/README.md
+++ b/README.md
@@ -4,10 +4,10 @@ There are two main components of stream compaction: scan and scatter.
 Here is a comparison of the various methods I used to scan:
-![](https://drive.google.com/file/d/0BzqFSVys9HdcV0t2eE43YXgydDQ/edit?usp=sharing)
+![](https://lh4.googleusercontent.com/TWSCNE_ZOLPWiv-EFjObiNwU7AW9Qfz5X4F-wtiu6JngBCe1ZIg_T5HCn5_k8q8d4OnJkageIPI=w1505-h726)
 And here is a comparison of my scatter implementation and thrust's. I think I'm using a slow
-thrut version of this, becuase I don't think my basic version in CUDA should be as fast as thrust.
+thrust version of this, because I don't think my basic version in CUDA should be as fast as thrust.
 ![](https://drive.google.com/file/d/0BzqFSVys9HdcV0t2eE43YXgydDQ/edit?usp=sharing)
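Written out serially, the two components named above reduce to an exclusive scan of the alive flags followed by a scatter. The sketch below is a minimal reference under that reading; it assumes the dataPacket struct from streamCompaction.h, and serialCompact is an illustrative name rather than a function in these patches:

```cpp
// Minimal serial reference for stream compaction: an exclusive scan of the
// alive flags, then a scatter of the survivors to their scanned positions.
// Assumes dataPacket (with an `alive` member) from streamCompaction.h.
#include <vector>

int serialCompact(dataPacket* data, int n) {
    std::vector<int> indices(n);
    int running = 0;
    for (int i = 0; i < n; ++i) {      // exclusive prefix sum of the flags
        indices[i] = running;
        running += data[i].alive ? 1 : 0;
    }
    std::vector<dataPacket> out(running);
    for (int i = 0; i < n; ++i)        // scatter: survivors keep their order
        if (data[i].alive) out[indices[i]] = data[i];
    for (int i = 0; i < running; ++i)  // copy the compacted result back
        data[i] = out[i];
    return running;                    // number of elements still alive
}
```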
From fc83941fda04e86a18aed9f511f6b79e5074ae02 Mon Sep 17 00:00:00 2001
From: Jeremy Newlin
Date: Sun, 28 Sep 2014 14:04:54 -0400
Subject: [PATCH 14/18] testing readme again

---
 README.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9a135f2..cb954ae 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,16 @@ Here is a comparison of the various methods I used to scan:
 ![](https://lh4.googleusercontent.com/TWSCNE_ZOLPWiv-EFjObiNwU7AW9Qfz5X4F-wtiu6JngBCe1ZIg_T5HCn5_k8q8d4OnJkageIPI=w1505-h726)
+As you can see, the serial version is faster for small arrays, but is quickly outmatched as the array length grows. The global
+memory version is always just a bit slower than the shared memory version, which makes sense, as the only difference is the slowdown
+that comes from fetching from global memory often. The work-efficient algorithm that I've implemented must have a bug in it, because
+it only becomes comparable to the naive shared memory version after the array is over 10 million elements long. Further investigation is
+needed.
 And here is a comparison of my scatter implementation and thrust's. I think I'm using a slow
 thrust version of this, because I don't think my basic version in CUDA should be as fast as thrust.
-![](https://drive.google.com/file/d/0BzqFSVys9HdcV0t2eE43YXgydDQ/edit?usp=sharing)
+![](https://lh3.googleusercontent.com/-smo_LiXzpgg15xhhf7EwXruEdDWJ6cN-NfNbUv0Z9F7l4qwYAyI22eZpwk9dHrYbonYsrSY9ik=w1505-h726)
 # REFERENCES

From c8137e9035b1e47b8b3aa93a31266d974daaa473 Mon Sep 17 00:00:00 2001
From: Jeremy Newlin
Date: Sun, 28 Sep 2014 14:06:40 -0400
Subject: [PATCH 15/18] testing readme again

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cb954ae..bc151bf 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,9 @@ it only becomes comparable to the naive shared memory version after the array is
 needed.
 And here is a comparison of my scatter implementation and thrust's. I think I'm using a slow
-thrust version of this, because I don't think my basic version in CUDA should be as fast as thrust.
+thrust version of this, because I don't think my basic version in CUDA should be as fast as thrust. But, to be honest,
+I'm not sure how else to optimize my implementation of scatter any further. It has 3 global memory reads that are absolutely necessary,
+and a branch.
 ![](https://lh3.googleusercontent.com/-smo_LiXzpgg15xhhf7EwXruEdDWJ6cN-NfNbUv0Z9F7l4qwYAyI22eZpwk9dHrYbonYsrSY9ik=w1505-h726)
 # REFERENCES
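The thrust comparison the README describes can be driven with thrust::copy_if. The following is a hedged sketch, not the code that produced the timings: it reuses the isAlive predicate defined in streamCompaction.cu, while thrustCompact is an illustrative name:

```cpp
// Sketch: compaction via thrust::copy_if on device vectors, using the
// isAlive functor from streamCompaction.cu. Illustrative only.
#include <thrust/device_vector.h>
#include <thrust/copy.h>

int thrustCompact(thrust::device_vector<dataPacket>& d_data) {
    thrust::device_vector<dataPacket> d_out(d_data.size());
    thrust::device_vector<dataPacket>::iterator newEnd =
        thrust::copy_if(d_data.begin(), d_data.end(), d_out.begin(), isAlive());
    int count = int(newEnd - d_out.begin());  // survivors
    d_out.resize(count);
    d_data.swap(d_out);                       // keep the compacted buffer
    return count;
}
```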
From 61c2ec06fa23f62201d546fb883f3ce44adab598 Mon Sep 17 00:00:00 2001
From: Jeremy Newlin
Date: Sun, 28 Sep 2014 14:07:48 -0400
Subject: [PATCH 16/18] finished

---
 src/streamCompaction.cu | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/src/streamCompaction.cu b/src/streamCompaction.cu
index 60c8339..574cf6b 100644
--- a/src/streamCompaction.cu
+++ b/src/streamCompaction.cu
@@ -14,25 +14,6 @@ using namespace std;
 #define CONFLICT_FREE_OFFSET(n) \
  ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS))
-// ...
-// struct is_even
-// {
-// __host__ __device__
-// bool operator()(const int x)
-// {
-// return (x % 2) == 0;
-// }
-// };
-// ...
-// int N = 6;
-// int data[N] = { 0, 1, 2, 3, 4, 5};
-// int stencil[N] = {-2, 0, -1, 0, 1, 2};
-// int result[4];
-// thrust::copy_if(data, data + N, stencil, result, is_even());
-// // data remains = { 0, 1, 2, 3, 4, 5};
-// // stencil remains = {-2, 0, -1, 0, 1, 2};
-// // result is now { 0, 1, 3, 5}
-
 void checkCUDAError(const char *msg) {
   cudaError_t err = cudaGetLastError();
   if( cudaSuccess != err) {

From eedbea8df4335092d970f52b07dcfe2aafb24359 Mon Sep 17 00:00:00 2001
From: Jeremy Newlin
Date: Sun, 28 Sep 2014 14:09:37 -0400
Subject: [PATCH 17/18] finished

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index bc151bf..ef55956 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ There are two main components of stream compaction: scan and scatter.
 Here is a comparison of the various methods I used to scan:
-![](https://lh4.googleusercontent.com/TWSCNE_ZOLPWiv-EFjObiNwU7AW9Qfz5X4F-wtiu6JngBCe1ZIg_T5HCn5_k8q8d4OnJkageIPI=w1505-h726)
+![](http://imgur.com/AaR3gk0,V55kt3w)
 As you can see, the serial version is faster for small arrays, but is quickly outmatched as the array length grows. The global
@@ -17,7 +17,7 @@ thrust version of this, because I don't think my basic version in CUDA should be
 I'm not sure how else to optimize my implementation of scatter any further. It has 3 global memory reads that are absolutely necessary,
 and a branch.
-![](https://lh3.googleusercontent.com/-smo_LiXzpgg15xhhf7EwXruEdDWJ6cN-NfNbUv0Z9F7l4qwYAyI22eZpwk9dHrYbonYsrSY9ik=w1505-h726)
+![](http://imgur.com/AaR3gk0,V55kt3w#1)
 # REFERENCES

From 6872df4cc267207058e2a248828b742ce9bf767e Mon Sep 17 00:00:00 2001
From: Jeremy Newlin
Date: Sun, 28 Sep 2014 14:10:20 -0400
Subject: [PATCH 18/18] finished

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ef55956..0d60152 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ There are two main components of stream compaction: scan and scatter.
 Here is a comparison of the various methods I used to scan:
-![](http://imgur.com/AaR3gk0,V55kt3w)
+![](http://i.imgur.com/AaR3gk0.png)
 As you can see, the serial version is faster for small arrays, but is quickly outmatched as the array length grows. The global
@@ -17,7 +17,7 @@ thrust version of this, because I don't think my basic version in CUDA should be
 I'm not sure how else to optimize my implementation of scatter any further. It has 3 global memory reads that are absolutely necessary,
 and a branch.
-![](http://imgur.com/AaR3gk0,V55kt3w#1)
+![](http://i.imgur.com/V55kt3w.png)
 # REFERENCES
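One way to chase the suspected bug in the work-efficient scan is to check the kernel against a serial rendering of the same up-sweep/down-sweep recurrence. This sketch assumes n is a power of two; the CUDA kernels above layer shared-memory tiling and the CONFLICT_FREE_OFFSET padding on top of this structure:

```cpp
// Serial Blelloch scan: in-place exclusive prefix sum, n a power of two.
void blellochScan(int* data, int n) {
    for (int d = 1; d < n; d <<= 1)          // up-sweep (reduce phase)
        for (int i = 2 * d - 1; i < n; i += 2 * d)
            data[i] += data[i - d];
    data[n - 1] = 0;                         // clear the root
    for (int d = n / 2; d >= 1; d >>= 1)     // down-sweep phase
        for (int i = 2 * d - 1; i < n; i += 2 * d) {
            int t = data[i - d];             // save the left child
            data[i - d] = data[i];           // pass the parent down
            data[i] += t;                    // right child gets the sum
        }
}
```

For example, blellochScan on {1, 2, 3, 4} yields {0, 1, 3, 6}, the exclusive scan the kernels are expected to reproduce.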