From a505563a3dddd9755fc08493589108978417a344 Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Thu, 25 Sep 2014 16:42:16 -0400 Subject: [PATCH 01/18] naive --- src/main.cpp | 45 ++++++++ src/streamCompaction.cu | 227 ++++++++++++++++++++++++++++++++++++++++ src/streamCompaction.h | 64 +++++++++++ 3 files changed, 336 insertions(+) create mode 100644 src/main.cpp create mode 100644 src/streamCompaction.cu create mode 100755 src/streamCompaction.h diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000..726232d --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,45 @@ +#include +#include + +#include "streamCompaction.h" + +using namespace std; + +void naive(){ + int numElements = 25; + + dataPacket * ints = new dataPacket[numElements]; + for (int i=0; i 0){ + int toKill = rand() % ds.numAlive(); + ds.kill(toKill); + ds.compact (); + + cout<<"killing "< +#include +#include +#include +#include + +#include "streamCompaction.h" + +__global__ void sum(int* in, int* out, int n, int d1){ + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (k=d1){ + out[k] = in[k-d1] + ink; + } + else{ + out[k] = ink; + } + } +} + +__global__ void shift(int* in, int* out, int n){ + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + + out[0] = 0; + if (k0){ + out[k] = in[k-1]; + } +} + +__global__ void streamCompaction(dataPacket* inRays, int* indices, dataPacket* outRays, int numElements){ + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (k>>(testin, testout, numElements, int(pow(2.0f,d-1))); + cudaThreadSynchronize(); + cudaMemcpy(cputest, testout, numElements*sizeof(int), cudaMemcpyDeviceToHost); + + + int* temp = testin; + testin=testout; + testout=temp; + } + //Compact + streamCompaction<<>>(cudaArrayA, testin, cudaArrayB, numElements); + cudaArrayA = cudaArrayB; + cudaThreadSynchronize(); + + cudaMemcpy(&numElements, &testin[numElements-1], 1*sizeof(int), cudaMemcpyDeviceToHost); + + std::cout<<"number of rays left: "<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive, powf(2.0f, d-1)); + int* temp = cudaIndicesA; + cudaIndicesA = cudaIndicesB; + cudaIndicesB = temp; + } + + //Stream compation from A into B, then save back into A + streamCompaction<<>>(cudaDataA, cudaIndicesA, cudaDataB, m_numElementsAlive); + dataPacket * temp = cudaDataA; + cudaDataA = cudaDataB; + cudaDataB = temp; + + // update numrays + cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); +} + +bool DataStream::getData(int index, dataPacket& data){ + + if (index > m_numElements) return false; + + data = m_data[index]; + return true; +} + +int DataStream::numAlive(){ + return m_numElementsAlive; +} + +void DataStream::fetchDataFromGPU(){ + cudaMemcpy(m_data, cudaDataA, m_numElementsAlive*sizeof(dataPacket), cudaMemcpyDeviceToHost); +} + +void DataStream::kill(int index){ + if (index > m_numElementsAlive) return; + + dim3 threadsPerBlockL(64); + dim3 fullBlocksPerGridL(int(ceil(float(m_numElementsAlive)/64.0f))); + + killStream<<>>(index, cudaDataA, cudaIndicesA, m_numElementsAlive); +} \ No newline at end of file diff --git a/src/streamCompaction.h b/src/streamCompaction.h new file mode 100755 index 0000000..4353c58 --- /dev/null +++ b/src/streamCompaction.h @@ -0,0 +1,64 @@ +// CIS565 CUDA Raytracer: A parallel raytracer for Patrick Cozzi's CIS565: GPU Computing at the University of Pennsylvania +// Written by Yining Karl Li, Copyright (c) 2012 University of Pennsylvania +// This file includes code from: +// Rob Farber for CUDA-GL interop, from CUDA 
Supercomputing For The Masses: http://www.drdobbs.com/architecture-and-design/cuda-supercomputing-for-the-masses-part/222600097 +// Peter Kutz and Yining Karl Li's GPU Pathtracer: http://gpupathtracer.blogspot.com/ +// Yining Karl Li's TAKUA Render, a massively parallel pathtracing renderer: http://www.yiningkarlli.com + +#ifndef STREAM_COMPACTION_H +#define STREAM_COMPACTION_H + +#include +#include +#include +#include +#include +#include + +struct dataPacket{ + int index; + bool alive; + dataPacket(){ + index = -1; + alive = true; + } + dataPacket(int i){ + index = i; + alive = true; + } +}; + +class DataStream{ +private: + + dataPacket * m_data; + + int m_numElementsAlive, m_numElements; + + dataPacket * cudaDataA; + dataPacket * cudaDataB; + + int * cudaIndicesA; + int * cudaIndicesB; + +public: + int * m_indices; + + DataStream(int numElements, dataPacket * data); + ~DataStream(); + + void compact(); + bool getData(int index, dataPacket& data); + int numAlive(); + void kill(int index); + void fetchDataFromGPU(); + +}; + +void cudaVectorSum(int * indicesA, int * indicesB, int numElements, float k); + +void cudaInit (dataPacket * a, dataPacket * b, int * ia, int * ib); + +void testStreamCompaction(); + +#endif From b173957c0e9773f52d6963c8acd723883263ef9c Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Fri, 26 Sep 2014 16:55:32 -0400 Subject: [PATCH 02/18] second --- src/main.cpp | 28 +++++++++++ src/streamCompaction.cu | 104 ++++++++++++++++++++++++++++++---------- 2 files changed, 108 insertions(+), 24 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 726232d..435f5b1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -38,8 +38,36 @@ void naive(){ } } +void test(){ + int numElements = 16; + + dataPacket * ints = new dataPacket[numElements]; + for (int i=0; i0){ - out[k] = in[k-1]; +__global__ void test(int* in, int* out, int n){ + + extern __shared__ float temp[]; + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + int offset = 1; + + if (2*index+1<=n){ + temp[2*index] = in[2*index]; + temp[2*index+1] = in[2*index+1]; + + for (int d = n>>1; d>0; d >>= 1){ + //for (int d=0; d>= 1; + __syncthreads(); + if (index < d){ + + int ai = offset * (2*index+1) - 1; + int bi = offset * (2*index+2) - 1; + + if (ai < n && bi < n){ + float t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + } + __syncthreads(); + + out[2*index] = temp[2*index]; + out[2*index+1] = temp[2*index+1]; } + } __global__ void streamCompaction(dataPacket* inRays, int* indices, dataPacket* outRays, int numElements){ @@ -180,25 +225,36 @@ DataStream::~DataStream(){ } void DataStream::compact(){ - dim3 threadsPerBlockL(64); - dim3 fullBlocksPerGridL(int(ceil(float(m_numElementsAlive)/64.0f))); - - // scan algorithm - for (int d=1; d<=ceil(log(m_numElementsAlive)/log(2)); d++){ - sum<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive, powf(2.0f, d-1)); - int* temp = cudaIndicesA; - cudaIndicesA = cudaIndicesB; - cudaIndicesB = temp; - } - - //Stream compation from A into B, then save back into A - streamCompaction<<>>(cudaDataA, cudaIndicesA, cudaDataB, m_numElementsAlive); - dataPacket * temp = cudaDataA; - cudaDataA = cudaDataB; - cudaDataB = temp; - // update numrays - cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); + int numElements = m_numElementsAlive; + int threadsPerBlock = 64; + int procsPefBlock = threadsPerBlock*2; + + dim3 initialScanThreadsPerBlock(threadsPerBlock/2); + dim3 initialScanBlocksPerGrid(numElements/threadsPerBlock); + + 
dim3 threadsPerBlockL(threadsPerBlock); + dim3 fullBlocksPerGridL(int(ceil(float(m_numElementsAlive)/float(threadsPerBlock)))); + + // // scan algorithm + // for (int d=1; d<=ceil(log(m_numElementsAlive)/log(2)); d++){ + // sum<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive, powf(2.0f, d-1)); + // int* temp = cudaIndicesA; + // cudaIndicesA = cudaIndicesB; + // cudaIndicesB = temp; + // } + + test<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive); + checkCUDAError("kernel failed!"); + cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); + // //Stream compation from A into B, then save back into A + // streamCompaction<<>>(cudaDataA, cudaIndicesA, cudaDataB, m_numElementsAlive); + // dataPacket * temp = cudaDataA; + // cudaDataA = cudaDataB; + // cudaDataB = temp; + + // // update numrays + // cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); } bool DataStream::getData(int index, dataPacket& data){ From 12f97c14aa54ba54610021de1b4d8e7fef3c28dd Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Fri, 26 Sep 2014 20:00:25 -0400 Subject: [PATCH 03/18] sad --- src/main.cpp | 16 ++++-- src/streamCompaction.cu | 124 +++++++++++++++++++++++++++++++++++++--- src/streamCompaction.h | 6 ++ 3 files changed, 133 insertions(+), 13 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 435f5b1..7a70986 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -39,7 +39,7 @@ void naive(){ } void test(){ - int numElements = 16; + int numElements = 32; dataPacket * ints = new dataPacket[numElements]; for (int i=0; i>1; d>0; d >>= 1){ + __syncthreads(); + if (index < d){ + int ai = offset * (2*index+1) - 1; + int bi = offset * (2*index+2) - 1; + + temp[bi] += temp[ai]; + } + offset *= 2; + } + + if (index == 0){ + if (sums) sums[blockIdx.x] = temp[16-1]; + temp[n - 1] = 0; + } + + for (int d = 1; d>= 1; + __syncthreads(); + if (index < d){ + + int ai = offset * (2*index+1) - 1; + int bi = offset * (2*index+2) - 1; + + if (ai < n && bi < n){ + float t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + } + __syncthreads(); + + out[2*realIndex] = temp[2*index]; + out[2*realIndex+1] = temp[2*index+1]; + +} + +__global__ void addIncs(int* cudaAuxIncs, int* cudaIndicesB, int n){ + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + + // if (index < n){ + // cudaIndicesB[index] = blockIdx.x; //cudaAuxIncs[blockIdx.x]; + cudaIndicesB[index] += cudaAuxIncs[blockIdx.x]; + // } +} + __global__ void streamCompaction(dataPacket* inRays, int* indices, dataPacket* outRays, int numElements){ int k = (blockIdx.x * blockDim.x) + threadIdx.x; @@ -195,11 +258,32 @@ void testStreamCompaction(){ DataStream::DataStream(int numElements, dataPacket * data){ m_data = data; m_numElementsAlive = numElements; + + // if (numElements % THREADS_PER_BLOCK*2 != 0){ + // int counter = 1; + // while (THREADS_PER_BLOCK*2*counter < numElements){ + // counter += 1; + // } + // numElements = THREADS_PER_BLOCK*2*counter; + // } + + // cout<>>(cudaIndicesA, cudaIndicesB, procsPefBlock, cudaAuxSums); + checkCUDAError("kernel failed!"); + + test2<<>>(cudaAuxSums, cudaAuxIncs, sumSize); + checkCUDAError("kernel failed!"); + + addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); + checkCUDAError("kernel failed!"); + + cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(m_auxSums, cudaAuxIncs, m_numElements/(THREADS_PER_BLOCK*2)*sizeof(int), cudaMemcpyDeviceToHost); + // // scan algorithm // for 
(int d=1; d<=ceil(log(m_numElementsAlive)/log(2)); d++){ // sum<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive, powf(2.0f, d-1)); @@ -244,9 +352,9 @@ void DataStream::compact(){ // cudaIndicesB = temp; // } - test<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive); - checkCUDAError("kernel failed!"); - cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); + // test<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive); + // checkCUDAError("kernel failed!"); + // cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); // //Stream compation from A into B, then save back into A // streamCompaction<<>>(cudaDataA, cudaIndicesA, cudaDataB, m_numElementsAlive); // dataPacket * temp = cudaDataA; diff --git a/src/streamCompaction.h b/src/streamCompaction.h index 4353c58..6f6ae2f 100755 --- a/src/streamCompaction.h +++ b/src/streamCompaction.h @@ -15,6 +15,8 @@ #include #include +#define THREADS_PER_BLOCK 8 + struct dataPacket{ int index; bool alive; @@ -40,9 +42,13 @@ class DataStream{ int * cudaIndicesA; int * cudaIndicesB; + + int * cudaAuxSums; + int * cudaAuxIncs; public: int * m_indices; + int * m_auxSums; DataStream(int numElements, dataPacket * data); ~DataStream(); From 7dae4e04d967fa88315e5643f77e3dbfdf97373a Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Fri, 26 Sep 2014 20:07:04 -0400 Subject: [PATCH 04/18] omg the multi block version is done --- src/main.cpp | 2 +- src/streamCompaction.cu | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 7a70986..af146cf 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -39,7 +39,7 @@ void naive(){ } void test(){ - int numElements = 32; + int numElements = 2048; dataPacket * ints = new dataPacket[numElements]; for (int i=0; i>>(cudaAuxSums, cudaAuxIncs, sumSize); checkCUDAError("kernel failed!"); - addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); + addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); checkCUDAError("kernel failed!"); cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); From c954ddf22e2d581789781a78c9b82d2ca78a820d Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Sat, 27 Sep 2014 15:46:13 -0400 Subject: [PATCH 05/18] all sums complete --- src/main.cpp | 79 ++++++++++++++++-- src/streamCompaction.cu | 180 ++++++++++++++++++++++++++++++++++++---- src/streamCompaction.h | 7 +- 3 files changed, 243 insertions(+), 23 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index af146cf..e0c0de1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -21,7 +21,7 @@ void naive(){ while(ds.numAlive () > 0){ int toKill = rand() % ds.numAlive(); ds.kill(toKill); - ds.compact (); + ds.compactWorkEfficientArbitrary (); cout<<"killing "<= n) return; + + for (int offset = 1; offset < n; offset *= 2){ + int* temp = in; + in = out; + out = temp; + + if (index >= offset){ + out[index] = in[index-offset] + in[index]; + } + else{ + out[index] = in[index]; + } + __syncthreads(); + } + + if (index>0) in[index] = out[index-1]; + else in[index] = 0; + +} + +__global__ void naiveSumSharedSingleBlock(int* in, int* out, int n){ + + int index = threadIdx.x; + + if (index >= n) return; + + extern __shared__ int shared[]; + int *tempIn = &shared[0]; + int *tempOut = &shared[n]; + + tempOut[index] = (index > 0) ? 
in[index-1] : 0; + + __syncthreads(); + + for (int offset = 1; offset < n; offset *= 2){ + int* temp = tempIn; + tempIn = tempOut; + tempOut = temp; + + if (index >= offset){ + tempOut[index] = tempIn[index-offset] + tempIn[index]; + } + else{ + tempOut[index] = tempIn[index]; + } + __syncthreads(); + } + out[index] = tempOut[index]; +} + +__global__ void naiveSumSharedArbitrary(int* in, int* out, int n, int* sums=0){ + + int localIndex = threadIdx.x; + int globalIndex = (blockIdx.x * blockDim.x) + threadIdx.x; + + // if (globalIndex >= n) return; + + // out[k] = index; return; + + extern __shared__ int shared[]; + int *tempIn = &shared[0]; + int *tempOut = &shared[n]; + + // tempOut[localIndex] = (localIndex > 0) ? in[localIndex-1] : 0; + tempOut[localIndex] = in[globalIndex]; + + __syncthreads(); + + for (int offset = 1; offset < n; offset *= 2){ + int* temp = tempIn; + tempIn = tempOut; + tempOut = temp; + + if (localIndex >= offset){ + tempOut[localIndex] = tempIn[localIndex-offset] + tempIn[localIndex]; + } + else{ + tempOut[localIndex] = tempIn[localIndex]; + } + __syncthreads(); + } + + if (sums) sums[blockIdx.x] = tempOut[n-1]; + if (localIndex>0) out[globalIndex] = tempOut[localIndex-1]; + else out[globalIndex] = 0; +} + +__global__ void workEfficientSumSingleBlock(int* in, int* out, int n){ extern __shared__ float temp[]; int index = (blockIdx.x * blockDim.x) + threadIdx.x; @@ -40,7 +132,6 @@ __global__ void test(int* in, int* out, int n){ temp[2*index+1] = in[2*index+1]; for (int d = n>>1; d>0; d >>= 1){ - //for (int d=0; d>>(cudaIndicesA, cudaIndicesB, procsPefBlock, cudaAuxSums); + workEfficientArbitrary<<>>(cudaIndicesA, cudaIndicesB, procsPefBlock, cudaAuxSums); checkCUDAError("kernel failed!"); - test2<<>>(cudaAuxSums, cudaAuxIncs, sumSize); + workEfficientArbitrary<<>>(cudaAuxSums, cudaAuxIncs, sumSize); checkCUDAError("kernel failed!"); addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); @@ -368,6 +457,65 @@ void DataStream::compact(){ // cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); } +void DataStream::compactNaiveSumGlobal(){ + + int threadsPerBlock = THREADS_PER_BLOCK; + + dim3 threadsPerBlockL(threadsPerBlock); + dim3 fullBlocksPerGridL(int(ceil(float(m_numElements)/float(threadsPerBlock)))); + + naiveSumGlobal<<>>(cudaIndicesA, cudaIndicesB, m_numElements); + checkCUDAError("kernel failed!"); + + cudaMemcpy(m_indices, cudaIndicesA, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); + +} + +void DataStream::compactNaiveSumSharedSingleBlock(){ + + int threadsPerBlock = THREADS_PER_BLOCK; + + dim3 threadsPerBlockL(threadsPerBlock); + dim3 fullBlocksPerGridL(int(ceil(float(m_numElementsAlive)/float(threadsPerBlock)))); + + naiveSumSharedSingleBlock<<>>(cudaIndicesA, cudaIndicesB, m_numElements); + checkCUDAError("kernel failed!"); + + cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); + +} + +void DataStream::compactNaiveSumSharedArbitrary(){ + + int threadsPerBlock = THREADS_PER_BLOCK; + + dim3 threadsPerBlockL(threadsPerBlock*2); + dim3 fullBlocksPerGridL(m_numElements/(threadsPerBlock*2)); + + naiveSumSharedArbitrary<<>>(cudaIndicesA, cudaIndicesB, threadsPerBlock*2, cudaAuxSums); + checkCUDAError("kernel failed 1 !"); + + int sumSize = m_numElements/(THREADS_PER_BLOCK*2); + dim3 initialScanThreadsPerBlock2(threadsPerBlock); + dim3 initialScanBlocksPerGrid2(sumSize/threadsPerBlock+1); + + dim3 threadsPerBlockOld(threadsPerBlock); + dim3 
fullBlocksPerGridOld(int(ceil(float(sumSize)/float(threadsPerBlock)))); + + cudaMemcpy(cudaAuxIncs, cudaAuxSums, m_numElements/(THREADS_PER_BLOCK*2)*sizeof(int), cudaMemcpyDeviceToDevice); + + naiveSumGlobal<<>>(cudaAuxSums, cudaAuxIncs, sumSize); + checkCUDAError("kernel failed 2 !"); + + addIncs<<>>(cudaAuxSums, cudaIndicesB, m_numElements); + checkCUDAError("kernel failed!"); + + cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("kernel failed 3 !"); + cudaMemcpy(m_auxSums, cudaAuxSums, m_numElements/(THREADS_PER_BLOCK*2)*sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("kernel failed 4 !"); +} + bool DataStream::getData(int index, dataPacket& data){ if (index > m_numElements) return false; diff --git a/src/streamCompaction.h b/src/streamCompaction.h index 6f6ae2f..a5af67e 100755 --- a/src/streamCompaction.h +++ b/src/streamCompaction.h @@ -15,7 +15,7 @@ #include #include -#define THREADS_PER_BLOCK 8 +#define THREADS_PER_BLOCK 64 struct dataPacket{ int index; @@ -53,7 +53,10 @@ class DataStream{ DataStream(int numElements, dataPacket * data); ~DataStream(); - void compact(); + void compactWorkEfficientArbitrary(); + void compactNaiveSumGlobal(); + void compactNaiveSumSharedSingleBlock(); + void compactNaiveSumSharedArbitrary(); bool getData(int index, dataPacket& data); int numAlive(); void kill(int index); From 5749a514c11b6b21a2b95deba6b34e4b926a7eac Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Sat, 27 Sep 2014 18:57:40 -0400 Subject: [PATCH 06/18] compact for naive and shared --- src/main.cpp | 138 ++++++++++++++++++++++++++++++++++++---- src/streamCompaction.cu | 130 ++++++++++++++++++++++++++++--------- src/streamCompaction.h | 11 +++- 3 files changed, 234 insertions(+), 45 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index e0c0de1..1fd08d9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -5,6 +5,27 @@ using namespace std; +void serialSum(){ + int numElements = 256; + + dataPacket * ints = new dataPacket[numElements]; + for (int i=0; i 0 && bound < 1){ + int toKill = rand() % ds.numAlive(); + toKill = 10; + ds.kill(toKill); + ds.compactNaiveSumGlobal (); + + dataPacket cur; + ds.getData(toKill, cur); + cout<<"killing "< 0 && bound < 10){ + int toKill = rand() % ds.numAlive(); + // toKill = 10; + ds.kill(toKill); + ds.compactNaiveSumSharedArbitrary (); + + dataPacket cur; + ds.getData(toKill, cur); + cout<<"killing "<=d1){ @@ -29,17 +28,24 @@ __global__ void sum(int* in, int* out, int n, int d1){ } } +__global__ void shift(int* in, int* out, int n){ + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + + out[0] = 0; + if (k0){ + out[k] = in[k-1]; + } +} + __global__ void naiveSumGlobal(int* in, int* out, int n){ int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index >= n) return; - - for (int offset = 1; offset < n; offset *= 2){ - int* temp = in; - in = out; - out = temp; - + int logn = ceil(log(float(n))/log(2.0f)); + for (int d=1; d<=logn; d++){ + + int offset = powf(2.0f, d-1); + if (index >= offset){ out[index] = in[index-offset] + in[index]; } @@ -47,11 +53,11 @@ __global__ void naiveSumGlobal(int* in, int* out, int n){ out[index] = in[index]; } __syncthreads(); - } - - if (index>0) in[index] = out[index-1]; - else in[index] = 0; + int* temp = in; + in = out; + out = temp; + } } __global__ void naiveSumSharedSingleBlock(int* in, int* out, int n){ @@ -68,7 +74,7 @@ __global__ void naiveSumSharedSingleBlock(int* in, int* out, int n){ __syncthreads(); - for (int offset = 1; offset < n; offset *= 
2){ + for (int offset = 1; offset <= n; offset *= 2){ int* temp = tempIn; tempIn = tempOut; tempOut = temp; @@ -97,7 +103,6 @@ __global__ void naiveSumSharedArbitrary(int* in, int* out, int n, int* sums=0){ int *tempIn = &shared[0]; int *tempOut = &shared[n]; - // tempOut[localIndex] = (localIndex > 0) ? in[localIndex-1] : 0; tempOut[localIndex] = in[globalIndex]; __syncthreads(); @@ -117,8 +122,7 @@ __global__ void naiveSumSharedArbitrary(int* in, int* out, int n, int* sums=0){ } if (sums) sums[blockIdx.x] = tempOut[n-1]; - if (localIndex>0) out[globalIndex] = tempOut[localIndex-1]; - else out[globalIndex] = 0; + out[globalIndex] = tempOut[localIndex]; } __global__ void workEfficientSumSingleBlock(int* in, int* out, int n){ @@ -234,7 +238,7 @@ __global__ void streamCompaction(dataPacket* inRays, int* indices, dataPacket* o if (k>>(in, out, m_numElementsAlive, powf(2.0f, d-1)); + cudaThreadSynchronize(); + int* temp = in; + in = out; + out = temp; + } + shift<<>>(in, out, m_numElementsAlive); +} + void DataStream::compactWorkEfficientArbitrary(){ int numElements = m_numElementsAlive; @@ -462,12 +489,26 @@ void DataStream::compactNaiveSumGlobal(){ int threadsPerBlock = THREADS_PER_BLOCK; dim3 threadsPerBlockL(threadsPerBlock); - dim3 fullBlocksPerGridL(int(ceil(float(m_numElements)/float(threadsPerBlock)))); + dim3 fullBlocksPerGridL(m_numElements/threadsPerBlock); - naiveSumGlobal<<>>(cudaIndicesA, cudaIndicesB, m_numElements); - checkCUDAError("kernel failed!"); + for (int d=1; d<=ceil(log(m_numElementsAlive)/log(2)); d++){ + sum<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive, powf(2.0f, d-1)); + cudaThreadSynchronize(); + int* temp = cudaIndicesA; + cudaIndicesA = cudaIndicesB; + cudaIndicesB = temp; + } + shift<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive); - cudaMemcpy(m_indices, cudaIndicesA, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); + //Stream compation from A into B, then save back into A + streamCompaction<<>>(cudaDataA, cudaIndicesB, cudaDataB, m_numElementsAlive); + dataPacket * temp = cudaDataA; + cudaDataA = cudaDataB; + cudaDataB = temp; + + // update numrays + cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); + cout<>>(cudaIndicesA, cudaIndicesB, threadsPerBlock*2, cudaAuxSums); checkCUDAError("kernel failed 1 !"); + //////////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////////////// int sumSize = m_numElements/(THREADS_PER_BLOCK*2); dim3 initialScanThreadsPerBlock2(threadsPerBlock); dim3 initialScanBlocksPerGrid2(sumSize/threadsPerBlock+1); @@ -504,16 +548,38 @@ void DataStream::compactNaiveSumSharedArbitrary(){ cudaMemcpy(cudaAuxIncs, cudaAuxSums, m_numElements/(THREADS_PER_BLOCK*2)*sizeof(int), cudaMemcpyDeviceToDevice); - naiveSumGlobal<<>>(cudaAuxSums, cudaAuxIncs, sumSize); - checkCUDAError("kernel failed 2 !"); - - addIncs<<>>(cudaAuxSums, cudaIndicesB, m_numElements); - checkCUDAError("kernel failed!"); - - cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); - checkCUDAError("kernel failed 3 !"); - cudaMemcpy(m_auxSums, cudaAuxSums, m_numElements/(THREADS_PER_BLOCK*2)*sizeof(int), cudaMemcpyDeviceToHost); - checkCUDAError("kernel failed 4 !"); + for (int d=1; d<=ceil(log(sumSize)/log(2)); d++){ + sum<<>>(cudaAuxSums, cudaAuxIncs, sumSize, powf(2.0f, d-1)); + cudaThreadSynchronize(); + int* temp = cudaAuxSums; + cudaAuxSums = cudaAuxIncs; + 
cudaAuxIncs = temp; + } + shift<<>>(cudaAuxSums, cudaAuxIncs, m_numElementsAlive); + + addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); + + shift<<>>(cudaIndicesB, cudaIndicesA, m_numElementsAlive); + int * temp = cudaIndicesA; + cudaIndicesA = cudaIndicesB; + cudaIndicesB = temp; + //////////////////////////////////////////////////////////////////////////////////////// + + //////////////////////////////////////////////////////////////////////////////////////// + dim3 threadsPerBlockLL(threadsPerBlock); + dim3 fullBlocksPerGridLL(m_numElements/threadsPerBlock); + + //Stream compation from A into B, then save back into A + streamCompaction<<>>(cudaDataA, cudaIndicesB, cudaDataB, m_numElementsAlive); + dataPacket * tempDP = cudaDataA; + cudaDataA = cudaDataB; + cudaDataB = tempDP; + + // // update numrays + //////////////////////////////////////////////////////////////////////////////////////// + cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); + cout<>>(index, cudaDataA, cudaIndicesA, m_numElementsAlive); + + cudaMemcpy(m_indices, cudaIndicesA, m_numElementsAlive*sizeof(int), cudaMemcpyDeviceToHost); } \ No newline at end of file diff --git a/src/streamCompaction.h b/src/streamCompaction.h index a5af67e..2dd3872 100755 --- a/src/streamCompaction.h +++ b/src/streamCompaction.h @@ -15,7 +15,7 @@ #include #include -#define THREADS_PER_BLOCK 64 +#define THREADS_PER_BLOCK 8 struct dataPacket{ int index; @@ -35,7 +35,7 @@ class DataStream{ dataPacket * m_data; - int m_numElementsAlive, m_numElements; + dataPacket * cudaDataA; dataPacket * cudaDataB; @@ -46,13 +46,20 @@ class DataStream{ int * cudaAuxSums; int * cudaAuxIncs; + void globalSum(int* in, int* out, int n); + public: int * m_indices; int * m_auxSums; + int m_numElementsAlive, m_numElements; + DataStream(int numElements, dataPacket * data); ~DataStream(); + void serialScan(); + void serialScatter(); + void compactWorkEfficientArbitrary(); void compactNaiveSumGlobal(); void compactNaiveSumSharedSingleBlock(); From 7b101112de4cd7c0007bafd534409ba1615c7d95 Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Sat, 27 Sep 2014 18:58:18 -0400 Subject: [PATCH 07/18] compact for naive and shared --- src/main.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 1fd08d9..00f2e8d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -232,12 +232,6 @@ void workEfficientArbitrary(){ cout<<"starting with "< Date: Sat, 27 Sep 2014 19:18:51 -0400 Subject: [PATCH 08/18] all compacts working --- src/main.cpp | 57 ++++++++++++++++++++++------ src/streamCompaction.cu | 82 +++++++++++++++++++++++++++-------------- 2 files changed, 101 insertions(+), 38 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 00f2e8d..7935650 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -186,7 +186,7 @@ void naiveSumSharedArbitrary(){ } void naiveCompactSharedArbitrary(){ - int numElements = 33; + int numElements = 32; dataPacket * ints = new dataPacket[numElements]; for (int i=0; i 0 && bound < 20){ + int toKill = rand() % ds.numAlive(); + // toKill = 10; + ds.kill(toKill); + ds.compactWorkEfficientArbitrary (); + + dataPacket cur; + ds.getData(toKill, cur); + cout<<"killing "<>>(cudaIndicesA, cudaIndicesB, procsPefBlock, cudaAuxSums); checkCUDAError("kernel failed!"); - workEfficientArbitrary<<>>(cudaAuxSums, cudaAuxIncs, sumSize); - checkCUDAError("kernel failed!"); + // cudaMemcpy(m_indices, cudaIndicesA, m_numElements*sizeof(int), 
cudaMemcpyDeviceToHost); + // for (int i=0; i>>(cudaAuxSums, cudaAuxIncs, sumSize, powf(2.0f, d-1)); + cudaThreadSynchronize(); + int* temp = cudaAuxSums; + cudaAuxSums = cudaAuxIncs; + cudaAuxIncs = temp; + } + shift<<>>(cudaAuxSums, cudaAuxIncs, m_numElementsAlive); addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); checkCUDAError("kernel failed!"); - cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); - cudaMemcpy(m_auxSums, cudaAuxIncs, m_numElements/(THREADS_PER_BLOCK*2)*sizeof(int), cudaMemcpyDeviceToHost); - - // // scan algorithm - // for (int d=1; d<=ceil(log(m_numElementsAlive)/log(2)); d++){ - // sum<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive, powf(2.0f, d-1)); - // int* temp = cudaIndicesA; - // cudaIndicesA = cudaIndicesB; - // cudaIndicesB = temp; - // } - - // test<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive); - // checkCUDAError("kernel failed!"); - // cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); - // //Stream compation from A into B, then save back into A - // streamCompaction<<>>(cudaDataA, cudaIndicesA, cudaDataB, m_numElementsAlive); - // dataPacket * temp = cudaDataA; - // cudaDataA = cudaDataB; - // cudaDataB = temp; - - // // update numrays - // cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); + // cudaMemcpy(m_indices, cudaIndicesA, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); + // for (int i=0; i>>(cudaDataA, cudaIndicesB, cudaDataB, m_numElementsAlive); + dataPacket * temp = cudaDataA; + cudaDataA = cudaDataB; + cudaDataB = temp; + + // update numrays + cudaMemcpy(&m_numElementsAlive, &cudaIndicesB[m_numElementsAlive], sizeof(int), cudaMemcpyDeviceToHost); } void DataStream::compactNaiveSumGlobal(){ @@ -523,7 +552,6 @@ void DataStream::compactNaiveSumSharedSingleBlock(){ checkCUDAError("kernel failed!"); cudaMemcpy(m_indices, cudaIndicesB, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); - } void DataStream::compactNaiveSumSharedArbitrary(){ From 2ca861f804d375c5e713758e3fb5597609169cb7 Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Sat, 27 Sep 2014 19:24:59 -0400 Subject: [PATCH 09/18] kill stream error --- src/main.cpp | 17 ++++++++++++----- src/streamCompaction.cu | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 7935650..b65f062 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -199,14 +199,21 @@ void naiveCompactSharedArbitrary(){ int bound = 0; while(ds.numAlive () > 0 && bound < 10){ + int toKill = rand() % ds.numAlive(); - // toKill = 10; ds.kill(toKill); - ds.compactNaiveSumSharedArbitrary (); - dataPacket cur; ds.getData(toKill, cur); - cout<<"killing "<>>(cudaDataA, cudaIndicesA, m_numElementsAlive); } void DataStream::compactNaiveSumGlobal(){ @@ -539,6 +548,7 @@ void DataStream::compactNaiveSumGlobal(){ cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); cout<>>(cudaDataA, cudaIndicesA, m_numElementsAlive); } void DataStream::compactNaiveSumSharedSingleBlock(){ @@ -608,6 +618,8 @@ void DataStream::compactNaiveSumSharedArbitrary(){ cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); cout<>>(cudaDataA, cudaIndicesA, m_numElementsAlive); } bool DataStream::getData(int index, dataPacket& data){ From 0717b73061d7aabc8781cd526b18132780d2660a Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Sun, 28 Sep 2014 11:07:24 
-0400 Subject: [PATCH 10/18] shared mem typo --- src/main.cpp | 171 +++++++++++++++++++++++++++------------- src/streamCompaction.cu | 61 ++++++++++++-- src/streamCompaction.h | 6 +- 3 files changed, 175 insertions(+), 63 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index b65f062..f2f90c8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -87,7 +87,7 @@ void naiveSumGlobal(){ } void naiveCompactGlobal(){ - int numElements = 33; + int numElements = 100000; dataPacket * ints = new dataPacket[numElements]; for (int i=0; i 0 && bound < 1){ - int toKill = rand() % ds.numAlive(); - toKill = 10; - ds.kill(toKill); + while(ds.numAlive () > 0){ + for (int i=0; i 0){ + for (int i=0; i 0){ + for (int i=0; i + +// ... +// struct is_even +// { +// __host__ __device__ +// bool operator()(const int x) +// { +// return (x % 2) == 0; +// } +// }; +// ... +// int N = 6; +// int data[N] = { 0, 1, 2, 3, 4, 5}; +// int stencil[N] = {-2, 0, -1, 0, 1, 2}; +// int result[4]; +// thrust::copy_if(data, data + N, stencil, result, is_even()); +// // data remains = { 0, 1, 2, 3, 4, 5}; +// // stencil remains = {-2, 0, -1, 0, 1, 2}; +// // result is now { 0, 1, 3, 5} void checkCUDAError(const char *msg) { cudaError_t err = cudaGetLastError(); @@ -242,6 +262,33 @@ __global__ void streamCompaction(dataPacket* inRays, int* indices, dataPacket* o } } +struct isAlive +{ + __host__ __device__ + bool operator()(const dataPacket& dp) + { + return dp.alive; + } +}; + +struct isEven +{ + __host__ __device__ + bool operator()(const int x) + { + return (x%2 == 0); + } +}; + +struct isOne +{ + __host__ __device__ + bool operator()(const int x) + { + return (x == 1); + } +}; + __global__ void killStream(int index, dataPacket* inRays, int* indices, int numElements){ int k = (blockIdx.x * blockDim.x) + threadIdx.x; @@ -440,6 +487,10 @@ void DataStream::globalSum(int* in, int* out, int n){ shift<<>>(in, out, m_numElementsAlive); } +void DataStream::thrustStreamCompact(){ + thrust::copy_if (m_data, m_data+m_numElements, m_indices, m_data, isOne()); +} + void DataStream::compactWorkEfficientArbitrary(){ int numElements = m_numElements; @@ -460,8 +511,8 @@ void DataStream::compactWorkEfficientArbitrary(){ dim3 threadsPerBlockL(threadsPerBlock); dim3 fullBlocksPerGridL(int(ceil(float(m_numElementsAlive)/float(threadsPerBlock)))); - workEfficientArbitrary<<>>(cudaIndicesA, cudaIndicesB, procsPefBlock, cudaAuxSums); - checkCUDAError("kernel failed!"); + workEfficientArbitrary<<>>(cudaIndicesA, cudaIndicesB, procsPefBlock, cudaAuxSums); + checkCUDAError("kernel failed1!"); // cudaMemcpy(m_indices, cudaIndicesA, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); // for (int i=0; i>>(cudaAuxSums, cudaAuxIncs, m_numElementsAlive); addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); - checkCUDAError("kernel failed!"); + checkCUDAError("kernel failed2!"); // cudaMemcpy(m_indices, cudaIndicesA, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); // for (int i=0; i>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive); - //Stream compation from A into B, then save back into A + // Stream compation from A into B, then save back into A streamCompaction<<>>(cudaDataA, cudaIndicesB, cudaDataB, m_numElementsAlive); dataPacket * temp = cudaDataA; cudaDataA = cudaDataB; @@ -572,7 +623,7 @@ void DataStream::compactNaiveSumSharedArbitrary(){ dim3 threadsPerBlockL(threadsPerBlock*2); dim3 fullBlocksPerGridL(m_numElements/(threadsPerBlock*2)); - naiveSumSharedArbitrary<<>>(cudaIndicesA, cudaIndicesB, threadsPerBlock*2, cudaAuxSums); + 
naiveSumSharedArbitrary<<>>(cudaIndicesA, cudaIndicesB, threadsPerBlock*2, cudaAuxSums); checkCUDAError("kernel failed 1 !"); //////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/streamCompaction.h b/src/streamCompaction.h index 2dd3872..f8e8712 100755 --- a/src/streamCompaction.h +++ b/src/streamCompaction.h @@ -14,8 +14,9 @@ #include #include #include +#include -#define THREADS_PER_BLOCK 8 +#define THREADS_PER_BLOCK 64 struct dataPacket{ int index; @@ -35,8 +36,6 @@ class DataStream{ dataPacket * m_data; - - dataPacket * cudaDataA; dataPacket * cudaDataB; @@ -47,6 +46,7 @@ class DataStream{ int * cudaAuxIncs; void globalSum(int* in, int* out, int n); + void thrustStreamCompact(); public: int * m_indices; From 0b88a21737f568e11ba898d63f1a8f42d18abe9e Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Sun, 28 Sep 2014 13:51:30 -0400 Subject: [PATCH 11/18] omg I think Im done? --- src/main.cpp | 129 ++++++++++++---------------------------- src/streamCompaction.cu | 112 ++++++++++++++++------------------ src/streamCompaction.h | 3 +- 3 files changed, 91 insertions(+), 153 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index f2f90c8..fcaa421 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,5 +1,4 @@ #include -#include #include "streamCompaction.h" @@ -19,11 +18,11 @@ void serialSum(){ ds.serialScan(); - for (int i=0; i 0){ - for (int i=0; i +#define NUM_BANKS 16 +#define LOG_NUM_BANKS 4 +#define CONFLICT_FREE_OFFSET(n) \ + ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS)) + // ... // struct is_even // { @@ -115,10 +120,6 @@ __global__ void naiveSumSharedArbitrary(int* in, int* out, int n, int* sums=0){ int localIndex = threadIdx.x; int globalIndex = (blockIdx.x * blockDim.x) + threadIdx.x; - // if (globalIndex >= n) return; - - // out[k] = index; return; - extern __shared__ int shared[]; int *tempIn = &shared[0]; int *tempOut = &shared[n]; @@ -195,13 +196,15 @@ __global__ void workEfficientArbitrary(int* in, int* out, int n, int* sums=0){ extern __shared__ float temp[]; - int realIndex = (blockIdx.x * blockDim.x) + threadIdx.x; - int offset = 1; int index = threadIdx.x; - temp[2*index] = in[2*realIndex]; - temp[2*index+1] = in[2*realIndex+1]; + int indexA = index; + int indexB = index + (n/2); + int bankOffsetA = CONFLICT_FREE_OFFSET(indexA); + int bankOffsetB = CONFLICT_FREE_OFFSET(indexB); + temp[indexA + bankOffsetA] = in[indexA]; + temp[indexB + bankOffsetB] = in[indexB]; for (int d = n>>1; d>0; d >>= 1){ __syncthreads(); @@ -209,14 +212,17 @@ __global__ void workEfficientArbitrary(int* in, int* out, int n, int* sums=0){ int ai = offset * (2*index+1) - 1; int bi = offset * (2*index+2) - 1; + ai += CONFLICT_FREE_OFFSET(ai); + bi += CONFLICT_FREE_OFFSET(bi); + temp[bi] += temp[ai]; } offset *= 2; } if (index == 0){ - if (sums) sums[blockIdx.x] = temp[n-1]; - temp[n - 1] = 0; + if (sums) sums[blockIdx.x] = temp[n - 1 + CONFLICT_FREE_OFFSET(n - 1)]; + temp[n - 1 + CONFLICT_FREE_OFFSET(n - 1)] = 0; } for (int d = 1; d>>(cudaIndicesA, cudaIndicesB, procsPefBlock, cudaAuxSums); - checkCUDAError("kernel failed1!"); - - // cudaMemcpy(m_indices, cudaIndicesA, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); - // for (int i=0; i>>(cudaAuxSums, cudaAuxIncs, sumSize, powf(2.0f, d-1)); @@ -535,31 +535,8 @@ void DataStream::compactWorkEfficientArbitrary(){ cudaAuxSums = cudaAuxIncs; cudaAuxIncs = temp; } - shift<<>>(cudaAuxSums, cudaAuxIncs, m_numElementsAlive); - + shift<<>>(cudaAuxSums, cudaAuxIncs, m_numElements/(THREADS_PER_BLOCK*2)); 
addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); - checkCUDAError("kernel failed2!"); - - // cudaMemcpy(m_indices, cudaIndicesA, m_numElements*sizeof(int), cudaMemcpyDeviceToHost); - // for (int i=0; i>>(cudaDataA, cudaIndicesB, cudaDataB, m_numElementsAlive); @@ -580,14 +557,30 @@ void DataStream::compactNaiveSumGlobal(){ dim3 threadsPerBlockL(threadsPerBlock); dim3 fullBlocksPerGridL(m_numElements/threadsPerBlock); + clock_t t = clock(); for (int d=1; d<=ceil(log(m_numElementsAlive)/log(2)); d++){ sum<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive, powf(2.0f, d-1)); + checkCUDAError("kernel failed 1 !"); cudaThreadSynchronize(); int* temp = cudaIndicesA; cudaIndicesA = cudaIndicesB; cudaIndicesB = temp; } shift<<>>(cudaIndicesA, cudaIndicesB, m_numElementsAlive); + checkCUDAError("kernel failed 1 !"); + t = clock() - t; + + cudaMemcpy(m_indices, cudaIndicesB, m_numElementsAlive*sizeof(int), cudaMemcpyDeviceToHost); + + cout<>>(cudaDataA, cudaIndicesB, cudaDataB, m_numElementsAlive); @@ -597,8 +590,6 @@ void DataStream::compactNaiveSumGlobal(){ // update numrays cudaMemcpy(&m_numElementsAlive, &cudaIndicesA[m_numElementsAlive-1], sizeof(int), cudaMemcpyDeviceToHost); - cout<>>(cudaDataA, cudaIndicesA, m_numElementsAlive); } @@ -635,7 +626,7 @@ void DataStream::compactNaiveSumSharedArbitrary(){ dim3 threadsPerBlockOld(threadsPerBlock); dim3 fullBlocksPerGridOld(int(ceil(float(sumSize)/float(threadsPerBlock)))); - cudaMemcpy(cudaAuxIncs, cudaAuxSums, m_numElements/(THREADS_PER_BLOCK*2)*sizeof(int), cudaMemcpyDeviceToDevice); + // cudaMemcpy(cudaAuxIncs, cudaAuxSums, m_numElements/(THREADS_PER_BLOCK*2)*sizeof(int), cudaMemcpyDeviceToDevice); for (int d=1; d<=ceil(log(sumSize)/log(2)); d++){ sum<<>>(cudaAuxSums, cudaAuxIncs, sumSize, powf(2.0f, d-1)); @@ -644,7 +635,7 @@ void DataStream::compactNaiveSumSharedArbitrary(){ cudaAuxSums = cudaAuxIncs; cudaAuxIncs = temp; } - shift<<>>(cudaAuxSums, cudaAuxIncs, m_numElementsAlive); + shift<<>>(cudaAuxSums, cudaAuxIncs, m_numElements/(THREADS_PER_BLOCK*2)); addIncs<<>>(cudaAuxIncs, cudaIndicesB, m_numElements); @@ -652,24 +643,21 @@ void DataStream::compactNaiveSumSharedArbitrary(){ int * temp = cudaIndicesA; cudaIndicesA = cudaIndicesB; cudaIndicesB = temp; - //////////////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////////////// dim3 threadsPerBlockLL(threadsPerBlock); dim3 fullBlocksPerGridLL(m_numElements/threadsPerBlock); + clock_t t = clock(); //Stream compation from A into B, then save back into A streamCompaction<<>>(cudaDataA, cudaIndicesB, cudaDataB, m_numElementsAlive); dataPacket * tempDP = cudaDataA; cudaDataA = cudaDataB; cudaDataB = tempDP; + t = clock() - t; + cout<<(float)t / CLOCKS_PER_SEC<>>(cudaDataA, cudaIndicesA, m_numElementsAlive); } diff --git a/src/streamCompaction.h b/src/streamCompaction.h index f8e8712..5585e70 100755 --- a/src/streamCompaction.h +++ b/src/streamCompaction.h @@ -15,8 +15,9 @@ #include #include #include +#include -#define THREADS_PER_BLOCK 64 +#define THREADS_PER_BLOCK 256 struct dataPacket{ int index; From 25959a19af56ed683015fa961439c956dcbcd6b2 Mon Sep 17 00:00:00 2001 From: Jeremy Newlin Date: Sun, 28 Sep 2014 14:01:00 -0400 Subject: [PATCH 12/18] testing readme --- README.md | 129 +++--------------------------------------------------- 1 file changed, 6 insertions(+), 123 deletions(-) diff --git a/README.md b/README.md index 6e02afa..9ec505d 100644 --- a/README.md +++ b/README.md @@ 
-1,133 +1,16 @@ -Project-2 -========= - A Study in Parallel Algorithms : Stream Compaction -# INTRODUCTION -Many of the algorithms you have learned thus far in your career have typically -been developed from a serial standpoint. When it comes to GPUs, we are mainly -looking at massively parallel work. Thus, it is necessary to reorient our -thinking. In this project, we will be implementing a couple different versions -of prefix sum. We will start with a simple single thread serial CPU version, -and then move to a naive GPU version. Each part of this homework is meant to -follow the logic of the previous parts, so please do not do this homework out of -order. - -This project will serve as a stream compaction library that you may use (and -will want to use) in your -future projects. For that reason, we suggest you create proper header and CUDA -files so that you can reuse this code later. You may want to create a separate -cpp file that contains your main function so that you can test the code you -write. - -# OVERVIEW -Stream compaction is broken down into two parts: (1) scan, and (2) scatter. - -## SCAN -Scan or prefix sum is the summation of the elements in an array such that the -resulting array is the summation of the terms before it. Prefix sum can either -be inclusive, meaning the current term is a summation of all the elements before -it and itself, or exclusive, meaning the current term is a summation of all -elements before it excluding itself. - -Inclusive: - -In : [ 3 4 6 7 9 10 ] - -Out : [ 3 7 13 20 29 39 ] - -Exclusive - -In : [ 3 4 6 7 9 10 ] - -Out : [ 0 3 7 13 20 29 ] - -Note that the resulting prefix sum will always be n + 1 elements if the input -array is of length n. Similarly, the first element of the exclusive prefix sum -will always be 0. In the following sections, all references to prefix sum will -be to the exclusive version of prefix sum. - -## SCATTER -The scatter section of stream compaction takes the results of the previous scan -in order to reorder the elements to form a compact array. - -For example, let's say we have the following array: -[ 0 0 3 4 0 6 6 7 0 1 ] - -We would only like to consider the non-zero elements in this zero, so we would -like to compact it into the following array: -[ 3 4 6 6 7 1 ] - -We can perform a transform on input array to transform it into a boolean array: - -In : [ 0 0 3 4 0 6 6 7 0 1 ] - -Out : [ 0 0 1 1 0 1 1 1 0 1 ] - -Performing a scan on the output, we get the following array : - -In : [ 0 0 1 1 0 1 1 1 0 1 ] - -Out : [ 0 0 0 1 2 2 3 4 5 5 ] - -Notice that the output array produces a corresponding index array that we can -use to create the resulting array for stream compaction. - -# PART 1 : REVIEW OF PREFIX SUM -Given the definition of exclusive prefix sum, please write a serial CPU version -of prefix sum. You may write this in the cpp file to separate this from the -CUDA code you will be writing in your .cu file. - -# PART 2 : NAIVE PREFIX SUM -We will now parallelize this the previous section's code. Recall from lecture -that we can parallelize this using a series of kernel calls. In this portion, -you are NOT allowed to use shared memory. - -### Questions -* Compare this version to the serial version of exclusive prefix scan. Please - include a table of how the runtimes compare on different lengths of arrays. -* Plot a graph of the comparison and write a short explanation of the phenomenon you - see here. - -# PART 3 : OPTIMIZING PREFIX SUM -In the previous section we did not take into account shared memory. 
In the
-previous section, we kept everything in global memory, which is much slower than
-shared memory.
-
-## PART 3a : Write prefix sum for a single block
-Shared memory is accessible to threads of a block. Please write a version of
-prefix sum that works on a single block.
-
-## PART 3b : Generalizing to arrays of any length.
-Taking the previous portion, please write a version that generalizes prefix sum
-to arbitrary length arrays, this includes arrays that will not fit on one block.
+There are two main components of stream compaction: scan and scatter.
-### Questions
-* Compare this version to the parallel prefix sum using global memory.
-* Plot a graph of the comparison and write a short explanation of the phenomenon
-  you see here.
+Here is a comparison of the various methods I used to scan:
-# PART 4 : ADDING SCATTER
-First create a serial version of scatter by expanding the serial version of
-prefix sum. Then create a GPU version of scatter. Combine the function call
-such that, given an array, you can call stream compact and it will compact the
-array for you. Finally, write a version using thrust.
+![](https://drive.google.com/file/d/0BzqFSVys9HdcV0t2eE43YXgydDQ/edit?usp=sharing)
-### Questions
-* Compare your version of stream compact to your version using thrust. How do
-  they compare? How might you optimize yours more, or how might thrust's stream
-  compact be optimized.
+And here is a comparison of my scatter implementation and thrust's. I think I'm using a slow
-# EXTRA CREDIT (+10)
-For extra credit, please optimize your prefix sum for work parallelism and to
-deal with bank conflicts. Information on this can be found in the GPU Gems
-chapter listed in the references.
+thrust version of this, because I don't think my basic version in CUDA should be as fast as thrust.
-# SUBMISSION
-Please answer all the questions in each of the subsections above and write your
-answers in the README by overwriting the README file. In future projects, we
-expect your analysis to be similar to the one we have led you through in this
-project. Like other projects, please open a pull request and email Harmony.
+![](https://drive.google.com/file/d/0BzqFSVys9HdcV0t2eE43YXgydDQ/edit?usp=sharing)
 # REFERENCES
 "Parallel Prefix Sum (Scan) with CUDA." GPU Gems 3.

From ac93f6f33f642cd57446174ebd497c5f1682d3be Mon Sep 17 00:00:00 2001
From: Jeremy Newlin
Date: Sun, 28 Sep 2014 14:02:00 -0400
Subject: [PATCH 13/18] testing readme again

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 9ec505d..9a135f2 100644
--- a/README.md
+++ b/README.md
@@ -4,10 +4,10 @@ There are two main components of stream compaction: scan and scatter.
 Here is a comparison of the various methods I used to scan:
-![](https://drive.google.com/file/d/0BzqFSVys9HdcV0t2eE43YXgydDQ/edit?usp=sharing)
+![](https://lh4.googleusercontent.com/TWSCNE_ZOLPWiv-EFjObiNwU7AW9Qfz5X4F-wtiu6JngBCe1ZIg_T5HCn5_k8q8d4OnJkageIPI=w1505-h726)
 And here is a comparison of my scatter implementation and thrust's. I think I'm using a slow
-thrut version of this, becuase I don't think my basic version in CUDA should be as fast as thrust.
+thrust version of this, because I don't think my basic version in CUDA should be as fast as thrust.
 ![](https://drive.google.com/file/d/0BzqFSVys9HdcV0t2eE43YXgydDQ/edit?usp=sharing)
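Written out serially, the two components named above reduce to an exclusive scan of the alive flags followed by a scatter. The sketch below is a minimal reference under that reading; it assumes the dataPacket struct from streamCompaction.h, and serialCompact is an illustrative name rather than a function in these patches:

```cpp
// Minimal serial reference for stream compaction: an exclusive scan of the
// alive flags, then a scatter of the survivors to their scanned positions.
// Assumes dataPacket (with an `alive` member) from streamCompaction.h.
#include <vector>

int serialCompact(dataPacket* data, int n) {
    std::vector<int> indices(n);
    int running = 0;
    for (int i = 0; i < n; ++i) {      // exclusive prefix sum of the flags
        indices[i] = running;
        running += data[i].alive ? 1 : 0;
    }
    std::vector<dataPacket> out(running);
    for (int i = 0; i < n; ++i)        // scatter: survivors keep their order
        if (data[i].alive) out[indices[i]] = data[i];
    for (int i = 0; i < running; ++i)  // copy the compacted result back
        data[i] = out[i];
    return running;                    // number of elements still alive
}
```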
From fc83941fda04e86a18aed9f511f6b79e5074ae02 Mon Sep 17 00:00:00 2001
From: Jeremy Newlin
Date: Sun, 28 Sep 2014 14:04:54 -0400
Subject: [PATCH 14/18] testing readme again

---
 README.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9a135f2..cb954ae 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,16 @@ Here is a comparison of the various methods I used to scan:
 ![](https://lh4.googleusercontent.com/TWSCNE_ZOLPWiv-EFjObiNwU7AW9Qfz5X4F-wtiu6JngBCe1ZIg_T5HCn5_k8q8d4OnJkageIPI=w1505-h726)
+As you can see, the serial version is faster for small arrays, but is quickly outmatched as the array length grows. The global
+memory version is always just a bit slower than the shared memory version, which makes sense, as the only difference is the slowdown
+that comes from fetching from global memory often. The work-efficient algorithm that I've implemented must have a bug in it, because
+it only becomes comparable to the naive shared memory version after the array is over 10 million elements long. Further investigation is
+needed.
 And here is a comparison of my scatter implementation and thrust's. I think I'm using a slow
 thrust version of this, because I don't think my basic version in CUDA should be as fast as thrust.
-![](https://drive.google.com/file/d/0BzqFSVys9HdcV0t2eE43YXgydDQ/edit?usp=sharing)
+![](https://lh3.googleusercontent.com/-smo_LiXzpgg15xhhf7EwXruEdDWJ6cN-NfNbUv0Z9F7l4qwYAyI22eZpwk9dHrYbonYsrSY9ik=w1505-h726)
 # REFERENCES

From c8137e9035b1e47b8b3aa93a31266d974daaa473 Mon Sep 17 00:00:00 2001
From: Jeremy Newlin
Date: Sun, 28 Sep 2014 14:06:40 -0400
Subject: [PATCH 15/18] testing readme again

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cb954ae..bc151bf 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,9 @@ it only becomes comparable to the naive shared memory version after the array is
 needed.
 And here is a comparison of my scatter implementation and thrust's. I think I'm using a slow
-thrust version of this, because I don't think my basic version in CUDA should be as fast as thrust.
+thrust version of this, because I don't think my basic version in CUDA should be as fast as thrust. But, to be honest,
+I'm not sure how else to optimize my implementation of scatter any further. It has 3 global memory reads that are absolutely necessary,
+and a branch.
 ![](https://lh3.googleusercontent.com/-smo_LiXzpgg15xhhf7EwXruEdDWJ6cN-NfNbUv0Z9F7l4qwYAyI22eZpwk9dHrYbonYsrSY9ik=w1505-h726)
 # REFERENCES
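The thrust comparison the README describes can be driven with thrust::copy_if. The following is a hedged sketch, not the code that produced the timings: it reuses the isAlive predicate defined in streamCompaction.cu, while thrustCompact is an illustrative name:

```cpp
// Sketch: compaction via thrust::copy_if on device vectors, using the
// isAlive functor from streamCompaction.cu. Illustrative only.
#include <thrust/device_vector.h>
#include <thrust/copy.h>

int thrustCompact(thrust::device_vector<dataPacket>& d_data) {
    thrust::device_vector<dataPacket> d_out(d_data.size());
    thrust::device_vector<dataPacket>::iterator newEnd =
        thrust::copy_if(d_data.begin(), d_data.end(), d_out.begin(), isAlive());
    int count = int(newEnd - d_out.begin());  // survivors
    d_out.resize(count);
    d_data.swap(d_out);                       // keep the compacted buffer
    return count;
}
```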
From 61c2ec06fa23f62201d546fb883f3ce44adab598 Mon Sep 17 00:00:00 2001
From: Jeremy Newlin
Date: Sun, 28 Sep 2014 14:07:48 -0400
Subject: [PATCH 16/18] finished

---
 src/streamCompaction.cu | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/src/streamCompaction.cu b/src/streamCompaction.cu
index 60c8339..574cf6b 100644
--- a/src/streamCompaction.cu
+++ b/src/streamCompaction.cu
@@ -14,25 +14,6 @@ using namespace std;
 #define CONFLICT_FREE_OFFSET(n) \
  ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS))
-// ...
-// struct is_even
-// {
-// __host__ __device__
-// bool operator()(const int x)
-// {
-// return (x % 2) == 0;
-// }
-// };
-// ...
-// int N = 6;
-// int data[N] = { 0, 1, 2, 3, 4, 5};
-// int stencil[N] = {-2, 0, -1, 0, 1, 2};
-// int result[4];
-// thrust::copy_if(data, data + N, stencil, result, is_even());
-// // data remains = { 0, 1, 2, 3, 4, 5};
-// // stencil remains = {-2, 0, -1, 0, 1, 2};
-// // result is now { 0, 1, 3, 5}
-
 void checkCUDAError(const char *msg) {
   cudaError_t err = cudaGetLastError();
   if( cudaSuccess != err) {

From eedbea8df4335092d970f52b07dcfe2aafb24359 Mon Sep 17 00:00:00 2001
From: Jeremy Newlin
Date: Sun, 28 Sep 2014 14:09:37 -0400
Subject: [PATCH 17/18] finished

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index bc151bf..ef55956 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ There are two main components of stream compaction: scan and scatter.
 Here is a comparison of the various methods I used to scan:
-![](https://lh4.googleusercontent.com/TWSCNE_ZOLPWiv-EFjObiNwU7AW9Qfz5X4F-wtiu6JngBCe1ZIg_T5HCn5_k8q8d4OnJkageIPI=w1505-h726)
+![](http://imgur.com/AaR3gk0,V55kt3w)
 As you can see, the serial version is faster for small arrays, but is quickly outmatched as the array length grows. The global
@@ -17,7 +17,7 @@ thrust version of this, because I don't think my basic version in CUDA should be
 I'm not sure how else to optimize my implementation of scatter any further. It has 3 global memory reads that are absolutely necessary,
 and a branch.
-![](https://lh3.googleusercontent.com/-smo_LiXzpgg15xhhf7EwXruEdDWJ6cN-NfNbUv0Z9F7l4qwYAyI22eZpwk9dHrYbonYsrSY9ik=w1505-h726)
+![](http://imgur.com/AaR3gk0,V55kt3w#1)
 # REFERENCES

From 6872df4cc267207058e2a248828b742ce9bf767e Mon Sep 17 00:00:00 2001
From: Jeremy Newlin
Date: Sun, 28 Sep 2014 14:10:20 -0400
Subject: [PATCH 18/18] finished

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ef55956..0d60152 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ There are two main components of stream compaction: scan and scatter.
 Here is a comparison of the various methods I used to scan:
-![](http://imgur.com/AaR3gk0,V55kt3w)
+![](http://i.imgur.com/AaR3gk0.png)
 As you can see, the serial version is faster for small arrays, but is quickly outmatched as the array length grows. The global
@@ -17,7 +17,7 @@ thrust version of this, because I don't think my basic version in CUDA should be
 I'm not sure how else to optimize my implementation of scatter any further. It has 3 global memory reads that are absolutely necessary,
 and a branch.
-![](http://imgur.com/AaR3gk0,V55kt3w#1)
+![](http://i.imgur.com/V55kt3w.png)
 # REFERENCES
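One way to chase the suspected bug in the work-efficient scan is to check the kernel against a serial rendering of the same up-sweep/down-sweep recurrence. This sketch assumes n is a power of two; the CUDA kernels above layer shared-memory tiling and the CONFLICT_FREE_OFFSET padding on top of this structure:

```cpp
// Serial Blelloch scan: in-place exclusive prefix sum, n a power of two.
void blellochScan(int* data, int n) {
    for (int d = 1; d < n; d <<= 1)          // up-sweep (reduce phase)
        for (int i = 2 * d - 1; i < n; i += 2 * d)
            data[i] += data[i - d];
    data[n - 1] = 0;                         // clear the root
    for (int d = n / 2; d >= 1; d >>= 1)     // down-sweep phase
        for (int i = 2 * d - 1; i < n; i += 2 * d) {
            int t = data[i - d];             // save the left child
            data[i - d] = data[i];           // pass the parent down
            data[i] += t;                    // right child gets the sum
        }
}
```

For example, blellochScan on {1, 2, 3, 4} yields {0, 1, 3, 6}, the exclusive scan the kernels are expected to reproduce.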