From cd6196012bf73b95e232a2323bc3be597bb8817b Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 9 Sep 2015 11:59:55 -0400 Subject: [PATCH 01/15] part 1 passed --- stream_compaction/cpu.cu | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..4da2aca 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -9,7 +9,11 @@ namespace CPU { */ void scan(int n, int *odata, const int *idata) { // TODO - printf("TODO\n"); + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = idata[i-1] + odata[i-1]; + } + } /** @@ -18,8 +22,17 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; + int sum = 0; + for (int i = 0; i < n; i++) { + if (idata[i] == 0) { + odata[i] = 0; + } + else { + odata[i] = 1; + sum++; + } + } + return sum; } /** @@ -28,8 +41,18 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; + for (int i = 0; i < n; i++) { + if (idata[i] == 0) { + odata[i] = 0; + } + else { + odata[i] = 1; + } + } + int* scanArray = new int[n]; + scan(n, scanArray, odata); + + return scanArray[n - 1] + odata[n - 1]; } } From 5e9d7dfc540c1e812924e9e1429e7ec86bded67a Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 9 Sep 2015 14:17:23 -0400 Subject: [PATCH 02/15] part 2 passed --- src/testing_helpers.hpp | 2 +- stream_compaction/naive.cu | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 69ac2e9..20bb30b 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -17,7 +17,7 @@ void printDesc(const char *desc) { template void printCmpResult(int n, T *a, T *b) { - printf(" %s \n", + printf(" %s \n", cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..d0d7ca0 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -6,14 +6,42 @@ namespace StreamCompaction { namespace Naive { -// TODO: __global__ +int *g_odata; +int *g_idata; +__global__ void kern_scan(int n, int *odata, const int *idata, int layer) { + int thrId = threadIdx.x; + if (thrId >= layer) { + + odata[thrId] = idata[thrId - layer] + idata[thrId]; + + } + else { + odata[thrId] = idata[thrId]; + } + +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + cudaMalloc((void**)&g_odata, n * sizeof(int)); + cudaMalloc((void**)&g_idata, n * sizeof(int)); + + int* scanArray = new int[n]; + scanArray[0] = 0; + for (int i = 1; i < n; i++) { + scanArray[i] = idata[i - 1]; + } + + cudaMemcpy(g_odata, odata, n*sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(g_idata, scanArray, n*sizeof(int), cudaMemcpyHostToDevice); + for (int d = 1; d <= ilog2ceil(n); d++) { + int layer = pow(2, d - 1); + kern_scan<<<1, n>>>(n, g_odata, g_idata, layer); + g_idata = g_odata; + } + cudaMemcpy(odata, g_odata, n*sizeof(int), cudaMemcpyDeviceToHost); } } From 58540474acc19315d47eb2c061d5f6cd8861a899 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 9 Sep 2015 15:25:43 -0400 Subject: [PATCH 03/15] part 3 in progress: --- stream_compaction/efficient.cu | 50 ++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..a26fe47 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -6,14 +6,58 @@ namespace StreamCompaction { namespace Efficient { -// TODO: __global__ +int* g_odata; +int* g_idata; +__global__ void kern_up_sweep(int n, int *odata, const int *idata, int layer) { + int thrId = layer*threadIdx.x; + odata[thrId + layer - 1] += idata[thrId + (layer / 2) - 1]; +} + +__global__ void kern_down_sweep(int n, int *odata, const int *idata, int layer) { + int thrId = n - 1 - layer*threadIdx.x; + int temp = idata[thrId + (layer / 2) - 1]; + odata[thrId + (layer / 2) - 1] = idata[thrId + layer - 1]; + odata[thrId + layer - 1] = temp + idata[thrId + layer - 1]; +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + cudaMalloc((void**)&g_odata, n * sizeof(int)); + cudaMalloc((void**)&g_idata, n * sizeof(int)); + + int* scanArray = new int[n]; + scanArray[0] = 0; + for (int i = 1; i < n; i++) { + scanArray[i] = idata[i - 1]; + } + + cudaMemcpy(g_odata, odata, n*sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(g_idata, scanArray, n*sizeof(int), cudaMemcpyHostToDevice); + + for (int d = 0; d <= ilog2ceil(n) - 1; d++) { + int layer = pow(2, d + 1); + float mult = 1.0f / (float)layer; + g_odata = g_idata; + int block = ceil(n*mult); + kern_up_sweep<<<1, block>>>(n, g_odata, g_idata, layer); + g_idata = g_odata; + } + cudaMemcpy(scanArray, g_idata, n*sizeof(int), cudaMemcpyDeviceToHost); + scanArray[n-1] = 0; + cudaMemcpy(g_idata, scanArray, n*sizeof(int), cudaMemcpyDeviceToHost); + for (int d = ilog2ceil(n) - 1; d >= 0; d--) { + int layer = pow(2, d + 1); + float mult = 1.0f / (float)layer; + g_odata = g_idata; + int block = ceil(n*mult); + kern_down_sweep<<<1, block>>>(n, g_odata, g_idata, layer); + g_idata = g_odata; + } + + cudaMemcpy(odata, g_odata, n*sizeof(int), cudaMemcpyDeviceToHost); + } /** From 242685b5de170e888b0ad134ec94fce33a62199f Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 9 Sep 2015 22:19:24 -0400 Subject: [PATCH 04/15] part 3.1 passed --- src/main.cpp | 2 +- stream_compaction/efficient.cu | 61 ++++++++++++++++++++++------------ 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index efc8c06..c29b215 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int SIZE = 1 << 8; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index a26fe47..8f23f71 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -9,28 +9,52 @@ namespace Efficient { int* g_odata; int* g_idata; +__global__ void generate_zeros(int *data) { + int i = threadIdx.x; + data[i] = 0; +} + +__global__ void set_zero(int size, int n, int *data) { + int i = threadIdx.x; + if (i >= n - 1) { + data[i] = 0; + } +} __global__ void kern_up_sweep(int n, int *odata, const int *idata, int layer) { - int thrId = layer*threadIdx.x; - odata[thrId + layer - 1] += idata[thrId + (layer / 2) - 1]; + int thrId = threadIdx.x + (blockIdx.x * blockDim.x); + if ((thrId < n) && (thrId%layer == 0)) { + odata[thrId + layer - 1] += idata[thrId + (layer / 2) - 1]; + } } __global__ void kern_down_sweep(int n, int *odata, const int *idata, int layer) { - int thrId = n - 1 - layer*threadIdx.x; - int temp = idata[thrId + (layer / 2) - 1]; - odata[thrId + (layer / 2) - 1] = idata[thrId + layer - 1]; - odata[thrId + layer - 1] = temp + idata[thrId + layer - 1]; + int thrId = threadIdx.x + (blockIdx.x * blockDim.x); + if ((thrId < n) && (thrId%layer == 0)) { + int temp = idata[thrId + (layer / 2) - 1]; + odata[thrId + (layer / 2) - 1] = idata[thrId + layer - 1]; + odata[thrId + layer - 1] += temp; + } + + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - cudaMalloc((void**)&g_odata, n * sizeof(int)); - cudaMalloc((void**)&g_idata, n * sizeof(int)); + int blockSize = 128; + int numBlocks = ceil((float)n / (float)blockSize); + int powTwo = pow(2, ilog2ceil(n)); + dim3 fullBlocksPerGrid((powTwo + blockSize - 1) / blockSize); + cudaMalloc((void**)&g_odata, powTwo * sizeof(int)); + cudaMalloc((void**)&g_idata, powTwo * sizeof(int)); + + generate_zeros<<<1, powTwo>>>(g_odata); + generate_zeros<<<1, powTwo>>>(g_idata); int* scanArray = new int[n]; - scanArray[0] = 0; - for (int i = 1; i < n; i++) { - scanArray[i] = idata[i - 1]; + //scanArray[0] = 0; + for (int i = 0; i < n; i++) { + scanArray[i] = idata[i]; } cudaMemcpy(g_odata, odata, n*sizeof(int), cudaMemcpyHostToDevice); @@ -38,26 +62,21 @@ void scan(int n, int *odata, const int *idata) { for (int d = 0; d <= ilog2ceil(n) - 1; d++) { int layer = pow(2, d + 1); - float mult = 1.0f / (float)layer; g_odata = g_idata; - int block = ceil(n*mult); - kern_up_sweep<<<1, block>>>(n, g_odata, g_idata, layer); + kern_up_sweep<<>>(powTwo, g_odata, g_idata, layer); g_idata = g_odata; } - cudaMemcpy(scanArray, g_idata, n*sizeof(int), cudaMemcpyDeviceToHost); - scanArray[n-1] = 0; - cudaMemcpy(g_idata, scanArray, n*sizeof(int), cudaMemcpyDeviceToHost); + + set_zero<<<1, powTwo>>>(powTwo, n, g_idata); + for (int d = ilog2ceil(n) - 1; d >= 0; d--) { int layer = pow(2, d + 1); - float mult = 1.0f / (float)layer; g_odata = g_idata; - int block = ceil(n*mult); - kern_down_sweep<<<1, block>>>(n, g_odata, g_idata, layer); + kern_down_sweep<<>>(powTwo, g_odata, g_idata, layer); g_idata = g_odata; } cudaMemcpy(odata, g_odata, n*sizeof(int), cudaMemcpyDeviceToHost); - } /** From 3f0e1ab333c45a2de599abddbe8cab1dd95a4d6f Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 9 Sep 2015 22:54:34 -0400 Subject: [PATCH 05/15] part 3.2 --- stream_compaction/common.cu | 14 ++++++++++++-- stream_compaction/efficient.cu | 30 ++++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..666379b 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,13 @@ namespace Common { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int thrId = threadIdx.x + (blockIdx.x * blockDim.x); + if (idata[thrId] == 0) { + bools[thrId] = 0; + } + else { + bools[thrId] = 1; + } } /** @@ -32,7 +38,11 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int thrId = threadIdx.x + (blockIdx.x * blockDim.x); + + if (bools[thrId] == 1) { + odata[indices[thrId]] = idata[thrId]; + } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 8f23f71..9d8c2df 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -8,6 +8,8 @@ namespace Efficient { int* g_odata; int* g_idata; +int* dev_bools; +int* dev_indices; __global__ void generate_zeros(int *data) { int i = threadIdx.x; @@ -89,8 +91,32 @@ void scan(int n, int *odata, const int *idata) { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - // TODO - return -1; + int blockSize = 128; + int numBlocks = ceil((float)n / (float)blockSize); + int powTwo = pow(2, ilog2ceil(n)); + dim3 fullBlocksPerGrid((powTwo + blockSize - 1) / blockSize); + + cudaMalloc((void**)&g_odata, powTwo * sizeof(int)); + cudaMalloc((void**)&g_idata, powTwo * sizeof(int)); + cudaMalloc((void**)&dev_bools, powTwo * sizeof(int)); + cudaMalloc((void**)&dev_indices, powTwo * sizeof(int)); + + int* indices = new int[n]; + int* bools = new int[n]; + + cudaMemcpy(g_idata, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + Common::kernMapToBoolean<<>>(powTwo, dev_bools, g_idata); + + scan(powTwo, indices, idata); + + cudaMemcpy(dev_indices, indices, n*sizeof(int), cudaMemcpyHostToDevice); + + Common::kernScatter<<>>(powTwo, g_odata, g_idata, dev_bools, dev_indices); + + cudaMemcpy(bools, dev_bools, n*sizeof(int), cudaMemcpyDeviceToHost); + + return indices[n-1] + bools[n-1]; } } From 6a931e35117d8e1262f849933c9f7a35becf8802 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 9 Sep 2015 23:52:23 -0400 Subject: [PATCH 06/15] Part 3 passes (when it is the boolean array --- src/main.cpp | 2 +- stream_compaction/common.cu | 2 +- stream_compaction/efficient.cu | 32 +++++++++++++++++++++++++++++--- 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index c29b215..d45d8bf 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int SIZE = 8; // 1 << 8; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 666379b..82925e1 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -39,7 +39,7 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { int thrId = threadIdx.x + (blockIdx.x * blockDim.x); - + if (bools[thrId] == 1) { odata[indices[thrId]] = idata[thrId]; } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 9d8c2df..1896905 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -107,15 +107,41 @@ int compact(int n, int *odata, const int *idata) { cudaMemcpy(g_idata, idata, n*sizeof(int), cudaMemcpyHostToDevice); Common::kernMapToBoolean<<>>(powTwo, dev_bools, g_idata); - - scan(powTwo, indices, idata); + cudaMemcpy(bools, dev_bools, n*sizeof(int), cudaMemcpyDeviceToHost); + + scan(n, indices, bools); cudaMemcpy(dev_indices, indices, n*sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(g_idata, idata, n*sizeof(int), cudaMemcpyHostToDevice); Common::kernScatter<<>>(powTwo, g_odata, g_idata, dev_bools, dev_indices); + + cudaMemcpy(odata, g_odata, n*sizeof(int), cudaMemcpyDeviceToHost); + /* + printf("Bools \n"); + for (int i = 0; i < n; i++) { + printf("%i ", bools[i]); + } + printf("\n"); - cudaMemcpy(bools, dev_bools, n*sizeof(int), cudaMemcpyDeviceToHost); + printf("Indices \n"); + for (int i = 0; i < n; i++) { + printf("%i ", indices[i]); + } + printf("\n"); + + printf("idata \n"); + for (int i = 0; i < n; i++) { + printf("%i ", idata[i]); + } + printf("\n"); + printf("odata \n"); + for (int i = 0; i < n; i++) { + printf("%i ", odata[i]); + } + printf("\n"); + */ return indices[n-1] + bools[n-1]; } From eba9442d859d66bf594a64ea9f1dc3342334c9fc Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 10 Sep 2015 12:16:29 -0400 Subject: [PATCH 07/15] Fixed cpu.cu --- stream_compaction/cpu.cu | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 4da2aca..6c0a6ad 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -24,14 +24,14 @@ void scan(int n, int *odata, const int *idata) { int compactWithoutScan(int n, int *odata, const int *idata) { int sum = 0; for (int i = 0; i < n; i++) { - if (idata[i] == 0) { - odata[i] = 0; - } - else { - odata[i] = 1; + if (idata[i] != 0) { + odata[sum] = idata[i]; + printf("%i ", odata[sum]); sum++; } + } + printf("\n"); return sum; } @@ -41,18 +41,25 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - for (int i = 0; i < n; i++) { + int* bools = new int[n]; + for (int i = 0; i < n; i++) { if (idata[i] == 0) { - odata[i] = 0; + bools[i] = 0; } else { - odata[i] = 1; + bools[i] = 1; } } int* scanArray = new int[n]; - scan(n, scanArray, odata); - - return scanArray[n - 1] + odata[n - 1]; + scan(n, scanArray, bools); + for (int i = 0; i < n; i++) { + if (bools[i] == 1) { + odata[scanArray[i]] = idata[i]; + printf("%i ", odata[scanArray[i]]); + } + } + printf("\n"); + return scanArray[n - 1] + bools[n - 1]; } } From 3cb3feee3162f0002344f41f1fd08efd6e2b36b3 Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 10 Sep 2015 12:48:22 -0400 Subject: [PATCH 08/15] passing all functions --- src/main.cpp | 2 +- stream_compaction/common.cu | 7 ++++--- stream_compaction/cpu.cu | 4 ---- stream_compaction/efficient.cu | 26 +------------------------- stream_compaction/thrust.cu | 5 +++++ 5 files changed, 11 insertions(+), 33 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index d45d8bf..c29b215 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 8; // 1 << 8; + const int SIZE = 1 << 8; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 82925e1..e03178e 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -39,9 +39,10 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { int thrId = threadIdx.x + (blockIdx.x * blockDim.x); - - if (bools[thrId] == 1) { - odata[indices[thrId]] = idata[thrId]; + if (thrId < n) { + if (bools[thrId] == 1) { + odata[indices[thrId]] = idata[thrId]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 6c0a6ad..8a9c67f 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -26,12 +26,10 @@ int compactWithoutScan(int n, int *odata, const int *idata) { for (int i = 0; i < n; i++) { if (idata[i] != 0) { odata[sum] = idata[i]; - printf("%i ", odata[sum]); sum++; } } - printf("\n"); return sum; } @@ -55,10 +53,8 @@ int compactWithScan(int n, int *odata, const int *idata) { for (int i = 0; i < n; i++) { if (bools[i] == 1) { odata[scanArray[i]] = idata[i]; - printf("%i ", odata[scanArray[i]]); } } - printf("\n"); return scanArray[n - 1] + bools[n - 1]; } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 1896905..c9879a7 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -117,31 +117,7 @@ int compact(int n, int *odata, const int *idata) { Common::kernScatter<<>>(powTwo, g_odata, g_idata, dev_bools, dev_indices); cudaMemcpy(odata, g_odata, n*sizeof(int), cudaMemcpyDeviceToHost); - /* - printf("Bools \n"); - for (int i = 0; i < n; i++) { - printf("%i ", bools[i]); - } - printf("\n"); - - printf("Indices \n"); - for (int i = 0; i < n; i++) { - printf("%i ", indices[i]); - } - printf("\n"); - - printf("idata \n"); - for (int i = 0; i < n; i++) { - printf("%i ", idata[i]); - } - printf("\n"); - - printf("odata \n"); - for (int i = 0; i < n; i++) { - printf("%i ", odata[i]); - } - printf("\n"); - */ + return indices[n-1] + bools[n-1]; } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..9f13535 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -16,6 +16,11 @@ void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::device_vector dv_in(idata, idata + n); + thrust::device_vector dv_out(odata, odata + n); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::copy(dv_out.begin(), dv_out.end(), odata); + } } From 815aa9dbafb6392d73102813dbefe1572a34850c Mon Sep 17 00:00:00 2001 From: megmo21 Date: Mon, 14 Sep 2015 14:48:57 -0400 Subject: [PATCH 09/15] Update README.md --- README.md | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a82ea0f..f63de1f 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,53 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) - +* Megan Moore +* Tested on: Windows 7, i7-4770 @ 3.40GHz 16GB (Moore 100 Lab C) + +``` + +**************** +** SCAN TESTS ** +**************** + [ 3 29 33 19 0 16 10 40 39 50 44 30 9 ... 4 -858993460 ] +==== cpu scan, power-of-two ==== + [ 0 3 32 65 84 84 100 110 150 189 239 283 313 ... 6684 6688 ] +==== cpu scan, non-power-of-two ==== + [ 0 3 32 65 84 84 100 110 150 189 239 283 313 ... 6613 6626 ] + passed +==== naive scan, power-of-two ==== + passed +==== naive scan, non-power-of-two ==== + passed +==== work-efficient scan, power-of-two ==== + passed +==== work-efficient scan, non-power-of-two ==== + passed +==== thrust scan, power-of-two ==== + passed +==== thrust scan, non-power-of-two ==== + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 4 3 0 3 4 2 3 2 3 1 1 1 4 ... 3 -858993460 ] +==== cpu compact without scan, power-of-two ==== + [ 4 3 3 4 2 3 2 3 1 1 1 4 3 ... 3 -858993460 ] + passed +==== cpu compact without scan, non-power-of-two ==== + [ 4 3 3 4 2 3 2 3 1 1 1 4 3 ... 4 4 ] + passed +==== cpu compact with scan ==== + [ 4 3 3 4 2 3 2 3 1 1 1 4 3 ... 3 -858993460 ] + passed +==== work-efficient compact, power-of-two ==== + passed +==== work-efficient compact, non-power-of-two ==== + passed +Press any key to continue . . . + +``` ### (TODO: Your README) Include analysis, etc. (Remember, this is public, so don't put From 768e8adf1a0e9a0491712a5609ef8e54a630121e Mon Sep 17 00:00:00 2001 From: megmo21 Date: Mon, 14 Sep 2015 16:26:13 -0400 Subject: [PATCH 10/15] Update README.md --- README.md | 179 +----------------------------------------------------- 1 file changed, 2 insertions(+), 177 deletions(-) diff --git a/README.md b/README.md index f63de1f..51f2124 100644 --- a/README.md +++ b/README.md @@ -50,175 +50,14 @@ CUDA Stream Compaction Press any key to continue . . . ``` -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) - -Instructions (delete me) -======================== - -This is due Sunday, September 13 at midnight. - -**Summary:** In this project, you'll implement GPU stream compaction in CUDA, -from scratch. This algorithm is widely used, and will be important for -accelerating your path tracer project. - -Your stream compaction implementations in this project will simply remove `0`s -from an array of `int`s. In the path tracer, you will remove terminated paths -from an array of rays. - -In addition to being useful for your path tracer, this project is meant to -reorient your algorithmic thinking to the way of the GPU. On GPUs, many -algorithms can benefit from massive parallelism and, in particular, data -parallelism: executing the same code many times simultaneously with different -data. - -You'll implement a few different versions of the *Scan* (*Prefix Sum*) -algorithm. First, you'll implement a CPU version of the algorithm to reinforce -your understanding. Then, you'll write a few GPU implementations: "naive" and -"work-efficient." Finally, you'll use some of these to implement GPU stream -compaction. - -**Algorithm overview & details:** There are two primary references for details -on the implementation of scan and stream compaction. - -* The [slides on Parallel Algorithms](https://github.com/CIS565-Fall-2015/cis565-fall-2015.github.io/raw/master/lectures/2-Parallel-Algorithms.pptx) - for Scan, Stream Compaction, and Work-Efficient Parallel Scan. -* GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html). - -Your GPU stream compaction implementation will live inside of the -`stream_compaction` subproject. This way, you will be able to easily copy it -over for use in your GPU path tracer. - - -## Part 0: The Usual - -This project (and all other CUDA projects in this course) requires an NVIDIA -graphics card with CUDA capability. Any card with Compute Capability 2.0 -(`sm_20`) or greater will work. Check your GPU on this -[compatibility table](https://developer.nvidia.com/cuda-gpus). -If you do not have a personal machine with these specs, you may use those -computers in the Moore 100B/C which have supported GPUs. - -**HOWEVER**: If you need to use the lab computer for your development, you will -not presently be able to do GPU performance profiling. This will be very -important for debugging performance bottlenecks in your program. - -### Useful existing code - -* `stream_compaction/common.h` - * `checkCUDAError` macro: checks for CUDA errors and exits if there were any. - * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer. -* `main.cpp` - * Some testing code for your implementations. - - -## Part 1: CPU Scan & Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/cpu.cu`, implement: - -* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum. -* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using - the `scan` function. -* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan` - function. Map the input array to an array of 0s and 1s, scan it, and use - scatter to produce the output. You will need a **CPU** scatter implementation - for this (see slides or GPU Gems chapter for an explanation). - -These implementations should only be a few lines long. - - -## Part 2: Naive GPU Scan Algorithm - -In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan` - -This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. We haven't yet -taught shared memory, and you **shouldn't use it yet**. Example 39-1 uses -shared memory, but is limited to operating on very small arrays! Instead, write -this using global memory only. As a result of this, you will have to do -`ilog2ceil(n)` separate kernel invocations. - -Beware of errors in Example 39-1 in the book; both the pseudocode and the CUDA -code in the online version of Chapter 39 are known to have a few small errors -(in superscripting, missing braces, bad indentation, etc.) - -Since the parallel scan algorithm operates on a binary tree structure, it works -best with arrays with power-of-two length. Make sure your implementation works -on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory -- your intermediate array sizes will need to be rounded to the next power of -two. - - -## Part 3: Work-Efficient GPU Scan & Stream Compaction - -### 3.1. Scan - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::scan` - -All of the text in Part 2 applies. - -* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2. -* Beware of errors in Example 39-2. -* Test non-power-of-two sized arrays. - -### 3.2. Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::compact` - -For compaction, you will also need to implement the scatter algorithm presented -in the slides and the GPU Gems chapter. - -In `stream_compaction/common.cu`, implement these for use in `compact`: - -* `StreamCompaction::Common::kernMapToBoolean` -* `StreamCompaction::Common::kernScatter` - - -## Part 4: Using Thrust's Implementation - -In `stream_compaction/thrust.cu`, implement: - -* `StreamCompaction::Thrust::scan` - -This should be a very short function which wraps a call to the Thrust library -function `thrust::exclusive_scan(first, last, result)`. - -To measure timing, be sure to exclude memory operations by passing -`exclusive_scan` a `thrust::device_vector` (which is already allocated on the -GPU). You can create a `thrust::device_vector` by creating a -`thrust::host_vector` from the given pointer, then casting it. - - -## Part 5: Radix Sort (Extra Credit) (+10) - -Add an additional module to the `stream_compaction` subproject. Implement radix -sort using one of your scan implementations. Add tests to check its correctness. - - -## Write-up - -1. Update all of the TODOs at the top of this README. -2. Add a description of this project including a list of its features. -3. Add your performance analysis (see below). - -All extra credit features must be documented in your README, explaining its -value (with performance comparison, if applicable!) and showing an example how -it works. For radix sort, show how it is called and an example of its output. - -Always profile with Release mode builds and run without debugging. ### Questions * Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU. - * (You shouldn't compare unoptimized implementations to each other!) + + * Four different block sizes (128, 256, 512, 1024) were tested against four different array sizes (128, 256, 512, 1024). Based on the cudaEvent timing, none of the different combinations led to a notable difference in times. When the times did differ, it was only by a few tenths of a millisecond. Also, the speed ups that occured with a blocksize for one of the scan functions, caused other scan functions to slow down. Therefore, I used a consistent blocksize of 128 for all scan functions. * Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison @@ -241,17 +80,3 @@ Always profile with Release mode builds and run without debugging. These questions should help guide you in performance analysis on future assignments, as well. -## Submit - -If you have modified any of the `CMakeLists.txt` files at all (aside from the -list of `SOURCE_FILES`), you must test that your project can build in Moore -100B/C. Beware of any build issues discussed on the Google Group. - -1. Open a GitHub pull request so that we can see that you have finished. - The title should be "Submission: YOUR NAME". -2. Send an email to the TA (gmail: kainino1+cis565@) with: - * **Subject**: in the form of `[CIS565] Project 2: PENNKEY` - * Direct link to your pull request on GitHub - * In the form of a grade (0-100+) with comments, evaluate your own - performance on the project. - * Feedback on the project itself, if any. From 7482b82e55bcd6300dfd8745d27c10946631d93c Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 14 Sep 2015 16:27:00 -0400 Subject: [PATCH 11/15] graph added --- images/Graph.png | Bin 0 -> 13472 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 images/Graph.png diff --git a/images/Graph.png b/images/Graph.png new file mode 100644 index 0000000000000000000000000000000000000000..d6185253507b31def49c406cba6fb4c125bb54ce GIT binary patch literal 13472 zcmb_@bySpJ_wOJIDh<*lLn$HMB@F_DLrO?UgT&Ab=@;pe?vRk~ZV?1&X{iB+ZU$)> zxDVg=eeYWL{_b7x?_Kx)!J74)XP z%3OkVzOonWpVrY(M!eT3_D$B+D~xllp+0cX@U&m~zTTyN1b=%v_u_qF8`k?}FvACX z?q30csFZJCYX$?Q1&KFbeG{i((o@>}{a7M^4)n;Ud`y5?hWu0H%ZFvxt)pA}6B`+R zDXASE@Da4%_EUI^diK8L!~{A$t9jxeEhEE+3AxfY%0HGW2<~HS3l62weq*c@EU?ML&w!9)ovTuz1tE0DBK>h7>2!7f&`bCmLwC zW8|S7YTjd7W!Cf3*`kA&{tRcDNz7|7BJWoptQD)3w93O`KGt1Ouz>NY3JJzze`o39 zc&Rt_Lfux?`LwBpciPw!CgMW#baG41_l?fhBeT& zO+Ci)(XqM3YB+6^oUqabLuj`c?o&JgLT2GfE75-aYGw**g=w}9-n~3!nJ+KiLM5Gce9<{e%TPxB zg|r;je#K6KbJk#GAJPcvZG4BW44BvVH*>~8xxzB-`3(;1VzYM}K9do}a2h9vzLg!$ zs5F=hPupC?&tU}wWRnzT4&hk(tX3py)@!P#{han-VD5yg&}6HZ^7A=`ExnmW6EgFy=C(VhwWM zO4E@yuIpBRfY;30^1qsiEL>lA7SfCf%QuUMWj$D5YY-ewP2hZ{XJ9s_r{)o{;0943 zb^qegerbfE?&2E>#TiUX>}$q8-GyuE6q(uyI#TmV>sB!%5-rmGurKVM$%XwCz z+sVFcR}sTgqUWF7tIggp_BNon$j#gA*sL_yR*YKjXPQ8Zb!zZYB9<*9GDigUpKilVaJ`gnu0)^I4a)yZ9eL9fcOK!dr@Nx~R4_k~^C3Wu$U`ppWB z=IrkqLEC!6@%Aj`66I0{!$EyjKgiN{;bWcGqh_3?qojl*p*OunL>I7X(fOk7ClZX` zHOrshgrC0gFnc#L6l9R_%=+ObDG_#*GJzr8`dG>4xi-HMErW!!;=;m$1RXe@IiK%* zV`i#mEXabmw{H~t9}CB8#dnNUlf`G( zI#Gk=**D)-O6tCB8a@$Xp)534DX)#{nC4|3)fG1r{cb!)WiXlM*ybNWyk>_tsn8=W zI|(cM1Z!dw^&^d(!R}swRh{Qhjv}szG=^gZH|<_J=7`j6rWnaC)@zeo8HRQ(yma{` z%f&0SIID5{WoGwN9H%==F-v(1jMI+iLgWf=+qN^c@%Uk5^G$F5WWH;cTLW}hL#wdq zfn9iKZev{Q_IH))#%IuR?3Y}C+bW<;heQrZX*;i1+VQ*9!dAFe_y=YR_NZ*Ay|q zPIPNw_Z!7ik&vCnpTaq3l$gu+AM6V-C%>`;sCM2u4e7OPdOr{^KjBfJOB?K0 z*N!>*Q(C>1vf5uuJw_TwHq@V`cdcH=H~ls{pEEB_iz5MA&MF^cb4k zpAuDHhz2CaoH-?Jh-8V@h?2QRf5r)jy9tO!V9Owk(f+-7_*zJx zp&e0SOFi2NZuR>YS2=Gx&b4?AQ0~u~x8NFE;1VJ3cjfht&tLH~)0t{PNq5}=mY7HD z_>=`~rlRchB2YrM_U2P9UP;qHz_Xj2p{T$2XcVhRgm+4Nm+C@(b%yl|+qo6HJ%__{ zqZVknPsDZkJ@ixL(`WmXn41`4KYz}Cc(_SVgw62#eh@~n-NRiOS=n#WfuX_=LppAT zw;o|WUBM-3k5M3k5#maAD>61kbMrIN@JIZA$j1L`^3M4H9+9)k;uVxEPy}*(-B%Ei zm2F#I-0+C<3E5QN7AIpP5=sM<##15L^Q%$7ArV# z079uaX4jG;a3XfJFlvpaVikZ3rxZ_-s;+Tw7$1djtpLwtl&frzOQ99RZ@_MQyh?Y# z!ut8YLlFPuc}QL{I_r(o;J!=c{Am=m$G()SVGQ0w0H_3rC>{;ZCOe4k{ne-5YLc1Y zLT3M%YfVcr<5(zi(VO2~%5exy4I1L<>cmv|>aI?Q9(S9c@W~t4sPNNJ$CuacNu@d{ zIwoF(i2G+_QRc-*iQ$1{1j!K?jJT|jj#}TPQUXUw*DKl1o69J5|I2Vr+d|)^i=C+e zy<_puMIGVsPW$z#5yFPO7=)6_g3b+Z&W;DO(IfHGV#_XM4spu&X>pV1)L)umKi*f-wR!d;2E1rrvSA2(fn)`>X!9Ms6!WVQFejQB|Q*ZuJhU7 zX0;$o)z$yxj!);M*@(mc%AIqz%Ur$HuU~R@4br*!$WE7o%^2Mk{m1Woa*N#RWYn+< zHXPmEviN#3Qutk3l;DDa;eQqAM!|J^%+;ejhlCA(X6AdiAI_}V5{+(y2|8)PZqwbaZwt2 zH6x4$_*>Q0=#c0^4&KU3$&V)tx)vAP1n!@_*Pon3nIwMmo-@t=G6$Wi0Pk&*5tW7u zj}^X7eb#55%1Sl)AdOo=KnBlFXa26 z%|!EK*I86~6mPHj1-M!9t(OL@ys*#Vm2cV57eDQ0KyXae`~m zFE?El$XuY{-gYLDcPI}vwzolKl#gKL4=62 zq>@Wa3&l@O>=M4B7N(c`HlL33A8#fvw3Xgqe_@MWYTA8pf_k=8pC$s}|KZnCW1nxZ z@YLve`HT#U{cI2odcmC&z&(-y4Al7=V!L~lVxL*%GEoLr(n#vpierpb4+o(nALZFg_tm|1-D!K+ZyTk&DEI;v7HNEmnG< zUwU(MA6#ZI;!-B;QX?T_S@GG~kCKvIzgc^4I9j1<&S*;r5P>*U_+wm0>{R(-hZ_#MrDWXT%igfd$K{t##^I-fDmX6cEh07V+Wi>cxI2?hH*X5( zh6+B5cV-?$2;+r?;)i^G3PR?;1q_k~xwVb#~z&tUQ`DU&}I*s?kb5 zn>-*^7UT1_#w}9lsy=j49AdHs8 zI|rvwjE|p%2-g}890b~viM;K1FP@0Gtfi{>nuqU2I*7>K7jfzzZnXrTC3F&L_;)4y zF*e;aMvOb&$G&I>r<8z4COs8{$p0j1rCB|B9YRxu)zB5Ld$_1H! zJMCfB_jk$T`#5`E0x@;R*oNl$KNdG>MVdwXIWN_ui%@& zz!F>+`Lp$<*QmZL{d<#5HW&3Z>d$wcMV@%Bz-=hN7<~{PC6~eUOIX+Q`tO`*>ir&F z0mRVTOFR<#j{L(s6oFyW4SMNEw6VsYtdOknKbR?{(++d(&ch@0FA%5gVvT;A@5GQk z%hWraxAEnTTuTgY$G9o+Oe&jNPIVc)OqHj%_HiZLgOdC z?A(!AT7DB3}I^;C9di#U9J(M(J+rY&Is|1v*MNG|# z;{;L|*g3jwMBf5Rz>s|$A=oqzuR4Dbq0>5+m$94obqFzY_4-P(J)sYti#3|kLVXvp(L+~I~M~c8!p0d^lNRu-QwaITVLDsBb ze6UT~^V{MVq;_-&pHTV?0Bq0@S?!CuU$3?l8h5~Ean`EWU38_q&~d({4IOf27@$Z( z@W?`3Z`u((g__wg(f(6B5QUmt#-XxmY!x_G{;vp9iy`o5#oUw&s4L8FQT(ZKmp5yFopD% zQ45PQs8A^*Amw`Wc4+)f2MlFT^O(idai+Z+B2umw`r>5R`fCU1* z+6x8hAePval%&Qp3`l;~c3%(WGHBz$oswVYTK1r@X(USAd$!zLYdGG}>9_s~iD*Ql zZ|ASOaApi_SUBlBR>LZnxM34DEZAO*A4+TE0kCF5v+VcjA_#<=%ZAMN=cS9tjZF6n6@Jj< zD+B^{Yj}}jw~M%6n`gfH(_EU(1hB^K5-NIkEl?6`*bm}THy<^9iS6mIT|Tt zCLD)ZPK-5bp?%fPi_&*^cf#hNLxt&xEk-(ACv3>YE5L)4e}oKgVn!1HHC;K}sR{5L z)E%nhUrKmFYWMf%oN_&=^B>GXLh4G6OKN~CWmSIp#jzWn{j4Pd9?`xy?6?1MmYa5KjIQ6^i7 z(%N=tx0MzMWKyT4kg?g?^MK)>)Y|eH#|PicpgGEcQDTn(L`=(82^7shk#zq}g!|8u z8i2|nmsh>lbvyt%PQ>^-d@`qRCgEp>c>)!>m(;kfq^wbvUG>Tsh<9Aa>kv%LHa5ifbCzZ{D-x@sW9QOD5|zj zqYQKhPp3UD2sD?^i|jd@%{W|C6L0!yZN`Y)v15CL@S=lm5>So&N#9Q_s;w0};HyC6Vkmw89yh<@Qc8yx`kNiy z)j-U&k?ZEBA7It>$U9wOXkTZVgYkco#W}85o^N`o+-$cM93(q$4XA%|1`P=SLGY^E ziAX-}*<6u037*Y;njAF;qfYK&p%TNB9~&q9pcLxux8E_iMH}{weY?|( zH`6?CAAw{Pdob?7R!;)iJrE6>AJi2-tV@b56*>Q|;oCxtPI?&Bi1Y1_LhtDCn+3fe z@`9AunrS*^3aM;3i9yH##^;8&CEMr!P)f~bzfekxRb}k&O^b~X1$~q4!BFV^oOgAw z=-$mdI9h#gppou+C~uTQ^0K$CLRUT)m)s3>KPD8jZn3{n{e7Cx>gAMiEpb?an4od} zhRN-j`tMx)fB+&`z^Uzajhe`dZO}tvB5do-)YkW!(ZCKe8m!2k&Ud@FXykEY%!QX( z4o0oZ>=znf*^i$A^EPIu9U?3z+EhpbZt!;gu>ysSTLrSmv^Po^5etHk1rG|Y%~5uO za>hYKea+IW0ci?6LUmG67y&j_q4D?A&nL>ZaYX;N^pHh*mcI8+nxkMzUFT1~VX)K~JVIoK2(o18@>`UN#>} z?%zq3MEw-`Ln#wh!vIRE$^$J;V*J!kBcv67`71_VIEf)YptOH=7HCAVe6%ttQGb0idh-5tqcYQ325vt z+~rsqHf#|9Q-*-5^K0^C`?TB_+Z^!acpdeU2vJ2NsiGZ>uOI;C3A~se`c!_Ud+WJ) zuT;0-Z<}JgGzQ)<0B@ohE9niJ{|KJtMn}e??iiv6<1cqvq<>xpIKTqV7%GN>d@-=Z z70j*Q^nL>CjwEy-lw?WHC-+b={a^VfaIDuNo$QWl>B;7B65|ro7m9x-9`3m44xV<) z1K)j%bd{uRmxBiCe=U?P4UUeCa_m&@>3TTl;)IMI*4498dp=WnGxw!m5z^r+Gb3vU zNdHp(GDdi|X^yCHes7>AfMKq~ZV>5~88o<|SJ)qsoaqKw&=GE1UJ=qTSL4!6Ygd^m zbpzd@OyJ=)`mzlN-DCcz!8$`3^VZjT$Xx@bku_>2z_GyH+@zimsx>+Kbfhg&c&J93Bjx-)qZ668!Fj=c64&gy z#`;Z+PDddlEV|*{<^tbGi>M%9B_jDjtSpWa*4zKE02Yq+L&cp|DBY_Ja(rD*2)L7dt!E6 z3CJ68aO0+qjC7B*aMSz1Ct{c~%-)rcd3LgmpEFpljm&F>#(Kb}5k}Dc+CGjqD4zDn z*MurtXfCS;TkqBs+?YELSqyZEc_&x#vP{&2Rcp*O0ApfG-&OUlao zEE$ga{1j>O5e1X3>L8PLktc(Q?09g#0jXqJ=I0pj zRZJaYYf`vkO?tn^xyZn*T#qz$@u1*-HbK##APploNmCI?35Gx0mwdxNzjX66{eK;* zk-p1H)8|q#)wPKcC1UNMO+3JkoG_NeKbVPHGa*0C6w{9w6r>P+v0W#JUa1m)&enr_ zsmTM0{lUKX1Em;5A@aCiBmCy&BYRCs7*lU#l=}HTQ~-buE4dG!OsWa5xshIHiUp5| zdctP0>@~fhAwn{I0}}Y^sy8yopnTOtL9I|20aZJhjkExB2$hY_oUV;>CANZIW~s1t z<8LjVXE{5<&cAB*7hupM^y!yyIH!QocqiTaHvX%TDN#?~m77jTt0<>ogS5(%x0B7w z9TQf^QJ+qz-k0PPeMY^bUBlwQ(eW7)#n0~KKdbVK)hd~*yLlOg~ALbn3@d{GWk-r8Z`u(mYwD&*?GN8-)cbX3*V(G|nel z)Gv`r-L{=vejbv!xar=IsM>Ns(}3*j-RHZ7%Jo+fOCjSX8a-c z%=8N$G0gmZz=cBWjO{OveTo+%eyI#`f#eGKSBD(S!jTx-3K_=)-Hc;PY6V>^Ld?I) z0x7?B;p9d8Qdl>`b%FjRNa68+*4_LW0NkRA&HR2*0MK{%Dd18MHssLMQ9yHaC=R0% znXvc~z-1$lX966@X!?uIm<}BybJ@asI^fAq6_EIlljx z``>G+0tL|$#*xYMH)8}8RzPi2;tmz2YsqIkTQ0I@^kShmuW8k^NCYIE%K;-(gl73e z7@I@m)un(DS%H^5VSPoT#Z4app7bqpLP>nd3;Br91_$83JpkbGSh40Kgj74@o5aT&mQg2mB*20)A<|5>5B zNEB(_X}a5--GrX#1LoVmCWVC`KodJvNCCMbwB%@%d2A|pLKsiYfK3{JnsPq9hI~`n z9slNch=Te|mZS22N9kDLXbAv>n-sBuAy16LI~+3927-ar=W^JP6voYC7_0+)y*V;p z0eS5;X#aK#{Qt}?Uc2=apFdjh(yd7l#VR8I2p7;ct*GeI^04N|6>S{CyVK z)a2Q}xptI<3PUTNU$(%+qkQfxR>X#E@~6zgN_UlsECUAS@pt;n`y^`4+D=q^MMN^z~vHINrV;Hxa33+cDCPF27`46aX?Q$P@_>3uCl?7Y)RJvp~XaA@~Y7 zu@v}_uYBPoRR(2zn*l*2+{|r9>m~jL#)!M*`|w}n!Wf#*64$1tZfD`Y`8Dt>C5GW{ zLfS@t`!9Av`ISVDSD9ILZ;j@t9m{XXLp+v>2y%ew=fgpExRq+?@wpY1@RL{U?CL%P z@UB*1lOPQ}VGL~ldvq#{L=!uV$W0YqCZj&9%R29ZqOaiwI~l6))gDWTef^7K0K z9%m%CkQ=0t!tfBv$QHw>yhQio>29N`)lz~;4iRB}R=K<97ZU0>t5{9t29j|epz;fwDbtWa~WIp2?bS0aW-uw{tPfn1fz8w)| zCQl4ZAFYKnI-~}XRJpxzEQ$QNWqPaTpOzKS+K7H$7-AXW4bKUAj`747g!dgpRJePp zowqlocq0oxEjNxvBbVsC4rA<+ioI{=KYYJ!hioCxS(s9n6KQMT?QTQrI~!_$Y<&Q< zCiDFH1PzI0Qb>p4SV zi6Ph2AUed!qWT;14XWq!qQ^*DuWlh&xwGy7i<`(w%hq(6k_N_@p>-oC4FBYSssZ?7Hv5vJ)w-L=~ z%7ccBw@J~oALp|LPHnaRxSBN(l!7#WR%6hlh6&?)MGwC<&Cf2W=(o zRP_|TmhQ$K?6r!sghZ_T>n;v7wMVV@F5jJP$T{mV#Hz|mW%y2t>!y&*re0}2F+BDg z{+-L9ps2GuRW!KSn=AWateRJ!KCV7if|mqFGySJBd;a}HH2ao%1rm2;!EJ?3uyt}5 z1|e>*z2b|9^tb%leSiF8+r}$JqUOtI!CLCtMLr4QCmtNSuzTQ94;C?<)EJ(WZ#PUx z+@4uC7seXCDh1=Sfi3ksp<3V6Zuuo2w4PO^0&^jC6tsmvMKtNoLw*YWhgsnNg}e~W zBKl{2n3_6!fXbF?Uo) zZys)C4D&oIx@W0N-Oan{UhlEG$s;84xerE2%?2JRgYWDu?cEGJ7PR>x?IlX|T?r`$ z@nw&yPOB}ScsAU1s_`ES5OkVeEcTN)wsSMAc{tKc<&`|>h{qk-kA))o5PnyCfz>3e zkWFY}F0EX)O&;e6q*cHpk?!j8!|$RkIYNcaQc|C3ET~s>RoBgDqLe*V3Y%F z-({!4tEN`sR2H0EOO_3I{jWAGB2pP4H|wdol~>E`OBa8N3{U)5KWDvW@UyHzgwt<- zNKl_9FH_nbt$hq>?MzO5I2?^$Mv)GU35N&2eBOdp4jRi z6qHgC6ja~N=aQpLl0I=1mx<}uk86BZ^VQSV^-X_tK&&UxSS(AX#G1`S#)M8?P2N`d z#&oCiVn{THkB4%q(eZvV*RX_YNI6)oz1huBDD(K;cxng4-ko(!+gw$jkJi1kAX2%d zW8$xjX(TD^lt-JPGcNGJQhr_(S5s4y*w}A8uXVpgg+L^~ek0r8NFeL@{jnv+TTn#T1n+l&S?*mH9IJRXOZ7(8Ux~ z%}~2i9yYCuoTwh!SU(bxCvP@RRPy~JN^z12X&Q{2Q0h46Tro1NGo|2X>i z#4X1TMWM{o+gL_mVWnII24Yf*J7mUQ62?3-Ty2ann^%(>(2$5{Q;*!r+u@`k(_;4Ku96Ds!@3pw-o~~_ z5tZwJz+V}PJ*muS+tH4k=#dj}PboKdY2yKZUGf+I#K9F_daa#1e+ z;&`p_G)*o87mOB^b#;rqh@^Xd&q}5|MXA+c-SfP-6NvnTxJ$@>rWu#IG0KRqr_FiF zc=}2-dMpCEF}whYlhn2SmBQ{N<2$8s#GNWVb5=)KUT-AeLQkS#!CaC?P4uVEraUBV zIkkQuL5Z>Ayu^+Oc5n~F4>9(#lmTvkLjuSWWAAB73Mnj?6 zDcvRQ?-@`Z-!r(2-B`MoyS{>GMFmo*%4O&YoD8I^Wcw%&AY}VaiW&~vlz5&v(%(#> zw1t#Kns1*=u+i*V;(6}X&t!$uO8UHsk@?t;BM8QS5{;koVAnE5WcO{=f7pZgzmga3 zIiLN>u5!FKUisqEzL569*0|>+%9L_wn&CIC2zhEI*#zVIu>P%_w+y7tL@#1)^vW70 zYB&VXj!6<}2*Dkh?fccG@Pw%FWVJrAtptgweMwyE^>O?6OG}+S(*}KKC?}Jdd_i4Rw|ql>K&H=C4EpgR}3TxI{4wxjTZ1S8o7o_195nW18kwdv=^AP*M9o~YJW()nty z*vk~5&Xy`GxzEANQl5?PZJ_gyL_qUuMIf=!+zh_nq~aEIt+RT`y;hfcP?vGU`J(FJ ztE1G(UKk(O;&Us_Jav8xiUWb#tx30FstyJvea}l>ib0g6NV}lU% zsAut-ar79YS6*B@zQxA_bSOVzCsbJ(ua?>6%~ZY792`%4&ojMWM>vzQJi%CB{C)Hf z7O|=0svi=Obgs@5EBz(W98y$~EMNGV4Ptk?W znm}JWKRtSl+@?ZTJR^NqbqZ5PdjR+!3!+3!;&v6M!hIqBtGpE%w+){!&q$Jm!W zHcGWXFRZ?^kUGX#?+Cxg^P=ED%coL93@OLBn_PAS!(5sY12Gfk`C=Y{z`_$;a2X-m!OEIu zBb1Xv4WBjvPBFQ}e7`pm+}xatExYH$urfH)gf96Ll`yWll7y3-`PqPeosO7bBORu& z#=1ejjeWs}VyquW@0(>|xnQzT>UDHkGk_toe!hvcud%pBeU*jiI-ABZUmc6m@-arB z#^Y^x8O~l!SL$q>Yi}8Rd1uwhiYHB*I#Eqy$drK~p3>D<9cS#- zre8wjvvl)@Bx8x&OZZ&fw5;0U5PW-5BF4Qu5}9=Nwn|cHeP;(Ew&a<@HN(~Ks}yT& z;nI%$y|B6~UDq;A*25>9I!&z2llFnPwgDT<*_#sUp>|e6Zd!@H_w#pr%`o}*J zT^|>}eZ5MVfo$PAsw(vJw`i@-;~Y|)+57nY0kS!tKCbTL@JunUMze)N2dm{G@R$-GKA`v zu!bLQ1T@|LL;+uS)F%Mn!JQNdB60coL!$Tnoz5~ffR4*p-n}0DU@Su5=KY`j*USHj r{Ph#q>HZ@7EpR(~*o$cQzLmn@@-^&n+=h+Kt?$zT5ioN!GH literal 0 HcmV?d00001 From e4accd3469dce58570052c3ef9cecc9b6cf60606 Mon Sep 17 00:00:00 2001 From: megmo21 Date: Mon, 14 Sep 2015 16:28:36 -0400 Subject: [PATCH 12/15] Update README.md --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 51f2124..94e1054 100644 --- a/README.md +++ b/README.md @@ -62,9 +62,7 @@ Press any key to continue . . . * Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis). - * You should use CUDA events for timing. Be sure **not** to include any - explicit memory operations in your performance measurements, for - comparability. + ![](images/Graph.png "Array size analysis") * To guess at what might be happening inside the Thrust implementation, take a look at the Nsight timeline for its execution. From 979fbe8d28e9a2fc98d1781034bec9f8afa53c4f Mon Sep 17 00:00:00 2001 From: megmo21 Date: Mon, 14 Sep 2015 16:34:27 -0400 Subject: [PATCH 13/15] Update README.md --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 94e1054..2164aa0 100644 --- a/README.md +++ b/README.md @@ -63,8 +63,7 @@ Press any key to continue . . . Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis). ![](images/Graph.png "Array size analysis") - * To guess at what might be happening inside the Thrust implementation, take - a look at the Nsight timeline for its execution. + * The thrust application is taking a much longer time than all the other implimentations. Seeing as the cudaEvent_t's, start and stop, occur right before and after the function call to thrust::exclusive_scan(), it could be possible that these events are picking up memory transfer timing. Whereas, with all the other function, I was able to record the time without the memory transfer time being included. One other interesting thing about the thrust application, is that the time for the non-power of 2 array is significantly lower than the time for the power of 2 array. This, in theory, makes sense because a smaller array should take less time. However, with all the other applications, they take approximately the same amount of time because the kernel is 2^(ilog2ceil(n)) times (for an array of size n). In the thrust application, they must be allocating their memory better, as they clearly do not have to call the kernel 2^(ilog2ceil(n)) times. * Write a brief explanation of the phenomena you see here. * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is From fcd8f8f912ed6e5d0e4058f2334ea35780e442f5 Mon Sep 17 00:00:00 2001 From: megmo21 Date: Mon, 14 Sep 2015 16:54:31 -0400 Subject: [PATCH 14/15] Update README.md --- README.md | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 2164aa0..c826381 100644 --- a/README.md +++ b/README.md @@ -63,17 +63,10 @@ Press any key to continue . . . Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis). ![](images/Graph.png "Array size analysis") - * The thrust application is taking a much longer time than all the other implimentations. Seeing as the cudaEvent_t's, start and stop, occur right before and after the function call to thrust::exclusive_scan(), it could be possible that these events are picking up memory transfer timing. Whereas, with all the other function, I was able to record the time without the memory transfer time being included. One other interesting thing about the thrust application, is that the time for the non-power of 2 array is significantly lower than the time for the power of 2 array. This, in theory, makes sense because a smaller array should take less time. However, with all the other applications, they take approximately the same amount of time because the kernel is 2^(ilog2ceil(n)) times (for an array of size n). In the thrust application, they must be allocating their memory better, as they clearly do not have to call the kernel 2^(ilog2ceil(n)) times. + * I tried to use the chrono implimentation to time the CPU scan function. However, I continuously got 0 nanoseconds for each test. + * The thrust application is taking a much longer time than all the other implimentations. Seeing as the cudaEvent_t's, start and stop, occur right before and after the function call to thrust::exclusive_scan(), it could be possible that these events are picking up memory transfer timing. Whereas, with all the other function, I was able to record the time without the memory transfer time being included. One other interesting thing about the thrust application, is that the time for the non-power of 2 array is significantly lower than the time for the power of 2 array. This, in theory, makes sense because a smaller array should take less time. However, with all the other applications, they take approximately the same amount of time because the kernel is 2^(ilog2ceil(n)) times (for an array of size n). This can be seen in the graph, as the naive/work-efficient power of 2 and non-power of 2 lines are almost identicle. In the thrust application, they must be allocating their memory better, as they clearly do not have to call the kernel 2^(ilog2ceil(n)) times. * Write a brief explanation of the phenomena you see here. - * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is - it different for each implementation? - -* Paste the output of the test program into a triple-backtick block in your - README. - * If you add your own tests (e.g. for radix sort or to test additional corner - cases), be sure to mention it explicitly. - -These questions should help guide you in performance analysis on future -assignments, as well. + * It is interesting that the naive implementation is running faster than the work-efficient implementation. This seems to go against what we think should be happening. An idea of what could possibly be causing this is that the work-efficient implementation requires two function calls to kernels. If the array is not so large, that it requires multiple blocks, then it could finish the naive scan implementation in the same time it finishes just one of the functions (up/down sweep) in the work-efficient scan implementation. + * I would guess that the bottlenecks are occuring at the memory transfers. Especially in the work-efficient implementation, because it has so many different arrays (idata, odata, bools, and indices) it requires more memory allocation and transfers between the host and device. I would not think the computation is a large part of the time, considering it is only addition, and the amount of addition that is necessary decreases throughout the function (in all of the implementations). From c4dc2a765230dab17946331a8dbcb29edd62ca5f Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 14 Sep 2015 17:39:06 -0400 Subject: [PATCH 15/15] final --- stream_compaction/cpu.cu | 6 ++++++ stream_compaction/efficient.cu | 18 +++++++++++++++--- stream_compaction/naive.cu | 20 +++++++++++++++++--- stream_compaction/thrust.cu | 13 +++++++++++++ 4 files changed, 51 insertions(+), 6 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 8a9c67f..00be952 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,4 +1,6 @@ #include +#include +#include #include "cpu.h" namespace StreamCompaction { @@ -22,7 +24,9 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { + using namespace std; int sum = 0; + auto begin = std::chrono::high_resolution_clock::now(); for (int i = 0; i < n; i++) { if (idata[i] != 0) { odata[sum] = idata[i]; @@ -30,6 +34,8 @@ int compactWithoutScan(int n, int *odata, const int *idata) { } } + auto end = std::chrono::high_resolution_clock::now(); + //std::cout << std::chrono::duration_cast(end-begin).count() << "ns" << std::endl; return sum; } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index c9879a7..298afa1 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -43,6 +43,10 @@ __global__ void kern_down_sweep(int n, int *odata, const int *idata, int layer) * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + int blockSize = 128; int numBlocks = ceil((float)n / (float)blockSize); int powTwo = pow(2, ilog2ceil(n)); @@ -62,13 +66,16 @@ void scan(int n, int *odata, const int *idata) { cudaMemcpy(g_odata, odata, n*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(g_idata, scanArray, n*sizeof(int), cudaMemcpyHostToDevice); + cudaEventRecord(start); for (int d = 0; d <= ilog2ceil(n) - 1; d++) { int layer = pow(2, d + 1); g_odata = g_idata; + kern_up_sweep<<>>(powTwo, g_odata, g_idata, layer); g_idata = g_odata; } - + + set_zero<<<1, powTwo>>>(powTwo, n, g_idata); for (int d = ilog2ceil(n) - 1; d >= 0; d--) { @@ -77,7 +84,12 @@ void scan(int n, int *odata, const int *idata) { kern_down_sweep<<>>(powTwo, g_odata, g_idata, layer); g_idata = g_odata; } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + //printf("%f - ", milliseconds); cudaMemcpy(odata, g_odata, n*sizeof(int), cudaMemcpyDeviceToHost); } @@ -90,7 +102,7 @@ void scan(int n, int *odata, const int *idata) { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ -int compact(int n, int *odata, const int *idata) { +int compact(int n, int *odata, const int *idata) { int blockSize = 128; int numBlocks = ceil((float)n / (float)blockSize); int powTwo = pow(2, ilog2ceil(n)); @@ -115,7 +127,7 @@ int compact(int n, int *odata, const int *idata) { cudaMemcpy(g_idata, idata, n*sizeof(int), cudaMemcpyHostToDevice); Common::kernScatter<<>>(powTwo, g_odata, g_idata, dev_bools, dev_indices); - + cudaMemcpy(odata, g_odata, n*sizeof(int), cudaMemcpyDeviceToHost); return indices[n-1] + bools[n-1]; diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index d0d7ca0..e778738 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -9,7 +9,7 @@ namespace Naive { int *g_odata; int *g_idata; __global__ void kern_scan(int n, int *odata, const int *idata, int layer) { - int thrId = threadIdx.x; + int thrId = threadIdx.x + (blockIdx.x * blockDim.x); if (thrId >= layer) { odata[thrId] = idata[thrId - layer] + idata[thrId]; @@ -27,7 +27,14 @@ __global__ void kern_scan(int n, int *odata, const int *idata, int layer) { void scan(int n, int *odata, const int *idata) { cudaMalloc((void**)&g_odata, n * sizeof(int)); cudaMalloc((void**)&g_idata, n * sizeof(int)); - + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + int blockSize = 128; + dim3 numBlocks = (int)ceil((float)n / (float)blockSize); + int powTwo = pow(2, ilog2ceil(n)); + dim3 fullBlocksPerGrid((powTwo + blockSize - 1) / blockSize); + int* scanArray = new int[n]; scanArray[0] = 0; for (int i = 1; i < n; i++) { @@ -36,11 +43,18 @@ void scan(int n, int *odata, const int *idata) { cudaMemcpy(g_odata, odata, n*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(g_idata, scanArray, n*sizeof(int), cudaMemcpyHostToDevice); + cudaEventRecord(start); for (int d = 1; d <= ilog2ceil(n); d++) { int layer = pow(2, d - 1); - kern_scan<<<1, n>>>(n, g_odata, g_idata, layer); + kern_scan<<>>(n, g_odata, g_idata, layer); g_idata = g_odata; } + cudaEventRecord(stop); + + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + //printf("%f - ", milliseconds); cudaMemcpy(odata, g_odata, n*sizeof(int), cudaMemcpyDeviceToHost); } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 9f13535..d182d1b 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,7 +18,20 @@ void scan(int n, int *odata, const int *idata) { // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); thrust::device_vector dv_in(idata, idata + n); thrust::device_vector dv_out(odata, odata + n); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + cudaEventRecord(stop); + + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + //printf("%f - ", milliseconds); + thrust::copy(dv_out.begin(), dv_out.end(), odata); }