From 249ef1adbc739ec316ffd7befea25cd64ca4eee6 Mon Sep 17 00:00:00 2001 From: Sally Kong Date: Sun, 13 Sep 2015 19:47:17 -0400 Subject: [PATCH 01/10] working cpu, naive, and upSweep, buggy downSweep --- src/main.cpp | 18 +++++----- stream_compaction/cpu.cu | 30 +++++++++++++---- stream_compaction/efficient.cu | 61 ++++++++++++++++++++++++++++++---- stream_compaction/naive.cu | 54 ++++++++++++++++++++++++++++-- stream_compaction/naive.h | 1 + 5 files changed, 142 insertions(+), 22 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 675da35..b145c60 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,7 @@ #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int SIZE = 1 << 3; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; @@ -38,30 +38,31 @@ int main(int argc, char* argv[]) { printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); printArray(NPOT, b, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); + printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, false); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); + printArray(NPOT, c, false); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -112,12 +113,13 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + system("pause"); } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..ac5d74f 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -8,8 +8,10 @@ namespace CPU { * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = idata[i-1] + odata[i-1]; + } } /** @@ -18,8 +20,16 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; + int cnt = 0; + for(int i = 0; i < n; i++) { + if (idata[i] != 0) { + cnt++; + odata[i] = 1; + } else { + odata[i] = 0; + } + } + return cnt; } /** @@ -28,8 +38,16 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; + for(int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[i] = 1; + } else { + odata[i] = 0; + } + } + int* result = new int[n]; + scan(n, result, odata); + return result[n-1]; } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..2dbc120 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -6,14 +6,63 @@ namespace StreamCompaction { namespace Efficient { -// TODO: __global__ +#define blockSize 128 +int *temp_scan; +int *scan_result; + +__global__ void upSweep(int n, int d, int *o_data, int *i_data) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index <= n) { + if (index % (int)pow(2.0, d+1) == 0) { + o_data[index-1] = i_data[index - 1 - (int)pow(2.0, d)] + i_data[index - 1]; + } + } +} + +__global__ void downSweep(int n, int d, int *o_data, int *i_data) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index <= n) { + if (index % (int)pow(2.0, d+1) == 0) { + o_data[index - 1 - (int)pow(2.0, d)] = o_data[index-1]; + o_data[index-1] = i_data[index - 1 - (int)pow(2.0, d)] + i_data[index - 1]; + } + } + +} -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + int d = ilog2ceil(n); + + cudaMalloc((void**)&scan_result, n * sizeof(int)); + cudaMalloc((void**)&temp_scan, n * sizeof(int)); + + cudaMemcpy(temp_scan, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(scan_result, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + for (int i = 0; i < d; i++) { + upSweep<<>>(n, i, scan_result, temp_scan); + temp_scan = scan_result; + } + + /* + cudaMemcpy(odata, scan_result, n * sizeof(int), cudaMemcpyDeviceToHost); + odata[n-1] = 0; + + + cudaMemcpy(scan_result, odata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(temp_scan, odata, n * sizeof(int), cudaMemcpyHostToDevice); + + for (int i = d-1; i >= 0; i--) { + downSweep<<>>(n, i, scan_result, temp_scan); + temp_scan = scan_result; + } + */ + cudaMemcpy(odata, scan_result, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(scan_result); + cudaFree(temp_scan); + } /** diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..c6e3a3b 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,3 +1,5 @@ +#define GLM_FORCE_CUDA +#include #include #include #include "common.h" @@ -6,14 +8,62 @@ namespace StreamCompaction { namespace Naive { +#define blockSize 128 +int *scan_result; +int *temp_scan; +int *shifted_result; + // TODO: __global__ +__global__ void prefixSum(int n, int d, int *o_data, int *i_data) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index < n) { + if (index >= (int)pow(2.0, d-1)) { + o_data[index] = i_data[index - (int)pow(2.0, d-1)] + i_data[index]; + } + } +} + + + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + + int d = ilog2ceil(n); + + cudaMalloc((void**)&scan_result, n * sizeof(int)); + cudaMalloc((void**)&temp_scan, n * sizeof(int)); + cudaMalloc((void**)&shifted_result, n * sizeof(int)); + + cudaMemcpy(temp_scan, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(scan_result, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + for (int i = 1; i <= d; i++) { + prefixSum<<>>(n, i, scan_result, temp_scan); + temp_scan = scan_result; + } + + cudaMemcpy(odata, scan_result, n * sizeof(int), cudaMemcpyDeviceToHost); + + //shift right + for(int i = n-1; i >= 0; i--) { + odata[i] = odata[i-1]; + } + odata[0] = 0; + + + cleanUp(); + +} + +void cleanUp() { + cudaFree(scan_result); + cudaFree(temp_scan); } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 21152d6..1e28232 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -3,5 +3,6 @@ namespace StreamCompaction { namespace Naive { void scan(int n, int *odata, const int *idata); + void cleanUp(); } } From f05b730a78c2ab0796520c5645ef368f01e775c0 Mon Sep 17 00:00:00 2001 From: Sally Kong Date: Sun, 13 Sep 2015 20:15:44 -0400 Subject: [PATCH 02/10] work efficient power of two working --- src/main.cpp | 2 +- stream_compaction/efficient.cu | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index b145c60..838ab42 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,7 @@ #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 1 << 3; + const int SIZE = 1 << 4; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2dbc120..87d3c09 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -20,11 +20,13 @@ __global__ void upSweep(int n, int d, int *o_data, int *i_data) { } __global__ void downSweep(int n, int d, int *o_data, int *i_data) { - int index = (blockIdx.x * blockDim.x) + threadIdx.x; + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + int temp = 0; if (index <= n) { if (index % (int)pow(2.0, d+1) == 0) { - o_data[index - 1 - (int)pow(2.0, d)] = o_data[index-1]; - o_data[index-1] = i_data[index - 1 - (int)pow(2.0, d)] + i_data[index - 1]; + temp = i_data[index - 1 - (int)pow(2.0, d)]; + o_data[index - 1 - (int)pow(2.0, d)] = i_data[index-1]; + o_data[index-1] = temp + i_data[index - 1]; } } @@ -46,11 +48,10 @@ void scan(int n, int *odata, const int *idata) { temp_scan = scan_result; } - /* + cudaMemcpy(odata, scan_result, n * sizeof(int), cudaMemcpyDeviceToHost); odata[n-1] = 0; - cudaMemcpy(scan_result, odata, n * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(temp_scan, odata, n * sizeof(int), cudaMemcpyHostToDevice); @@ -58,7 +59,7 @@ void scan(int n, int *odata, const int *idata) { downSweep<<>>(n, i, scan_result, temp_scan); temp_scan = scan_result; } - */ + cudaMemcpy(odata, scan_result, n * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(scan_result); cudaFree(temp_scan); From 98f638467ed44526c60070009036c697cfbf1508 Mon Sep 17 00:00:00 2001 From: Sally Kong Date: Sun, 13 Sep 2015 20:18:54 -0400 Subject: [PATCH 03/10] work efficient non power of two working --- stream_compaction/efficient.cu | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 87d3c09..027a51a 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -34,33 +34,34 @@ __global__ void downSweep(int n, int d, int *o_data, int *i_data) { void scan(int n, int *odata, const int *idata) { int d = ilog2ceil(n); + int total = (int) pow(2.0, d); - cudaMalloc((void**)&scan_result, n * sizeof(int)); - cudaMalloc((void**)&temp_scan, n * sizeof(int)); + cudaMalloc((void**)&scan_result, total * sizeof(int)); + cudaMalloc((void**)&temp_scan, total * sizeof(int)); - cudaMemcpy(temp_scan, idata, n * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(scan_result, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(temp_scan, idata, total * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(scan_result, idata, total * sizeof(int), cudaMemcpyHostToDevice); - dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + dim3 fullBlocksPerGrid((total + blockSize - 1) / blockSize); for (int i = 0; i < d; i++) { - upSweep<<>>(n, i, scan_result, temp_scan); + upSweep<<>>(total, i, scan_result, temp_scan); temp_scan = scan_result; } - cudaMemcpy(odata, scan_result, n * sizeof(int), cudaMemcpyDeviceToHost); - odata[n-1] = 0; + cudaMemcpy(odata, scan_result, total * sizeof(int), cudaMemcpyDeviceToHost); + odata[total-1] = 0; - cudaMemcpy(scan_result, odata, n * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(temp_scan, odata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(scan_result, odata, total * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(temp_scan, odata, total * sizeof(int), cudaMemcpyHostToDevice); for (int i = d-1; i >= 0; i--) { - downSweep<<>>(n, i, scan_result, temp_scan); + downSweep<<>>(total, i, scan_result, temp_scan); temp_scan = scan_result; } - cudaMemcpy(odata, scan_result, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(odata, scan_result, total * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(scan_result); cudaFree(temp_scan); From 3ad7fa65b70ed6374c9ef1f33b68702c283db5dd Mon Sep 17 00:00:00 2001 From: Sally Kong Date: Sun, 13 Sep 2015 20:27:07 -0400 Subject: [PATCH 04/10] jk only works until size 2^6 --- src/main.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 838ab42..067fa2b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,7 @@ #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 1 << 4; + const int SIZE = 1 << 6; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; @@ -56,13 +56,13 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); - printArray(SIZE, c, false); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); - printArray(NPOT, c, false); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); From 01f2d506b37c6076ff1d1767b0b602c1126c32a9 Mon Sep 17 00:00:00 2001 From: Sally Kong Date: Sun, 13 Sep 2015 21:57:15 -0400 Subject: [PATCH 05/10] kinda working efficient compact for power of two --- stream_compaction/efficient.cu | 42 ++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 027a51a..4bcdfd6 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -62,9 +62,19 @@ void scan(int n, int *odata, const int *idata) { } cudaMemcpy(odata, scan_result, total * sizeof(int), cudaMemcpyDeviceToHost); - cudaFree(scan_result); - cudaFree(temp_scan); + printf("odata[n-1] %d \n", odata[total-1]); +} + +__global__ void checkNonZero(int n, int *o_data, const int *i_data) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index <= n) { + if (i_data[index] != 0) { + o_data[index] = 1; + } else { + o_data[index] = 0; + } + } } /** @@ -77,8 +87,32 @@ void scan(int n, int *odata, const int *idata) { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - // TODO - return -1; + + int d = ilog2ceil(n); + int total = (int) pow(2.0, d); + + int *predicate_array; + int *hst_predicate_array; + int *dev_idata; + int *compact_array; + + cudaMalloc((void**)&predicate_array, total * sizeof(int)); + cudaMalloc((void**)&hst_predicate_array, total * sizeof(int)); + cudaMalloc((void**)&dev_idata, total * sizeof(int)); + cudaMalloc((void**)&compact_array, total * sizeof(int)); + + cudaMemcpy(dev_idata, idata, total * sizeof(int), cudaMemcpyHostToDevice); + + dim3 fullBlocksPerGrid((total + blockSize - 1) / blockSize); + + checkNonZero<<>>(total, predicate_array, dev_idata); + cudaMemcpy(hst_predicate_array, predicate_array, total * sizeof(int), cudaMemcpyDeviceToHost); + + scan(total, odata, hst_predicate_array); + int totalAfterCompaction = odata[n-1]; + cudaMemcpy(odata, predicate_array, total * sizeof(int), cudaMemcpyDeviceToHost); + + return totalAfterCompaction; } } From 92fa49377c48ac5154482d26b7eb3cc91e4434c8 Mon Sep 17 00:00:00 2001 From: Sally Kong Date: Sun, 13 Sep 2015 22:01:55 -0400 Subject: [PATCH 06/10] non power of two also kinda passing --- stream_compaction/efficient.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 4bcdfd6..6c45fe3 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -109,7 +109,7 @@ int compact(int n, int *odata, const int *idata) { cudaMemcpy(hst_predicate_array, predicate_array, total * sizeof(int), cudaMemcpyDeviceToHost); scan(total, odata, hst_predicate_array); - int totalAfterCompaction = odata[n-1]; + int totalAfterCompaction = odata[total-1]; cudaMemcpy(odata, predicate_array, total * sizeof(int), cudaMemcpyDeviceToHost); return totalAfterCompaction; From 9650bdaee5d9727b3fc591cfb1788d90d88d05de Mon Sep 17 00:00:00 2001 From: Sally Kong Date: Sun, 13 Sep 2015 22:38:02 -0400 Subject: [PATCH 07/10] Part 4 done. yaassss --- src/main.cpp | 4 ++-- stream_compaction/common.cu | 16 ++++++++++++++-- stream_compaction/efficient.cu | 15 ++------------- stream_compaction/thrust.cu | 8 +++++++- 4 files changed, 25 insertions(+), 18 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 067fa2b..cc4ff1b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -68,13 +68,13 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..1012fcb 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,14 @@ namespace Common { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + if (idata[index] != 0) { + bools[index] = 1; + } else { + bools[index] = 0; + } + } } /** @@ -32,7 +39,12 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + if (bools[index] ==1) { + odata[indices[index]] = idata[index]; + } + } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 6c45fe3..4c82d64 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -6,7 +6,7 @@ namespace StreamCompaction { namespace Efficient { -#define blockSize 128 +#define blockSize 1024 int *temp_scan; int *scan_result; @@ -66,17 +66,6 @@ void scan(int n, int *odata, const int *idata) { } -__global__ void checkNonZero(int n, int *o_data, const int *i_data) { - int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index <= n) { - if (i_data[index] != 0) { - o_data[index] = 1; - } else { - o_data[index] = 0; - } - } -} - /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -105,7 +94,7 @@ int compact(int n, int *odata, const int *idata) { dim3 fullBlocksPerGrid((total + blockSize - 1) / blockSize); - checkNonZero<<>>(total, predicate_array, dev_idata); + Common::kernMapToBoolean<<>>(total, predicate_array, dev_idata); cudaMemcpy(hst_predicate_array, predicate_array, total * sizeof(int), cudaMemcpyDeviceToHost); scan(total, odata, hst_predicate_array); diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..fe6591d 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -15,7 +15,13 @@ namespace Thrust { void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::host_vector hst_in(idata, idata + n); + thrust::device_vector dv_in(hst_in); + thrust::device_vector dv_out(n); + + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } From 7fa3c301100f86edb54ff504f2bbd9a2abb9b0cc Mon Sep 17 00:00:00 2001 From: Sally Kong Date: Sun, 13 Sep 2015 23:15:18 -0400 Subject: [PATCH 08/10] Update README.md --- README.md | 181 +++--------------------------------------------------- 1 file changed, 7 insertions(+), 174 deletions(-) diff --git a/README.md b/README.md index a82ea0f..c5f23be 100644 --- a/README.md +++ b/README.md @@ -3,38 +3,16 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Sally Kong +* Tested on: Windows 8, i7-5500U CPU @ 2.40GHz 2.40 GHz, GEForce 920M (Personal) -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +**Summary:** This project is an implementation of a GPU stream compaction in CUDA, +from scratch. This is a widely used algorithm that I later plan to use to accelerate my path tracer. -Instructions (delete me) -======================== - -This is due Sunday, September 13 at midnight. - -**Summary:** In this project, you'll implement GPU stream compaction in CUDA, -from scratch. This algorithm is widely used, and will be important for -accelerating your path tracer project. - -Your stream compaction implementations in this project will simply remove `0`s -from an array of `int`s. In the path tracer, you will remove terminated paths -from an array of rays. - -In addition to being useful for your path tracer, this project is meant to -reorient your algorithmic thinking to the way of the GPU. On GPUs, many -algorithms can benefit from massive parallelism and, in particular, data -parallelism: executing the same code many times simultaneously with different -data. - -You'll implement a few different versions of the *Scan* (*Prefix Sum*) -algorithm. First, you'll implement a CPU version of the algorithm to reinforce -your understanding. Then, you'll write a few GPU implementations: "naive" and -"work-efficient." Finally, you'll use some of these to implement GPU stream -compaction. +A few different versions of the *Scan* (*Prefix Sum*) +algorithm were implemented: a CPU version, and a few GPU implementations: "naive" and +"work-efficient." **Algorithm overview & details:** There are two primary references for details on the implementation of scan and stream compaction. @@ -43,139 +21,8 @@ on the implementation of scan and stream compaction. for Scan, Stream Compaction, and Work-Efficient Parallel Scan. * GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html). -Your GPU stream compaction implementation will live inside of the -`stream_compaction` subproject. This way, you will be able to easily copy it -over for use in your GPU path tracer. - - -## Part 0: The Usual - -This project (and all other CUDA projects in this course) requires an NVIDIA -graphics card with CUDA capability. Any card with Compute Capability 2.0 -(`sm_20`) or greater will work. Check your GPU on this -[compatibility table](https://developer.nvidia.com/cuda-gpus). -If you do not have a personal machine with these specs, you may use those -computers in the Moore 100B/C which have supported GPUs. - -**HOWEVER**: If you need to use the lab computer for your development, you will -not presently be able to do GPU performance profiling. This will be very -important for debugging performance bottlenecks in your program. - -### Useful existing code - -* `stream_compaction/common.h` - * `checkCUDAError` macro: checks for CUDA errors and exits if there were any. - * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer. -* `main.cpp` - * Some testing code for your implementations. - - -## Part 1: CPU Scan & Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/cpu.cu`, implement: - -* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum. -* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using - the `scan` function. -* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan` - function. Map the input array to an array of 0s and 1s, scan it, and use - scatter to produce the output. You will need a **CPU** scatter implementation - for this (see slides or GPU Gems chapter for an explanation). - -These implementations should only be a few lines long. - - -## Part 2: Naive GPU Scan Algorithm - -In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan` - -This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. We haven't yet -taught shared memory, and you **shouldn't use it yet**. Example 39-1 uses -shared memory, but is limited to operating on very small arrays! Instead, write -this using global memory only. As a result of this, you will have to do -`ilog2ceil(n)` separate kernel invocations. - -Beware of errors in Example 39-1 in the book; both the pseudocode and the CUDA -code in the online version of Chapter 39 are known to have a few small errors -(in superscripting, missing braces, bad indentation, etc.) - -Since the parallel scan algorithm operates on a binary tree structure, it works -best with arrays with power-of-two length. Make sure your implementation works -on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory -- your intermediate array sizes will need to be rounded to the next power of -two. - - -## Part 3: Work-Efficient GPU Scan & Stream Compaction - -### 3.1. Scan - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::scan` - -All of the text in Part 2 applies. - -* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2. -* Beware of errors in Example 39-2. -* Test non-power-of-two sized arrays. - -### 3.2. Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::compact` - -For compaction, you will also need to implement the scatter algorithm presented -in the slides and the GPU Gems chapter. - -In `stream_compaction/common.cu`, implement these for use in `compact`: - -* `StreamCompaction::Common::kernMapToBoolean` -* `StreamCompaction::Common::kernScatter` - - -## Part 4: Using Thrust's Implementation - -In `stream_compaction/thrust.cu`, implement: - -* `StreamCompaction::Thrust::scan` - -This should be a very short function which wraps a call to the Thrust library -function `thrust::exclusive_scan(first, last, result)`. - -To measure timing, be sure to exclude memory operations by passing -`exclusive_scan` a `thrust::device_vector` (which is already allocated on the -GPU). You can create a `thrust::device_vector` by creating a -`thrust::host_vector` from the given pointer, then casting it. - - -## Part 5: Radix Sort (Extra Credit) (+10) - -Add an additional module to the `stream_compaction` subproject. Implement radix -sort using one of your scan implementations. Add tests to check its correctness. - - ## Write-up -1. Update all of the TODOs at the top of this README. -2. Add a description of this project including a list of its features. -3. Add your performance analysis (see below). - -All extra credit features must be documented in your README, explaining its -value (with performance comparison, if applicable!) and showing an example how -it works. For radix sort, show how it is called and an example of its output. - -Always profile with Release mode builds and run without debugging. - -### Questions - -* Roughly optimize the block sizes of each of your implementations for minimal - run time on your GPU. - * (You shouldn't compare unoptimized implementations to each other!) - * Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis). @@ -197,17 +44,3 @@ Always profile with Release mode builds and run without debugging. These questions should help guide you in performance analysis on future assignments, as well. -## Submit - -If you have modified any of the `CMakeLists.txt` files at all (aside from the -list of `SOURCE_FILES`), you must test that your project can build in Moore -100B/C. Beware of any build issues discussed on the Google Group. - -1. Open a GitHub pull request so that we can see that you have finished. - The title should be "Submission: YOUR NAME". -2. Send an email to the TA (gmail: kainino1+cis565@) with: - * **Subject**: in the form of `[CIS565] Project 2: PENNKEY` - * Direct link to your pull request on GitHub - * In the form of a grade (0-100+) with comments, evaluate your own - performance on the project. - * Feedback on the project itself, if any. From bdfdeea145c7801c8a8283644a36e60e1cb849dd Mon Sep 17 00:00:00 2001 From: Sally Kong Date: Mon, 14 Sep 2015 00:08:05 -0400 Subject: [PATCH 09/10] added graph --- imgs/graph.png | Bin 0 -> 23807 bytes src/main.cpp | 2 +- stream_compaction/efficient.cu | 11 +++++++++++ stream_compaction/naive.cu | 10 ++++++++++ stream_compaction/thrust.cu | 12 ++++++++++++ 5 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 imgs/graph.png diff --git a/imgs/graph.png b/imgs/graph.png new file mode 100644 index 0000000000000000000000000000000000000000..b095e470dab03a06029cb84c4d5a33f05efa211b GIT binary patch literal 23807 zcmeFZbyQW|+cpZgQ9^OksGyrJ0SQGwdeaEfY)X)lmPU}2?nWBvMm8m_NQ1O=C?Fl8 zl;oKUAD{30z2AHOJ7sGkh+F<6AuFetxZ5y8NMeou)BBfN!q@goAm z`#qxStHS zuG^2VqP%$_>~)<^+p|{vkA8)gJZ+wpjbkx>edT&}!c&il_`9fQFMlmn32ZkEp9-+O z4rPeGQI?YCkbl&*YCC)z>>=;xv|xi&lge@HHQp=ID6^ibwvywG@xF9rGAlyJ{CIz} z0-yc;htU%4gwnnl=Q*n^$uP7{^XX_G-!i3~fG`E+47!3c$s_C_1%wRzQ30nK#(9!5TZAQM67o zj$;!izkiijPN1vxKVq3C-Vksq7Cbv#U@)M0Yv0YBr}#)SMw=i5R`Btpm-26x8| zBPN8;PZrv<>J){3XMIcSNe|T7h*FQDEBp znEUx)>i0)f6j9r=jgR9Wpz694IJPw@%V5g?{Tp@M@vP+Te>5DF6d`of*?6c%a7V}z z+_;&PRlS$rY2$fbs#6j%61~*>hVv7c(C=%~4?9~NGJQHLx;wk+%BjEYebIP2F_a^x z>q-9fRr#&{jfslKEwbvxJ?!{~cWPpe@{V&mpd07Y^p7#u13@n(`Fy-=ZAkqPhg3-Q8%C^z}hs8_## z)`~9nr_Tl9DhJR6a71{es~{m{p{hWiD%R1=cMLMnYKc&j1;Q%1YHSY460d9bL*?B zuw9U3RnG3@j1&~j=99s3m9S4(8EP=b{u%81y7;+Xl*rm>NwQk04&N79r-t&LpNpuG zI)~Ny`EZ^)xKu$|tHXH$gdEgVCJ)I$X$!#%8DurGoN4MVTfoWgL2ukKq zT3#CN{`-5vstK#oK-p8_{>*~?xI5Iq<2r(z7%bq{OWf$oUJg8=vt#SV6+%R8~3D0$c-}1&EsZP67Ry& z*W9+6`-RV^+jP6WFzqbi5~DO9McE&3*3_SI->S!|Y^+1OGLsdcDohkkHYTfj!k%Jo z-K?LEAu{J^+>NfXnVTJH9z}|=1rZ%Ok8Es-1jH?TC(SZFgeoN zKhY3UJBJrvrI(9;XW=-i9YJIVmOHM=nF^uL`q4v4qpzuqX_@4 zWzBW7zEwdBKZH+@YSha{ z-|niS-O1E9eTpBuZI6~7DSJBSo=x5~w9!aGT|0mK)elj({Rz_&jWRv;3X{jn3@QVa zld)~mqfEF6)bn66wzX%Ws;@eVL(4SuUbflbN8QyU($kZXk+D_hY+HdnqaNb!xK-aL zAiJ(dEbv)ki>H=a(E6sQi4RlWQPi;_&0~ojIet@_X>Q^QrKZgrLeNg31godpHv6s3 zlN%=8ny#BwW6A;ve&U>b=qE0sUz)b^LM53K5{iXvntq$CjiFtoDVX{gvob5(95UV( z!}V_PJLK=J4tM-q>{`&#=w@*{{1z}XMx;HQ``CVdrJ`TR-j-L-KhftCDf}*LNNZHW zjXd%O9TPn1czXGkPjeJBd#kmrbjtqM@1(8OX}y2(w#xiVH?|Hd#dO8A3t8Dhn4pzz zxr(-4%4nR1-l|Ug{1PgYWI0P@M}LntjmTs6rAkLnH~uMmjueDLqaI_3l1!`<53Cq{ zkE)j!LyC4XYBe~KSY%g(oG_zX^GR4#nLfFYavZa59&~3af32Jk8F@+<-`k!xsM#jE z!=GfDL7#V|G##_=U2IKq0d0RoVEA+jD~$O3MaljkwmfsOO}zOBHDh3%5sJWncLZtu z172}j?_t7}u5z%ey#D8s55ibDH0KYM@c#m73I?!czqJ{g|Gnf;3PW`94x_o~Kd=}6 z2?TJ?hWGN<{zBq&97N#B0+Q`7+=l3Vgo3}9e7%DSi}Fm#$NmSAi(DlZfw%pb z=eqy*k`sVVent>i{R_$s04S%>)5*R3&m~@-;HhV}nPh+8j8h)W=ckaKk-wMxx07+A zGaVaCk_{} zCdO*A(k%YX8y9(e0Kx2h%(kX$)!*rhCJVY%H~whB2Wf&*yTT-FZu5}yZs9)#4kA(< zOpk|GZ0GN-`t6Mg({#u9bV0X$&3dQJ6u*@gd=7c5Z(l#Y9<8?0ULVW~xNkPp>a;m& z3}Hs$v%^miU`ad3qZ0ND=n?oF(pE}AMZPs za+ht))Q{D+29tAGYu8v;d7ht+wGjyJji@+5aBfC{pp@hBzf0f>G>bm@ru}K^Z^f|J zXXlKJq+k#aY(7^}zBCfX;oj19TZDW*rfZ_G*hhSr>0A zvIx&tD^g#)>K_F|f@K+ii1y0|P*ef-vPG)2{Nw8@$=gc@cadNY%fqPz>gtktZJb0$ ziKGoX!uN_&z0@PS|14{7V!Dj%w*UkV-Q;<$pP*wWW6l~tK*i-3qT{~zMZNr4o4j^{ zw@5jfUN-h!4*)+fDZ(2O)PX0O0RUr8fCOvR9m~xAMX%tOU83O8N|t(!^~?xaW%=&Q z3X{*pU^+T*GMCdmX8|&w*AVcy8pQI?gDr{2Yi!bq#e&1!LDDI4SRE>T>|XkWkRJO> zfy(lGpOqwwQrn%%OP8;9?z^;1fDL~A2+YVkfWHL}G(qgL>Wrk`hwxv+%U^dr&rTGm zSf@77J?!?0rrajIFU3rF^hPz&wp5QRYFMfW8qWQL76E%+LW-0jZT7GO*Rktt``3D$?ZbDd=677<8eKuU4xB0jaUc~9z!$8esk`tII%(c~kYb`=$}NE5Rs={wOD=54Ng`uINK^H+=xB=!QHXRoxYEfY@<7b2X#$7xK0!fs*S zl+>9uL}@TnVskM@(P>YSf0Zuzo`G1dhJl}*3L;QUCb{XK0Aq&p=-|kP+FcgBgPMR7G}I#h>*d4jQ|X9fJZA@iA(iolOluT;^?0Ph_=60R5l6r z#{7IV*Bb{7V=}7|N$hR5tcmCNeICPOp4-)%E>cK5^Gxmy66)6biln4UGrf>eYQsSgJ zMOh%kzqj^<^^0+C|07lvF)<&gNN*cD`|;^IyO)+Vddj-~bL}vR7n^sEW~MX~PNuXb zcKbg>5Y8n`jvjg2N9Yk3i9~&(yF^I4#8sQ^l((Q3nP97#eip;1=p_0DlsuK7PRhBh zSDa0Q^0Tg6WpEw|)iB~zR<9-sQ8luSzAw8eAtnl!Xkoc``A-BD@g+s6QSZF%WO}%&NJk1&qAR&S?wo3u^!K(BAg@R*hDV6T(4hX24V4b}ll~TV(8;GW9JpeFu09X12<#Zk2u*ccN zVHMSEYV_*C@mN#(s&jzzHu}=7X1s1nA}xn^(T&k}oxf#@nx#up!{xDvkRaPu9L&1) z+b#E5t&f#C&w#>h;Des?hh7A~@gsf)dP+AoB1Y$&G2fv-pVDAcen>Y-dB_eXaIPSpncr>w5-V;cm1 z%hd1(SVZ)Vu6t;xECs|wDnfbUy=@i*5QR>6RagNLRG?R?Q)}mtl7=YMsw&)D8Kjk( zBFN^1DIqHJ@x5^v-yO`f#dDdy^3S@On?qNx`7@rylre{ps>v!%XXjB*YGf^Gu1MJB z4k|J4q`eA(R)>B$5$jt+pZ3d6Ke#gv#y&($_zY@$otEqVhS`mnx`lv@-MVSL*!6HyTeU^JAhFjg5@dAI?*(0Tk2VR^x#lW748l z{bID-Mp^S2TI%-8R#i|h^Qq@4P%*wyHR*k0ppq+JZrl^k$GHpMZXED#mL1_#3MbMy zRc6EOxcN!Pcoq(Jz(_~tk;Ws``S&8*9Bd>_w?8xHNkdot;89$vEk*)Q|y zvBe9Qe)a>)9R>H2=k=wnTJfnk+D7D-f2)*8Eas>|?_+B*3jTg{=(oryUo4xu%o$T;a1rnkS9 z4luE?ZV*PkA&?l<>n)~Kgfsg;|5B(1vKc3%E1h1UYQcP~B3Pgse&l-Zz>p|4;A|*# z>;BfvIuaz1km-8>z8C^jRQ0P*fFm^$4?CjLU7s57isO;52m$ecoTg|Kw)sd6mm%5G zi*+7yCd}GFCxx47H=ky|&m?M$mL9%NLboR_iGYhPqP%0h8ANvh;V^|wt`nJqING1G zDM1DisiTc#$cs+_GgcexiHe{@kNDqJ{KkTl`64RsDb6As6Z?S0Njhg^#Mxza2|kwc z+~goL!SXo&tiYOQXxH9{#R~TN!;VkkuvQ>lAmEDP&8dIFqT27f{Lm*yo_4dAy+^D8 z6kGcy1F)@ih&49t>i6$yYP)W`ZE!-Y$&51*^7pUhn%PFT`Xj#7uBGeTuqff!{7c?A z4DYJuKId+5)f#gJQw4v0+tD_#D0ytL?=r<_=9|o6c4S+=p~NeLn|bBVs{!LgW}L|$ zE){qcy5p6u2@&$zO`aFeq&Sgy(eaK_5VR_}EJrcb!){D}_|?ic;QT@(?3+j;+&c0|!*)799gKlm^k z_#4mo<3us2!W9#plcWCx8Rbg;l=v8vb%(c$HGUWp@bdmAR%p(vT=tPV4>NW zq!W0K$2EEg4ntjXQ(L5E^4SwlTkwI2hx7xi%4MefBK*L}Qlk%VpoO1}SDqvR6Am zjaU^(2P)D|V}mE~&Np6tl*$dbX8Su8;^6Z1DwEuA^j-)&4o}wruQ`~*%_@bQCAegB zRGX><7@j#_qCAWU4DV~^JvtmCq{u;BSC}$dBbwA|?3%Q#v=`6mLmh`)N-i+U$hWz) zI4KXkV<#4@j`K0WBmwxK~m00Ix+iQmub9A-p1K2mqt zZ2yclBsmE@<4hvq)8m7h^Rt8It)FT$)J}l-uLszETA~<4VcEvYz0bUJ4p^(7 zt_xSvNCY1WrU52$YP{U=laoQw+@<^6|74eY@Npq@b53vK2kKA*{X5rmqjT?ED-p1b8P&HFUnglmSYxT<%MK@Zzh8cIES^MJyzk;h=BCqUb{4*Po<2dY=xZYX<@n8aHW&G)M2wG}<6^s{J_z7#VJUB|dn5ajp!1hzFn=_tm%jNvK@$HoXLgyYa-HjX=N!OiyH zX5SWYkIKCxVevcVoIT(2b74=^)jJ7lQl)61{!B`c>l#5R48%FD*O#wtly}^&@b!h& zub0+u0EGq(a1{S~l#Ufd9BwuOE^oBLRMGTtbk$0%ib6SK+C!F~Mz*d$K3*vSA6Y?@ zO|M}bHwP-ofo8P1)`^`b)I#MDhUgyMnm53KMR+M>k9LiKz51xVe3r}XND19yaoNu# zqw2EBc!X~_EW$VHEdE`Tu@u>z+FB#h$tL%DVksgdnh65e3IdxvV`;6bKxj$1UtlxW zV`T%NF2m;oE-~Wqx$j_G+iPZK=Fxm5+U52_y~a9XN%W-j?lj>0 z!$0D4H0VRR2T;L;QvIZT)~cthd!E3%#CAbll!P8Bt>5Ao05rto`DFl7_5s?=zN{su zL(237QQSa?G-y*?hD$4oG=Eut;D+7BKpfO{-p>pB|LB?95l&%n4oKkcMA<6$YRD`D!!(agY+Jd^zb^)GIb*6UqdX z>o07P5s!Wb0@%bk|Bxn-##)BrNdxhdjgCGUTIq_2gD`mZ@0i>>bnldAi+$2!qN6PV z9#$_@W#+wwOOK7(1|$>8(Lo_9SjPwmB(IspA4@Eg3I6KkR?mMzO|Gh2pq!(cFu_{Z zFyBZ0b7;#Ez6h$*)OC4ViND)|ehasgg>F4kPs=!(T+2J@+g?G(sZZ-Y3=kJxY{hx* z*G|S-<&I})f`Uj>Q(_6u4XeJkYF$s>kNzh{KQqyjr?UfF=}xW1r%o|B5T>wJ`&O+;00D9@xyLOcrJCH*TL&;Hf=}_1a=piL~g_ zlTokKt)WPz9}a#1;=j~XFz4(BG+I%`gPIj-`y70h$3OPQp4Kcb8YTGW6Z3Mj@Q=$L z)E?s9#w~=PF^JC_TYhgvME(NI_u>>i9EX%l@IWT$h9h?O-XK6;`i<#4lCQK6p!)=% zqu1|>p^qvp#`;i!>F6zpgEcjTT&T!PY>p%T7!sZK87A; zIlp0d`&|4A@cH{6uiViDu(;wXH{a|#hl50gYWAm1uQ0N%7qbBNPtVTYadoK8e5`a+ z;j3y?j+BqIb9Hv3n=w#2`_vdX9aeJ^_%Ro2TW8%5be+jVZOlN$S5NwodrVuuM`rUm z?K^iPPf$R4i&JD-eVcb_0EupFJ_3}4RCO*u>z)8S)+T{bu}WUJTQjqyx}-PerOQTm z#z(uWeK;A%gAK2#F45;7yqnE2O|fmaJ;tk?soWz(df?E^%xeQ0k({z5vu&`=5kOx# zQkmK1x~toOMXkamW*Ao#J~eWleVI4K!~fA9)Hr>gbB9H$-UUmzg4!nao%v9<%m)i* zITCtT+{LqB&a7NNRs==skHafGd(uc7V+#z|qT(KG;r>sQ@HK9sMesL7EyM zv3=QVRli4;_<%OX3*8;78$Zy{NH4_0z;&f7AMt_`0w?YJ18S7+N4w=f&E~hwI_}+- zt0NaELUrGw8A}LW>rJ}fGy8Ixid7{y)GdP9O=}zJ7nHxdJvRrbw=t&3G3S*8PeD*W zZ%=@AxVa@VTv5lfzfsZe)Fmm)uRK3w-!Hg48ObTOt$Psz>-$LpifoPJzPr<~zDE3f zoh0wC+zJwxZAb>mtJ>b%?z@AAK3Sx+*>dZuK-b=2{}_Y+IQeT3W7<2Q&kn@8JB{f& z=Zr=@iPn!!4kHqVEjkUSIJ5!ggKhn2zjAPRt)jcYFItfyZ6XP2aMY`*O&fY~$!}g#Gu7T_q zavY6|4Rm;%H#nd9LNq*sVwz=7JX-}=9V4~VB%Vdb{d9%9slIWuMRlqVsHs0fl_W$W zl)neSpki22cqWJ)RcPc8*}*`8ExLF`t^a2h_NWhJYiK%JW371t6&Mp0ruCNGZ0~HM zM4}G|54bqiY_PMKd*tz10W(!pmIO;6;iC7zSwWho%7du2Q?ayjPX$CRn>fc?jUKfc z>Xox9fNw(+Kdj4=uNhmXTBcoRIO0>SW#xMrb`R~>?AY8|Oli{+1+LZ^{!fK^-j#hqY zMGh;A;`%pG?zPtFIpm4VphEQ80 zP<7Pn9L&c2?+T(Xlit-FNd2~q3fu)-*WEQhb{FD=oz@ES>YSfxiPAZzhT^3mqDi}Y z_6aWQ!j$gi4>7>Fs59_+oE2`0CJ74AS>rI-itI=PRNoeS+dRN00J`w#Zv5 zektkr1c-`UBvbi{_6@))acARBLP`HjM2IedNQ$wNIB7cnRw$3^huP3hJigaPZC} z!D+?EJmtzznYg|^@jX|kLFFmyWGO5A0?s%FdZUwwjw3L*HPk21m^0ch7c%ACd&!AR z>*+lZFb-lCl@^_3v2$kblBSWylJ)@wD3P%F6l?HPWnEMZiDGv3a9uCS`;zyxxIAs^ z<_)NvqjVf~*kI6;g&R027~WRmbyHhy6~#|JnW&o|h7v#bmO{X1V7qeO08reZmJU8-g(>lr-ZTtn!AhPS0VQ&G zG4o`pwrW`FM=^vdj7xS{Sn#*nbkAEYj3a9QKtT*|eWEAopZAq6O;d+{Cx3OjxAgff zEKbo6a>HXrhc*lh@jT`X-hNEx@xk7bw=iq59#oZzCb?I#Q7(kxV4*2&)s>KUC8n=p ziIVWjeMX8u76}+!J@Uc2zownYDP`l8`$y|Cf-2Xgx_gC8j2Uy*qypK>PHO_wM^xX} zyM9n(c#lKgiqrch{sxGQVy-r^l@GP80qwp9FbCWV@xvP+7HqMQI{1&*UVXQQpcs3M z8FC>*(LiPfF)|;6Rszga>5`ZNK3`6!(bz=r+;T+#^NuZguZl~HFm7A_-Sk~;anweN z8W2N`2K9O@mFpvX%sW`J6@c%>%U7U`w8UlO|8V6(5zd6}E=v|{ASr}7Gv8=Ggfp~3 z>sdfS4Wtoo2ker`%1wX6@Xkuw zd8*MRp45E2SN2*u;jq229rQoXqQC>lG~)RDV!gP!V^O)v9%ovmw{V!<`5{?5k{djO z?^BfeJVX}^O+d+Ihe3Z0aLgkRS8+mmC=DMqfHqDU5Nq9Oa>3F8;ckWh!W}E*r|3O!EnYg{EeL+0BULI1{hcq0P%LS zi>5`$Vab-wT~znBwMX(dvEYFc;H5Nm)Rssbu%@K;Yhw9`ANq2xWUoRJipX;uq%T5N zMy4EOt>iPISBLLNLC{waK6PM4p^ct14dBDD5E*Z%+XxC`Q;)*G7ir%2$}9LIaYacZ zihWismJd&_PP}sIeGa?Nc#Y`MhmxtttwD1LuJM@AT7QF<)|A#D3A-U zyaub6sSszSfjU(HndGj@cI&ktH1+h}@9;A3N|~AfW)ls0t$^>*l%wH?kM66TwzEf) zR5&ode@jo21|H}o5SB0+N%-b;?Xc_aiUB|(WnW5lryf|2>qB@QkO|spk6v0Xw1E-#rrT%oA^WXiR#lTglfCF*iNxU}J07r0RYl1|`j^(Lv(^`Nr3?OhFe352ox%kK< zXkbywo5{cn3el{0JPtnsRs6OaC?hukBMMc;ldrD}J>Od&yUvA|6d=+LD4}RaY4uF5 zF8mUa$NMyH*wErACJe+8rOSsz4oFYl*#s8MJpkXD4(IXfx9ZkY9x9XF#K!uD_^-Pq zoO*!q&uk=$wBpdT5y~7=K5`j=_-R1>g8~&$8h{$lB@`!F3={cvgZSU@YVlGo9h2ok z#R?R~BBTu*LaWuCPF7jyJ|aXJbH4wTX(Y>UFKv1~%H$3`e)~OYAf}}tyaNf5pzMDq zcuOi+nve0+qeo59f%k6e`&gQV$;r;i>>7cJ{RC^9R#I- zjk6dq$B+&y@ZzhNVg7ItzzoK~Oc6qs8_F$-v(m(ct(veAXCyJ@7!rlzymm3n&Oqpc ziw6))0%ZKKulG9}Ci39->0&E|fTN&+;p%X}i5?7Td9!NfNuKl<0S$Ii&&l#-pE$IG}&7RF6)OGX0ti{g< zGpk>G6Ps#yS)qOPS>U|2qQ7HH@D1gSNZE;kZOx`&wNL!F`2)bN!tfq|gGG=XQwVAU za{yf{Q$b~8y%bD+v{^H^QB>B1Hu;={0+f`vxUb%P(tnf+rKo!^p*h+?LJQI>;*Rhv zwi?INrfF0V6CCG0W-4#CeSmbu_4_Z6px_DjfR??3?B4w`9mA)dr&Y9eV9VDJ7lI&R z3Am?rev&hHB68Y+pb`E=kVqGv)h5zGLo~$kRf97tNY{+BMjk~7pS^<+-*7wLwX0F3yG(iw59O|SXAu6^&D$<}gJ+!QDijT{aKVipCvvsTsN_1hQFX3pJKS5? zZa%jLw8}f+a*GbW61*nd%PSq$!VbLW%5O!u%m!~RymbvwQfq?Z%hL_;+M#a{zfZMK}#sjM<^r)s2GsdB%J4FkFiQ@cj zfjq~Wx&}*-H|&EaWQ)>-)Ul@HEdfS5Bf?4GV6~s?5?fp+P*JfTYhakZzqrIS66eW2L&f zR<(U}fs1%C>vyw$0>4zo<8iL9THY+iV@s3&}6|k3f@O&Pj0*XmA9*1BsC5 zC%+=_wbbEN1eU#bpYz4Op$i1+>MDBH^(DZ>B!ZI4pe-Od>w1g(QUxwp2RK@JbI%4hqH;oK zBE?xG0^(`Z%k(zqy^N}q*MKz=*wypDlxXu-u^G1kOTMl(dtJz52}Ubtpm4Mify!UX zql$RoYd$61dz1E9yE_FJ5X|uVEEiD2{UV=k%~+7@yzpjDmUE7mF=D!G3i=SXA|+9% zmX`^MjOHZbBb8<QDtll1zZt#}_sBL?W<5#ZffA1}|UvYwfQY+pdy zC0$ac+`63*O zNWy4j;bvZS)eC?cyPxwl;Vx<@n3T1>TNI%UbK-FA02zJ zw+B54w6%P|rc1Z)kE_0V z5FijiWa1)y5z0E@1RSbg9fFAHr{DUgP05=Q0R;h1P6=s~)k!s-9^*gthZ3Kre`W+d zl*R`F1F3Uav5r)FD>Il7-+m0ll}*r7@~)_~t{a;RKrWpBC?B{NEYz{gV(R`3IvM)f zxFzWBrM^@P$iKs=rhzpj;CqgR{QRf z!%pZkj=M>o=>Cy4KhJgKUjeg^fn|GvK({W{FC|%Q0Ef7foGp~er{@1xChxe#Gfe`Edd+|?yyv&G@@;sdpPTZU@QqQcL=%O^gd5YhC6~v4u5D?AoIs^1uHRzwv zu6HV7)2$l;zONmK)p8pyv*HK`1RXxnto1_U*>iYFrRt_P< zbiXAy_IoKP{1I>t2oyNbo&ZA72!Kp9)F}nr5>e))fZR?MsMWIO4$20=ucIgFUO(}F zR$F^8t)O>ObWR@HFI7)Ek7zMtt4Y|;h4CXzaetO$y=;-`w?03~XJxaJIqX;RUi4L_ zEtV#%#-<-+?-T7-KK!W$_x$EFY!{Vl_o`~)tN~);= z^*i!AWlFtVJXQZuR`W1fEl@{#=}m}u&d3hC41j6o=nfBw3`?!4zCa&^fa;k3gjKgn|fN8HxnZ1VUDI<@P) z%A{+2gHIRX1D==AzI8@k=eQY_=Tt6PYLBK%%$_aMm@m*g+pFQeEw8ney*!rYwBob67DnIHOV0#rVGnc=tFkd0 z$_|1G)vZ=@Y!!0Wb`*;!>Nmx6)&COVK6i(l%aC5{2s+<^JwfaUe@jLkrP1#{;!WqB z^Hd%h`&H#+AQ7BR+FM)hvaDxz=TOUoYyG3Z*+=}PvAMycInh3)>>M0wxRdAzFkL}5 z+UXmRSI#!kWcCK~nJR#spbzT+E=hdTV(BH$4=6yK$oPi23aT>ly7AteaG0s)m(Y zswMhsfE=&m4&2oR*4pU_Bnghoe^j5Jk3RY>+&YgL3=!EO&~u7EAi~)Sq^WoGERi;w zvDCzGX)e`EiKuWr>;D;(Sl=UXAiK`QtMTnTDC0-J51|M`A6hJEycxS5wpEzYhl;P> z?`i)_eHvBq>mclWM@P)e?MK*o6m}r_%TbH3jrxMl+LGnc=*-_HUkoI=lh%VqrNMil zJh&_rExxrKx?nNIm!vUv$1P{Z1K6%KqHOrpfIj0tVzb)<-|QZ_UeD}rVOP}pbq=2m z&V#i5Qc4-YmjbUf>58Q%3^!}*8y?=_C=sksZ*<|{pv^M%`zxsC1fpc;{)4 z;&b&RJp|kqpiaUXx~nTLe~R7xW4PqpTzG`@X&WbpCc9`@zV3>JME3?>v`c;GXodOZ zo68GskHnB{T(7-F4nY$~OY`MW1O`@JnLOqfdN+F1pwrBd@Tv#*+5Gw3Y23sfc|oE= z>p9Fzn5*!}gzx2qhndupTa*=#cGRok$;OITT$97ICyfv93{BD+mT?8=(*Ulp@O$v~ z$K3xt+Be&KoHU-b5R0^)w=OaKT&9_luBI_|a+feEihI!4QE}3`w8q8yxA$q4t{xV9 zfh2o0Tmn*E-LwT(7lpZn&O78h4@%>3I`+sXQt$Sgrh5si)9n$3hge>!ZKAPCxo);O zF?XvH!&@E*wBt9UiBTySwW*SpKo;E5u^>tRTB~bOvxP`~tWW%`e=_fBAZdP$K;D%izbKVNrqI zF<7o(%PpJAmt6tKhlpf)(@f}ZG(Lc2rHS_z4g4_cOg=U0!E290JN2i@UrYhq*KX0_ z%?LX zy$iiOZ4>ii_y(Z+2=A6F|2~c-Fj^`v$$y1QR>Lmfoo^VptL4h7=6|}Lur6Or3<7Qk zS;uJe#GhjQZ*>60*9*Lc0c#NhVSP`afoz zP7^Q(0BXGeG_{`__I%~}Oo$%(Ux!RNLEEa3{^Z3(`GH|6tJ8$Hzx_`dgz|G*ff)4F zMCc%cK`je#(LAtUlAl5whd^wcR+YIiaOntb4Of` zl@b^I)H6&@SJFwi|4cP(v_5Lf#jmMEU{&0V)&|H;ULOt)> zO~-wp5fx>6K|Me;%^y34q1J#iktk%w>5`@Uwtx!sIQ(`6D(%7m{ZQKpCU05Cd`CG} zDYus^?&q&Dt=S>d)}E2XKO8LbA=ukw_;ewPfDkJNrqxk!XH~ffn9^L6d+K^`4JE%r zCvfLlNCcCVgAR2qz!j-r57uutSwsl#k8|!N-?uOU)VNJ+SZicRwfXAmc>e9)dkVI^hd$_= zODWN7oXnL^oLAnA^WjLC0M@o7vlk9pqS@)7-}||2#eq-pb4I_s zOB&}o^yFz?APmgZJ3DWc=RQuJjHDL7AV7f}3=yD@m#;BjbVJEmeI`Z*Q%Cname`{U zjy2A$2&H?c=f0;F^OYaaQK$c_9}13h8QeErm7HB?1^r3Y!D&E6QP5beP}Oz^X;%Qb zH`EUE!x;lfLfyf!4YkW8FG@o1w-LSEWKsd|;_4CZWBw<=VB)}DgI$SHc!hxS1$!AF z4~R_|MMUJGavsit`E37 z%xU|0?7Pd1 z|7pp+nla2#Vw4P}F+}zyF0Q>Mzf|bDS^2c zZL6#Qxro?~`al=UxwPD)N1T>yo~M0h@=w%B*_I@5}OtEX%aYFiJ%) z7srU@A--IN%d4L;uk#1oVbVcF7?asVqdt@n)7)q+brGk1%@jZ$v;@(@pj+G+|IYO* zP}_x0;~5ve(;k(gq-l0GmCZv#IfYnV6Hex)P2NsDl`xH?x3}Yv9kmD>bvP{{dhV=tzRLPBFSX`lc6WorTTCGf7ckUM0mr!k&{-8r$fX552*Uo2*G4f=d#x9cXlfi+20r4Z0JVDR zmQBMqWFOGa*)&RZIw}DrC9DZV1IShl+OnU6I=vh;lzj!6xI@K*D{CdID{E@}wyy>H_A+rsa42g>L;nGjY1{YyG`BZHC30OS?_fgId# z7g?jBG!J^8i<66Jq9p$ue$q+(1Dty#*{4q-BXV&A;B*3wP+Q1I##aEYxrNe8r@PH} zjKx(Oe7YQyq3jRIcPHkI8Kl`<7kC^jg$b~&eqQ#xDRcPp%AR>00Fs3j_bT zdX6k9xguSH7MRXd3wdEyiKhieEYRUTP|GxQW`;*!F_mzD=)cDx_y8D3f-!RG3c)yN zH7Eg+^(ujp{8TSEj|b@dGnMp_PFNJT)h-G^fQTN7LNONBVfYtB06!0C)G4Ot{@Yvn z6$XO00-Q49pK1>Q7XZ^65`%}PRfKGT29YtR}BBSsIm~;{{djJz48VDkqz~8S^ zeVcw6mnq|E$0G;enC^~eOa4XGY8xUOtJ(Cb7V0SjT7t>uR80*~H3Y(g7EeanG*NZ) z#fFa|8-5YkHGR&=5V$lb7+T|T_`ShVOSQDPHy>C&2d5oXcEvDi0}yR>_~YAFhMa64 z;Xn`4c@>yWKI<~P0wnx*orOvx=)@~}M#d@7{9*}Aw`d>+&q88~GyQdXQ=I2>YXjOd zmk~`z#m%qm<>7 z7LgZl+E~#Vp?*88yTT1%MNES8wpORLBE{igb6SVSxX;<1t>IVlFcML0}fP6RyOz=#k~)pdt_lgsMDuAQTLw3 z=v{JQpf%p%<4x>_x|Z-=Kr^FekR8S^Fo(}UHN(en_Bg>ff=Xc2wjZ1TL9i=E40^}F z6H^I+|Ek)m0@9Im9g40J&9m8&)xx4YlGURqAgS33C^x6YB2KupvirC$T^M$It_k;9 zvlP{@wskS*21W~F0dsH-m=nH)?1UF@Erm91qVJ(Q;`1s%Q0h)1ypIK6UDc~fxiRY&+xB*DM}z^*LiqX5q54c;DaiKF|00em{s#w+hWd;xeG{kTc3O zu(KsFb^Eog9lt50_Y^!OE?CSiKLjJtIq>Y!KT=_bQDW+6{_F-H(O%F37pip0Yb_vF zxNBSgScvFtLut?$2w?UlM{*6CnND^XI4W%<^A17E5qSUL?R4uq0A0@$#I%7r&I<&7 z-aBNMIQk*7hbi55=~HUNcrDv;4Qd(HN%?Z04oC|8U2vp|S~%$uvUE^Bcsbtin zk3pjXg$BOv$Y<2J{IB=gJcg3LMy2rPy0+$W_e-y%sNpD1l+WT9%ewm3z~g<603fL) zg>0vG(5^U#2A)0A{|@gr1NKAYL@POZDfO8ce$hxMbSU>X)e>I(JGdjC51+f;3D>uJ zd)W%%$Cs1c;9K&Hx{pkF5QK`BRnTZD4Px{|GOnYScN8 z#uPnh%asxsKRG>?BmQiCesUbE-o0T+6*mT6JEuoh%p+(s`bDR1mGZedPx3BBBI*V*64y*s2~Xr_Kdvlp>1QhxcnYOuCYd5behHMKuE@ z>YBL_h`_h9TrvftKyF-tga#c$awe{tV`2N2JeCS6+=!XZR-}RDIOJ$jxjo0f)2=AT8M}We>m{ zVP|V)?p#-g4ql|qNHAv()2j)V6yM+Ghmww`VDmCEGR6lRpc2?BY9krADZ_mqwwtD|&0auriNIJeCYtF_=>q+!p z=-sbi2@2jiNCc{@XGaEjV7j~uDrxg%pFI9Oo9hHnjCkjHNa@p4`Wd%-LVvAF1@_Kj zMsY^A-v*a!#}d^6!9&$v#TP``X2|Mx`dpw?K~8sGS3*7GanmoXlr?t?>anIK-u&Sk z^>4x!p@ev;n|v6I@RK+LETAv8)Og;b=>(4n8PuK*l|xFD1~pQ(7GVhTCu<3qod2oB zOa))D!s#e9n(Q-hR}p9k0gk4s=G;uh-E%Ii(#|!k z($jUW8~>4D$={h2WhPv&9f7jeD{C!ct&0PqJEX#M znZvi7)DqE%s3>F!3PBT0Kx*1ceS|p<(}e}`;aqA6Kg)taA%km~e<*5t!Y+PcHfUa6 z{vE+$;J2d9W3G)2FlvwL_}5lhCBM=rp6btiJ~|Rh?*=2xaAz@@yv!E-L=6o(0LeAc zs~_4b<}TthD--8;8!?`A3h}|?S-axL%N#9V3UM_2qprh7$IE7?(<&8)6Ov>1h3yY> z@U6IGRIj$>2rMNdxu-QQ*9iVWUj=cZ9cz@*%Ajts%Bw^5mHQ|d`FdYm%;W?v#~5ph z)woW-a*|CWHXh8u=G;>?+fyLOE9KuH+w)Ex_=~5w;S5LLEVr1N<)7shuMV*kHp<1K zfGkd#ER*~)MvbsSDc27#jMetw(ykQUn27;H=h*d=cvGLrvbyEAp>>$VcUHVo81i>L z4-%HaTpGKIzkXUR+%*JnBJFY8wD|%1QA-icSmA1VKZsCH%b?8&|5b6&;Ij`AG&Ky& zZSu-mPj`kv@bb%5O$2y{{kf8ZOPDa~Nn>u=?F`OMNK8wL7=X?4=e3uz974P~o$>i1 zs=R~mT5z;y`CoS~kp{^?Rxa=DmjP234VXZ7_<|gCg1btkk#%;)$qCq{vAPe@+;%RJ zA;Mj87WMzxCE}Em&;)wFW>9YD5-FHbF*0uSoxdy?u!hF(#8>`Y&cH@Y83%V|_S7PN j?C0Iq|K*=C`$?MjDXW)UL)`|jxzzWrUM`K!XVU)-WO}at literal 0 HcmV?d00001 diff --git a/src/main.cpp b/src/main.cpp index cc4ff1b..3d70dd8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,7 @@ #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 1 << 6; + const int SIZE = 1 << 8; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 4c82d64..68d21bd 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -43,7 +43,12 @@ void scan(int n, int *odata, const int *idata) { cudaMemcpy(scan_result, idata, total * sizeof(int), cudaMemcpyHostToDevice); dim3 fullBlocksPerGrid((total + blockSize - 1) / blockSize); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); for (int i = 0; i < d; i++) { upSweep<<>>(total, i, scan_result, temp_scan); temp_scan = scan_result; @@ -61,6 +66,12 @@ void scan(int n, int *odata, const int *idata) { temp_scan = scan_result; } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("%f milliseconds for efficient \n", milliseconds); + cudaMemcpy(odata, scan_result, total * sizeof(int), cudaMemcpyDeviceToHost); printf("odata[n-1] %d \n", odata[total-1]); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index c6e3a3b..535007f 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -42,11 +42,17 @@ void scan(int n, int *odata, const int *idata) { cudaMemcpy(scan_result, idata, n * sizeof(int), cudaMemcpyHostToDevice); dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); for (int i = 1; i <= d; i++) { prefixSum<<>>(n, i, scan_result, temp_scan); temp_scan = scan_result; } + cudaEventRecord(stop); cudaMemcpy(odata, scan_result, n * sizeof(int), cudaMemcpyDeviceToHost); @@ -56,6 +62,10 @@ void scan(int n, int *odata, const int *idata) { } odata[0] = 0; + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("%f milliseconds for naive \n", milliseconds); cleanUp(); diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index fe6591d..c23549d 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -16,11 +16,23 @@ void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + thrust::host_vector hst_in(idata, idata + n); thrust::device_vector dv_in(hst_in); thrust::device_vector dv_out(n); + cudaEventRecord(start); thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + cudaEventRecord(stop); + + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("%f milliseconds for thrust \n", milliseconds); + thrust::copy(dv_out.begin(), dv_out.end(), odata); } From 842a3b1644c8b6f3224d474b6bfc53deaf44eefc Mon Sep 17 00:00:00 2001 From: Sally Kong Date: Mon, 14 Sep 2015 00:10:13 -0400 Subject: [PATCH 10/10] Added graph --- README.md | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index c5f23be..87a6183 100644 --- a/README.md +++ b/README.md @@ -21,26 +21,7 @@ on the implementation of scan and stream compaction. for Scan, Stream Compaction, and Work-Efficient Parallel Scan. * GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html). -## Write-up - -* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and - Thrust) to the serial CPU version of Scan. Plot a graph of the comparison - (with array size on the independent axis). - * You should use CUDA events for timing. Be sure **not** to include any - explicit memory operations in your performance measurements, for - comparability. - * To guess at what might be happening inside the Thrust implementation, take - a look at the Nsight timeline for its execution. - -* Write a brief explanation of the phenomena you see here. - * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is - it different for each implementation? - -* Paste the output of the test program into a triple-backtick block in your - README. - * If you add your own tests (e.g. for radix sort or to test additional corner - cases), be sure to mention it explicitly. - -These questions should help guide you in performance analysis on future -assignments, as well. +## Performance Analysis + +![](imgs/graph.png)