From 666b00dac069114663b8578d6387e12fdbbbbb68 Mon Sep 17 00:00:00 2001 From: Kyle Bauer Date: Mon, 16 Sep 2024 00:38:04 -0400 Subject: [PATCH 01/10] Implemented CPU and Naive --- src/main.cpp | 2 +- stream_compaction/cpu.cu | 54 ++++++++++++++++++++++++++++++---- stream_compaction/naive.cu | 59 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 107 insertions(+), 8 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..90fcea1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 24; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..2ed649d 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,13 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + odata[0] = 0; + for (int k = 1; k < n; ++k) + { + odata[k] = odata[k - 1] + idata[k - 1]; + } + timer().endCpuTimer(); } @@ -30,9 +36,19 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int olength = 0; + for (int k = 0; k < n; ++k) + { + if (idata[k] != 0) + { + odata[olength] = idata[k]; + ++olength; + } + } + timer().endCpuTimer(); - return -1; + return olength; } /** @@ -42,9 +58,37 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int* tmpdata = new int[n]; + + // Compute the temporary array of pass/fail checks + for (int k = 0; k < n; ++k) + { + odata[k] = idata[k] != 0; + } + + // Scan the temporary array + tmpdata[0] = 0; + for (int k = 1; k < n; ++k) + { + tmpdata[k] = tmpdata[k - 1] + odata[k - 1]; + } + + // Scatter based on the found indices + int olength = 0; + for (int k = 0; k < n; ++k) + { + if (odata[k] == 1) + { + odata[tmpdata[k]] = idata[k]; + ++olength; + } + } + + delete[](tmpdata); + timer().endCpuTimer(); - return -1; + return olength; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..f80acec 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,15 +11,70 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + __global__ void kernNaiveScan(int n, int d, int* odata, const int* idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + int ipow2 = powf(2, d - 1); + if (index >= ipow2) + { + odata[index] = idata[index - ipow2] + idata[index]; + } + else + { + odata[index] = idata[index]; + } + } + + __global__ void kernShiftRight(int n, int s, int* odata, const int* idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + int output = index >= s ? idata[index - s] : 0; + odata[index] = output; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int *dev_odata, *dev_tmpdata, *dev_idata; + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc Naive::scan::dev_odata failed!"); + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc Naive::scan::dev_idata failed!"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + const int blockSize = 512; + dim3 gridDim((n + blockSize - 1) / blockSize); + + int depth_max = ilog2ceil(n); + timer().startGpuTimer(); - // TODO + + for (int d = 1; d <= depth_max; ++d) + { + kernNaiveScan<<>>(n, d, dev_odata, dev_idata); + + int *tmp = dev_idata; + dev_idata = dev_odata; + dev_odata = tmp; + } + kernShiftRight<<>>(n, 1, dev_odata, dev_idata); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_odata); + cudaFree(dev_idata); + checkCUDAError("cudaFree Naive::scan failed"); } } } From fdae3b6f6d9b45e8cb59b89679441bf96fe11776 Mon Sep 17 00:00:00 2001 From: Kyle Bauer Date: Tue, 17 Sep 2024 04:35:54 -0400 Subject: [PATCH 02/10] Implemented efficient scan and stram compaction --- stream_compaction/common.cu | 15 +++- stream_compaction/common.h | 2 + stream_compaction/efficient.cu | 124 ++++++++++++++++++++++++++++++++- stream_compaction/naive.cu | 5 +- 4 files changed, 138 insertions(+), 8 deletions(-) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..ebe88c2 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,11 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index >= n) return; + + bools[index] = idata[index] != 0; } /** @@ -32,7 +36,14 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index >= n) return; + + if (bools[index]) + { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..95f9a11 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -13,6 +13,8 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define blockSize 512 + /** * Check for CUDA errors; print and exit if there was a problem. */ diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..4303343 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,13 +12,69 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpSweep(int n, int d, int *data) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index >= n) return; + + if (index % (1 << (d + 1))) return; + + data[index + (1 << (d + 1)) - 1] += data[index + (1 << d) - 1]; + } + + __global__ void kernDownSweep(int n, int d, int* data) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index >= n - 1) return; + + int test = (index % (1 << (d + 1))); + if (index % (1 << (d + 1))) return; + + int t = data[index + (1 << d) - 1]; + data[index + (1 << d) - 1] = data[index + (1 << (d + 1)) - 1]; + data[index + (1 << (d + 1)) - 1] += t; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int *dev_data; + + int depth_max = ilog2ceil(n); + size_t dataSize = (1ull << depth_max); + + cudaMalloc((void**)&dev_data, dataSize * sizeof(int)); + checkCUDAError("cudaMalloc Efficient::scan::dev_data failed!"); + + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + dim3 gridDim((dataSize + blockSize - 1) / blockSize); + timer().startGpuTimer(); - // TODO + + for (int d = 0; d < depth_max; ++d) + { + kernUpSweep<<>>(dataSize, d, dev_data); + } + + int* zero = new int(0); + cudaMemcpy(dev_data + dataSize - 1, zero, sizeof(int), cudaMemcpyHostToDevice); + delete(zero); + + for (int d = depth_max - 1; d >= 0; --d) + { + kernDownSweep<<>>(dataSize, d, dev_data); + } + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_data); + checkCUDAError("cudaFree Efficient::scan failed!"); } /** @@ -31,10 +87,72 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + int *dev_idata, *dev_bools, *dev_indices, *dev_odata; + + int depth_max = ilog2ceil(n); + size_t dataSize = (1ull << depth_max); + + cudaMalloc((void**)&dev_idata, dataSize * sizeof(int)); + checkCUDAError("cudaMalloc Efficient::compact::dev_idata failed!"); + + cudaMalloc((void**)&dev_bools, dataSize * sizeof(int)); + checkCUDAError("cudaMalloc Efficient::compact::dev_bools failed!"); + + cudaMalloc((void**)&dev_indices, dataSize * sizeof(int)); + checkCUDAError("cudaMalloc Efficient::compact::dev_indices failed!"); + + cudaMalloc((void**)&dev_odata, dataSize * sizeof(int)); + checkCUDAError("cudaMalloc Efficient::compact::dev_odata failed!"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + dim3 gridDim((dataSize + blockSize - 1) / blockSize); + timer().startGpuTimer(); - // TODO + + // Map to boolean + Common::kernMapToBoolean<<>>(dataSize, dev_bools, dev_idata); + cudaMemcpy(dev_indices, dev_bools, dataSize * sizeof(int), cudaMemcpyHostToHost); + + // Scan + for (int d = 0; d < depth_max; ++d) + { + kernUpSweep<<>>(dataSize, d, dev_indices); + } + + int* zero = new int(0); + cudaMemcpy(dev_indices + dataSize - 1, zero, sizeof(int), cudaMemcpyHostToDevice); + delete(zero); + + for (int d = depth_max - 1; d >= 0; --d) + { + kernDownSweep<<>>(dataSize, d, dev_indices); + } + + // Scatter + Common::kernScatter<<>>(dataSize, dev_odata, dev_idata, dev_bools, dev_indices); + timer().endGpuTimer(); - return -1; + + int* ptr_size = new int(); + int* ptr_doLast = new int(); + cudaMemcpy(ptr_size, dev_indices + dataSize - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(ptr_doLast, dev_bools + dataSize - 1, sizeof(int), cudaMemcpyDeviceToHost); + + int size = *ptr_size + (*ptr_doLast ? 1 : 0); + + delete(ptr_doLast); + delete(ptr_size); + + cudaMemcpy(odata, dev_odata, size * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_odata); + cudaFree(dev_indices); + cudaFree(dev_bools); + cudaFree(dev_idata); + checkCUDAError("cudaFree Efficient::compact failed!"); + + return size; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index f80acec..ff22f59 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -41,7 +41,7 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - int *dev_odata, *dev_tmpdata, *dev_idata; + int *dev_odata, *dev_idata; cudaMalloc((void**)&dev_odata, n * sizeof(int)); checkCUDAError("cudaMalloc Naive::scan::dev_odata failed!"); @@ -51,7 +51,6 @@ namespace StreamCompaction { cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); - const int blockSize = 512; dim3 gridDim((n + blockSize - 1) / blockSize); int depth_max = ilog2ceil(n); @@ -74,7 +73,7 @@ namespace StreamCompaction { cudaFree(dev_odata); cudaFree(dev_idata); - checkCUDAError("cudaFree Naive::scan failed"); + checkCUDAError("cudaFree Naive::scan failed!"); } } } From 508a8627ce6dc0f1e6a8da4dc2948c4241818bcd Mon Sep 17 00:00:00 2001 From: Kyle Bauer Date: Tue, 17 Sep 2024 05:00:53 -0400 Subject: [PATCH 03/10] Added the Thrust implementation --- stream_compaction/thrust.cu | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..46ebf77 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,17 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::host_vector thrust_odata(n); + thrust::host_vector thrust_idata(idata, idata + n); + + thrust::device_vector dev_thrust_odata(thrust_odata); + thrust::device_vector dev_thrust_idata(thrust_idata); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_thrust_idata.begin(), dev_thrust_idata.end(), dev_thrust_odata.begin()); timer().endGpuTimer(); + + thrust::copy(dev_thrust_odata.begin(), dev_thrust_odata.end(), odata); } } } From 94799c15a140f477b028d9bd506eb5ab4a24e6cf Mon Sep 17 00:00:00 2001 From: Kyle Bauer Date: Tue, 17 Sep 2024 05:14:53 -0400 Subject: [PATCH 04/10] Updated cpu naming scheme --- stream_compaction/cpu.cu | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 2ed649d..becb241 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -35,20 +35,21 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { + int size = 0; + timer().startCpuTimer(); - int olength = 0; for (int k = 0; k < n; ++k) { if (idata[k] != 0) { - odata[olength] = idata[k]; - ++olength; + odata[size] = idata[k]; + ++size; } } timer().endCpuTimer(); - return olength; + return size; } /** @@ -57,9 +58,10 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); + int* indices = new int[n]; + int size = 0; - int* tmpdata = new int[n]; + timer().startCpuTimer(); // Compute the temporary array of pass/fail checks for (int k = 0; k < n; ++k) @@ -68,27 +70,27 @@ namespace StreamCompaction { } // Scan the temporary array - tmpdata[0] = 0; + indices[0] = 0; for (int k = 1; k < n; ++k) { - tmpdata[k] = tmpdata[k - 1] + odata[k - 1]; + indices[k] = indices[k - 1] + odata[k - 1]; } // Scatter based on the found indices - int olength = 0; for (int k = 0; k < n; ++k) { - if (odata[k] == 1) + if (odata[k] != 0) { - odata[tmpdata[k]] = idata[k]; - ++olength; + odata[indices[k]] = idata[k]; + ++size; } } - delete[](tmpdata); - timer().endCpuTimer(); - return olength; + + delete[](indices); + + return size; } } } From a41958ceaf688a07bcf8295d200ba150ecc68a6c Mon Sep 17 00:00:00 2001 From: Kyle Bauer Date: Tue, 17 Sep 2024 05:23:47 -0400 Subject: [PATCH 05/10] Updated naive naming scheme --- stream_compaction/naive.cu | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index ff22f59..e59e9cc 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -17,13 +17,15 @@ namespace StreamCompaction { int index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index >= n) return; - int ipow2 = powf(2, d - 1); - if (index >= ipow2) + int pow2 = (1 << (d - 1)); + if (index >= pow2) { - odata[index] = idata[index - ipow2] + idata[index]; + // Combine + odata[index] = idata[index - pow2] + idata[index]; } else { + // Unused this iteration - Copy over to the other array odata[index] = idata[index]; } } @@ -33,14 +35,16 @@ namespace StreamCompaction { int index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index >= n) return; - int output = index >= s ? idata[index - s] : 0; - odata[index] = output; + // Shift to the right + // Fill newly empty slots with 0s + odata[index] = index >= s ? idata[index - s] : 0; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + // Allocate device arrays int *dev_odata, *dev_idata; cudaMalloc((void**)&dev_odata, n * sizeof(int)); @@ -53,10 +57,12 @@ namespace StreamCompaction { dim3 gridDim((n + blockSize - 1) / blockSize); + // Number of levels needed for the naive parallel scan int depth_max = ilog2ceil(n); timer().startGpuTimer(); + // Perform an Inclusive Scan for (int d = 1; d <= depth_max; ++d) { kernNaiveScan<<>>(n, d, dev_odata, dev_idata); @@ -65,12 +71,16 @@ namespace StreamCompaction { dev_idata = dev_odata; dev_odata = tmp; } + + // Inclusive Scan -> Exclusive Scan conversion kernShiftRight<<>>(n, 1, dev_odata, dev_idata); timer().endGpuTimer(); + // Copy the output data cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + // Free device arrays cudaFree(dev_odata); cudaFree(dev_idata); checkCUDAError("cudaFree Naive::scan failed!"); From f44ea3775badd6f356d5de0a792197fb2a9951ec Mon Sep 17 00:00:00 2001 From: Kyle Bauer Date: Tue, 17 Sep 2024 05:48:08 -0400 Subject: [PATCH 06/10] Updated efficient naming scheme --- stream_compaction/efficient.cu | 71 +++++++++++++++++++++++----------- stream_compaction/naive.cu | 6 +-- 2 files changed, 52 insertions(+), 25 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 4303343..72dbdf1 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,25 +12,31 @@ namespace StreamCompaction { return timer; } + // Finds the modulo of value with 2^power) + __device__ int modPowBase2(int value, int power) + { + int shift = (sizeof(int) * 8 - power); + return (value << shift) >> shift; + } + + // Performs parallel reduction __global__ void kernUpSweep(int n, int d, int *data) { int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index >= n) return; - if (index % (1 << (d + 1))) return; + if (modPowBase2(index, d + 1)) return; data[index + (1 << (d + 1)) - 1] += data[index + (1 << d) - 1]; } + // Build a scan from an up-swept input __global__ void kernDownSweep(int n, int d, int* data) { int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index >= n - 1) return; - int test = (index % (1 << (d + 1))); - if (index % (1 << (d + 1))) return; + if (modPowBase2(index, d + 1)) return; int t = data[index + (1 << d) - 1]; data[index + (1 << d) - 1] = data[index + (1 << (d + 1)) - 1]; @@ -41,11 +47,17 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + // Number of levels needed for the scan + int maxDepth = ilog2ceil(n); + // Padded size of the work device array + size_t dataSize = (1ull << maxDepth); + // Reset value for down-sweeping + int zero = 0; + + // Allocate device arrays int *dev_data; - int depth_max = ilog2ceil(n); - size_t dataSize = (1ull << depth_max); - cudaMalloc((void**)&dev_data, dataSize * sizeof(int)); checkCUDAError("cudaMalloc Efficient::scan::dev_data failed!"); @@ -55,24 +67,27 @@ namespace StreamCompaction { timer().startGpuTimer(); - for (int d = 0; d < depth_max; ++d) + // Up-Sweep + for (int d = 0; d < maxDepth; ++d) { kernUpSweep<<>>(dataSize, d, dev_data); } - int* zero = new int(0); - cudaMemcpy(dev_data + dataSize - 1, zero, sizeof(int), cudaMemcpyHostToDevice); - delete(zero); + // Replace the last element with 0 + cudaMemcpy(dev_data + dataSize - 1, &zero, sizeof(int), cudaMemcpyHostToDevice); - for (int d = depth_max - 1; d >= 0; --d) + // Down-Sweep + for (int d = maxDepth - 1; d >= 0; --d) { kernDownSweep<<>>(dataSize, d, dev_data); } timer().endGpuTimer(); + // Copy the output data cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + // Free device arrays cudaFree(dev_data); checkCUDAError("cudaFree Efficient::scan failed!"); } @@ -87,11 +102,16 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + // Number of levels needed for the scan + int maxDepth = ilog2ceil(n); + // Padded size of the work device array + size_t dataSize = (1ull << maxDepth); + // Reset value for down-sweeping + int zero = 0; + + // Allocate device arrays int *dev_idata, *dev_bools, *dev_indices, *dev_odata; - int depth_max = ilog2ceil(n); - size_t dataSize = (1ull << depth_max); - cudaMalloc((void**)&dev_idata, dataSize * sizeof(int)); checkCUDAError("cudaMalloc Efficient::compact::dev_idata failed!"); @@ -112,28 +132,33 @@ namespace StreamCompaction { // Map to boolean Common::kernMapToBoolean<<>>(dataSize, dev_bools, dev_idata); + + + // Scan the indices cudaMemcpy(dev_indices, dev_bools, dataSize * sizeof(int), cudaMemcpyHostToHost); - // Scan - for (int d = 0; d < depth_max; ++d) + // Up-Sweep + for (int d = 0; d < maxDepth; ++d) { kernUpSweep<<>>(dataSize, d, dev_indices); } - int* zero = new int(0); - cudaMemcpy(dev_indices + dataSize - 1, zero, sizeof(int), cudaMemcpyHostToDevice); - delete(zero); + // Replace the last element with 0 + cudaMemcpy(dev_indices + dataSize - 1, &zero, sizeof(int), cudaMemcpyHostToDevice); - for (int d = depth_max - 1; d >= 0; --d) + // Down-Sweep + for (int d = maxDepth - 1; d >= 0; --d) { kernDownSweep<<>>(dataSize, d, dev_indices); } + // Scatter Common::kernScatter<<>>(dataSize, dev_odata, dev_idata, dev_bools, dev_indices); timer().endGpuTimer(); + // Retrieve the output size int* ptr_size = new int(); int* ptr_doLast = new int(); cudaMemcpy(ptr_size, dev_indices + dataSize - 1, sizeof(int), cudaMemcpyDeviceToHost); @@ -144,8 +169,10 @@ namespace StreamCompaction { delete(ptr_doLast); delete(ptr_size); + // Copy the output data cudaMemcpy(odata, dev_odata, size * sizeof(int), cudaMemcpyDeviceToHost); + // Free device arrays cudaFree(dev_odata); cudaFree(dev_indices); cudaFree(dev_bools); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index e59e9cc..0467219 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -57,13 +57,13 @@ namespace StreamCompaction { dim3 gridDim((n + blockSize - 1) / blockSize); - // Number of levels needed for the naive parallel scan - int depth_max = ilog2ceil(n); + // Number of levels needed for the scan + int maxDepth = ilog2ceil(n); timer().startGpuTimer(); // Perform an Inclusive Scan - for (int d = 1; d <= depth_max; ++d) + for (int d = 1; d <= maxDepth; ++d) { kernNaiveScan<<>>(n, d, dev_odata, dev_idata); From b822307a5aee57e7d206044aff6e5730bb2c6d6a Mon Sep 17 00:00:00 2001 From: Kyle Bauer Date: Tue, 17 Sep 2024 22:26:02 -0400 Subject: [PATCH 07/10] Added per function block sizes --- src/main.cpp | 10 +++++++++- stream_compaction/common.h | 2 -- stream_compaction/efficient.cu | 2 ++ stream_compaction/naive.cu | 1 + 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 90fcea1..a11297d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -127,12 +127,20 @@ int main(int argc, char* argv[]) { printCmpLenResult(count, expectedNPOT, b, c); zeroArray(SIZE, c); - printDesc("cpu compact with scan"); + printDesc("cpu compact with scan, power-of-two"); count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + zeroArray(SIZE, c); + printDesc("cpu compact with scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithScan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + expectedNPOT = count; + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 95f9a11..d2c1fed 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -13,8 +13,6 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) -#define blockSize 512 - /** * Check for CUDA errors; print and exit if there was a problem. */ diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 72dbdf1..0d8acc4 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -63,6 +63,7 @@ namespace StreamCompaction { cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + const int blockSize = 256; dim3 gridDim((dataSize + blockSize - 1) / blockSize); timer().startGpuTimer(); @@ -126,6 +127,7 @@ namespace StreamCompaction { cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + int blockSize = 128; dim3 gridDim((dataSize + blockSize - 1) / blockSize); timer().startGpuTimer(); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 0467219..6bff991 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -55,6 +55,7 @@ namespace StreamCompaction { cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + const int blockSize = 128; dim3 gridDim((n + blockSize - 1) / blockSize); // Number of levels needed for the scan From d2fb826037b6a9f2e0d2d20d92f9f0e119fcf8cf Mon Sep 17 00:00:00 2001 From: Kyle Bauer Date: Tue, 17 Sep 2024 22:36:06 -0400 Subject: [PATCH 08/10] Added comparison images --- img/Scan Implementation Comparison Pow2.svg | 1 + img/Scan Implementation Comparison.svg | 1 + 2 files changed, 2 insertions(+) create mode 100644 img/Scan Implementation Comparison Pow2.svg create mode 100644 img/Scan Implementation Comparison.svg diff --git a/img/Scan Implementation Comparison Pow2.svg b/img/Scan Implementation Comparison Pow2.svg new file mode 100644 index 0000000..99f118d --- /dev/null +++ b/img/Scan Implementation Comparison Pow2.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/img/Scan Implementation Comparison.svg b/img/Scan Implementation Comparison.svg new file mode 100644 index 0000000..245ce24 --- /dev/null +++ b/img/Scan Implementation Comparison.svg @@ -0,0 +1 @@ + \ No newline at end of file From 41e6219ad291878daceece0eb5b317d8f0670843 Mon Sep 17 00:00:00 2001 From: kbau121-seas <144053033+kbau121-seas@users.noreply.github.com> Date: Tue, 17 Sep 2024 23:29:54 -0400 Subject: [PATCH 09/10] Update README.md --- README.md | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 82 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0e38ddb..69e4b86 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,88 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Kyle Bauer + * [LinkedIn](https://www.linkedin.com/in/kyle-bauer-75bb25171/), [twitter](https://x.com/KyleBauer414346) +* Tested on: Windows 10, i-7 12700 @ 2.1GHz 32GB, NVIDIA T1000 4GB (CETS Virtual Lab) -### (TODO: Your README) +Analysis +--- -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +
+ +
+The CPU, Naive, and Work-Efficient implementations all scaled similarly with an increasing array size. Generally, doubling the array size would double the runtime of each algorithm. + +The CPU and Work-Efficient implementations compared very similarly, with the Work-Efficient runtimes never straying more than 3% away from the CPU runtimes. + +The Naive implemenation's runtime diverged slightly from the CPU and Work-Efficient runtimes at around the 2^21 array size mark. In runs with a lesser element size than this, Naive performed up to 6% faster (at 2^20 elements) compared to the CPU implementation. And in runs with a greater element size, Naive performed at most 10% worse (at 2^24 elements) than the CPU implementation. + +The Thrust implementation is clearly the overall most performant option, pulling completely away from all other implementations as the array size increases. + +Potential Bottlenecks: +1. Global Memory: Both the Naive and Work-Efficient algorithms were implemented using global memory with no shared memory, creating a massive amount of overhead anytime the implementations wish to read or write data. +2. Memory Locality: Both the Naive and Work-Efficient algorithms read and write data across very large arrays. As the algorithms progress, these memory accesses become progressively more sparse- randomly accessing the memory will cause cache thrashing decreasing the bus utilization. +3. GPU Utilization: The Naive algorithm suffers from not saturating the GPU (Many threads are ended early leaving a couple of active threads in a warp). This inherently decreases parallelism and will increase the runtime as the array size grows. + +Sample Output +--- + +``` +**************** +** SCAN TESTS ** +**************** + [ 33 40 46 12 48 15 5 37 39 42 27 41 35 ... 10 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 27.4656ms (std::chrono Measured) + [ 0 33 73 119 131 179 194 199 236 275 317 344 385 ... 410928744 410928754 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 26.8243ms (std::chrono Measured) + [ 0 33 73 119 131 179 194 199 236 275 317 344 385 ... 410928700 410928722 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 31.6926ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 30.7692ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 23.55ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 23.0375ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 1.71158ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 1.14893ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 3 2 0 2 0 0 2 0 0 3 3 2 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 31.584ms (std::chrono Measured) + [ 3 2 2 2 3 3 2 2 1 1 2 3 1 ... 1 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 35.5074ms (std::chrono Measured) + [ 3 2 2 2 3 3 2 2 1 1 2 3 1 ... 2 2 ] + passed +==== cpu compact with scan, power-of-two ==== + elapsed time: 74.7157ms (std::chrono Measured) + [ 3 2 2 2 3 3 2 2 1 1 2 3 1 ... 1 1 ] + passed +==== cpu compact with scan, non-power-of-two ==== + elapsed time: 73.4743ms (std::chrono Measured) + [ 3 2 2 2 3 3 2 2 1 1 2 3 1 ... 2 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 33.6798ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 24.5682ms (CUDA Measured) + passed +``` From c76a492faa7f7dcbd1a8095402e1ceb0e2af253d Mon Sep 17 00:00:00 2001 From: kbau121-seas <144053033+kbau121-seas@users.noreply.github.com> Date: Tue, 17 Sep 2024 23:35:45 -0400 Subject: [PATCH 10/10] Update README.md --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 69e4b86..6cf09f5 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,13 @@ CUDA Stream Compaction * [LinkedIn](https://www.linkedin.com/in/kyle-bauer-75bb25171/), [twitter](https://x.com/KyleBauer414346) * Tested on: Windows 10, i-7 12700 @ 2.1GHz 32GB, NVIDIA T1000 4GB (CETS Virtual Lab) +Features +--- +* CPU Scan and Stream Compaction +* Naive Scan +* Work-Efficient Scan and Stream Compaction +* Thrust Scan Wrapper + Analysis ---