From 465f824b5ce05c1d6be9cdc0e92c9fe03531a19e Mon Sep 17 00:00:00 2001 From: Kau Date: Tue, 10 Sep 2024 16:16:36 -0400 Subject: [PATCH 01/10] Added Part 1 --- stream_compaction/cpu.cu | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..d9896f2 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,11 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int partialSum = 0; + for (int i = 0; i < n; ++i) { + odata[i] = partialSum; + partialSum += idata[i]; + } timer().endCpuTimer(); } @@ -30,9 +35,20 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int numElements = 0; + for (int i = 0; i < n; ++i) { + if (idata[i]) odata[numElements++] = idata[i]; + } timer().endCpuTimer(); - return -1; + return numElements; + } + + int scatter(int n, int* odata, const int* bdata, const int* idata) { + int numElements = 0; + for (int i = 0; i < n; ++i) { + if (bdata[i]) odata[numElements++] = idata[i]; + } + return numElements; } /** @@ -42,9 +58,24 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + // Create boolean mask + int* buffer = new int[n]; + for (int i = 0; i < n; ++i) { + buffer[i] = (idata[i] != 0); + } + + // Scan (calling timed function leads to error) + int partialSum = 0; + for (int i = 0; i < n; ++i) { + odata[i] = partialSum; + partialSum += buffer[i]; + } + + int numElements = scatter(n, odata, buffer, idata); + + delete[] buffer; timer().endCpuTimer(); - return -1; + return numElements; } } } From 1d673ff6f0fbf0eccf2da1f8133d789524439638 Mon Sep 17 00:00:00 2001 From: Kau Date: Mon, 16 Sep 2024 10:04:38 -0400 Subject: [PATCH 02/10] Added Part 2 --- src/main.cpp | 6 ++-- stream_compaction/cpu.cu | 4 +-- stream_compaction/naive.cu | 73 +++++++++++++++++++++++++++++++++++++- 3 files changed, 77 insertions(+), 6 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..08fcfa6 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 4; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -64,7 +64,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index d9896f2..1cbd725 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -57,9 +57,9 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + int* buffer = new int[n]; timer().startCpuTimer(); // Create boolean mask - int* buffer = new int[n]; for (int i = 0; i < n; ++i) { buffer[i] = (idata[i] != 0); } @@ -73,8 +73,8 @@ namespace StreamCompaction { int numElements = scatter(n, odata, buffer, idata); - delete[] buffer; timer().endCpuTimer(); + delete[] buffer; return numElements; } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..6e78567 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,6 +2,7 @@ #include #include "common.h" #include "naive.h" +#include "device_launch_parameters.h" namespace StreamCompaction { namespace Naive { @@ -13,13 +14,83 @@ namespace StreamCompaction { } // TODO: __global__ + __global__ void shiftArrayElements(int n, const int* readBuffer, int* writeBuffer) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) return; + + if (index == 0) { + writeBuffer[0] = 0; + return; + } + + writeBuffer[index] = readBuffer[index - 1]; + } + + __global__ void handleNonPower(int n, int d, int* buffer) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + int pow2tod = 1 << d; + + if (index >= n - pow2tod) return; + + buffer[pow2tod + index] += buffer[index]; + } + + __global__ void naiveScanStep(int n, int d, const int* readBuffer, int* writeBuffer) { + // compute thread index + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) return; + + int pow2tod = 1 << d; + + if (index > pow2tod) { + writeBuffer[index] = readBuffer[index] + readBuffer[index - pow2tod]; + } + else { + writeBuffer[index] = readBuffer[index]; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int blockSize = 128; + dim3 fullBlocksPerGrid{ (unsigned int) (n + blockSize - 1) / blockSize }; + + int* dev_buffer1; + int* dev_buffer2; + + cudaMalloc((void**)&dev_buffer1, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_buffer1 failed!"); + cudaMalloc((void**)&dev_buffer2, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_buffer2 failed!"); + + cudaMemcpy(dev_buffer2, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); + shiftArrayElements<<>>(n, dev_buffer2, dev_buffer1); + checkCUDAError("shiftArrayElements failed!"); + cudaDeviceSynchronize(); // TODO + for (int d = 0; d < ilog2(n); ++d) { + naiveScanStep<<>>(n, d, dev_buffer1, dev_buffer2); + checkCUDAError("naiveScanStep failed!"); + cudaDeviceSynchronize(); + + std::swap(dev_buffer1, dev_buffer2); + } + if ((1 << ilog2(n)) != n) { + fullBlocksPerGrid.x = (n - (1 << ilog2(n)) + blockSize - 1) / blockSize; + handleNonPower<<>>(n, ilog2(n), dev_buffer1); + checkCUDAError("handleNonPower failed!"); + cudaDeviceSynchronize(); + } timer().endGpuTimer(); + + cudaMemcpy(odata, dev_buffer1, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_buffer1); + cudaFree(dev_buffer2); } } -} +} \ No newline at end of file From fc07224a46286cdda8638d5dd65141a0f2513607 Mon Sep 17 00:00:00 2001 From: Kau Date: Mon, 16 Sep 2024 16:58:28 -0400 Subject: [PATCH 03/10] Added Part 3 --- src/main.cpp | 6 +-- stream_compaction/common.cu | 11 +++++ stream_compaction/common.h | 3 ++ stream_compaction/efficient.cu | 83 +++++++++++++++++++++++++++++++++- stream_compaction/naive.cu | 16 +------ 5 files changed, 101 insertions(+), 18 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 08fcfa6..e19a26c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 4; // feel free to change the size of array +const int SIZE = 1 << 8; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -71,14 +71,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..c949f8c 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,4 +1,5 @@ #include "common.h" +#include "device_launch_parameters.h" void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); @@ -17,7 +18,17 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) { namespace StreamCompaction { namespace Common { + __global__ void shiftArrayElements(int n, int shift, const int* readBuffer, int* writeBuffer) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) return; + if (index < shift) { + writeBuffer[index] = 0; + return; + } + + writeBuffer[index] = readBuffer[index - shift]; + } /** * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..cfdc45b 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -37,6 +37,9 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices); + __global__ void shiftArrayElements(int n, int shift, + const int* readBuffer, int* writeBuffer); + /** * This class is used for timing the performance * Uncopyable and unmovable diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..2c73fc3 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,7 @@ #include #include "common.h" #include "efficient.h" +#include "device_launch_parameters.h" namespace StreamCompaction { namespace Efficient { @@ -12,13 +13,93 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpSweep(int n, int d, int* buffer) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + int pow2tod = 1 << d; + int pow2todp1 = 2 * pow2tod; + + if (index >= n / pow2todp1) return; + index *= pow2todp1; + + buffer[index + pow2todp1 - 1] += buffer[index + pow2tod - 1]; + } + + __global__ void kernDownSweep(int n, int d, int s, int* buffer) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + int pow2tod = 1 << d; + int pow2todp1 = 2 * pow2tod; + + if (s) { + buffer[pow2todp1 - 1] = 0; + } + + if (index > n / pow2todp1) return; + index *= pow2todp1; + + int tmp = buffer[index + pow2tod - 1]; + buffer[index + pow2tod - 1] = buffer[index + pow2todp1 - 1]; + buffer[index + pow2todp1 - 1] += tmp; + } + + __global__ void kernZeroPadding(int n, int d, int* buffer) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + if (index >= 1 << (d + 1) - n) return; + + buffer[n + index] = 0; + } + + dim3 computeBlocksPerGrid(int threads, int blockSize) { + return dim3{ (unsigned int)(threads + blockSize - 1) / blockSize }; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int blockSize = 128; + + bool isPower2Length = (n == (1 << ilog2(n))); + + int bufferLength = (isPower2Length) ? n : 1 << ilog2ceil(n); + + int* tmpArray; + cudaMalloc((void**)&tmpArray, bufferLength * sizeof(int)); + checkCUDAError("cudaMalloc tmpArray failed!"); + + if (!isPower2Length) { + dim3 blocks = computeBlocksPerGrid(n - (1 << ilog2(n)), blockSize); + kernZeroPadding<<>>(n, ilog2(n), tmpArray); + checkCUDAError("kernZeroPadding failed!"); + cudaDeviceSynchronize(); + } + + cudaMemcpy(tmpArray, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); // TODO + for (int d = 0; d < ilog2ceil(n); ++d) { + dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize); + kernUpSweep<<>>(bufferLength, d, tmpArray); + checkCUDAError("kernUpSweep failed!"); + cudaDeviceSynchronize(); + } + + bool flag = 1; + for (int d = ilog2ceil(n) - 1; d >= 0; --d) { + dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize); + kernDownSweep<<>>(bufferLength, d, flag, tmpArray); + flag = 0; + checkCUDAError("kernDownSweep failed!"); + cudaDeviceSynchronize(); + } timer().endGpuTimer(); + + cudaMemcpy(odata, tmpArray, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(tmpArray); } /** @@ -37,4 +118,4 @@ namespace StreamCompaction { return -1; } } -} +} \ No newline at end of file diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 6e78567..eb2ba1b 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -13,19 +13,6 @@ namespace StreamCompaction { return timer; } // TODO: __global__ - - __global__ void shiftArrayElements(int n, const int* readBuffer, int* writeBuffer) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - if (index >= n) return; - - if (index == 0) { - writeBuffer[0] = 0; - return; - } - - writeBuffer[index] = readBuffer[index - 1]; - } - __global__ void handleNonPower(int n, int d, int* buffer) { int index = threadIdx.x + blockIdx.x * blockDim.x; int pow2tod = 1 << d; @@ -68,7 +55,7 @@ namespace StreamCompaction { cudaMemcpy(dev_buffer2, idata, n * sizeof(int), cudaMemcpyHostToDevice); timer().startGpuTimer(); - shiftArrayElements<<>>(n, dev_buffer2, dev_buffer1); + StreamCompaction::Common::shiftArrayElements<<>>(n, 1, dev_buffer2, dev_buffer1); checkCUDAError("shiftArrayElements failed!"); cudaDeviceSynchronize(); // TODO @@ -79,6 +66,7 @@ namespace StreamCompaction { std::swap(dev_buffer1, dev_buffer2); } + // perform last step if ((1 << ilog2(n)) != n) { fullBlocksPerGrid.x = (n - (1 << ilog2(n)) + blockSize - 1) / blockSize; handleNonPower<<>>(n, ilog2(n), dev_buffer1); From 73a3ed81d93710556bd48c6ed723697fcd9c19a9 Mon Sep 17 00:00:00 2001 From: Kau Date: Tue, 17 Sep 2024 10:38:27 -0400 Subject: [PATCH 04/10] Added Part 4 --- src/main.cpp | 14 ++--- stream_compaction/common.cu | 17 ++++-- stream_compaction/efficient.cu | 94 ++++++++++++++++++++++++++++++++-- stream_compaction/thrust.cu | 13 +++++ 4 files changed, 123 insertions(+), 15 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index e19a26c..ca51934 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 20; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -54,11 +54,11 @@ int main(int argc, char* argv[]) { printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + /*For bug - finding only : Array of 1s to help find bugs in stream compaction or scan onesArray(SIZE, c); printDesc("1s array for finding bugs"); StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ + printArray(SIZE, c, true);*/ zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); @@ -85,14 +85,14 @@ int main(int argc, char* argv[]) { printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -137,14 +137,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index c949f8c..12c8acf 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -33,17 +33,26 @@ namespace StreamCompaction { * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. */ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { + __global__ void kernMapToBoolean(int n, int* bools, const int* idata) { // TODO - } + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) return; + bools[index] = (idata[index] != 0); + + } + /** * Performs scatter on an array. That is, for each element in idata, * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. */ - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { + __global__ void kernScatter(int n, int* odata, + const int* idata, const int* bools, const int* indices) { // TODO + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) return; + + if (bools[index]) odata[indices[index]] = idata[index]; } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2c73fc3..2760ba8 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -19,7 +19,7 @@ namespace StreamCompaction { int pow2tod = 1 << d; int pow2todp1 = 2 * pow2tod; - if (index >= n / pow2todp1) return; + if (index > n / pow2todp1 - 1) return; index *= pow2todp1; buffer[index + pow2todp1 - 1] += buffer[index + pow2tod - 1]; @@ -35,7 +35,7 @@ namespace StreamCompaction { buffer[pow2todp1 - 1] = 0; } - if (index > n / pow2todp1) return; + if (index > n / pow2todp1 - 1) return; index *= pow2todp1; int tmp = buffer[index + pow2tod - 1]; @@ -102,6 +102,49 @@ namespace StreamCompaction { cudaFree(tmpArray); } + void scanUntimed(int n, int* odata, const int* idata) { + int blockSize = 128; + + bool isPower2Length = (n == (1 << ilog2(n))); + + int bufferLength = (isPower2Length) ? n : 1 << ilog2ceil(n); + + int* tmpArray; + cudaMalloc((void**)&tmpArray, bufferLength * sizeof(int)); + checkCUDAError("cudaMalloc tmpArray failed!"); + + if (!isPower2Length) { + dim3 blocks = computeBlocksPerGrid(n - (1 << ilog2(n)), blockSize); + kernZeroPadding << > > (n, ilog2(n), tmpArray); + checkCUDAError("kernZeroPadding failed!"); + cudaDeviceSynchronize(); + } + + cudaMemcpy(tmpArray, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + // TODO + for (int d = 0; d < ilog2ceil(n); ++d) { + dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize); + kernUpSweep<<>>(bufferLength, d, tmpArray); + checkCUDAError("kernUpSweep failed!"); + cudaDeviceSynchronize(); + } + + bool flag = 1; + for (int d = ilog2ceil(n) - 1; d >= 0; --d) { + dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize); + kernDownSweep<<>>(bufferLength, d, flag, tmpArray); + flag = 0; + checkCUDAError("kernDownSweep failed!"); + cudaDeviceSynchronize(); + } + + cudaMemcpy(odata, tmpArray, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(tmpArray); + } + + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -111,11 +154,54 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ - int compact(int n, int *odata, const int *idata) { + int compact(int n, int* odata, const int* idata) { + int blockSize = 128; + dim3 blocks{ (unsigned int)(n + blockSize - 1) / blockSize }; + + int* dev_buffer1; + int* dev_buffer2; + int* dev_boolArray; + int* dev_indices; + cudaMalloc((void**)&dev_boolArray, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_boolArray failed!"); + cudaMalloc((void**)&dev_indices, n * sizeof(int)); + cudaDeviceSynchronize(); + checkCUDAError("cudaMalloc dev_indices failed!"); + cudaMalloc((void**)&dev_buffer1, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_buffer1 failed!"); + cudaMalloc((void**)&dev_buffer2, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_buffer2 failed!"); + + cudaMemcpy(dev_buffer1, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); + cudaDeviceSynchronize(); + checkCUDAError("timer failed!"); // TODO + + + StreamCompaction::Common::kernMapToBoolean<<>>(n, dev_boolArray, dev_buffer1); + cudaDeviceSynchronize(); + checkCUDAError("kernMapToBoolean failed!"); + + scanUntimed(n, dev_indices, dev_boolArray); + + StreamCompaction::Common::kernScatter<<>>(n, dev_buffer2, dev_buffer1, dev_boolArray, dev_indices); + checkCUDAError("kernScatter failed!"); + cudaDeviceSynchronize(); + + cudaMemcpy(odata, dev_buffer2, n * sizeof(int), cudaMemcpyDeviceToHost); + + int numElem; + cudaMemcpy(&numElem, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + timer().endGpuTimer(); - return -1; + + cudaFree(dev_boolArray); + cudaFree(dev_indices); + cudaFree(dev_buffer1); + cudaFree(dev_buffer2); + return numElem; } } } \ No newline at end of file diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..463ca2e 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,24 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int* dev_buffer; + thrust::device_ptr dev_thrustBuffer; + cudaMalloc((void**)&dev_buffer, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_buffer failed!"); + dev_thrustBuffer = thrust::device_ptr(dev_buffer); + + cudaMemcpy(dev_buffer, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_thrustBuffer, dev_thrustBuffer + n, dev_thrustBuffer); timer().endGpuTimer(); + + cudaMemcpy(odata, dev_buffer, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_buffer); } } } From 4a13a709603152dfad968456b3c16fa5e2308e76 Mon Sep 17 00:00:00 2001 From: Kau Date: Tue, 17 Sep 2024 11:03:58 -0400 Subject: [PATCH 05/10] Bug Fixes and Code Cleanup --- stream_compaction/cpu.cu | 13 ++--- stream_compaction/cpu.h | 2 +- stream_compaction/efficient.cu | 90 +++++++++------------------------- stream_compaction/efficient.h | 2 +- 4 files changed, 30 insertions(+), 77 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 1cbd725..956cced 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -17,15 +17,15 @@ namespace StreamCompaction { * For performance analysis, this is supposed to be a simple for loop. * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ - void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); + void scan(int n, int *odata, const int *idata, bool timed) { + if (timed) timer().startCpuTimer(); // TODO int partialSum = 0; for (int i = 0; i < n; ++i) { odata[i] = partialSum; partialSum += idata[i]; } - timer().endCpuTimer(); + if (timed) timer().endCpuTimer(); } /** @@ -64,12 +64,7 @@ namespace StreamCompaction { buffer[i] = (idata[i] != 0); } - // Scan (calling timed function leads to error) - int partialSum = 0; - for (int i = 0; i < n; ++i) { - odata[i] = partialSum; - partialSum += buffer[i]; - } + scan(n, odata, idata, 0); int numElements = scatter(n, odata, buffer, idata); diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c047..f2f8c14 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -6,7 +6,7 @@ namespace StreamCompaction { namespace CPU { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, bool timed = 1); int compactWithoutScan(int n, int *odata, const int *idata); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2760ba8..cbd3eb3 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -25,16 +25,12 @@ namespace StreamCompaction { buffer[index + pow2todp1 - 1] += buffer[index + pow2tod - 1]; } - __global__ void kernDownSweep(int n, int d, int s, int* buffer) { + __global__ void kernDownSweep(int n, int d, int* buffer) { int index = threadIdx.x + blockIdx.x * blockDim.x; int pow2tod = 1 << d; int pow2todp1 = 2 * pow2tod; - if (s) { - buffer[pow2todp1 - 1] = 0; - } - if (index > n / pow2todp1 - 1) return; index *= pow2todp1; @@ -58,93 +54,52 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - int blockSize = 128; - - bool isPower2Length = (n == (1 << ilog2(n))); - - int bufferLength = (isPower2Length) ? n : 1 << ilog2ceil(n); - - int* tmpArray; - cudaMalloc((void**)&tmpArray, bufferLength * sizeof(int)); - checkCUDAError("cudaMalloc tmpArray failed!"); - - if (!isPower2Length) { - dim3 blocks = computeBlocksPerGrid(n - (1 << ilog2(n)), blockSize); - kernZeroPadding<<>>(n, ilog2(n), tmpArray); - checkCUDAError("kernZeroPadding failed!"); - cudaDeviceSynchronize(); - } - - cudaMemcpy(tmpArray, idata, n * sizeof(int), cudaMemcpyHostToDevice); - - timer().startGpuTimer(); - // TODO - for (int d = 0; d < ilog2ceil(n); ++d) { - dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize); - kernUpSweep<<>>(bufferLength, d, tmpArray); - checkCUDAError("kernUpSweep failed!"); - cudaDeviceSynchronize(); - } - - bool flag = 1; - for (int d = ilog2ceil(n) - 1; d >= 0; --d) { - dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize); - kernDownSweep<<>>(bufferLength, d, flag, tmpArray); - flag = 0; - checkCUDAError("kernDownSweep failed!"); - cudaDeviceSynchronize(); - } - timer().endGpuTimer(); - - cudaMemcpy(odata, tmpArray, n * sizeof(int), cudaMemcpyDeviceToHost); - - cudaFree(tmpArray); - } - - void scanUntimed(int n, int* odata, const int* idata) { + void scan(int n, int *odata, const int *idata, bool timed) { int blockSize = 128; bool isPower2Length = (n == (1 << ilog2(n))); int bufferLength = (isPower2Length) ? n : 1 << ilog2ceil(n); - int* tmpArray; - cudaMalloc((void**)&tmpArray, bufferLength * sizeof(int)); + int* dev_tmpArray; + cudaMalloc((void**)&dev_tmpArray, bufferLength * sizeof(int)); checkCUDAError("cudaMalloc tmpArray failed!"); if (!isPower2Length) { dim3 blocks = computeBlocksPerGrid(n - (1 << ilog2(n)), blockSize); - kernZeroPadding << > > (n, ilog2(n), tmpArray); + kernZeroPadding<<>>(n, ilog2(n), dev_tmpArray); checkCUDAError("kernZeroPadding failed!"); cudaDeviceSynchronize(); } - cudaMemcpy(tmpArray, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_tmpArray, idata, n * sizeof(int), cudaMemcpyHostToDevice); + if (timed) timer().startGpuTimer(); // TODO for (int d = 0; d < ilog2ceil(n); ++d) { dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize); - kernUpSweep<<>>(bufferLength, d, tmpArray); + kernUpSweep<<>>(bufferLength, d, dev_tmpArray); checkCUDAError("kernUpSweep failed!"); cudaDeviceSynchronize(); } - bool flag = 1; + int zero = 0; + cudaMemcpy(&dev_tmpArray[bufferLength - 1], &zero, sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy zero failed!"); + for (int d = ilog2ceil(n) - 1; d >= 0; --d) { dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize); - kernDownSweep<<>>(bufferLength, d, flag, tmpArray); - flag = 0; + kernDownSweep<<>>(bufferLength, d, dev_tmpArray); checkCUDAError("kernDownSweep failed!"); cudaDeviceSynchronize(); } + if (timed) timer().endGpuTimer(); - cudaMemcpy(odata, tmpArray, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(odata, dev_tmpArray, n * sizeof(int), cudaMemcpyDeviceToHost); - cudaFree(tmpArray); + cudaFree(dev_tmpArray); } - /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -178,13 +133,12 @@ namespace StreamCompaction { cudaDeviceSynchronize(); checkCUDAError("timer failed!"); // TODO - StreamCompaction::Common::kernMapToBoolean<<>>(n, dev_boolArray, dev_buffer1); cudaDeviceSynchronize(); checkCUDAError("kernMapToBoolean failed!"); - scanUntimed(n, dev_indices, dev_boolArray); + scan(n, dev_indices, dev_boolArray, 0); StreamCompaction::Common::kernScatter<<>>(n, dev_buffer2, dev_buffer1, dev_boolArray, dev_indices); checkCUDAError("kernScatter failed!"); @@ -192,8 +146,12 @@ namespace StreamCompaction { cudaMemcpy(odata, dev_buffer2, n * sizeof(int), cudaMemcpyDeviceToHost); - int numElem; - cudaMemcpy(&numElem, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + // Index that last element in idata would have, if it was valid + int lastIndex; + cudaMemcpy(&lastIndex, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + // Check if last element is valid + int lastBool; + cudaMemcpy(&lastBool, &dev_boolArray[n - 1], sizeof(int), cudaMemcpyDeviceToHost); timer().endGpuTimer(); @@ -201,7 +159,7 @@ namespace StreamCompaction { cudaFree(dev_indices); cudaFree(dev_buffer1); cudaFree(dev_buffer2); - return numElem; + return (lastBool) ? lastIndex + 1 : lastIndex; } } } \ No newline at end of file diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..a4455cc 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,7 +6,7 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, bool timed = 1); int compact(int n, int *odata, const int *idata); } From ce79167c04a539c4146f87b0045829e69192e64c Mon Sep 17 00:00:00 2001 From: Kau Date: Wed, 18 Sep 2024 17:35:05 -0400 Subject: [PATCH 06/10] Using CudaMemset --- stream_compaction/efficient.cu | 29 +++++++++-------------------- stream_compaction/naive.cu | 10 +++++----- 2 files changed, 14 insertions(+), 25 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index cbd3eb3..9c84ae3 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -13,10 +13,9 @@ namespace StreamCompaction { return timer; } - __global__ void kernUpSweep(int n, int d, int* buffer) { + __global__ void kernUpSweep(int n, int pow2tod, int* buffer) { int index = threadIdx.x + blockIdx.x * blockDim.x; - int pow2tod = 1 << d; int pow2todp1 = 2 * pow2tod; if (index > n / pow2todp1 - 1) return; @@ -25,10 +24,9 @@ namespace StreamCompaction { buffer[index + pow2todp1 - 1] += buffer[index + pow2tod - 1]; } - __global__ void kernDownSweep(int n, int d, int* buffer) { + __global__ void kernDownSweep(int n, int pow2tod, int* buffer) { int index = threadIdx.x + blockIdx.x * blockDim.x; - int pow2tod = 1 << d; int pow2todp1 = 2 * pow2tod; if (index > n / pow2todp1 - 1) return; @@ -55,7 +53,7 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata, bool timed) { - int blockSize = 128; + int blockSize = 64; bool isPower2Length = (n == (1 << ilog2(n))); @@ -66,10 +64,7 @@ namespace StreamCompaction { checkCUDAError("cudaMalloc tmpArray failed!"); if (!isPower2Length) { - dim3 blocks = computeBlocksPerGrid(n - (1 << ilog2(n)), blockSize); - kernZeroPadding<<>>(n, ilog2(n), dev_tmpArray); - checkCUDAError("kernZeroPadding failed!"); - cudaDeviceSynchronize(); + cudaMemset(dev_tmpArray + n, 0, (bufferLength - n) * sizeof(int)); } cudaMemcpy(dev_tmpArray, idata, n * sizeof(int), cudaMemcpyHostToDevice); @@ -78,18 +73,16 @@ namespace StreamCompaction { // TODO for (int d = 0; d < ilog2ceil(n); ++d) { dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize); - kernUpSweep<<>>(bufferLength, d, dev_tmpArray); + kernUpSweep<<>>(bufferLength, 1 << d, dev_tmpArray); checkCUDAError("kernUpSweep failed!"); cudaDeviceSynchronize(); } - int zero = 0; - cudaMemcpy(&dev_tmpArray[bufferLength - 1], &zero, sizeof(int), cudaMemcpyHostToDevice); - checkCUDAError("cudaMemcpy zero failed!"); + cudaMemset(dev_tmpArray + bufferLength - 1, 0, sizeof(int)); for (int d = ilog2ceil(n) - 1; d >= 0; --d) { dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize); - kernDownSweep<<>>(bufferLength, d, dev_tmpArray); + kernDownSweep<<>>(bufferLength, 1 << d, dev_tmpArray); checkCUDAError("kernDownSweep failed!"); cudaDeviceSynchronize(); } @@ -120,7 +113,6 @@ namespace StreamCompaction { cudaMalloc((void**)&dev_boolArray, n * sizeof(int)); checkCUDAError("cudaMalloc dev_boolArray failed!"); cudaMalloc((void**)&dev_indices, n * sizeof(int)); - cudaDeviceSynchronize(); checkCUDAError("cudaMalloc dev_indices failed!"); cudaMalloc((void**)&dev_buffer1, n * sizeof(int)); checkCUDAError("cudaMalloc dev_buffer1 failed!"); @@ -130,9 +122,6 @@ namespace StreamCompaction { cudaMemcpy(dev_buffer1, idata, n * sizeof(int), cudaMemcpyHostToDevice); timer().startGpuTimer(); - cudaDeviceSynchronize(); - checkCUDAError("timer failed!"); - // TODO StreamCompaction::Common::kernMapToBoolean<<>>(n, dev_boolArray, dev_buffer1); cudaDeviceSynchronize(); @@ -148,10 +137,10 @@ namespace StreamCompaction { // Index that last element in idata would have, if it was valid int lastIndex; - cudaMemcpy(&lastIndex, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastIndex, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); // Check if last element is valid int lastBool; - cudaMemcpy(&lastBool, &dev_boolArray[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastBool, dev_boolArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost); timer().endGpuTimer(); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index eb2ba1b..f811076 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -13,7 +13,7 @@ namespace StreamCompaction { return timer; } // TODO: __global__ - __global__ void handleNonPower(int n, int d, int* buffer) { + __global__ void kernHandleNonPower(int n, int d, int* buffer) { int index = threadIdx.x + blockIdx.x * blockDim.x; int pow2tod = 1 << d; @@ -22,7 +22,7 @@ namespace StreamCompaction { buffer[pow2tod + index] += buffer[index]; } - __global__ void naiveScanStep(int n, int d, const int* readBuffer, int* writeBuffer) { + __global__ void kernNaiveScanStep(int n, int d, const int* readBuffer, int* writeBuffer) { // compute thread index int index = threadIdx.x + blockIdx.x * blockDim.x; if (index >= n) return; @@ -58,9 +58,9 @@ namespace StreamCompaction { StreamCompaction::Common::shiftArrayElements<<>>(n, 1, dev_buffer2, dev_buffer1); checkCUDAError("shiftArrayElements failed!"); cudaDeviceSynchronize(); - // TODO + for (int d = 0; d < ilog2(n); ++d) { - naiveScanStep<<>>(n, d, dev_buffer1, dev_buffer2); + kernNaiveScanStep <<>>(n, d, dev_buffer1, dev_buffer2); checkCUDAError("naiveScanStep failed!"); cudaDeviceSynchronize(); @@ -69,7 +69,7 @@ namespace StreamCompaction { // perform last step if ((1 << ilog2(n)) != n) { fullBlocksPerGrid.x = (n - (1 << ilog2(n)) + blockSize - 1) / blockSize; - handleNonPower<<>>(n, ilog2(n), dev_buffer1); + kernHandleNonPower<<>>(n, ilog2(n), dev_buffer1); checkCUDAError("handleNonPower failed!"); cudaDeviceSynchronize(); } From 8109858318780483251230b4a1b95f87b010fb00 Mon Sep 17 00:00:00 2001 From: DomIno0o <51756125+DomIno0o@users.noreply.github.com> Date: Wed, 18 Sep 2024 23:09:31 -0400 Subject: [PATCH 07/10] Add files via upload --- img/performance_compact.svg | 1379 +++++++++++++++++++++++++++++++++++ img/performance_nonpow2.svg | 1342 ++++++++++++++++++++++++++++++++++ img/performance_pow2.svg | 1328 +++++++++++++++++++++++++++++++++ 3 files changed, 4049 insertions(+) create mode 100644 img/performance_compact.svg create mode 100644 img/performance_nonpow2.svg create mode 100644 img/performance_pow2.svg diff --git a/img/performance_compact.svg b/img/performance_compact.svg new file mode 100644 index 0000000..28efc4a --- /dev/null +++ b/img/performance_compact.svg @@ -0,0 +1,1379 @@ + + + + + + + + 2024-09-18T22:27:00.267128 + image/svg+xml + + + Matplotlib v3.9.2, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/img/performance_nonpow2.svg b/img/performance_nonpow2.svg new file mode 100644 index 0000000..26a93fc --- /dev/null +++ b/img/performance_nonpow2.svg @@ -0,0 +1,1342 @@ + + + + + + + + 2024-09-18T22:27:00.060256 + image/svg+xml + + + Matplotlib v3.9.2, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/img/performance_pow2.svg b/img/performance_pow2.svg new file mode 100644 index 0000000..35c2c07 --- /dev/null +++ b/img/performance_pow2.svg @@ -0,0 +1,1328 @@ + + + + + + + + 2024-09-18T22:26:59.849723 + image/svg+xml + + + Matplotlib v3.9.2, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 1dce29ca8fb9334a9da978f0f2fbaf1d39e2f1d2 Mon Sep 17 00:00:00 2001 From: DomIno0o <51756125+DomIno0o@users.noreply.github.com> Date: Wed, 18 Sep 2024 23:34:19 -0400 Subject: [PATCH 08/10] Initial update of README.md --- README.md | 119 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 110 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 0e38ddb..2908c8f 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,115 @@ -CUDA Stream Compaction -====================== +Project 2 Stream Compaction +=========================== -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +**University of Pennsylvania, CIS 5650: GPU Programming and Architecture** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Dominik Kau ([LinkedIn](https://www.linkedin.com/in/dominikkau/)) +* Tested on: Windows 10, i7-12700 @ 2.10 GHz, 32 GB, T1000 4096 MB (CETS machine) -### (TODO: Your README) +## Scan and Stream Compaction -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +Scan is an algorithm that outputs an array in which all previous elemnts of the input array have been summed up (https://en.wikipedia.org/wiki/Prefix_sum). +Stream compaction outputs an array that contains only those elements of the input that fulfill some predicate (in this project being non-zero). +The parallel implementation of stream compaction in this project will make use of the scan algorithm. +This project contains 4 implementations of the scan algorithm: +* straight-forward implementation on the CPU (CPU) +* naive, parallel implementation on the GPU (GPU naive) +* work-efficient implementation on the GPU (GPU efficient) +* implementation using the thrust library (GPU thrust) + +The stream compatction algorithm is implememented in 3 variants: +* straight-forward implementation on the CPU (CPU) +* scan based implementation on the CPU +* work-efficient scan based implementation on the GPU (GPU) + +These cases are labeled in the following graphs by the identifiers given in parantheses. +All implementations were tested on array lengths that are powers of 2 and on array lengths that are not powers of 2. + +## Performance Analysis + +### Scan timings on arrays with sizes that are powers of 2 + +![](img/performance_pow2.svg) + +Surprisingly, the CPU implementation is quicker than both of my GPU implementations. +This is despite the fact, that for both GPU implementations, I am launching kernels only with as many threads as are needed (up to the block size). +This prevents starting many threads that will not be active after a first if-condition. +As far as I understand this is what Part 5, Extra Credit is alluding to. +This result might be due to the usage of a CETS machine. + +Unfortunately, I haven't been able to make a detailed performance analysis using the Nsight suite, but obviously the memory access pattern is suboptimal. +Firstly, using global memory will lead to high latencies in each kernel call. +Secondly the strided indexing pattern in both GPU implementations has the same performance as random access of memory which creates a big bottleneck. + +Those bottlenecks are not present in the thrust implementation which is by far the fastest. + +### Scan timings on arrays with sizes that are not powers of 2 + +![](img/performance_nonpow2.svg) + +The performance on arrays with non-powers of 2 lengths reflects the same trends as the results above. +Interestingly, for smaller arrays the thrust implementation is quite a bit quicker in this scenario than in the above case of arrays with lengths of power 2. + +### Timing results of compacting algorithm + +![](img/performance_compact.svg) + +For the compacting algorithm, the GPU implementation is actually faster for large array sizes. +Here, only the straight-forward CPU implementation is used, as it is faster than the scan based algorithm. +I would have expected bigeer differences between the two GPU and the CPU implementation, but again, the surprisingly short CPU runtime could stem from the fact that I am using a CETS machine. +There is not a big difference between the arrays with power of 2 lengths and those with non-power of 2 lengths. + + +## Console Output + +This is the console ouput after running the project with an array size of $2^{20} = 1048576$. +I removed the numeric outputs for clarity. + +``` +**************** +** SCAN TESTS ** +**************** +==== cpu scan, power-of-two ==== + elapsed time: 0.4223ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 0.3906ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 2.48595ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 2.34307ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 1.90022ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 1.82384ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.503264ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.183776ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** +==== cpu compact without scan, power-of-two ==== + elapsed time: 2.1741ms (std::chrono Measured) + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 2.1809ms (std::chrono Measured) + passed +==== cpu compact with scan ==== + elapsed time: 3.3442ms (std::chrono Measured) + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 3.5247ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 3.73046ms (CUDA Measured) + passed +``` From a9244875852d6db4864dd412d74a8dd8b85b1332 Mon Sep 17 00:00:00 2001 From: DomIno0o <51756125+DomIno0o@users.noreply.github.com> Date: Wed, 18 Sep 2024 23:43:42 -0400 Subject: [PATCH 09/10] Update README.md --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2908c8f..d2ee079 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Project 2 Stream Compaction ## Scan and Stream Compaction -Scan is an algorithm that outputs an array in which all previous elemnts of the input array have been summed up (https://en.wikipedia.org/wiki/Prefix_sum). +Scan is an algorithm that outputs an array in which all previous elements of the input array have been summed up (https://en.wikipedia.org/wiki/Prefix_sum). Stream compaction outputs an array that contains only those elements of the input that fulfill some predicate (in this project being non-zero). The parallel implementation of stream compaction in this project will make use of the scan algorithm. @@ -18,13 +18,14 @@ This project contains 4 implementations of the scan algorithm: * work-efficient implementation on the GPU (GPU efficient) * implementation using the thrust library (GPU thrust) -The stream compatction algorithm is implememented in 3 variants: +The stream compaction algorithm is implemented in 3 variants: * straight-forward implementation on the CPU (CPU) * scan based implementation on the CPU * work-efficient scan based implementation on the GPU (GPU) -These cases are labeled in the following graphs by the identifiers given in parantheses. +These cases are labeled in the following graphs by the identifiers given in parentheses. All implementations were tested on array lengths that are powers of 2 and on array lengths that are not powers of 2. +As far as I understand Part 5, Extra Credit, I implemented the alluded optimization by starting only as many threads as are needed (up to the block size) in every round of the algorithm. ## Performance Analysis @@ -35,7 +36,6 @@ All implementations were tested on array lengths that are powers of 2 and on arr Surprisingly, the CPU implementation is quicker than both of my GPU implementations. This is despite the fact, that for both GPU implementations, I am launching kernels only with as many threads as are needed (up to the block size). This prevents starting many threads that will not be active after a first if-condition. -As far as I understand this is what Part 5, Extra Credit is alluding to. This result might be due to the usage of a CETS machine. Unfortunately, I haven't been able to make a detailed performance analysis using the Nsight suite, but obviously the memory access pattern is suboptimal. @@ -57,13 +57,13 @@ Interestingly, for smaller arrays the thrust implementation is quite a bit quick For the compacting algorithm, the GPU implementation is actually faster for large array sizes. Here, only the straight-forward CPU implementation is used, as it is faster than the scan based algorithm. -I would have expected bigeer differences between the two GPU and the CPU implementation, but again, the surprisingly short CPU runtime could stem from the fact that I am using a CETS machine. +I would have expected bigger differences between the two GPU and the CPU implementation, but again, the surprisingly short CPU runtime could stem from the fact that I am using a CETS machine. There is not a big difference between the arrays with power of 2 lengths and those with non-power of 2 lengths. ## Console Output -This is the console ouput after running the project with an array size of $2^{20} = 1048576$. +This is the console output after running the project with an array size of $2^{20} = 1048576$. I removed the numeric outputs for clarity. ``` From 7a541a69af34480cc14723f9d29faca5b463f698 Mon Sep 17 00:00:00 2001 From: Kau Date: Wed, 18 Sep 2024 23:53:03 -0400 Subject: [PATCH 10/10] Minor code cleanup --- stream_compaction/cpu.cu | 2 +- stream_compaction/efficient.cu | 18 ++++++------------ stream_compaction/naive.cu | 4 ++-- stream_compaction/thrust.cu | 3 --- 4 files changed, 9 insertions(+), 18 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 956cced..044d37b 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,7 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata, bool timed) { if (timed) timer().startCpuTimer(); - // TODO + // TODO int partialSum = 0; for (int i = 0; i < n; ++i) { odata[i] = partialSum; diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 9c84ae3..89427ea 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -37,14 +37,6 @@ namespace StreamCompaction { buffer[index + pow2todp1 - 1] += tmp; } - __global__ void kernZeroPadding(int n, int d, int* buffer) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - - if (index >= 1 << (d + 1) - n) return; - - buffer[n + index] = 0; - } - dim3 computeBlocksPerGrid(int threads, int blockSize) { return dim3{ (unsigned int)(threads + blockSize - 1) / blockSize }; } @@ -53,7 +45,7 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata, bool timed) { - int blockSize = 64; + int blockSize = 128; bool isPower2Length = (n == (1 << ilog2(n))); @@ -74,8 +66,8 @@ namespace StreamCompaction { for (int d = 0; d < ilog2ceil(n); ++d) { dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize); kernUpSweep<<>>(bufferLength, 1 << d, dev_tmpArray); - checkCUDAError("kernUpSweep failed!"); cudaDeviceSynchronize(); + checkCUDAError("kernUpSweep failed!"); } cudaMemset(dev_tmpArray + bufferLength - 1, 0, sizeof(int)); @@ -83,8 +75,8 @@ namespace StreamCompaction { for (int d = ilog2ceil(n) - 1; d >= 0; --d) { dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize); kernDownSweep<<>>(bufferLength, 1 << d, dev_tmpArray); - checkCUDAError("kernDownSweep failed!"); cudaDeviceSynchronize(); + checkCUDAError("kernDownSweep failed!"); } if (timed) timer().endGpuTimer(); @@ -120,6 +112,7 @@ namespace StreamCompaction { checkCUDAError("cudaMalloc dev_buffer2 failed!"); cudaMemcpy(dev_buffer1, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy idata->dev_buffer1 failed!"); timer().startGpuTimer(); @@ -130,10 +123,11 @@ namespace StreamCompaction { scan(n, dev_indices, dev_boolArray, 0); StreamCompaction::Common::kernScatter<<>>(n, dev_buffer2, dev_buffer1, dev_boolArray, dev_indices); - checkCUDAError("kernScatter failed!"); cudaDeviceSynchronize(); + checkCUDAError("kernScatter failed!"); cudaMemcpy(odata, dev_buffer2, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dev_buffer2->odata failed!"); // Index that last element in idata would have, if it was valid int lastIndex; diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index f811076..b7c18cd 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -57,12 +57,12 @@ namespace StreamCompaction { timer().startGpuTimer(); StreamCompaction::Common::shiftArrayElements<<>>(n, 1, dev_buffer2, dev_buffer1); checkCUDAError("shiftArrayElements failed!"); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); for (int d = 0; d < ilog2(n); ++d) { kernNaiveScanStep <<>>(n, d, dev_buffer1, dev_buffer2); checkCUDAError("naiveScanStep failed!"); - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); std::swap(dev_buffer1, dev_buffer2); } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 463ca2e..32a3a73 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -27,9 +27,6 @@ namespace StreamCompaction { cudaMemcpy(dev_buffer, idata, n * sizeof(int), cudaMemcpyHostToDevice); timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); thrust::exclusive_scan(dev_thrustBuffer, dev_thrustBuffer + n, dev_thrustBuffer); timer().endGpuTimer();