diff --git a/README.md b/README.md index 0e38ddb..d2ee079 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,115 @@ -CUDA Stream Compaction -====================== +Project 2 Stream Compaction +=========================== -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +**University of Pennsylvania, CIS 5650: GPU Programming and Architecture** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Dominik Kau ([LinkedIn](https://www.linkedin.com/in/dominikkau/)) +* Tested on: Windows 10, i7-12700 @ 2.10 GHz, 32 GB, T1000 4096 MB (CETS machine) -### (TODO: Your README) +## Scan and Stream Compaction -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +Scan is an algorithm that outputs an array in which each element is the sum of all preceding elements of the input array (https://en.wikipedia.org/wiki/Prefix_sum). +Stream compaction outputs an array that contains only those elements of the input that fulfill some predicate (in this project: being non-zero). +The parallel implementation of stream compaction in this project makes use of the scan algorithm. +This project contains 4 implementations of the scan algorithm: +* straightforward implementation on the CPU (CPU) +* naive, parallel implementation on the GPU (GPU naive) +* work-efficient implementation on the GPU (GPU efficient) +* implementation using the thrust library (GPU thrust) + +The stream compaction algorithm is implemented in 3 variants: +* straightforward implementation on the CPU (CPU) +* scan-based implementation on the CPU +* work-efficient scan-based implementation on the GPU (GPU) + +These cases are labeled in the following graphs by the identifiers given in parentheses. 
+All implementations were tested on array lengths that are powers of 2 and on array lengths that are not powers of 2. +As far as I understand Part 5, Extra Credit, I implemented the optimization alluded to there by starting only as many threads as are needed (up to the block size) in every round of the algorithm. + +## Performance Analysis + +### Scan timings on arrays with sizes that are powers of 2 + +![](img/performance_pow2.svg) + +Surprisingly, the CPU implementation is quicker than both of my GPU implementations. +This is despite the fact that, for both GPU implementations, I am launching kernels only with as many threads as are needed (up to the block size). +This prevents starting many threads that will not be active after a first if-condition. +This result might be due to the usage of a CETS machine. + +Unfortunately, I haven't been able to make a detailed performance analysis using the Nsight suite, but the memory access pattern is clearly suboptimal. +Firstly, using global memory will lead to high latencies in each kernel call. +Secondly, the strided indexing pattern in both GPU implementations has the same performance as random access of memory, which creates a big bottleneck. + +Those bottlenecks are not present in the thrust implementation, which is by far the fastest. + +### Scan timings on arrays with sizes that are not powers of 2 + +![](img/performance_nonpow2.svg) + +The performance on arrays with non-power-of-2 lengths reflects the same trends as the results above. +Interestingly, for smaller arrays the thrust implementation is quite a bit quicker in this scenario than in the above case of arrays with power-of-2 lengths. + +### Timing results of compacting algorithm + +![](img/performance_compact.svg) + +For the compacting algorithm, the GPU implementation is actually faster for large array sizes. +Here, only the straightforward CPU implementation is used, as it is faster than the scan-based algorithm. 
+I would have expected bigger differences between the two GPU implementations and the CPU implementation, but again, the surprisingly short CPU runtime could stem from the fact that I am using a CETS machine. +There is not a big difference between the arrays with power-of-2 lengths and those with non-power-of-2 lengths. + + +## Console Output + +This is the console output after running the project with an array size of $2^{20} = 1048576$. +I removed the numeric outputs for clarity. + +``` +**************** +** SCAN TESTS ** +**************** +==== cpu scan, power-of-two ==== + elapsed time: 0.4223ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 0.3906ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 2.48595ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 2.34307ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 1.90022ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 1.82384ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.503264ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.183776ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** +==== cpu compact without scan, power-of-two ==== + elapsed time: 2.1741ms (std::chrono Measured) + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 2.1809ms (std::chrono Measured) + passed +==== cpu compact with scan ==== + elapsed time: 3.3442ms (std::chrono Measured) + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 3.5247ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 3.73046ms (CUDA Measured) + passed +``` diff --git a/img/performance_compact.svg b/img/performance_compact.svg new file mode 100644 index 0000000..28efc4a --- 
/dev/null +++ b/img/performance_compact.svg @@ -0,0 +1,1379 @@ + + + + + + + + 2024-09-18T22:27:00.267128 + image/svg+xml + + + Matplotlib v3.9.2, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/img/performance_nonpow2.svg b/img/performance_nonpow2.svg new file mode 100644 index 0000000..26a93fc --- /dev/null +++ b/img/performance_nonpow2.svg @@ -0,0 +1,1342 @@ + + + + + + + + 2024-09-18T22:27:00.060256 + image/svg+xml + + + Matplotlib v3.9.2, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/img/performance_pow2.svg b/img/performance_pow2.svg new file mode 100644 index 0000000..35c2c07 --- /dev/null +++ b/img/performance_pow2.svg @@ -0,0 +1,1328 @@ + + + + + + + + 2024-09-18T22:26:59.849723 + image/svg+xml + + + Matplotlib v3.9.2, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..ca51934 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 20; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -51,48 +51,48 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + /*For bug - finding only : Array of 1s to help find bugs in stream compaction or scan onesArray(SIZE, c); printDesc("1s array for finding bugs"); StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ + printArray(SIZE, c, true);*/ zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); 
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -137,14 +137,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.cu 
b/stream_compaction/common.cu index 2ed6d63..12c8acf 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,4 +1,5 @@ #include "common.h" +#include "device_launch_parameters.h" void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); @@ -17,22 +18,41 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) { namespace StreamCompaction { namespace Common { + __global__ void shiftArrayElements(int n, int shift, const int* readBuffer, int* writeBuffer) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) return; + if (index < shift) { + writeBuffer[index] = 0; + return; + } + + writeBuffer[index] = readBuffer[index - shift]; + } /** * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. */ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { + __global__ void kernMapToBoolean(int n, int* bools, const int* idata) { // TODO - } + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) return; + bools[index] = (idata[index] != 0); + + } + /** * Performs scatter on an array. That is, for each element in idata, * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. 
*/ - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { + __global__ void kernScatter(int n, int* odata, + const int* idata, const int* bools, const int* indices) { // TODO + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) return; + + if (bools[index]) odata[indices[index]] = idata[index]; } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..cfdc45b 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -37,6 +37,9 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices); + __global__ void shiftArrayElements(int n, int shift, + const int* readBuffer, int* writeBuffer); + /** * This class is used for timing the performance * Uncopyable and unmovable diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..044d37b 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -17,10 +17,15 @@ namespace StreamCompaction { * For performance analysis, this is supposed to be a simple for loop. * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. 
*/ - void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); + void scan(int n, int *odata, const int *idata, bool timed) { + if (timed) timer().startCpuTimer(); + // TODO + int partialSum = 0; + for (int i = 0; i < n; ++i) { + odata[i] = partialSum; + partialSum += idata[i]; + } + if (timed) timer().endCpuTimer(); } /** @@ -30,9 +35,20 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int numElements = 0; + for (int i = 0; i < n; ++i) { + if (idata[i]) odata[numElements++] = idata[i]; + } timer().endCpuTimer(); - return -1; + return numElements; + } + + int scatter(int n, int* odata, const int* bdata, const int* idata) { + int numElements = 0; + for (int i = 0; i < n; ++i) { + if (bdata[i]) odata[numElements++] = idata[i]; + } + return numElements; } /** @@ -41,10 +57,20 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. 
*/ int compactWithScan(int n, int *odata, const int *idata) { + int* buffer = new int[n]; timer().startCpuTimer(); - // TODO + // Create boolean mask + for (int i = 0; i < n; ++i) { + buffer[i] = (idata[i] != 0); + } + + scan(n, odata, idata, 0); + + int numElements = scatter(n, odata, buffer, idata); + timer().endCpuTimer(); - return -1; + delete[] buffer; + return numElements; } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c047..f2f8c14 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -6,7 +6,7 @@ namespace StreamCompaction { namespace CPU { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, bool timed = 1); int compactWithoutScan(int n, int *odata, const int *idata); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..89427ea 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,7 @@ #include #include "common.h" #include "efficient.h" +#include "device_launch_parameters.h" namespace StreamCompaction { namespace Efficient { @@ -12,13 +13,76 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpSweep(int n, int pow2tod, int* buffer) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + int pow2todp1 = 2 * pow2tod; + + if (index > n / pow2todp1 - 1) return; + index *= pow2todp1; + + buffer[index + pow2todp1 - 1] += buffer[index + pow2tod - 1]; + } + + __global__ void kernDownSweep(int n, int pow2tod, int* buffer) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + int pow2todp1 = 2 * pow2tod; + + if (index > n / pow2todp1 - 1) return; + index *= pow2todp1; + + int tmp = buffer[index + pow2tod - 1]; + buffer[index + pow2tod - 1] = buffer[index + pow2todp1 - 1]; + buffer[index + pow2todp1 - 1] += tmp; + } + + dim3 computeBlocksPerGrid(int threads, int blockSize) { + return dim3{ (unsigned int)(threads + 
blockSize - 1) / blockSize }; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + void scan(int n, int *odata, const int *idata, bool timed) { + int blockSize = 128; + + bool isPower2Length = (n == (1 << ilog2(n))); + + int bufferLength = (isPower2Length) ? n : 1 << ilog2ceil(n); + + int* dev_tmpArray; + cudaMalloc((void**)&dev_tmpArray, bufferLength * sizeof(int)); + checkCUDAError("cudaMalloc tmpArray failed!"); + + if (!isPower2Length) { + cudaMemset(dev_tmpArray + n, 0, (bufferLength - n) * sizeof(int)); + } + + cudaMemcpy(dev_tmpArray, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + if (timed) timer().startGpuTimer(); // TODO - timer().endGpuTimer(); + for (int d = 0; d < ilog2ceil(n); ++d) { + dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize); + kernUpSweep<<>>(bufferLength, 1 << d, dev_tmpArray); + cudaDeviceSynchronize(); + checkCUDAError("kernUpSweep failed!"); + } + + cudaMemset(dev_tmpArray + bufferLength - 1, 0, sizeof(int)); + + for (int d = ilog2ceil(n) - 1; d >= 0; --d) { + dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize); + kernDownSweep<<>>(bufferLength, 1 << d, dev_tmpArray); + cudaDeviceSynchronize(); + checkCUDAError("kernDownSweep failed!"); + } + if (timed) timer().endGpuTimer(); + + cudaMemcpy(odata, dev_tmpArray, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_tmpArray); } /** @@ -30,11 +94,55 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. 
*/ - int compact(int n, int *odata, const int *idata) { + int compact(int n, int* odata, const int* idata) { + int blockSize = 128; + dim3 blocks{ (unsigned int)(n + blockSize - 1) / blockSize }; + + int* dev_buffer1; + int* dev_buffer2; + int* dev_boolArray; + int* dev_indices; + cudaMalloc((void**)&dev_boolArray, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_boolArray failed!"); + cudaMalloc((void**)&dev_indices, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_indices failed!"); + cudaMalloc((void**)&dev_buffer1, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_buffer1 failed!"); + cudaMalloc((void**)&dev_buffer2, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_buffer2 failed!"); + + cudaMemcpy(dev_buffer1, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy idata->dev_buffer1 failed!"); + timer().startGpuTimer(); - // TODO + + StreamCompaction::Common::kernMapToBoolean<<>>(n, dev_boolArray, dev_buffer1); + cudaDeviceSynchronize(); + checkCUDAError("kernMapToBoolean failed!"); + + scan(n, dev_indices, dev_boolArray, 0); + + StreamCompaction::Common::kernScatter<<>>(n, dev_buffer2, dev_buffer1, dev_boolArray, dev_indices); + cudaDeviceSynchronize(); + checkCUDAError("kernScatter failed!"); + + cudaMemcpy(odata, dev_buffer2, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dev_buffer2->odata failed!"); + + // Index that last element in idata would have, if it was valid + int lastIndex; + cudaMemcpy(&lastIndex, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + // Check if last element is valid + int lastBool; + cudaMemcpy(&lastBool, dev_boolArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + timer().endGpuTimer(); - return -1; + + cudaFree(dev_boolArray); + cudaFree(dev_indices); + cudaFree(dev_buffer1); + cudaFree(dev_buffer2); + return (lastBool) ? 
lastIndex + 1 : lastIndex; } } -} +} \ No newline at end of file diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..a4455cc 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,7 +6,7 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, bool timed = 1); int compact(int n, int *odata, const int *idata); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..b7c18cd 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,6 +2,7 @@ #include #include "common.h" #include "naive.h" +#include "device_launch_parameters.h" namespace StreamCompaction { namespace Naive { @@ -12,14 +13,72 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void kernHandleNonPower(int n, int d, int* buffer) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + int pow2tod = 1 << d; + + if (index >= n - pow2tod) return; + + buffer[pow2tod + index] += buffer[index]; + } + + __global__ void kernNaiveScanStep(int n, int d, const int* readBuffer, int* writeBuffer) { + // compute thread index + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= n) return; + + int pow2tod = 1 << d; + + if (index > pow2tod) { + writeBuffer[index] = readBuffer[index] + readBuffer[index - pow2tod]; + } + else { + writeBuffer[index] = readBuffer[index]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + int blockSize = 128; + dim3 fullBlocksPerGrid{ (unsigned int) (n + blockSize - 1) / blockSize }; + + int* dev_buffer1; + int* dev_buffer2; + + cudaMalloc((void**)&dev_buffer1, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_buffer1 failed!"); + cudaMalloc((void**)&dev_buffer2, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_buffer2 failed!"); + + cudaMemcpy(dev_buffer2, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + StreamCompaction::Common::shiftArrayElements<<>>(n, 1, dev_buffer2, dev_buffer1); + checkCUDAError("shiftArrayElements failed!"); + //cudaDeviceSynchronize(); + + for (int d = 0; d < ilog2(n); ++d) { + kernNaiveScanStep <<>>(n, d, dev_buffer1, dev_buffer2); + checkCUDAError("naiveScanStep failed!"); + //cudaDeviceSynchronize(); + + std::swap(dev_buffer1, dev_buffer2); + } + // perform last step + if ((1 << ilog2(n)) != n) { + fullBlocksPerGrid.x = (n - (1 << ilog2(n)) + blockSize - 1) / blockSize; + kernHandleNonPower<<>>(n, ilog2(n), dev_buffer1); + checkCUDAError("handleNonPower failed!"); + cudaDeviceSynchronize(); + } timer().endGpuTimer(); + + cudaMemcpy(odata, dev_buffer1, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_buffer1); + cudaFree(dev_buffer2); } } -} +} \ No newline at end of file diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..32a3a73 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,21 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + int* dev_buffer; + thrust::device_ptr dev_thrustBuffer; + cudaMalloc((void**)&dev_buffer, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_buffer failed!"); + dev_thrustBuffer = thrust::device_ptr(dev_buffer); + + cudaMemcpy(dev_buffer, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_thrustBuffer, dev_thrustBuffer + n, dev_thrustBuffer); timer().endGpuTimer(); + + cudaMemcpy(odata, dev_buffer, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_buffer); } } }