diff --git a/README.md b/README.md index 0e38ddb..9e0d072 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,75 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Logan Cho + * [LinkedIn](https://www.linkedin.com/in/logan-cho/) + * [Personal Website](https://www.logancho.com/) +* Tested on: Windows 11, 13th Gen Intel(R) Core(TM) i7-13700H, 2.40 GHz, RTX 4060 Laptop GPU -### (TODO: Your README) +# Sample Output +``` +**************** +** SCAN TESTS ** +**************** + [ 48 27 47 0 26 20 5 27 18 47 10 9 22 ... 35 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0004ms (std::chrono Measured) + [ 0 48 75 122 122 148 168 173 200 218 265 275 284 ... 6629 6664 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0004ms (std::chrono Measured) + [ 0 48 75 122 122 148 168 173 200 218 265 275 284 ... 6562 6595 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.08192ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.02048ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.191232ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.137216ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.071904ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.04608ms (CUDA Measured) + passed -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 3 3 0 0 2 1 1 0 3 2 1 2 ... 
 1 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0006ms (std::chrono Measured) + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0003ms (std::chrono Measured) + passed +==== cpu compact with scan ==== + elapsed time: 0.001ms (std::chrono Measured) + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.188416ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.084992ms (CUDA Measured) + passed +``` +# Charts +![](images/Chart1.png) +![](images/Chart2.png) + +# In-Depth Performance Analysis + * CUDA Block Size Optimization + * Through trial and error, I narrowed down the optimal block sizes for each of my implementations of scan/compaction. + * Naive: (256, 1, 1) + * Efficient: (256, 1, 1) + * Comparison of all Scan Implementations (CPU, GPU (Naive, Efficient, Thrust)): + * ![](images/Chart1.png) + * This chart plots the runtime of the different scan implementations across different input array sizes. We can conclude that the GPU methods have a significant performance advantage over the serial CPU method for array sizes larger than 2 to the power of 14. + * ![](images/Chart2.png) + * We can also see from this additional chart that, among the GPU methods, Thrust is the most performant by far. We also see that the work-efficient scan outpaces the naive scan once the size of the array exceeds 2 to the power of 20. 
diff --git a/images/Chart1.png b/images/Chart1.png new file mode 100644 index 0000000..49d6b2d Binary files /dev/null and b/images/Chart1.png differ diff --git a/images/Chart2.png b/images/Chart2.png new file mode 100644 index 0000000..aa9d4a0 Binary files /dev/null and b/images/Chart2.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..fc06200 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 22; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -54,11 +54,12 @@ int main(int argc, char* argv[]) { //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ + + //For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + //onesArray(SIZE, c); + //printDesc("1s array for finding bugs"); + //StreamCompaction::Naive::scan(SIZE, c, a); + //printArray(SIZE, c, true); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); @@ -67,6 +68,7 @@ int main(int argc, char* argv[]) { //printArray(SIZE, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); @@ -115,7 +117,7 @@ int main(int argc, char* argv[]) { count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); expectedCount = count; - printArray(count, b, true); + //printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); zeroArray(SIZE, c); @@ -123,14 +125,14 @@ int main(int argc, char* argv[]) { 
count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); expectedNPOT = count; - printArray(count, c, true); + //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); zeroArray(SIZE, c); printDesc("cpu compact with scan"); count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); + //printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..eb1daf2 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -12,6 +12,7 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define blockSize 128 /** * Check for CUDA errors; print and exit if there was a problem. diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..64b932d 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,10 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } timer().endCpuTimer(); } @@ -31,8 +35,16 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int index = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[index] = idata[i]; + index++; + } + } + timer().endCpuTimer(); - return -1; + return index; } /** @@ -41,10 +53,45 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. 
*/ int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); // TODO + int* bitArray = new int[n]; + int* scanBitArray = new int[n]; + + // 1. Populate scanBitArray + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + bitArray[i] = 1; + } + else { + bitArray[i] = 0; + } + } + + + // + timer().startCpuTimer(); + + { + //1.5 scan + scanBitArray[0] = 0; + for (int i = 1; i < n; i++) { + scanBitArray[i] = scanBitArray[i - 1] + bitArray[i - 1]; + } + //scan(n, scanBitArray, bitArray); + } + + + // 2. Scatter + int numElem = scanBitArray[n - 1]; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[scanBitArray[i]] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + delete[] bitArray; + delete[] scanBitArray; + return numElem; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..6ca7328 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,13 +12,90 @@ namespace StreamCompaction { return timer; } + __global__ void UpSweepAtDepthD(int n, int offset1, int offset2, int* buffer) { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + int offsetBetweenMains = offset1; + int actualOffset = offset2; + + int index = (k + 1) * offsetBetweenMains - 1; + if (k < n / offsetBetweenMains) { + buffer[index] += buffer[index - actualOffset]; + } + } + + __global__ void DownSweepAtDepthD(int n, int offset1, int offset2, int* buffer) { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + int offsetBetweenMains = offset1; + int actualOffset = offset2; + + int index = (k + 1) * offsetBetweenMains - 1; + if (k < n / offsetBetweenMains) { + //left child index + int leftChildIndex = index - actualOffset; + int rightChildIndex = index; + int leftChildSave = buffer[leftChildIndex]; + buffer[leftChildIndex] = buffer[index]; + buffer[rightChildIndex] = leftChildSave + buffer[index]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + int* buffer; + int size = pow(2, ilog2ceil(n)); + + cudaMalloc((void**)&buffer, size * sizeof(int)); + cudaMemset(buffer, 0, size * sizeof(int)); + cudaMemcpy(buffer, idata, n * sizeof(int), cudaMemcpyHostToDevice); timer().startGpuTimer(); // TODO + + //UpSweep (parallel reduction) + for (int d = 0; d < ilog2ceil(size); d++) { + dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); + int offsetBetweenMains = 1 << (d + 1); + int actualOffset = 1 << d; + UpSweepAtDepthD<<>>(size, offsetBetweenMains, actualOffset, buffer); + } + + //DownSweep + cudaMemset(buffer + (size - 1), 0, sizeof(int)); + for (int d = ilog2ceil(n) - 1; d >= 0; d--) { + dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); + int offsetBetweenMains = 1 << (d + 1); + int actualOffset = 1 << d; + DownSweepAtDepthD<<>>(size, offsetBetweenMains, actualOffset, buffer); + checkCUDAError("DownSweepAtDepthD failed!"); + } + timer().endGpuTimer(); + //Cpy data back to CPU (only need first n ints! We dont care about the extension anymore) + cudaMemcpy(odata, buffer, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(buffer); + } + + + + __global__ void KernInitializeBitArray(int n, int* bitArray) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + if (bitArray[index] == 0) { + bitArray[index] = 0; + } + else { + bitArray[index] = 1; + } + } + } + + __global__ void KernScatter(int n, int* idata, int* scan, int* output) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + if (idata[index] != 0) { + output[scan[index]] = idata[index]; + } + } } /** @@ -31,10 +108,82 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { + int* numElem = new int; + *numElem = -1; + int* bitArray; + int* idataCpy; + int* deviceOutput; + + cudaMalloc((void**)&bitArray, n * sizeof(int)); + cudaMalloc((void**)&idataCpy, n * sizeof(int)); + cudaMalloc((void**)&deviceOutput, n * sizeof(int)); + cudaMemset(deviceOutput, 0, n * sizeof(int)); + cudaMemcpy(bitArray, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(idataCpy, idata, n * sizeof(int), cudaMemcpyHostToDevice); + { + //PT1: BITARRAY + // turn buffer1 into a bitarray + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + KernInitializeBitArray<<>>(n, bitArray); + } + + int* buffer; + int size = pow(2, ilog2ceil(n)); + { + //PT2: Scan Initialization + //buffer is a pow2 size buffer. We copy over buffer1 to this buffer to set it up. + cudaMalloc((void**)&buffer, size * sizeof(int)); + cudaMemset(buffer, 0, size * sizeof(int)); + cudaMemcpy(buffer, bitArray, n * sizeof(int), cudaMemcpyDeviceToDevice); + } + timer().startGpuTimer(); - // TODO + //TODO + { + //PT3: SCAN + // Store scan result in buffer2 + //UpSweep (parallel reduction) + for (int d = 0; d < ilog2ceil(size); d++) { + dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); + int offsetBetweenMains = 1 << (d + 1); + int actualOffset = 1 << d; + UpSweepAtDepthD<<>>(size, offsetBetweenMains, actualOffset, buffer); + } + + //DownSweep + cudaMemset(buffer + (size - 1), 0, sizeof(int)); + for (int d = ilog2ceil(n) - 1; d >= 0; d--) { + dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); + int offsetBetweenMains = 1 << (d + 1); + int actualOffset = 1 << d; + DownSweepAtDepthD<<>>(size, offsetBetweenMains, actualOffset, buffer); + checkCUDAError("DownSweepAtDepthD failed!"); + } + } + { + //PT4: SCATTER + // bitarray: stores bitarray + // buffer: first n elements store scan + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + KernScatter<<>>(n, idataCpy, buffer, deviceOutput); + } + timer().endGpuTimer(); 
- return -1; + + cudaMemcpy(odata, deviceOutput, n * sizeof(int), cudaMemcpyDeviceToHost); + //Access final element of scan for numElem + cudaMemcpy(numElem, buffer + size - 1, sizeof(int), cudaMemcpyDeviceToHost); + //FREEING + { + //compaction free + cudaFree(buffer); + cudaFree(bitArray); + cudaFree(idataCpy); + cudaFree(deviceOutput); + } + + return *numElem; + } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..8d17aa6 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,15 +11,44 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ + __global__ void KernScanAtDepthD(int n, int offset, int* iBuffer, int* oBuffer) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + if (index >= offset) { + oBuffer[index] = iBuffer[index] + iBuffer[index - offset]; + } + else { + oBuffer[index] = iBuffer[index]; + } + + } + } + void scan(int n, int *odata, const int *idata) { + int* buffer1; + int* buffer2; + cudaMalloc((void**)&buffer1, n * sizeof(int)); + cudaMalloc((void**)&buffer2, n * sizeof(int)); + cudaMemcpy(buffer2, idata, n * sizeof(int), cudaMemcpyHostToDevice); timer().startGpuTimer(); // TODO + for (int d = 1; d <= ilog2ceil(n); d++) { + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + KernScanAtDepthD<<>>(n, (1 << d - 1), buffer2, buffer1); + if (d != ilog2ceil(n)) { + std::swap(buffer2, buffer1); + } + + checkCUDAError("KernScanAtDepthD failed!"); + } + timer().endGpuTimer(); + //Exclusive shift + copying correct buffer over based on parity + odata[0] = 0; + cudaMemcpy(odata + 1, buffer1, (n-1) * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(buffer1); + cudaFree(buffer2); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..cb4018f 100644 --- a/stream_compaction/thrust.cu +++ 
b/stream_compaction/thrust.cu @@ -18,11 +18,20 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + + thrust::device_vector dv_in(idata, idata + n); //str, end + thrust::device_vector dv_out(n); + timer().startGpuTimer(); + + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + timer().endGpuTimer(); + + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } }