diff --git a/README.md b/README.md index 0e38ddb1..0c84e46f 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,203 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +![Stream Compaction Tests](./img/test.png) -### (TODO: Your README) + * Xiaonan Pan + * [LinkedIn](https://www.linkedin.com/in/xiaonan-pan-9b0b0b1a7), [My Blog](www.tsingloo.com), [GitHub](https://github.com/TsingLoo) + * Tested on: + * Windows 11 24H2 + * 13600KF @ 3.5Ghz + * 4070 SUPER 12GB + * 32GB RAM -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +# Overview + +This project provides thoroughly tested CPU and GPU (CUDA) implementations of parallel **scan** (exclusive and inclusive) and **stream compaction** algorithms. Additionally, a CUDA-based **radix sort** is implemented and tested. The CUDA scan implementation specifically explores and contrasts both naive and work-efficient approaches. + + + +# Implementation + +Most of the GPU scan implementations for this project were based on the pseudocode provided in the lecture slides. Throughout the coding process, I found the mental transition between the serial "scheduler's view" and the parallel "thread's view" to be a fresh and critical challenge. For example, the work-efficient up-sweep algorithm is presented from a high-level, scheduler's perspective: + +```cpp +for d = 0 to log_2{n} - 1 + for all k = 0 to n – 1 by 2^(d+1) in parallel + x[k + 2^(d+1) – 1] += x[k + 2^(d) – 1]; +``` + +However, in a massively parallel CUDA kernel, there is no such central scheduler. Instead, thousands of threads awaken simultaneously, each with its own dense, unique ID `k`. 
The core of the implementation is to translate the high-level plan into an independent action for each thread. This is achieved with a filter condition. Once a thread passes a check like `if ((k + 1) % fullStride == 0)`, it has successfully identified itself as one of the active workers from the scheduler's plan. At that moment, the thread's own dense ID `k` becomes the actual memory address for the write operation, as shown in the kernel: +```cpp +__global__ void kernWorkEfficientUpSweep(int paddedN, int n, int d, int* idata) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + + int halfStride = 1 << (d); + int fullStride = halfStride * 2; + + if (k >= n || ((k + 1) % fullStride != 0)) { + return; + } + + // up-sweep + idata[k] += idata[k - halfStride]; + } +``` + +Although further optimizations involve launching a compacted grid where each thread must explicitly calculate its target address, I still find this fundamental transition from a collective plan to individual, self-aware threads to be the most impressive aspect of parallel programming. This is the optimized version: +```cpp +__global__ void kernUpSweep(int n, int d, int* data) { + // k is the ID of the k-th active thread (dense) + int k = blockIdx.x * blockDim.x + threadIdx.x; + + int fullStride = 1 << (d + 1); + int halfStride = 1 << d; + + int global_idx = (k + 1) * fullStride - 1; + + if (global_idx >= n) { + return; + } + + data[global_idx] += data[global_idx - halfStride]; +} +``` + +# Performance Analysis + +![plot](./img/plot.png) + +The x-axis is the input size (N, where size = 2^N) + +The y-axis is the execution time (ms); the lower, the better + +## Explanation + +**The Crossover Point:** For small input sizes (roughly N < 20), the **CPU is significantly faster**. After this point, the GPU implementations become equal to, and then faster than, the CPU. The low performance of my work-efficient GPU implementation was due to excessive IO between the host and device. 
To reuse the `scan` function, intermediate results were copied from the device to the host, and the output of the scan was then copied back to the device. This data "round trip" is a significant performance bottleneck. To avoid this, the code was refactored to create a device-side helper that operates exclusively on device pointers, without these time-consuming data transfers. + +# Extra Credit + +I implemented the Radix Sort and added tests for it. In `main.cpp` it is called by + +```cpp +int *a = new int[SIZE]; +int *c = new int[SIZE]; + +//a is the input data, c is the output data +StreamCompaction::Radix::sort(SIZE, c, a); +``` + +```cpp +printf("\n"); +printf("*****************************\n"); +printf("** RADIX TESTS **\n"); +printf("*****************************\n"); + +genArray(SIZE, a, 1000); // Generate a new array with a larger range of numbers +printArray(SIZE, a, true); + +memcpy(b, a, SIZE * sizeof(int)); // Copy a into b +printDesc("cpu sort (std::sort), power-of-two"); +auto start_time = std::chrono::high_resolution_clock::now(); +std::sort(b, b + SIZE); // Sort b to create the correct reference answer +auto end_time = std::chrono::high_resolution_clock::now(); +std::chrono::duration elapsed = end_time - start_time; +printf(" elapsed time: %.4fms (std::chrono Measured)\n", elapsed.count()); +printArray(SIZE, b, true); + +zeroArray(SIZE, c); +printDesc("radix sort, power-of-two"); +StreamCompaction::Radix::sort(SIZE, c, a); +printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); +printCmpResult(SIZE, b, c); + +memcpy(b, a, NPOT * sizeof(int)); +std::sort(b, b + NPOT); + + +zeroArray(SIZE, c); +printDesc("radix sort, non-power-of-two"); +StreamCompaction::Radix::sort(NPOT, c, a); +printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); +printCmpResult(NPOT, b, c); +``` + + + + + +# Test Output + +`#define BLOCK_SIZE 256`, `const int SIZE = 1 
<< 10;` + + +```cpp +**************** +** SCAN TESTS ** +**************** + [ 45 35 35 0 7 15 46 20 46 47 42 7 18 ... 22 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0007ms (std::chrono Measured) + [ 0 45 80 115 115 122 137 183 203 249 296 338 345 ... 24413 24435 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0006ms (std::chrono Measured) + [ 0 45 80 115 115 122 137 183 203 249 296 338 345 ... 24328 24355 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.238592ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.055296ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.250784ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.11776ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.08192ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.03072ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 3 3 0 1 1 0 0 0 3 2 3 2 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0022ms (std::chrono Measured) + [ 3 3 3 1 1 3 2 3 2 2 3 3 2 ... 2 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0011ms (std::chrono Measured) + [ 3 3 3 1 1 3 2 3 2 2 3 3 2 ... 1 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.0055ms (std::chrono Measured) + [ 3 3 3 1 1 3 2 3 2 2 3 3 2 ... 2 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.284672ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.1536ms (CUDA Measured) + passed + +***************************** +** RADIX TESTS ** +***************************** + [ 795 735 835 0 57 565 396 420 196 647 42 7 118 ... 
122 648 ] +==== cpu sort (std::sort), power-of-two ==== + elapsed time: 0.0299ms (std::chrono Measured) + [ 0 2 2 2 3 5 5 6 6 6 7 8 8 ... 993 994 ] +==== radix sort, power-of-two ==== + elapsed time: 5.88186ms (CUDA Measured) + passed +==== radix sort, non-power-of-two ==== + elapsed time: 5.72314ms (CUDA Measured) + passed +Press any key to continue . . . +``` diff --git a/img/plot.png b/img/plot.png new file mode 100644 index 00000000..00a40fee Binary files /dev/null and b/img/plot.png differ diff --git a/img/test.png b/img/test.png new file mode 100644 index 00000000..a7b38e9f Binary files /dev/null and b/img/test.png differ diff --git a/src/main.cpp b/src/main.cpp index 3d5c8820..d0ec89b6 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,9 +11,12 @@ #include #include #include +#include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +//Xiaonan + +const int SIZE = 1 << 22; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -22,18 +25,20 @@ int *c = new int[SIZE]; int main(int argc, char* argv[]) { // Scan tests - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); + //printf("\n"); + //printf("****************\n"); + //printf("** SCAN TESTS **\n"); + //printf("****************\n"); genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. + //printDesc("The calcualtion is " + paddedN / (1 << (d + 1))); + + //// initialize b using StreamCompaction::CPU::scan you implement + //// We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. 
+ //// At first all cases passed because b && c are all zeroes. zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); StreamCompaction::CPU::scan(SIZE, b, a); @@ -65,6 +70,7 @@ int main(int argc, char* argv[]) { StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(SIZE, c, true); + printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -72,6 +78,9 @@ int main(int argc, char* argv[]) { StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(SIZE, c, true); + printArray(SIZE, b, true); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); zeroArray(SIZE, c); @@ -79,6 +88,10 @@ int main(int argc, char* argv[]) { StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(NPOT, c, true); + + printArray(SIZE, b, true); + printArray(SIZE, c, true); + printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -147,7 +160,41 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); - system("pause"); // stop Win32 console from closing on exit + printf("\n"); + printf("*****************************\n"); + printf("** RADIX TESTS **\n"); + printf("*****************************\n"); + + genArray(SIZE, a, 1000); // Generate a new array with a larger range of numbers + printArray(SIZE, a, true); + + memcpy(b, a, SIZE * sizeof(int)); // Copy a into b + printDesc("cpu sort (std::sort), power-of-two"); + auto start_time = std::chrono::high_resolution_clock::now(); + std::sort(b, b + SIZE); // Sort b to create the correct reference answer + auto end_time = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed = end_time - start_time; + printf(" elapsed time: %.4fms 
(std::chrono Measured)\n", elapsed.count()); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("radix sort, power-of-two"); + StreamCompaction::Radix::sort(SIZE, c, a); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printCmpResult(SIZE, b, c); + + memcpy(b, a, NPOT * sizeof(int)); + std::sort(b, b + NPOT); + + + zeroArray(SIZE, c); + printDesc("radix sort, non-power-of-two"); + StreamCompaction::Radix::sort(NPOT, c, a); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printCmpResult(NPOT, b, c); + + + system("pause"); delete[] a; delete[] b; delete[] c; diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 19511caa..7378625a 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -4,6 +4,7 @@ set(headers "naive.h" "efficient.h" "thrust.h" + "radix.h" ) set(sources @@ -12,6 +13,7 @@ set(sources "naive.cu" "efficient.cu" "thrust.cu" + "radix.cu" ) list(SORT headers) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d630..fa818af3 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,18 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. 
*/ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx >= n) { + return; + } + + if (idata[idx] != 0) { + bools[idx] = 1; + } + else { + bools[idx] = 0; + } } /** @@ -32,7 +43,13 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx >= n || bools[idx] == 0) { + return; + } + + odata[indices[idx]] = idata[idx]; } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa115..7754bc5b 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -12,6 +12,15 @@ namespace StreamCompaction { return timer; } + void scanWithoutTimer(int n, int* odata, const int* idata) { + int sum = 0; + for (int i = 0; i < n; ++i) + { + odata[i] = sum; + sum += idata[i]; + } + } + /** * CPU scan (prefix sum). * For performance analysis, this is supposed to be a simple for loop. @@ -20,6 +29,9 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + + scanWithoutTimer(n, odata, idata); + timer().endCpuTimer(); } @@ -31,8 +43,20 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int count = 0; + int curr = 0; + for (int i = 0; i < n; ++i) + { + curr = idata[i]; + + if (curr != 0) { + odata[count] = curr; + count++; + } + } + timer().endCpuTimer(); - return -1; + return count; } /** @@ -43,8 +67,34 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + + int* tempArray = new int[n]; + + for (int i = 0; i < n; ++i) + { + tempArray[i] = (idata[i] != 0) ? 
1 : 0; + } + + int* scanResult = new int[n]; + scanWithoutTimer(n, scanResult, tempArray); + + //Scatter + int finalLength = scanResult[n - 1]; + + for (int i = 0; i < n; ++i) + { + if (tempArray[i] == 1) { + odata[scanResult[i]] = idata[i]; + } + } + + + delete[] tempArray; + delete[] scanResult; + + timer().endCpuTimer(); - return -1; + return finalLength; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346ee..9f7a449a 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,8 @@ #include "common.h" #include "efficient.h" +#define BLOCK_SIZE 256 + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +14,212 @@ namespace StreamCompaction { return timer; } + __global__ void kernWorkEfficientUpSweep(int paddedN, int n, int d, int* idata) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + + int halfStride = 1 << (d); + int fullStride = halfStride * 2; + + if (k >= n || ((k + 1) % fullStride != 0)) { + return; + } + + // up-sweep + idata[k] += idata[k - halfStride]; + } + + __global__ void setLastElementToZero(int paddedN, int* idata) + { + idata[paddedN - 1] = 0; + } + + __global__ void kernWorkEfficientDownSweep(int paddedN, int n, int d, int* idata) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + + int halfStride = 1 << (d); + int fullStride = halfStride * 2; + + if (k >= paddedN || ((k + 1) % fullStride != 0)) { + return; + } + + // down-sweep + int originalLeftChildValue = idata[k - halfStride]; + int parentValue = idata[k]; + + idata[k - halfStride] = parentValue; + idata[k] = parentValue + originalLeftChildValue; + } + + __global__ void kernUpSweep(int n, int d, int* data) { + // k is the ID of the k-th active thread (dense) + int k = blockIdx.x * blockDim.x + threadIdx.x; + + int fullStride = 1 << (d + 1); + int halfStride = 1 << d; + + int global_idx = (k + 1) * fullStride - 1; + + if (global_idx >= n) { + return; + } 
+ + data[global_idx] += data[global_idx - halfStride]; + } + + __global__ void kernUpSweep_DEBUG(int n, int d, int* data, int numActiveThreads) { + // k is the ID of the k-th active thread (dense) + int k = blockIdx.x * blockDim.x + threadIdx.x; + + // This check is important: only let threads that are part of the + // compact grid do the printing. + if (k >= numActiveThreads) { + return; + } + + int fullStride = 1 << (d + 1); + int halfStride = 1 << d; + + int global_idx = (k + 1) * fullStride - 1; + + // --- Let's print the state of the first and last active threads --- + if (d == 0 && (k == 0 || k == numActiveThreads - 1)) { + printf("d=%d, active_thread_k=%d, global_idx=%d, n=%d\n", + d, k, global_idx, n); + } + + if (global_idx >= n) { + return; + } + + // Let's also check the values we are about to add + if (d == 0 && (k == 0 || k == numActiveThreads - 1)) { + int read_addr = global_idx - halfStride; + printf(" -> Thread %d will add data[%d] (value=%d) to data[%d] (value=%d)\n", + k, read_addr, data[read_addr], global_idx, data[global_idx]); + } + + data[global_idx] += data[global_idx - halfStride]; + } + + __global__ void kernDownSweep(int n, int d, int* data) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + + int fullStride = 1 << (d + 1); + int halfStride = 1 << d; + + int global_idx = (k + 1) * fullStride - 1; + + if (global_idx >= n) { + return; + } + + // Standard down-sweep logic using the calculated global index + int originalLeftChildValue = data[global_idx - halfStride]; + int parentValue = data[global_idx]; + + + data[global_idx - halfStride] = parentValue; + data[global_idx] = parentValue + originalLeftChildValue; + } + + /// + /// return an excluisve scan + /// + /// is the actual number of elements + /// be sure that the length of this should be n + /// be sure that the length of this should be n + void scan_device(int n, int* dev_odata, const int* dev_idata) { + // Calculate the padded size, which is the next power of 2. 
+ int paddedN = 1 << ilog2ceil(n); + + // STEP 1: No more cudaMalloc or dev_temp! + + // STEP 2: Copy input directly to the output buffer. + // Then, pad the rest of the output buffer with zeros. + cudaMemcpy(dev_odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToDevice); + if (paddedN > n) { + cudaMemset(dev_odata + n, 0, (paddedN - n) * sizeof(int)); + } + + // --- Up-Sweep Phase --- + for (int d = 0; d < ilog2ceil(paddedN); d++) { + int numActiveThreads = paddedN / (1 << (d + 1)); + int blockSize = std::min(BLOCK_SIZE, numActiveThreads); + + dim3 gridDim((numActiveThreads + blockSize - 1) / blockSize); + dim3 blockDim(blockSize); + + // STEP 3: Kernels now operate directly on dev_odata. + kernUpSweep << > > (paddedN, d, dev_odata); + cudaDeviceSynchronize(); + } + + // Set the last element to zero for an exclusive scan. + setLastElementToZero << <1, 1 >> > (paddedN, dev_odata); + + // --- Down-Sweep Phase --- + for (int d = ilog2ceil(paddedN) - 1; d >= 0; d--) { + int numActiveThreads = paddedN / (1 << (d + 1)); + int blockSize = std::min(BLOCK_SIZE, numActiveThreads); + + dim3 gridDim((numActiveThreads + blockSize - 1) / blockSize); + dim3 blockDim(blockSize); + + // STEP 3 (cont.): Kernels now operate directly on dev_odata. + kernDownSweep << > > (paddedN, d, dev_odata); + cudaDeviceSynchronize(); + } + + // STEP 4: The final cudaMemcpy and cudaFree are no longer needed. + // The result is already in dev_odata. + } + + void scanWithoutTimer(int n, int* odata, const int* idata) { + int* dev_idata; + int* dev_odata; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + scan_device(n, dev_odata, dev_idata); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + int paddedN = 1 << ilog2ceil(n); + + // + int t = ilog2ceil(9); + + dim3 fullBlocksPerGrid((paddedN + BLOCK_SIZE - 1) / BLOCK_SIZE); + + int* dev_temp; // Use a temporary buffer for the in-place algorithm + int* dev_odata; + cudaMalloc((void**)&dev_temp, paddedN * sizeof(int)); + cudaMalloc((void**)&dev_odata, paddedN * sizeof(int)); + cudaMemset(dev_temp, 0, paddedN * sizeof(int)); + cudaMemcpy(dev_temp, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + scan_device(paddedN, dev_odata, dev_temp); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_temp); + cudaFree(dev_odata); } /** @@ -31,10 +232,47 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + + dim3 fullBlocksPerGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE); + + int* dev_idata; + int* dev_flags; + int* dev_scanResult; + int* dev_odata; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_flags, n * sizeof(int)); + cudaMalloc((void**)&dev_scanResult, n * sizeof(int)); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + //compute the temporary array timer().startGpuTimer(); - // TODO + StreamCompaction::Common::kernMapToBoolean << > > (n, dev_flags, dev_idata); + + //compute the scan of the temporary array + int paddedN = 1 << ilog2ceil(n); + scan_device(n, dev_scanResult, dev_flags); + + //compute the scatter + int lastElementOfScan = 0; + int lastElementOfFlags = 0; + cudaMemcpy(&lastElementOfScan, &dev_scanResult[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastElementOfFlags, &dev_flags[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + + + int totalCount = lastElementOfScan + lastElementOfFlags; + cudaMalloc((void**)&dev_odata, totalCount * sizeof(int)); + StreamCompaction::Common::kernScatter 
<< > > (n, dev_odata, dev_idata, dev_flags, dev_scanResult); timer().endGpuTimer(); - return -1; + cudaMemcpy(odata, dev_odata, totalCount * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_flags); + cudaFree(dev_scanResult); + cudaFree(dev_odata); + + return totalCount; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4fe..1957cc76 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -8,6 +8,8 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata); + void scan_device(int n, int* dev_odata, const int* dev_idata); + int compact(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 43088769..cd155fc8 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,8 @@ #include "common.h" #include "naive.h" +#define BLOCK_SIZE 128 + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -13,13 +15,51 @@ namespace StreamCompaction { } // TODO: __global__ + __global__ void kernParallelScan(int n, int pow2d, int *odata, const int *idata) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + + if (k >= n) { + return; + } + + if (k >= pow2d){ + odata[k] = idata[k - pow2d] + idata[k]; + } + else { + odata[k] = idata[k]; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + + dim3 fullBlocksPerGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE); + + int* dev_A; + int* dev_B; + + cudaMalloc((void**)&dev_A, n * sizeof(int)); + cudaMalloc((void**)&dev_B, n * sizeof(int)); + + + cudaMemcpy(dev_A, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + for (int d = 0; d < ilog2ceil(n); ++d) + { + kernParallelScan <<>> (n, pow(2, d), dev_B, dev_A); + std::swap(dev_A, dev_B); + } timer().endGpuTimer(); + + cudaMemcpy(odata + 1, dev_A, (n - 1) * sizeof(int), cudaMemcpyDefault); + + cudaFree(dev_A); + cudaFree(dev_B); + } } } diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu new file mode 100644 index 00000000..a1ff4966 --- /dev/null +++ b/stream_compaction/radix.cu @@ -0,0 +1,101 @@ +#include +#include +#include "common.h" +#include "radix.h" +#include "efficient.h" + +#define BLOCK_SIZE 128 + +namespace StreamCompaction { + namespace Radix { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + __global__ void kernRadixComputeE(int n, int d, int* eArr, const int* data) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k >= n) return; + + eArr[k] = ((data[k] >> d) & 1) == 0 ? 1 : 0; + } + + __global__ void kernRadixScatter(int n, int d, int totalFalses, int* odata, const int* idata, const int* fArr) { + int k = blockIdx.x * blockDim.x + threadIdx.x; + if (k >= n) return; + + int element = idata[k]; + int bit_is_false = ((element >> d) & 1) == 0; // 1 if bit is 0, 0 if bit is 1 + + int new_idx; + if (bit_is_false) { + // If the bit is 0 (false), its new position is given by the + // exclusive scan result 'fArr[k]'. This corresponds to f[i] in your image. + new_idx = fArr[k]; + } + else { + // If the bit is 1 (true), this calculates its destination address. + // This corresponds to t[i] = i - f[i] + totalFalses from your image. 
+ int ones_offset = k - fArr[k]; + new_idx = totalFalses + ones_offset; + } + + odata[new_idx] = element; + } + + void sort(int n, int *odata, const int *idata) { + + dim3 fullBlocksPerGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE); + + int* dev_bufferA; + int* dev_bufferB; + int* dev_eArr; + int* dev_fArr; + + cudaMalloc((void**)&dev_bufferA, n * sizeof(int)); + cudaMalloc((void**)&dev_bufferB, n * sizeof(int)); + cudaMalloc((void**)&dev_eArr, n * sizeof(int)); + cudaMalloc((void**)&dev_fArr, n * sizeof(int)); + + + cudaMemcpy(dev_bufferA, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + int* dev_input = dev_bufferA; + int* dev_output = dev_bufferB; + + timer().startGpuTimer(); + + for (int d = 0; d < sizeof(int) * 8; ++d) { + // e array + kernRadixComputeE << > > (n, d, dev_eArr, dev_input); + + // exclusive scan to get f array + StreamCompaction::Efficient::scan_device(n, dev_fArr, dev_eArr); + + // total count of zero + int totalFalses = 0; + if (n > 0) { + int last_idx, last_flag; + cudaMemcpy(&last_idx, &dev_fArr[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&last_flag, &dev_eArr[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + totalFalses = last_idx + last_flag; + } + + kernRadixScatter << > > (n, d, totalFalses, dev_output, dev_input, dev_fArr); + + std::swap(dev_input, dev_output); + } + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_input, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_bufferA); + cudaFree(dev_bufferB); + cudaFree(dev_eArr); + cudaFree(dev_fArr); + } + } +} diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h new file mode 100644 index 00000000..aa489d93 --- /dev/null +++ b/stream_compaction/radix.h @@ -0,0 +1,11 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace Radix { + StreamCompaction::Common::PerformanceTimer& timer(); + + void sort(int n, int *odata, const int *idata); + } +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 
1def45e7..470334ee 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,18 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::host_vector host_idata(idata, idata + n); + thrust::device_vector dev_idata = host_idata; + thrust::device_vector dev_odata(n); + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); timer().endGpuTimer(); + + thrust::copy(dev_odata.begin(), dev_odata.end(), odata); } } }