diff --git a/README.md b/README.md index 0e38ddb..798c52b 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,120 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Xinyu Niu + * [personal website](https://xinyuniu6.wixsite.com/my-site-1) +* Tested on: Windows 11, i9-13980HX @ 2.20GHz 16GB, RTX 4070 16185MB (Personal) -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +## Introduction +This project focuses on implementing *Scan* (*Prefix Sum*) and *Stream Compaction* algorithms in CUDA. Scan algorithms are about doing prefix sum on an array, and Stream Compaction algorithm is about removing elements that meet some given conditions from an array. In this project, the stream compaction implementations will remove `0`s from an array of `int`s. + +## Implemented Features + +In this project, I completed the following features: + +* CPU Scan & Stream Compaction +* Naive GPU Scan Algorithm +* Work-Efficient GPU Scan & Stream Compaction +* GPU Scan using Thrust +* Optimized Work-Efficient GPU Scan(Extra Credit) +* Radix Sort (Extra Credit) + +## Performance Analysis + +![](img/blocksize.png) + +**Figure 1:** Change on Elapsed Time influenced by increasing blocksize with fixed array size(2^25) + +From Figure 1, we can observe that before blocksize = 32 is reached, blocksize has relatively significant effect on time elapsed, and a larger blocksize leads to better performance for all three implementations. After 32 is reached, the blocksize no longer has considerable influence on time elapsed. And we can also observe that before optimization, the efficient scan is not actually efficient. 
This is caused by the inefficient use of threads, since there will be idle threads as we always launch the same number of blocks for each iteration. By compacting threads, using shared memory etc. we can reduce this inefficiency. + +![](img/arraysize.png) + +**Figure 2:** Change on Elapsed Time influenced by increasing arraysize with fixed blocksize(256) + +From Figure 2, we can observe that when arraysize gets larger and over a certain number(from the data collected it's about 2^22), there will be significant increase on elapsed time and we can clearly see the difference in performance of different scan method. From faster to slower, the rank of methods is: ```thrust > efficient GPU scan > Naive GPU scan > CPU scan```. When arraysize is smaller, although it's hard to observe from the graph, CPU appears to have a bit better performance than the two algorithms I implemented while thrust is still the fastest one. + +![](img/thrust.png) + +**Figure 3:** NSight trace result of only allowing CPU scans and thrust scans + +From Figure 3, we can see that the largest part of thrust is the use of ```cudaMemcpyAsync``` and ```cudaStreamSynchronize```, which allows control to be returned to the host thread immediately so we won't need to wait until the data copy is completed. Since we are using cudaMemcpy in our implementation, this waiting time might be a bottleneck. + +## Output +``` +**************** +** SCAN TESTS ** +**************** + [ 0 13 20 46 19 36 9 0 15 22 27 38 11 ... 18 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 428.08ms (std::chrono Measured) + [ 0 0 13 33 79 98 134 143 143 158 180 207 245 ... -2015722115 -2015722097 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 425.212ms (std::chrono Measured) + [ 0 0 13 33 79 98 134 143 143 158 180 207 245 ... 
-2015722147 -2015722135 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 314.354ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 334.496ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 96.6155ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 97.165ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 10.5397ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 12.058ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 0 0 0 0 0 0 0 2 0 0 2 0 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 425.212ms (std::chrono Measured) + [ 2 2 2 3 2 1 2 3 1 1 3 3 2 ... 1 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 425.212ms (std::chrono Measured) + [ 2 2 2 3 2 1 2 3 1 1 3 3 2 ... 1 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 2080.47ms (std::chrono Measured) + [ 2 2 2 3 2 1 2 3 1 1 3 3 2 ... 1 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 202.213ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 188.192ms (CUDA Measured) + passed + +``` +## Extra Credit +**1. Optimized GPU Efficient Scan** + +I attempted to optimize the performance by adjusting blocks launched during the loop for upper sweep and down sweep. During upper sweep, each iteration the block number shrinks to half. During down sweep, each iteration the block number expands to twice. + +**2. Radix Sort** + +I have implemented the Radix Sort algorithm, which can be called using ```StreamCompaction::Efficient::radixSort()```. + +The below output resulted from comparing the sorted array using my implementation and std::sort. 
+ +``` +***************************** +** RADIX SORT TESTS ** +***************************** + [ 0 17 0 16 17 5 18 9 5 18 19 18 10 ... 3 0 ] +==== Radix Sort ==== + elapsed time: 8.7022ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 19 19 ] + passed +``` \ No newline at end of file diff --git a/img/arraysize.png b/img/arraysize.png new file mode 100644 index 0000000..98ca4f5 Binary files /dev/null and b/img/arraysize.png differ diff --git a/img/blocksize.png b/img/blocksize.png new file mode 100644 index 0000000..3c80b56 Binary files /dev/null and b/img/blocksize.png differ diff --git a/img/out1.png b/img/out1.png new file mode 100644 index 0000000..6a0bff8 Binary files /dev/null and b/img/out1.png differ diff --git a/img/out2.png b/img/out2.png new file mode 100644 index 0000000..39d30af Binary files /dev/null and b/img/out2.png differ diff --git a/img/out3.png b/img/out3.png new file mode 100644 index 0000000..da091d0 Binary files /dev/null and b/img/out3.png differ diff --git a/img/thrust.png b/img/thrust.png new file mode 100644 index 0000000..9863e6d Binary files /dev/null and b/img/thrust.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..47992dd 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,12 +13,14 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 28; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; +# define NOTESTTHRUST 1 + int main(int argc, char* argv[]) { // Scan tests @@ -47,6 +49,8 @@ int main(int argc, char* argv[]) { printArray(NPOT, b, true); printCmpResult(NPOT, b, c); +#if NOTESTTHRUST + zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); @@ -81,6 +85,8 @@ int main(int argc, char* argv[]) { //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); +#endif + zeroArray(SIZE, 
c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); @@ -95,6 +101,8 @@ int main(int argc, char* argv[]) { //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); +#if NOTESTTHRUST + printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); @@ -147,6 +155,26 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + printf("\n"); + printf("*****************************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("*****************************\n"); + + // Radixsort tests + + genArray(SIZE - 1, a, 20); // Leave a 0 at the end to test that edge case + printArray(SIZE, a, true); + + zeroArray(SIZE, b); + printDesc("Radix Sort"); + StreamCompaction::Efficient::radixSort(SIZE, b, a); + std::sort(a, a + SIZE); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(SIZE, a, true); + printCmpResult(NPOT, a, b); + +#endif + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..3e5fab0 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,9 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (idx < n) + bools[idx] = (idata[idx] != 0 ? 
1 : 0); } /** @@ -33,6 +36,9 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (idx < n && bools[idx] == 1) + odata[indices[idx]] = idata[idx]; } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..07afdf4 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -3,6 +3,8 @@ #include "common.h" +int gNonCompact = 1; + namespace StreamCompaction { namespace CPU { using StreamCompaction::Common::PerformanceTimer; @@ -18,9 +20,16 @@ namespace StreamCompaction { * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); + if(gNonCompact) + timer().startCpuTimer(); // TODO - timer().endCpuTimer(); + odata[0] = 0; + for (int i = 1; i < n; i++) + { + odata[i] = odata[i - 1] + idata[i - 1]; + } + if(gNonCompact) + timer().endCpuTimer(); } /** @@ -29,10 +38,19 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); + //timer().startCpuTimer(); // TODO - timer().endCpuTimer(); - return -1; + int count = 0; + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + odata[count] = idata[i]; + count++; + } + } + //timer().endCpuTimer(); + return count; } /** @@ -41,10 +59,30 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + gNonCompact = 0; timer().startCpuTimer(); // TODO + // Step1: map + int* boolmap = new int[n]; + for (int i = 0; i < n; i++) + { + boolmap[i] = (idata[i] != 0 ? 
1 : 0); + } + // Step2: scan + scan(n, odata, boolmap); + int count = odata[n - 1]; + // Step3: scatter + for (int k = 0; k < n; k++) + { + if (boolmap[k] != 0) + { + odata[odata[k]] = idata[k]; + } + } + delete[] boolmap; + timer().endCpuTimer(); - return -1; + return count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..2346faf 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,9 @@ #include "common.h" #include "efficient.h" +#define blockSize 256 +#define OPTIMIZED 1 + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +15,120 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpSweep(int n, int d, int* data) + { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + + int powerd = 1 << d; + int powerdp1 = 1 << (d + 1); + + if (k >= n || k % powerdp1) return; + + data[k + powerdp1 - 1] += data[k + powerd - 1]; + } + + __global__ void kernDownSweep(int n, int d, int* data) + { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + + int powerd = 1 << d; + int powerdp1 = 1 << (d + 1); + + if (k >= n || k % powerdp1 || k + powerdp1 - 1 >= n) return; + + int t = data[k + powerd - 1]; + data[k + powerd - 1] = data[k + powerdp1 - 1]; + data[k + powerdp1 - 1] += t; + } + + __global__ void kernOptUpSweep(int n, int d, int offset, int* data) + { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (k >= n || k >= d) return; + + int i = offset * (2 * k + 1) - 1; + int j = offset * (2 * k + 2) - 1; + + data[j] += data[i]; + } + + __global__ void kernOptDownSweep(int n, int d, int offset, int* data) + { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (k >= n || k >= d) return; + + int i = offset * (2 * k + 1) - 1; + int j = offset * (2 * k + 2) - 1; + + int t = data[i]; + data[i] = data[j]; + data[j] += t; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + void scan(int n, int *odata, const int *idata, bool timeScan) + { + int N = 1 << ilog2ceil(n); + + int* dev_data; + + cudaMalloc((void**)&dev_data, N * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_data failed!"); + cudaMemset(dev_data, 0, N * sizeof(int)); + checkCUDAErrorFn("cudaMemset dev_data to 0 failed!"); + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorFn("cudaMemcpy idata to dev_data failed!"); + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); +#if OPTIMIZED + int offset = 1; +#endif + + if(timeScan) + timer().startGpuTimer(); // TODO - timer().endGpuTimer(); +#if OPTIMIZED + for (int d = N >> 1; d > 0; d >>= 1) + { + fullBlocksPerGrid = dim3((d + blockSize - 1) / blockSize); + kernOptUpSweep<<>>(N, d, offset, dev_data); + offset <<= 1; + } + +#else + for (int d = 0; d < ilog2ceil(N); d++) + { + kernUpSweep<<>>(N, d, dev_data); + } +#endif + + cudaMemset(dev_data + N - 1, 0, sizeof(int)); +#if OPTIMIZED + for (int d = 1; d < N; d <<= 1) + { + offset >>= 1; + fullBlocksPerGrid = dim3((d + blockSize - 1) / blockSize); + kernOptDownSweep<<>>(N, d, offset, dev_data); + } +#else + for (int d = ilog2ceil(N) - 1; d >= 0; d--) + { + kernDownSweep<<>>(N, d, dev_data); + } +#endif + + if (timeScan) + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAErrorFn("cudaMemcpy dev_data to odata failed!"); + + cudaFree(dev_data); + checkCUDAErrorFn("cudaFree dev_data failed!"); } /** @@ -30,11 +140,172 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. 
*/ - int compact(int n, int *odata, const int *idata) { + int compact(int n, int *odata, const int *idata) + { + int N = 1 << ilog2ceil(n); + + int* dev_bools; + int* dev_data; + int* dev_idata; + int* dev_odata; + + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_bools failed!"); + cudaMalloc((void**)&dev_data, N * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_data failed!"); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_odata failed!"); + + cudaMemset(dev_data, 0, N * sizeof(int)); + checkCUDAErrorFn("cudaMemset dev_data failed!"); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorFn("cudaMempcy idata to dev_idata failed!"); + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + timer().startGpuTimer(); // TODO + StreamCompaction::Common::kernMapToBoolean<<>>(n, dev_bools, dev_idata); + + cudaMemcpy(dev_data, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAErrorFn("cudaMempcy dev_bools to dev_data failed!"); + + for (int d = 0; d < ilog2ceil(N); d++) + { + kernUpSweep<<>>(N, d, dev_data); + } + + cudaMemset(dev_data + N - 1, 0, sizeof(int)); + for (int d = ilog2ceil(N) - 1; d >= 0; d--) + { + kernDownSweep<<>>(N, d, dev_data); + } + + StreamCompaction::Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_bools, dev_data); + + timer().endGpuTimer(); + + int count = 0; + cudaMemcpy(&count, dev_data + N - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAErrorFn("cudaMempcy count failed!"); + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAErrorFn("cudaMempcy dev_odata to odata failed!"); + + cudaFree(dev_bools); + checkCUDAErrorFn("cudaFree dev_bools failed!"); + cudaFree(dev_data); + checkCUDAErrorFn("cudaFree dev_data failed!"); + cudaFree(dev_idata); + checkCUDAErrorFn("cudaFree 
dev_idata failed!"); + cudaFree(dev_odata); + checkCUDAErrorFn("cudaFree dev_odata failed!"); + + return count; + } + + __global__ void kernComputeEArray(int n, int bit, int* edata, const int* idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + edata[index] = !((idata[index] >> bit) & 1); + } + + __global__ void kernComputeTArray(int n, int totalFalses, int* tdata, const int* fdata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + tdata[index] = index - fdata[index] + totalFalses; + } + + __global__ void kernComputeDArray(int n, int* ddata, const int* edata, const int* tdata, const int* fdata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + ddata[index] = edata[index] ? fdata[index] : tdata[index]; + } + + __global__ void kernScatter(int n, int* ddata, int* odata, int* idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + odata[ddata[index]] = idata[index]; + } + + void radixSort(int n, int* odata, const int* idata) + { + int* dev_edata; + int* dev_fdata; + int* dev_tdata; + int* dev_ddata; + + int* dev_idata; + int* dev_odata; + + cudaMalloc((void**)&dev_edata, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_edata failed!"); + cudaMalloc((void**)&dev_fdata, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_fdata failed!"); + cudaMalloc((void**)&dev_tdata, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_tdata failed!"); + cudaMalloc((void**)&dev_ddata, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_ddata failed!"); + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_odata failed!"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorFn("cudaMempcy idata to dev_idata failed!"); + + dim3 fullBlocksPerGrid((n 
+ blockSize - 1) / blockSize); + + int bnum = ilog2ceil(*(std::max_element(idata, idata + n))); + + timer().startGpuTimer(); + for (int d = 0; d < bnum; d++) + { + // Step1: Compute e array + kernComputeEArray<<>>(n, d, dev_edata, dev_idata); + // Step2: Scan e + scan(n, dev_fdata, dev_edata, false); + // Step3: Compute totalFalse + int e_last; + int f_last; + cudaMemcpy(&e_last, dev_edata + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&f_last, dev_fdata + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + int totalFalses = e_last + f_last; + // Step4: Compute t + kernComputeTArray<<>>(n, totalFalses, dev_tdata, dev_fdata); + // Step5: scatter + kernComputeDArray<<>>(n, dev_ddata, dev_edata, dev_tdata, dev_fdata); + kernScatter<<>>(n, dev_ddata, dev_odata, dev_idata); + cudaMemcpy(dev_idata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAErrorFn("cudaMempcy dev_odata to dev_idata failed!"); + } timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAErrorFn("cudaMempcy dev_odata to odata failed!"); + + cudaFree(dev_edata); + checkCUDAErrorFn("cudaFree dev_edata failed!"); + cudaFree(dev_fdata); + checkCUDAErrorFn("cudaFree dev_fdata failed!"); + cudaFree(dev_tdata); + checkCUDAErrorFn("cudaFree dev_tdata failed!"); + cudaFree(dev_ddata); + checkCUDAErrorFn("cudaFree dev_ddata failed!"); + cudaFree(dev_idata); + checkCUDAErrorFn("cudaFree dev_idata failed!"); + cudaFree(dev_odata); + checkCUDAErrorFn("cudaFree dev_odata failed!"); } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..2592aca 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,8 +6,10 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, bool timeScan = true); int compact(int n, int 
*odata, const int *idata); + + void radixSort(int n, int* odata, const int* idata); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..a597dae 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,8 @@ #include "common.h" #include "naive.h" +#define blockSize 256 + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -12,14 +14,56 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void kernNaiveGPUScan(int n, int d, int* odata, int* idata) + { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + if (k >= n) return; + + int powerd = 1 << (d - 1); + if (k >= powerd) + { + odata[k] = idata[k - powerd] + idata[k]; + } + else + { + odata[k] = idata[k]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int* dev_odata; + int* dev_idata; + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_odata failed!"); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_idata failed!"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorFn("cudaMemcpy idata to dev_idata failed!"); + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + timer().startGpuTimer(); // TODO + for (int d = 1; d <= ilog2ceil(n); d++) + { + kernNaiveGPUScan<<>>(n, d, dev_odata, dev_idata); + std::swap(dev_odata, dev_idata); + } timer().endGpuTimer(); + + cudaMemcpy(odata + 1, dev_idata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAErrorFn("cudaMemcpy dev_idata to odata failed!"); + odata[0] = 0; + + cudaFree(dev_odata); + checkCUDAErrorFn("cudaFree dev_odata failed!"); + cudaFree(dev_idata); + checkCUDAErrorFn("cudaFree dev_idata failed!"); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..bbc2b35 100644 --- 
a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,19 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int N = 1 << ilog2ceil(n); + thrust::device_vector dev_idata(idata, idata + N); + thrust::device_vector dev_odata = dev_idata; + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); + timer().endGpuTimer(); + + thrust::copy(dev_odata.begin(), dev_odata.end(), odata); } } }