diff --git a/README.md b/README.md index 0e38ddb..30b9c37 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,202 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Xitong Zheng + * [LinkedIn](https://www.linkedin.com/in/xitong-zheng-5b6543205/), [Instagram](https://www.instagram.com/simonz_zheng/), etc. +* Tested on: Windows 11, i7-12700k 32GB, RTX 4090 24GB -### (TODO: Your README) +### Features Implemented +To be clear, "scan" here refers to computing the prefix sum of an array, whether exclusive or inclusive. "Stream compaction" refers to removing the zero entries of an array and compacting the useful data together so that less memory is required. This can be useful for sparse matrices and in other fields. +- CPU Scan & Stream Compaction +- Naive GPU Scan +- Work-Efficient GPU Scan & Stream Compaction +- Thrust Scan (for comparison) +- Optimized GPU Scan with dynamic warp control +- Radix Sort +- GPU Scan Using Shared Memory +#### Features to be included +- bank conflict reduction +- more Nsight performance analysis +### Feature Details +#### CPU Scan & Stream Compaction +CPU scan is quite straightforward: use a for loop to compute the exclusive prefix sum. Stream compaction on the CPU is also easy to write down — just remove all zeros. +#### GPU naive Scan +![](./img/figure-39-2.jpg) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +The algorithm performs O(n log2 n) addition operations while a sequential scan performs O(n) adds. So it's not very efficient. +#### Work-Efficient GPU Scan & Stream Compaction +This algorithm is based on the one presented by Blelloch (1990).\ +It performs O(n) operations for n elements: 2*(n - 1) adds and (n - 1) swaps. 
So it is considerably better for large arrays. + +**Up-Sweep** + +![](./img/39fig03.jpg) + +**Down-Sweep** + +Using the scan method, we can do parallel stream compaction easily in the following way. +![](./img/39fig10.jpg) + +#### But why can the work-efficient GPU scan be slower? +Although the algorithm performs fewer operations, it can run slowly on the GPU and can actually be slower than the naive scan implementation. Below is the reason: +![](./img/part5.png) +The left side shows exactly the steps of the algorithm mentioned above. It launches more threads than it actually needs and causes serious warp divergence. All the finished threads wait for the most time-consuming one in the warp. Also, that single thread actually runs more iterations than in the naive scan, which makes the whole implementation slower. + +The solution is on the right: rearrange the thread indices to avoid warp divergence. Also, launch only the threads that have work to do, dynamically reducing the number of threads (and thus warps) in each iteration to save time. + +![](./img/figure-39-4.jpg) + +#### Radix Sort +With the scan implementation provided above, it is easy to implement radix sort using k scan-based passes, i.e. O(k·n) work overall. The following picture shows a single sort iteration based on one bit; for k-bit numbers, k such sort iterations are needed. +![](./img/39fig14.jpg) + +I added a module, radixsort.cu, and test it in main.cpp as well. Please check the output of the test program. + +#### GPU Scan Using Shared Memory +The idea is to reduce global memory accesses and reuse the loaded data in order to hide memory latency. To use shared memory, we need to do the scan within shared-memory blocks, so the input has to be divided into blocks. After that, scan the array that stores the sum of each individual block, and then add the corresponding scanned sum to each block to get the total scan across blocks. 
In my code, it is the `scanShared` function in efficient.cu. +![](./img/39fig06.jpg) + +### Performance Analysis +#### Finding the optimal block size of each implementation for minimal run time on GPU +First, I chose an array size of 2^26 to find the optimal block size. The reason is that streams generally come at a large scale; if the input is small, then using the CPU is already fast. + +#### Optimal block size for GPU naive scan +![](./img/Native_GPU_Scan.png) + +The x-axis is the block size, which specifies how many threads are in a single block. The y-axis is execution time in ms. +It shows that for the naive GPU scan, execution time drops drastically once the block size increases to around 100, and the time does not change greatly beyond that scale. So any block size ranging from about 100 (128) to 1000 (1024) is acceptable. I chose a block size of 512 for the naive GPU scan. + +#### Optimal block size for work-efficient GPU scan with dynamic warp launching +![](./img/work_efficient_scan_dynamic_wrap_lanuching.png) + +The x-axis is the block size, which specifies how many threads are in a single block. The y-axis is execution time in ms. +As the block size increases, the execution time drops quickly and bounces up slightly after a block size of 256. So 256 is the optimal block size. + +#### Optimal block size for work-efficient GPU scan with extra shared memory +![](./img/share_memory_optimization.png) + +The x-axis is the block size in log scale, which specifies how many threads are in a single block. The y-axis is execution time in ms. +Based on the picture, the optimal block size is 128. Also, there are some memory issues when the block size is 512 or higher, so I do not plot the data beyond that; the time reaches its minimum at a block size of 128. + + +#### Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis). 
+ +![](./img/performance.png) +The x-axis is the input array size (log scale) and the y-axis is scan time in ms (log scale). +CPU scan is really fast when the input array size is small; only after around 100,000 elements (roughly 2^17) do all the GPU scans start to run faster than the CPU one. Another interesting fact is that my work-efficient scan runs faster than the Thrust library's exclusive_scan for sizes ranging from 10^5 to 10^8. Beyond that boundary, the Thrust scan runs extremely fast: for an input array of size 2^29, it is roughly twice as fast as the work-efficient shared-memory scan. + +#### Performance bottlenecks for each implementation. +#### Naive Scan +Naive scan performs an extensive number of global memory accesses.\ +Bottleneck: memory bandwidth +![](./img/naive_analysis.png) + +#### Efficient Scan with dynamic warp optimization +The algorithm's time complexity drops, but the memory traffic has not changed: it still has a lot of global memory accesses. Some are removed by the dynamic warp optimization so that idle threads are not launched, but a single thread runs similarly.\ +Bottleneck: memory bandwidth + +#### Efficient Scan with extra shared memory optimization +Global memory requests drop a lot and compute intensity increases.\ +Bottleneck: algorithm implementation +![](./img/blockscanShared_analysis.png) + + +### Thrust scan +From the Nsight Systems trace, I get stats like this. You can see that occupancy has reached the maximum. +![](./img/thrust_analysis_1.png) + +Zoomed in a little bit +![](./img/thrust_analysis_2.png) +From the picture above, thrust::exclusive_scan makes the following calls: _kernel_agent, cudaStreamSynchronize, cudaEventRecord, cudaMalloc, DeviceScanInitKernel, DeviceScanKernel, cudaStreamSynchronize, cudaFree, cudaEventRecord. 
+ +![](./img/thrust_analysis_3.png) +You can see that DeviceScanKernel do the scan actually and the Wrap occupancy approaches the limit.\ +Bottleneck: memory bandwidth +![](./img/thrust_scan_analysis.png) + + +#### Output of the test program +Input Array Size is 2^29 + +``` +**************** +** SCAN TESTS ** +**************** + [ 1 2 1 3 3 4 6 0 6 2 0 4 8 ... 4 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 830.72ms (std::chrono Measured) + [ 0 1 3 4 7 10 14 20 20 26 28 28 32 ... -1879071581 -1879071577 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 817.077ms (std::chrono Measured) + [ 0 1 3 4 7 10 14 20 20 26 28 28 32 ... -1879071590 -1879071590 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 150.14ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 150.199ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 51.344ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 51.6955ms (CUDA Measured) + passed +==== work-efficient scanShared, power-of-two ==== + elapsed time: 10.537ms (CUDA Measured) + [ 0 1 3 4 7 10 14 20 20 26 28 28 32 ... -1879071581 -1879071577 ] + passed +==== work-efficient scanShared, non-power-of-two ==== + elapsed time: 10.1171ms (CUDA Measured) + [ 0 1 3 4 7 10 14 20 20 26 28 28 32 ... -1879071590 -1879071590 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 5.6976ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 5.82576ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 0 3 1 3 3 2 3 3 3 0 ... 0 0 ] +==== cpu compact w ithout scan, power-of-two ==== + elapsed time: 1111.51ms (std::chrono Measured) + [ 2 3 2 3 1 3 3 2 3 3 3 2 1 ... 3 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 1118.49ms (std::chrono Measured) + [ 2 3 2 3 1 3 3 2 3 3 3 2 1 ... 
2 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 4087.93ms (std::chrono Measured) + [ 2 3 2 3 1 3 3 2 3 3 3 2 1 ... 3 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 415.429ms (CUDA Measured) + [ 2 3 2 3 1 3 3 2 3 3 3 2 1 ... 3 1 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 409.022ms (CUDA Measured) + passed + +***************************** +** Radix Sort TESTS ** +***************************** + [ 3566 6053 12621 13337 8159 5567 1910 31611 14939 2462 8737 2023 16916 ... 8582 0 ] +==== cpu sort power-of-two ==== + elapsed time: 31606.3ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] +==== my radixsort power-of-two ==== + elapsed time: 3875.56ms (CUDA Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + passed +==== cpu sort non-power-of-two ==== + elapsed time: 26910.9ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] +==== my radixsort non-power-of-two ==== + elapsed time: 3768.46ms (CUDA Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + passed +Press any key to continue . . 
.``` \ No newline at end of file diff --git a/img/39fig03.jpg b/img/39fig03.jpg new file mode 100644 index 0000000..22c4da0 Binary files /dev/null and b/img/39fig03.jpg differ diff --git a/img/39fig06.jpg b/img/39fig06.jpg new file mode 100644 index 0000000..973e531 Binary files /dev/null and b/img/39fig06.jpg differ diff --git a/img/39fig10.jpg b/img/39fig10.jpg new file mode 100644 index 0000000..d3438dc Binary files /dev/null and b/img/39fig10.jpg differ diff --git a/img/39fig14.jpg b/img/39fig14.jpg new file mode 100644 index 0000000..992c3c5 Binary files /dev/null and b/img/39fig14.jpg differ diff --git a/img/Native_GPU_Scan.png b/img/Native_GPU_Scan.png new file mode 100644 index 0000000..ba852f5 Binary files /dev/null and b/img/Native_GPU_Scan.png differ diff --git a/img/blockscanShared_analysis.png b/img/blockscanShared_analysis.png new file mode 100644 index 0000000..3f3597e Binary files /dev/null and b/img/blockscanShared_analysis.png differ diff --git a/img/naive_analysis.png b/img/naive_analysis.png new file mode 100644 index 0000000..d521c0f Binary files /dev/null and b/img/naive_analysis.png differ diff --git a/img/part5.png b/img/part5.png new file mode 100644 index 0000000..542f56d Binary files /dev/null and b/img/part5.png differ diff --git a/img/performance.png b/img/performance.png new file mode 100644 index 0000000..23df29d Binary files /dev/null and b/img/performance.png differ diff --git a/img/share_memory_optimization.png b/img/share_memory_optimization.png new file mode 100644 index 0000000..61ab0a5 Binary files /dev/null and b/img/share_memory_optimization.png differ diff --git a/img/thrust_analysis_1.png b/img/thrust_analysis_1.png new file mode 100644 index 0000000..0184d45 Binary files /dev/null and b/img/thrust_analysis_1.png differ diff --git a/img/thrust_analysis_2.png b/img/thrust_analysis_2.png new file mode 100644 index 0000000..edb0d39 Binary files /dev/null and b/img/thrust_analysis_2.png differ diff --git 
a/img/thrust_analysis_3.png b/img/thrust_analysis_3.png new file mode 100644 index 0000000..6d56b1c Binary files /dev/null and b/img/thrust_analysis_3.png differ diff --git a/img/thrust_scan_analysis.png b/img/thrust_scan_analysis.png new file mode 100644 index 0000000..582dc5a Binary files /dev/null and b/img/thrust_scan_analysis.png differ diff --git a/img/work_efficient_scan_dynamic_wrap_lanuching.png b/img/work_efficient_scan_dynamic_wrap_lanuching.png new file mode 100644 index 0000000..4e2ee06 Binary files /dev/null and b/img/work_efficient_scan_dynamic_wrap_lanuching.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..d52b6f7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,13 +11,14 @@ #include #include #include +#include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; +const int TSIZE = 1 << 27; // feel free to change the TSIZE of array +const int NPOT = TSIZE - 3; // Non-Power-Of-Two +int *a = new int[TSIZE]; +int *b = new int[TSIZE]; +int *c = new int[TSIZE]; int main(int argc, char* argv[]) { // Scan tests @@ -27,127 +28,177 @@ int main(int argc, char* argv[]) { printf("** SCAN TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); + genArray(TSIZE - 1, a, 10); // Leave a 0 at the end to test that edge case + //onesArray(TSIZE, a); + a[TSIZE - 1] = 0; + printArray(TSIZE, a, true); - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. - zeroArray(SIZE, b); + //initialize b using StreamCompaction::CPU::scan you implement + //We use b for further comparison. 
Make sure your StreamCompaction::CPU::scan is correct. + //At first all cases passed because b && c are all zeroes. + zeroArray(TSIZE, b); printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); + StreamCompaction::CPU::scan(TSIZE, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); + printArray(TSIZE, b, true); - zeroArray(SIZE, c); + zeroArray(TSIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(NPOT, b, true); printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); + zeroArray(TSIZE, c); printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); + StreamCompaction::Naive::scan(TSIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + //printArray(TSIZE, c, true); + printCmpResult(TSIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); + onesArray(TSIZE, c); printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ + StreamCompaction::Naive::scan(TSIZE, c, a); + printArray(TSIZE, c, true); */ - zeroArray(SIZE, c); + zeroArray(TSIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + //printArray(TSIZE, c, true); printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); + zeroArray(TSIZE, c); printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); + StreamCompaction::Efficient::scan(TSIZE, c, a); 
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + //printArray(TSIZE, c, true); + printCmpResult(TSIZE, b, c); - zeroArray(SIZE, c); + zeroArray(TSIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); + zeroArray(TSIZE, c); + printDesc("work-efficient scanShared, power-of-two"); + StreamCompaction::Efficient::scanShared(TSIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(TSIZE, c, true); + printCmpResult(TSIZE, b, c); + + zeroArray(TSIZE, c); + printDesc("work-efficient scanShared, non-power-of-two"); + StreamCompaction::Efficient::scanShared(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(TSIZE, c); printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); + StreamCompaction::Thrust::scan(TSIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + //printArray(TSIZE, c, true); + printCmpResult(TSIZE, b, c); - zeroArray(SIZE, c); + zeroArray(TSIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS 
**\n"); - printf("*****************************\n"); + //printf("\n"); + //printf("*****************************\n"); + //printf("** STREAM COMPACTION TESTS **\n"); + //printf("*****************************\n"); - // Compaction tests + //// Compaction tests - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); + //genArray(TSIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + //a[TSIZE - 1] = 0; + //printArray(TSIZE, a, true); - int count, expectedCount, expectedNPOT; + //int count, expectedCount, expectedNPOT; // initialize b using StreamCompaction::CPU::compactWithoutScan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); + //zeroArray(TSIZE, b); + //printDesc("cpu compact without scan, power-of-two"); + //count = StreamCompaction::CPU::compactWithoutScan(TSIZE, b, a); + 
//printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + //expectedCount = count; + //printArray(count, b, true); + //printCmpLenResult(count, expectedCount, b, b); + + //zeroArray(TSIZE, c); + //printDesc("cpu compact without scan, non-power-of-two"); + //count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + //printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + //expectedNPOT = count; + //printArray(count, c, true); + //printCmpLenResult(count, expectedNPOT, b, c); - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //zeroArray(TSIZE, c); + //printDesc("cpu compact with scan"); + //count = StreamCompaction::CPU::compactWithScan(TSIZE, c, a); + //printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); + //printCmpLenResult(count, expectedCount, b, c); - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //zeroArray(TSIZE, c); + //printDesc("work-efficient compact, power-of-two"); + //count = StreamCompaction::Efficient::compact(TSIZE, c, a); + //printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + //printCmpLenResult(count, expectedCount, b, c); + + //zeroArray(TSIZE, c); + //printDesc("work-efficient compact, non-power-of-two"); + //count = 
StreamCompaction::Efficient::compact(NPOT, c, a); + //printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + ////printArray(count, c, true); + //printCmpLenResult(count, expectedNPOT, b, c); + + //printf("\n"); + //printf("*****************************\n"); + //printf("** Radix Sort TESTS **\n"); + //printf("*****************************\n"); + + //genArray(TSIZE - 1, a, 100000); + //printArray(TSIZE, a, true); + + //zeroArray(TSIZE, b); + //printDesc("cpu sort power-of-two"); + //StreamCompaction::CPU::sort(TSIZE, b, a); + //printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + //printArray(TSIZE, b, true); + + //zeroArray(TSIZE, c); + //printDesc("my radixsort power-of-two"); + //StreamCompaction::RadixSort::radixsort(TSIZE, c, a); + //printElapsedTime(StreamCompaction::RadixSort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(TSIZE, c, true); + //printCmpResult(TSIZE, b, c); + + //zeroArray(TSIZE, b); + //printDesc("cpu sort non-power-of-two"); + //StreamCompaction::CPU::sort(NPOT, b, a); + //printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + //printArray(NPOT, b, true); + + //zeroArray(TSIZE, c); + //printDesc("my radixsort non-power-of-two"); + //StreamCompaction::RadixSort::radixsort(NPOT, c, a); + //printElapsedTime(StreamCompaction::RadixSort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + //printCmpResult(NPOT, b, c); + - system("pause"); // stop Win32 console from closing on exit + //system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; delete[] c; diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 567795b..d35f904 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -4,6 +4,7 
@@ set(headers "naive.h" "efficient.h" "thrust.h" + "radixsort.h" ) set(sources @@ -12,6 +13,7 @@ set(sources "naive.cu" "efficient.cu" "thrust.cu" + "radixsort.cu" ) list(SORT headers) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..986ec0b 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,12 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int k = threadIdx.x + blockIdx.x * blockDim.x; + if (k >= n) return; + + if (idata[k] > 0) { + bools[k] = 1; + } } /** @@ -33,7 +39,13 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO - } + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= n) return; + + if (bools[idx] == 1) { + odata[indices[idx]] = idata[idx]; + } + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..53be131 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -9,6 +10,7 @@ #include #include #include +#include #define FILENAME (strrchr(__FILE__, '/') ? 
strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..da0a50b 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,9 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } timer().endCpuTimer(); } @@ -31,7 +34,16 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int idx = 0; + for (int i = 0; i < n; i++) { + if (idata[i] > 0) { + odata[idx] = idata[i]; + idx++; + } + } + timer().endCpuTimer(); + if (idx > 0) return idx; return -1; } @@ -42,9 +54,40 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); + // TODO + std::vector id(n, 0); + std::vector idx_array(n, 0); + + for (int i = 0; i < n; i++) { + if (idata[i] > 0) id[i] = 1; + } + + for (int i = 1; i < n; i++) { + idx_array[i] = idx_array[i - 1] + id[i - 1]; + } + + int maxIdx = 0; + for (int i = 0; i < n; i++) { + if (id[i] > 0) { + odata[idx_array[i]] = idata[i]; + maxIdx = std::max(idx_array[i], maxIdx); + } + } + + timer().endCpuTimer(); + if (maxIdx > 0) return maxIdx + 1; return -1; } + + void sort(int n, int* odata, const int* idata) { + timer().startCpuTimer(); + + std::copy(idata, idata + n, odata); + std::sort(odata, odata + n); + + timer().endCpuTimer(); + } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c047..222b77a 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -11,5 +11,7 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata); int compactWithScan(int n, int *odata, const int *idata); + + void sort(int n, int* odata, const int* idata); } } diff --git 
a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..2e23351 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,7 @@ #include #include "common.h" #include "efficient.h" +#include "naive.h" namespace StreamCompaction { namespace Efficient { @@ -12,13 +13,261 @@ namespace StreamCompaction { return timer; } + __global__ void upSweep(int n, int d, int* data) { + int k = threadIdx.x + blockIdx.x * blockDim.x; + if (k >= n) return; + int offset = 1 << (d + 1); + + if (k % offset == 0) { + int left = (k + offset - 1); + int right = k + (offset >> 1) - 1; + if (left < n) { + data[left] += data[right]; + } + } + } + + __global__ void downSweep(int n, int d, int* data) { + int k = threadIdx.x + blockIdx.x * blockDim.x; + int offset = 1 << (d + 1); + + if (k % offset == 0) { + int b = k + (offset >> 1) - 1; + int a = k + offset - 1; + + if (a < n) { + int temp = data[b]; + data[b] = data[a]; + data[a] += temp; + } + } + } + + __global__ void upSweepV2(int n, int stride, int* data) { + int k = threadIdx.x + blockIdx.x * blockDim.x; + if (k >= n) return; + + int left = stride * (k + 1) - 1; + int right = stride * k + (stride >> 1) - 1; + if (left < n) { + data[left] += data[right]; + } + } + + __global__ void downSweepV2(int n, int stride, int* data) { + int k = threadIdx.x + blockIdx.x * blockDim.x; + if (k >= n) return; + + // Compute the indices for the left and right elements to be operated on + int a = stride * (k + 1) - 1; + int b = stride * k + (stride >> 1) - 1; + + if (a < n) { + int temp = data[b]; + data[b] = data[a]; + data[a] += temp; + } + } + + __global__ void blockScan(int* data, int* blockSums, int n, int D_max) { + extern __shared__ int sdata[]; + int tid = threadIdx.x; + int gid = blockIdx.x * blockDim.x + threadIdx.x; + + if (gid >= n) return; + // Load input into shared memory. + sdata[tid] = (gid < n) ? 
data[gid] : 0; + __syncthreads(); + + // Up-sweep (Reduce) + int stride; + for (int d = 0; d <= D_max; ++d) { + stride = 1 << (d + 1); + + if (tid < ((blockDim.x + stride - 1) / stride)) { + int right = stride * tid + (stride >> 1) - 1; + int left = stride * (tid + 1) - 1; + if (left < blockDim.x) sdata[left] += sdata[right]; + } + __syncthreads(); + } + + //Clear last element for downsweep + int lastSum; + if (tid == blockDim.x - 1) { + lastSum = sdata[blockDim.x - 1]; + sdata[blockDim.x - 1] = 0; + if (blockSums != nullptr) { + blockSums[blockIdx.x] = lastSum; + } + } + __syncthreads(); + + //// Down-sweep + for (int d = D_max; d >= 0; --d) { + stride = 1 << (d + 1); + + if (tid < ((blockDim.x + stride - 1) / stride)) { + // Compute the indices for the left and right elements to be operated on + int a = stride * (tid + 1) - 1; + int b = stride * tid + (stride >> 1) - 1; + + if (a < blockDim.x) { + int temp = sdata[b]; + sdata[b] = sdata[a]; + sdata[a] += temp; + } + } + __syncthreads(); + } + + // Write results to output array. 
+ //convert the exclusive scan into inclusive one + if (tid < blockDim.x - 1) { + data[gid] = sdata[tid + 1]; + } + else if (tid == blockDim.x - 1) { + data[gid] = lastSum; + } + + //data[gid] = sdata[tid]; + } + + __global__ void addBlockSums(int* data, int* blockSums, int n) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid >= n) return; + + int blockSum; + if (blockIdx.x >= 1) { + blockSum = blockSums[blockIdx.x - 1]; + data[gid] += blockSum; + } + } + + void scanShared(int n, int* odata, const int* idata) { + int d = ilog2ceil(n) - 1; + int extendLength = 1 << (d + 1); + + int blockSize = 128; + int numBlocks = (extendLength + blockSize - 1) / blockSize; + + int d_max = ilog2ceil(blockSize) - 1; + int* dev_data; + int* dev_blockSums; + + cudaMalloc((void**)&dev_data, extendLength * sizeof(int)); + cudaMemset(dev_data, 0, extendLength * sizeof(int)); + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + cudaMalloc((void**)&dev_blockSums, numBlocks * sizeof(int)); + cudaMemset(dev_blockSums, 0, numBlocks * sizeof(int)); + + int* dev_o_blockSums; + cudaMalloc((void**)&dev_o_blockSums, numBlocks * sizeof(int)); + cudaMemset(dev_o_blockSums, 0, numBlocks * sizeof(int)); + + nvtxRangePushA("scanShared"); + timer().startGpuTimer(); + //Step 1: Perform scan on individual blocks and record the block sums. + blockScan << > > (dev_data, dev_blockSums, extendLength, d_max); + checkCUDAError("blockScan"); + + //Step 1.1: Perform scan on block sums. + int newblockSize = 512; + int gridSize = (numBlocks + newblockSize - 1) / newblockSize; + + for (int d = 1; d <= ilog2ceil(numBlocks); ++d) { + // Swap pointers + if (d > 1) std::swap(dev_blockSums, dev_o_blockSums); + + // Launch kernel + StreamCompaction::Naive::naiveScanKernel << > > (numBlocks, d, dev_o_blockSums, dev_blockSums); + checkCUDAError("naiveScanKernel"); + // wait all computing finished + //cudaDeviceSynchronize(); + } + + //// Step 2: Add the block sums to the scanned array. 
+ addBlockSums << > > (dev_data, dev_o_blockSums, n); + checkCUDAError("addBlockSums"); + timer().endGpuTimer(); + nvtxRangePop(); + + // Copy results back to host + cudaMemcpy(odata + 1, dev_data, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + + // manually convert it into exclusive scan + odata[0] = 0; + + cudaFree(dev_data); + cudaFree(dev_blockSums); + cudaFree(dev_o_blockSums); + } + + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int d = ilog2ceil(n) - 1; + + int extendLength = 1 << (d + 1); + // Allocate device memory + int* dev_data; + cudaMalloc((void**)&dev_data, extendLength * sizeof(int)); + + // set the cuda memory + cudaMemset(dev_data, 0, extendLength * sizeof(int)); + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + // Set up execution parameters + int blockSize = 256; + int initialBlockSize = blockSize; + + nvtxRangePushA("work efficient scan"); timer().startGpuTimer(); - // TODO + // ------------------------ Version 2.0 ------------------------------------------- + // + // + // index optimizing, thus wraps divergance optimizing + // Up-sweep + int stride = 0; + int threadsNeeded = 0; + + int gridSize; + for (int i = 0; i <= d; ++i) { + stride = 1 << (1 + i); + threadsNeeded = (extendLength + stride - 1) / stride; + blockSize = std::max(1, initialBlockSize / stride); + gridSize = (threadsNeeded + blockSize - 1) / blockSize; + upSweepV2 << > > (extendLength, stride, dev_data); + checkCUDAError("upSweepV2"); + } + + // Clear the last element + cudaMemset(&dev_data[extendLength - 1], 0, sizeof(int)); + + //// Down-sweep + for (int i = d; i >= 0; --i) { + stride = 1 << (1 + i); + threadsNeeded = (extendLength + stride - 1) / stride; + blockSize = std::max(1, initialBlockSize / stride); + gridSize = (threadsNeeded + blockSize - 1) / blockSize; + downSweepV2 << > > (extendLength, stride, dev_data); + checkCUDAError("downSweepV2"); + } + 
//_______________________Version 2.0 ___________________________________________________ timer().endGpuTimer(); + nvtxRangePop(); + + cudaDeviceSynchronize(); + // Copy results back to host + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy"); + + // Free device memory + cudaFree(dev_data); + checkCUDAError("cudaFree"); } /** @@ -31,9 +280,98 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + int d = ilog2ceil(n) - 1; + + int extendLength = 1 << (d + 1); + // Allocate device memory + int* dev_idata; + int* dev_odata; + int* dev_bools; + int* dev_indices; + cudaMalloc((void**)&dev_idata, extendLength * sizeof(int)); + checkCUDAError("cudaMalloc"); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc"); + cudaMalloc((void**)&dev_bools, extendLength * sizeof(int)); + checkCUDAError("cudaMalloc"); + cudaMalloc((void**)&dev_indices, extendLength * sizeof(int)); + checkCUDAError("cudaMalloc"); + + // Set up execution parameters + int blockSize = 128; + int initialBlockSize = blockSize; + int gridSize = (extendLength + blockSize - 1) / blockSize; + + cudaMemset(dev_bools, 0, extendLength * sizeof(int)); + checkCUDAError("cudaMemset"); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemset(dev_idata + n, 0, (extendLength - n) * sizeof(int)); + checkCUDAError("cudaMemset"); + timer().startGpuTimer(); - // TODO + + StreamCompaction::Common::kernMapToBoolean<<<gridSize, blockSize>>>(extendLength, dev_bools, dev_idata); + checkCUDAError("kernMapToBoolean"); + + cudaMemcpy(dev_indices, dev_bools, extendLength * sizeof(int), cudaMemcpyDeviceToDevice); + // Up-sweep + int stride = 0; + //for (int i = 0; i <= d; ++i) { + // stride = 1 << (1 + i); + // upSweep<<<gridSize, blockSize>>>(extendLength, i, dev_indices); + //} + int threadsNeeded; + for (int i = 0; i <= d; ++i) { + stride = 1 << (1 + i); + 
threadsNeeded = (extendLength + stride - 1) / stride; + blockSize = std::max(1, initialBlockSize / stride); + gridSize = (threadsNeeded + blockSize - 1) / blockSize; + upSweepV2<<<gridSize, blockSize>>>(extendLength, stride, dev_indices); + checkCUDAError("upSweepV2"); + } + + // Clear the last element + cudaMemset(&dev_indices[extendLength - 1], 0, sizeof(int)); + + // Down-sweep + //for (int i = d; i >= 0; --i) { + // stride = 1 << (1 + i); + // downSweep<<<gridSize, blockSize>>>(extendLength, i, dev_indices); + //} + + //// Down-sweep + for (int i = d; i >= 0; --i) { + stride = 1 << (1 + i); + threadsNeeded = (extendLength + stride - 1) / stride; + blockSize = std::max(1, initialBlockSize / stride); + gridSize = (threadsNeeded + blockSize - 1) / blockSize; + downSweepV2<<<gridSize, blockSize>>>(extendLength, stride, dev_indices); + checkCUDAError("downSweepV2"); + } + + gridSize = (extendLength + initialBlockSize - 1) / initialBlockSize; + StreamCompaction::Common::kernScatter<<<gridSize, initialBlockSize>>>(extendLength, dev_odata, dev_idata, dev_bools, dev_indices); + //StreamCompaction::Common::kernScatter<<<gridSize, blockSize>>>(extendLength, dev_odata, dev_idata, dev_bools, dev_indices); + checkCUDAError("kernScatter"); + + + // Copy results back to host + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy"); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bools); + cudaFree(dev_indices); + checkCUDAError("cudaFree"); timer().endGpuTimer(); + + int i; + for (i = 0; i < n; i++) { + if (odata[i] == 0) return i; + } + + if (i == n) return n; return -1; } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..1b1f586 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -7,6 +7,7 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + void scanShared(int n, int* odata, const int* idata); int compact(int n, int *odata, const int *idata); } diff 
--git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..885af8f 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -13,13 +13,66 @@ namespace StreamCompaction { } // TODO: __global__ + __global__ void naiveScanKernel(int n, int d, int* odata, const int* idata) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= n) { + return; + } + + if (k >= (1 << (d - 1))) { + odata[k] = idata[k - (1 << (d - 1))] + idata[k]; + } + else { + odata[k] = idata[k]; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO + int* dev_idata; + int* dev_odata; + + // Allocate device memory + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + + // Copy data to device + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + // Set up execution parameters + int blockSize = 512; + int gridSize = (n + blockSize - 1) / blockSize; + + nvtxRangePushA("naive scan"); + timer().startGpuTimer(); + + for (int d = 1; d <= ilog2ceil(n); ++d) { + // Swap pointers + if (d > 1) std::swap(dev_idata, dev_odata); + + // Launch kernel + naiveScanKernel<<<gridSize, blockSize>>>(n, d, dev_odata, dev_idata); + + // wait all computing finished + //cudaDeviceSynchronize(); + } + timer().endGpuTimer(); + nvtxRangePop(); + + // Copy results back to host + cudaMemcpy(odata + 1, dev_odata , (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + + + // manually convert it into exclusive scan + odata[0] = 0; + + cudaFree(dev_idata); + cudaFree(dev_odata); } } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb06..9e2bfe0 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -7,5 +7,7 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + + __global__ void 
naiveScanKernel(int n, int d, int* odata, const int* idata); } } diff --git a/stream_compaction/radixsort.cu b/stream_compaction/radixsort.cu new file mode 100644 index 0000000..fd52923 --- /dev/null +++ b/stream_compaction/radixsort.cu @@ -0,0 +1,81 @@ +#include "efficient.h" +#include "radixsort.h" + + +namespace StreamCompaction { + namespace RadixSort { + StreamCompaction::Common::PerformanceTimer& timer() { + static StreamCompaction::Common::PerformanceTimer timer; + return timer; + } + + __global__ void mapToBool(int* out, const int* in, int n, int mask) { + int k = threadIdx.x + blockDim.x * blockIdx.x; + + if (k >= n) return; + + // set e array value + if ((in[k] & mask) == 0) { + out[k] = 1; + } else { + out[k] = 0; + } + + //out[k] = ((in[k] & mask) == 0) ? 1 : 0; + } + + __global__ void scatter(int n, int* odata, + const int* idata, const int* bools, const int* indices) { + int k = threadIdx.x + blockDim.x * blockIdx.x; + + if (k >= n) return; + + // indices is f array, bools is e array (chapter 39) + int totalFalse = indices[n - 1] + bools[n - 1]; + + // here e is the opposite of significant bit b + int d = bools[k] ? 
indices[k] : k - indices[k] + totalFalse; + odata[d] = idata[k]; + } + + void radixsort(int n, int* out, const int* in) { + + const int numBits = 8 * sizeof(int); // Assuming 32-bit integers + + int* dev_datai, * dev_datao, * dev_bools, * dev_indices; + + cudaMalloc((void**)&dev_datai, n * sizeof(int)); + cudaMalloc((void**)&dev_datao, n * sizeof(int)); + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + cudaMalloc((void**)&dev_indices, n * sizeof(int)); + + cudaMemcpy(dev_datai, in, n * sizeof(int), cudaMemcpyHostToDevice); + + int blockSize = 256; + int numBlocks = (n + blockSize - 1) / blockSize; + + timer().startGpuTimer(); + for (int d = 0; d < numBits; ++d) { + int mask = 1 << d; + + mapToBool<<<numBlocks, blockSize>>>(dev_bools, dev_datai, n, mask); + + StreamCompaction::Efficient::scan(n, dev_indices, dev_bools); + + scatter<<<numBlocks, blockSize>>>(n, dev_datao, dev_datai, dev_bools, dev_indices); + + std::swap(dev_datai, dev_datao); + } + + timer().endGpuTimer(); + + cudaMemcpy(out, dev_datai, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_datai); + cudaFree(dev_datao); + cudaFree(dev_bools); + cudaFree(dev_indices); + + } + } +} \ No newline at end of file diff --git a/stream_compaction/radixsort.h b/stream_compaction/radixsort.h new file mode 100644 index 0000000..ff11668 --- /dev/null +++ b/stream_compaction/radixsort.h @@ -0,0 +1,11 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace RadixSort { + StreamCompaction::Common::PerformanceTimer& timer(); + + void radixsort(int n, int* out, const int* in); + } +} \ No newline at end of file diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..023e733 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,21 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + // Create device_vectors and copy input data to device + thrust::device_vector<int> dv_in(idata, idata + n); + thrust::device_vector<int> dv_out(n); + + nvtxRangePushA("thrust scan"); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + + nvtxRangePop(); + + + thrust::copy(dv_out.begin(), dv_out.end(), odata); + } } }