diff --git a/CMakeLists.txt b/CMakeLists.txt
index c504b62..454fafd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,6 +41,7 @@ include_directories(.)
 set(headers
     "src/testing_helpers.hpp"
+    "src/csvfile.hpp"
     )
 set(sources
diff --git a/README.md b/README.md
index 0e38ddb..715b37c 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,264 @@ CUDA Stream Compaction
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Ling Xie
+  * [LinkedIn](https://www.linkedin.com/in/ling-xie-94b939182/),
+  * [personal website](https://jack12xl.netlify.app).
+* Tested on:
+  * Windows 10, Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz (two processors)
+  * 64.0 GB memory
+  * NVIDIA TITAN Xp (GP102)
-### (TODO: Your README)
+
+Thanks to [FLARE LAB](http://faculty.sist.shanghaitech.edu.cn/faculty/liuxp/flare/index.html) for providing this ferocious machine.
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+
+##### CMake changes
+
+Added:
+
+1. [csvfile.hpp](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/src/csvfile.hpp) to automatically record performance results in CSV form.
+2. [radixSort.h](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/stream_compaction/radixSort.h), [radixSort.cu](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/stream_compaction/radixSort.cu) for [Part 6](https://github.com/Jack12xl/Project2-Stream-Compaction#part-6-radix-sort-extra-point).
+
+#### Intro
+
+In this project we implement the **scan** algorithm (specifically, prefix sum) with CUDA parallelism, as required by the [instructions](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/INSTRUCTION.md). The algorithm itself is described in [NVIDIA GPU Gems 3, Chapter 39](https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda).
+
+We managed to implement **all** of the compulsory and extra-point sections.
+
+#### Overview
+
+##### Optimized block size
+
+After implementing all the functions, we first searched for the optimal block size.
+
+![alt text](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/img/SCAN_for_block.svg)
+
+Based on the plot, we choose 256 as the block size.
+
+##### Implementation Comparisons
+
+Here we compare the scan and compaction implementations across different array sizes. The results below were run with block size = 256.
+
+![alt text](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/img/SCAN.svg)
+
+![alt text](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/img/Compact.svg)
+
+##### Notes
+
+- **non-opt** refers to the unoptimized scan from before Part 5.
+- **idx** refers to the index-mapping optimization from Part 5.
+- **shared memory** refers to the shared-memory optimization from Part 7.
+
+##### Output of test program
+
+Here we added tests for radix sort and for the shared-memory based scan and compaction.
+
+```
+
+****************
+** SCAN TESTS **
+****************
+    [ 1 0 0 1 1 1 0 1 1 1 0 0 1 ... 0 0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0.0589ms (std::chrono Measured)
+    [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32801 32801 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.056ms (std::chrono Measured)
+    [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 
32799 32800 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.042656ms (CUDA Measured) + [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32801 32801 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.041024ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.104704ms (CUDA Measured) + [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32801 32801 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.108032ms (CUDA Measured) + [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32799 32800 ] + passed +==== work-efficient scan with shared memory, power-of-two ==== + elapsed time: 0.0256ms (CUDA Measured) + [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32801 32801 ] + passed +==== work-efficient scan with shared memory, non-power-of-two ==== + elapsed time: 0.025024ms (CUDA Measured) + [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32799 32800 ] + passed +==== work-efficient scan with index scale, power-of-two ==== + elapsed time: 0.083584ms (CUDA Measured) + [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32801 32801 ] + passed +==== work-efficient scan with index scale, non-power-of-two ==== + elapsed time: 0.077536ms (CUDA Measured) + [ 0 1 1 1 2 3 4 4 5 6 7 7 7 ... 32799 32800 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.094944ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.091936ms (CUDA Measured) + passed + +***************************** +** STREAM SORT TESTS ** +***************************** + [ 31 24 18 1 17 25 4 15 25 3 30 22 7 ... 5 0 ] +The array to be sorted is : + [ 31 24 18 1 17 25 4 15 25 3 30 22 7 ... 5 0 ] +==== Std sort ==== + elapsed time: 0.0011ms (std::chrono Measured) + [ 0 1 1 1 3 4 5 5 5 5 7 7 9 ... 30 31 ] +==== Radix sort ==== + elapsed time: 0.0009ms (std::chrono Measured) + [ 0 1 1 1 3 4 5 5 5 5 7 7 9 ... 30 31 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 0 2 1 1 1 0 3 1 3 2 2 3 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.2151ms (std::chrono Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 2 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.4586ms (std::chrono Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 1 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.5532ms (std::chrono Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 2 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.443296ms (CUDA Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 2 1 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.403328ms (CUDA Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 1 2 ] + passed +==== work-efficient compact with idx mapping, power-of-two ==== + elapsed time: 0.362304ms (CUDA Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 2 1 ] + passed +==== work-efficient compact with idx mapping, non-power-of-two ==== + elapsed time: 0.493792ms (CUDA Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 1 2 ] + passed +==== work-efficient compact with shared memory, power-of-two ==== + elapsed time: 0.394784ms (CUDA Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 2 1 ] + passed +==== work-efficient compact with shared memory, non-power-of-two ==== + elapsed time: 0.463968ms (CUDA Measured) + [ 3 2 1 1 1 3 1 3 2 2 3 3 2 ... 1 2 ] + passed +``` + + + +#### Part 1~3: + +The performance is showed in previous [image](https://github.com/Jack12xl/Project2-Stream-Compaction#implementation-comparisons). 
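+
+To make this concrete, the heart of the naive GPU scan is a single Hillis–Steele pass; the sketch below is a lightly simplified form of `kernScanStep` in `stream_compaction/naive.cu` (parameter names shortened). The host launches it `ilog2ceil(n)` times, doubling `stride` and swapping the two buffers after each pass, and shifts the result right by one element during the copy back to turn the inclusive scan into an exclusive one.
+
+```
+// One naive-scan pass: element k adds in the value `stride` slots to its left.
+// Reading from `in` and writing to `out` keeps each pass free of data races.
+__global__ void kernScanStep(int n, int stride, const int* in, int* out) {
+    int k = blockIdx.x * blockDim.x + threadIdx.x;
+    if (k >= n) return;
+    out[k] = (k >= stride) ? in[k - stride] + in[k] : in[k];
+}
+```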
+
+#### Part 4:
+
+Below are the Thrust profiling summary and timeline:
+
+![alt text](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/img/Thrust.png)
+
+![alt text](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/img/Thrust_timeline.png)
+
+#### Part 5: Why the GPU version is so slow [Extra point]
+
+Reasons why the naive GPU version is slower than the CPU version:
+
+1. **Spatial coherence:** The CPU version reads memory contiguously, while the unoptimized GPU version fetches memory non-contiguously, which leads to low effective memory bandwidth.
+2. **The input size matters:** When the input array is small (for example 2^4 elements), the **CPU** version is faster than the **GPU** version. As the size grows, the situation reverses and the **GPU** version becomes much faster, since the GPU is naturally better at processing large amounts of data.
+3. **Low occupancy:** In the unoptimized version, not all launched threads do useful work.
+
+##### Simple solution
+
+We can increase **occupancy** by mapping each thread index to an active element index, so that every launched thread has work to do (see the sketch after Part 7). We also shrink the grid dimension at each sweep depth so that we stop launching idle blocks.
+
+##### Tips:
+
+The mapping step may overflow a 32-bit integer, so we use **size_t** for the mapped thread index.
+
+#### Part 6 Radix Sort [Extra point]
+
+For simplicity, and to reduce memory copies between GPU and CPU, we implement most of the algorithm on the CPU side; only the scan step runs on the GPU, using the shared-memory version from Part 7.
+
+We compare the results with the built-in std::sort to show that the radix sort is correct.
+
+```
+*****************************
+** STREAM SORT TESTS **
+*****************************
+    [ 31 24 18 1 17 25 4 15 25 3 30 22 7 ... 5 0 ]
+The array to be sorted is :
+    [ 31 24 18 1 17 25 4 15 25 3 30 22 7 ... 5 0 ]
+==== Std sort ====
+   elapsed time: 0.0011ms (std::chrono Measured)
+    [ 0 1 1 1 3 4 5 5 5 5 7 7 9 ... 30 31 ]
+==== Radix sort ====
+   elapsed time: 0.0009ms (std::chrono Measured)
+    [ 0 1 1 1 3 4 5 5 5 5 7 7 9 ... 30 31 ]
+    passed
+
+```
+
+#### Part 7 Scan with shared memory [Extra point]
+
+As shown in the earlier figures, using shared memory boosts performance considerably, since shared memory offers much higher bandwidth than global memory.
+
+##### Implementation explanation
+
+The implementation in GPU Gems is not very robust to the input/block size, because it tries to load the whole array into a single block's shared memory. As the block size keeps increasing, it quickly exhausts the shared-memory limit (48 KB per block).
+
+Instead, we split the up-sweep and down-sweep into several passes (based on the block size), each covering a different range of sweep depths. For each pass we allocate shared memory according to the largest sub-array that pass touches.
+
+##### Detail:
+
+In our design, the shared-memory size is twice the block size. This lets us reuse the index mapping from [Part 5](https://github.com/Jack12xl/Project2-Stream-Compaction#part-5-why-gpu-version-so-slow-extra-point).
+
+We do not currently handle shared-memory bank conflicts.
+
+##### Tips
+
+The shared-memory version is prone to integer overflow, so we reduce the range of the elements in the input array.
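+
+To make the index mapping from Part 5 concrete, below is a minimal sketch of the mapped up-sweep step (adapted from `kernUpSweepIndexScaleStep` in `stream_compaction/efficient.cu`, with names shortened). At stride `d2 = 2^d` only `n / (2 * d2)` elements are active, so the host shrinks the grid so that only about that many threads are launched, and each thread computes the index of its active element:
+
+```
+// Mapped up-sweep at stride d2: thread i works on element k = 2*d2*i + 2*d2 - 1.
+// Using size_t avoids 32-bit overflow of the intermediate product for large arrays.
+__global__ void kernUpSweepMapped(int n, int d2, int* data) {
+    size_t k = (size_t)2 * d2 * (blockIdx.x * blockDim.x + threadIdx.x) + 2 * d2 - 1;
+    if (k >= n) return;
+    data[k] += data[k - d2];
+}
+```
+
+The shared-memory version of Part 7 builds on the same mapping, but each block first stages `2 * blockSize` elements into shared memory and then sweeps through several depths at once before writing back.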
+
+#### Questions:
+
+- ##### Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU.
+
+  - As discussed [here](https://github.com/Jack12xl/Project2-Stream-Compaction#optimized-block-size), we adopt a block size of 256 for both the naive and the work-efficient versions.
+
+- ##### Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis).
+
+  - The plot is shown [here](https://github.com/Jack12xl/Project2-Stream-Compaction#implementation-comparisons).
+
+- ##### Can you find the performance bottlenecks? Is it memory I/O? Computation? Is it different for each implementation?
+
+  - I believe the bottleneck is mainly memory I/O. For every implementation the computation itself is straightforward (work complexity **O(n)** or **O(n log n)**), and once shared memory is introduced to reduce I/O latency, performance improves drastically.
+
+- ##### Thrust implementation
+
+  - The timeline is depicted in this [figure](https://github.com/Jack12xl/Project2-Stream-Compaction/blob/master/img/Thrust_timeline.png).
+  - **Memory**: Thrust wraps raw device memory in thrust pointers and automatically copies data between host and device when necessary.
+
+- ##### Paste the output of the test program into a triple-backtick block in your README.
+
+  - Pasted [here](https://github.com/Jack12xl/Project2-Stream-Compaction#output-of-test-program).
diff --git a/img/Compact.svg b/img/Compact.svg
new file mode 100644
index 0000000..3783ce7
--- /dev/null
+++ b/img/Compact.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/img/SCAN.svg b/img/SCAN.svg
new file mode 100644
index 0000000..af77cd7
--- /dev/null
+++ b/img/SCAN.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/img/SCAN_for_block.svg b/img/SCAN_for_block.svg
new file mode 100644
index 0000000..9175408
--- /dev/null
+++ b/img/SCAN_for_block.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/img/Thrust.png b/img/Thrust.png
new file mode 100644
index 0000000..39487a4
Binary files /dev/null and b/img/Thrust.png differ
diff --git a/img/Thrust_timeline.png b/img/Thrust_timeline.png
new file mode 100644
index 0000000..f741759
Binary files /dev/null and b/img/Thrust_timeline.png differ
diff --git a/src/csvfile.hpp b/src/csvfile.hpp
new file mode 100644
index 0000000..5ebde7b
--- /dev/null
+++ b/src/csvfile.hpp
@@ -0,0 +1,112 @@
+#pragma once
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+
+class csvfile;
+
+inline static csvfile& endrow(csvfile& file);
+inline static csvfile& flush(csvfile& file);
+
+class csvfile
+{
+    std::ofstream fs_;
+    bool is_first_;
+    const std::string separator_;
+    const std::string escape_seq_;
+    const std::string special_chars_;
+public:
+    csvfile(const std::string filename, const std::string separator = ",")
+        : fs_()
+        , is_first_(true)
+        , separator_(separator)
+        , escape_seq_("\"")
+        , special_chars_("\"")
+    {
+        fs_.exceptions(std::ios::failbit | std::ios::badbit);
+        fs_.open(filename);
+    }
+
+    ~csvfile()
+    {
+        flush();
+        fs_.close();
+    }
+
+    void flush()
+    {
+        fs_.flush();
+    }
+
+    void endrow()
+    {
+        fs_ << std::endl;
+        is_first_ = true;
+    }
+
+    csvfile& operator << (csvfile& (*val)(csvfile&))
+    {
+        return val(*this);
+    }
+
+    csvfile& operator << (const char* val)
+    {
+        return write(escape(val));
+    }
+
+    csvfile& operator << (const std::string& val)
+    {
+        return write(escape(val));
+    }
+
+    template<typename T>
+    csvfile& operator << (const T& val)
+    {
+        return write(val);
+    }
+
+private:
+    template<typename T>
+    csvfile& write(const T& val)
+    {
+        if (!is_first_)
+        {
+            fs_ << separator_;
+        }
+        else
+        {
+            is_first_ = false;
+        }
+        fs_ << val;
+        return *this;
+    }
+
+    std::string escape(const std::string& val)
+    {
+        std::ostringstream result;
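+        // CSV escaping: wrap the value in double quotes and prefix every embedded
+        // quote character with the escape sequence (i.e. the quote is doubled).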
result << '"'; + std::string::size_type to, from = 0u, len = val.length(); + while (from < len && + std::string::npos != (to = val.find_first_of(special_chars_, from))) + { + result << val.substr(from, to - from) << escape_seq_ << val[to]; + from = to + 1; + } + result << val.substr(from) << '"'; + return result.str(); + } +}; + + +inline static csvfile& endrow(csvfile& file) +{ + file.endrow(); + return file; +} + +inline static csvfile& flush(csvfile& file) +{ + file.flush(); + return file; +} \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..f770db2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,95 +11,219 @@ #include #include #include +#include #include "testing_helpers.hpp" +#include "csvfile.hpp" +#include -const int SIZE = 1 << 8; // feel free to change the size of array + + +const int power = 16; +const int SIZE = 1 << power; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; + + int main(int argc, char* argv[]) { // Scan tests + int sort_size_power = 5; + assert(sort_size_power <= power); + int sort_size = 1 << sort_size_power; + + int num_power = 5; + printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + //onesArray(SIZE - 1, a); + genArray(SIZE - 1, a, 2); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. // At first all cases passed because b && c are all zeroes. + std::ostringstream stringStream; + stringStream << "Compact_" << power; + stringStream << "_Sort_" << sort_size_power << "_" << num_power; + stringStream << "_naiveblck_" << blocksize << "_effblck_" << efficient_blocksize; + stringStream << ".csv"; + std::string file_name = stringStream.str(); + + csvfile my_csv(file_name); + my_csv << "Compact at power " << "Sort power" << "Sort num power" << endrow; + my_csv << power << sort_size_power << num_power << endrow; + my_csv << " " << endrow; + + my_csv << "Naive block size" << "Efficient block size" << endrow; + my_csv << blocksize << efficient_blocksize << endrow; + my_csv << endrow; + + my_csv << endrow; + my_csv << "SCAN" << endrow; + my_csv << endrow; + + float cur_time; zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + cur_time = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(std::chrono Measured)"); printArray(SIZE, b, true); - + my_csv << "cpu scan p_2 " << cur_time << endrow; + // zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + cur_time = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(std::chrono Measured)"); printArray(NPOT, b, true); printCmpResult(NPOT, b, c); + my_csv << "cpu scan n_p_2 " << cur_time << endrow; zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); - 
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + cur_time = StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); + my_csv << "naive scan p_2" << cur_time << endrow; - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ + + //// For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + ///*onesArray(SIZE, c); + //printDesc("1s array for finding bugs"); + //StreamCompaction::Naive::scan(SIZE, c, a); + //printArray(SIZE, c, true); */ zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + cur_time = StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); //printArray(SIZE, c, true); printCmpResult(NPOT, b, c); + my_csv << "naive scan n_p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + StreamCompaction::Efficient::scan(SIZE, c, a, EFF_method::nonOptimization, true); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); + my_csv << "efficient scan non-optimization p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); + StreamCompaction::Efficient::scan(NPOT, c, a, EFF_method::nonOptimization, true); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + my_csv << "efficient scan non-optimization n_p_2" << cur_time << endrow; + + zeroArray(SIZE, c); + printDesc("work-efficient scan with shared memory, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a, EFF_method::sharedMemory, true); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + my_csv << "efficient scan shared p_2" << cur_time << endrow; + + zeroArray(SIZE, c); + printDesc("work-efficient scan with shared memory, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a, EFF_method::sharedMemory, true); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + my_csv << "efficient scan shared n_p_2" << cur_time << endrow; + + zeroArray(SIZE, c); + printDesc("work-efficient scan with index scale, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a, 
EFF_method::idxMapping, true); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + my_csv << "efficient scan idx p_2" << cur_time << endrow; + + zeroArray(SIZE, c); + printDesc("work-efficient scan with index scale, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a, EFF_method::idxMapping, true); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + my_csv << "efficient scan idx n_p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + cur_time = StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); + my_csv << "thrust scan p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + cur_time = StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + my_csv << "thrust scan n_p_2" << cur_time << endrow; + + printf("\n"); + printf("*****************************\n"); + printf("** STREAM SORT TESTS **\n"); + printf("*****************************\n"); + + my_csv << endrow; + my_csv << "SORT" << endrow; + my_csv << endrow; + + + genArray(sort_size - 1, a, 1 << num_power); // Leave a 0 at the end to test that edge case + a[sort_size - 1] = 0; + printArray(sort_size, a, true); + + printf("The array to be sorted is : \n"); + printArray(sort_size, a, true); + printDesc("Std sort"); + StreamCompaction::RadixSort::CpuStandardSort(sort_size, b, a); + cur_time = StreamCompaction::RadixSort::timer().getCpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(std::chrono Measured)"); + printArray(sort_size, b, true); + my_csv << "std sort" << cur_time << endrow; + + printDesc("Radix sort"); + zeroArray(sort_size, c); + StreamCompaction::RadixSort::GpuRadixSort(sort_size, c, a, num_power); + cur_time = StreamCompaction::RadixSort::timer().getCpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(std::chrono Measured)"); + printArray(sort_size, c, true); + printCmpResult(sort_size, b, c); + my_csv << "Radix sort" << cur_time << endrow; printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); printf("*****************************\n"); + my_csv << endrow; + my_csv << "CAMPACTION" << endrow; + my_csv << endrow; // Compaction tests genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case @@ -113,42 +237,109 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, b); printDesc("cpu compact without scan, power-of-two"); count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + cur_time = 
StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(std::chrono Measured)"); expectedCount = count; printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); + my_csv << "cpu campact no scan p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + cur_time = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(std::chrono Measured)"); expectedNPOT = count; printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + my_csv << "cpu campact no scan n_p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("cpu compact with scan"); count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + cur_time = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(std::chrono Measured)"); printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + my_csv << "cpu campact scan n_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + count = StreamCompaction::Efficient::compact(SIZE, c, a, EFF_method::nonOptimization); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + my_csv << "eff compact non-opt p_2" << cur_time << endrow; zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + count = StreamCompaction::Efficient::compact(NPOT, c, a, EFF_method::nonOptimization); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + my_csv << "eff compact non-opt n_p_2" << cur_time << endrow; + + zeroArray(SIZE, c); + printDesc("work-efficient compact with idx mapping, power-of-two"); + count = StreamCompaction::Efficient::compact(SIZE, c, a, EFF_method::idxMapping); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + my_csv << "eff compact idx map p_2" << cur_time << endrow; + + zeroArray(SIZE, c); + printDesc("work-efficient compact with idx mapping, non-power-of-two"); + count = StreamCompaction::Efficient::compact(NPOT, c, a, EFF_method::idxMapping); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + my_csv << "eff compact idx map n_p_2" << 
cur_time << endrow; + + zeroArray(SIZE, c); + printDesc("work-efficient compact with shared memory, power-of-two"); + count = StreamCompaction::Efficient::compact(SIZE, c, a, EFF_method::sharedMemory); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + my_csv << "eff compact shared p_2" << cur_time << endrow; + + zeroArray(SIZE, c); + printDesc("work-efficient compact with shared memory, non-power-of-two"); + count = StreamCompaction::Efficient::compact(NPOT, c, a, EFF_method::sharedMemory); + cur_time = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + printElapsedTime(cur_time, "(CUDA Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + my_csv << "eff compact shared n_p_2" << cur_time << endrow; + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; delete[] c; + // save to csv + try + { + //csvfile csv("MyTable.csv"); // throws exceptions! + //// Hearer + //csv << "X" << "VALUE" << endrow; + //// Data + //int i = 1; + /*csv << i++ << "String value" << endrow; + csv << i++ << 123 << endrow; + csv << i++ << 1.f << endrow; + csv << i++ << 1.2 << endrow; + csv << i++ << "One more string" << endrow;*/ + /*csv << i++ << "\"Escaped\"" << endrow; + csv << i++ << "=HYPERLINK(\"https://playkey.net\"; \"Playkey Service\")" << endrow;*/ + } + catch (const std::exception& ex) + { + std::cout << "Exception was thrown: " << ex.what() << std::endl; + } } diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 567795b..8656c2f 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -4,6 +4,7 @@ set(headers "naive.h" "efficient.h" "thrust.h" + "radixSort.h" ) set(sources @@ -12,6 +13,7 @@ set(sources "naive.cu" "efficient.cu" "thrust.cu" + "radixSort.cu" ) list(SORT headers) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..15ee84c 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,4 +1,5 @@ #include "common.h" +#include void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); @@ -22,17 +23,31 @@ namespace StreamCompaction { * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. */ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { + __global__ void kernMapToBoolean(int N, int *bools, const int *idata) { // TODO + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + if (k >= N) { + return; + } + bools[k] = idata[k] == 0 ? 0 : 1; } /** * Performs scatter on an array. That is, for each element in idata, * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. 
*/ - __global__ void kernScatter(int n, int *odata, + __global__ void kernScatter(int N, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + if (k >= N) { + return; + } + + if (bools[k] == 1) { + odata[indices[k]] = idata[k]; + } + } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..a07b626 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -2,6 +2,8 @@ #include "cpu.h" #include "common.h" +#include // Jack12 for assert +#include // Jack12 uses vector namespace StreamCompaction { namespace CPU { @@ -17,9 +19,31 @@ namespace StreamCompaction { * For performance analysis, this is supposed to be a simple for loop. * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ + inline void cpu_scan(const int& n, int* odata, const int* idata) { + // scan without cputimer inline + if (n == 0) { + return; + } + + assert(odata != nullptr); + assert(idata != nullptr); + + int prefix_sum = 0; + + odata[0] = 0; + for (int i = 1; i < n; i++) { + //odata[i] += idata[i - 1]; + prefix_sum += idata[i - 1]; + odata[i] = prefix_sum; + } + } + void scan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); // TODO + // should be exclusive + cpu_scan(n, odata, idata); timer().endCpuTimer(); } @@ -29,10 +53,22 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); + // TODO + if (n != 0) { + assert(odata != nullptr); + assert(idata != nullptr); + } + timer().startCpuTimer(); + int p = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[p] = idata[i]; + p++; + } + } timer().endCpuTimer(); - return -1; + return p; } /** @@ -41,10 +77,38 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); // TODO + int out = 0; + if (n == 0) { + return out; + } + + assert(odata != nullptr); + assert(idata != nullptr); + + int* bin_arr = new int[n]; + int* scan_arr = new int[n]; + timer().startCpuTimer(); + // map to 0, 1 + + for (int i = 0; i < n; i++) { + bin_arr[i] = idata[i] == 0 ? 
0 : 1; + } + // scan + + cpu_scan(n, scan_arr, bin_arr); + // odata contains the scan result + + for (int i = 0; i < n; i++) { + if (bin_arr[i]) { + out++; + odata[scan_arr[i]] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + delete[] bin_arr; + delete[] scan_arr; + return out; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..7ccb806 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,10 @@ #include #include "common.h" #include "efficient.h" +#include +#include +//#include "cis565_stream_compaction_test/testing_helpers.hpp" + namespace StreamCompaction { namespace Efficient { @@ -11,14 +15,226 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } + __global__ void kernUpdateArray(int idx, int val, int* d_data) { + d_data[idx] = val; + } + +#pragma region vanilla + __global__ void kernUpSweepStep( + int N, + int d_2, + int* d_data + ){ + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + if (k >= N) { + return; + } + if (k % (2 * d_2) == 0) { + d_data[k + 2 * d_2 - 1] += d_data[k + d_2 - 1]; + } + } + + __global__ void kernDownSweepStep( + int N, + int d_2, + int* d_data + ) { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + if (k >= N) { + return; + } + + if (k % ( d_2 * 2 )== 0) { + int tmp = d_data[k + d_2 -1]; + d_data[k + d_2 - 1] = d_data[k + 2 * d_2 - 1]; + d_data[k + 2 * d_2 - 1] = tmp + d_data[k + 2 * d_2 - 1]; + } + } +#pragma endregion + +#pragma region indexScale +// for part 5 + __global__ void kernUpSweepIndexScaleStep( + int N, + int d_2, + int* d_data + ) { + // use size_t in case overflow + size_t k = 2 * d_2 * ( (blockIdx.x * blockDim.x) + threadIdx.x) + 2 * d_2 - 1; + /*k *= 2 * d_2; + k += 2 * d_2 - 1;*/ + if (k >= N) { + return; + } + d_data[k] += d_data[k - d_2]; + } + + __global__ void kernDownSweepIndexScaleStep( + int N, + int d_2, + int* d_data + ) { + size_t k = 2 * d_2 * ((blockIdx.x * blockDim.x) + threadIdx.x) + 2 * d_2 - 1; + if (k >= N) { + return; + } + int tmp = d_data[k + - d_2]; + d_data[k - d_2] = d_data[k]; + d_data[k] = tmp + d_data[k]; + } +#pragma endregion + +#pragma region SharedMemory + __global__ void kernSharedMemoryUpSweepStep(int N, int d_2, int cur_depth, int target_depth, int* dev_idata) { + size_t t_offset = blockIdx.x * blockDim.x; + size_t t_id = threadIdx.x; + size_t k = 2 * d_2 * (t_offset + t_id) + 2 * d_2 - 1; + if (k >= N) { + return; + } + + extern __shared__ float shared[]; + shared[2 * t_id] = dev_idata[k - d_2]; + shared[2 * t_id + 1] = dev_idata[k]; + __syncthreads(); + + /*shared[2 * t_id + 1] += shared[2 * t_id];*/ + + for (int i = 0; i < target_depth - cur_depth; i++) { + int mul = 1 << (i + 1); + int idx_a = mul * (t_id + 1) - 1; + int idx_b = mul * (t_id + 1) - mul / 2 - 1; + if (idx_a < 2 * blockDim.x) { + /*int a = shared[idx_a]; + int b = shared[idx_b];*/ + shared[idx_a] += shared[idx_b]; + } + __syncthreads(); + } + + + dev_idata[k] = shared[2 * t_id + 1]; + dev_idata[k - d_2] = shared[2 * t_id]; + } + + __global__ void kernSharedMemoryDownSweepStep(int N, int d_2, int cur_depth, int target_depth, int* dev_idata) { + size_t t_offset = blockIdx.x * blockDim.x; + size_t t_id = threadIdx.x; + size_t k = 2 * d_2 * (t_offset + t_id) + 2 * d_2 - 1; + if (k >= N) { + return; + } + + extern __shared__ float shared[]; + shared[2 * t_id] = dev_idata[k - d_2]; + shared[2 * t_id + 1] = dev_idata[k]; + __syncthreads(); + for (int i = cur_depth - 1 - target_depth; i >= 0; i--) { + int mul = 1 << (i + 1); + 
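+                    // idx_a addresses the right element of the pair at this depth (it holds
+                    // the running partial sum); idx_b addresses the left element. The swap
+                    // below is the classic down-sweep: left takes the old right value and
+                    // right accumulates left + right.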
int idx_a = mul * (t_id + 1) - 1; + int idx_b = mul * (t_id + 1) - mul / 2 - 1; + if (idx_a < 2 * blockDim.x) { + /*int a = shared[idx_a]; + int b = shared[idx_b];*/ + int tmp = shared[idx_b]; + shared[idx_b] = shared[idx_a]; + shared[idx_a] += tmp; + } + __syncthreads(); + } + + dev_idata[k - d_2] = shared[2 * t_id]; + dev_idata[k] = shared[2 * t_id + 1]; + } + #pragma endregion /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + void scan(int n, int *odata, const int *idata, EFF_method cur_method, bool ifTimer = true ) { + if (n == 0) { + return; + } + assert(odata != nullptr); + assert(idata != nullptr); + + int log_n = ilog2ceil(n); + int n_2 = 1 << log_n; + + int* dev_idata; + dim3 blocksPerGrid = (n_2 + efficient_blocksize - 1) / efficient_blocksize; + /*int* dev_odata;*/ + cudaMalloc((void**)&dev_idata, n_2 * sizeof(int)); + /*cudaMalloc((void**)&dev_odata, n_2 * sizeof(int));*/ + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + if (ifTimer) { + timer().startGpuTimer(); + } + // TODO - timer().endGpuTimer(); + if (cur_method == EFF_method::sharedMemory) { + int unroll_depth = ilog2ceil(efficient_blocksize); + + for (int cur_depth = 0; cur_depth < log_n; cur_depth += unroll_depth) { + int d = cur_depth; + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + int target_depth = std::min(cur_depth + unroll_depth, log_n); // log_n exclusive + kernSharedMemoryUpSweepStep << > > (n_2, 1 << d, cur_depth, target_depth, dev_idata); + } + } + else { + for (int d = 0; d <= log_n - 1; d++) { + if (cur_method == EFF_method::idxMapping) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernUpSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); + } + /*else if (ifSharedMemory) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernSharedMemoryUpSweepStep <<>> (n_2, 1 << d, dev_idata); + }*/ + else { + // non optimization + kernUpSweepStep << > > (n_2, 1 << d, dev_idata); + } + } + } + + + kernUpdateArray << <1, 1 >> > (n_2 - 1, 0, dev_idata); + + if (cur_method == EFF_method::sharedMemory) { + int unroll_depth = ilog2ceil(efficient_blocksize); + for (int cur_depth = log_n; cur_depth > 0; cur_depth -= unroll_depth) { + int target_depth = std::max(0, cur_depth - unroll_depth); + int d = target_depth; + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernSharedMemoryDownSweepStep << > > (n_2, 1 << d, cur_depth, target_depth, dev_idata); + } + } + else { + for (int d = log_n - 1; d >= 0; d--) { + if (cur_method == EFF_method::idxMapping) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernDownSweepIndexScaleStep << > > (n_2, 1 << d, dev_idata); + } + /*else if (ifSharedMemory) { + blocksPerGrid = (n_2 / (1 << (1 + d)) + efficient_blocksize - 1) / efficient_blocksize; + kernSharedMemoryDownSweepStep << > > (n_2, 1 << d, dev_idata); + }*/ + else { + kernDownSweepStep << > > (n_2, 1 << d, dev_idata); + } + } + } + + if (ifTimer) { + timer().endGpuTimer(); + } + + cudaMemcpy(odata, dev_idata,n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_idata); } /** @@ -30,11 +246,102 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. 
*/ - int compact(int n, int *odata, const int *idata) { + int compact(int N, int *odata, const int *idata, EFF_method cur_method) { + if (N == 0) { + return 0; + } + assert(odata != nullptr); + assert(idata != nullptr); + + int* dev_idata; + int* dev_odata; + int* dev_bools; + int* dev_indices; + + cudaMalloc((void**)&dev_idata, N * sizeof(int)); + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + cudaMalloc((void**)&dev_bools, N * sizeof(int)); + cudaMalloc((void**)&dev_indices, N * sizeof(int)); + + cudaMemcpy(dev_idata, idata, N * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); // TODO + dim3 blocksPerGrid = (N + efficient_blocksize - 1) / efficient_blocksize; + Common::kernMapToBoolean << > > (N, dev_bools, dev_idata); + + scan(N, dev_indices, dev_bools, cur_method, false); + + Common::kernScatter << > > ( + N, + dev_odata, + dev_idata, + dev_bools, + dev_indices + ); + + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata, N * sizeof(int), cudaMemcpyDeviceToHost); + int* hst_bools = new int[N]; + cudaMemcpy(hst_bools, dev_bools, N * sizeof(int), cudaMemcpyDeviceToHost); + int out = 0; + for (int i = 0; i < N; i++) { + if (hst_bools[i] == 1) { + out++; + } + } + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bools); + cudaFree(dev_indices); + + return out; } } + + + + // ref :: gpu gem + __global__ void prescan(float* g_odata, float* g_idata, int n) { + extern __shared__ float temp[]; // allocated on invocation + int thid = threadIdx.x; int offset = 1; + temp[2 * thid] = g_idata[2 * thid]; // load input into shared memory + temp[2*thid+1] = g_idata[2*thid+1]; + + for (int d = n >> 1; d > 0; d >>= 1) // build sum in place up the tree + { + __syncthreads(); + if (thid < d) + { + int ai = offset * (2 * thid + 1) - 1; + int bi = offset * (2 * thid + 2) - 1; + temp[bi] += temp[ai]; + } + offset *= 2; + } + + + if (thid == 0) { temp[n - 1] = 0; } // clear the last element + + for (int d = 1; d < n; d *= 2) // traverse down tree & build scan + { + offset >>= 1; + __syncthreads(); + if (thid < d){ + int ai = offset * (2 * thid + 1) - 1; + int bi = offset * (2 * thid + 2) - 1; + + float t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + __syncthreads(); + + g_odata[2 * thid] = temp[2 * thid]; // write results to device memory g_odata[2*thid+1] = temp[2*thid+1]; + + } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..168ca4d 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,12 +2,20 @@ #include "common.h" +constexpr int efficient_blocksize = 256; + +enum class EFF_method { + nonOptimization, + idxMapping, + sharedMemory +}; + namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, EFF_method cur_method, bool ifTimer); - int compact(int n, int *odata, const int *idata); + int compact(int n, int *odata, const int *idata, EFF_method cur_method); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..13d5821 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,6 +2,11 @@ #include #include "common.h" #include "naive.h" +#include +#include // Jack12 for assert + +//#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) + namespace StreamCompaction { namespace Naive { @@ -12,14 +17,92 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + 
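+        // One pass of the naive (Hillis–Steele) scan: thread k with k >= d_phase adds
+        // the element d_phase positions to its left. Reading from dev_buf_0 and writing
+        // to dev_buf_1 keeps each pass race-free; the host swaps the buffers between passes.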
__global__ void kernScanStep( + int n, + int d_phase, + const int* dev_buf_0, + int* dev_buf_1) { + + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + if (k >= n) { + return; + } + if (k >= d_phase) { + dev_buf_1[k] = dev_buf_0[k - d_phase] + dev_buf_0[k]; + } + else { + dev_buf_1[k] = dev_buf_0[k]; + } + return; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + // + if (n == 0) { + return; + } + assert(odata != nullptr); + assert(odata != nullptr); + + int* device_buf_0; + int* device_buf_1; + cudaMalloc((void**)&device_buf_0, n * sizeof(int)); + cudaMalloc((void**)&device_buf_1, n * sizeof(int)); + + //int* device_idata; + cudaMemcpy(device_buf_0, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(device_buf_1, idata, n * sizeof(int), cudaMemcpyHostToDevice); timer().startGpuTimer(); + // TODO + // to device + int it_ceil = ilog2ceil(n); + dim3 blocksPerGrid = (n + blocksize - 1) / blocksize; + + int offset; + for (int d = 1; d <= it_ceil; d++) { + offset = (int)std::pow(2, d - 1); + StreamCompaction::Naive::kernScanStep<<>>(n, offset, device_buf_0, device_buf_1); + //cudaMemcpy(device_buf_0, device_buf_1, n* sizeof(int), cudaMemcpyDeviceToDevice); + std::swap(device_buf_0, device_buf_1); + } + + timer().endGpuTimer(); + cudaThreadSynchronize(); + // to exclusive + cudaMemcpy(odata + 1, device_buf_0, (n-1) * sizeof(int), cudaMemcpyDeviceToHost); + //cudaMemcpy(odata, device_buf_0, n * sizeof(int), cudaMemcpyDeviceToHost); + /*std::memmove(odata + 1, odata, n-1); + odata[0] = 0;*/ + cudaFree(device_buf_0); + cudaFree(device_buf_1); } + } } + +__global__ void scan(float* g_odata, float* g_idata, int n) { + extern __shared__ float temp[]; // allocated on invocation + int thid = threadIdx.x; + int pout = 0, pin = 1; // Load input into shared memory. + // This is exclusive scan, so shift right by one + // and set first element to 0 + temp[pout * n + thid] = (thid > 0) ? 
g_idata[thid - 1] : 0; + __syncthreads(); + for (int offset = 1; offset < n; offset *= 2) + { + pout = 1 - pout; + // swap double buffer indices + pin = 1 - pout; + if (thid >= offset) + temp[pout * n + thid] += temp[pin * n + thid - offset]; + else + temp[pout * n + thid] = temp[pin * n + thid]; + __syncthreads(); + } + g_odata[thid] = temp[pout * n + thid]; // write output +} diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb06..f0d94c2 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -2,6 +2,8 @@ #include "common.h" +constexpr int blocksize = 256; + namespace StreamCompaction { namespace Naive { StreamCompaction::Common::PerformanceTimer& timer(); diff --git a/stream_compaction/radixSort.cu b/stream_compaction/radixSort.cu new file mode 100644 index 0000000..995d499 --- /dev/null +++ b/stream_compaction/radixSort.cu @@ -0,0 +1,90 @@ +#include "efficient.h" +#include +#include +#include "radixSort.h" +#include +#include +#include +#include + +int k_th_bit(int k, int n) { + return (n >> k) & 1; +} + +namespace StreamCompaction { + namespace RadixSort { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + void CpuStandardSort(const int& N, int* out, const int* in) { + if (N == 0) { + return; + } + assert(in != nullptr); + assert(out != nullptr); + + std::vector a_vec(in, in + N); + + timer().startCpuTimer(); + std::sort(a_vec.begin(), a_vec.end()); + timer().endCpuTimer(); + + std::copy(a_vec.begin(), a_vec.end(), out); + } + + void GpuRadixSort(const int& N, int* hst_out, const int* hst_in, const int max_bit ){ + // + if (N == 0) { + return; + } + assert(hst_in != nullptr); + assert(hst_out != nullptr); + + /*int* dev_in, dev_out, dev_out_buf; + cudaMalloc((void**)&dev_in, N * sizeof(int)); + cudaMalloc((void**)&dev_out, N * sizeof(int)); + cudaMalloc((void**)&dev_out_buf, N * sizeof(int)); + cudaMemcpy(dev_in, hst_in, N * sizeof(int), cudaMemcpyHostToDevice);*/ + + int* hst_e,* hst_f,* hst_d; + int* hst_out_buf; + hst_e = new int[N]; + hst_f = new int[N]; + hst_d = new int[N]; + + hst_out_buf = new int[N]; + std::copy(hst_in, hst_in + N, hst_out_buf); + + timer().startGpuTimer(); + for (int k = 0; k < max_bit; k ++) { + for (int i = 0; i < N; i++) { + hst_e[i] = 1 - k_th_bit(k, hst_out_buf[i]); + } + + Efficient::scan(N, hst_f, hst_e, EFF_method::sharedMemory, false); + + int total_falses = hst_e[N - 1] + hst_f[N - 1]; + + for (int i = 0; i < N; i++) { + hst_d[i] = hst_e[i] == 0 ? 
(i - hst_f[i] + total_falses) : hst_f[i]; + } + + for (int i = 0; i < N; i++) { + hst_out[hst_d[i]] = hst_out_buf[i]; + } + std::copy(hst_out, hst_out + N, hst_out_buf); + } + + timer().endGpuTimer(); + + delete[] hst_e; + delete[] hst_f; + delete[] hst_d; + delete[] hst_out_buf; + } + } +} diff --git a/stream_compaction/radixSort.h b/stream_compaction/radixSort.h new file mode 100644 index 0000000..ea918b9 --- /dev/null +++ b/stream_compaction/radixSort.h @@ -0,0 +1,13 @@ +#pragma once +#include "common.h" +constexpr int radix_blocksize = 256; + +namespace StreamCompaction { + namespace RadixSort { + StreamCompaction::Common::PerformanceTimer& timer(); + + void CpuStandardSort(const int& N, int* out, const int* in); + + void GpuRadixSort(const int& N, int* out, const int* in, const int max_bit); + } +} \ No newline at end of file diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..7722cd8 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -17,12 +17,37 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int N, int *odata, const int *idata) { + if (N == 0) { + return; + } + assert(odata != nullptr); + assert(idata != nullptr); + int* dev_idata; int* dev_odata; + + cudaMalloc((void**)&dev_idata, N * sizeof(int)); + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + /*thrust::host_vector thrust_hst_odata_vec = thrust*/ + + cudaMemcpy(dev_idata, idata, N * sizeof(int), cudaMemcpyHostToDevice); + + thrust::device_ptr dev_thrust_idata = thrust::device_pointer_cast(dev_idata); + //thrust::device_vector< int > dev_thrust_idata_vec(idata, idata + N); + thrust::device_vector dev_thrust_idata_vec(dev_thrust_idata, dev_thrust_idata + N); + + thrust::device_ptr dev_thrust_odata = thrust::device_pointer_cast(dev_odata); + thrust::device_vector dev_thrust_odata_vec(dev_thrust_odata, dev_thrust_odata + N); + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_thrust_idata_vec.begin(), dev_thrust_idata_vec.end(), dev_thrust_odata_vec.begin()); timer().endGpuTimer(); + + thrust::copy(dev_thrust_odata_vec.begin(), dev_thrust_odata_vec.end(), odata); + /*cudaFree(dev_idata); + cudaFree(dev_odata);*/ } } }
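+// Note: the device_vector constructors above copy the input and output buffers on the
+// device. thrust::exclusive_scan can also be called directly on the thrust::device_ptr
+// range (dev_thrust_idata, dev_thrust_idata + N), which would avoid those extra copies.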