diff --git a/README.md b/README.md index a82ea0f..22c3675 100644 --- a/README.md +++ b/README.md @@ -3,211 +3,55 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Ziye Zhou +* Tested on: Windows 8.1, i7-4910 @ 2.90GHz 32GB, GTX 880M 8192MB (Alienware) -### (TODO: Your README) - -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) - -Instructions (delete me) -======================== - -This is due Sunday, September 13 at midnight. - -**Summary:** In this project, you'll implement GPU stream compaction in CUDA, -from scratch. This algorithm is widely used, and will be important for -accelerating your path tracer project. - -Your stream compaction implementations in this project will simply remove `0`s -from an array of `int`s. In the path tracer, you will remove terminated paths -from an array of rays. - -In addition to being useful for your path tracer, this project is meant to -reorient your algorithmic thinking to the way of the GPU. On GPUs, many -algorithms can benefit from massive parallelism and, in particular, data -parallelism: executing the same code many times simultaneously with different -data. - -You'll implement a few different versions of the *Scan* (*Prefix Sum*) -algorithm. First, you'll implement a CPU version of the algorithm to reinforce -your understanding. Then, you'll write a few GPU implementations: "naive" and -"work-efficient." Finally, you'll use some of these to implement GPU stream -compaction. - -**Algorithm overview & details:** There are two primary references for details -on the implementation of scan and stream compaction. - -* The [slides on Parallel Algorithms](https://github.com/CIS565-Fall-2015/cis565-fall-2015.github.io/raw/master/lectures/2-Parallel-Algorithms.pptx) - for Scan, Stream Compaction, and Work-Efficient Parallel Scan. -* GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html). - -Your GPU stream compaction implementation will live inside of the -`stream_compaction` subproject. This way, you will be able to easily copy it -over for use in your GPU path tracer. - - -## Part 0: The Usual - -This project (and all other CUDA projects in this course) requires an NVIDIA -graphics card with CUDA capability. Any card with Compute Capability 2.0 -(`sm_20`) or greater will work. Check your GPU on this -[compatibility table](https://developer.nvidia.com/cuda-gpus). -If you do not have a personal machine with these specs, you may use those -computers in the Moore 100B/C which have supported GPUs. - -**HOWEVER**: If you need to use the lab computer for your development, you will -not presently be able to do GPU performance profiling. This will be very -important for debugging performance bottlenecks in your program. - -### Useful existing code - -* `stream_compaction/common.h` - * `checkCUDAError` macro: checks for CUDA errors and exits if there were any. - * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer. -* `main.cpp` - * Some testing code for your implementations. - - -## Part 1: CPU Scan & Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/cpu.cu`, implement: - -* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum. 
-* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using - the `scan` function. -* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan` - function. Map the input array to an array of 0s and 1s, scan it, and use - scatter to produce the output. You will need a **CPU** scatter implementation - for this (see slides or GPU Gems chapter for an explanation). - -These implementations should only be a few lines long. - - -## Part 2: Naive GPU Scan Algorithm - -In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan` - -This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. We haven't yet -taught shared memory, and you **shouldn't use it yet**. Example 39-1 uses -shared memory, but is limited to operating on very small arrays! Instead, write -this using global memory only. As a result of this, you will have to do -`ilog2ceil(n)` separate kernel invocations. - -Beware of errors in Example 39-1 in the book; both the pseudocode and the CUDA -code in the online version of Chapter 39 are known to have a few small errors -(in superscripting, missing braces, bad indentation, etc.) - -Since the parallel scan algorithm operates on a binary tree structure, it works -best with arrays with power-of-two length. Make sure your implementation works -on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory -- your intermediate array sizes will need to be rounded to the next power of -two. - - -## Part 3: Work-Efficient GPU Scan & Stream Compaction - -### 3.1. Scan - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::scan` - -All of the text in Part 2 applies. - -* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2. -* Beware of errors in Example 39-2. -* Test non-power-of-two sized arrays. - -### 3.2. Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::compact` - -For compaction, you will also need to implement the scatter algorithm presented -in the slides and the GPU Gems chapter. - -In `stream_compaction/common.cu`, implement these for use in `compact`: - -* `StreamCompaction::Common::kernMapToBoolean` -* `StreamCompaction::Common::kernScatter` - - -## Part 4: Using Thrust's Implementation - -In `stream_compaction/thrust.cu`, implement: - -* `StreamCompaction::Thrust::scan` - -This should be a very short function which wraps a call to the Thrust library -function `thrust::exclusive_scan(first, last, result)`. - -To measure timing, be sure to exclude memory operations by passing -`exclusive_scan` a `thrust::device_vector` (which is already allocated on the -GPU). You can create a `thrust::device_vector` by creating a -`thrust::host_vector` from the given pointer, then casting it. - - -## Part 5: Radix Sort (Extra Credit) (+10) - -Add an additional module to the `stream_compaction` subproject. Implement radix -sort using one of your scan implementations. Add tests to check its correctness. - - -## Write-up - -1. Update all of the TODOs at the top of this README. -2. Add a description of this project including a list of its features. -3. Add your performance analysis (see below). - -All extra credit features must be documented in your README, explaining its -value (with performance comparison, if applicable!) and showing an example how -it works. For radix sort, show how it is called and an example of its output. 
-
-Always profile with Release mode builds and run without debugging.
+### Description
+1. Implemented CPU Scan & Stream Compaction
+2. Implemented Naive GPU Scan & Stream Compaction
+3. Implemented Work-Efficient GPU Scan & Stream Compaction
+4. Tested Thrust's Implementation
+5. Implemented Radix Sort (Extra Credit)
+6. Compared the performance of the CPU & GPU Scan implementations
 
 ### Questions
 
-* Roughly optimize the block sizes of each of your implementations for minimal
-  run time on your GPU.
-  * (You shouldn't compare unoptimized implementations to each other!)
 
 * Compare all of these GPU Scan implementations (Naive, Work-Efficient, and
   Thrust) to the serial CPU version of Scan. Plot a graph of the comparison
   (with array size on the independent axis).
-  * You should use CUDA events for timing. Be sure **not** to include any
-    explicit memory operations in your performance measurements, for
-    comparability.
-  * To guess at what might be happening inside the Thrust implementation, take
-    a look at the Nsight timeline for its execution.
+![alt tag](https://github.com/ziyezhou-Jerry/Project2-Stream-Compaction/blob/master/proj2_compare.png?raw=true)
+![alt tag](https://github.com/ziyezhou-Jerry/Project2-Stream-Compaction/blob/master/proj2_thrust_running.png?raw=true)
 
 * Write a brief explanation of the phenomena you see here.
   * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is
     it different for each implementation?
+
+From the performance analysis below, I think the bottleneck is memory I/O. As the timeline shows, computation accounts for only a small portion of the total time, while memory I/O takes most of it. This is nearly the same for all implementations.
+
+![alt tag](https://github.com/ziyezhou-Jerry/Project2-Stream-Compaction/blob/master/proj2_bottleneck_analysis.png?raw=true)
 
 * Paste the output of the test program into a triple-backtick block in your
   README.
-  * If you add your own tests (e.g. for radix sort or to test additional corner
-    cases), be sure to mention it explicitly.
-
-These questions should help guide you in performance analysis on future
-assignments, as well.
 
-## Submit
+Apart from the given test cases, I also added my own test case for radix sort: I use the STL `std::sort` to produce a CPU reference result and compare it against the GPU radix sort.
+
+<<>>
+
+### Extra Credit
+I have also implemented radix sort. Inside the method, I use Thrust's `exclusive_scan` to compute the prefix sums. The comparison with the CPU STL sort is shown below:
+
+![alt tag](https://github.com/ziyezhou-Jerry/Project2-Stream-Compaction/blob/master/proj2_extra_output.png?raw=true)
+
+The input is generated manually by these two for loops:
+``` C++
+	int m_array[M_SIZE];
+	int m_out[M_SIZE];
+	for (int i = 0; i < M_SIZE / 2; i++)
+	{
+		m_array[i] = M_SIZE / 2 - i;
+	}
+	for (int i = M_SIZE / 2; i < M_SIZE; i++)
+	{
+		m_array[i] = i - M_SIZE / 2;
+	}
+```
+The output is the sorted array, as shown in the screenshot above.
 
-If you have modified any of the `CMakeLists.txt` files at all (aside from the
-list of `SOURCE_FILES`), you must test that your project can build in Moore
-100B/C. Beware of any build issues discussed on the Google Group.
 
-1. Open a GitHub pull request so that we can see that you have finished.
-   The title should be "Submission: YOUR NAME".
-2. 
Send an email to the TA (gmail: kainino1+cis565@) with: - * **Subject**: in the form of `[CIS565] Project 2: PENNKEY` - * Direct link to your pull request on GitHub - * In the form of a grade (0-100+) with comments, evaluate your own - performance on the project. - * Feedback on the project itself, if any. diff --git a/bin_new/Activity1.nvact b/bin_new/Activity1.nvact new file mode 100644 index 0000000..b9697f2 --- /dev/null +++ b/bin_new/Activity1.nvact @@ -0,0 +1,125 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/proj2_bottleneck_analysis.png b/proj2_bottleneck_analysis.png new file mode 100644 index 0000000..e7b8ab7 Binary files /dev/null and b/proj2_bottleneck_analysis.png differ diff --git a/proj2_compare.png b/proj2_compare.png new file mode 100644 index 0000000..7b94e5c Binary files /dev/null and b/proj2_compare.png differ diff --git a/proj2_data.xlsx b/proj2_data.xlsx new file mode 100644 index 0000000..b6ec519 Binary files /dev/null and b/proj2_data.xlsx differ diff --git a/proj2_extra_output.png b/proj2_extra_output.png new file mode 100644 index 0000000..de3db9f Binary files /dev/null and b/proj2_extra_output.png differ diff --git a/proj2_runningTime.png b/proj2_runningTime.png new file mode 100644 index 0000000..6277c47 Binary files /dev/null and b/proj2_runningTime.png differ diff --git a/proj2_testing_output.png b/proj2_testing_output.png new file mode 100644 index 0000000..47a32e4 Binary files /dev/null and b/proj2_testing_output.png differ diff --git a/proj2_thrust_running.png b/proj2_thrust_running.png new file mode 100644 index 0000000..5d113e9 Binary files /dev/null and b/proj2_thrust_running.png differ diff --git a/src/main.cpp b/src/main.cpp index 675da35..43d1ec1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,70 +11,173 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include #include "testing_helpers.hpp" + +#define M_SIZE 2000 + + +//help function +void tick() +{ + +} + +void tock() +{ + +} + + + int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; - const int NPOT = SIZE - 3; - int a[SIZE], b[SIZE], c[SIZE]; + + //for (int m_i = 8; m_i < 16; m_i++) + //{ - // Scan tests + int SIZE = 1 << 8; + const int NPOT = SIZE - 3; + int* a = new int[SIZE]; + int*b = new int[SIZE]; + int*c = new int[SIZE]; - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printArray(SIZE, b, true); + int m_array[M_SIZE]; + int m_out[M_SIZE]; + for (int i = 0; i < M_SIZE / 2; i++) + { + m_array[i] = M_SIZE / 2 - i; + } + for (int i = M_SIZE / 2; i < M_SIZE; i++) + { + m_array[i] = i - M_SIZE / 2; + } - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); + std::vector m_array_vec(m_array, m_array + M_SIZE); - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - zeroArray(SIZE, c); - 
printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); + //cuda event init + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + float milliseconds = 0; - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); + // === Sort Test === + printf("\n"); + printf("*****************************\n"); + printf("** Radix Sort TESTS **\n"); + printf("*****************************\n"); + printDesc("radix sort using GPU"); - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); + + StreamCompaction::RadixSort::radixsort(M_SIZE, m_out, m_array); + + + + printDesc("sort using CPU"); + + + cudaEventRecord(start); + std::sort(m_array_vec.begin(), m_array_vec.end()); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + + std::cout << "printf: " << milliseconds << '\n'; + + + + + printArray(60, m_out, true); + printArray(60, &m_array_vec[0], true); + + // === Scan tests ==== + + printf("\n"); + printf("****************\n"); + printf("** SCAN TESTS **\n"); + printf("****************\n"); + + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + zeroArray(SIZE, b); + printDesc("cpu scan, power-of-two"); + StreamCompaction::CPU::scan(SIZE, b, a); + //printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("cpu scan, non-power-of-two"); + StreamCompaction::CPU::scan(NPOT, c, a); + //printArray(NPOT, b, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan, power-of-two"); + StreamCompaction::Naive::scan(SIZE, c, a); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan, non-power-of-two"); + StreamCompaction::Naive::scan(NPOT, c, a); + //printArray(SIZE, c, true); + printCmpResult(NPOT, b, c); + + + zeroArray(SIZE, c); + printDesc("work-efficient scan, non-power-of-two"); + StreamCompaction::Efficient::scan(NPOT, c, a); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + + zeroArray(SIZE, c); + printDesc("work-efficient scan, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + + zeroArray(SIZE, c); + printDesc("work-efficient share_mem scan, power-of-two"); + StreamCompaction::Efficient::scan_share_mem(SIZE, c, a); + /*printArray(SIZE, a, true); + printArray(SIZE, c, true); + printArray(SIZE, b, true);*/ + printCmpResult(SIZE, b, c); + + + zeroArray(SIZE, c); + printDesc("work-efficient share_mem scan, non-power-of-two"); + StreamCompaction::Efficient::scan_share_mem(NPOT, c, a); + printCmpResult(NPOT, b, c); + + + zeroArray(SIZE, c); + printDesc("thrust scan, power-of-two"); + StreamCompaction::Thrust::scan(SIZE, c, a); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, 
c); + + zeroArray(SIZE, c); + printDesc("thrust scan, non-power-of-two"); + StreamCompaction::Thrust::scan(NPOT, c, a); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + //} printf("\n"); printf("*****************************\n"); @@ -93,20 +196,20 @@ int main(int argc, char* argv[]) { printDesc("cpu compact without scan, power-of-two"); count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); expectedCount = count; - printArray(count, b, true); + // printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); zeroArray(SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); expectedNPOT = count; - printArray(count, c, true); + //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); zeroArray(SIZE, c); printDesc("cpu compact with scan"); count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printArray(count, c, true); + //printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); @@ -120,4 +223,20 @@ int main(int argc, char* argv[]) { count = StreamCompaction::Efficient::compact(NPOT, c, a); //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient share mem compact, power-of-two"); + count = StreamCompaction::Efficient::compact_share_mem(SIZE, c, a); + //printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient share mem compact, non-power-of-two"); + count = StreamCompaction::Efficient::compact_share_mem(NPOT, c, a); + //printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + + + //system("pause"); } diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..1c841bf 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -9,6 +9,8 @@ set(SOURCE_FILES "efficient.cu" "thrust.h" "thrust.cu" + "radixsort.h" + "radixsort.cu" ) cuda_add_library(stream_compaction diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..ceeac30 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,19 @@ namespace Common { * which map to 0 will be removed, and elements which map to 1 will be kept. 
*/ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < n) + { + if (idata[index] == 0) + { + bools[index] = 0; + } + else + { + bools[index] = 1; + } + } } /** @@ -32,7 +44,15 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index +#include +#include +#include #include "cpu.h" namespace StreamCompaction { @@ -9,7 +12,28 @@ namespace CPU { */ void scan(int n, int *odata, const int *idata) { // TODO - printf("TODO\n"); + + + //cuda event init + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + float milliseconds = 0; + + cudaEventRecord(start); + + odata[0] = 0; + for (int i = 1; i #include +#include #include "common.h" #include "efficient.h" + +// used for avoid bank conflict +#define NUM_BANKS 16 +#define LOG_NUM_BANKS 4 +#define CONFLICT_FREE_OFFSET(n) ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS)) + + + +int* dev_array; + namespace StreamCompaction { -namespace Efficient { + namespace Efficient { -// TODO: __global__ + __global__ void kern_up_sweep(int n, int m_power, int* x) //m_power = 2^d + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); -} + index = index *m_power * 2 -1; -/** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. - */ -int compact(int n, int *odata, const int *idata) { - // TODO - return -1; -} + if (index > 0 && index < n ) //&& ((index + 1) % (m_power * 2) == 0) + { + x[index] = x[index] + x[index - m_power]; + } + } + + __global__ void kern_down_sweep(int n, int m_power, int* x) //m_power = 2^(log2(n)-1-d) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + index = index* m_power * 2 -1; + + if (index>0 && index < n ) //&& ((index + 1) % (m_power * 2) == 0) + { + int tmp = x[index]; + x[index] = x[index] + x[index - m_power]; //sum + x[index - m_power] = tmp; //swap + } + + } + + __global__ void kern_set_value(int index,int val,int* x) + { + x[index] = val; + } + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+ */ + void scan(int n, int *odata, const int *idata) { + // up sweep + + int m_power = ilog2ceil(n); + int new_n = pow(2, m_power); + + dim3 threadsPerBlock(blockSize); + dim3 fullBlocksPerGrid((new_n + blockSize - 1) / blockSize); + + //init the array + cudaMalloc((void**)&dev_array, new_n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_array1 failed!"); + + cudaMemset(dev_array, 0, new_n*sizeof(int)); + checkCUDAErrorFn("cudaMemset dev_array failed!"); + + cudaMemcpy(dev_array, idata, n*sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorFn("cudaMemcpy dev_array failed!"); + + + int* pow_2_d =new int[m_power]; + int* pow_2_log2n_minus_d = new int[m_power]; + for (int d = 0; d < m_power; d++) + { + pow_2_d[d] = pow(2, d); + + int nn = m_power - 1 - d; + + pow_2_log2n_minus_d[d] = pow(2, nn); + + } + + + + //cuda event init + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + float milliseconds = 0; + + cudaEventRecord(start); + + //up sweep + for (int d = 0; d < m_power; d++) + { + //int pow_2_d = pow(2, d); + kern_up_sweep << > >(new_n, pow_2_d[d], dev_array); + } + + //down sweep + + kern_set_value << <1, 1 >> > (new_n - 1, 0, dev_array); //insert 0 + + + for (int d = 0; d < m_power; d++) + { + /*int nn = m_power - 1 - d; + int pow_2_log2n_minus_d = pow(2, nn);*/ + + kern_down_sweep << > >(new_n, pow_2_log2n_minus_d[d], dev_array); + } + + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + std::cout << "efficient method: " << milliseconds << "ms" << std::endl; + + //copy data + cudaMemcpy(odata, dev_array, n*sizeof(int), cudaMemcpyDeviceToHost); + + } + + + + + + + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. 
+ */ + int compact(int n, int *odata, const int *idata) { + + dim3 threadsPerBlock(blockSize); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + //copy data to device + int* dev_idata; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_idata failed!"); + + cudaMemset(dev_idata, 0, n*sizeof(int)); + checkCUDAErrorFn("cudaMemset dev_idata failed!"); + + cudaMemcpy(dev_idata, idata, n*sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorFn("cudaMemcpy dev_idata failed!"); + + + // map the idata to bools + int* dev_bools; + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_bools failed!"); + + cudaMemset(dev_bools, 0, n*sizeof(int)); + checkCUDAErrorFn("cudaMemset dev_bools failed!"); + + StreamCompaction::Common::kernMapToBoolean << > >(n, dev_bools, dev_idata); + + //scan the bools to get the indices + + + int* host_bools = new int[n]; + cudaMemcpy(host_bools, dev_bools, n*sizeof(int), cudaMemcpyDeviceToHost); + int* host_indices = new int[n]; + + scan(n, host_indices, host_bools); //input is host data + + int* dev_indices; + cudaMalloc((void**)&dev_indices, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_indices failed!"); + + cudaMemcpy(dev_indices, host_indices, n*sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorFn("cudaMemcpy dev_indices failed!"); + + //run scatter + int* dev_odata; + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_bools failed!"); + + cudaMemset(dev_odata, 0, n*sizeof(int)); + checkCUDAErrorFn("cudaMemset dev_bools failed!"); + + StreamCompaction::Common::kernScatter << < fullBlocksPerGrid, threadsPerBlock >> > (n, dev_odata,dev_idata,dev_bools,dev_indices); + + //copy back to host + cudaMemcpy(odata, dev_odata, n*sizeof(int), cudaMemcpyDeviceToHost); + + return host_indices[n - 1]+host_bools[n-1]; //num of non-zero + + + + } + + + __global__ void kern_prescan(int *g_idata, int n) + { + extern __shared__ int temp[]; // allocated on invocation + int thid = threadIdx.x; + int offset = 1; + + if (thid < n) + { + temp[thid] = g_idata[thid]; // load input into shared memory + + + for (int d = n >> 1; d > 0; d >>= 1) // build sum in place up the tree + { + __syncthreads(); + if (thid < d) + { + int ai = offset*(2 * thid + 1) - 1; + int bi = offset*(2 * thid + 2) - 1; + + temp[bi] += temp[ai]; + + } + offset *= 2; + } + + if (thid == 0) + { + temp[n - 1] = 0; + } // clear the last element + + + for (int d = 1; d < n; d *= 2) // traverse down tree & build scan + { + offset >>= 1; + __syncthreads(); + if (thid < d) + { + int ai = offset*(2 * thid + 1) - 1; + int bi = offset*(2 * thid + 2) - 1; + + + int t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + + __syncthreads(); //make sure all threads are done with writing result + + g_idata[thid] = temp[thid]; // write results to device memory + + } + } + + + + + void scan_share_mem(int n, int *odata, const int *idata) + { + + int m_power = ilog2ceil(n); + int new_n = pow(2, m_power); + + dim3 threadsPerBlock(512); + //dim3 fullBlocksPerGrid((new_n + blockSize - 1) / blockSize); + + //init the array + cudaMalloc((void**)&dev_array, new_n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_array1 failed!"); + + cudaMemset(dev_array, 0, new_n*sizeof(int)); + checkCUDAErrorFn("cudaMemset dev_array failed!"); + + cudaMemcpy(dev_array, idata, n*sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorFn("cudaMemcpy dev_array failed!"); + + + //cuda event init + cudaEvent_t start, stop; + 
cudaEventCreate(&start); + cudaEventCreate(&stop); + float milliseconds = 0; + + cudaEventRecord(start); + + //invoke prescan + kern_prescan << <1,threadsPerBlock, new_n * sizeof(int)>> >(dev_array , new_n); + + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + std::cout << "efficient method: " << milliseconds << "ms" << std::endl; + + //copy data + cudaMemcpy(odata, dev_array, n*sizeof(int), cudaMemcpyDeviceToHost); + + } + + + int compact_share_mem(int n, int *odata, const int *idata) + { + dim3 threadsPerBlock(blockSize); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + //copy data to device + int* dev_idata; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_idata failed!"); + + cudaMemset(dev_idata, 0, n*sizeof(int)); + checkCUDAErrorFn("cudaMemset dev_idata failed!"); + + cudaMemcpy(dev_idata, idata, n*sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorFn("cudaMemcpy dev_idata failed!"); + + + // map the idata to bools + int* dev_bools; + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_bools failed!"); + + cudaMemset(dev_bools, 0, n*sizeof(int)); + checkCUDAErrorFn("cudaMemset dev_bools failed!"); + + StreamCompaction::Common::kernMapToBoolean << > >(n, dev_bools, dev_idata); + + //scan the bools to get the indices + + + int* host_bools = new int[n]; + cudaMemcpy(host_bools, dev_bools, n*sizeof(int), cudaMemcpyDeviceToHost); + int* host_indices = new int[n]; + + scan_share_mem(n, host_indices, host_bools); //input is host data + + int* dev_indices; + cudaMalloc((void**)&dev_indices, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_indices failed!"); + + cudaMemcpy(dev_indices, host_indices, n*sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorFn("cudaMemcpy dev_indices failed!"); + + //run scatter + int* dev_odata; + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_bools failed!"); + + cudaMemset(dev_odata, 0, n*sizeof(int)); + checkCUDAErrorFn("cudaMemset dev_bools failed!"); + + StreamCompaction::Common::kernScatter << < fullBlocksPerGrid, threadsPerBlock >> > (n, dev_odata, dev_idata, dev_bools, dev_indices); + + //copy back to host + cudaMemcpy(odata, dev_odata, n*sizeof(int), cudaMemcpyDeviceToHost); + + return host_indices[n - 1] + host_bools[n - 1]; //num of non-zero + } + + + } } -} + diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 395ba10..fdab23e 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -5,5 +5,9 @@ namespace Efficient { void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); + + void scan_share_mem(int n, int *odata, const int *idata); + + int compact_share_mem(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..10808c1 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,19 +1,87 @@ #include #include +#include #include "common.h" #include "naive.h" + + + +int* dev_array1; +int* dev_array2; + + namespace StreamCompaction { namespace Naive { -// TODO: __global__ + __global__ void kern_naive_scan(int d, int n, int m_power,int* array_1, int* array_2) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index >= m_power && index < n) + { + array_2[index] = array_1[index] + array_1[index - m_power]; + } + } /** * Performs prefix-sum (aka scan) on 
idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + //compute the size of the intermediate array + int m_power = ilog2ceil(n); + int new_n = pow(2,m_power); + + dim3 fullBlocksPerGrid((new_n + blockSize - 1) / blockSize); + dim3 threadsPerBlock(blockSize); + + //init the array + cudaMalloc((void**)&dev_array1, new_n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_array1 failed!"); + + cudaMalloc((void**)&dev_array2, new_n * sizeof(int)); + checkCUDAErrorFn("cudaMalloc dev_array1 failed!"); + + cudaMemset(dev_array1, 0, new_n * sizeof(int)); + checkCUDAErrorFn("cudaMemset dev_array1 failed!"); + cudaMemset(dev_array2, 0, new_n * sizeof(int)); + checkCUDAErrorFn("cudaMemset dev_array2 failed!"); + + int* tmp_data = new int[n]; + tmp_data[0] = 0; + + for (int i = 1; i> > (d, new_n, m_power, dev_array1, dev_array2); + //copy array2 to array1 + cudaMemcpy(dev_array1,dev_array2,new_n*sizeof(int),cudaMemcpyDeviceToDevice); + } + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + std::cout << "naive method: " << milliseconds << "ms"< +#include +#include +#include +#include "radixsort.h" +#include "common.h" +#include "thrust.h" + +namespace StreamCompaction { + namespace RadixSort { + + + __global__ void kern_get_k_bit_array(int n, int k, int* odata, const int* idata) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < n) + { + odata[index] = (idata[index] & (1 << k)) >> k; //get the kth bit of the cur int + } + } + + __global__ void kern_inv_array(int n, int* odata, const int* idata) //1-->0 0-->1 + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < n) + { + odata[index] = std::abs(idata[index]-1); + } + } + + __global__ void kern_get_totalFalses(int n, int* e, int *f, int* totalFalse) + { + *totalFalse = e[n - 1] + f[n - 1]; + } + + __global__ void kern_compute_t_array(int n,int * f,int *t,int* totalFalse) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < n) + { + t[index] = index - f[index]+ *totalFalse; + } + } + + __global__ void kern_compute_the_d_array(int n, int *b, int *t, int *f, int *d) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < n) + { + d[index] = b[index] ? 
t[index] : f[index]; + } + } + + __global__ void kern_get_output(int n,int * d,int * odata,int* idata) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < n) + { + odata[d[index]] = idata[index]; + } + } + + + + + void radixsort(int n, int *odata, const int *idata) //assume that all the bits are 32 bits + { + + + + dim3 threadsPerBlock(blockSize); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + int * dev_b_array; + cudaMalloc((void**)&dev_b_array, n*sizeof(int)); + int * dev_e_array; + cudaMalloc((void**)&dev_e_array, n*sizeof(int)); + int * dev_f_array; + cudaMalloc((void**)&dev_f_array, n*sizeof(int)); + int * dev_t_array; + cudaMalloc((void**)&dev_t_array, n*sizeof(int)); + int * dev_d_array; + cudaMalloc((void**)&dev_d_array, n*sizeof(int)); + + int * dev_idata; + cudaMalloc((void**)&dev_idata, n*sizeof(int)); + + int * dev_odata; + cudaMalloc((void**)&dev_odata, n*sizeof(int)); + + + cudaMemcpy(dev_idata,idata,n*sizeof(int),cudaMemcpyHostToDevice); + + + int* dev_totalFalse; + cudaMalloc((void**)&dev_totalFalse, 1*sizeof(int)); + + + int* host_f_array = new int[n]; + + int* host_e_array = new int[n]; + + + //cuda event init + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + float milliseconds = 0; + + cudaEventRecord(start); + + for (int k = 0; k<32; k++) + { + //get b array + kern_get_k_bit_array << > > (n, k, dev_b_array,dev_idata); + + //get e array + kern_inv_array << > >(n, dev_e_array, dev_b_array); + + //get f data + + cudaMemcpy(host_e_array, dev_e_array, n*sizeof(int), cudaMemcpyDeviceToHost); + StreamCompaction::Thrust::scan(n, host_f_array, host_e_array); + cudaMemcpy(dev_f_array, host_f_array, n*sizeof(int), cudaMemcpyHostToDevice); + + //get t array + //comptue the totalFalse + kern_get_totalFalses << <1, 1 >> >(n, dev_e_array, dev_f_array, dev_totalFalse); + + kern_compute_t_array << < fullBlocksPerGrid, threadsPerBlock >> > (n, dev_f_array, dev_t_array, dev_totalFalse); + + //get the d array + kern_compute_the_d_array << < fullBlocksPerGrid, threadsPerBlock >> > (n, dev_b_array, dev_t_array, dev_f_array, dev_d_array); + + //get the current output + kern_get_output << < fullBlocksPerGrid, threadsPerBlock >> > (n, dev_d_array, dev_odata,dev_idata); + + //update the idata + cudaMemcpy(dev_idata, dev_odata, n*sizeof(int), cudaMemcpyDeviceToDevice); + + } + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + std::cout << "radix sort method: " << milliseconds << "ms" << std::endl; + + cudaMemcpy(odata, dev_odata, n*sizeof(int), cudaMemcpyDeviceToHost); + + } + + + } +} \ No newline at end of file diff --git a/stream_compaction/radixsort.h b/stream_compaction/radixsort.h new file mode 100644 index 0000000..35e8bc6 --- /dev/null +++ b/stream_compaction/radixsort.h @@ -0,0 +1,8 @@ +#pragma once + +namespace StreamCompaction { + namespace RadixSort { + + void radixsort(int n, int *odata, const int *idata); + } +} \ No newline at end of file diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..d625e04 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -3,6 +3,7 @@ #include #include #include +#include #include "common.h" #include "thrust.h" @@ -16,6 +17,23 @@ void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + //cuda event 
init
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    float milliseconds = 0;
+
+    cudaEventRecord(start);
+
+    thrust::exclusive_scan(idata, idata + n, odata); // exclusive scan from idata into odata (not in place)
+
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+    std::cout << "thrust method: " << milliseconds << "ms" << std::endl;
+
 }
 }
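A note on the Thrust timing in `stream_compaction/thrust.cu` above: `thrust::exclusive_scan` is called on raw host pointers, so Thrust will likely dispatch to its host backend and the measured interval does not isolate GPU work. To time only the device-side scan, as the original assignment notes suggest, one option is to wrap the call with a `thrust::device_vector` so the host-device copies stay outside the timed region. Below is a minimal sketch under that assumption; the helper name `timeThrustScanOnDevice` is illustrative and not part of this repository.

``` C++
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>

// Sketch: copy the input into a thrust::device_vector first, so the CUDA-event
// interval measures only the scan itself, not the H2D/D2H transfers.
float timeThrustScanOnDevice(int n, int *odata, const int *idata) {
    thrust::device_vector<int> d_in(idata, idata + n);  // H2D copy, outside timing
    thrust::device_vector<int> d_out(n);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    thrust::exclusive_scan(d_in.begin(), d_in.end(), d_out.begin());  // runs on the GPU
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds = 0.0f;
    cudaEventElapsedTime(&milliseconds, start, stop);

    thrust::copy(d_out.begin(), d_out.end(), odata);    // D2H copy, outside timing
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return milliseconds;
}
```

Timing the scan this way keeps the Thrust numbers comparable with the naive and work-efficient scans above, which also place their CUDA events around the kernel launches only and exclude explicit memory operations.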