diff --git a/README.md b/README.md index 4535eea..3f6659e 100644 --- a/README.md +++ b/README.md @@ -3,220 +3,103 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) - -### (TODO: Your README) - -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) - -Instructions (delete me) -======================== - -This is due Sunday, September 13 at midnight. - -**Summary:** In this project, you'll implement GPU stream compaction in CUDA, -from scratch. This algorithm is widely used, and will be important for -accelerating your path tracer project. - -Your stream compaction implementations in this project will simply remove `0`s -from an array of `int`s. In the path tracer, you will remove terminated paths -from an array of rays. - -In addition to being useful for your path tracer, this project is meant to -reorient your algorithmic thinking to the way of the GPU. On GPUs, many -algorithms can benefit from massive parallelism and, in particular, data -parallelism: executing the same code many times simultaneously with different -data. - -You'll implement a few different versions of the *Scan* (*Prefix Sum*) -algorithm. First, you'll implement a CPU version of the algorithm to reinforce -your understanding. Then, you'll write a few GPU implementations: "naive" and -"work-efficient." Finally, you'll use some of these to implement GPU stream -compaction. - -**Algorithm overview & details:** There are two primary references for details -on the implementation of scan and stream compaction. - -* The [slides on Parallel Algorithms](https://github.com/CIS565-Fall-2015/cis565-fall-2015.github.io/raw/master/lectures/2-Parallel-Algorithms.pptx) - for Scan, Stream Compaction, and Work-Efficient Parallel Scan. 
-* GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html). - -Your GPU stream compaction implementation will live inside of the -`stream_compaction` subproject. This way, you will be able to easily copy it -over for use in your GPU path tracer. - - -## Part 0: The Usual - -This project (and all other CUDA projects in this course) requires an NVIDIA -graphics card with CUDA capability. Any card with Compute Capability 2.0 -(`sm_20`) or greater will work. Check your GPU on this -[compatibility table](https://developer.nvidia.com/cuda-gpus). -If you do not have a personal machine with these specs, you may use those -computers in the Moore 100B/C which have supported GPUs. - -**HOWEVER**: If you need to use the lab computer for your development, you will -not presently be able to do GPU performance profiling. This will be very -important for debugging performance bottlenecks in your program. - -### Useful existing code - -* `stream_compaction/common.h` - * `checkCUDAError` macro: checks for CUDA errors and exits if there were any. - * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer. -* `main.cpp` - * Some testing code for your implementations. - -**Note 1:** The tests will simply compare against your CPU implementation -Do it first! - -**Note 2:** The tests default to an array of size 256. -Test with something larger (10,000?), too! - - -## Part 1: CPU Scan & Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -Do this first, and double check the output! It will be used as the expected -value for the other tests. - -In `stream_compaction/cpu.cu`, implement: - -* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum. -* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using - the `scan` function. -* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan` - function. 
Map the input array to an array of 0s and 1s, scan it, and use - scatter to produce the output. You will need a **CPU** scatter implementation - for this (see slides or GPU Gems chapter for an explanation). - -These implementations should only be a few lines long. - - -## Part 2: Naive GPU Scan Algorithm - -In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan` - -This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. We haven't yet -taught shared memory, and you **shouldn't use it yet**. Example 39-1 uses -shared memory, but is limited to operating on very small arrays! Instead, write -this using global memory only. As a result of this, you will have to do -`ilog2ceil(n)` separate kernel invocations. - -Beware of errors in Example 39-1 in the book; both the pseudocode and the CUDA -code in the online version of Chapter 39 are known to have a few small errors -(in superscripting, missing braces, bad indentation, etc.) - -Since the parallel scan algorithm operates on a binary tree structure, it works -best with arrays with power-of-two length. Make sure your implementation works -on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory -- your intermediate array sizes will need to be rounded to the next power of -two. - - -## Part 3: Work-Efficient GPU Scan & Stream Compaction - -### 3.1. Scan - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::scan` - -All of the text in Part 2 applies. - -* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2. -* Beware of errors in Example 39-2. -* Test non-power-of-two sized arrays. - -### 3.2. Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::compact` - -For compaction, you will also need to implement the scatter algorithm presented -in the slides and the GPU Gems chapter. 
- -In `stream_compaction/common.cu`, implement these for use in `compact`: - -* `StreamCompaction::Common::kernMapToBoolean` -* `StreamCompaction::Common::kernScatter` - - -## Part 4: Using Thrust's Implementation - -In `stream_compaction/thrust.cu`, implement: - -* `StreamCompaction::Thrust::scan` - -This should be a very short function which wraps a call to the Thrust library -function `thrust::exclusive_scan(first, last, result)`. - -To measure timing, be sure to exclude memory operations by passing -`exclusive_scan` a `thrust::device_vector` (which is already allocated on the -GPU). You can create a `thrust::device_vector` by creating a -`thrust::host_vector` from the given pointer, then casting it. - - -## Part 5: Radix Sort (Extra Credit) (+10) - -Add an additional module to the `stream_compaction` subproject. Implement radix -sort using one of your scan implementations. Add tests to check its correctness. - - -## Write-up - -1. Update all of the TODOs at the top of this README. -2. Add a description of this project including a list of its features. -3. Add your performance analysis (see below). - -All extra credit features must be documented in your README, explaining its -value (with performance comparison, if applicable!) and showing an example how -it works. For radix sort, show how it is called and an example of its output. - -Always profile with Release mode builds and run without debugging. - -### Questions - -* Roughly optimize the block sizes of each of your implementations for minimal - run time on your GPU. - * (You shouldn't compare unoptimized implementations to each other!) - -* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and - Thrust) to the serial CPU version of Scan. Plot a graph of the comparison - (with array size on the independent axis). - * You should use CUDA events for timing. Be sure **not** to include any - explicit memory operations in your performance measurements, for - comparability. 
- * To guess at what might be happening inside the Thrust implementation, take - a look at the Nsight timeline for its execution. - -* Write a brief explanation of the phenomena you see here. - * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is - it different for each implementation? - -* Paste the output of the test program into a triple-backtick block in your - README. - * If you add your own tests (e.g. for radix sort or to test additional corner - cases), be sure to mention it explicitly. - -These questions should help guide you in performance analysis on future -assignments, as well. - -## Submit - -If you have modified any of the `CMakeLists.txt` files at all (aside from the -list of `SOURCE_FILES`), you must test that your project can build in Moore -100B/C. Beware of any build issues discussed on the Google Group. - -1. Open a GitHub pull request so that we can see that you have finished. - The title should be "Submission: YOUR NAME". -2. Send an email to the TA (gmail: kainino1+cis565@) with: - * **Subject**: in the form of `[CIS565] Project 2: PENNKEY` - * Direct link to your pull request on GitHub - * In the form of a grade (0-100+) with comments, evaluate your own - performance on the project. - * Feedback on the project itself, if any. +* Kangning Li +* Tested on: Windows 10, i7-4790 @ 3.6GHz 16GB, GTX 970 4096MB (Personal) + +This repository contains HW2 for CIS 565 2015, GPU implementations of scan and compact. 
+
+## Analysis
+
+![](images/graph.png)
+
+|array size | cpu scan | naive scan | efficient scan | thrust scan| cpu compact w/out scan| cpu compact w/scan | efficient compact
+|-----------|----------|------------|----------------|------------|-----------------------|--------------------|-------------------
+|32768 | 0 | 87 | 265 | 169 | 0 | 1002 | 308
+|16384 | 0 | 66 | 228 | 50 | 0 | 1002 | 233
+|8192 | 0 | 59 | 196 | 34 | 0 | 500 | 188
+|4096 | 0 | 51 | 182 | 25 | 0 | 499 | 173
+|2048 | 0 | 55 | 154 | 21 | 0 | 501 | 180
+|1024 | 0 | 42 | 137 | 18 | 0 | 500 | 181
+|512 | 0 | 36 | 143 | 19 | 0 | 0 | 153
+|256 | 0 | 32 | 133 | 18 | 0 | 0 | 123
+
+The data tells us some obvious things, such as the fact that generally computation is faster when there is less data. However, the speed difference between the CPU and GPU implementations indicates something suboptimal in the GPU code. The GPU code was timed without taking into account memory operations, so the difficulty may be in a lack of optimized register use or excess memory access besides explicit operations like copies, allocations, and frees. What is also interesting is that thrust scan, though faster than my implementation, is still slower than the CPU implementation.
+Further analysis is required. It is also possible that the CPU implementation has not been timed correctly, or that the more expected benefits of GPU parallelization only become apparent with larger amounts of data than were measured.
+
+Another note is that this project implements efficient scan by modifying an array on the device in-place in both the upsweep and downsweep stages.
+There were some concerns over race conditions when multiple blocks are needed; however, these did not arise. The project's commit history includes a version of efficient scan that uses an input and output array for the kernel but requires a memcpy to synchronize data in the two from the host in between passes.
+
+## Notes
+I added an additional "small" case test for debugging use.
+Efficient Scan also has disabled testing code for "peeking" at the results of up-sweep before down-sweep.
+
+## Example Output
+
+```
+****************
+** SCAN TESTS **
+****************
+ [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 7 0 ]
+==== cpu scan, power-of-two ====
+ [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 803684 803691 ]
+==== cpu scan, non-power-of-two ====
+ [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 803630 803660 ]
+ passed
+==== small cpu scan test. ====
+ passed
+==== naive scan, power-of-two ====
+ passed
+==== naive scan, non-power-of-two ====
+ passed
+==== small naive scan test. ====
+ passed
+==== small naive scan test, non-power-of-two. ====
+ passed
+==== small work efficient scan test. ====
+ passed
+==== small work efficient scan test, non-power-of-two. ====
+ passed
+==== work-efficient scan, power-of-two ====
+ passed
+==== work-efficient scan, non-power-of-two ====
+ passed
+==== small thrust scan. ====
+ passed
+==== thrust scan, power-of-two ====
+ passed
+==== thrust scan, non-power-of-two ====
+ passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 3 0 ]
+==== small cpu compact without scan, power-of-two ====
+ passed
+==== small cpu compact without scan, non-power-of-two ====
+ passed
+==== cpu compact without scan, power-of-two ====
+ [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 3 ]
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ]
+ passed
+==== small cpu compact with scan, power-of-two ====
+ passed
+==== small cpu compact with scan, non-power-of-two ====
+ passed
+==== cpu compact with scan ====
+ [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 
2 3 ] + passed +==== small work-efficient compact with scan, power-of-two ==== + passed +==== small work-efficient compact with scan, non-power-of-two ==== + passed +==== work-efficient compact, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 3 ] + passed +==== work-efficient compact, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed + +``` \ No newline at end of file diff --git a/data.ods b/data.ods new file mode 100644 index 0000000..54d1333 Binary files /dev/null and b/data.ods differ diff --git a/images/graph.png b/images/graph.png new file mode 100644 index 0000000..74a0067 Binary files /dev/null and b/images/graph.png differ diff --git a/src/main.cpp b/src/main.cpp index 675da35..85f4fbb 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -18,6 +18,61 @@ int main(int argc, char* argv[]) { const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; + // the case in the slides, as a smaller test. + int small[8]; + for (int i = 0; i < 8; i++) { + small[i] = i; + } + int smallScan[8] = { 0, 0, 1, 3, 6, 10, 15, 21 }; + int smallCompact[7] = { 1, 2, 3, 4, 5, 6, 7 }; + + // set "true" for timed tests + // also set BENCHMARK in common to 1 + if (false) { + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printf("array size: %i\n", SIZE); + int count; + + zeroArray(SIZE, b); + printDesc("cpu scan, power-of-two"); + StreamCompaction::CPU::scan(SIZE, b, a); + printf("\n"); + + zeroArray(SIZE, c); + printDesc("naive scan, power-of-two"); + StreamCompaction::Naive::scan(SIZE, c, a); + printf("\n"); + + zeroArray(SIZE, c); + printDesc("work-efficient scan, power-of-two"); + StreamCompaction::Efficient::scan(SIZE, c, a); + printf("\n"); + + zeroArray(SIZE, c); + printDesc("thrust scan, power-of-two"); + StreamCompaction::Thrust::scan(SIZE, c, a); + printf("\n"); + + zeroArray(SIZE, b); + printDesc("cpu compact without scan, power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + 
printf("\n"); + + zeroArray(SIZE, c); + printDesc("cpu compact with scan"); + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + printf("\n"); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, power-of-two"); + count = StreamCompaction::Efficient::compact(SIZE, c, a); + printf("\n"); + + printf("benchmark tests done\n"); + return 0; + } + // Scan tests printf("\n"); @@ -40,6 +95,11 @@ int main(int argc, char* argv[]) { printArray(NPOT, b, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("small cpu scan test."); + StreamCompaction::CPU::scan(8, c, small); + printCmpResult(8, smallScan, c); + zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); @@ -52,6 +112,26 @@ int main(int argc, char* argv[]) { //printArray(SIZE, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("small naive scan test."); + StreamCompaction::Naive::scan(8, c, small); + printCmpResult(8, smallScan, c); + + zeroArray(SIZE, c); + printDesc("small naive scan test, non-power-of-two."); + StreamCompaction::Naive::scan(7, c, small); + printCmpResult(7, smallScan, c); + + zeroArray(SIZE, c); + printDesc("small work efficient scan test."); + StreamCompaction::Efficient::scan(8, c, small); + printCmpResult(8, smallScan, c); + + zeroArray(SIZE, c); + printDesc("small work efficient scan test, non-power-of-two."); + StreamCompaction::Efficient::scan(7, c, small); + printCmpResult(7, smallScan, c); + zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); @@ -64,6 +144,11 @@ int main(int argc, char* argv[]) { //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("small thrust scan."); + StreamCompaction::Thrust::scan(8, c, small); + printCmpResult(8, smallScan, c); + zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); @@ -89,6 +174,16 @@ int main(int 
argc, char* argv[]) { int count, expectedCount, expectedNPOT; + zeroArray(SIZE, c); + printDesc("small cpu compact without scan, power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(8, c, small); + printCmpLenResult(count, 7, smallCompact, c); + + zeroArray(SIZE, c); + printDesc("small cpu compact without scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(7, c, small); + printCmpLenResult(count, 6, smallCompact, c); + zeroArray(SIZE, b); printDesc("cpu compact without scan, power-of-two"); count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); @@ -103,21 +198,42 @@ int main(int argc, char* argv[]) { printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + zeroArray(SIZE, c); + printDesc("small cpu compact with scan, power-of-two"); + count = StreamCompaction::CPU::compactWithScan(8, c, small); + printCmpLenResult(count, 7, smallCompact, c); + + zeroArray(SIZE, c); + printDesc("small cpu compact with scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithScan(7, c, small); + printCmpLenResult(count, 6, smallCompact, c); + zeroArray(SIZE, c); printDesc("cpu compact with scan"); count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); + zeroArray(SIZE, c); + printDesc("small work-efficient compact with scan, power-of-two"); + count = StreamCompaction::Efficient::compact(8, c, small); + printCmpLenResult(count, 7, smallCompact, c); + + zeroArray(SIZE, c); + printDesc("small work-efficient compact with scan, non-power-of-two"); + count = StreamCompaction::Efficient::compact(7, c, small); + printCmpLenResult(count, 6, smallCompact, c); + zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); 
printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + printf("done\n"); } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..cc34b96 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -6,6 +6,7 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define BENCHMARK 0 /** * Check for CUDA errors; print and exit if there was a problem. diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..87f5fd7 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,5 +1,6 @@ #include #include "cpu.h" +#include "common.h" namespace StreamCompaction { namespace CPU { @@ -8,8 +9,24 @@ namespace CPU { * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + std::chrono::high_resolution_clock::time_point t1; + if (BENCHMARK) { + t1 = std::chrono::high_resolution_clock::now(); + } + + + // Implement exclusive serial scan on CPU + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } + + if (BENCHMARK) { + std::chrono::high_resolution_clock::time_point t2 = + std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(t2 - t1).count(); + std::cout << duration << " microseconds.\n"; + } } /** @@ -18,8 +35,29 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. 
*/ int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; + std::chrono::high_resolution_clock::time_point t1; + if (BENCHMARK) { + t1 = std::chrono::high_resolution_clock::now(); + } + + // remove all 0s from the array of ints + int odataIndex = 0; + for (int i = 0; i < n; i++) { + if (idata[i] == 0) { + continue; + } + odata[odataIndex] = idata[i]; + odataIndex++; + } + + if (BENCHMARK) { + std::chrono::high_resolution_clock::time_point t2 = + std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(t2 - t1).count(); + std::cout << duration << " microseconds.\n"; + } + + return odataIndex; } /** @@ -28,8 +66,45 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; + int *trueArray = new int[n]; + int *trueScan = new int[n]; + + std::chrono::high_resolution_clock::time_point t1; + if (BENCHMARK) { + t1 = std::chrono::high_resolution_clock::now(); + } + + + // Step 1: Compute temporary values in odata + for (int i = 0; i < n; i++) { + if (idata[i] == 0) { + trueArray[i] = 0; + } + else { + trueArray[i] = 1; + } + } + // Step 2: Run exclusive scan on temporary array + scan(n, trueScan, trueArray); + + // Step 3: Scatter + for (int i = 0; i < n; i++) { + if (trueArray[i]) { + odata[trueScan[i]] = idata[i]; + } + } + int numRemaining = trueScan[n - 1] + trueArray[n - 1]; + + if (BENCHMARK) { + std::chrono::high_resolution_clock::time_point t2 = + std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(t2 - t1).count(); + std::cout << duration << " microseconds.\n"; + } + + delete trueArray; + delete trueScan; + return numRemaining; } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 6348bf3..21cce3a 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -1,4 +1,6 @@ #pragma once 
+#include +#include namespace StreamCompaction { namespace CPU { diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..51e4acf 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -6,14 +6,147 @@ namespace StreamCompaction { namespace Efficient { +cudaEvent_t start, stop; + +static void setup_timer_events() { + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); +} + +static float teardown_timer_events() { + cudaEventRecord(stop); + + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return milliseconds; +} + // TODO: __global__ +__global__ void upsweep_step(int d_offset_plus, int d_offset, int *x) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k % d_offset_plus) { + return; + } + x[k + d_offset_plus - 1] += x[k + d_offset - 1]; +} + +__global__ void downsweep_step(int d_offset_plus, int d_offset, int *x) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k % d_offset_plus) { + return; + } + int t = x[k + d_offset - 1]; + x[k + d_offset - 1] = x[k + d_offset_plus - 1]; + x[k + d_offset_plus - 1] += t; +} + +__global__ void fill_by_value(int val, int *x) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + x[k] = val; +} + +static void setup_dimms(dim3 &dimBlock, dim3 &dimGrid, int n) { + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + int tpb = deviceProp.maxThreadsPerBlock; + int blockWidth = fmin(n, tpb); + int blocks = 1; + if (blockWidth != n) { + blocks = n / tpb; + if (n % tpb) { + blocks ++; + } + } + + dimBlock = dim3(blockWidth); + dimGrid = dim3(blocks); +} + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + + // copy everything in idata over to the GPU. 
+ // we'll need to pad the device memory with 0s to get a power of 2 array size. + int logn = ilog2ceil(n); + int pow2 = (int)pow(2, logn); + + dim3 dimBlock; + dim3 dimGrid; + setup_dimms(dimBlock, dimGrid, pow2); + + int *dev_x; + cudaMalloc((void**)&dev_x, sizeof(int) * pow2); + fill_by_value <<>>(0, dev_x); + + cudaMemcpy(dev_x, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + if (BENCHMARK) { + setup_timer_events(); + } + + // up sweep and down sweep + up_sweep_down_sweep(pow2, dev_x); + + if (BENCHMARK) { + printf("%f microseconds.\n", + teardown_timer_events() * 1000.0f); + } + + cudaMemcpy(odata, dev_x, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_x); +} + +// exposed up sweep and down sweep. expects powers of two! +void up_sweep_down_sweep(int n, int *dev_data1) { + int logn = ilog2ceil(n); + + dim3 dimBlock; + dim3 dimGrid; + setup_dimms(dimBlock, dimGrid, n); + + // Up Sweep + for (int d = 0; d < logn; d++) { + int d_offset_plus = (int)pow(2, d + 1); + int d_offset = (int)pow(2, d); + upsweep_step << > >(d_offset_plus, d_offset, dev_data1); + } + + //debug: peek at the array after upsweep + //int peek1[8]; + //cudaMemcpy(&peek1, dev_data1, sizeof(int) * 8, cudaMemcpyDeviceToHost); + + // Down-Sweep + //int zero[1]; + //zero[0] = 0; + //cudaMemcpy(&dev_data1[n - 1], zero, sizeof(int) * 1, cudaMemcpyHostToDevice); + cudaMemset(&dev_data1[n - 1], 0, sizeof(int) * 1); + for (int d = logn - 1; d >= 0; d--) { + int d_offset_plus = (int)pow(2, d + 1); + int d_offset = (int)pow(2, d); + downsweep_step << > >(d_offset_plus, d_offset, dev_data1); + } +} + +__global__ void temporary_array(int *x, int *temp) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + temp[k] = (x[k] != 0); +} + +__global__ void scatter(int *x, int *trueFalse, int* scan, int *out) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (trueFalse[k]) { + out[scan[k]] = x[k]; + } } /** @@ -26,8 +159,63 @@ void scan(int n, int *odata, const int *idata) { * @returns The 
number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - // TODO - return -1; + int logn = ilog2ceil(n); + int pow2 = (int)pow(2, logn); + + dim3 dimBlock; + dim3 dimGrid; + setup_dimms(dimBlock, dimGrid, pow2); + + int *dev_x; + int *dev_tmp; + int *dev_scatter; + int *dev_scan; + + cudaMalloc((void**)&dev_x, sizeof(int) * pow2); + cudaMalloc((void**)&dev_tmp, sizeof(int) * pow2); + cudaMalloc((void**)&dev_scan, sizeof(int) * pow2); + cudaMalloc((void**)&dev_scatter, sizeof(int) * pow2); + + // 0 pad up to a power of 2 array length. + // copy everything in idata over to the GPU. + fill_by_value << > >(0, dev_x); + cudaMemcpy(dev_x, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + if (BENCHMARK) { + setup_timer_events(); + } + + // Step 1: compute temporary true/false array + temporary_array <<>>(dev_x, dev_tmp); + + // Step 2: run efficient scan on the tmp array + cudaMemcpy(dev_scan, dev_tmp, sizeof(int) * pow2, cudaMemcpyDeviceToDevice); + up_sweep_down_sweep(pow2, dev_scan); + + // Step 3: scatter + scatter <<>>(dev_x, dev_tmp, dev_scan, dev_scatter); + + if (BENCHMARK) { + printf("%f microseconds.\n", + teardown_timer_events() * 1000.0f); + } + + cudaMemcpy(odata, dev_scatter, sizeof(int) * n, cudaMemcpyDeviceToHost); + + int last_index; + cudaMemcpy(&last_index, dev_scan + (n - 1), sizeof(int), + cudaMemcpyDeviceToHost); + + int last_true_false; + cudaMemcpy(&last_true_false, dev_tmp + (n - 1), sizeof(int), + cudaMemcpyDeviceToHost); + + cudaFree(dev_x); + cudaFree(dev_tmp); + cudaFree(dev_scan); + cudaFree(dev_scatter); + + return last_index + last_true_false; } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 395ba10..a550771 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -4,6 +4,8 @@ namespace StreamCompaction { namespace Efficient { void scan(int n, int *odata, const int *idata); + void up_sweep_down_sweep(int n, int *dev_data); + int 
compact(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..430d658 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -6,14 +6,93 @@ namespace StreamCompaction { namespace Naive { +cudaEvent_t start, stop; + +static void setup_timer_events() { + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); +} + +static float teardown_timer_events() { + cudaEventRecord(stop); + + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return milliseconds; +} + // TODO: __global__ +__global__ void naive_scan_step(int offset, int *x_1, int *x_2) { + int i = threadIdx.x + (blockIdx.x * blockDim.x); + if (i >= offset) { + x_2[i] = x_1[i - offset] + x_1[i]; + } + else { + x_2[i] = x_1[i]; + } +} + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + // copy everything in idata over to the GPU + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + int tpb = deviceProp.maxThreadsPerBlock; + int blockWidth = fmin(n, tpb); + int blocks = 1; + if (blockWidth != n) { + blocks = n / tpb; + if (n % tpb) { + blocks++; + } + } + + dim3 dimBlock(blockWidth); + dim3 dimGrid(blocks); + + int *dev_x; + int *dev_x_next; + cudaMalloc((void**)&dev_x, sizeof(int) * n); + cudaMalloc((void**)&dev_x_next, sizeof(int) * n); + + cudaMemcpy(dev_x, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_x_next, dev_x, sizeof(int) * n, cudaMemcpyDeviceToDevice); + + if (BENCHMARK) { + setup_timer_events(); + } + + // run steps. + // no need to pad with 0s to get a power of 2 array here, + // this can be an "unbalanced" binary tree of ops. 
+ int logn = ilog2ceil(n); + for (int d = 1; d <= logn; d++) { + int offset = powf(2, d - 1); + naive_scan_step <<>>(offset, dev_x, dev_x_next); + int *temp = dev_x_next; + dev_x_next = dev_x; + dev_x = temp; + } + if (BENCHMARK) { + printf("%f microseconds.\n", + teardown_timer_events() * 1000.0f); + } + + cudaMemcpy(odata + 1, dev_x, sizeof(int) * (n - 1), cudaMemcpyDeviceToHost); + odata[0] = 0; + + cudaFree(dev_x); + cudaFree(dev_x_next); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..be1200e 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -9,13 +9,57 @@ namespace StreamCompaction { namespace Thrust { +cudaEvent_t start, stop; + +static void setup_timer_events() { + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); +} + +static float teardown_timer_events() { + cudaEventRecord(stop); + + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + + cudaEventDestroy(start); + cudaEventDestroy(stop); + + return milliseconds; +} + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO use `thrust::exclusive_scan` + // use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + // Create a thrust::device_vector from a thrust::host_vector + thrust::host_vector v_in(idata, idata + n); + thrust::device_vector device_v_in(v_in); + thrust::device_vector device_v_out(n); + + if (BENCHMARK) { + setup_timer_events(); + } + + thrust::exclusive_scan(device_v_in.begin(), device_v_in.end(), + device_v_out.begin()); + + if (BENCHMARK) { + printf("%f microseconds.\n", + teardown_timer_events() * 1000.0f); + } + + // copy back over + for (int i = 0; i < n; i++) { + odata[i] = device_v_out[i]; + } } }