diff --git a/README.md b/README.md
index a82ea0f..169f17d 100644
--- a/README.md
+++ b/README.md
@@ -3,211 +3,99 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Nada Ouf
+* Tested on: Windows 7, i7-2649M @ 2.80GHz 8GB, GTX 520 1024MB
+
+### Performance Analysis
+
+#### Timing for different problem sizes
+
+After testing different block sizes for both the naive and work-efficient implementations, a block size of 256 achieved the best performance, so all of the timing results below were measured with a block size of 256.
+
+![Timing results](images/results.PNG)
+
+These are the execution times for the naive, work-efficient, and thrust GPU implementations.
+The vertical axis is time in ms on a base-10 logarithmic scale.
+The horizontal axis is the problem size n.
+
+#### Nsight analysis
+
+![Nsight timeline](images/nsight.png)
+
+Regarding the time taken by the thrust library, the Nsight analysis shows that some of its kernel launches have very low occupancy, which may be because they use a large number of registers or a small number of threads per block.
+
+#### Explanation of results
+
+The naive implementation performs better than the work-efficient one because:
+- all branches are outside the kernel functions (a sketch of this launch pattern appears after the test output below)
+- the need to copy results from a temporary array to the device output array was eliminated
+
+In my opinion, the work-efficient scan is slower than expected because my implementation has branches inside the kernels, even though all the loops are outside the kernel functions. The work-efficient implementation still needs to be optimized.
+
+#### Test program output
+
+```
+****************
+** SCAN TESTS **
+****************
+ [ 30 36 43 30 43 27 43 21 31 32 19 22 15 ... 12 0 ]
+==== cpu scan, power-of-two ====
+ [ 0 30 66 109 139 182 209 252 273 304 336 355 377 ... 1634700 1634712 ]
+==== cpu scan, non-power-of-two ====
+ [ 0 30 66 109 139 182 209 252 273 304 336 355 377 ... 1634589 1634618 ]
+ passed
+==== naive scan, power-of-two ====
+time is 0.886432 ms on the GPU
+ [ 0 30 66 109 139 182 209 252 273 304 336 355 377 ... 1634700 1634712 ]
+ passed
+==== naive scan, non-power-of-two ====
+time is 0.896960 ms on the GPU
+ [ 0 30 66 109 139 182 209 252 273 304 336 355 377 ... 0 0 ]
+ passed
+==== work-efficient scan, power-of-two ====
+time is 1.020288 ms on the GPU
+ [ 0 30 66 109 139 182 209 252 273 304 336 355 377 ... 1634700 1634712 ]
+ passed
+==== work-efficient scan, non-power-of-two ====
+time is 1.020128 ms on the GPU
+ [ 0 30 66 109 139 182 209 252 273 304 336 355 377 ... 1634589 1634618 ]
+ passed
+==== thrust scan, power-of-two ====
+time is 3.317376 ms on the GPU
+ [ 0 30 66 109 139 182 209 252 273 304 336 355 377 ... 1634700 1634712 ]
+ passed
+==== thrust scan, non-power-of-two ====
+time is 0.444320 ms on the GPU
+ [ 0 30 66 109 139 182 209 252 273 304 336 355 377 ... 1634589 1634618 ]
+ passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 2 3 4 3 4 2 4 2 3 3 1 2 1 ... 1 0 ]
+==== cpu compact without scan, power-of-two ====
+ [ 2 3 4 3 4 2 4 2 3 3 1 2 1 ... 3 1 ]
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ [ 2 3 4 3 4 2 4 2 3 3 1 2 1 ... 2 4 ]
+ passed
+==== cpu compact with scan ====
+ [ 2 3 4 3 4 2 4 2 3 3 1 2 1 ... 3 1 ]
+ passed
+==== work-efficient compact, power-of-two ====
+time is 1.023168 ms on the GPU
+ [ 2 3 4 3 4 2 4 2 3 3 1 2 1 ... 3 1 ]
+ passed
+==== work-efficient compact, non-power-of-two ====
+time is 1.013376 ms on the GPU
+ [ 2 3 4 3 4 2 4 2 3 3 1 2 1 ... 2 4 ]
+ passed
+```
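+
+#### Naive scan launch pattern
+
+For reference, here is a minimal sketch of the naive scan's host-side loop described above (illustrative only: `kernNaiveScanPass` and the buffer names are stand-ins for the actual code in `stream_compaction/naive.cu`). All of the branching and looping stays on the host, and the two device buffers are ping-ponged instead of copying each pass's result into a temporary array:
+
+```cuda
+int *dev_in, *dev_out;
+// ...allocate both device buffers and copy the input into dev_in...
+
+// ilog2ceil(n) launches; each pass adds in the element `offset` slots back
+// (the kernel passes elements with index < offset through unchanged)
+for (int offset = 1; offset < n; offset <<= 1) {
+    kernNaiveScanPass<<<blocksPerGrid, threadsPerBlock>>>(n, offset, dev_out, dev_in);
+    int *tmp = dev_in; dev_in = dev_out; dev_out = tmp;  // ping-pong, no copy
+}
+// dev_in now holds the scan result
+```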
-### (TODO: Your README)
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
-Instructions (delete me)
-========================
-This is due Sunday, September 13 at midnight.
-
-**Summary:** In this project, you'll implement GPU stream compaction in CUDA,
-from scratch. This algorithm is widely used, and will be important for
-accelerating your path tracer project.
-
-Your stream compaction implementations in this project will simply remove `0`s
-from an array of `int`s. In the path tracer, you will remove terminated paths
-from an array of rays.
-
-In addition to being useful for your path tracer, this project is meant to
-reorient your algorithmic thinking to the way of the GPU. On GPUs, many
-algorithms can benefit from massive parallelism and, in particular, data
-parallelism: executing the same code many times simultaneously with different
-data.
-
-You'll implement a few different versions of the *Scan* (*Prefix Sum*)
-algorithm. First, you'll implement a CPU version of the algorithm to reinforce
-your understanding. Then, you'll write a few GPU implementations: "naive" and
-"work-efficient." Finally, you'll use some of these to implement GPU stream
-compaction.
-
-**Algorithm overview & details:** There are two primary references for details
-on the implementation of scan and stream compaction.
-
-* The [slides on Parallel Algorithms](https://github.com/CIS565-Fall-2015/cis565-fall-2015.github.io/raw/master/lectures/2-Parallel-Algorithms.pptx)
- for Scan, Stream Compaction, and Work-Efficient Parallel Scan.
-* GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html).
-
-Your GPU stream compaction implementation will live inside of the
-`stream_compaction` subproject. This way, you will be able to easily copy it
-over for use in your GPU path tracer.
-
-
-## Part 0: The Usual
-
-This project (and all other CUDA projects in this course) requires an NVIDIA
-graphics card with CUDA capability. Any card with Compute Capability 2.0
-(`sm_20`) or greater will work. Check your GPU on this
-[compatibility table](https://developer.nvidia.com/cuda-gpus).
-If you do not have a personal machine with these specs, you may use those
-computers in the Moore 100B/C which have supported GPUs.
-
-**HOWEVER**: If you need to use the lab computer for your development, you will
-not presently be able to do GPU performance profiling. This will be very
-important for debugging performance bottlenecks in your program.
-
-### Useful existing code
-
-* `stream_compaction/common.h`
- * `checkCUDAError` macro: checks for CUDA errors and exits if there were any.
- * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer.
-* `main.cpp`
- * Some testing code for your implementations.
-
-
-## Part 1: CPU Scan & Stream Compaction
-
-This stream compaction method will remove `0`s from an array of `int`s.
-
-In `stream_compaction/cpu.cu`, implement:
-
-* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum.
-* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using
- the `scan` function.
-* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan`
- function. Map the input array to an array of 0s and 1s, scan it, and use
- scatter to produce the output. You will need a **CPU** scatter implementation
- for this (see slides or GPU Gems chapter for an explanation).
-
-These implementations should only be a few lines long.
-
-
-## Part 2: Naive GPU Scan Algorithm
-
-In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan`
-
-This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. We haven't yet
-taught shared memory, and you **shouldn't use it yet**. Example 39-1 uses
-shared memory, but is limited to operating on very small arrays! Instead, write
-this using global memory only. As a result of this, you will have to do
-`ilog2ceil(n)` separate kernel invocations.
-
-Beware of errors in Example 39-1 in the book; both the pseudocode and the CUDA
-code in the online version of Chapter 39 are known to have a few small errors
-(in superscripting, missing braces, bad indentation, etc.)
-
-Since the parallel scan algorithm operates on a binary tree structure, it works
-best with arrays with power-of-two length. Make sure your implementation works
-on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory
-- your intermediate array sizes will need to be rounded to the next power of
-two.
-
-
-## Part 3: Work-Efficient GPU Scan & Stream Compaction
-
-### 3.1. Scan
-
-In `stream_compaction/efficient.cu`, implement
-`StreamCompaction::Efficient::scan`
-
-All of the text in Part 2 applies.
-
-* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2.
-* Beware of errors in Example 39-2.
-* Test non-power-of-two sized arrays.
-
-### 3.2. Stream Compaction
-
-This stream compaction method will remove `0`s from an array of `int`s.
-
-In `stream_compaction/efficient.cu`, implement
-`StreamCompaction::Efficient::compact`
-
-For compaction, you will also need to implement the scatter algorithm presented
-in the slides and the GPU Gems chapter.
-
-In `stream_compaction/common.cu`, implement these for use in `compact`:
-
-* `StreamCompaction::Common::kernMapToBoolean`
-* `StreamCompaction::Common::kernScatter`
-
-
-## Part 4: Using Thrust's Implementation
-
-In `stream_compaction/thrust.cu`, implement:
-
-* `StreamCompaction::Thrust::scan`
-
-This should be a very short function which wraps a call to the Thrust library
-function `thrust::exclusive_scan(first, last, result)`.
-
-To measure timing, be sure to exclude memory operations by passing
-`exclusive_scan` a `thrust::device_vector` (which is already allocated on the
-GPU). You can create a `thrust::device_vector` by creating a
-`thrust::host_vector` from the given pointer, then casting it.
-
-
-## Part 5: Radix Sort (Extra Credit) (+10)
-
-Add an additional module to the `stream_compaction` subproject. Implement radix
-sort using one of your scan implementations. Add tests to check its correctness.
-
-
-## Write-up
-
-1. Update all of the TODOs at the top of this README.
-2. Add a description of this project including a list of its features.
-3. Add your performance analysis (see below).
-
-All extra credit features must be documented in your README, explaining its
-value (with performance comparison, if applicable!) and showing an example how
-it works. For radix sort, show how it is called and an example of its output.
-
-Always profile with Release mode builds and run without debugging.
-
-### Questions
-
-* Roughly optimize the block sizes of each of your implementations for minimal
- run time on your GPU.
- * (You shouldn't compare unoptimized implementations to each other!)
-
-* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and
- Thrust) to the serial CPU version of Scan. Plot a graph of the comparison
- (with array size on the independent axis).
- * You should use CUDA events for timing. Be sure **not** to include any
- explicit memory operations in your performance measurements, for
- comparability.
- * To guess at what might be happening inside the Thrust implementation, take
- a look at the Nsight timeline for its execution.
-
-* Write a brief explanation of the phenomena you see here.
- * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is
- it different for each implementation?
-
-* Paste the output of the test program into a triple-backtick block in your
- README.
- * If you add your own tests (e.g. for radix sort or to test additional corner
- cases), be sure to mention it explicitly.
-
-These questions should help guide you in performance analysis on future
-assignments, as well.
-
-## Submit
-
-If you have modified any of the `CMakeLists.txt` files at all (aside from the
-list of `SOURCE_FILES`), you must test that your project can build in Moore
-100B/C. Beware of any build issues discussed on the Google Group.
-
-1. Open a GitHub pull request so that we can see that you have finished.
- The title should be "Submission: YOUR NAME".
-2. Send an email to the TA (gmail: kainino1+cis565@) with:
- * **Subject**: in the form of `[CIS565] Project 2: PENNKEY`
- * Direct link to your pull request on GitHub
- * In the form of a grade (0-100+) with comments, evaluate your own
- performance on the project.
- * Feedback on the project itself, if any.
diff --git a/images/nsight.png b/images/nsight.png
new file mode 100644
index 0000000..06faf43
Binary files /dev/null and b/images/nsight.png differ
diff --git a/images/results.PNG b/images/results.PNG
new file mode 100644
index 0000000..ed020ef
Binary files /dev/null and b/images/results.PNG differ
diff --git a/src/main.cpp b/src/main.cpp
index efc8c06..d8b4272 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -16,7 +16,7 @@
#include "testing_helpers.hpp"
int main(int argc, char* argv[]) {
- const int SIZE = 1 << 8;
+ const int SIZE = 1 << 16;
const int NPOT = SIZE - 3;
int a[SIZE], b[SIZE], c[SIZE];
@@ -44,37 +44,37 @@ int main(int argc, char* argv[]) {
zeroArray(SIZE, c);
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
- //printArray(SIZE, c, true);
+ printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);
zeroArray(SIZE, c);
printDesc("naive scan, non-power-of-two");
StreamCompaction::Naive::scan(NPOT, c, a);
- //printArray(SIZE, c, true);
+    printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
- //printArray(SIZE, c, true);
+ printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);
zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
- //printArray(NPOT, c, true);
+ printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
zeroArray(SIZE, c);
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
- //printArray(SIZE, c, true);
+ printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);
zeroArray(SIZE, c);
printDesc("thrust scan, non-power-of-two");
StreamCompaction::Thrust::scan(NPOT, c, a);
- //printArray(NPOT, c, true);
+ printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
printf("\n");
@@ -112,12 +112,12 @@ int main(int argc, char* argv[]) {
zeroArray(SIZE, c);
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
- //printArray(count, c, true);
+ printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);
zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
- //printArray(count, c, true);
+ printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);
}
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index fe872d4..3653379 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -14,6 +14,37 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) {
exit(EXIT_FAILURE);
}
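+
+/**
+ * Copies array_size bytes from host memory to device memory and checks
+ * for CUDA errors.
+ */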
+void cpyHostToDevice(const int *hst, int *dev, size_t array_size) {
+ cudaMemcpy( /*destination*/ dev,
+ /*source*/ hst,
+ /*size in bytes to copy*/ array_size,
+ /*cudaMemcpy type*/ cudaMemcpyHostToDevice);
+
+ checkCUDAError("Error copying memory from host to device");
+}
+
+/**
+ * Copies array_size bytes from device memory back to host memory and
+ * checks for CUDA errors.
+ */
+void cpyDeviceToHost(int *hst, const int *dev, size_t array_size) {
+ cudaMemcpy( /*destination*/ hst,
+ /*source*/ dev,
+ /*size in bytes to copy*/ array_size,
+ /*cudaMemcpy type*/ cudaMemcpyDeviceToHost);
+
+ checkCUDAError("Error copying memory from device to host");
+}
+
+/**
+ * Copies array_size bytes from one device buffer to another and checks
+ * for CUDA errors.
+ */
+void cpyDeviceToDevice(const int *src, int *dest, size_t array_size) {
+ cudaMemcpy( /*destination*/ dest,
+ /*source*/ src,
+ /*size in bytes to copy*/ array_size,
+ /*cudaMemcpy type*/ cudaMemcpyDeviceToDevice);
+
+ checkCUDAError("Error copying memory from device to device");
+}
+
namespace StreamCompaction {
namespace Common {
@@ -23,7 +54,11 @@ namespace Common {
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
- // TODO
+ int index = (blockIdx.x*blockDim.x) + threadIdx.x;
+
+ if(index < n) {
+ bools[index] = (idata[index] != 0);
+ }
}
/**
@@ -32,7 +67,11 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
*/
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
- // TODO
+ int index = (blockIdx.x*blockDim.x) + threadIdx.x;
+
+ if(index < n && bools[index]) {
+ odata[indices[index]] = idata[index];
+ }
}
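+
+/* Illustrative composition of these kernels for stream compaction
+ * (the buffer names here are stand-ins, not the actual variables in
+ * StreamCompaction::Efficient::compact):
+ *
+ *   kernMapToBoolean<<<blocks, threads>>>(n, dev_bools, dev_idata);
+ *   // ...exclusive-scan dev_bools into dev_indices...
+ *   kernScatter<<<blocks, threads>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
+ */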
}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 4f52663..1f9f5ed 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -6,12 +6,26 @@
#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+#define blockSize 256
/**
* Check for CUDA errors; print and exit if there was a problem.
*/
void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);
+void cpyHostToDevice(const int *hst, int *dev, size_t array_size);
+void cpyDeviceToHost(int *hst, const int *dev, size_t array_size);
+void cpyDeviceToDevice(const int *src, int *dest, size_t array_size);
+
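+/**
+ * Computes x raised to the non-negative integer power y by repeated
+ * multiplication (for x == 2 this is equivalent to 1 << y).
+ */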
+inline int power(int x, int y) {
+ int result = 1;
+ while(y > 0) {
+ result *= x;
+ --y;
+ }
+ return result;
+}
+
inline int ilog2(int x) {
int lg = 0;
while (x >>= 1) {
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e600c29..9d7446b 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -8,8 +8,14 @@ namespace CPU {
* CPU scan (prefix sum).
*/
void scan(int n, int *odata, const int *idata) {
- // TODO
- printf("TODO\n");
+
+    //Exclusive prefix sum: odata[i] holds the sum of idata[0..i-1].
+    //Example: idata = [3 1 7 0] -> odata = [0 3 4 11]
+    odata[0] = 0;
+
+    for(int i = 1; i < n; ++i) {
+        odata[i] = idata[i-1] + odata[i-1];
+    }
}
/**
@@ -18,8 +24,16 @@ void scan(int n, int *odata, const int *idata) {
* @returns the number of elements remaining after compaction.
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
- // TODO
- return -1;
+
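+    //Walk the input once, copying each non-zero element forward.
+    //Example: idata = [0 1 0 2 3] -> odata = [1 2 3], returns 3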
+ int compactN = 0;
+ for(int i = 0; i < n; ++i) {
+ if(idata[i] != 0) {
+ odata[compactN] = idata[i];
+ ++compactN;
+ }
+ }
+
+ return compactN;
}
/**
@@ -28,8 +42,32 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
- // TODO
- return -1;
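+    //Map the input to 0/1 flags, exclusive-scan the flags to get each
+    //kept element's output index, then scatter the non-zero elements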
+ int compactN = 0;
+ int *intermediate = new int[n];
+ int *scanResult = new int[n];
+
+ for(int i = 0; i < n; ++i) {
+ if(idata[i] != 0) {
+ intermediate[i] = 1;
+ } else {
+ intermediate[i] = 0;
+ }
+ }
+
+ scan(n, scanResult, intermediate);
+
+ for(int i = 0; i < n; ++i) {
+ if(intermediate[i] == 1) {
+ odata[scanResult[i]] = idata[i];
+ }
+ }
+
+    compactN = scanResult[n-1];
+    if(intermediate[n-1] == 1) {
+        ++compactN;
+    }
+
+    //Free the temporary host buffers
+    delete[] intermediate;
+    delete[] scanResult;
+
+    return compactN;
}
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index b2f739b..727fee4 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -6,14 +6,85 @@
namespace StreamCompaction {
namespace Efficient {
-// TODO: __global__
+/**
+ * One pass of the up-sweep (reduce) phase. The pass that reaches the
+ * last element zeroes it instead of accumulating into it, leaving the
+ * zero that the down-sweep phase starts from.
+ */
+__global__ void kernUpSweep(int n, int d, int offset, int *idata) {
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+    if(index < d) {
+        int ai = offset*((index*2)+1) - 1;
+        int bi = offset*((index*2)+2) - 1;
+        if(ai >= n || bi >= n)
+            return;
+        if(bi == n-1)
+            idata[bi] = 0;
+        else
+            idata[bi] += idata[ai];
+    }
+}
+
+/**
+ * One pass of the down-sweep phase: each pair saves the left element,
+ * replaces it with the right element, and adds the saved value into
+ * the right element.
+ */
+__global__ void kernDownSweep(int n, int d, int offset, int *idata) {
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+    if(index < d) {
+        int ai = offset*((index*2)+1) - 1;
+        int bi = offset*((index*2)+2) - 1;
+
+        int temp = idata[ai];
+        idata[ai] = idata[bi];
+        idata[bi] += temp;
+    }
+}
+
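+//Worked example of the pair indexing (ai = offset*(2*index+1) - 1,
+//bi = offset*(2*index+2) - 1) for n = 8; offset doubles on each
+//up-sweep pass while d halves:
+//  offset=1, d=4: pairs (0,1) (2,3) (4,5) (6,7)
+//  offset=2, d=2: pairs (1,3) (5,7)
+//  offset=4, d=1: pair  (3,7)
+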
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
-void scan(int n, int *odata, const int *idata) {
- // TODO
- printf("TODO\n");
+void scan(int n, int *hst_odata, const int *hst_idata) {
+ int *dev_idata;
+
+ //The number of blocks
+ dim3 blocksPerGrid((n + blockSize -1)/blockSize);
+
+ //The number of threads per block
+ dim3 threadsPerBlock(blockSize);
+
+    int passes = ilog2ceil(n);
+    int paddedN = power(2, passes);
+
+    //Allocate memory rounded up to the next power of two so the sweep
+    //kernels never read or write past the end of the buffer, and zero it
+    //so the padding acts as trailing zeros in the scan tree
+    cudaMalloc((void**)&dev_idata, paddedN*sizeof(int));
+    checkCUDAError("Error in malloc of dev_idata");
+    cudaMemset(dev_idata, 0, paddedN*sizeof(int));
+    checkCUDAError("Error in memset of dev_idata");
+
+    //copy array to device
+    cpyHostToDevice(hst_idata, dev_idata, n*sizeof(int));
+
+    int offset = 1;
+    int d = paddedN >> 1;
+
+ cudaEvent_t start, stop;
+ cudaEventCreate(&start);
+ cudaEventCreate(&stop);
+ cudaEventRecord(start);
+
+ //Up sweep
+ for(; d > 0; d >>= 1) {
+ kernUpSweep<<