diff --git a/README.md b/README.md
index a82ea0f..169f17d 100644
--- a/README.md
+++ b/README.md
@@ -3,211 +3,99 @@ CUDA Stream Compaction
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Nada Ouf
+* Tested on: Windows 7, i7-2649M @ 2.80GHz 8GB, GTX 520 1024MB
+
+### Performance Analysis
+
+#### Timing for different problem sizes
+
+After testing different block sizes for both the naive and work-efficient
+implementations, a block size of 256 achieved the best performance, so all
+timing results below are measured with a block size of 256.
+
+![](images/results.PNG)
+
+These are the execution times of the naive, work-efficient, and thrust GPU
+implementations. The vertical axis is time in ms on a base-10 logarithmic
+scale; the horizontal axis is the problem size n.
+
+#### Nsight analysis
+
+![](images/nsight.png)
+
+Looking at the time taken by the thrust library, the Nsight timeline shows
+that some of its kernel launches run at very low occupancy, which may be
+because they use many registers or launch few threads per block.
+
+#### Explanation of results
+
+The naive implementation performs better here because:
+- all branches are outside the kernel functions
+- the need to copy results from a temporary array to the device output array
+  was eliminated
+
+The work-efficient implementation is slower than expected because, while all
+the loops are outside the kernel functions, my kernels still contain
+branches. It still needs to be optimized; one possible fix is sketched after
+the test program output below.
+
+#### Test program output

+****************
+** SCAN TESTS **
+****************
+    [  30  36  43  30  43  27  43  21  31  32  19  22  15 ...  12   0 ]
+==== cpu scan, power-of-two ====
+    [   0  30  66 109 139 182 209 252 273 304 336 355 377 ... 1634700 1634712 ]
+==== cpu scan, non-power-of-two ====
+    [   0  30  66 109 139 182 209 252 273 304 336 355 377 ... 1634589 1634618 ]
+    passed
+==== naive scan, power-of-two ====
+time is 0.886432 ms on the GPU
+    [   0  30  66 109 139 182 209 252 273 304 336 355 377 ... 1634700 1634712 ]
+    passed
+==== naive scan, non-power-of-two ====
+time is 0.896960 ms on the GPU
+    [   0  30  66 109 139 182 209 252 273 304 336 355 377 ...   0   0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+time is 1.020288 ms on the GPU
+    [   0  30  66 109 139 182 209 252 273 304 336 355 377 ... 1634700 1634712 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+time is 1.020128 ms on the GPU
+    [   0  30  66 109 139 182 209 252 273 304 336 355 377 ... 1634589 1634618 ]
+    passed
+==== thrust scan, power-of-two ====
+time is 3.317376 ms on the GPU
+    [   0  30  66 109 139 182 209 252 273 304 336 355 377 ... 1634700 1634712 ]
+    passed
+==== thrust scan, non-power-of-two ====
+time is 0.444320 ms on the GPU
+    [   0  30  66 109 139 182 209 252 273 304 336 355 377 ... 1634589 1634618 ]
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   3   4   3   4   2   4   2   3   3   1   2   1 ...   1   0 ]
+==== cpu compact without scan, power-of-two ====
+    [   2   3   4   3   4   2   4   2   3   3   1   2   1 ...   3   1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    [   2   3   4   3   4   2   4   2   3   3   1   2   1 ...   2   4 ]
+    passed
+==== cpu compact with scan ====
+    [   2   3   4   3   4   2   4   2   3   3   1   2   1 ...   3   1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+time is 1.023168 ms on the GPU
+    [   2   3   4   3   4   2   4   2   3   3   1   2   1 ...   3   1 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+time is 1.013376 ms on the GPU
+    [   2   3   4   3   4   2   4   2   3   3   1   2   1 ...   2   4 ]
+    passed 
+
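+#### Possible optimization (sketch)
+
+A possible way to remove the remaining branches from the work-efficient
+kernels (a hedged sketch, not benchmarked; `kernUpSweepNoBranch` and
+`paddedN` are hypothetical names): pad the buffer to the next power of two
+so the bounds checks become unnecessary, and clear the root once with
+`cudaMemset` on the host instead of testing `bi == n-1` in every up-sweep
+pass.
+
+```
+// Sketch: branch-reduced up-sweep, assuming the buffer holds paddedN
+// (next power of two) elements with the extra tail zero-initialized.
+__global__ void kernUpSweepNoBranch(int d, int offset, int *data) {
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index < d) {
+        int bi = offset * ((index * 2) + 2) - 1;
+        data[bi] += data[bi - offset];  // bi - offset == ai
+    }
+}
+
+// Host side, once between the up-sweep and down-sweep loops:
+// cudaMemset(dev_idata + paddedN - 1, 0, sizeof(int));
+```
+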
-### (TODO: Your README)
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
-Instructions (delete me)
-========================
-This is due Sunday, September 13 at midnight.
-
-**Summary:** In this project, you'll implement GPU stream compaction in CUDA,
-from scratch. This algorithm is widely used, and will be important for
-accelerating your path tracer project.
-
-Your stream compaction implementations in this project will simply remove `0`s
-from an array of `int`s. In the path tracer, you will remove terminated paths
-from an array of rays.
-
-In addition to being useful for your path tracer, this project is meant to
-reorient your algorithmic thinking to the way of the GPU. On GPUs, many
-algorithms can benefit from massive parallelism and, in particular, data
-parallelism: executing the same code many times simultaneously with different
-data.
-
-You'll implement a few different versions of the *Scan* (*Prefix Sum*)
-algorithm. First, you'll implement a CPU version of the algorithm to reinforce
-your understanding. Then, you'll write a few GPU implementations: "naive" and
-"work-efficient." Finally, you'll use some of these to implement GPU stream
-compaction.
-
-**Algorithm overview & details:** There are two primary references for details
-on the implementation of scan and stream compaction.
-
-* The [slides on Parallel Algorithms](https://github.com/CIS565-Fall-2015/cis565-fall-2015.github.io/raw/master/lectures/2-Parallel-Algorithms.pptx)
-  for Scan, Stream Compaction, and Work-Efficient Parallel Scan.
-* GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html).
-
-Your GPU stream compaction implementation will live inside of the
-`stream_compaction` subproject. This way, you will be able to easily copy it
-over for use in your GPU path tracer.
-
-
-## Part 0: The Usual
-
-This project (and all other CUDA projects in this course) requires an NVIDIA
-graphics card with CUDA capability. Any card with Compute Capability 2.0
-(`sm_20`) or greater will work. Check your GPU on this
-[compatibility table](https://developer.nvidia.com/cuda-gpus).
-If you do not have a personal machine with these specs, you may use those
-computers in the Moore 100B/C which have supported GPUs.
-
-**HOWEVER**: If you need to use the lab computer for your development, you will
-not presently be able to do GPU performance profiling. This will be very
-important for debugging performance bottlenecks in your program.
-
-### Useful existing code
-
-* `stream_compaction/common.h`
-  * `checkCUDAError` macro: checks for CUDA errors and exits if there were any.
-  * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer.
-* `main.cpp`
-  * Some testing code for your implementations.
-
-
-## Part 1: CPU Scan & Stream Compaction
-
-This stream compaction method will remove `0`s from an array of `int`s.
-
-In `stream_compaction/cpu.cu`, implement:
-
-* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum.
-* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using
-  the `scan` function.
-* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan`
-  function. Map the input array to an array of 0s and 1s, scan it, and use
-  scatter to produce the output. You will need a **CPU** scatter implementation
-  for this (see slides or GPU Gems chapter for an explanation).
-
-These implementations should only be a few lines long.
-
-
-## Part 2: Naive GPU Scan Algorithm
-
-In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan`
-
-This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. We haven't yet
-taught shared memory, and you **shouldn't use it yet**. Example 39-1 uses
-shared memory, but is limited to operating on very small arrays! Instead, write
-this using global memory only. As a result of this, you will have to do
-`ilog2ceil(n)` separate kernel invocations.
-
-Beware of errors in Example 39-1 in the book; both the pseudocode and the CUDA
-code in the online version of Chapter 39 are known to have a few small errors
-(in superscripting, missing braces, bad indentation, etc.)
-
-Since the parallel scan algorithm operates on a binary tree structure, it works
-best with arrays with power-of-two length. Make sure your implementation works
-on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory
-- your intermediate array sizes will need to be rounded to the next power of
-two.
-
-
-## Part 3: Work-Efficient GPU Scan & Stream Compaction
-
-### 3.1. Scan
-
-In `stream_compaction/efficient.cu`, implement
-`StreamCompaction::Efficient::scan`
-
-All of the text in Part 2 applies.
-
-* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2.
-* Beware of errors in Example 39-2.
-* Test non-power-of-two sized arrays.
-
-### 3.2. Stream Compaction
-
-This stream compaction method will remove `0`s from an array of `int`s.
-
-In `stream_compaction/efficient.cu`, implement
-`StreamCompaction::Efficient::compact`
-
-For compaction, you will also need to implement the scatter algorithm presented
-in the slides and the GPU Gems chapter.
-
-In `stream_compaction/common.cu`, implement these for use in `compact`:
-
-* `StreamCompaction::Common::kernMapToBoolean`
-* `StreamCompaction::Common::kernScatter`
-
-
-## Part 4: Using Thrust's Implementation
-
-In `stream_compaction/thrust.cu`, implement:
-
-* `StreamCompaction::Thrust::scan`
-
-This should be a very short function which wraps a call to the Thrust library
-function `thrust::exclusive_scan(first, last, result)`.
-
-To measure timing, be sure to exclude memory operations by passing
-`exclusive_scan` a `thrust::device_vector` (which is already allocated on the
-GPU). You can create a `thrust::device_vector` by creating a
-`thrust::host_vector` from the given pointer, then casting it.
-
-
-## Part 5: Radix Sort (Extra Credit) (+10)
-
-Add an additional module to the `stream_compaction` subproject. Implement radix
-sort using one of your scan implementations. Add tests to check its correctness.
-
-
-## Write-up
-
-1. Update all of the TODOs at the top of this README.
-2. Add a description of this project including a list of its features.
-3. Add your performance analysis (see below).
-
-All extra credit features must be documented in your README, explaining its
-value (with performance comparison, if applicable!) and showing an example how
-it works. For radix sort, show how it is called and an example of its output.
-
-Always profile with Release mode builds and run without debugging.
-
-### Questions
-
-* Roughly optimize the block sizes of each of your implementations for minimal
-  run time on your GPU.
-  * (You shouldn't compare unoptimized implementations to each other!)
-
-* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and
-  Thrust) to the serial CPU version of Scan. Plot a graph of the comparison
-  (with array size on the independent axis).
-  * You should use CUDA events for timing. Be sure **not** to include any
-    explicit memory operations in your performance measurements, for
-    comparability.
-  * To guess at what might be happening inside the Thrust implementation, take
-    a look at the Nsight timeline for its execution.
-
-* Write a brief explanation of the phenomena you see here.
-  * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is
-    it different for each implementation?
-
-* Paste the output of the test program into a triple-backtick block in your
-  README.
-  * If you add your own tests (e.g. for radix sort or to test additional corner
-    cases), be sure to mention it explicitly.
-
-These questions should help guide you in performance analysis on future
-assignments, as well.
-
-## Submit
-
-If you have modified any of the `CMakeLists.txt` files at all (aside from the
-list of `SOURCE_FILES`), you must test that your project can build in Moore
-100B/C. Beware of any build issues discussed on the Google Group.
-
-1. Open a GitHub pull request so that we can see that you have finished.
-   The title should be "Submission: YOUR NAME".
-2. Send an email to the TA (gmail: kainino1+cis565@) with:
-   * **Subject**: in the form of `[CIS565] Project 2: PENNKEY`
-   * Direct link to your pull request on GitHub
-   * In the form of a grade (0-100+) with comments, evaluate your own
-     performance on the project.
-   * Feedback on the project itself, if any.
diff --git a/images/nsight.png b/images/nsight.png
new file mode 100644
index 0000000..06faf43
Binary files /dev/null and b/images/nsight.png differ
diff --git a/images/results.PNG b/images/results.PNG
new file mode 100644
index 0000000..ed020ef
Binary files /dev/null and b/images/results.PNG differ
diff --git a/src/main.cpp b/src/main.cpp
index efc8c06..d8b4272 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -16,7 +16,7 @@
 #include "testing_helpers.hpp"
 
 int main(int argc, char* argv[]) {
-    const int SIZE = 1 << 8;
+    const int SIZE = 1 << 16;
     const int NPOT = SIZE - 3;
     int a[SIZE], b[SIZE], c[SIZE];
 
@@ -44,37 +44,37 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     printf("\n");
@@ -112,12 +112,12 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 }
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index fe872d4..3653379 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -14,6 +14,37 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) {
     exit(EXIT_FAILURE);
 }
 
+/**
+ * Copies array_size bytes from host memory to device memory.
+ */
+void cpyHostToDevice(const int *hst, int *dev, size_t array_size) {
+    cudaMemcpy( /*destination*/ dev,
+                /*source*/ hst,
+                /*size in bytes to copy*/ array_size,
+                /*cudaMemcpy type*/ cudaMemcpyHostToDevice);
+
+    checkCUDAError("Error copying memory from host to device");
+}
+
+/**
+ * Copies array_size bytes from device memory back to host memory.
+ */
+void cpyDeviceToHost(int *hst, const int *dev, size_t array_size) {
+    cudaMemcpy( /*destination*/ hst,
+                /*source*/ dev,
+                /*size in bytes to copy*/ array_size,
+                /*cudaMemcpy type*/ cudaMemcpyDeviceToHost);
+
+    checkCUDAError("Error copying memory from device to host");
+}
+
+/**
+ * Copies array_size bytes between two device buffers.
+ */
+void cpyDeviceToDevice(const int *src, int *dest, size_t array_size) {
+    cudaMemcpy( /*destination*/ dest,
+                /*source*/ src,
+                /*size in bytes to copy*/ array_size,
+                /*cudaMemcpy type*/ cudaMemcpyDeviceToDevice);
+
+    checkCUDAError("Error copying memory from device to device");
+}
+
 namespace StreamCompaction {
 namespace Common {
 
@@ -23,7 +54,11 @@ namespace Common {
  * which map to 0 will be removed, and elements which map to 1 will be kept.
  */
 __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-    // TODO
+    int index = (blockIdx.x*blockDim.x) + threadIdx.x;
+
+    if(index < n) {
+        bools[index] = (idata[index] != 0);
+    }
 }
 
 /**
@@ -32,7 +67,11 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
  */
 __global__ void kernScatter(int n, int *odata, const int *idata,
         const int *bools, const int *indices) {
-    // TODO
+    int index = (blockIdx.x*blockDim.x) + threadIdx.x;
+
+    if(index < n && bools[index]) {
+        odata[indices[index]] = idata[index];
+    }
 }
 
 }
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 4f52663..1f9f5ed 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -6,12 +6,26 @@
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+#define blockSize 256
 
 /**
  * Check for CUDA errors; print and exit if there was a problem.
  */
 void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);
 
+void cpyHostToDevice(const int *hst, int *dev, size_t array_size);
+void cpyDeviceToHost(int *hst, const int *dev, size_t array_size);
+void cpyDeviceToDevice(const int *src, int *dest, size_t array_size);
+
+// Integer exponentiation: returns x^y for y >= 0.
+inline int power(int x, int y) {
+    int result = 1;
+    while(y > 0) {
+        result *= x;
+        --y;
+    }
+    return result;
+}
+
 inline int ilog2(int x) {
     int lg = 0;
     while (x >>= 1) {
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e600c29..9d7446b 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -8,8 +8,14 @@ namespace CPU {
  * CPU scan (prefix sum).
 */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+    // Exclusive prefix sum: each element is the sum of all earlier inputs.
+    odata[0] = 0;
+
+    for(int i = 1; i < n; ++i) {
+        odata[i] = idata[i-1] + odata[i-1];
+    }
 }
 
 /**
@@ -18,8 +24,16 @@ void scan(int n, int *odata, const int *idata) {
  * @returns the number of elements remaining after compaction.
  */
 int compactWithoutScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+    int compactN = 0;
+    for(int i = 0; i < n; ++i) {
+        if(idata[i] != 0) {
+            odata[compactN] = idata[i];
+            ++compactN;
+        }
+    }
+
+    return compactN;
 }
 
 /**
@@ -28,8 +42,32 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
  * @returns the number of elements remaining after compaction.
  */
 int compactWithScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+    int compactN = 0;
+    int *intermediate = new int[n];
+    int *scanResult = new int[n];
+
+    // Map each element to 1 if it should be kept, 0 otherwise.
+    for(int i = 0; i < n; ++i) {
+        if(idata[i] != 0) {
+            intermediate[i] = 1;
+        } else {
+            intermediate[i] = 0;
+        }
+    }
+
+    scan(n, scanResult, intermediate);
+
+    // Scatter: the exclusive scan gives each kept element its output index.
+    for(int i = 0; i < n; ++i) {
+        if(intermediate[i] == 1) {
+            odata[scanResult[i]] = idata[i];
+        }
+    }
+
+    compactN = scanResult[n-1];
+    if(intermediate[n-1] == 1) {
+        ++compactN;
+    }
+
+    delete[] intermediate;
+    delete[] scanResult;
+
+    return compactN;
 }
 
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index b2f739b..727fee4 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -6,14 +6,85 @@
 namespace StreamCompaction {
 namespace Efficient {
 
-// TODO: __global__
+__global__ void kernUpSweep(int n, int d, int offset, int *idata) {
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+    if(index < d) {
+        int ai = offset*((index*2)+1) - 1;
+        int bi = offset*((index*2)+2) - 1;
+        if(ai >= n || bi >= n)
+            return;
+        // The root (last element) is zeroed instead of accumulated; it is
+        // never read again during the up-sweep, so the array ends up already
+        // prepared for the down-sweep of an exclusive scan.
+        if(bi == n-1)
+            idata[bi] = 0;
+        else
+            idata[bi] += idata[ai];
+    }
+}
+
+__global__ void kernDownSweep(int n, int d, int offset, int *idata) {
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+    if(index < d) {
+        int ai = offset*((index*2)+1) - 1;
+        int bi = offset*((index*2)+2) - 1;
+
+        int temp = idata[ai];
+        idata[ai] = idata[bi];
+        idata[bi] += temp;
+    }
+}
+
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
 */
-void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+void scan(int n, int *hst_odata, const int *hst_idata) {
+    int *dev_idata;
+
+    //The number of blocks
+    dim3 blocksPerGrid((n + blockSize - 1)/blockSize);
+
+    //The number of threads per block
+    dim3 threadsPerBlock(blockSize);
+
+    //Allocate memory
+    cudaMalloc((void**)&dev_idata, n*sizeof(int));
+    checkCUDAError("Error in malloc of dev_idata");
+
+    //Copy the input array to the device
+    cpyHostToDevice(hst_idata, dev_idata, n*sizeof(int));
+
+    int passes = ilog2ceil(n);
+
+    int offset = 1;
+    int d = power(2, passes) >> 1;
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start);
+
+    //Up sweep
+    for(; d > 0; d >>= 1) {
+        kernUpSweep<<<blocksPerGrid, threadsPerBlock>>>(n, d, offset, dev_idata);
+        offset *= 2;
+    }
+
+    //Down sweep
+    for(d = 1; d < n; d *= 2) {
+        offset >>= 1;
+        kernDownSweep<<<blocksPerGrid, threadsPerBlock>>>(n, d, offset, dev_idata);
+    }
+
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    float elapsedTime;
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("time is %f ms on the GPU\n", elapsedTime);
+
+    cpyDeviceToHost(hst_odata, dev_idata, n*sizeof(int));
+
+    cudaFree(dev_idata);
+}
 
 /**
@@ -25,9 +96,50 @@ void scan(int n, int *odata, const int *idata) {
  * @param idata The array of elements to compact.
  * @returns The number of elements remaining after compaction.
  */
-int compact(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+int compact(int n, int *hst_odata, const int *hst_idata) {
+    int *dev_idata;
+    int *dev_odata;
+    int *dev_bools;
+    int *dev_indices;
+
+    //The number of blocks
+    dim3 blocksPerGrid((n + blockSize - 1)/blockSize);
+
+    //The number of threads per block
+    dim3 threadsPerBlock(blockSize);
+
+    //Allocate memory
+    cudaMalloc((void**)&dev_idata, n*sizeof(int));
+    checkCUDAError("Error in malloc of dev_idata");
+
+    cudaMalloc((void**)&dev_odata, n*sizeof(int));
+    checkCUDAError("Error in malloc of dev_odata");
+
+    cudaMalloc((void**)&dev_bools, n*sizeof(int));
+    checkCUDAError("Error in malloc of dev_bools");
+
+    cudaMalloc((void**)&dev_indices, n*sizeof(int));
+    checkCUDAError("Error in malloc of dev_indices");
+
+    //Copy the input array to the device
+    cpyHostToDevice(hst_idata, dev_idata, n*sizeof(int));
+
+    Common::kernMapToBoolean<<<blocksPerGrid, threadsPerBlock>>>(n, dev_bools, dev_idata);
+
+    int *hst_bools = new int[n];
+    int *hst_indices = new int[n];
+
+    //Note: scan() does its own device round trip, so the boolean array is
+    //copied back to the host here, scanned, and then re-uploaded.
+    cpyDeviceToHost(hst_bools, dev_bools, n*sizeof(int));
+
+    scan(n, hst_indices, hst_bools);
+
+    cpyHostToDevice(hst_indices, dev_indices, n*sizeof(int));
+
+    Common::kernScatter<<<blocksPerGrid, threadsPerBlock>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
+
+    cpyDeviceToHost(hst_odata, dev_odata, n*sizeof(int));
+
+    int count = hst_bools[n-1] == 1 ? hst_indices[n-1]+1 : hst_indices[n-1];
+
+    delete[] hst_bools;
+    delete[] hst_indices;
+    cudaFree(dev_idata);
+    cudaFree(dev_odata);
+    cudaFree(dev_bools);
+    cudaFree(dev_indices);
+
+    return count;
 }
 
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 3d86b60..322d579 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -6,14 +6,67 @@
 namespace StreamCompaction {
 namespace Naive {
 
-// TODO: __global__
+int *dev_idata;
+int *dev_odata;
+
+__global__ void kernNaive(int n, int step, int *odata, int *idata) {
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+    if(index < n) {
+        if(index >= step) {
+            odata[index] = idata[index - step] + idata[index];
+        } else {
+            //Elements before the step distance are unchanged this pass,
+            //so index 0 needs no special case.
+            odata[index] = idata[index];
+        }
+    }
+}
+
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
 */
-void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+void scan(int n, int *hst_odata, const int *hst_idata) {
+    //The number of blocks
+    dim3 blocksPerGrid((n + blockSize - 1)/blockSize);
+
+    //The number of threads per block
+    dim3 threadsPerBlock(blockSize);
+
+    //Allocate memory
+    cudaMalloc((void**)&dev_idata, n*sizeof(int));
+    checkCUDAError("Error in malloc of dev_idata");
+
+    cudaMalloc((void**)&dev_odata, n*sizeof(int));
+    checkCUDAError("Error in malloc of dev_odata");
+
+    //Copy the input array to the device
+    cpyHostToDevice(hst_idata, dev_idata, n*sizeof(int));
+
+    int passes = ilog2ceil(n);
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start);
+
+    //Ping-pong between the two buffers, one kernel launch per pass
+    for(int d = 1; d <= passes; ++d) {
+        if(d%2) {
+            kernNaive<<<blocksPerGrid, threadsPerBlock>>>(n, power(2, d - 1), dev_odata, dev_idata);
+        } else {
+            kernNaive<<<blocksPerGrid, threadsPerBlock>>>(n, power(2, d - 1), dev_idata, dev_odata);
+        }
+    }
+
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    float elapsedTime;
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("time is %f ms on the GPU\n", elapsedTime);
+
+    //The last pass wrote to dev_odata when passes is odd and to dev_idata
+    //when it is even. Shift the inclusive result right by one element and
+    //prepend the identity to make the scan exclusive.
+    int *dev_result = (passes % 2) ? dev_odata : dev_idata;
+    cpyDeviceToHost(hst_odata+1, dev_result, (n-1)*sizeof(int));
+    hst_odata[0] = 0;
+
+    cudaFree(dev_idata);
+    cudaFree(dev_odata);
 }
 
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..25934e2 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -16,6 +16,24 @@ void scan(int n, int *odata, const int *idata) {
     // TODO use `thrust::exclusive_scan`
     // example: for device_vectors dv_in and dv_out:
     // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+    thrust::host_vector<int> hst_in(idata, idata+n);
+    thrust::device_vector<int> dev_in(hst_in);
+    thrust::device_vector<int> dev_out(n);
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start);
+
+    thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin());
+
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    float elapsedTime;
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("time is %f ms on the GPU\n", elapsedTime);
+
+    thrust::copy(dev_out.begin(), dev_out.end(), odata);
 }
 
 }
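+
+// Note on the timings above: the power-of-two Thrust scan measured 3.317 ms,
+// while the non-power-of-two scan, which runs second, measured 0.444 ms.
+// This suggests the first exclusive_scan call is also paying one-time
+// library/context setup costs. A hedged sketch (dev_warm is a hypothetical
+// throwaway vector) that would keep that overhead out of the measurement:
+//
+//     thrust::device_vector<int> dev_warm(1024);
+//     thrust::exclusive_scan(dev_warm.begin(), dev_warm.end(), dev_warm.begin());
+//
+// run once before cudaEventRecord(start).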