From 6e7a605619109132f8fe66c6f2030619e4c674c1 Mon Sep 17 00:00:00 2001 From: xnieamo Date: Thu, 22 Sep 2016 19:42:04 -0400 Subject: [PATCH 1/6] finished up to part 3, passes all included tests so far --- src/main.cpp | 14 +-- stream_compaction/CMakeLists.txt | 2 +- stream_compaction/common.cu | 67 +++++++------ stream_compaction/common.h | 2 + stream_compaction/cpu.cu | 100 ++++++++++++++------ stream_compaction/efficient.cu | 156 ++++++++++++++++++++++++++----- stream_compaction/naive.cu | 66 +++++++++++-- 7 files changed, 309 insertions(+), 98 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 675da35..c307201 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,7 @@ #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int SIZE = 1 << 4; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; @@ -43,37 +43,37 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); 
StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..e31ca3c 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_30 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..5f9bef4 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,39 +1,48 @@ #include "common.h" void checkCUDAErrorFn(const char *msg, const char *file, int line) { - cudaError_t err = cudaGetLastError(); - if (cudaSuccess == err) { - return; - } - - fprintf(stderr, "CUDA error"); - if (file) { - fprintf(stderr, " (%s:%d)", file, line); - } - fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); - exit(EXIT_FAILURE); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess == err) { + return; + } + + fprintf(stderr, "CUDA error"); + if (file) { + fprintf(stderr, " (%s:%d)", file, line); + } + fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); + exit(EXIT_FAILURE); } namespace StreamCompaction { -namespace Common { - -/** - * Maps an array to an array of 0s and 1s for stream compaction. Elements - * which map to 0 will be removed, and elements which map to 1 will be kept. - */ -__global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO -} + namespace Common { -/** - * Performs scatter on an array. That is, for each element in idata, - * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. - */ -__global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO -} + /** + * Maps an array to an array of 0s and 1s for stream compaction. 
Elements + * which map to 0 will be removed, and elements which map to 1 will be kept. + */ -} + __global__ void kernMapToBoolean(int n, int *bools, const int *idata){ + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + bools[index] = 1; + if (idata[index] == 0) bools[index] = 0; + } + + + /** + * Performs scatter on an array. That is, for each element in idata, + * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. + */ + __global__ void kernScatter(int n, int *odata, + const int *idata, const int *bools, const int *indices) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + if (bools[index] == 1) odata[indices[index]] = idata[index]; + } + + } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..aec8363 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -3,6 +3,8 @@ #include #include #include +#include +#include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..fb52d56 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -2,35 +2,79 @@ #include "cpu.h" namespace StreamCompaction { -namespace CPU { - -/** - * CPU scan (prefix sum). - */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); -} + namespace CPU { -/** - * CPU stream compaction without using the scan function. - * - * @returns the number of elements remaining after compaction. - */ -int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; -} + /** + * CPU scan (prefix sum). + */ + void scan(int n, int *odata, const int *idata) { + // Initialize first value to 0 + odata[0] = 0; -/** - * CPU stream compaction using scan and scatter, like the parallel version. 
- * - * @returns the number of elements remaining after compaction. - */ -int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; -} + // Start loop at second element. The prefix sum should be sum of the + // previous elements in idata and odata + for (int x = 1; x < n; x++){ + odata[x] = idata[x - 1] + odata[x - 1]; + } -} + } + + /** + * CPU stream compaction without using the scan function. + * + * @returns the number of elements remaining after compaction. + */ + int compactWithoutScan(int n, int *odata, const int *idata) { + + // Set initial count of elements to 0. Also start an index tracker for the output variable + int numberOfNonZeroElements = 0; + int outIdx = 0; + + // Loop over each element in the input array + for (int x = 0; x < n; x++){ + if (idata[x] != 0){ + // If the value is nonzero, put into output array. Increment trackers as necessary. + odata[outIdx] = idata[x]; + outIdx++; + numberOfNonZeroElements++; + } + } + + return numberOfNonZeroElements; + } + + /** + * CPU stream compaction using scan and scatter, like the parallel version. + * + * @returns the number of elements remaining after compaction. + */ + int compactWithScan(int n, int *odata, const int *idata) { + // Allocate a temporary array and set each entry to 1 if the corresponding entry + // in idata should be kept, 0 otherwise. + int *tempArray = new int[n]; + for (int x = 0; x < n; x++){ + if (idata[x] == 0) tempArray[x] = 0; + else tempArray[x] = 1; + } + + // Run scan on tempArray + int *scanResults = new int[n]; + scan(n, scanResults, tempArray); + + // Scatter results into odata. Also keep track of number of elements added. 
+ int numberOfNonZeroElements = 0; + for (int x = 0; x < n; x++){ + if (tempArray[x] == 1) { + odata[scanResults[x]] = idata[x]; + numberOfNonZeroElements++; + } + } + + // Free memory for temporary arrays we created + delete[] tempArray, scanResults; + + return numberOfNonZeroElements; + } + + } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..660244e 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,32 +3,142 @@ #include "common.h" #include "efficient.h" +#define blockSize 128 + namespace StreamCompaction { -namespace Efficient { + namespace Efficient { -// TODO: __global__ + __global__ void kernUpSweep(int n, int *data, int d){ + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); -} + if ((index / float(d) - index / d) > 0) return; + data[index + d - 1] += data[index + d / 2 - 1]; + } -/** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. - */ -int compact(int n, int *odata, const int *idata) { - // TODO - return -1; -} + __global__ void kernDownSweep(int n, int *data, int d){ + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; -} + if ((index / float(d) - index / d) > 0) return; + int t = data[index + d / 2 - 1]; + data[index + d / 2 - 1] = data[index + d - 1]; + data[index + d - 1] += t; + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+ */ + void scan(int n, int *odata, const int *idata) { + + // Pad and resize idata into temporary array if it is not a power of 2 + int powerOf2Size = std::ceil(std::log2(n)); + int newN = std::pow(2, powerOf2Size); + int *temp = new int[newN]; + for (int x = 0; x < n; x++){ + temp[x] = idata[x]; + } + + dim3 fullBlocksPerGrid((newN + blockSize - 1) / blockSize); + + // Create GPU array pointers + int *dev_data; + + // Allocate GPU space + cudaMalloc((void**)&dev_data, newN * sizeof(int)); + checkCUDAErrorFn("Failed to allocate dev_data"); + + cudaMemcpy(dev_data, temp, sizeof(int)*newN, cudaMemcpyHostToDevice); + checkCUDAErrorFn("Failed to copy dev_iData"); + + // Perform scan + for (int x = 1; x < newN; x *= 2) { + kernUpSweep<<>>(newN, dev_data, 2 * x); + } + cudaMemcpy(temp, dev_data, sizeof(int)*newN, cudaMemcpyDeviceToHost); + temp[newN - 1] = 0; + cudaMemcpy(dev_data, temp, sizeof(int)*newN, cudaMemcpyHostToDevice); + for (int x = newN / 2; x > 0; x /= 2) { + kernDownSweep<<>>(newN, dev_data, 2 * x); + } + + cudaMemcpy(temp, dev_data, sizeof(int)*newN, cudaMemcpyDeviceToHost); + for (int x = 0; x < n; x++){ + odata[x] = temp[x]; + } + + cudaFree(dev_data); + delete[] temp; + } + + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. 
+ */ + int compact(int n, int *odata, const int *idata) { + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + int *dev_bools; + int *dev_idata; + int *dev_odata; + int *dev_indices; + + // Allocate GPU space + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + checkCUDAErrorFn("Failed to allocate dev_data"); + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAErrorFn("Failed to allocate dev_data"); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAErrorFn("Failed to allocate dev_data"); + + cudaMalloc((void**)&dev_indices, n * sizeof(int)); + checkCUDAErrorFn("Failed to allocate dev_data"); + + cudaMemcpy(dev_idata, idata, sizeof(int)*n, cudaMemcpyHostToDevice); + checkCUDAErrorFn("Failed to copy dev_iData"); + + + Common::kernMapToBoolean<<>>(n, dev_bools, dev_idata); + + int *bools = new int[n]; + + cudaMemcpy(bools, dev_bools, sizeof(int)*n, cudaMemcpyDeviceToHost); + checkCUDAErrorFn("Failed to copy bools"); + + scan(n, odata, bools); + + // Find number of elements. It is the last value in the indices array. If the + // last entry of bool is 1, that means we need to add 1 since the value be an index, + // not the count of elements. 
+ int numberOfElements = odata[n - 1]; + if (bools[n - 1] == 1) numberOfElements++; + + // Copy indices over + cudaMemcpy(dev_indices, odata, sizeof(int)*n, cudaMemcpyHostToDevice); + checkCUDAErrorFn("Failed to copy dev_oData"); + + Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_bools, dev_indices); + + // Bring odata back + cudaMemcpy(odata, dev_odata, sizeof(int)*n, cudaMemcpyDeviceToHost); + checkCUDAErrorFn("Failed to copy dev_oData"); + + // Free memories + cudaFree(dev_bools); + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_indices); + delete[] bools; + + return numberOfElements; + } + + } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..d506c5d 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,19 +2,65 @@ #include #include "common.h" #include "naive.h" +#include + +#define blockSize 128 namespace StreamCompaction { -namespace Naive { + namespace Naive { -// TODO: __global__ + __global__ void kernScanInnerLoop(int n, int *odata, int *idata, int d){ + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); -} + if (index >= d) + odata[index] = idata[index - d] + idata[index]; + else + odata[index] = idata[index]; -} + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+ */ + void scan(int n, int *odata, const int *idata) { + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + // Create GPU array pointers + int *dev_oData; + int *dev_iData; + + // Allocate GPU space + cudaMalloc((void**)&dev_oData, n * sizeof(int)); + cudaMalloc((void**)&dev_iData, n * sizeof(int)); + + // Copy data to GPU + cudaMemcpy(dev_iData, idata, sizeof(int)*n, cudaMemcpyHostToDevice); + checkCUDAErrorFn("Failed to copy dev_iData"); + cudaMemcpy(dev_oData, odata, sizeof(int)*n, cudaMemcpyHostToDevice); + checkCUDAErrorFn("Failed to copy dev_oData"); + + // Perform scan + for (int x = 1; x < n; x *= 2) { + kernScanInnerLoop << > >(n, dev_oData, dev_iData, x); + std::swap(dev_oData, dev_iData); + } + + // Swap back + std::swap(dev_oData, dev_iData); + + // Copy data back to CPU + cudaMemcpy(odata, dev_oData, sizeof(int)*n, cudaMemcpyDeviceToHost); + + // Shift right + for (int x = n - 1; x > 0; x--) odata[x] = odata[x - 1]; + odata[0] = 0; + + // Free memory on GPU and CPU + cudaFree(dev_iData); + cudaFree(dev_oData); + } + + } } From 7659058092ab41bfc812ba5eb337b68cad1b76e1 Mon Sep 17 00:00:00 2001 From: Xiaomao Ding Date: Tue, 27 Sep 2016 19:04:34 -0400 Subject: [PATCH 2/6] Update README.md --- README.md | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index b71c458..d59f0b2 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,26 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Xiaomao Ding +* Tested on: Windows 8.1, i7-4700MQ @ 2.40GHz 8.00GB, GT 750M 2047MB (Personal Computer) -### (TODO: Your README) +# Intro +The code in this repo implements stream compaction and scan algorithms on the GPU in CUDA as well as on the CPU in C++ for performance comparisons. 
The scan algorithm performs a parallel prefix sum on the GPU. For more information, read this [NVIDIA link](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html). -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +![Image of Prefix Sum](http://http.developer.nvidia.com/GPUGems3/elementLinks/39fig02.jpg) + Image from NVIDIA + +# Performance Analysis +This section below discusses the performance of the algorithms in this repository. + +### Optimal Block Size +Each GPU algorithm was tested using an array of 2^14 integers. The optimal block size was found to be 128-256 as shown below. All calculations following this section are done with block size 128. Performance was timed using CUDAEvents. + +| Block Size | Naive GPU scan (ms) | Efficient GPU scan (ms) | Efficient GPU Compaction (ms)| +| :------------- |-------------:| -----:|-----:| +| 64 | 0.124 | 0.527 |0.531 | +| 128 | 0.094 | 0.484 |0.412 | +| 256 | 0.095 | 0.473 |0.423 | +| 512 | 0.102 | 0.471 |0.454 | +| 1024 | 0.109 | 0.495 |0.487 | From 292a5980ce4d831ba97773559ee9cdcddd9ddb5e Mon Sep 17 00:00:00 2001 From: xnieamo Date: Tue, 27 Sep 2016 19:04:49 -0400 Subject: [PATCH 3/6] added plots and finished things --- images/blockSizePlot.png | Bin 0 -> 16947 bytes images/performanceChart.png | Bin 0 -> 12535 bytes src/main.cpp | 32 +++++++++++++++++++++++++++----- stream_compaction/efficient.cu | 12 ++++++++++++ stream_compaction/naive.cu | 12 ++++++++++++ stream_compaction/thrust.cu | 18 ++++++++++++++++++ 6 files changed, 69 insertions(+), 5 deletions(-) create mode 100644 images/blockSizePlot.png create mode 100644 images/performanceChart.png diff --git a/images/blockSizePlot.png b/images/blockSizePlot.png new file mode 100644 index 0000000000000000000000000000000000000000..2345b9efa113fff8ba8367c084a1af9fbfa2870e GIT binary patch literal 16947 zcmb`uWmr^g+cr!Hh=5pxNQ#2ANJ)o;bazUJgrp3q2&hOXEh&-$15yJ6LkLQDH%JLX 
zC@@3AdoHf)dhYvv-uL~!@7cDGAG!_9TI*cLdBnc&$8o;VP*WhgL~{uT2Zv1Yv8)yj z4xRxH4z9;VLhzfPOFk7iIMRoTveG)wCpM;u{BPKtw*C>t^pqpYscyuX5yh%l5m{Ked`XWr&v#@?C06)PV})1Kxt!=`T@PjyTXTw50u2C;xXUTzvKgo9K?ZZb@Wr(!r8Pbj4&~n*c2`Q>G$mEgLj|#gmvPL4PAUU%{LfDI?g|R(MKMTn zD<^Ub%t~M+HWdG0jI^}0thZ)R+XKox$(U%d?Z;Jg)Fc$_)|*?Ch!SFWo?6<6?i8WU zSak`95h5a@l{e(Y4X2nA+evXhY+HLQ>s>asb}v&G|43oqGZ}2<^)0@O2G#jQ`Bl!7 zAA_2kn@{L*deX%?9)*#<4`&D*^ZY#PJ?T79WXR_c{LHUweZ0JTW4SxUPpo9P@Xx_Y z#w2<-&gjq2@7+5IoQ7JSlr>XG7b+=qI+S?%`wBh5T+uk$h z%KySnx+cDQbs>W&*k?9i6odZ-I2t^0t*2B^|-V=~iI zQ`eCjKO%2+J7HY3$v#mHY#vJgM46@tI=kyMURo6JlVphrxDz9Ik>6rz@ucwC!Vh@K z!cwAn5ATt#w*+FO%b_@S&o4t$l%TA4adGeF1~le<&6)Cj@$M-;>LHi>KYw=QfqCB*LW1k7SiGXf9W zFYGxC<|b!s)vTn`mYdtGc#IYsk8)d$H3T&nI*!8Ap8VVEy${YQ()OA4--+zS4<2E) zkM05{w?s~10*h<^U{WV4OC*IH23i&-5#@!apTgtb?fxw=-n6%v`56jP zu7)^rYL&t!rv}pE;*$8@v7i(2eC$%QM5#UaDQ~N8YxGkWEXY>E6Ej&zAG|A-+gtMF zkyA`65kErv!U&t7`L64bv-kSg`vQC`#@!paE`>q0LgE_^X(Jco(kX|@rNu9ztvVBV zTxab!@J(*^tn`Z{bpHq>qmV6yWJGmW&_qzF22|Q6{f z(uOW+;2tj27mDuJbQQK~=;&Fgn&4nz`C+!#$XlTmxX0Q;BK_UVSaCese1XU z;33zaW^zgX&#}9%1MmGnj_ewEp~*@;9)228L|OVlMPFP2@K1avC)fK>OHLPJ1~1D~kIBOR>%Y>J^); zhz)bS=%|q+`;sU^t`kG`oTWHu8z#Tx3@AEb0{Un;izhF6-VbgFU}0_O1)D)m$aykS z`1u(43+(n5+@nz9+nS}1pV|ht;b^mVV*^#&)o_9}9=)n=^n9x~Q#!1L z;3`MntJC?93F++77s-=n4oYt4rA9lQ}8~k`7 zP1J56TM-vUSd8f-PxaeEji$l|h26t0)3FB~|4~SQx$|4E6{r^X_Z{+X2=w`eoB98d zi`J&7w@>Tx=oPo)Mtx$qd^n#qi=^T)OLZ|WPU|=0I-!F(Dy)zzsj7A-c=x`+<%k(w z$0ohQzL%PmLeK9844=rYgmpL{B~n(P?^My;z_<=*hm@eOAKBO&&R;OlC2df7(#$;K z)QK#;J--`Y9KJZTaPgu6jytiq>Zd$c9GnZBIODPn@kFpsRuJ5g!r85ei~?ZiN-4lQ z;EOmo=g8%M`xs1GWbpN+ZzgmHMq{6!DrRc{A>mO%Knc|UnpHU+kc^$UWnj%@LP-&` zF0=L1R<}^>{s6sn%Ncv6%+1YtP8u|QM>ZCjnN&Isf4N3Q#R{N7{Lk+ZDG_OUC1$4B z&FPfQXE05@=^w}>^h*K~vGg-gv6n3?VNH4gE+{2bkJ6?;{GC~J0|^yAz-Tg6W7W?- zoA@rpugWM4=BjQuV>c#Q0X%X-I=eKSc#md!ey7lX{#3u--`B$nwt_@@p>zGFphOz& z3`ef9k5@RvqNYKSN7KYO4&`6{jb;Kc4j_tOAB9|9WaKBSobNI*NiTOLTZOYKuk@sK zi*S*SO65dgFRdak(}`r0D5Z&beQ0TsO%`*(kpS1v7Lt`tEQHPIm#-;E)@1*7Rr^xLdyeOm6j 
zW9v7az-8Qp1gjG@b?c|+@!DQg^qbzCuI)x%XI0B4c{tJwqZ!DeVjl4#m)Os2opnFn zf0iuj1Fzfe;8^C*cG($iI2F(?2?v0d(PBQUOl|ruE9=peyi>C0Eix5K4zX1S@vS*` z@OGYPfXn;|Y0-9wmC;#e!>Xsr1-LstA9-R%AN{q#nhxae%fs8dQF}8fa3W<_G!n#b zud5rebsDC8MBUKR(y|LFx>?<9re9>hTMh>q_d(F9|9H8b`d{w}{0n+dkxS~$@*vZw zL=JT&^L6*iMNn2EKoN~J%=6Np5il^V_3o|;*3D6(7_W3nq!oLvu{K^lJWtOFR2M_D#{h(VhwOh?iR56swtZ@Mo;3gu2N z!)rG`;1gZ8Hf}6alM@LtL%5`&Of@t#kPS^V)A>PGnI*0438Qj(DG-DYeQMWjoby6n z{p+fUc0oY_{iJ<=O;3o|psG+iz{9RCTT?ZBgHJll8;wAQKJkg;(C;~3&lWeHc$DL1 zfb>L%z!gQsJ8(*U2v|p>NV7&}2aC=5gPW1-F6QBrVw0HrLjsIdD;0yoG zI{u$O{!LLhRw5!J{0y=($3V z!Y{|) zp1AZ#Ux9t8%bca6p!og`{Cw!7x>6eRf<^V?V~YBsjsTD!qNAH2jYtW;FaoHgVP`;ICOou zK%4bVbhHqHoI!$fV-n#rtu~52VHXxo2DWbdBXP2;eN*shlJHTP+q}&6vO2J`x8O1% z*seVk#5d_4Lg;}R*?%<*1ga29AE1;y;fX8Y(OCd@U8AGh*a_S~!8U7F0O}4Dd7zPJ zl5bjDbyG8c$I=jFkteefUTbm51D^n1wv~M%BZC)7gwxjFk-){l#KfePt8&w0*#>YR z`aP$u_;kTLAmap^)z_ojVR$-buFULX7VIUr-$F9*uY!NC+3jqFAh z`FHN#O`bH8ED|@QEZlnw`A-b7(Hh96b}mi@5quU7`P*jaiTq~Y#C%_$#-Ja?)vHWA z4aa3@?_KaLoC}W?+5pBiR!Zb{mcOxfcmARQs0xsG{(pDA^DnpAf+&ygi08;NZg80v zA-;6!gXQ3*BS(6WNR7<|p@`Y6EB93hnW^uO<9E4DNmzsRjZv++l8N+<5}Z?1I+}&)%DU8s?ufJXasKRBv0+Ao}e}kf;5vbHZ($RkW(OQmY^l< zd^EJSkx_o*3V%K>ga+#K^4{{B=x$U>|Gf6W;R=XxoVK^G1&Fb$57pFh0Wy%HbjTbq z;~>+gseqCRj^P}qPnYZyJ9Fy-Agd*RZbQen#92x0BPMSW;8LIQ&1 z2!V5U^g$?`oB``wkP0|Y8^DX%vTv?w6&nd0%!e~T&zY`h?LfjaO0uwvpWC;6$9rv# z&3~9D-2IvSIyhnLbgMR4#$sLnD{iP1S-g}QX1%<(V<1PlTQ23%qenTQ7-0jD^?-lF zqmOn6)osQ~pDfGtkx+5-GeHI$Z)0~H9Sk{rJ0VbK-5LNb~$NuTwx{!~Fu?c8_7IO+sBJFXGA(jJe<6@pR0{mfaaZJ&ua_AV49 zB~jmeVEwxHs27MgIm(GuLEPu5E(ase%F3FO&I$RFCZpWOgt5TEayyI3YPj>yE{*F& z-oy7S!a29@!WHm^A3V!nT~ZP*4G$yamUD}Whz&eBOwL@4z*_3&Yc!qK038I&)~ubU zb`@Bq5Rc7q^lFGvMe#g)ne$>zBmeAWO~sw&!K!geXPMD?gkReXRbe$D_r{XQ*cQQH zrK$$6cKCU9)+yJ^|F)22!)gvUOwwM+2{H4LSrt!#7R`ylh@pdN& zpK1*!s}FeWj)+{VKFz?OqG`rjM+tkp!L27-5wr%Mh&iXND!9=t$YI-*lWFwP`s#og zeE)sPU?+if2oRnQQ_)IjI8*@Dh#W5}tB9G477U8)0)-+~a0qW7)N=4Hb22S=?-1z> zZPM~ScQ>zF4IQw^7T0~FoRPTKYU-M>x&VLgYA4Au%WZ#eR^B__bu@tS4UwLoSR&Tl 
zAv&?ajGN4LqBAFta&-w0$+T4cq+Q|FQj}R{il=`krEg3!*aydlqLH*~4RgTA1~e z7E^YID@l!q3S@=g)OHX@opbx(-&Opgt0AyS8s7Z8LoJ~&rRcsC6$^HZqNeXw9(nfB z2bXKRvyRvJQDqkb^weHTu-zknhY}sc<>c$U{wXJWrecjHfB5Xn8^alM(J8B&wGJ=+ zZ|$YEma#hUh~h#?a{YS!u3MPgM$grdiHId&AlUTNRo+wY>u&VaXTh?qu08vSEKmD{ zab>dzJJ$%vtT8JTHW*e!+uEO?#DfKR!rl+Ft*@R8ez6x+t6N`HWcJd>!(x>b-}HrZ zI~o*j&#rktW-h`7JAE;!JYU?sS##9cvn7)PUZ+4xpCYOy(>$O?B5{ASzVTbUq!DdN z6NhaXvH3MkU2E@T(iI$~ljCQK$(oM>X?Bu0{R=Dn$CoJ&To-~q=BSTOb`Ky{0=5s0 zqdU$b3z@z0#5pka_Kecz0)ol~1r*@%94Cg~J)bx?B0>fT1h+~PmarG38j(qsR#rIx zM{DJ&(XEaYrzE>|GtgCi@d$qIKd<)qK01{oCIzJGR=Jh~Qhi_-qd1pHYO!nBgjebu zmf^2|o7>xSf^OpnYZFvBz%-)!kzBI|3sD*u5sb2hJBg0 z^_rrImQxQ9Njxo_Eqz4ugpAQ*NmL}G_1#UruDG)&Po4zYmFoiL?1;p{fPB#ovbMd- zB5_X^R{xRr{!|ax`_aiAdS2?2?+YuM)(-`tu^#s8^OWT>&Zdsj~vsBBq~6R9c#8a#V~`59#of%FVM~# zNNT$d#rpVoKqU1Pf60p*7g5drkA2LCm{#ihNPc0IZU%-Do72=)jG{GKW!(2Ml)|9-A-73Hd?43xGBq=FO{w3N=!6HWatpJtxd^0&P~!;Xh(WNma!dR;UzuX*R2Ui0@0 zR_PzCwx-xvPXlzrP-ZJM#&>=uSIkv9zwLE;h-diKb5(0~s-_$rQ0u+%fbR0lC*{@9 zOw3o9stoE^dHQ!I3#PRG&kCIt4uf4$4*+uj+`=F%`p;#3=gF#gAWDCKRR}dS6c_?z zn{SW)qC+w_XJG*}m>lH&^;)_x{{+EAw}fIN&kHU5x_AF2r_{tzp0?q!M0MEl%t?Yr zrvS_I9RKH`XLF$LRm@S0N15?hj|0LBu{-#lZjpf|5E+(eO2x2C*I6Mnbb&J)2xo#2 z>l=cbPQJT08iAc=ZBM|Gt!jw?Ju{)#7d)pH zi|5a8`VEa8o7)X$Qaz5C$9x<4!%WzAx{2woxI$!X=K4}!E2D4~^G&Y*tCfPs+Y*)J z2LpA!9&aK<&FL-D#@p{%tXcu=4YAXRo*oy9N_5eNODZ#=fQhgIDLg7ak#8B<@wp;V znMc&KbZe)q_StHMcL^*a`HH}&g-RCxG#c;NRAST%vl5d{%O1Pj2%dO$-4F2``X5CW z0c}Y6aMSfBK~YT16~hYqs{lT#xVhh`L}Lg|8f% zDky`l7qo^p>w?SEZ+k{jAA2oFtgv27n|w`(dgodC0_(oIexS#bxL1adQxrZ7WEK{# ztVuPT;wtq;{latv;Y!?(VdZE~Zo&8lUto_Knr z2L3WvKbO~o8s#62bw8oU);sYeR@&2<9=DBlTj$%2hWqx|s+Z{3fBKB;6{SF?FOv9q z$Tl&69v-KQ!|o+2iV>kxP|=WnsS> zPm7nva|7td^y_=y4wO#~%EuL=FZv7mAR@E8n1qbrw7RONl%6x zutbS$kz!kE{xmYUJc@cKoztmh?i3rJbJRj~mWp08`=-mCi!ynnJALv z`k8Gjs8jstX|Ent#=i-nibMjfAy8*3a;@B=sMq(}9|iSzc_CyO_M-^9t$;fa@$RFS z1_oOuI=!raYV461`|ZJsSLK>t1`{xD4TuyT;%A*uR<`!M!3bNrP3*F+_dhP&tS9NG_h(tW@1rKI^H)e#ln12%!D;h@ 
zQcB3ukB?DJel2r|=C!m8^jas3p9Sw{Dyc3NpgxT1&nPRM`rNG|sY!XRZ1P&V9Vy#v ztWq1HpUy!&<(OGu1$d>N3p1gRP@VdhcMU?$@WM9)$x#sf!in>`Nox+U3duwCFX@%n zO`A{1liYNbR**`TI@A!IN}}fgObOI%0G-9vvr@%mp`bkJPi6R0;@D3}+UR`zRt`#vc_=LzfCywTgF|4H6g z=|KZ`cI8m{7qC&1lualHBFlS)AeH%xd{SqQ0W%@%We&)u!vPK|j2UA z!cI6n@Lo4iR8soz=}~wGXm~uG&7*o_hV(2*74y}S`HN+(($9X;9nXOu({tkqTIfl=u6dWeYKQJ7H~>%x-u|R2o#>_sTt5%8~UQ#i4g`ZQKq{} zalZ}nQ2M>%d$b?|Hf<>?eNGvFU`_>4Bh?_pCN{a*uE6XjO!ViY2V$fot}`&Wo_dkU z{KLFOg@p~@Qt@ma(ms3kZ23l3EY|GgcQZM_ay5y%L-a$E6P5YXz>Bya^w9HG7yRJT zf=!7#%sVfnzZE4NwS0j=?F=UB$r7fD#VSTQQ#sFyc90ei`l!N9U5r_9G_2> zpaE2|`|^UT@eS0~NohL9%>9A8cVRl3{h@gqDb7ZL->v*Yr*6YQkzr4!I#SL)A_%TbDF zwIgIDbLd?AlpjON~6MQhCxf6Mgl-R6QRG z{(1Uu6gZ2a!Ic(Gcm+BIA8LxljchvN-hUKu`Z&Xp#`h>Z2(K+kcW7qKq|Kef5%Dd4 z?{yQa{4^9)_=OISaoR|5m79krDxFj_zW{YHx#%eXJfsh z0pld%C5#xXIy5Dk!4#pfi8pWF*s_eRWCW%AkhTS5^$s{3qtmXVc`Y>}CIn0(S7C*~ zZ`SViLWUj1OAsP623_G8Fz4_m3ODfYb)gK{h_FdlLq1QO?i`!wdpJT5(#KEe@x`<< z1pU5@o7^Yr8Qz=aU0m*N);LbXYrl*<^biT_DyPAGpm) z9jq0WbPXeLs|N>P)W4lbJI795rL!bur8CakjY*O#$V;_}ZUi{K!WQxOG-bTEX1Iun ziLc$f$p-qi37~m5t}TS~J3Dk;PqUV?IBQ(w9VdWADdEodN#!^HJWvoo;b{}74%Iefuz0tsL=>COD+%* zb*78^JrBo`1T{jWz#jQ~sXDma%(V~fk&A_qrXTqJ-Dk@8$cS+cm@K+eg|+E@7q7DE z92GIHUmzRB1A!k=IRC_%4!?qr(G&{I_uy6N7c@o4`lYr?Y`3u3vH%g5VF2KW|&QIg|zcz@S zD?iU=S&pv^^Oy<6fz6BVGZW%6HV|;4HZ8!IM{)wL+j~vTqo48-;@opZASsIwikhg_ zM+37S@LH@EZBTIN#eEAUQ3go(&Ye55KoD|JyNP#@JYqP^aAl)BKLFs!Re92n0Cb#s zzzhGVwBq0#2wE6|s^sqeVC&M{PaubGWO;A7Dw*Hm188Ra>0z36J1^$$f1$Oa!K)!WSYfas>36y8WpUcJaFug%b0PUK z$F3KeO`u={niUu|kq0d+R@a7;s$(!%P?Pn{i<^gQphP<_3Th3lECqRr1{8atM+DA5 zFE3I2x;)71S9@i%Z}E;H!mZ$iH@`c`0zq|&3#dB+C^x9zRa$x1&Z({ydEiE|Ff}vrs=gsNdOgizU~TFwf&vk#VKWZF5f6y=dfMI@)pX zC&FHVGQp%SIM}P7_Muzj&|QZoes8Yb?LvlFCy7DeJz(9q+iJCV^9u}GtRm8QI`XkjgAt^z%J)9m>?!A$|jxB>-s zFk=NG;^InoN@5`m0;q%608DUUknlGE(tF0d)vf^AN$zxaYEtru6O!=&d@b5#Xf@GS zHXaNHpl-YP`JbQ5ami(PBVlB>#MEk*2tm#fjLma&9tO@}4K!r1@}SGkMZvBU^ZU2k^-|Bkd|s;`8Y!>D6!GJAhc?BvYeDrB zfoUj^My)+>fHJ+h%aayH(BH>H*mP3z9r@O{FJ*(GGEn7gJl+r_xkbHw1EP?AON(_s 
zEOuYpdqg%x`z4Bl7^!x;s1J|xOZhx%@sR6nYaSHUlvX{U}D!2JX+>Jx4?FK zU$a~MpEs}xuklz>`YXBnzJ&D25IZv67QU;1?g#C=C)rj1^`;|g`m#P3 zYZ6L8jeQBwKd7%cfJ{IkQ{BmAULD&=<^dJGztT%dg%%k(I$4b-9JT-=(l^vj+c z=PDa3g>I#Mya)qaqDC{EVgT}}4hg%S)Ia-%ex+GiSx~PVw)=|-C<)7SkD~HI*ImvF z(1<4~U8{tXdTXiO&q%0ZT_{@58)dxU&bLv#tvlor$^k%=@!-UNEeS~tI{Iy*FHJfP zhu|1}HNeJb;JI*W7xmWrK~uxG`Koh)|H zOTJJP!BG0)!v||1S=ior0{Z?cPmT&p>e(}bPKEhunHAhAA=QNv;E0Xy6R{Qg^wb^= z01>ZsefaAIm9Ll9eQ#||QC`_akE*8!9y)-Cxd&m1nM^r!5#T6lV)6Il9}VVOU8M0V zY-1he`)jKsaq$U%M=_562I(R-N?L*R3+p%f$`zytjW^wS#UTk!0TM?IB~)OVNnWLt z4r4lZ26^v99dZ2gcXHcJFD}Pu8rmgZ%hK60iS&OQ2`8KCD@zlgT)wz=bm9-4o2~ss z!iE>ldd_1VZ|(=Mi*)Bt0DYx7juVsVJxp_Yd_REs0ze{MjqG%Tq&$&X*zB=8(SRTS zNm+W)hsRz^d=N7=Jh6p}cf(?K=*w;_IXmUesvGU!Ev;9Ed3 z0PtZ9prZA$U&8P6qo}RH+X6k|u{wS{1snwM2tg_!Z4($xEynftkOX=p`p?DZ6?4e9 zWW5gqmhB`GHjt5kXdbQ+g^U6e3x7~gM1!tal?+A*f;|;_J*Blqp zoOx&zq%ra@!pse6TkmLS?_wGpe_n<6bU8n6hKw70t1#bGo!kh3W6S%;O>RecJwTpg z@@Y%|qpzg2zrarmwUev=fPg!vis9V1z%x!DjfiDk0;*m>hY)&3+ztnC^STK%M6aC= z0`#{WT`S_;!Z?2kTI;}|2#_hTzpnO!t#M7@+(~`RzsUag#pY9hH&D5m@p|RlNHQ3E z1!io8l2)m;5MhIE(Cot;i~s7p=XpvfoH4fv=Wv1-@^Q?5`s?+&L6N8h>GP2hS9}z8 z#kb*~RFsn*ro$vP!>2ZbepZDu^@gMXv&(+TqCK6+=e|i3n4q?5$m&AxmbaE z3{a6UDV>OpZC4W0f1tx7>_#lyv=$kbw^e_n2HL~2R%WuG@D_S7b+S8}4?0DPTbAy< z3Ov7EGP>g(^&J4;E*d0;W6i*5>BmXXKKTGDLAA>$n3z8R)6k~)BY7Et>;M(z@bX-P zG@#c_1sQ7Pc~%DEr&}xEhT}gVeFV;Z)RY^@vx$6m129>&3<)l2X=(T*ov?e$F`Dw9 z4oMs|7xYhyq?-jKU=ho{OZwU~X=}Wr@obLfd>_KjOG+LXpC3nPYis8K_!x-rAu-@G zs)(s{8vU4A1z5ZNDSOv=0qt6~?#!RJ7uMYbO1Dx#fQQb+3XTpZaV+j4$N!E| zkO0xySslK(D_?t~l~Zy8)=yYWl97c3u6=(6QUq8w03)oCEIBFYwU!}4ei|sa_kV&> z5fVt84AA?@S98=Hu2TJx(%sakn#S;zJhbgNP>dE|4Y}@`dQVP1QE>iCXRE^?Z|d_s zK^ihBz>oU}U!+OC{u(jySwR*wP{aiH*{;KpVN>q~-~S>yvpA0p0&Ds8R0F>ji%EJq z$Yy>Gz-CLI3-27P2=HHnR&t?3*z)&MR7rMdA#vycV8Z()?w0NLG{n#|@N&+@V4?Xn zlo3<_srQ1{ZwBQE3ncM>B8j{JCF5jbeUO7<6l4a!4ZjKpowW~P;>yfWAW7L8@Kvf; z+r0$fxh}>D{5@r_Z9H4{uUkKG8X+pr0W$X~kj)b++yv%eKqLYSGI{-Lzgx?~^2v4h zVN|WCvq;$9Sr34OAf>b^HL?p-zVu8p7y^lgyd*|)Ikf*il(1qb8Wbw-&nx9w?VB6$ 
z&|H8PUv}HeshU_HNgL2`l?)GBtD%?JcEg9E6~_aV&y%+kz>7H&oETGVnBSg1Fh;O?Mo!!uVDO;|2-JaswGnP1Lx7eJ{b1>x3R)JP3Vo7 zIOL=0xj+Vpjb&Bu_?zzivs7>E0 z`^SyubJ<07ja@cu<7er%!_pI{6yX2|VWX zZhrpT?X25if~&SZH}&_6xe}kwgNNoNvo(y6rPyUuca4<>4c{L6x`eB2VB!7r(Th`4 zWbc~X7!|jPeLd%5Xh0*|=e<=eL`{iz0C)|X?Zx)xi0Ck#le1NS?i0i2Pxa~(A%Z|! zEH9os9j^6pz0S$1`tc6XcgcRCNUSr6KxhmReWic>G9;wC;q>x77_4yv-xsqs%u;GV z#3n9Pke3eYsK;idzx)E?ONz)6;TqZFHF1^g0<&k;|@ zEoV!J%vOYEr6!vXk~+OCdh!W5VkYa&8a!?VZIv`sUED<>ZyYrNhq1?wgTpmR={NJH z>c}73>`Iuf3GNNUb-(C*^((FrLcd{6cXQSc;5saKS#UZlV|g~>Oz*e;HF&003hN7| zx>fc(e<7sEXlKv@ulR(}~;V z3~f{%l6~YuV0!;!gDK)oR8-IPqhM{@nLOqG<@yQTGd1+sr#-(tC7>P>)&;G8Z95cA z?OoXV+rrhECqUSVDIk=^70*kR{PUGuOvc)(>`%fe+JF252x+BVuuik zky7U{1jiM)fh@aikQY=E_#DY*Tven%mVljE#hrn=Xdd^ml=gm$*x)5JMilX9@6_yZ zBa-&$g`E0LphXXUMlcI);j5j_ZZ)hR?Qw&iE=x>m^inn-zn+N8HJ1A8J&&Ogq-AhJ}ZxXp^`Uy;m7| zVRq&f_4LvrV_x2K&j|u?fy@S1fLL=aMhq*~vr%3&7Z(;5x}63|T+s_@PjsIdP!0P1x(<(xQ$79|XcJjuLzGCKS5BI%zpzhh2PakhT?%L8 zj`p@(AvE3CCG6OI$&tIZ%j?gv!*zdQ3-r7bq z;DL%FRe$pHB7jez9-W^jI4{khW`VhBFtqwt;r<_eJg@Tqy7vF_aqa^E;2*%tcVsKZ zij4rxPU;KjbQL$C=bA#cTdu1oO`aX}28jiZg44Y1DbQ#zqpSTyMS6LRhz7E71HL1_p#wAaxfE$nv29gXs1^dq4#e98g&Vr*jbp zhJIZ|at_tE_-x;KZcZ6Oq7vx3BF>@!pzoZj@yG$vJT$bvQQ0J1h1MAy2_%>Auc0uq2oP&LG8L}34EjSz931-5c zxSXfhyLS8RU?lhsD2|Pb>zao@BY3)2#mNMq(@J16XngPFPb-7g7acYaKXB?B8lV0y zZQ|qN?x>4n;*P<=W2}hybS_H65g?*myT=ZFN-8Q9h(T?4J)U^129TU4_Z+?=b&YfK z@Y1|YjZ{idmTU3)X1sZ_Ww+)$)1o@uPT(db6MtuLID8 z**F46OH@n~2d;*INeBT1t6Hjki9R?!7zbi(7AWtQuTE+K`Ti^>Gx@9k^G5o82fN;~=HNOCN_BSt#9RS0p6s3dtUx~Kl!LN`&Tsm@^FYbrPk_U38*BiRLoy z3VY20V+5S8FK)!s5T)}41QVENAmreJlV7^DZQ$Df5{4einHdq`SP7V!G^#X)l0po} zuT>F9qZIbvFBPtQ&b#sh5Acd$u`Q755}?FG&n{j(3SdC=OP{}x|9{H~P|o@PlNW4< aoDtP_RWcT#??{3><0#6h$--qUUj8rqBa_nr literal 0 HcmV?d00001 diff --git a/images/performanceChart.png b/images/performanceChart.png new file mode 100644 index 0000000000000000000000000000000000000000..332ce15ddec7f9d4f858bdafedb1f8a21c6a5af1 GIT binary patch literal 12535 
zcmb7q1z1#T)Gm#Jh(Vk)lr54Tr4dk&Pys0gX;CDG?k)ubk%J;CNS8==4~QZt5(7AN zcMZ+Z+_eYvoImcp|NlJdNtj0c`6S4`Mq*t@Ed>HmqOt2l0esp=fV#gIPI{W_H`H8=VBiFnd z-hIA*L9u92f5CmUccYiUMNUF;`Dw|sLio6k4<%ewl97-cGbH_AULMMV1^#*We}B0T zl(73<;=BLzOBKV54}N4((V^P;z2<2*aD^CS@72#7Y4XABD{EuXW78y>das98ymtuR zE1x;Gn?7o!81a|QNw<%UcD`F!^Rng9D{1Y_GrL!@R&PCB+P9UlHNx+m-4H3>RpL11 z(Glh6Hd{_ebDHT~DWCOh%#XJn`s!c>q7Gayn#FAJ(V}{lWZmk+1&gU(HtE>eEilrb zdPq&*u-r%3n)U9xou)OAy29`^dS`Qtb!Quv!G0q3@$qj?I!wP? ze0uoXs{)$=>5Z1??O}{&p;eEr%$K#rG0WNBa;LFK=aL$3jSn`m$O&xwFf#F4_C}_X@A{dwE4$VWZzh<%LXuQu`-WYgV9&zf*ZpHxf@p$ z-=2?E%s;!jVL#T8|NS+0mosLOO_;IR@RMI%RBFuyk^v-z&(?pLuhW9=<2zppvG z5Co%$O5SkeSrtXy_1M`W@VoO$WJpB_n7u?-s<|_~m98M98fZQe7ELIg;PK0*e!tOz@ z3@g+Y_TK#bJf&B?r02TUg9k}Ukz#^#?Usc-qwMZm%WXYvG}TcHBXy-U4&4wDW4Hx> z$+N2x*!_$(FW7Ra*@(+IsB zSCN?#9br}SZL~2udZ@VfiM)PhqM%H%>&8rBkNx1_V0T;iU~`+R(uNBucaemvQsnCg z?q$)3FKHJU*D(>^%JoXiGs;+3Y|U8nyQeQLEqTcF&U){-6WUFpJ(*lAT9TCT*?n7h zXG$-nphvFpfxY)iooAhb&945{-H2NkX5)`kh&@-@M5x^m|D^62jym8OMTj!hz93BL z+6Sr6Uu_=T6U!J|A0f=lWyts_X>mz6NdM8ov%I7LyZDOatoSB})w#REohuNZ0*&Xj zGIT`=C0&Kq{2G;9K`Sm&HyY`-+_g1__Y3QdDxD6%gb90wYq~FW7H{cNe)q$S`qMJ= zx%0RroL@3QryfXOsRo$-BP_^H1rr{l^VTwzbuNO`$Y;=QwNC7?DnNVC5E zSQ4L}z9{#j>Rjyx6$u4TD2=raIWLvh5>{UCaO=K+q#6OwgjqB4<~WKYmeVO?kl3)j zo&>#($<)kljpoUpsWV?$tR4DV9>lobDX_R>Sfb!oWr-4ZO4G4USQ$4t z_{aGeEz;B?MB0Xiu44OSHZh0VaA9k&4it|y*U}4dd~NpVM9EYJx5N3v{O`?sZjGbT zytdXECj@C1-Zt!OZPUJ=ac^#^<)(qq!mXz0lwA5{$3Bc_k9(Y>Yi~B^k6rOm)}fr; zN`X%v%1hE5Rmj6BsCyKb7c@{EV6WdVAAe52spXx`K+>4B#6s=K;%&w9+iGgfjAp@v zOV;1{R;TjOZB$W(b|c@jOn#VcJc@gCT*L*3)v@O7#ZceKUQ)0+f&rlVBLIL{ zf+={QER!+8KmYGbQd)TApNIc-d7DDqcJPtoWb3)cG4S9!6<%9uu8X5WhLt2;rOy1; ztjK$ha;^I$nco{387+9|2eV5|#|KO1Tr_FWx_) z=^S`^^sF}IN%B6H3-_PUv5GVW9Apc>cjyZGIb#{kqBaE&gQ%fd1eK$lBWQW^P?le$ zq-Q7KlR>U222-6 z>+zTerrP=DuZN2N_0R#WiHV7@w{Nxjy{+|0Q?oK<5kPpDIZGf?*k#U89BN87`E6DG zKm;tF;4(Br`|Fpx70XWeALcV;T+T<_O=EgB4?;JpJO;Hu~p5LHXBTbKTL^eLvHleR8fmh()-hu4lZ_uqK$J5a8VC zdN_yortMYTGmnX%Je2i}Aqyp7YM7Fe5*8V0dGxGu9zZcFjPU*1Lo6aTF?4KV0(0$9 
z`qE_snOvzp!2gaCpop!nL;si4Xyn-gha zEBmx!f{$hmSNf9Ua*S^{dxJ*`nYVH}dw+RyIPB$1F!%lYMN;;(@W%S^tDB%EK+>M? zeNO+j*A+BkFE4HF@O<;1Ts8KeDLJi|xH3uP5XBq^ov{m&a!W|`965TFc5{}FIi?KD zP1NpXj^K5l#FS5;9#VI8v)N+37$6Iv|8s7gYWav2FG?0Vk`=!-MAIJVmQgla7VyTp6rvQtfd1$GW z(U*NUu3fVwTtp&ofb912TS^|7Cyt*{06=3;HJWziRY?%%iZ@p&Si-zZE48?U@^~}l(25)N3HC* zND=31Kna6l^@#d^sli&nOJzZQT^eEu*v5F zP{+SG>bsj1q(xBW-+%mYAmk^@dw6)XkL6o-;rzL=XVAqG^n2^YMI~*5>-6KJ#hn|B z?%n$=c~kRX8({=|z|Q;ELx<#sz6OyK`1MNe2cZOMHp=dP4(NcgV}hiO!zbm2*UL7E z#iNkhi`!y;Kw?OagWlEiP}bjt;IiW0+a87#+=U@2GLH}5+1cLeQNTr5W6Yo@9I7Xe z#*-q^un$@b$*_=+0X&DO-4{KEi&A2XS?bBkWv;(AuB@i^65#hwUc=}M#?ONOym)4M z(QRq`E|i_TS7&c(U%!4m!-RzlvK<8A?N;bcQ=xz82^TOMY~XAi4?TP4%o!W+o)Sl= zF3ZB=b6-^B*Zdsnjv)?I>A&3ji0HSjfhL%P*Tdj@rKWKY0n!|&jZ-B0*w{`)b9cfO}D0~LrW?I zp_{E7DR!qWjMozUf%VFsHn`5E-cGkWj-G5yVU_*$+jB}3Eazy@@S3Z+QrwQ=7Ko1ZQaWY2(rUqgoFsRXi%QlX{LB zGRn7Q=vhKPNBqtU!5J#j)P(VphYzXfjC82r(|bRewyhwwR>UmW?ph?I&}C;++77`Z{tpG43Lx$MS`8{D}&PBVRp{%cARLivv#J<{c0L->xuiZ@mF z3HTn%&jEBxioe!7{vI1IUm8*fMG2uAPMCtW}dbmM?(0xy*&}xK^d{6aE3-RYFpc5_H7;A{Eb1c`25VEuYK!Vo} z+}a>G>!?AD*`%2XL&idwF5eC98^E^2XEnAA@ncrwbSg8?RY7)SoES?-(Dd(wsMUi1 zyN#MORRAha)B5t%7*l~JRB8!?(K0A2wzai&+TL8N+!rR7S7nAvd#95;B60KN9g=AL zC!*CoEfILukkvA-Tx>VeeNXJ64LonWT|d*39Bd5uXhZ%5Yc`-2f6plPHiB$K>XEEp zyI#)!s9T%^!T7z0K!$^L>=Y|r-@10~n$>5GNH!7U?+h##A1JnuO&s_f8phFC8xJ+RN-mli&wL2f z!a4V~g%SLZOP1a$$;xam*ng?x7K2z9pM*r@$=@OMIuv}eBuJPr-SHVo3ws`4_59C^ zY0~~Qw}Av?&I19BX_Y0U_#aDV@>w&(>Rzl=$uCX^1hAsh(0{ zbk=XZRh0|`uyjWwc^U{Z1iXqO8D=+s*t&F`f6gk}47$AsA)CK4)1_l%6vwMu^tUDI zUrT`Vy~q^w%YRUhLV{;`wy$pzmtKb@|0N=gOxv-45{4g;n>AR{UpKFd8S0g}gq)=K zZJPU)AHmkpPq(8p%O3#wGiFNqO9Rz(T4$BtXSKF!IG?!->8qz|Wn;FY??HWdbp!sl z%1KRkbaz@&EFBLQT-2cr89P@?H(ElFX2|7{3Gs94gv(uWjy)r*8NfXJ?koRYu50Wo8CL8|8E z=B|Ufd;jUvr+1^pTg5R;Qu!w4GY-lLZ)OW28pyz0akl$-i0W1v&fIHjlG$#^M zr*5=6kRRx-f^UfNy@zWLPXXk? 
zc*~=+0N@}3A&nuRa1iL!>C+uh-g-Jeo>Nb1fBVXs>xcoV1oJT-~`@8#cj02Zvy4sYE0}0lSSseuS1ssSL z@elCAeOe40>msCq$Q~|No`AlQ=vXn}Lzkg%hc;5hWA~XokA6mvm>pabI>X11+la_+ z*C&6G0FMV{z^{&GK|@^e$V|BUG29@won%blDW9Qq{9a&C2JY-Vc@CTb(C2$Zl-1LU zrAVZ7Dj{x;I2+>8ASf*b4}v)V|4+i+;A1lLv4C_D8GQ|^@*;kJ9HNT2W?lqwQQ1GfVyBMo3#Bz%R*m8d~a{_ z)sKB-|9NsyhPb2MC`P!H6NX5nQSgA`!Ro58!`)bpS#7d0Qfr#V_7!mnDQP)J+j|ll&F^~(9&HeXJ{s$EvH(=(ZP&3EtK;kO4qKjxK`#AtZ&?Rw*$@Qm5od8T)n`E+dvjgjIe z(1cL%bSjmaU7SGxZLA1N*ow8td!qR?y7hx9)~i_8TLPgWZ5P*ozY6{PcMo~vq&6~l z^*0*-PoQT$i3FKv+3v+Luef+ulHxlfATb7%fZFr%A8h}R?H~qdID}H$BX6OBL!PR2 z*j%+`0V*EhRKvo;zOn$fM@m~gKk~*v8*r^&rQ6`svuRMW_-tST4E&}c^b0GE-rFsZ z7KHm$jOQrA>~$mv$RF6dzy$Lv zAqgb~g)E?PifdM;JCK6y=_6?#tZyS;-v)33vp&g9C`Vv%(}<}3==1XVuPpc$BZ0NPooy57H?qK% z8w-7zLryJ9!X0^LEpbWpShutZVzN<@bC|-_hwCg01&z}^ShisR*E?NM7Uz5~4e$hp>B1vJZA7-j}kpQrDh#m~~eCJrNSspyw{lP*Y zPshRUJG25b)yxJz>^q-J;&<-RkuiOr<2Ue6A*ViA6mm|lO8)Lgqv~hKP`Y$1Q@6O; z3Ew@j;lvNU7VzHFA@c0m;0Sd;e*9QmE9~{N&EDJ8e$5VClSkk?D|`y_D33_nphMv0Hu&txlkrVcq8SXHG&|$*!QD^dyKQUOtY&jmbP6nN z2V8;7z*yVs?|m=k%ERfOng;cY&Lp0fF1!%jr#1J9zYo}FD#@Sq?=IE=``Gg&*zbwK zaAEDOsf(>%yv=RNeb99l(^|U=7$r+j`zuPV4hHh;MmRW)!C(aGkL=?^fYrCR+%Ft- zgoDHS=o!U1%Lv*N6{;Nyrp{LCF)*{uhhjYDk`a~cBh5h#vlY?-9LvZ`oijMo?P5D) zW|gIs^AHivGxgRXQvd~LQj(+Wt|+QUt)KFk$!{B7t)5C2bpa?Vgj zrm$t_RiYdj-5|$u9ijiz!vg17{l3iw-tQ+g{kdjeN+FYS@UiuURr9NICvAX0_G zX92wAI6fL0mF?wB?~a6PugLn^&P|{w=aL#XmE%PwD>BPh!nekxc5JKL(K@G|e;+-Q zc}``kugQI3?lXPq#CgwJ^X{VTNO9*Ek_QiEsT^`m7yWKou9^U!o;;zs)tsSUUecAQ zj}|E-)CRi|@NBE6_`@$?vs=H>vIpr;dN%WIYTGB1EHvNXm%X~vX8O8nsbXhqw(=e- zzggk+dT~`~pS-dCt3HMg+t%n;{Q-0=B~qWaq+)}~VVZIxQ&8SDU3g)Nzi*JojxH(X z>a5`Ak%VP4-8bjd-YVR@=^Q-UnO~Q$PB{vFko{d5b-CCjj~SD&_wV0Nxerp$R_tK5 z2o*c^heI9meIK*Can>w`3t5_T-%hh3Yzc5hM8qlkuFrfvfh6(JKVffNwQeUXIPoVm zrCLf}0m+~Z-D)?(TU?y!^Zeu7;l9N%6l1DYrG89-MW6)Xt*{+wBVIW){5lvlmwQ{gso4( ze!BfH_tAGI7LcIvo+|EwZjOuf!ejeX#Bh`3sF9{`sC7&)b46g-xh7LQpn=!}oZrJw zR%8l4TanEOXUBff8($Y$CS}xMJdMqJ*U572%|??#s$8l=0J_v^CJka)R60O9R64*k 
zL55PR5qpizE9%IBNUETvy#)toxSoQLI@Sza;mpO8?R|E2yrS$a2RBhaRR*x*{!D`C zCFQ$U<0V2apA<$VVng4UDur~GLDpvF;PP|kj25*|qGk7zlsVrmV485#R}PH;A5De4 z3{jfGCP8hcV)Ijcp)Z{MX&CKvo6iag3N~S&u$4}zVKEBgPSa1`?vSI3_aDS%R&19< zP%R~eIFv1We~BkcJ7;4lGkBKr9aT{1ITfT=C^Cw3tyk5j)r-Fyf`%~uY*macu+(8@ z{k;4dCZw16pAIJf9ji)9NoQRjljOQHAM9NuC3{jG6&o&yzMYX7-}7TjfLFK4tV!V= zS0WEL=w&wogBe7*nhQ<{r(lWkzNi6IgoxueJ9ZSR%)@i^RaKWP?{D#!En z1=|!^LQ*xu8HPA<$4Q2jH7)^Ek4nYd3iSeQ5XRi$dwh^p`Br<0(~SAzSYzY7hyQbg zi0?5dwoB})sh_=Wgle@`kHNOqkdlSbOJZ;2S*B?=`DcXW)ZQA9o>JEEh;kX+kI)a? za$M)b`P|_!#mUk3$w$u-2IlQLUDCxd0ZRtO_@WR2HrAk)y_IglN&;4C%)GREg4!vb z-xyF5;bPc#uW}qF)C#{UdrG^ru0^hA$)(t_47d<_oO^!mD??9BtZ`L_2>2bggk^Au zK(LH^oaMc8Q|bw$kt%0VHFF^&UqstJ z{;*=?aExK1;58a;uO#{#;RjnC-uAvR1(zB|GTMp$qg&J*9{qUv&aZCz*lX9X)9BU; zhDI}&O!oc3_s1i2A+DR#!3Ax7wMG7uy72&?r^f9ON35-X=G(V_2s8e1z2!a)D&$MV z!Pb&%kv?&`Bqxng0^u7XFz5ZIpyU-jE)kM7s?b7$iZyLcc)D>lGorz5$?txV!JwD! z_{WyHTm2{`-(iMnOW;xSrR^2xC=!W;SLeZ?lKp{kb&)Iq@pU;4Zab zR8ae@*c88mm`;mN$ay+cJpy{xhM64%JmcE_DRE9GE6W#rS zr#l%g6gq7<&M2w!iSfZ`&mK-0P{^EhGEGo3`L>?kjx77T@F||n8b5Hk;^srS8%Wua zWV#kTZXY2uXDTMZ$Je0mwPsA%uTP6x-ml4=iCZ}kay8JMk+rPuttpB#(`mz&_z^tT zG)eA!p5RmbhN@S*BMnQergl>sWYah#vxf#KtX<_0vy8c!C+1$;W$OlmNvrV7UCLz( z-?@Reb0ripRbo~;(HN;0v9#g&MVeg}PlUXK9yz`FB3rZdfivE5U7SXPUTQOoHe}s9 zBgjui7@LT!>=ATg;Hc+#p6@edq?=TVwi8T@L!UBl%!Kq`YnkvdH(9`OJXvUJI=QL; zkkyskLf3@O6h|j7E9X3Yhi>&&tQebt|MR^%w3x09 zIce1@BVN#F-zb4Wji{(%{|urD`c%bh%EXIbmRsIz$D3ltUGiSwWOaz8`;~xFjDKnP zddr~R{wPqTgEGrOE|0Yk$q@AS+Yu-D3^1EHl$c^(Q(X8_Uc4t@T+a@?Y1!@)kWV4? 
z#Z^VV%t$|3c8+|;J$^31>{GwlLqje_)X%x0v+Hn<4$G+TsXf~i`!IrLKYL0Y`bsVn zVcj(_$h3IeMIaa_>rPz7NWci2V_e5tpw4ic52{yEX6ixjf&#iKFF&3Ek*H z$4_RwRtMQr0z)b5u^!lYQ`Ypruf+O4D%mrYBUxe*T+_v)N~yqd;N#_#L*CnKW4@l& zn_`5(frm!FGUMjPP))Ei zVeH4dd%!AK`KA>TUK@}Zk9S5G(x7_Jx@8`l$n&H;=;=q;AnP+N9e44%d*O)uR9rdH zQ~wO-QawHiB&vAC_2XPH6H(fkc$Fb+9CJP~Q}9`0OYUbq&Fr+6<%73WWHO(AP^Fn| z<^TaDP+Kb53&?^c<1SaK?w<<6VIhYxegA`!GyNi#|CVJ%OiPpF-O1!Ax9>C7M=u%2 z2(q{n@XLpbQl`@`S8Em3+x3rmuFlc{bj02~qF`MfKBS zr^Qh#C0K{Qg>(++Ka7)ZEw(pt{`#&j`O4zabIfdK6`P5By=49Ii}l7P!`b_-X?I%m zwq?Zax$l>4uTNpbxN}5K48#pRx`srUAHV6xQ{hPm@&*Q>qjE01E&FDh^zKW^gV0ej zyef!mFR)~>rWhakznuy48qyoHS{VK&%H9_68X!J5Had}(e(H$Ok+QBLudo$oPscDOCtWw5S(O&$*}i6LJuu-y6mi4g-@jg?$Hg7201x zqW|M(^GFyR+vpd{*M(9^hZ3lTA^ik~fG?dbscSQJUWNlY#puzeLR+oqDM2pH3=5zP zt@_S*rXRjIdxpljlXGdM%i6gOb)YvyudStZG`yrXm(MUGI@S9obi6a2=*;|)x;NBO zsTIlnw}D+}?nK(@7>$XAYcL_U6Zi98=`gXX*)fFCTOo9|9Z+!f4A)kNf_oUErC(Kr z31$8yI1UUDdfgkLZ!m^ww^JW8WjY*_>w=?7={p;F$+cOpO3op@5=5r1 z3f8|7ufDDxuL46S+DjQ&e(T{I#~a+~dNgA?E4(WqPs9rd&oGgbd8h9lJ7O9^SviKmkGkq0D@^VAVUK32exbeex(rG0$~}{@KZS zWYl2L_IXh*xJ+flKNH3boiM;B>qDjhI&G)L?#->v58*LP-hK^^LZNs!$opq!lx!>;4Y3HriEPNA;2 zLBqfUJ?Q9d|U z^(Qp9X{3Oj5@jctUMUxLOxe&jVcaZl8goMz=Go|*pv)yS0gl{xjGV`cv1oRPG_`CA zIVs=WI@ne$Ogyf2wG0YDaf)z|ikbE-oC~SV?q}Og1jykZlF+0b>aw3B%IJu=abTZF NuE{E2&5$ +#include +#include #include #include #include @@ -14,12 +16,11 @@ #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 1 << 4; + const int SIZE = 1 << 16; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; // Scan tests - printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); @@ -31,7 +32,13 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); + //auto begin = std::chrono::high_resolution_clock::now(); + //for (int i = 0; i < 1000; i++){ + StreamCompaction::CPU::scan(SIZE, b, a); + //} + //auto end = std::chrono::high_resolution_clock::now(); + 
//std::cout << std::chrono::duration_cast(end - begin).count() << "ns" << std::endl; + printArray(SIZE, b, true); zeroArray(SIZE, c); @@ -91,7 +98,14 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, b); printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + + begin = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < 1000; i++){ + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + } + end = std::chrono::high_resolution_clock::now(); + std::cout << std::chrono::duration_cast(end - begin).count() << "ns" << std::endl; + expectedCount = count; printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); @@ -105,7 +119,15 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + + + begin = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < 1000; i++){ + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + } + end = std::chrono::high_resolution_clock::now(); + std::cout << std::chrono::duration_cast(end - begin).count() << "ns" << std::endl; + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 660244e..88f65f1 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,7 @@ #include #include "common.h" #include "efficient.h" +#include #define blockSize 128 @@ -51,6 +52,10 @@ namespace StreamCompaction { cudaMemcpy(dev_data, temp, sizeof(int)*newN, cudaMemcpyHostToDevice); checkCUDAErrorFn("Failed to copy dev_iData"); + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); // Perform scan for (int x = 1; x < newN; x *= 2) { kernUpSweep<<>>(newN, dev_data, 2 * x); @@ -62,6 +67,13 @@ namespace StreamCompaction { kernDownSweep<<>>(newN, dev_data, 2 * x); } 
+ cudaEventRecord(stop); + + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + std::cout << milliseconds << std::endl; + cudaMemcpy(temp, dev_data, sizeof(int)*newN, cudaMemcpyDeviceToHost); for (int x = 0; x < n; x++){ odata[x] = temp[x]; diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index d506c5d..9241882 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,7 @@ #include "common.h" #include "naive.h" #include +#include #define blockSize 128 @@ -25,6 +26,10 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); // Create GPU array pointers @@ -41,11 +46,18 @@ namespace StreamCompaction { cudaMemcpy(dev_oData, odata, sizeof(int)*n, cudaMemcpyHostToDevice); checkCUDAErrorFn("Failed to copy dev_oData"); + cudaEventRecord(start); // Perform scan for (int x = 1; x < n; x *= 2) { kernScanInnerLoop << > >(n, dev_oData, dev_iData, x); std::swap(dev_oData, dev_iData); } + cudaEventRecord(stop); + + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + std::cout << milliseconds << std::endl; // Swap back std::swap(dev_oData, dev_iData); diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..278291f 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -16,6 +16,24 @@ void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::host_vector host_odata(n); + thrust::device_vector dev_thrust_odata = host_odata; + + int *dev_idata; + cudaMalloc((void**)&dev_idata, sizeof(int)*n); + checkCUDAErrorFn("Failed to allocate dev_data"); + 
cudaMemcpy(dev_idata, idata, sizeof(int)*n, cudaMemcpyHostToDevice); + checkCUDAErrorFn("Failed to copy dev_data"); + + // Use a thrust pointer because the vector wasn't working for me + thrust::device_ptr dev_thrust_idata(dev_idata); + + // Scan and copy back + thrust::exclusive_scan(dev_thrust_idata, dev_thrust_idata + n, dev_thrust_odata.begin()); + thrust::copy(dev_thrust_odata.begin(), dev_thrust_odata.end(), odata); + + // Free mem + cudaFree(dev_idata); } } From ae5d38e22b777d3566fb806e5f0da5ac319e7244 Mon Sep 17 00:00:00 2001 From: xnieamo Date: Tue, 27 Sep 2016 19:08:21 -0400 Subject: [PATCH 4/6] comment out timers --- src/main.cpp | 20 ++++++++++---------- stream_compaction/efficient.cu | 29 ++++++++++++++++++++--------- stream_compaction/naive.cu | 18 +++++++++--------- 3 files changed, 39 insertions(+), 28 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index b23b3a0..04ce6bb 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -99,12 +99,12 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, b); printDesc("cpu compact without scan, power-of-two"); - begin = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < 1000; i++){ + //begin = std::chrono::high_resolution_clock::now(); + //for (int i = 0; i < 1000; i++){ count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - } - end = std::chrono::high_resolution_clock::now(); - std::cout << std::chrono::duration_cast(end - begin).count() << "ns" << std::endl; + //} + //end = std::chrono::high_resolution_clock::now(); + //std::cout << std::chrono::duration_cast(end - begin).count() << "ns" << std::endl; expectedCount = count; printArray(count, b, true); @@ -121,12 +121,12 @@ int main(int argc, char* argv[]) { printDesc("cpu compact with scan"); - begin = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < 1000; i++){ + //begin = std::chrono::high_resolution_clock::now(); + //for (int i = 0; i < 1000; i++){ count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - } - end 
= std::chrono::high_resolution_clock::now(); - std::cout << std::chrono::duration_cast(end - begin).count() << "ns" << std::endl; + //} + //end = std::chrono::high_resolution_clock::now(); + //std::cout << std::chrono::duration_cast(end - begin).count() << "ns" << std::endl; printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 88f65f1..94151f6 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -52,10 +52,10 @@ namespace StreamCompaction { cudaMemcpy(dev_data, temp, sizeof(int)*newN, cudaMemcpyHostToDevice); checkCUDAErrorFn("Failed to copy dev_iData"); - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEventRecord(start); + //cudaEvent_t start, stop; + //cudaEventCreate(&start); + //cudaEventCreate(&stop); + //cudaEventRecord(start); // Perform scan for (int x = 1; x < newN; x *= 2) { kernUpSweep<<>>(newN, dev_data, 2 * x); @@ -67,12 +67,12 @@ namespace StreamCompaction { kernDownSweep<<>>(newN, dev_data, 2 * x); } - cudaEventRecord(stop); + //cudaEventRecord(stop); - cudaEventSynchronize(stop); - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << milliseconds << std::endl; + //cudaEventSynchronize(stop); + //float milliseconds = 0; + //cudaEventElapsedTime(&milliseconds, start, stop); + //std::cout << milliseconds << std::endl; cudaMemcpy(temp, dev_data, sizeof(int)*newN, cudaMemcpyDeviceToHost); for (int x = 0; x < n; x++){ @@ -116,6 +116,10 @@ namespace StreamCompaction { cudaMemcpy(dev_idata, idata, sizeof(int)*n, cudaMemcpyHostToDevice); checkCUDAErrorFn("Failed to copy dev_iData"); + //cudaEvent_t start, stop; + //cudaEventCreate(&start); + //cudaEventCreate(&stop); + //cudaEventRecord(start); Common::kernMapToBoolean<<>>(n, dev_bools, dev_idata); @@ -138,6 +142,13 @@ namespace StreamCompaction { Common::kernScatter<<>>(n, dev_odata, dev_idata, 
dev_bools, dev_indices); + //cudaEventRecord(stop); + + //cudaEventSynchronize(stop); + //float milliseconds = 0; + //cudaEventElapsedTime(&milliseconds, start, stop); + //std::cout << milliseconds << std::endl; + // Bring odata back cudaMemcpy(odata, dev_odata, sizeof(int)*n, cudaMemcpyDeviceToHost); checkCUDAErrorFn("Failed to copy dev_oData"); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9241882..d84c608 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -26,9 +26,9 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); + //cudaEvent_t start, stop; + //cudaEventCreate(&start); + //cudaEventCreate(&stop); dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); @@ -46,18 +46,18 @@ namespace StreamCompaction { cudaMemcpy(dev_oData, odata, sizeof(int)*n, cudaMemcpyHostToDevice); checkCUDAErrorFn("Failed to copy dev_oData"); - cudaEventRecord(start); + //cudaEventRecord(start); // Perform scan for (int x = 1; x < n; x *= 2) { kernScanInnerLoop << > >(n, dev_oData, dev_iData, x); std::swap(dev_oData, dev_iData); } - cudaEventRecord(stop); + //cudaEventRecord(stop); - cudaEventSynchronize(stop); - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << milliseconds << std::endl; + //cudaEventSynchronize(stop); + //float milliseconds = 0; + //cudaEventElapsedTime(&milliseconds, start, stop); + //std::cout << milliseconds << std::endl; // Swap back std::swap(dev_oData, dev_iData); From ba190badb795aac9b123d34f620bf6ffc3ae84cc Mon Sep 17 00:00:00 2001 From: xnieamo Date: Tue, 27 Sep 2016 19:31:43 -0400 Subject: [PATCH 5/6] updated chart with thrust --- images/performanceChart.png | Bin 12535 -> 12784 bytes stream_compaction/efficient.cu | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/images/performanceChart.png b/images/performanceChart.png index 
332ce15ddec7f9d4f858bdafedb1f8a21c6a5af1..a0da332e01b861b2913831f0abf5eee9ff4b4efe 100644 GIT binary patch literal 12784 zcmb7q2RxN++&{;RL}myrWktg%J1LQoaqKN+?_nNw__{o9J?fWWIH$xGBOUw z9$CHDeUzv7^Zx(ucs_lekNZCNbzQ&fH^09TaYtQ=j)sGVgoK1n`Ie#<2?{p~ zc_wDp!`He?$06#np^+#lHn#lq*u_S@R;Jeb=if;k#UST+?Wx5SwR+HQ;l)$J}l3-U;I zOjQUo?w5A|q54m2jl;mDJl%q~c_e!i=j2?I-w&S~+#n}`v5x(44%&WapV5;tX-7t_ zy)^LtzDCMedblKa$eGKRyT$V5HXG>YdUCZ1vb#O=JiF6M1LDy^yLdC4Jz=n{m3Y&F zy6YCa9|V{5>bBthF6m2F4ejW)J|mM9>e?dR0z;flK5d7<{s%3?A5KJuiEPfQx2 z^IiFdWwR_rnX?IjeL0$^=kBLhG?)-;15{Wr?{}w6<0hB!g|f-}ZJg z=~ZlOY$oGHtIto5lj$xlhtjJ5ERpUE?4u#^-9sVpu#EzpPRpd3!z^ z^FvkLxa=opo~_)*n%EDvuh1GX_2wMSS?wkdS9|H$QTfC6)_3nJ9!{*ldA>vMPRacm zko26JMm^}|dKWi&gdnNoWj;L;`)*$kRK~S>HXVj4FCv3uIA_0gyTjM0C&Ne(3wM?3 zvWwF@JQhc<>5*>DXasRwOtJBgGWEju6=+iA+Y<@pV9K1rmt7xGvN-R-E^J!u-4xIe zQ#}_eGla+2$h=)vuu64Xx$29n8mP$@P?NxUXvbnY?Cp2-^IN%$kM?+%=npfIQ0qSy zbnfu}wYOX3t=i3=^X{fnT@&N2>WfOL?iwxObewLzYU2ntTh*vQyV{&vgqjW0jq2|o zA596IFm0WTSbyKQqn{dc+yoOqWB(%dq{N0#gdUZY~_F zclNSQQuZmD~mmE zH27A`esdHG!Y31C{MPj!9pTEi>qQ<-mT(*t_4_#)cOJObSRsQk1y|Zt)L#wEi)VVgSi-rs!V59i+$H-4YJh}=jubq{Jlva->9rknrB#M zx88MIWFY?XePg!o!K7rCzR4_ty1Kf&lGkQ?TemhAcq$>iCfN8; z6xrBLQ@*G7J2)ES?~v6Li%;W{Ae)|mXQkqsP->PBR&ER zB>}Dw{119$P%;wS8AF;v>Kh!4h=Rp_<9YM)rDf9-miHe%hyqa=jh2+zV^H0=>HEc2 z&Cv#7G%Te7Z5pya&+x0ix=viiqFs+Pt}2D#&ZWzj9apD2`zl?>>#cUT*4_h)F#sNy zY{PM+7N@1HeVQ@{05(SGwheUp4c`*3TJfs9cEJF2GpRnDzrCtmk?Al{DsKJtwFB8} zq+Z)wvD1L1B(T^yY(@KQPwsf3#+wTbl=R$lh4ja-bevUBm@edz_3z*PM_4}Bx6ETk z3%cP6^OhXVbcf+;9|Gz@AJ7=w8Xc#&J`RO=FD+plyo}U)0Xs%YOG3i@COGEB3yYbV znW{a%j}&}GzC`X)$eQQc7dwyCdIDe|O(BG1^F2wFEL*Xt+I66!L+0x?6U<3D#Fv~v z6y`$kbNeTUK;k8h3?^>$KLhk&`_arI6C5xfNLGbv^<7oe+W!x0@%qZSd)RZX|6<;q z)JB#f#M0Oeztv8q0>ubM-CQkRBa{a}bfHEyS>2Fjesg%C@c^rkzIl9nytwbGjr~xi z>u5>f?v_E3X-$66uf34%a_14J+}vD?cCpRbyrMy1bGV4~sej_f6$XRB8+UBd&2=E!eUp05^viPxdtt|zCHrB@ibGkHFYt9X4Ldmk~|d0{$;KMvH^gOb2f#J0JnJO`*n&~$UT3+%N0L3Wx+i==n{(!5PIU3;NsX{>; z$DztjlaXSJhrIwXA?OuLPFkM$evD^vwR88ENlQqi@NhVE*M^B2j4JsTc0!deJcB3!VIO=zaD 
zE?<8N7$Ptz2?ROCAJAQmid}mcplT{@T4C?116mejzjaPkF zbGF&l>Lopcw<0E?kuoAqirW8`^?kF;olc?v~ zC+8w%{^+*h2b=FsRA!Jv`Qv@|-3g%T_sb4GlDXS+w*hZNX+&vAwiyApY?w*bdBK7X zzTkObgL0Wa55Y~fXW=+-<<-6iwna zNmEkOHG$Oy93mca!n{NT*kSNZQU@<0S>PZT>+f^_-J_oXv4|A>BNzG~&W9-e|D2Bn z8cCk-Ut#f)l2g>?IY2e~xmua|Hl1m@C6+hvU73-PYZNxC{b!Ku&kY1QfLARqv1%R! z&T0Nr1e531sx7wEI#Mm+N+(bhOJGWn<1M8m{lkeTaUf->dM*~YA}u|Z9a(Q3FSW41FDybp}>6m_Irge8Uql=9qWTh#RIlH zfFAPL(*v0jC7G}PnoqM4xL{kQ=;I|#k!;v6FHRQ(Y_Espazc7TR2ALVNZkTE;0p5j z5AHf<4X}xBk*N%}%%07j_FzBnH_|?(Ec{xZXxN139`peN#`)`SZ!FyZi`dAqrbuiw zm(I)gnBB^}3IPH~6D`H{k(XweIGKo|OeiaKcg zd=FeTq{y_daA>co$I5@0&susJpldxJDa7pSDE7;rK@H)fY>_>tup9(pi63+6&tyxJ zWc-%Dhx}1PhG?W7P#!M~J-769RswZ4P*}*xc*y|J3ZFV)wZgI4WwiHF8oH1q2IpPk zghLH1PqyORMRUw54opJfSZ|&ly)j0=6yVOVfF#N}`vl!yB)kB?uYu^^uPs1_D_vT8 zc`Kq|$t-a@{RBo^ESjU04Av)B67(w&-;w%|v)?pL@sQUDP`|X?$y9i{*gBx`E&FkeZFe^C2@;Ja<*cfNuRx+Qw6r&S@|*t%4Jy=xa%vyTB_9#pg3J@xCPWR(4@Gu>Cscn<8lq-#h| z%8T#;T&^G36Yho2+r1S3pUZzAnwmaunK*F#;Cf*tF5eZPA{H=*$TEsQDyA!~nj}5& zgqhzG#4XQj_8{zv`>i{8(Qy6cy4ofR4@rtq-X$fq(7vPlYy^9>kw= z^9GpL84G~7arNumGQRzwC>KA#m7;0Rb(9!>pYWQ8N&0UVOCi)pY6IrKNiE!kyk`FC z-*IRfQ^aFW_3!sk;BKUD{RBmG`Ej*$`C}IGzXN)`Y6(Y64A42Z|{P%x)jv#^nHDOKTT~+cRqxUq&^Li$q@0cQ?&u*$hP$=r0Lq+ zz^h=2KMXqgvZqIf=uiWJx4o^b+yZ@5VDLWt?{8*%^Qa2VYW)X5MV)os$8=3)nXwqc zjF}{6ZdktQswzk3P2-gXMzv$g<@3Vqar{UGJ&ctY%Kx~{8%WbhG4l`_))7=L@SG5n z1!u~fM~07pG@y`2(&NkE2V16f9~yWn8&E@#ClTdf@$~rBuFubSoS}@xY3K^4q=!v( z-StWs1oeoQYyiQx0AL{9^qX{Eim5-EzVhEXdf;6lK`?;4mxv0gry2q3_--rg4EN1&A*8_Vcy9HZ9o zu#%YT5v2<)-TxA1Grco(qT@ep2Bc0@BsvH${-KC8lE{Mpx|~}7)?jeV0ufva1&2om z!;3LY`)TDqIe$?Q1^e?u=ur-kQ(?J()3kIn_-1r)lP{Es5LG+s_}>S@Sm`84BORaZ z-!bLIY4egKXj_O(=n1ML>^}jF2n?cJW?y-OA%4m1{|XfKv=ZRV!Bf3uJhgrcdj9CX zVM6Qi5nuuFiX2xVuKsxfW~>M;{cpb{Y6rPc?EW5*7>Ei1(E?Tj*&Lie{8H5IkMN21 zp)WeLPdD)-QVIcc-bT4Qzlr!Q*#33M-#{Df%O3g_N+deI9Q`e6h#|L*v~0GA6G{Kg zOpDOq8|nLL9+aI$0taOB4zdb}R3c%(WkDC*cP2ac|EtMU>jMX<`+z*3DO=>ntOC*3 zh1olNd45tH^dA0#Rfm3mC`8;6M%tRKQzIF;Q;7})m2-P+`8+(R{aIg7Odc(8Y!FNJ z&q72W;ujO^6*k@ZB^$pBgTVK- 
zi97VyQ^8W?r-y6MEawe-K-shL5h-Xq;af%OK0y633Q+CX9?{KGWoN4-IWaJ^Q!G-m z`cllcE0mrLxZ(D}GW+>*tke5uZa<+`2~;yrPy#)bd{<{&Gf0s70p=vzNR;yWCZJd7 z@cv%g7#Wc3Qg`ZgDm-Y(jFzk4G%3BSEKx>YNS7!Z5mIiBMx$6oOdUW|vrLI6peNB} z95jVOnpuU7Z6G>up&d}ZF%It0*(s;3<~9hzvQe2`4B8n}XyUojCV@^)no9EfMz^DS zwxM-O0_vFp60w=L<6hp2f9Wd2i@nr@!ts9f)za5`y!u*D?^~ksbQ%Hk*vu_vElKpMj!!8BptX<{?;xZD~Cik(nLVWa=~$?L|`%bf|Zpi}PaXv5a* z)+$-lM#mJ79jLQ;;tMKn?ocgYtZ7&Zwcbv*o{>Cp7i2I_cRVr*#pHywP0P`Wh8m_} zDzbZ{egYoE{kt{5?(8l=Xa_HB($8=H4z_F8&1Jhm=N)I zL7o&c$#KCvxMs1D_E4Fs&j5kZLdKT!oooOK=i1C=AYebct{pa z?==_SNH`~Px_=L_((V-iRw&VZNs+_jUH9f8=PDVFprft*FY~UAtgks5{u-(Y+Is6Q z)B{pB<7tp#6)nW^OT7@tj6kWT6=v$01&EEwPE}ZcZS{=1g;)MXD=KfIBv)bA`!$@N zhud3qDxkO2r!`F^78Karlr-=w)P>%_mOc#~Tb=C{Ts-dq0xl?&5d^N&2n90Ob~vh! zpVNuZwjyM^{bDoo|GsyRE5>JgzQPEUeN9+g+mmDn&U6^-ZczQ@yl5Hf9LIm|Dqx01 zd?UhDcCnrexTXxD_HK7(%bjvn9~_iaZO^@LA7Z<4z6ca=o19QHSwa??-*4mF(@XA~ zPD6U3Tj{DiAV%q2vlh!DVmcIdEZ|k!KmV-umO?yvx5O04aX%CJ{d~oc<*zMF-5fxq z!lItDJuP9zK3g-{P`OYqwB=l~e;3HUr_k0kO35`GlOTmgUG0?y#O+qA4j^2kdF1w< ztYu>kR^~b~BrOJ_W1DE|-do5^!jZJ>Tk9hm-^fnkPmhS$4kG;2Hbnvx}8S0`!_1 z@Rlo=;XWVKYSsts*OPzkG>gp+kIZkV*q#CO*!lG?%PxO8U5HjSvqS0?If0h%eKcAH zlqTly>GR0=e&{VS8=*UN$42LvPKsfLQ=(rIiu7Tg?v`9?p+W%~R9epTYUY9}p1Rvx z>I-OB<%o`BpDs;{1KpiAD$^5VK(yO%O{b7OLq=~M7WJ$B{FOhWdS|({c0?Fnphdk0 zy0Y<^c0eu4Wy1L7^i)nq^fcuLD8E_;CF7>u!xi(TxXHwIJV?<|znqc!_+PPoVQkEB zv49`(_l*%t)1B!AFP##<>C{IyqQImG6QW1WGhCGe9S$!`PV4T72P%ZxP^L z|1B06wx)cZ1H&9%A&NNl)K!13$&41dkLQcbs(k|EXpM*7hLZP|Lb~jg3myB;#x4ZOAm%I^N9$wJ(A{!b4$v%GQ$SE2vnp4O( zIVUmWNb$|G+ZR6gX)~5ZUXHIo-{H8Dq{8D!SkSiBGnpSKg&XdIAUip4QLX{^dT@OSUsOjx!D<9T`eGzlZ_-tv zGR4Fmn@wc-Y3~d-8FZARU6{FRp5#se(_LJ%tpN%sd#mrFa*J#~8ejyA?AA&UDN7Tk zguG?4GcpvV%bu7$c+7Dj?5Okc>pZ4@l>2q(g>MoS z0Bw^asN=p%L4Oj+p;DnVT3O{s!|<3|SSQ+RL1+Un%7y%9yX(_4kIho!^jd1I!irjq zzjLmQw-$gc_{X%%Y$W8Z+AO-bAe87T1q_lv=-dk2y@9Ra4RgQSXmkOY z>$!o?XIMh7RT$FSbySmfG%)+JaKqU(Co?=|x`Y$Ry=*7g;Bw4-0C!E^tL+=f`WjuK z+JN&lJW`-kG@ElVfJQJvypQ)SL2b#3?S7iG7eEg`wPbFUAL4uE+EF0D^D&&xomA80 
zJ`E>Ly`-bk6}uiC-cHFi*b|-{0p#p!r4>pa_YB*lRM5e_=TrOO*5XyHXjOW0Gb$86 z_Ou+ZyQ?W)2+IW+TnndvX$@0RF27bmbAn?$EnNLv#Ch$kr}mpYZ*p%t_T?KKDlzp) zQG{AKGX5q?nA~mtH^{t*t|-pj$je9360fY$WI(Mkv)9s|DbvIVJ#>Ow_xM}VgmELf zr@!1XsB}r?u_Oyae!jIF%Zj<7inC*?9|&!C2MNI7 zm&<%lv|KovaxR_{hg?o|m!K4R#6bXM9W)jxIV){Di)wRzp-x$dNJFYu8hjDUQnK>W z3FslEq0t4M{FF((Ew?m+JR;sU-hFm#)o89g^XCgi3vY?%r`z)XsPnBIrM}=!u{R*V232cq?rYMJA@J zNfj{7SFbodrrI}ugoIRZWFpJ=L?F+r4Bo6u|6Nu_a6X+=h z#BQHDuRT3;MRYP0=^iM>rKEn2Oph)8Y$2U2jan~HSSgr7+Ojh^1t0h5F|(0)k0RY2 zZi>&xa)G8?tvGAT9hr{9E0Qsf&swa0M)NTs6?*G}euWzFNWN4xDQOtjJ9YjQPxIm@ z-%E>7jX!->=TK(@EArCpG`vo1@CDM9UV1TD?X4((Yk;J=qSCLQz#Q1ir2upvQGPvp z%@cPl%}M?IBb4dh_Q)@kT?l0Xp0h$-y?~#CF^rN?(AYa$p~W_SM?Cyo->Z;FffvY7 z8x%v4z>Eagn&wREeTk0wH9Y1$4)j%66eI4)P!m3PRM>O1Otjg_fPOhfEL~Y$tl*Pi zQMDy#M7cb(l_%I2)%?J>%Xyf8pk%ava#%Uk`U$3`e)sI4T|!ck{Mwh(&nkm+c2!=j zgsVHXdQt@k!-F?#d>MwKQp2H&03pqJXtrE)x=a63+tk|PM|yOG79t}SMZa4rE!ia? zY8=XWY+g+v*Lrc~67P6R{lg}$sBT|P2GFK?vdvELDiU*)Kvq-e1vB}e$$%9kP_V1)gDj%vN#aN>zkvp& zaEmV37zHMEjJ?bj<<}L52%uWPaBjA|Eswm3;DEA_p5dxmY-dNGRdL0a&%rDduRG!t4x}kzqRv-9+SLHBRAfJv zn17B;+H!TZ>G!0)b7wncGSHo5X=to@!si+D!JFj zp?7}qNGp9IcaHr{lS`5lCg0Cg(o8jnbSB-EV)wSW^ZIViX9{P(0Y|Sb7?UyS`U)i7 zuw#uqV6u(u;bR$)wYGO5&c2<~S#k9_m#mwkqNaw(URu(Y2uE4^ZV(*7B$2z4sBa9I z4Gke+Mv5p!G?6aQm2X|U_#)5sdwI<*Y$DqZlLZ!%EOWImyO@jQBLkO|uA`m}Wc_fs z_xiwn+#0Be8~FO7IQa-|L6eV)k4Xled-LFy%bL8$be%#3k|K~tH9_?p*)wKl-cF?J z3*xRxQy7{Hkc+xya>;HHjLA$!4{AMzvU2=hw}f*MUe!XS0^Zd1eeCgOWpK|nSl=lQYIFmt=xg@kjRW!rq zQHfTU-0jiUsQT&o04*XKeiwqNCb8cVrO?PndRIGAED#qd+P%4qWfo&?L4mi{Oao%U z5cv(|*LKEE6~6`koND$ZxTxA_IVG2%J{UYj5Z?Kud6?OpsEu~%nepwZ%zZ&i@TJj{ zxy4LePuZlH^Z5pPo_CBW7U@OQT?Kk~jOHECghNDGrgijl)4d!nw)n{MtY+n#lvA~8 znl3%#c^)YCH#N7*dw2e@}ungL1+dm#RS1)h3)YGcm5qgZ} z20L4(2*=-@Y3!JYmKbt*$jOx6y%n7D1=@yU;zd?`f2lPs4|)GOutQL`fZFt+n++yP z{yZ-Js$N-*;*t_?-F-V(20CJ{0V0VXjIuE=M%la9Oy4HcK#N`~Jfx%>I@Cl(&z&&2 z2gX>9;)G2K2e)H|O(a|WWL7(_=N|?RlqA9#jI;pjMlL%+wZzrZLpM z{`~%k$1qtoz~Sp_enJT}-Ir*nbh)kqOq(XIXo#N7H8m>ItoC+o{qn_SMSZ4R;NI{J 
zW`bzM%!ncRVq>ZY-HRQHjL%{43SB)3)>asz}1{(Ao1U(kA_TB~lN~jYYCJx=O zGRxy@CO{Nb>}{9=qp5F<4EX>wf!dCMQH^gItx&T6as+-%+eWLuE40Gb_?&v+Le0h; z=;Mfkj^0KkpG}Ma{41CoW4%(N4JLVT`1yhN=UlpMlyBd5PgR=|8v-+-AjKD6e4}8p zFkFN4kpOiW3yfJ%jTBEB?41+dt7c&GcC64$QUz%I2o*uY&mBqM*%2T&>obzmOvxrvrngjLD3m1DX{9e#$uRYRouOg8ck z8RcQ>p`zJ1y`Nf-S4^*CSaQ}0IJ^_y|1qdfm4gvC3a31ptPK{#WpVqyfBFhdT4^Xj zmC3~snh)9A+o;2Wr1xm02F?+w&;n-ZjID1me*mc?w|5e1qFNwEK&K(QqthJW>=1*3 zJ#0qUUZ_D@cGdWKVPyCP^$I=4>!m@h`va)SSK@(tf;zlZ+}fBFq+UZ%B#K5x(hN+| zTsE;5_UMp>)XW$@`<0fPYSN^-yZI@ngfK(24nh*N3`e~Xzd&br2$9&{p2zKz`jysf zcYC8ES38@MoAdI$r<@pGNy|mhRwAI(O$j)WcG>NGXuh1VwfOxb*#htBn@_(m4g&DP zo)bzzbq!wY!3n8q;UPp59He((visfS9@HQ#(dh!SG?i=nTHjPK;6$Wt=_pJ(7EE328|RplQ7rD2>tkfoe`+rCH1gXX##?*9h0K zjf{{9j)X!&Jt(maMLk;q8|sRM3g!>~16YB9 AssI20 literal 12535 zcmb7q1z1#T)Gm#Jh(Vk)lr54Tr4dk&Pys0gX;CDG?k)ubk%J;CNS8==4~QZt5(7AN zcMZ+Z+_eYvoImcp|NlJdNtj0c`6S4`Mq*t@Ed>HmqOt2l0esp=fV#gIPI{W_H`H8=VBiFnd z-hIA*L9u92f5CmUccYiUMNUF;`Dw|sLio6k4<%ewl97-cGbH_AULMMV1^#*We}B0T zl(73<;=BLzOBKV54}N4((V^P;z2<2*aD^CS@72#7Y4XABD{EuXW78y>das98ymtuR zE1x;Gn?7o!81a|QNw<%UcD`F!^Rng9D{1Y_GrL!@R&PCB+P9UlHNx+m-4H3>RpL11 z(Glh6Hd{_ebDHT~DWCOh%#XJn`s!c>q7Gayn#FAJ(V}{lWZmk+1&gU(HtE>eEilrb zdPq&*u-r%3n)U9xou)OAy29`^dS`Qtb!Quv!G0q3@$qj?I!wP? 
ze0uoXs{)$=>5Z1??O}{&p;eEr%$K#rG0WNBa;LFK=aL$3jSn`m$O&xwFf#F4_C}_X@A{dwE4$VWZzh<%LXuQu`-WYgV9&zf*ZpHxf@p$ z-=2?E%s;!jVL#T8|NS+0mosLOO_;IR@RMI%RBFuyk^v-z&(?pLuhW9=<2zppvG z5Co%$O5SkeSrtXy_1M`W@VoO$WJpB_n7u?-s<|_~m98M98fZQe7ELIg;PK0*e!tOz@ z3@g+Y_TK#bJf&B?r02TUg9k}Ukz#^#?Usc-qwMZm%WXYvG}TcHBXy-U4&4wDW4Hx> z$+N2x*!_$(FW7Ra*@(+IsB zSCN?#9br}SZL~2udZ@VfiM)PhqM%H%>&8rBkNx1_V0T;iU~`+R(uNBucaemvQsnCg z?q$)3FKHJU*D(>^%JoXiGs;+3Y|U8nyQeQLEqTcF&U){-6WUFpJ(*lAT9TCT*?n7h zXG$-nphvFpfxY)iooAhb&945{-H2NkX5)`kh&@-@M5x^m|D^62jym8OMTj!hz93BL z+6Sr6Uu_=T6U!J|A0f=lWyts_X>mz6NdM8ov%I7LyZDOatoSB})w#REohuNZ0*&Xj zGIT`=C0&Kq{2G;9K`Sm&HyY`-+_g1__Y3QdDxD6%gb90wYq~FW7H{cNe)q$S`qMJ= zx%0RroL@3QryfXOsRo$-BP_^H1rr{l^VTwzbuNO`$Y;=QwNC7?DnNVC5E zSQ4L}z9{#j>Rjyx6$u4TD2=raIWLvh5>{UCaO=K+q#6OwgjqB4<~WKYmeVO?kl3)j zo&>#($<)kljpoUpsWV?$tR4DV9>lobDX_R>Sfb!oWr-4ZO4G4USQ$4t z_{aGeEz;B?MB0Xiu44OSHZh0VaA9k&4it|y*U}4dd~NpVM9EYJx5N3v{O`?sZjGbT zytdXECj@C1-Zt!OZPUJ=ac^#^<)(qq!mXz0lwA5{$3Bc_k9(Y>Yi~B^k6rOm)}fr; zN`X%v%1hE5Rmj6BsCyKb7c@{EV6WdVAAe52spXx`K+>4B#6s=K;%&w9+iGgfjAp@v zOV;1{R;TjOZB$W(b|c@jOn#VcJc@gCT*L*3)v@O7#ZceKUQ)0+f&rlVBLIL{ zf+={QER!+8KmYGbQd)TApNIc-d7DDqcJPtoWb3)cG4S9!6<%9uu8X5WhLt2;rOy1; ztjK$ha;^I$nco{387+9|2eV5|#|KO1Tr_FWx_) z=^S`^^sF}IN%B6H3-_PUv5GVW9Apc>cjyZGIb#{kqBaE&gQ%fd1eK$lBWQW^P?le$ zq-Q7KlR>U222-6 z>+zTerrP=DuZN2N_0R#WiHV7@w{Nxjy{+|0Q?oK<5kPpDIZGf?*k#U89BN87`E6DG zKm;tF;4(Br`|Fpx70XWeALcV;T+T<_O=EgB4?;JpJO;Hu~p5LHXBTbKTL^eLvHleR8fmh()-hu4lZ_uqK$J5a8VC zdN_yortMYTGmnX%Je2i}Aqyp7YM7Fe5*8V0dGxGu9zZcFjPU*1Lo6aTF?4KV0(0$9 z`qE_snOvzp!2gaCpop!nL;si4Xyn-gha zEBmx!f{$hmSNf9Ua*S^{dxJ*`nYVH}dw+RyIPB$1F!%lYMN;;(@W%S^tDB%EK+>M? 
zeNO+j*A+BkFE4HF@O<;1Ts8KeDLJi|xH3uP5XBq^ov{m&a!W|`965TFc5{}FIi?KD zP1NpXj^K5l#FS5;9#VI8v)N+37$6Iv|8s7gYWav2FG?0Vk`=!-MAIJVmQgla7VyTp6rvQtfd1$GW z(U*NUu3fVwTtp&ofb912TS^|7Cyt*{06=3;HJWziRY?%%iZ@p&Si-zZE48?U@^~}l(25)N3HC* zND=31Kna6l^@#d^sli&nOJzZQT^eEu*v5F zP{+SG>bsj1q(xBW-+%mYAmk^@dw6)XkL6o-;rzL=XVAqG^n2^YMI~*5>-6KJ#hn|B z?%n$=c~kRX8({=|z|Q;ELx<#sz6OyK`1MNe2cZOMHp=dP4(NcgV}hiO!zbm2*UL7E z#iNkhi`!y;Kw?OagWlEiP}bjt;IiW0+a87#+=U@2GLH}5+1cLeQNTr5W6Yo@9I7Xe z#*-q^un$@b$*_=+0X&DO-4{KEi&A2XS?bBkWv;(AuB@i^65#hwUc=}M#?ONOym)4M z(QRq`E|i_TS7&c(U%!4m!-RzlvK<8A?N;bcQ=xz82^TOMY~XAi4?TP4%o!W+o)Sl= zF3ZB=b6-^B*Zdsnjv)?I>A&3ji0HSjfhL%P*Tdj@rKWKY0n!|&jZ-B0*w{`)b9cfO}D0~LrW?I zp_{E7DR!qWjMozUf%VFsHn`5E-cGkWj-G5yVU_*$+jB}3Eazy@@S3Z+QrwQ=7Ko1ZQaWY2(rUqgoFsRXi%QlX{LB zGRn7Q=vhKPNBqtU!5J#j)P(VphYzXfjC82r(|bRewyhwwR>UmW?ph?I&}C;++77`Z{tpG43Lx$MS`8{D}&PBVRp{%cARLivv#J<{c0L->xuiZ@mF z3HTn%&jEBxioe!7{vI1IUm8*fMG2uAPMCtW}dbmM?(0xy*&}xK^d{6aE3-RYFpc5_H7;A{Eb1c`25VEuYK!Vo} z+}a>G>!?AD*`%2XL&idwF5eC98^E^2XEnAA@ncrwbSg8?RY7)SoES?-(Dd(wsMUi1 zyN#MORRAha)B5t%7*l~JRB8!?(K0A2wzai&+TL8N+!rR7S7nAvd#95;B60KN9g=AL zC!*CoEfILukkvA-Tx>VeeNXJ64LonWT|d*39Bd5uXhZ%5Yc`-2f6plPHiB$K>XEEp zyI#)!s9T%^!T7z0K!$^L>=Y|r-@10~n$>5GNH!7U?+h##A1JnuO&s_f8phFC8xJ+RN-mli&wL2f z!a4V~g%SLZOP1a$$;xam*ng?x7K2z9pM*r@$=@OMIuv}eBuJPr-SHVo3ws`4_59C^ zY0~~Qw}Av?&I19BX_Y0U_#aDV@>w&(>Rzl=$uCX^1hAsh(0{ zbk=XZRh0|`uyjWwc^U{Z1iXqO8D=+s*t&F`f6gk}47$AsA)CK4)1_l%6vwMu^tUDI zUrT`Vy~q^w%YRUhLV{;`wy$pzmtKb@|0N=gOxv-45{4g;n>AR{UpKFd8S0g}gq)=K zZJPU)AHmkpPq(8p%O3#wGiFNqO9Rz(T4$BtXSKF!IG?!->8qz|Wn;FY??HWdbp!sl z%1KRkbaz@&EFBLQT-2cr89P@?H(ElFX2|7{3Gs94gv(uWjy)r*8NfXJ?koRYu50Wo8CL8|8E z=B|Ufd;jUvr+1^pTg5R;Qu!w4GY-lLZ)OW28pyz0akl$-i0W1v&fIHjlG$#^M zr*5=6kRRx-f^UfNy@zWLPXXk? 
zc*~=+0N@}3A&nuRa1iL!>C+uh-g-Jeo>Nb1fBVXs>xcoV1oJT-~`@8#cj02Zvy4sYE0}0lSSseuS1ssSL z@elCAeOe40>msCq$Q~|No`AlQ=vXn}Lzkg%hc;5hWA~XokA6mvm>pabI>X11+la_+ z*C&6G0FMV{z^{&GK|@^e$V|BUG29@won%blDW9Qq{9a&C2JY-Vc@CTb(C2$Zl-1LU zrAVZ7Dj{x;I2+>8ASf*b4}v)V|4+i+;A1lLv4C_D8GQ|^@*;kJ9HNT2W?lqwQQ1GfVyBMo3#Bz%R*m8d~a{_ z)sKB-|9NsyhPb2MC`P!H6NX5nQSgA`!Ro58!`)bpS#7d0Qfr#V_7!mnDQP)J+j|ll&F^~(9&HeXJ{s$EvH(=(ZP&3EtK;kO4qKjxK`#AtZ&?Rw*$@Qm5od8T)n`E+dvjgjIe z(1cL%bSjmaU7SGxZLA1N*ow8td!qR?y7hx9)~i_8TLPgWZ5P*ozY6{PcMo~vq&6~l z^*0*-PoQT$i3FKv+3v+Luef+ulHxlfATb7%fZFr%A8h}R?H~qdID}H$BX6OBL!PR2 z*j%+`0V*EhRKvo;zOn$fM@m~gKk~*v8*r^&rQ6`svuRMW_-tST4E&}c^b0GE-rFsZ z7KHm$jOQrA>~$mv$RF6dzy$Lv zAqgb~g)E?PifdM;JCK6y=_6?#tZyS;-v)33vp&g9C`Vv%(}<}3==1XVuPpc$BZ0NPooy57H?qK% z8w-7zLryJ9!X0^LEpbWpShutZVzN<@bC|-_hwCg01&z}^ShisR*E?NM7Uz5~4e$hp>B1vJZA7-j}kpQrDh#m~~eCJrNSspyw{lP*Y zPshRUJG25b)yxJz>^q-J;&<-RkuiOr<2Ue6A*ViA6mm|lO8)Lgqv~hKP`Y$1Q@6O; z3Ew@j;lvNU7VzHFA@c0m;0Sd;e*9QmE9~{N&EDJ8e$5VClSkk?D|`y_D33_nphMv0Hu&txlkrVcq8SXHG&|$*!QD^dyKQUOtY&jmbP6nN z2V8;7z*yVs?|m=k%ERfOng;cY&Lp0fF1!%jr#1J9zYo}FD#@Sq?=IE=``Gg&*zbwK zaAEDOsf(>%yv=RNeb99l(^|U=7$r+j`zuPV4hHh;MmRW)!C(aGkL=?^fYrCR+%Ft- zgoDHS=o!U1%Lv*N6{;Nyrp{LCF)*{uhhjYDk`a~cBh5h#vlY?-9LvZ`oijMo?P5D) zW|gIs^AHivGxgRXQvd~LQj(+Wt|+QUt)KFk$!{B7t)5C2bpa?Vgj zrm$t_RiYdj-5|$u9ijiz!vg17{l3iw-tQ+g{kdjeN+FYS@UiuURr9NICvAX0_G zX92wAI6fL0mF?wB?~a6PugLn^&P|{w=aL#XmE%PwD>BPh!nekxc5JKL(K@G|e;+-Q zc}``kugQI3?lXPq#CgwJ^X{VTNO9*Ek_QiEsT^`m7yWKou9^U!o;;zs)tsSUUecAQ zj}|E-)CRi|@NBE6_`@$?vs=H>vIpr;dN%WIYTGB1EHvNXm%X~vX8O8nsbXhqw(=e- zzggk+dT~`~pS-dCt3HMg+t%n;{Q-0=B~qWaq+)}~VVZIxQ&8SDU3g)Nzi*JojxH(X z>a5`Ak%VP4-8bjd-YVR@=^Q-UnO~Q$PB{vFko{d5b-CCjj~SD&_wV0Nxerp$R_tK5 z2o*c^heI9meIK*Can>w`3t5_T-%hh3Yzc5hM8qlkuFrfvfh6(JKVffNwQeUXIPoVm zrCLf}0m+~Z-D)?(TU?y!^Zeu7;l9N%6l1DYrG89-MW6)Xt*{+wBVIW){5lvlmwQ{gso4( ze!BfH_tAGI7LcIvo+|EwZjOuf!ejeX#Bh`3sF9{`sC7&)b46g-xh7LQpn=!}oZrJw zR%8l4TanEOXUBff8($Y$CS}xMJdMqJ*U572%|??#s$8l=0J_v^CJka)R60O9R64*k 
zL55PR5qpizE9%IBNUETvy#)toxSoQLI@Sza;mpO8?R|E2yrS$a2RBhaRR*x*{!D`C zCFQ$U<0V2apA<$VVng4UDur~GLDpvF;PP|kj25*|qGk7zlsVrmV485#R}PH;A5De4 z3{jfGCP8hcV)Ijcp)Z{MX&CKvo6iag3N~S&u$4}zVKEBgPSa1`?vSI3_aDS%R&19< zP%R~eIFv1We~BkcJ7;4lGkBKr9aT{1ITfT=C^Cw3tyk5j)r-Fyf`%~uY*macu+(8@ z{k;4dCZw16pAIJf9ji)9NoQRjljOQHAM9NuC3{jG6&o&yzMYX7-}7TjfLFK4tV!V= zS0WEL=w&wogBe7*nhQ<{r(lWkzNi6IgoxueJ9ZSR%)@i^RaKWP?{D#!En z1=|!^LQ*xu8HPA<$4Q2jH7)^Ek4nYd3iSeQ5XRi$dwh^p`Br<0(~SAzSYzY7hyQbg zi0?5dwoB})sh_=Wgle@`kHNOqkdlSbOJZ;2S*B?=`DcXW)ZQA9o>JEEh;kX+kI)a? za$M)b`P|_!#mUk3$w$u-2IlQLUDCxd0ZRtO_@WR2HrAk)y_IglN&;4C%)GREg4!vb z-xyF5;bPc#uW}qF)C#{UdrG^ru0^hA$)(t_47d<_oO^!mD??9BtZ`L_2>2bggk^Au zK(LH^oaMc8Q|bw$kt%0VHFF^&UqstJ z{;*=?aExK1;58a;uO#{#;RjnC-uAvR1(zB|GTMp$qg&J*9{qUv&aZCz*lX9X)9BU; zhDI}&O!oc3_s1i2A+DR#!3Ax7wMG7uy72&?r^f9ON35-X=G(V_2s8e1z2!a)D&$MV z!Pb&%kv?&`Bqxng0^u7XFz5ZIpyU-jE)kM7s?b7$iZyLcc)D>lGorz5$?txV!JwD! z_{WyHTm2{`-(iMnOW;xSrR^2xC=!W;SLeZ?lKp{kb&)Iq@pU;4Zab zR8ae@*c88mm`;mN$ay+cJpy{xhM64%JmcE_DRE9GE6W#rS zr#l%g6gq7<&M2w!iSfZ`&mK-0P{^EhGEGo3`L>?kjx77T@F||n8b5Hk;^srS8%Wua zWV#kTZXY2uXDTMZ$Je0mwPsA%uTP6x-ml4=iCZ}kay8JMk+rPuttpB#(`mz&_z^tT zG)eA!p5RmbhN@S*BMnQergl>sWYah#vxf#KtX<_0vy8c!C+1$;W$OlmNvrV7UCLz( z-?@Reb0ripRbo~;(HN;0v9#g&MVeg}PlUXK9yz`FB3rZdfivE5U7SXPUTQOoHe}s9 zBgjui7@LT!>=ATg;Hc+#p6@edq?=TVwi8T@L!UBl%!Kq`YnkvdH(9`OJXvUJI=QL; zkkyskLf3@O6h|j7E9X3Yhi>&&tQebt|MR^%w3x09 zIce1@BVN#F-zb4Wji{(%{|urD`c%bh%EXIbmRsIz$D3ltUGiSwWOaz8`;~xFjDKnP zddr~R{wPqTgEGrOE|0Yk$q@AS+Yu-D3^1EHl$c^(Q(X8_Uc4t@T+a@?Y1!@)kWV4? 
z#Z^VV%t$|3c8+|;J$^31>{GwlLqje_)X%x0v+Hn<4$G+TsXf~i`!IrLKYL0Y`bsVn zVcj(_$h3IeMIaa_>rPz7NWci2V_e5tpw4ic52{yEX6ixjf&#iKFF&3Ek*H z$4_RwRtMQr0z)b5u^!lYQ`Ypruf+O4D%mrYBUxe*T+_v)N~yqd;N#_#L*CnKW4@l& zn_`5(frm!FGUMjPP))Ei zVeH4dd%!AK`KA>TUK@}Zk9S5G(x7_Jx@8`l$n&H;=;=q;AnP+N9e44%d*O)uR9rdH zQ~wO-QawHiB&vAC_2XPH6H(fkc$Fb+9CJP~Q}9`0OYUbq&Fr+6<%73WWHO(AP^Fn| z<^TaDP+Kb53&?^c<1SaK?w<<6VIhYxegA`!GyNi#|CVJ%OiPpF-O1!Ax9>C7M=u%2 z2(q{n@XLpbQl`@`S8Em3+x3rmuFlc{bj02~qF`MfKBS zr^Qh#C0K{Qg>(++Ka7)ZEw(pt{`#&j`O4zabIfdK6`P5By=49Ii}l7P!`b_-X?I%m zwq?Zax$l>4uTNpbxN}5K48#pRx`srUAHV6xQ{hPm@&*Q>qjE01E&FDh^zKW^gV0ej zyef!mFR)~>rWhakznuy48qyoHS{VK&%H9_68X!J5Had}(e(H$Ok+QBLudo$oPscDOCtWw5S(O&$*}i6LJuu-y6mi4g-@jg?$Hg7201x zqW|M(^GFyR+vpd{*M(9^hZ3lTA^ik~fG?dbscSQJUWNlY#puzeLR+oqDM2pH3=5zP zt@_S*rXRjIdxpljlXGdM%i6gOb)YvyudStZG`yrXm(MUGI@S9obi6a2=*;|)x;NBO zsTIlnw}D+}?nK(@7>$XAYcL_U6Zi98=`gXX*)fFCTOo9|9Z+!f4A)kNf_oUErC(Kr z31$8yI1UUDdfgkLZ!m^ww^JW8WjY*_>w=?7={p;F$+cOpO3op@5=5r1 z3f8|7ufDDxuL46S+DjQ&e(T{I#~a+~dNgA?E4(WqPs9rd&oGgbd8h9lJ7O9^SviKmkGkq0D@^VAVUK32exbeex(rG0$~}{@KZS zWYl2L_IXh*xJ+flKNH3boiM;B>qDjhI&G)L?#->v58*LP-hK^^LZNs!$opq!lx!>;4Y3HriEPNA;2 zLBqfUJ?Q9d|U z^(Qp9X{3Oj5@jctUMUxLOxe&jVcaZl8goMz=Go|*pv)yS0gl{xjGV`cv1oRPG_`CA zIVs=WI@ne$Ogyf2wG0YDaf)z|ikbE-oC~SV?q}Og1jykZlF+0b>aw3B%IJu=abTZF NuE{E2&5$= n) return; - if ((index / float(d) - index / d) > 0) return; + if ((index % d) > 0) return; data[index + d - 1] += data[index + d / 2 - 1]; } @@ -21,7 +21,7 @@ namespace StreamCompaction { int index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index >= n) return; - if ((index / float(d) - index / d) > 0) return; + if ((index % d) > 0) return; int t = data[index + d / 2 - 1]; data[index + d / 2 - 1] = data[index + d - 1]; data[index + d - 1] += t; From a75a0efaef7f478b0b90f0e17eb81c8309e7cb67 Mon Sep 17 00:00:00 2001 From: Xiaomao Ding Date: Tue, 27 Sep 2016 19:56:34 -0400 Subject: [PATCH 6/6] Update README.md --- README.md | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file 
changed, 73 insertions(+) diff --git a/README.md b/README.md index d59f0b2..7318caf 100644 --- a/README.md +++ b/README.md @@ -26,3 +26,76 @@ Each GPU algorithm was tested using an array of 2^14 integers. The optimal block | 256 | 0.095 | 0.473 |0.423 | | 512 | 0.102 | 0.471 |0.454 | | 1024 | 0.109 | 0.495 |0.487 | + +![Plot of block size](https://github.com/xnieamo/Project2-Stream-Compaction/blob/master/images/blockSizePlot.png) + +### Performance comparisons +This section describes the performance of the various implementations of scan and stream compaction in this repository. For some reason, I get a stack overflow error when trying to run the algorithms with more than 2^16 array entries, so that is the maximum array size presented here. + +| Array Size | CPU scan | Naive GPU scan | Efficient GPU scan | CPU Compact w/o scan | CPU compact w/ scan | Efficient GPU compact | Thrust | +|:------|-----------:|----------------:|---------------------:|----------------------:|---------------------:|-----------------------:|--------:| +| 2^12 | 0.015623 | 0.051032 | 0.298 | 0.0090072 | 0.0312529 | 0.263 | 0.352 | +| 2^14 | 0.062499 | 0.0928 | 0.422 | 0.0468755 | 0.1716863 | 0.425 | 0.502 | +| 2^16 | 0.2343767 | 0.342 | 1.15 | 0.250018 | 0.6718685 | 1.127 | 1.325 | + +![Plot of various runtimes](https://github.com/xnieamo/Project2-Stream-Compaction/blob/master/images/performanceChart.png) + +Because we are implementing the work-efficient algorithm described in GPU Gems without any optimizations, it actually runs SLOWER! When looking at the NVIDIA NSight runtime analysis, it appears that the thrust implementation is using asynchronous memory transfer, which seems to allow the CPU to call functions while a kernel is running. Surprisingly, the thrust implementation is still slower than the efficient GPU implementation (runtime was taken from NSight analysis, discounting initial and final memcpy operations).
+ +In the case of the work-efficient algorithm, one of the issues that affects runtime is the fact that many threads idle as the upsweep and downsweep progress. Aside from that, a main bottleneck in my implementation is memory transfer from host to device. In the stream compaction algorithm, there is a need to set the last index to 0. Instead of doing this via a kernel, I transfer back to host. This results in an expensive memory transfer and adds roughly 0.100 ms to the runtime. Another bottleneck that seems to take about as long as the calculation itself is the cudaLaunch function. The internet hasn't been helpful in telling me what this does, but I suspect that it is responsible for launching the grids or blocks on the GPU. If so, then changing the index to 0 on the GPU might save me 25% of my runtime! + +With the naive GPU scan, there aren't really many addressable bottlenecks. The calculation just takes that long. + +For the CPU implementation, I think for this particular project, the w/o scan compaction runs faster as it only needs to perform a single comparison operation per element. The w/ scan implementation adds a large number of unnecessary calculations (on the CPU), which makes it run much slower. This shows that GPU and CPU algorithms and the way we should think about implementing code on these machines differ by quite a lot! + +### Program output +Finally, here is the output of the various tests to validate my implementations, using an array of 2^16 elements. They all pass, woohoo! + +``` + +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 35 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ] + passed +==== naive scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ...
1604374 1604409 ] + passed +==== naive scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] + passed +==== work-efficient scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ] + passed +==== thrust scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] + passed +==== thrust scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 1 ] + passed +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed +==== work-efficient compact, power-of-two ==== + passed +==== work-efficient compact, non-power-of-two ==== + passed +```