From 59b8320ea8b82e924baefd3ba3a13ad31da3b0ab Mon Sep 17 00:00:00 2001 From: Yuxin Hu Date: Sat, 16 Sep 2017 20:27:02 -0400 Subject: [PATCH 1/8] Finished CPU Implementation --- stream_compaction/CMakeLists.txt | 2 +- stream_compaction/common.h | 196 +++++++++++++++---------------- stream_compaction/cpu.cu | 52 ++++++-- stream_compaction/efficient.cu | 10 +- stream_compaction/naive.cu | 10 +- stream_compaction/thrust.cu | 10 +- 6 files changed, 158 insertions(+), 122 deletions(-) diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..c8709e7 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_50 ) diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 55f1b38..8014d1a 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -1,14 +1,14 @@ #pragma once -#include -#include - -#include -#include -#include -#include +#include +#include + +#include +#include +#include +#include #include -#include +#include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) @@ -37,96 +37,96 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices); - /** - * This class is used for timing the performance - * Uncopyable and unmovable - * - * Adapted from WindyDarian(https://github.com/WindyDarian) - */ - class PerformanceTimer - { - public: - PerformanceTimer() - { - cudaEventCreate(&event_start); - cudaEventCreate(&event_end); - } - - ~PerformanceTimer() - { - cudaEventDestroy(event_start); - cudaEventDestroy(event_end); - } - - void startCpuTimer() - { - if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); } - cpu_timer_started = true; - - time_start_cpu = std::chrono::high_resolution_clock::now(); - } - - void endCpuTimer() - { - time_end_cpu = std::chrono::high_resolution_clock::now(); - - if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); } - - std::chrono::duration duro = time_end_cpu - time_start_cpu; - prev_elapsed_time_cpu_milliseconds = - static_cast(duro.count()); - - cpu_timer_started = false; - } - - void startGpuTimer() - { - if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); } - gpu_timer_started = true; - - cudaEventRecord(event_start); - } - - void endGpuTimer() - { - cudaEventRecord(event_end); - cudaEventSynchronize(event_end); - - if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); } - - cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); - gpu_timer_started = false; - } - - float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015 - { - return prev_elapsed_time_cpu_milliseconds; - } - - float getGpuElapsedTimeForPreviousOperation() //noexcept - { - return prev_elapsed_time_gpu_milliseconds; - } - - // remove copy and move functions - PerformanceTimer(const PerformanceTimer&) = delete; - PerformanceTimer(PerformanceTimer&&) = delete; - PerformanceTimer& operator=(const PerformanceTimer&) = delete; - PerformanceTimer& operator=(PerformanceTimer&&) = delete; - - private: - cudaEvent_t event_start = nullptr; - cudaEvent_t event_end = nullptr; - - using time_point_t = std::chrono::high_resolution_clock::time_point; - time_point_t time_start_cpu; - time_point_t time_end_cpu; - - bool cpu_timer_started = false; - bool gpu_timer_started = false; - - float prev_elapsed_time_cpu_milliseconds = 0.f; - float prev_elapsed_time_gpu_milliseconds = 0.f; + /** + * This class is used for timing the performance + * Uncopyable and unmovable + * + * Adapted from WindyDarian(https://github.com/WindyDarian) + */ + class PerformanceTimer + { + public: + PerformanceTimer() + { + cudaEventCreate(&event_start); + cudaEventCreate(&event_end); + } + + ~PerformanceTimer() + { + cudaEventDestroy(event_start); + cudaEventDestroy(event_end); + } + + void startCpuTimer() + { + if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); } + cpu_timer_started = true; + + time_start_cpu = std::chrono::high_resolution_clock::now(); + } + + void endCpuTimer() + { + time_end_cpu = std::chrono::high_resolution_clock::now(); + + if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); } + + std::chrono::duration duro = time_end_cpu - time_start_cpu; + prev_elapsed_time_cpu_milliseconds = + static_cast(duro.count()); + + cpu_timer_started = false; + } + + void startGpuTimer() + { + if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); } + gpu_timer_started = true; + + cudaEventRecord(event_start); + } + + void endGpuTimer() + { + cudaEventRecord(event_end); + cudaEventSynchronize(event_end); + + if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); } + + cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); + gpu_timer_started = false; + } + + float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015 + { + return prev_elapsed_time_cpu_milliseconds; + } + + float getGpuElapsedTimeForPreviousOperation() //noexcept + { + return prev_elapsed_time_gpu_milliseconds; + } + + // remove copy and move functions + PerformanceTimer(const PerformanceTimer&) = delete; + PerformanceTimer(PerformanceTimer&&) = delete; + PerformanceTimer& operator=(const PerformanceTimer&) = delete; + PerformanceTimer& operator=(PerformanceTimer&&) = delete; + + private: + cudaEvent_t event_start = nullptr; + cudaEvent_t event_end = nullptr; + + using time_point_t = std::chrono::high_resolution_clock::time_point; + time_point_t time_start_cpu; + time_point_t time_end_cpu; + + bool cpu_timer_started = false; + bool gpu_timer_started = false; + + float prev_elapsed_time_cpu_milliseconds = 0.f; + float prev_elapsed_time_gpu_milliseconds = 0.f; }; } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..7681e8d 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,28 +1,39 @@ #include #include "cpu.h" -#include "common.h" +#include "common.h" namespace StreamCompaction { namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } + void scan_implement(int n, int *odata, const int *idata) { + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } + } + /** * CPU scan (prefix sum). + * Exclusive prefix sum * For performance analysis, this is supposed to be a simple for loop. * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + scan_implement(n, odata, idata); timer().endCpuTimer(); } + + /** * CPU stream compaction without using the scan function. * @@ -31,8 +42,15 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int count = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[count] = idata[i]; + count++; + } + } timer().endCpuTimer(); - return -1; + return count; } /** @@ -43,8 +61,26 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int* tempdata = new int[n]; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + tempdata[i] = 1; + } + else { + tempdata[i] = 0; + } + } + int* tempIndexdata = new int[n]; + scan_implement(n, tempIndexdata, tempdata); + int count = 0; + for (int i = 0; i < n; i++) { + if (tempdata[i]) { + odata[tempIndexdata[i]] = idata[i]; + count++; + } + } timer().endCpuTimer(); - return -1; + return count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..fd1d622 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -5,11 +5,11 @@ namespace StreamCompaction { namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..dde09d0 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -5,11 +5,11 @@ namespace StreamCompaction { namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } // TODO: __global__ diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..e3b3268 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -8,11 +8,11 @@ namespace StreamCompaction { namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. From 55e3125b6f5caa78a2eca3c15afc2de1e99d429a Mon Sep 17 00:00:00 2001 From: Yuxin Hu Date: Sun, 17 Sep 2017 17:12:46 -0400 Subject: [PATCH 2/8] finished GPU Naive Scan and efficient scan for power of 2 --- src/main.cpp | 4 +- stream_compaction/efficient.cu | 52 ++++++++++++++++++++++-- stream_compaction/naive.cu | 73 +++++++++++++++++++++++++++++++++- 3 files changed, 122 insertions(+), 7 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 7305641..4f417bd 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -49,14 +49,14 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index fd1d622..b757804 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,9 @@ #include "common.h" #include "efficient.h" +#define blockSize 128 +#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +15,56 @@ namespace StreamCompaction { return timer; } + __global__ void kernEfficientUpsweep(int pow2plus1, int pow2, int N, int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + if (index % pow2plus1==0) { + idata[index + pow2plus1 - 1] += idata[index + pow2 - 1]; + } + } + } + + __global__ void kernEfficientDownsweep(int pow2plus1, int pow2, int N, int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + if (index % pow2plus1 == 0) { + int t = idata[index + pow2 - 1]; + idata[index + pow2 - 1] = idata[index + pow2plus1 - 1]; + idata[index + pow2plus1 - 1] += t; + } + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + int *dev_tempin; + cudaMalloc((void**)&dev_tempin, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_tempin failed!"); + cudaMemcpy(dev_tempin, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorWithLine("cuda Memcpy from idata to dev_tempin failed!"); + + timer().startGpuTimer(); + for (int iteration = 0; iteration <= ilog2ceil(n)-1; iteration++) { + kernEfficientUpsweep << > > (pow(2, iteration + 1), + pow(2, iteration), n, dev_tempin); + } + + cudaMemcpy(odata, dev_tempin, n * sizeof(int), cudaMemcpyDeviceToHost); + odata[n - 1] = 0; + cudaMemcpy(dev_tempin, odata, n * sizeof(int), cudaMemcpyHostToDevice); + + for (int iteration = ilog2ceil(n) - 1; iteration >= 0; iteration--) { + kernEfficientDownsweep << > > (pow(2, iteration + 1), + pow(2, iteration), n, dev_tempin); + } + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_tempin, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAErrorWithLine("cuda Memcpy from dev_tempin to odata failed!"); + cudaFree(dev_tempin); } /** diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index dde09d0..acf1ae8 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,7 +1,10 @@ +#define GLM_FORCE_CUDA #include #include #include "common.h" #include "naive.h" +#define blockSize 128 +#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) namespace StreamCompaction { namespace Naive { @@ -11,15 +14,81 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + // TODO: __global__ + __global__ void kernNaiveScan(int pow2minus1, int N, int* odata, const int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + if (index >= pow2minus1) { + int scanIndexHelper = index - pow2minus1; + odata[index] = idata[scanIndexHelper] + idata[index]; + } + else { + odata[index] = idata[index]; + } + } + } + + __global__ void kernRightShift(int N, int* odata, int* idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index == 0) { + odata[index] = 0; + odata[index + 1] = idata[index]; + }else if (index < N-1) { + odata[index + 1] = idata[index]; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + bool swap = false; //flag, true call kernNaiveScan(iteration, n, dev_tempin, dev_tempout). false call kernNaiveScan(iteration, n, dev_tempout, dev_tempin) + // TODO + int *dev_tempin; + int *dev_tempout; + cudaMalloc((void**)&dev_tempin, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_tempin failed!"); + cudaMalloc((void**)&dev_tempout, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_tempout failed!"); + cudaMemcpy(dev_tempin, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorWithLine("cuda Memcpy from idata to dev_tempin failed!"); + + timer().startGpuTimer(); + for(int iteration = 1; iteration<= ilog2ceil(n); iteration++){ + if (swap) { + kernNaiveScan << > > (pow(2, iteration-1), n, dev_tempin, dev_tempout); + } + else { + kernNaiveScan << > > (pow(2, iteration - 1), n, dev_tempout, dev_tempin); + } + swap = !swap; + } + swap = !swap; //revert back to the last status + if (swap) { + kernRightShift << > >(n, dev_tempout, dev_tempin); + } + else { + kernRightShift << > >(n, dev_tempin, dev_tempout); + } timer().endGpuTimer(); + + + if (swap) { + cudaMemcpy(odata, dev_tempout, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAErrorWithLine("cuda Memcpy from dev_tempin to odata failed!"); + } + else { + cudaMemcpy(odata, dev_tempin, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAErrorWithLine("cuda Memcpy from dev_tempout to odata failed!"); + } + cudaFree(dev_tempin); + cudaFree(dev_tempout); } } } From 56921b93de62a2d2329d149ec03fbd239674465f Mon Sep 17 00:00:00 2001 From: Yuxin Hu Date: Tue, 19 Sep 2017 18:14:49 -0400 Subject: [PATCH 3/8] finished basic feature implementations and readme --- README.md | 103 +++++++++++++++++++++-- img/BlocksizeAndEfficiency.PNG | Bin 0 -> 41156 bytes img/ScanPerformanceAnalysis.PNG | Bin 0 -> 45076 bytes src/main.cpp | 14 ++-- stream_compaction/common.cu | 15 ++++ stream_compaction/efficient.cu | 139 +++++++++++++++++++++++++++----- stream_compaction/naive.cu | 2 +- stream_compaction/thrust.cu | 4 + 8 files changed, 243 insertions(+), 34 deletions(-) create mode 100644 img/BlocksizeAndEfficiency.PNG create mode 100644 img/ScanPerformanceAnalysis.PNG diff --git a/README.md b/README.md index b71c458..0cc13d2 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,104 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Yuxin Hu +* Tested on: Windows 10, i7-6700HQ @ 2.60GHz 8GB, GTX 960M 4096MB (Personal Laptop) -### (TODO: Your README) +### README +#####Project Description +This project is about doing inclusive and exclusive scan for compaction computation in GPU parallel methodology, that will be useful for lots of applications, such as terminating rays that don't contribute in patch tracing. + +#####Features Implemented +1. Exclusive Scan + *CPU Version using single for loop + *GPU Version using Naive Parallel Reduction + *GPU Version using Work Efficient Parallel Reduction with a up sweep and down sweep process + *thrust Version using built in thrust::exclusive_scan function + +2. Stream Compaction + *CPU Version of a single for loop without using exclusive scan + *CPU Version of two for loops with exclusive scan. First for loop scan over the array to check if each element we would like to keep. If yes, we put 1 in a boolean array, if not, we put 0 in a boolean array. Then we call exclusive scan function we wrote in step above to get another array containing exclusive sum of the boolean array. Lastly we use another for loop to get final compaction result from the boolean array and boolean exclusive sum array. + *GPU Version with work efficient scan. The idea is similar to CPU version, except that the boolean array calculation, exclusive scan, and final result is all done parallelly on GPU. + +#####Performance Analysis: +1. Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU. +![BlockSize Versus Efficiency](/img/BlocksizeAndEfficiency.PNG) +

BlockSize Versus Efficiency

+ +There is not much performance change with block size changes. I set it to 512 for all remaining performance analysis. + +2. Scan performance comparason with array size changes +![ScanPerformanceAnalysis](/img/ScanPerformanceAnalysis.PNG) +

Exclusive Scan Performance Analysis with Increasing Array Size

+*It can be observed from the graph that the performance ranking is as follows: thrust > CPU > GPU Naive Scan > GPU Work Efficient Scan. The performance of GPU version scanning is worse than CPU version in my implementation. +*For GPU Naive Scan, it runs log2n levels, with each level of n threads, so the total number of threads is n*Log2n, which is more than the number of elements check in CPU. Moreover I need to do a right shift of the Naive scan result which is another n threads. Although these threads can run in parallel, but considering the the thread scheduling on GPU, the advantage of naive scan is not that obvious comparing to CPU version. +*For GPU Work Efficient Scan, I am doint an up sweep and a down sweep, which result in 2*log2n levels of kernal function calls. Each level consists of n threads, so the total number of threads would be 2n*log2n. It needs twice the thread number as Naive scan. In fact, many threads in each level are not doing work because they do not meet index%2^(level+1), but since all the threads in the same warp need to wait for each other to complete their tasks together, the non-functioning threads still takes time. +*thrust's performance is the best of all scan methods. + + +#####Program Output at array SIZE = 2^15 +``` +**************** +** SCAN TESTS ** +**************** + [ 44 14 14 1 0 15 26 2 38 20 24 46 10 ... 17 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.061234ms (std::chrono Measured) + [ 0 44 58 72 73 73 88 114 116 154 174 198 244 ... 801091 801108 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.06084ms (std::chrono Measured) + [ 0 44 58 72 73 73 88 114 116 154 174 198 244 ... 800978 801016 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.090464ms (CUDA Measured) + [ 0 44 58 72 73 73 88 114 116 154 174 198 244 ... 801091 801108 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.101856ms (CUDA Measured) + [ 0 44 58 72 73 73 88 114 116 154 174 198 244 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.222976ms (CUDA Measured) + [ 0 44 58 72 73 73 88 114 116 154 174 198 244 ... 801091 801108 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.227488ms (CUDA Measured) + [ 0 44 58 72 73 73 88 114 116 154 174 198 244 ... 800978 801016 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.001184ms (CUDA Measured) + [ 0 44 58 72 73 73 88 114 116 154 174 198 244 ... 801091 801108 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.001152ms (CUDA Measured) + [ 0 44 58 72 73 73 88 114 116 154 174 198 244 ... 800978 801016 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 2 0 3 0 3 2 2 0 0 2 0 2 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.12721ms (std::chrono Measured) + [ 2 2 3 3 2 2 2 2 1 1 2 2 1 ... 2 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.136691ms (std::chrono Measured) + [ 2 2 3 3 2 2 2 2 1 1 2 2 1 ... 2 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.343704ms (std::chrono Measured) + [ 2 2 3 3 2 2 2 2 1 1 2 2 1 ... 2 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.403328ms (CUDA Measured) + [ 2 2 3 3 2 2 2 2 1 1 2 2 1 ... 2 3 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.392096ms (CUDA Measured) + [ 2 2 3 3 2 2 2 2 1 1 2 2 1 ... 2 3 ] + passed +Press any key to continue . . . +``` -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) diff --git a/img/BlocksizeAndEfficiency.PNG b/img/BlocksizeAndEfficiency.PNG new file mode 100644 index 0000000000000000000000000000000000000000..d22b8bd1245ac0e072a679723db0b4f5f2109afc GIT binary patch literal 41156 zcmeFZc|6qX8$Vp9MI||%j%?wy;6w zf!}xDIB)L0Y11!A_TSCDb|M~|HYJ;D|Dj>(VKvIyS#C@AObqIc9|(_&XS{Ka?+SGv zSbS3#I{NYN$g3)D{bGop&!0K3^Q`f)=)n0>Xz4NjgZJ%%e)$RY-necr!l->e{&#`) zdx@#MzuwDF|7FMR)0bm^+WOPMShravCsu9t^EwNsd^;wqxukE_bJ@N?p+|POY*>A$ zjKJrT4JANcxnXSC_s)3eRt?45!y#mmNHhV+hOYk_L0sHIn z7BQZ+r|U1%e_DO|lk5N0!Jg{T2_b?+EHt;Y4u@i3-1isW8toQ}B;!uaSMFF_>t_NX z-Q{gzhU{ZO#p~I=t&a{S#Kkdudh9O33aqG$rCfQI0=N<1@E6mNFe;yu?(>s4^22SBk^Y^y zxjTx9$wA_)j~$xMpX$;h2_+(ODgL;rJ#p&0^|0bc{xEE--G?0C=B%p0yY`q7)ZGSH8kusja>bsx4lnYD1_Mm{6#RZUo|E|SXUOWud5g!l!q?u6l1A9r7;7SX zLjDf1o7ai;KBdWLy0>vi_Gq-J;rV-O>)`D6xFfyxsLNaOTrQ{OvN7b~LCoqu$~X+{ z#I60m44kw*y!)387kJs&@{acOUw@6PzEG7Mvbsm`ZWYo>a-?OsX z-^s%Au6kkKgN7M0RZB@qZXYH6l!aKaNj2F5uN2k<uBMJhWGQ4?j1(6Vvxf;_UT$Zh6#r%Wf`KB78LQ<0MaNb);!bBu*#$l`RaA?hov!5=-?0;@zYq2+)l5u;( z4UH^)zF$W?ThVFx`fT|@4bO$;`Kd~UNVSH@P`&9{9T9<}5iJQ`&dMLS zs*M%y)SQvZk&8x{LpgITESRirLgt`?mf?5=e}o%^uArXO(&INfH3BJ{9da3n$x6Jg zH&G_7b_%{&t6(Bq5Er_9*#=kE>!wRo;9scO;Ec_1(in$!R$PI08T^K~&78FAh#@h@ zZ(1efb}oO#mFK>**4GKKh^8Du;^Ip8kA7Yl`Qt9P8ed+k}V4}bZf-3zduJpq@7 z)(pVEwEiVjd(E9GgLV&x_EU?Wlsi>4dB)og#YjPlM+EIf+AGC|fOviyZQiHke7vCN zw+(1BNYWHD;+A^sxSM!*^3XmLd8L{&Vlz+C%tBU*rrZm-FIfkc9}{6`Rd4&vq^-9E z*Pf#u-G0>m@WTakcZgG;a{JjJQl64k+)e5u4n@LwZYg7>_vUE~ik^#AMnu!_N%b@3 zF|ds&94U=Sekg2_&as=8qKSFmR`0ib2TOCnJVOclJ6eV)60iQ!9D7sPUS+ zdc(!dxlQ9&&qz=H(1N9nq{5zNljST3q}Uv@W;YRw^iIZxV)+v+1sSzj}TOaEmoBn-nu6`u?u7E>iW#|Ml&z~ z!Jc==|CvQLhtZkmwA=mya2{i*&#s(h_9)h12gNsknIk3`aI7=$fxdTqjO~`(QXSE) z-TG6{+Rg@5Ewvd9sfOs!Z5}#JNyUG`^<;Uas8Cb`=|S)hBT- zQAycWuKqtw;^uDP*Bq+2mS|J{ju1hmnJVYA1k^>&9Nbn)rx4EJQuNo{P z8WqgX)0^q5>OJ^{I-(R^dq2sgv>>pYG-LSln~plJsKmH@)%haY9mCO{OOxVD1#H_g zErX>@C7&GAUy*h8pybou`L8I$F{HjMMoVAY$0yxtJMa8P2+5b7l*&0{cdH{%y!|m< zn^y)G@$xA>mGU%u(6|6@q?>6~lN#CF9Wwl^jj!Cn+=#9Fr(>gQy&d*af`Bhue-fvc zI|$|e!=#mSFi^_+R3?8`3$di^)im2P2kyACx>$+6l9g1N?H2|Jmu94*A0Uopx$GiK z8HyR(5D7jG?QXfgIs8r-U}_x*iS&e2+;ca3}t!FUIXQN&L?0(XO z7oU-StF@s?zRI38zqOq43%B-t!E=CdgVB{9p91&Oz+fLpHD>uL6EE#5V_XhQ2JS!g za=IUQ>4m@IE?a2@GT5;J2ndjb^j)Z)h`k|y*Xv^~E=>-KDP;e&#ip-fdE(pCBcdEpt)1@n*R<>aAZYC5p&|Rje6o?VMYXP_aa5Vms4BP1s1&e%P zb!l~^e;o8<0n#{GqK5(C`2m$y%&c^?M{9R8g)K%WS^(oug%pK35K<7T^kX%vo9z&V zn?3d%{HhGTI_wL-`J-vwWtleUm0Bc;;d>B{`P(}99gm{0#Svn*V7iCHBR$(uJH*Am z-_Unf%9Y5K$-TgwVd~svusQwOeq?=fF6LhRa%rDPZr1(q|4xAU%{|AZP^)SptF*7} zzs;2iD^_~w&F7`2e&Y_2C#z@{ zu=-;&y}C~LT?t&ivC7)XjW;4|)+`1H6~Eb4f89O4?iOW}VK5g}-(v}LnJaCo#pjv7 z!t1%EZvMGS`K@q*y=*f>O6gKh59diBZr{D%s5VnNRhzKvsjOP@{GlT>_b7qHk!%Bz+}>N_tu|CTIGD{1Mw%$+Q!9OemT%2KZ>*ByOHZ zxv+#7`a!L4L3rbYiF|=p)q?fdh1uerO^p73zwvM{e&QMaR{7=~{u1jV^7J4pjZAx? zkr)oxfrZfLePtn@3BrK+*Eu{T~CKElF&@CYVYAJZ?ARw0;-srh9C4_ z7(|a1pRN7ZE=S~iaL0#u2w6dt-D{9Xb~Yi)?t+pK>s9Z&`gI(p5tXrJQdtl=9{WW& zdB`55-B^hb$J=a(=PR1iXxLmS%)4vHyoC&*u~Xe!hfuin(>WZw``rnR^w<9e&vdquS)A+X&=>bR zL`9L`(w;AleHz`iuzvTW<`Um=R9kJh8!{%+6SAP79%ta!!^{Hy`UB6rwsjO6y{Pl= zcN_t;CP=T}Pke^r_w-ja-vvKjFtC1dfAc;69%zuXZBjW}+Dd-S^rWQHTW|84en-Z{ zlVCFHIo<;PHF)cS5!u|}-{VNr3Okyd);@!CF)mY^@aSHAAKu|wJ@azPvcS_1GaG1l zeztezysHa3CDr2Zn}YwErRk3J3s3yeq&uxrp(1Nv-xNdTok__~rOED9X`>ZtM`-wb zaN$U|?(vjnS1dW5#_<;wj>(0OY22B*lqvjJAvidD(F~HmzepK3@mjS5{JHru5svPAjWijbfrF0Cb%y%G@LReI%lizcOZ)3}4DW?*hh8uOk^MRZbU@yD64Rf=szbh! zRcl; z(@g3vJOmKCI$*{D*|3+|u%)@lAtH{r$PJOgdxw=k(_#Z7=U-m2PuDG&Cln&NtnTk@rzY=`koacy z*r>Ga{LV?tP#^wCUeAjTD%!W(bCKC=*)1p;j&mCz6eGyk(dRByTyOl{r|^nffwpDW z-*&_mg?L-NIr8+^aEsJVsAe%9q|1-TaY`LS*xRva|#0u zA!PVY{5=`#hONPEUx}isKb%=;+ev%kUZBsdI)(2w%WTxqxd)fczVo2rd$9h@IViWb z@oaCJ)#Ri;Cjzz``%E?mq0k(X^>F@J7Wv-(t^<1X{tT229MU3`TtAqX(%EUDix4i^|JLYb`W`Lrs~r_)(=AZESBahJ zVUog}-{@rR)ZKkD`j%tm1!Z^J&3Ep6?)2sS0qR+Gqijn@!Ki8(L}r=ZIICMN>uM&8 zRlCvrEg?YYPVa2854j>{I$7LwL@HM<=a*;)-Z&*cNW#q_QOA{%x-59#M7z*#gp@C% zgd?5o9+=fC;&p3*jduY)Zy^R-|mh5HIbm_7{j^3 zYMe1}eK+}xuDn&_=dH3X$dHz@bgasa7gk)(_HF*r0UEmWYpM%f;j|`B*Yk^c5K&ey zmRqV^R&t;ucO_|R*7~G4+^1%g6SuwKiS$jNp+b=Fu-0@_PDESnwenw1^NKvFM!YZ_ z7eRZvPCh{eQoQq!>+n$DwSL4utoRNPfSNY-~8J(xHGn!O;Rw@HG4ir%C z%EZ36Ar=td`Ha@Smzqm-gT-|6q6oR?YFQ3HY0;%BRjEvHRTb%mB|h*S)kmM;4)LN1 za=%S1@;%jP?YY#UJB_{)DmhSQjlMj2IH}hpkMb9-#JX`jpKczw4%$@|gTCo%zxU2z zNUY?N^da0Nq0m!rml%rH#y}XT_?ZwhUz^mk%_@!twVlkD?h9^ypuuBmnwZ?!pYmZl zqTQ5{nP_FzXtaypFa{RW27q8aJLl`abqw$tLcZ`0DzFh$yK~yLjOrNeN>ePB`?zmd9Tv zN49f22v|!DTpUVQeG)~Y!l=HBtcQwNMY$ME`D4ATrmO&+zk@qx`ed%%ntq7mV@k+3 zc~FEt{AU`H#rPdqo+BaIrTQ&mH{C07MNI_LX8iWhhlZ#8Yr0M^V0R{=)#-qPT2=A> z+}`mR!b`e_wsUlVqC`^gKlIt|a zs^_=xTUspo(P9uQ^t=chJ+?GEJ9;B6LmDk&Jf0r>o1X=t*Q~Vtmp1cJ9*M^-vGS&Mt??6UBwwvG>&Zsh+dlNyCf(E%DvDYFn0IM&9nc3 zkh`srh|M{8dnmP?d-3fbU@-Ym_9#Vx7Jd`z_ZS^D&jU@7VOmhKX}pd6EmZ zz5*aFB4o5E1EtaQSX#gu`#gxgO*UJQb~9K;h$BDd?g)ow$pf(J5OX2>%G}QTfbgBa z55QM(ll58T7=mWje!dVd3D@gL6M1c?MjZvVtc2$?IwHn123{%iO%}W@V|30KZ`ObUa&x2k zm@wiXG#3+$&YO2=Y*WW!#-%y-arT2hI}=-W@7Ku4x85IIZFbKXDLY- zrI8ZRBJasog@M}>xWrvgpSL0ud`2zRdxPEbS+09_g`mmY5`ft1E(VMBh4rBQ*&C~IBYk?qzt zuPT(QkOzQ|u@er&BN6^HV248M#g$QBFpFUX#RJCroSM~CGx80Gzpsxaq+)ALpJXCk zT=HYofs+q0ZwN9Nh$YcYdOa%VTW59HK-Sd@@<8F4Bj8uPb1c$N+j+%5d$6UCd$v!jSAUd2m)~!j%ZS(o z;wvKQ?sx|At`i|VqfHaC6RaJ##MWc#duqY?Tu^2JllBEKZ#LDZAM@9qzII9)*O3gY zwdQa<%x5+c=b86L_7>1DdZ~@S59pH56hHn#j>eoIJ-c9Z#QXuU)Fv13v@d5gmU_gQeAz#4g+@J_mQ7qJMfmQ47*;B&99scx*3N za2h>T+_Fr_8XHJ(FegNXasUhOg3C@cn(YNg?ovGf^pBDasufmdgHj*sUPqVb+GMW_ zw&!e3$>g%#!tq;!>W!_C{<{&OE*%PE`ISd6Fv4Aqnf(MdQrxbYr?l|gwrkFqrOsHn zMEw;U^QlYniAYUeem(wX5JhInCSu-q05xap0c;ChlCo}FSHrQ99gx{~MYGPgKv7Sj zDKFg8KQ>KpJE};+^`|K;7j&s;H`odb*}j6cSPTr2fuhe!Wd&BhL;?>foq-$CGC_@p zjOSy+3BY7IsS_%&oD(WLhc#<_uwSP3O(#YRR{x$uz^ zZthP>oj8ItYRm#g#P3=2oH6yiTouv{%y7*NWfZm(a_K0cg3f_;1K$rC(v4CMq*(Fj zlBr4Gc-2-9+Z%%IBo*cs+!?ES77etGQjP}KwR7qdZ)If4cV0jr3*m?@p>SLg4@8N+SjK<6lI3nYy%X@(;ydFKqSHxDe>>@tZcQ< zZL=6%6t4t!(0@FG`WM^%rWppKA($E-;5tPbmWk|N7}tAo9lj-}coo%XkVSeT*o@St8;uL{)4uF_IEW3zng>T-EC5j7`|l zR#ZDsLl{`0uLqgi&K9C{>}n;Wmm{gzr_yP8i}WfOjWmk8uTKXyC6+_HxuJ{b3zT5Y zjAohXb_FdS9ARf<#FTEI$9WT3^j-;Ut)>*8fPA-|cTQw`p+NY9H8e&J0)_>?nrr`f zzL&7a#~Gmd7B;<>Uc-%Jy3!ls>94eNVFfN0(8h=N6WFFiZ!*<#58%?y<0vl)N!ySQ zG>b-F9`aQC%AXd$BK;Mw162#1UNH@wJ+p??n_qg82GoVH^vN*~*dT-)(H-FXB<@ZYM$YX$%w{Oqnm*#J0k4WZ%ifd$tPMw{dd~_nr)3NKl;tjx5$+Z))#6KqzNN zoW@anEB`)Ar4`4!%xu5Cwv|HFY?LYYK9UDzRs>@)={s&zS0MqWwQ%hTMc*A?gY@;T z8^d;iOk<{7JG(@tG>$lUBKs7*B_}H{k|*C+s?%gIH6>Mmn>A4&-P_>{#2WG(rtXkk z;bHkDXHnT!ZW@ASwPhJN{^Q4m#F1z0KHR>$2V{jn&U6hqmZNFH1W3fbnZ-tz3+^Pj zOmVTuh^M~L&N*RNc6Rr^0o`e*6`%&|Hh%^Cjd@ER0;4|I0VEBZVTQ`Ja3SBxjl2YG z13QrenCW^rrcHw*!@U(!wi7}vvvycM_9tMh$3Y;%UM+D7G*s=4W85V$o(v#Kv6;^U zLnD6YAz#jv1qSwO*2OU`=w>D%lO6~Ofs&)i8W7WkyL7fr#6^$iv(^X|>}))pd<1_O zY9ybvH+I#&^3SF+9Q+cV}j1++ncYqO<5ecC8`*a4I8JquHoTEJW*=q+ZB#quC=I+1ckrtg(Wb zMnGS0`C1toloCY9kg#`(#}aa}25e*I>p)t(VpGY$&#HIy)O=b?Q}H*Z=UZ1-(Z(ts zgQ5aCx+Vn$xKx~H`2$0}F!{0RVEH@l+ZS_qA_!4fm5XuX86Dcxw#bS5&F@ZsDM8iE z3G(GL^I6QXk#@LQfvMRokZHtOLZZ+A0>U=Bpi5FcTX%!BR@`$vtd(S1ex>A2-~yyw zLKa=Su%)Ckr~jDWXvEHiPj0+Ymb~vHOpSnBga``FY}H0Bc6f+Vn@+yu`fDwt(T~*y2jG5)?O_NS<%(>-OOGTwg(B!Nd`$Y( zyZgRtj7m_TQH zVZ4@F70KCNmmLXPtmCEf@sJ)O9R{Y|r}xoVkupbyp-$Ea4pDK=aiO@&l~y zcVk7Oj1mC#SC%=4gCV_8YOq4@^p?PcM+T~lSW$lFT1JlcL>u6)&K4yI@=OgRhY_R3S8Vn6W%xlM?Nnn-1Spkv3 zz)@<&Uf@2+|K2eS4U+h47ddbw#oKp}zl8=I);RtI zA!W*&;;6o>?-YM2@MXr@aXZ_%UOlZG4*g1-t?)!Gd?qPZ3+@9=3Ih|({sjC!co=z` zXZ(2nYTN@vLz)9~!Q@mGbf~B~8Wv+#%bx0}3@%r_NNGE2lUy2xB?88^4d4htYpxbc*aN)P zaX$UM$Lr6>NpxLZ`t`@w$P5kq=VbNT?4)H&uew`tF29@Re9zAI$JKLy{ zJGu)jKmY!By3XaR^8-Wruq=5>rJB$*&QD&X`eBPVQzT2Kni-`hK zwbnfnPmEe-jB!Y1|G9iMsWmIN8gmvUG<%{(ufweGSjxrb%(<~T$0Ct3bzye0~_Hr zsdMRFnx?DW&j`zxB}(OBo*6si-I7Jg{^BA&$A!Ms(>g1qYkiv7r^07ilB~AaM8qpx z<`YPpXKN(F<)MfW+Ay|=nRf-a^dOncP+bWL(8e^K{#JuC+(17_^muF(Iv z*LpX7&SB{RA@2!KFzzRs=*P5Y(e=kgtStl>s&+o?u*qilORA-DB6V>ai|9ewu!0oKSF3y)6`EP=Qw-bG9V4MHx)1A(e){{_Y_t)E!ln$4#Db z$6f@5BUzw+17EgZj}wJAg2fjW&f%dV2#Nc?!Qs2L_^q)z25?{41UU8ev!4UONSXkE zJpv?S&4}7{74$j;_3X&#wr3C5=0r_5S5R9dqDdKpk1DDj<lpvAjsTe{LB8HVD6zBR{7l^JD{+OMb`KcU_31U7N3$LX6D*9mRX#oIEql8)95<&r zNeXvtgoSF`Pm}WnwV&zJys- zlIf0WR`(VnCmJ5!%lfbnE0wSiVN&g}N6;GUQbiX?x3&OLm{9ri$*&C~;k6{uUgkpm z;Z{K};1MF&Am0sGhX)1F#l%s2bhhnCEx?4lbZ-fQMAUfOs8XgRcTcWtf`Sd^wV`V0( z8`jf?p^P|ld~d*lax4s-m=8quBXXcYY^PH*CLTXg9@YpcvayR z>XlE+!!|y1E~gH~6SvGaLm|?VZ{JbKX+0gmc~{%`ZUm9b4wNE<@M22 zcC`Z31_Nrw72$3pDzPrr#6KT2YzYzf0qO2^YbtF%A78yx=+oinU3&>+sRveeY$mQ1 z>#a30nca~_2$4;0gA+{LuR1~fXqb?|pLzH$~oA}p*1`d8QNRdUbb5t}XAW+LU$IkD@@~>yv z$EfylQ+(L;Jrn63wO%OGDUYAz%XabJh4L|#^8$5l8O3QTTUI z&xUchM$o5NFK(7nA@^Jxite>QTSHFfDB^Q38<&kYnHXle!o6=Q+b6+u38rqH6=+8g z*#>d_x4)?ythKfBa{x#`tbCpp$q^uFuvdS%fsZNRs{in7smT0%7H{1mqvEP**2McG zkl(ZeioI_bSMTqXF6;nRA7|M;1Nsby=4h+}G!Glr>+a>>0324R4F?#Ayf8E+@bEOJmBzsB zUQNu`!qWMqIbRDzlwB=UyNB*8fsHGRZ#Cv(z)88H@H- zW=-fO`AJ>=%ZfX@Oa^)TIe>5D1NM@KeL>8B8r0N)QWH=?iv(>%bWb9vF*N|`;^7XV zD3|k52bLlUe+^}x=zmgH`8yRkAs*{|@(0~-B6c5G(>8G=Da+C+wY=S+LWO)MR**FM zd@@WbUQtBuso*f>1#?IeWBK{+f6vSc$zo0K3tP<&#AVA$!hJNss&I3iO%D9*4jm%9 z%nP@CYuk@|@_>KQK3EoDr6DhAI6qpd%_@VM;G}$?dIA79lBD_L#zONG;n+?xNDj!0 zk^f{Dj85XBJCgI`l(Ma37Dd=~MgJu#yA8kwI1jDe{*N2F%H3>SLtKcU%qMG37*uv- zqGF8Tzxjemc&+h3sY>&6O-s4&RP`9PstU3&&hxRpne3FR#f<-oJtpg9yMLEH1T7Y2$B zJ-h+{R(G#H9M^^|m6=zKYodn&pieSO8bMg}<5o{LN(EYI@vI`yDKKry)FMxrdrrt~ zJm|#x^b#kE3$rt~b-e9J$Qp0a;9pxczSVes%gdR_=7=JUpt|&9w>N4V1%Mx@KU;(x z(9XlMf$$A_L^BuOr$6=(YG#8&8?pcvH`(y#5c6E~S;Kj9Icb_fne7~5LjXTMPyU|t z25-Wa$A}d1MQnu(x51s-4Tjbn9aGWWn2rGS<~ZEkQ1fW}|Jo}Nnf--94z0CRFt@5fX%FXj**eO@OlPkbP;p|BKX zQ+>2F^YUM`3)$VQ`Hd3UShi0V^tEqrQZYDfp)#)m99QFcf`4QKQ#4lGU0F$qLP<9$ z%9Y)iVXqpu0#N}rLoQ%1#-RfM%NX6=;=Aozx=WmR^(0@V=U@NvCfGRfy@sC~j$z3r zgyX%Bux>esz0UsZs;eQBA@tAwdI;h0s$Kts^F41EH=pnITVXXL`8yR!AYdFU7gNyc zd1RE3ZVMHI{?8N?qJc=W{5mT_xcPf^B_^Q~_ShqN#)HX*$7b_0095MUpUxkcp zI2N2dLKa%qTNq*7r$rl++;X%g#za0}G%b3qd}-r@l#pk`tqK9A0kUWMY#~8;zOVa5~HZr3(4B1fUf&H>wLvk>^BGa|&S&C9d z&;L|@S+iK4l7P%fr1eHh4=^@1qu9F4D#Qe;%lIEg>b^b#$&ji`r&kCkpO7?MsNLw| z18{cGk)u_EM4xH69m?!$Q{lH>k+1Eg&8B*YpPAYT8L-x^8`1|@I&1{8E!;~XEYkGi z209Nws)+a;+!U5vED~E6{l7kkjWD$_2u!i{FqGJc)q`z~Iq-rCfwSqaub%zk)eg;F z05TkHI?f(M@yGHK3=xBhp+>t)@JBgMeR@{I7rrZK&yPJgn;6rSPLhWR7B@xLuG;0( z3o=S^o^n`rjll+7N>I!>WB8d|Zo=Q8Y6pVH%^QAFOfKh?+|vh(%<5_-+tQ8Cf~2>a z7X7hp|EDeN2X}bFJ~gebOPfeCpcN*?t}JC8WxZFy$54L|nSN=aQ+77Cy&$tV{zRHpcNDHF^P>Op ze5>G!`EZEDk0l%wRwBF8vD*}svU>)!jB%c2y=OKE2ic+q#wXXJZ)=R+wYaFV`^Q60 zN^k3Kb6o2w!IpnqIZu1e@Z!A1{o|N;2)?U0g(!xaWDGZtgpu?Sf8kiS+MYiC;iWwC zOoL>u1dxq9*+CFVP)d7Y#L{{@e9Q%7tHX^RzNr1fn|Wf;qHO^Nai#~;u${&s8qmLk z)jTjdaeWtDZSfztm9Lon__eh|K>cj!?cTHRPZfuaKXxrsA91|tsoD*MtA@yrnF9=V zO*5Kw+78udoq?pOn>&~l_}q>#7yS=<^QAlSF^{dWf7X~jtrUGR4*Fx#0S#vUNKApx z&TQpF`IAI1T-oQ2A3z>oS|laA)3;wSn?}CxcxuIB8spH^$NyvCSLt@DtQ$J$r_Yb) z{#brjTlVt6zI|sWk3tr&xqhhq0m}8QxpT{oBFwy@%!ypsQtp$g$>*B#8;{$83SaOS zS^5p`IT%6v7;b1Fo`5vQ^*@m8n5|x6^}aiL@UX;L2^-uWb`k%@Xuiw{kR96#hN|Ol zknh~;GHk*ir8|8(i*p2_?S!XllNVh-)c+Sb`$~IhPz}YtgGLD(b?K(IYmVmShvwvy zW~jYpun#-`=WD2ODK5mD@j8*DD%PlLR-R>9W>%6_y7&ZLyBgD9Ae?!^wn*oPhMV2; z4Vt_J{4M0OOS-^y3b@a&If*C=JgW5i+s9)S%&Mu6;U)rFxE=p{)1Z3^gy=HI)9Vhn zH{Ee$ojC_!jaFyeo<0mCkEYT$Z-AL<*CGN#PU&gfv0E-TYB~Igi|Q#jW>y^tF6o%nk>9RW8_(E30ZByQvUGQzWu?v>0DZ2qqMGl0?Ldy zt(4R!cFnhXX;4PV#!4`gia3>#{3SMCYj*z+^6lPZ=0ClX13Yfja*nut9d zz2i&c^jhb_VbUrtIDYsjwbIh{pzDw>r!yPI`n4oj?eohmw9+0IU4Ysnb~E|y*+?^lcD$_GwqI&6YlFjqm?7MY_9UpQfMF2#tz=XSpj<(B)I* zdk%0zf}JYAE-!w6ft|y*sVSeT5au+a>yT*h&@MzLF?h`LTL>MwKr@ml~_J6Ef)Wq|L3*cX9OaUEIY! zkAV>I^IC4u=UM$D zGLYOBCZmAge%H4+hh@Ec7pu!;u}|<psg<1NXZ}XhSDreZGt4RA zPt+J6i9prJ|30RI0~@YIt6R&eob8{%(*x`;yqjuZ(}J?3L!O!nBJ8$BacTUlX*w3c z(+Zyo>p)jeytXQwu&a8*-uP78;>GD3qL6Fa+?L${&Yk?6(Y^29VXR%m$RDccadn+A z>(4U#MaNbeWRzBTgZTR(vnwuob64x~?5hq@JrPls^ ze7hVM{VnCqs==YY37<`bSa{C{Px-t0*`<5*$8luUx%N7x&n;7yRUvLHk&yi`mJ4yf z?uo}jp&!Ip6*X=s-FEM{d+h@;LCawXh}!R^iL)ztosIteJ@zZZ2qBf<5valBxo+Bu zJlvCR9r~+a_0-Ie)vYIU6Zb{51w&Wn&Ocx7N{d|_k`bD%pPJHKNzXAkvxT2zFTGGl z`y?aOo%=^+jEq9@=5XAJkrG7=KE%r1oeYI3wL&LC$3z^Pm{0{?8z%fUa8W5nWu8DcwwXk!*rtmNOBR~sxh_G zw~OYj>iHpzJwbL%f`+VtVQ8~`RV4xl2~4Rnhy8P#IMYNymDMjPK5DI?>firyK+Ocd zSwtL3v=T(fjq=%#G@LXNY>^u+AHy-QHJ2Cejzu^T1MdnqmI#VkVLqpuQK(7-W~nSvF;vskH_ck>Kp@@s(a6+g%6QFyac7*n8d1i zF0#i&fO8-o@a9Q=TBP2S9!G? zkjn>Ns~hy;K8GBx&Ex%KK6+?*I%w9Q46a!ATb1YASj6mS4Yz?oYrVMqIb($?-iIGaToMG9`pN{(|hU0pX(Y5 zzuyM8Tz17M8rnS%howh-@-*Kk<3|~Er1<2&{(o@MUlv~4aM5qP6Y$~>n#sO6B<5ZK{O;%x`Or!30iM5;k`v|$>%;zwKR69sH03QCTp9)+Dp?*#J%N(JX;dvfq?w8^HTR)y{3nXZs-zPf!f zI2EDN50WK@bM{)I5*DWRUL|j)oLF3LETJd`zz){r&V8bx zPXx?H7rZ0$S=B1Ul_>_ycDni%oD#kp>YTCuUcCP{a__cOOyXUS+sg?UF1?S*jF`Z= zs=nhp6@v>)^0tg!e4XbwUFUMXqTod);%CcR6-naIK*mCkeeccM+?f}JvXu{xS2|mI zGCkkd86E+>}_zpte7~Wut-(D zterxQqRZd!b8MkfPZ5r+{-xzxp0hkOTV|#AqMS^eQdaCtNsq9zg`MpL33AdxrDnJA zLPk$qq3=kS>vg3NiJ}?VM{ZWBJ8A^;DwT(AC7Eu#-u8=bT9O4*7IZVK@49(SOa5wU zZl11kwgF%jXXbsy#;eA#?W+tC9+4WPr-!VD3_kV}7zjsKlWXoHh8*J`#zbg5dHcBj z@!JSm#EU06kNL*}6Z>-mkBpWwBOEDs8AJELzV2dG15Rh_;r>vs>T0~Ld~gy@v~hR9 zDcGmU7nZRD9zEh!rPT8Vlb58OB65sebi}WWkAcx3KO8-G5pLLqG>rz3Iol zgY&KMxY<$9?qibSv08bWxBf|J;_`Ko)m%k9R9EsevVW{B!h`Ass^k|*c-Z2jx9z@U z7HrVt)BP+j(YOO>XXIEKkG!5$@kAnu?9wF@RSuLq1MfT)|3NjXI7ZXV*{-(92Z}de~Pt?n{;P|kce9~^u%$5 zrqc9Chr~$!DCx5FmI>W~3Xk@@t>F5T!4AzlUAnJVJw^CkFH4bNf+ZumsjhuMtm8uli&tDEvar>$%17{gPB!HKvZ7kf;G)TvH-)C{b~MUZ$xK9~KZcBBPFs9EnBd~W-u;=RXWT(F`ZyI7NyFY3qRhSQ_O(D9bDd;FYL*P9pU&Y1t z@|VQRxr_vubAM_->pC@MS+XO+Z-T76Dvu8m^7vF9E!{az!950^LpNEY0(NG?EKg`UEf9d#QdqeJleZ%&@uJ$F$;Ci#!mo5hjtzNnu8>7h1dV!F*fd8`b zbcY9i;&j9ViM>#P7A1dN7r5gI--R2}qfbrUZ(H{*uNWteo!Y>*-~gDLwL> zwH*8lQN?$-IInUsZw5#pG$5b*RP=t(c2!RMCBOjg0ks|m2Yy9s+M~B2S0n;CI+#~k z!XO>(E}lRfb1j~bhL1eN z#gXo3)Be@f{->x1hh(alNvQLfi4K`ek?YD*tk0I#`mIYfN^zV#k^4b=kcRs4L8>c? zh;b-b%(FPUGVGPL`1v4J4+~Xb`sNC1;VFfTHFf~k%tSd1hQz1bGqb`MYEg==npt>8 zY=O=IvA5|&1$#=m#s@hJ4h9y_Jgoaa?Y((4o9n;-t-HH)pz~lTT56~%HI*2vrfr_7 z7*aGvG;CA5LquC0Xp17mJkLd16&h1^ca=6VRge~Kc1uJkwghPs;knuSd%owle$RRS zI=^SFv)0caR>Vr~`?~IHcwev2gvz~dB@IA2;3;Qch2lBchT>z`!2-90Ey^L^4)e>n z`!W{gl}%D#s&pEMjJaK_Z`83hN*WvXyf2!G1+DX^$_Ub+)^W{57n9EQ6U+~w?-??N zn|{m{is|r;HqDo8{%^Bia5cfQ@z2|WbC0XkGb@(bw9s0&fv)x|c>F=&Ic-zHUo2x= z=>g3snDAakcZ}{tS1qP^_J`0tn)XkhYV3>)fr$5n(8u)I7c%j_Gsi!Wq?WBbEmU!= z$~2c3u0F`%#W)xFzL$q+ZeqH3Yc(sObDPc3T1`t%DgMVMShhefH>`f;<9mM9a>9yw z!gPjHXElN9Vbrp9V1e_7UB+i>-2OWS&r_FFY_BQ{+q(c`$Gs*%E>E~0)oO(2x#*b* zM)XexaMYRapQS};T=omWA6O_8K%I*=ufir?t^9QsfG^517vL`wMOnJs!Uej3thllmlbUfip?piC*`XS*<7( z^#ev1|G0!Ho>kw0>n~mMxptWC`Q<{V-wpbhVcwe_VDnqT|M9(#FnlP-z?7VgvwL$g zaKa2)#^@SWv2{~$ak^S$adas@`Jg4ys)!qZ{j=-l9Df8(^JPt9DRWb`qVo!|wyqE^F%F8; zdP&f@wZ=6yy=@!feLvWE16CV8$iJNN%L!(vh8R%HTxSg zKHTw;^bcshW^E+Op<-|skuqQWX{U#pZPYVXL@AJtBZ=&eG$&c)&_jw1e_JJL%*%qxCC1!WYXavG0d53O=I8qA5-G>QhrH+FWe#e7(cMNJUQB0 za7~Yh(=h$HenIc%_TrHO%pJ5J6|_KELQDB|w=4op_umKn$MmSff$&#d!)yJ9C0*VC zT`{#E4_A|CNqx{NZeJ;s4i98IM-)#u{CbR$2>&_|eRBQ;h8DvplYPQ2Sd;!H52_mv z?fKp#9lfQSkU+$`n{$v-oXM_;nZ`~ia%-}+jpfhfBw{r-dUi2KoD%i@xZHG|NQ!1y z!N;gS`bHLO5-xs5FRd^)Q)_fqe&D5Aw@UibHw2dz8*ciYU+v7w{^M1ufyUoJyyC#0 zgQ>w)Qr*8^ZC^>kInff9m)(!~N^lyvLk_pH_)c3_w5^S9dN$b4fWzzKiCHxd_b#7S3^I_b;tjJ zf-c-Zs$IC@t=5~Kch6;Y{u{dN#Z&y2#QnJJu}xb3obiDQXvh_}#YEqgzpB|Irl`e7 zF`R1h$SheS{M&r=kds$otbhBTPYzIQY7+1%PO`DcV_~MZLJj`PI#tC<`1VKjrrR7r z9?$DnSe1{&tVuuo^6IJ@Zr9$XuWxeY<=?Y{7ZA12z4*{U4Nr zjB%434+rwR2rc1P%1FM_X=4Sa^jB2#BUTQl{a($D$~8!-6&a>_A<)bvL3tyzsQ_Ag ziiw@8LhT)mYTIwgnfxu{Y7EhXUKMG%y&EmdY-=r;+ZVz3%Dyz9pRAC^mu8-#%cfj) z!`WqpZXJ1uZ~0j#r-T?U{+tCLpkpQm+RBk9FVU;JQY_=FguWi*MYh%pP(8y~beV|K z_$KXm+dbbV0kNL(Lz`x*yUThSXPR;hBq0(5Rh<}%HAHD34CpGCk-?q+UITm-ErAz;{sJB}=> z^2cBEB3H-Wz|~W8+&G^8*&vc6aB57`uhzGc#-$lX(8rJ;xOtwy_L)6a5XQJ=Y-+@$ zFSi>lwhws}5rTVn>=Ss0HZ67epmWi36(g+$^}>12ChOYm>i&>|qe>E*2;#9B^WPBK ztxk9K@N3cWm3&&quZf2`Iro`&I1@7IQ9LCdW~*${5*W zouKKL@XdvpqfM>p(9qX3j(L0jK={l9{Kc_AXU=Q1W-Y89YQ%cscWUbU!AE@4jpjX) zN65#gyV=NIx!!LaV7*Sb*Skb*Qn=srdIJBc=WPk}x1!*(rZ)lO-LcF%!`!y(`j&57 zbYJwpQS&?sn3!{Hv4B`Km06IHi=c~Xi#K+z*y>rfuv(Fa=b|1$%00$Av-L!mpL>4U zoi=5ZnArK3jE`y0LN>-vC~G1UIOcSJCaot1OvJr&97qiZoYIidsqCZ`4?`(7qO~<$02dX#~&c# zSL4iA4+su>Z_mO`f63Op#tlk(VK8@Kp=#e)jp~2;cf3Z5r@xrQh;f}91UZ(aVjm%H zAts+0(C>q+;a++h+Vhw*b)~53BZ8(HZ@u)7`E0s-*><5W)|zPW>}~k@;^gW`R9w}= z;$Q1UK4Ge3hb=joM5znu;^H_IW6vaQx|)M$cHLOn)HNv=;*spD&coqPM)PKq{2m-9@=)Vr#d?nFnNkMS5akOy+=K*MP>fHU%v9}H3#3T&^ zAeN|aFy6njt6*`eCtkoUcU`f$S%+am+gnbJ-S`)U!da;2EZv3WTWS@)!tyJ=j@PO3L#9 zzLW}Bk`73WX%J&e_^B?Xt%UW6n>gWTc*|VJu20LZ)J_jrwSdfG|6ye@>Ax(kouwE{ z&QVX*+%7$_#@7ediHpSYYg?6*Ti#Azz>Dz{@jx<`ByWQp@1OkIcj+Al(r?D>{uf^k zI60r2rvoGx{X36o?~d!b)O}#LtIi|-P)acIFaDCaE>)mEH)TB1r!nzNvZQ_H_f=-9 ztpDMXPd{F8f1q>^cFbVlsBeF~r1T9Ee>K|>cxl*(6h`{#t{Z)oVoLW#*d)fgZUrIv z?{SW}tqpbaX(!^Z9&yvB^wN$ewl1w^1g6V7+H5V<%3D3slTEl1N7j^;KEKKOxO)WJ zSR!?PlXq`Sz86~AR`d#fwR3s~>!thYBi2uO)~`;n*$nS^Qu*lbFGY)ei@NEW#^yv`syWOcsBPDp*%nG)-JM7WoH0 zxQ49A}KK+?@eiuaww;%2ad97={ZBTW*>Wa(L?qT zrc4j^R_>6Lhad|+QG(;!#x1yD-GheKptj2mJ@IL#T8r`D7^iyBO42#&rgZtSE9|O- z5x!%>(A_K1^OKZKsm(yMPR^)aO&76+fA=>Ib~AWcQG@vB?AFQ84qHo@B1(77l8Nhg z;#VjG_9P|iM?ph^7z3uNb=vWoqQpQR zYmb!GqXVG!tAA~Me2=s5D~X=CyJd6t3iE|8wLgVXtf{?1&`!520Z)I-OUBnEm?X@M zIt^KLZrRB;ei4b$TsP+4UM6lOzKc~TQ$mK8w&edf!8=ic$0_skP|B1{z(5>h+4 zJ>y0ydl!ONy?Q7K(}sUn^ZQLJOVJ}lwEjtG&pBMl-P%Bj-0U`*=t*c|G^W317B`1kP_3|zb{=6LVUGq4O}szXUY+zg}ZX87c&{0n=8 zjJ?%NC7Mus(X`rf)#sE|@D*vf7^hMutXMTRvcvyE-R`|BcUgvN(gqwv_|Gj*RFwAE zm;4eFx!X(evcm6EXF~I8pXh+73axvxBTkW91a^#!gLw;A=h1RfA58f)M zm3)UTy!9-YUIk`7phARmJjF(Qv#kIl$)jMJn?9jg8$uVV@pAP*uhiWkSMz!hwFM^|Rmw*ZSo?|II?7 zG8#ub0!CWixA8+wD8B40e-yiFY|>`aa6uBk-|mDPGHZBQF=-)qF_0q zS-JrGNEkzfx>7E%@@8MJ))r9%)_y6b4qt1Zi8XKB6xV?w)n=LOGcSLx_<%cjUgElI z7y&_eU3suH=TG=4MKz7cueF!=SDK$QDtoM>Qg7t)$G5|JA_PSRLHLG_gP8(ee};P5 z(L3Wj$@gS?R8X7#FG*%=)-wi7s^aqNA3a-fJ*B?Yz9$&QG3P(%G!-zPGU_23$23iG z)-4U=V#C+sde)9oQ<@$cVZ{nSIu7I+$=5OO7oKJYD8^n9w4A=`D&lPSYnt4k0lwgF zr&gv6H`RwLUG3z3HoIqIgO{N}NVCnUYPtY!*Wi8rNFU9SC#uU3K1kdHB8%U!Wb98B zElh|N4kDY(9k7t@?fy8sRU+%2j*%>Vzxwnb^791}=hgMgk6%>}UymFn;OJTo&m2zW zH?2(p*``Zx9_zc0UmX>7LBFg%V8;&oFl6!i+NjRHK>vcUnwzPwueE?{A}LQ7YSwrj zS1(8y;4*Lzj_BROhp7$4U+{|=Bp2uuExRV=YeX8qUOhk5XhNOOa6)}MHqZ%qN>!}6 z>3A8{+(5NK!aM=IUE)9+5j5kr+D9ef;SgK0lVioYOffK3`in1JMn9|mjOX!hyNY3Bz9l-H7H16)su z2!;-6<wts_9jo^Uo$gKTI-j+j z{<9@+ZT^cMuvq5;L6$;7b2?h3uy?t_WP~f8)_DV{?6o*xGZ5^s8>m^!cnuVsy%fA} zcYDB_2VXGvdn3ROaPiSjRCILoB)PnKMCE{x&~y2nuTj&@g5;0Tb?P9Ve4uXc?e*;j zmcAUdhqw42)T+}|Pd|##E|FcpD(XH78Th&HBAjhOXhbC*0dr#@S9b|r_5Ax{b$O%B z2N0pIYi+G*)q4Uo@J++CkJrQ19_BfharZA)RO-}$^NVp6h!F1Wray#)EhGzlVu6n0}@y2Mh1XQycuIc<-6 zTP`G3iw@dsM0W?eHB^B2Zal=|sQ#8ybTn%sky+%FB zzMhne)qR&P0^A1>2ey45K&x|4D!$aI3;3BG8#S%`$8oq{8By@0%7ha}A6-S6(!0rs zP%e5_-&8RJyASwa&d#0GWFS?|XFs_T@P6{qMZn!auL`qPB-!ozR&Jb+VO%E-pRDlq-N5j7XEE z6I?NXJ3V^Vz0b2r)@+ZMP!RjOHuE8k2ajVt3MwG-OvL0E*+_8pA$eB7B;EQf&X7l+ zdK);x);joN(TK$BOSu@^QN8fWHpZ9Rj#R^s3z7FIo@kBk>!LX&u76z^dB*8dS=?As z1zvQh2;Sd;I<$Qj{1%b&-Z@Y#E3<-0(J88U4kPDO8zB{5jGnlJ+~9SO!a~g5^SOja zY%Der@E>syt(ZC~^t-+h`w*}YpaR=k)^=~HBNAv4x{0!z{341r2i?`y^W&bT>*s6+ zA-CICxo>vu9;p?>jR6nZ5f*=RWBNXyEW$R+V6P*&6`P|pv$+j}FUN$`fO#6?{J2hZ z-fq3~hS|o-E^1{>F}k&F=7;iol}|a@5c@-Nk=8Rr4BRgPCs7t<$i2wRtFE09_BhJD zS+0g4Id_T&qGoS07e>erYjFt!DT3Y?7OOxyRoF2|FlJgJCS1t(b>-k0CI6UP6A0q7`UlZ< z-(r`$42mMTU0p{J@+w)s)TES;gh9Yv4IAWp(hFn5ov6=+6E}OFYt7Os zE-zRkH;|SM2>QZ&G^VWKd62)mFC8_oaLaKO3~jA6B7!wR>(8P@4ZfU#T0e<4;-UkQ zLUW2v{x`2p#QnMKDaLSp`$iUN!+Wi(v}QYMDvRa*LcsUqR{W3`qufmVLgHTxG;09M zP6~{DS{yU^VN(ZjMFQ`LN}qm4M}3&>?70{u<>D36o_nptjL3f!1w|S^L#n*2-oxGa z=vlg4X6`*JNvKxHm*J;sts8#1#3zwedcnmXoxbZ0irrQPihNw-=u+KNr(GYCGP0B$ z64K5UJAuw;y*e6sZrteotWBq92Av&Ns|Hb`c_9pSg|l7e*rf^BKHQo|@x*2recFJH(P27a9ss=q}OlNNVn2tvJ!|A!>CilnF z1+AaFMsM+aDQD-?%nU|JzFKn|VR0Dw)yIN$d8ktE>Tj&*-G2gg3WQzk=C^&x;HR-F zJAPj8qL;pBhe#pIpUgB=LKU4Ad)xHS^Cz2q<87^;%mPN7fA_uU8ox_)vS|@hiDh#Cb#D6(rm;q&bsO)O*8pQ zi&`m+6IZ%#oC`Zb>9bsbO9@)FZ6)45w{0{6nIX35-GP%lsmWo^6%Fy5K=Y`>JUomv zO;-JC780-GHF&PityoIv$}S_mz4m$wT;FtqRs(qKBpX$1V4S>UhS>haBIMH&U?`hC zMZ;-?B#l#e<+Pht=PQzZeJci#b=g7r{Ccsg!z}VVO^-tXoeH;6w71z>RGbT zNdvn{w>(=#a4~C2NdKZp^{*;$f|cl5Ph1Z7UpTy7>ZQEo&IfimiC!GK;n#*jX9x%uT59 z??>!azTbw}<2vbOv&MBi!_+bElc9G#s|{Fo#S9tsOM}_N7c# z#cnYXwzDGK-Vt}(isAXLZB4WN^WUx=cE0s@;s5ho;6ikRf0qFQ2p;Dkv^*OjV?9$- zz7e(wlKqMyVnTLz=GDghC|5tX$4*95dsC(Oj=OVDZX~`Aylrz|!E9ec!|mqR*AsxO_+qv#7Ox?|9`d*HnutGZ4#7^T!M^_KU z*mE^!0rhgha4uhC;TW9;@qea1nd4#)&zqM2DM5qnY~nyrhjz;%uwmw-9NuJl=h7TD z{?udrnkr$<1kGBe{?gV^R7Ep(EBbPun@muTYnbTg-C6q${{rH#s`yJCAjFqW0{7W6 zrPR~RlBIGiyW1&n&i#Fh!)nsHhj!8%(QmF7YDFW9*`&Za)w`_1dA=()K(b}5-H0ai zT-1mYV|xgS8M$TBx`^v`x&x6(maStgn27ehdJ^OsHab5s(NcB&ibPBrJMMDqJJ;DU z{hE<8IHKZM>|uKAG~ZQ*qZ7j4s$x&Pjo)>}1WJmC|lWy==NPW0Y|u`vkWD;E`5!0kktYT}?HATf_p<_m}Wt!@RLGFi&l zBlIY)7GCO%dI9Jd2Bu6+wGDTMW#D9Ck8G<8WkglnY>_TUMBgbz&N3Tg!+<+=kwHsH~I<;@zh1Wg?rTZP$svk09+ef@# zarvn~D>c(4x8fI9UV#!gVLa}R36_mVr&XWNuw<#JzB!#;1I?s#vl1RhLB4NaA>L&i zMVQc{Ut|S6BFWcgt16A-q;DZ;ax|#>PvAEsj9W2f`0Q{iX*wkeL+z3`l%UUpRZWkZ|cqH;K{5jEG>;D;Bh7^?O#=p1z8P3QSN;stRXj zEziVyGv%}-SijX}aEOhqej975Onjt8oayOiZ9pB#9EEbDJEW>mZWLWh(0ZirIP6Kn zI4S$s0yV8;>K4W+ksV#jnxQ?+K3@ZM)EJ7r@zO}cK^B%+KGmAciylT>_z%I0Nxqw= zy*tV&sYO-7Gv|j4n_srbfE}McZB2~&{kvw#+ko*`YvVoz; zh(2fIA?$K?rX*~iQKst+pGoConCeg|Zj`|Jr_m_n&q(jPQIPH}2r}tPQ>%U6D{{~X z1BGt%Rmhj^JI1)`%#9*Nt3Aj*uY_~Am^(WGucv=|xqf1j0!4%svU4fbrxAlMhYei5 z6h@|3sEPdk=xY;jJ{y#6lp3shq`K*0%R#k!l_F-q4kfYb3~DHHAP4L`!?jpYRhsG1 zX*5(@w)pZ+e%&j_5wb z>I=?RD$x)QH3SA30nsSSmu`_QgW>H_*32i9wM4RHto;w> z8qaEL^-I%;~4q;uBvN`WG$Mdl(L`IFF8cQldAy%4CAR`Ch?{S4W z%S0&BQmqD7egip^(qUA+v5^cR@XCTH>)A6R*-m_AGR8|E7)}*I@7uDl_|$5MKYe@? zuP#W9UG?M3FpOqf)U6BHg}5xry8eUg-B;h9NxGt)W+$y;{h(?3oqGF4c|EH%ml=Xm z({dq7D&NbWlV6QcZTWr%Bh}(7maPt|P_D`5w`m{5o-4ND%^%^Sa=tNqx>5<7n`N5%5L9%|y|EAI`8mGVrvX=cMAcIGz z#of0%=cQ)9B;3Xrh&!$CjjX#I1c~Te>Ge-3wtlbVG$)sdIqG^&)dhVDrq$pfr0Kwy z#Qb)jRK^+|<~=I>*XRi!)-prd^9p6H8-0_8H(P-Cz%*v0BY9`^1P_B7v&*KSH+&L< z40$r-l*qv#$AYL9;~|Lkq^bl*pA;q3AbuL+3KA%t|NNpF7&1M&bqgNUUk`&XXD~V; z2m9GJGY<-LPx9xn9X+KRUSwH%OVpMi>Sp&;X{W~aXo)EKiroXA~;cn1r3@!G!%pX))TxjPABS*H(yv5YRU2%su!S zFGp35Y&N8nyP^kjBPKkGD&U=-JOXJma=J51pB!)J5}LAc4a&O2(cmk2QNl%TWoxYH z+9CQ%$X0%JbBg{lH)>+=xnKsSA>oT?u?u=S362ath~Jf2Iq+gwMr z2*lLX2~2OrhZioO>2AgS3qtBy3zTYmv&F(*G@A+WqD}ku62^G{>Z6~sofv2452#lN znDO1dH=fDjqZp#9cBpi&Aeoi(B<{3;wCiOb>ZHiW2gTo1E9zGo7O1$0;w9CjxGe#P zlaqVURs~l6h;+%O)`B8WT7#dsWTKv#+X^+GB)8#`Z595RPYCo8j9 z+2r}{GolH+uL{OSSYJLRAk_pt3S9w1BY;keapV^*pINU`);&>XR|?#hzi>u`wdS1B z_&cgP*#-XjP~#*n^E5*r&e&eJYkrVw=j*z$8}C|)TD9r+%d;K?OKLE(F$|)9s+VA2Uaw6H}?7qsq@X66z4u@C?m=FT01Gdzd z4N6mmDnW4$d3W_|6d>JGR;CE+PbG70Lsy5lo+hv1eVtWt;vOBDX0STJ(Ae-r_}b*T zK9}NSx$4_g2(H-$NS6W*kN~I!;tVjUlVbQ>lizwzNVs!H$D#Ygcjw&N^Z<1yj zmtUY7D!AO1{5y{xH9-N{SR%DhdkxNhAPr3;={ICj0Vt+2WcFR#n#N}5L@1fc;-md5 zpx&hDqLTEZ1G1H{g6TQ0gF?A}qk$?(yfn9)ki%hvaS3s}yqY*WN2|Q7d#p^Bqd#?q z;KRnJvEcpB5Y}uS{hGRL>?IjI@BF%lJg92VOZ5NxAk&Tb3gZD6fe7Kv~5(WMlEaDpyG62Y4lHj{tR#v zFuPaeA8GUBSp~6pkQf_=voLg3bT6x{XPTYcC@ln$7H3lQMS~jUs5jXNYO)}E0s9_G zOX@_q?ECpm8pxW>O0d{-5}4`jywh}LY$`Ic80d*~Ygqaaw;4F06Px!Y3^6$7{P%cZ z3E7r@8-!oUHcaE1MNi#AyA#U-mTV1~7<(OOF_c9GnayzDm1SU~*qKA8@QWdN?+}8h z*-``boeQsJkPagOJ0|OVL@y_x51Yl9F_Og+3|b{7lcyiC0Elu1XZmna73#@$U-B)O zX#FW6-Jn`6IyC^60>~afe{#392P0QiMQ$J4W?^fZv|7o&s)N}T4~~|rU{5QI8QIK7 z7hfZD%cmTRJ4F<}p!IesK&&R6L*nuKU&y}Ls@)4ba7X{Ly}*+A$0>ZmTO>1 zx3Z$I7&5>J(#X1RSdnt{h4+ydx{y%fwZmb+)^AQ%ejgoFT39)S$zn84Y)sr_ZPavn zHpx%Swp?*M0&Z?_6V70K-)Yj<9}Uej6w5|6%Ey}F_7~6(*P&(iO~O8*Xm47pxGLt2 z{R(eLN88_S^2!hvc}X_u@}R$)W1(KRwU9jEG5yMYB^cVNTNL$K&~wrx=&X z&-y3YTC0dJ(z>m|EvW92Wc`P*JiCdUssX|vbsuoX2-t2VATc--f7QpVH(GOSSB+o- zIXKd2r~uE!FJcI`1SiVzJFJrKg$SRP&em)OSyCnFJ81mK_VA%o+s(eW75b~%p&ffa z1HxsRl)6D2Xzy6f9UraKq{zZ-M8}4{eclWg* z*CZ__mVLjPKFP#}$lvK}w*B(RWtayXAb+esDz%sdf+DVx_6L7lq0foOsObipg8!jl z9{>tgi%_jmi?%6fZZW1plvejCg43E$3tcGqmqeJ#>j1sa9W6A97@rM;5Q-q1$4Esl zzUzb4Nk+AfA3BY!8Q^%}t1(D@(lfybNtS$Rx*Z>=6^tJaex5HxF{L=L}8p*rbkr1x3tyJDnD z=-?^aR@hyfLPW;$OK9dHQe~6CiViGKoytMp-tfFD)}n^pg%E|ziZ#}L%M$YRhi+Z< zG_Xop6d3W0)(tQ)anGEO%NN|guo5pni8{?i!^uLw-zeYN&zarv#qD-%FX6Ryk%%RA zO)x17=2%Bx*>0`1-;vm)C=cR>*V0Q`)SX#|h5r+t@-RoaqP2v&3!)Lld_GN1nI@$G zt;RQX_pMlWN%rY7O`^9O4^Cq@z*jEVHoCJQZZ{v;zdy$? zIQiSVwT_Q-66CU)YT5R)v^+=MJCOdfbc((l?jC!M#RN}4yU}^W@~r9P0x?Te+S<;L zErSy7eoU8318;M$WOfGj?7OzA3Um6BZA^pl0;d$>8d4-JTLR6PzhO9hmssL9YYwcYV zO1KgMJmcC82+@L!ogCI=@cW~;bk_5<0S?xBWUaV1+Z!|KnhePc6IHRBJIoOk{9CFB zow`u>xA3^STk*u-uWtO0RV4wl+m^L$PnYG%YyV-j!|-%&+x(fh|0=A)z%G`mb7k9r zSJ(^`0q>>=0fn~vR1eszsO_^=gYD@AgoEM-%!B6E@5)aCU19s>zFLNCT~4&4I6IQD zvKEu{iRVCKO+c2+6v?_#`<)7c)bIu4VqaX{_@)Q`1;U>Hn1iSSAgmhFUH1BAc3T-6uaf4sH%A()hJ^@TnSub z=TN}*h1V?8w_Hz!L^qU=9v3QRd&vkq62SBJ2`4{#V@hLqJXU$0xf2@qb%SR5BkNF&PXs?4Fghv^1i5d*0yVONrw&`p^ju^k zfeHUWy++62PIceK6BCXI2-3dX*9t({Us^C%eM*vNY+FdAn~&4i%2 ziAM9=o9rd*r?@_KCPt5VL0ZtBSd@mM6MyOJ-!lH1O3aaI7xaEPI$aTFOLZtkRWHLf znYYxfX2|g3oK&6@nYIXQ}elCUghkpH1&GYk9GeRAq7 z`0v_|OrF9E!XVwtMz{J{T+M8PF>U-gdu&$RQtedV(Vo>pbf1sm_=YcLv{kxKgSTis z*D-!5rh$t}7zb3sP@C>t;yIe*Fiwrmm0l&f=GcT3xugXRgYQ%Ua8wMgnQbN@`Zsg# z{^&B@bmqNw%i26_ttz8 zN1YV<0(XhVUbOUP10V)7`7Zr@W}LIC*YIpEUJw#R!3A! z-CRV4N+Bh}FcWWIh)j5Pr*`_%n>C#%mL^2cUbDFD5HvCBA8D1$+EXFZR~jW0b7Wi1 zqF3l!D6`XB4-!j?k@N4rWKO$`yR@&zE-EX{Ae~F(K^Dc23&kgu84RQ)^5b|l~N#gv5Y}Me8ntki!#n5(Tjx4R>$qQ6dMR%VG z@P;A5WWtfKc!{E`-g*Sq@ByTR&F?h{^g(h1{V_F0R0Vpg_3?*lQykw z0iM*|w}B4fBPHJI_kF;fejanxHTWs0@VC&o58=ztBmvk(Lw~*#DvXVHz0(& zX0XH4g_Nkoi3d6RNObZdeTFTZ9XqTkjI5G#PrFjH7zro*Xb4JPWa3&qCI>HcMNMET z243uMJVqb*8>6V{?NsvsDqXa~^|2~xK5KTN?7WAZ?c1Wxy=;21!8?zu($Y)WFrf0< zJ);Bj$E9-)eTy6Xf1z;k|Dte(XxLsW;~KmmR0`raN-BuJohzZ@8s2QvD}9qNE~;HT zrt{5Utj!LPLqz-k2aYRmRUb%?8Y2~HH>e>7al;qO`(Ib~Wng#b9JHSoWkGLlUHTW1 zdyWxeofCgA=hOlaxe(+C<=;e3Yd_qk?tdk6`|$q|xw{|HroG@A`X(s>Lq0H&eh-J_ z6+fI^Q++HR^W@H;clSW^lQ(H@WV`rm%gGVi6>FNulK!}sFf*IddiqV8$IQY^i}AKP zyb_*PpqH#W{c5x4S& zQS*_jA;N7^wdRn)7sw}ewBJ#H*OBx?Oy$ySY8xhUHsR6 zIy)wQ^GV>-n}{^+!DpMjH&=~wjwO-AF?}W5+&9f5L>hs+HZ`qG-t~M+zok-!urO5V zV4e8fs~pjTFJJhD>Uo&jBsXMu65s)<-XpxbC5>?~P~)K-4e zo+s1iqUx2_>ep1FJr8bN1ktP-JGtvEH&7?(of8D$KFUWm#y_mHm`Q23J`yw}R=2di zcRu>h91rvl$D7^Q|8I~tvVCy=hvRJ&QKj{>HS6S#TI7Xbj_Y*UtL5h?2`3C9qQ?Z^xFNoZ90seW z6B_<%L2Vk-{0TR_FD48QS)OJ*4?3RIats)8+h8y1k^VRej0paSX0biA8cx3$3aD*x z8O&_$%UG@{JTz>nqv#Rj6!sumAS>8Xl*y8u17>I0yNLW^XaPwiUj_+avW2#Ord)|< z6Z9y*@d%$pRR^GOdgtl6j-0yeI$8R~ZL@Ca8k_B2&gF7X2nKCVyAh)yGB2IGAN$Ho zK3=A;vtajUEkE&!Kp7DgRt_B>{Y?^vS91DPfHS|_Bw!jfy+f*DA6dJf^I>z<7^=}a zmJT6A|Fh^>AOYiV8k&B4dw-eSF+xLMP4Y*Gv{2gWA6_`XWd^VJ`GZfO!CWBd*e~Zo z6&@aSNLQvlQ3h*ob^KQ>JMMVt>k7cSJ9CsTIU(sO=O|m>Y;C)^ga-4 zQCXPjArEa+vqqx%WwCsZOu!zphyr$(?PJPXE>&>Z;M)n)j5PEjNJn_By8t%_oES2u zNny?u*2ZZJSQF|*g%KRV>QS)Ncduc}5mrl2POFka@d5_YAj+KJi?n}K+Uk%%T|Gmy zHf?e27Ho8;MmdHt%=%6c3;`F5<7|rt*)sZZvqe-&K%e+TcC8wcYe}3&L^nnA7PZA= z{4EyKG%)UKcEP`>V8Bh6yUSkS zpM=!*0Jw0W{^fuC<6>w~E&vDtAG+TycH98%fZrSc`=2|K{l5br+W*4bI<#8sYBThg S;mY>6+E_YTG+n-W_x}MAKZ9fd literal 0 HcmV?d00001 diff --git a/img/ScanPerformanceAnalysis.PNG b/img/ScanPerformanceAnalysis.PNG new file mode 100644 index 0000000000000000000000000000000000000000..4ed9e6d5218e4f4ee4be5bf27f5adc6504943984 GIT binary patch literal 45076 zcmeEvc|6qn_rGo-A>9^~qP-+ro5~h$2^DFP##j<624TiJgyfbiC4?d&TI`HvVwxn` zvLyQ;QI;`wWA{DpQC;0@`272OJbw4_xQ}x0c+KnmTF!Z%=Xsv(NB5O@$MV9!F>pS3wB_;|d6Yy0lh6J8*OU9zqi9+xH0<6bW0^0(TeO55rE@>zby7$4#~yb=hl= zkK*Odz1$j?Hq<`)%A`9jh-6kZWr94`kAL>ClEr@ho#n{;jWx*Q%>46#e+lvD-&t7J zy6f}(`e~!#3hE!fe!9%%hW_=#jk^kooWFkZlU#~h^y{ZR$DX-S8XI*}F6Avtx%PP% z%HKEjsM_k$4O)8b^nX5T+06&^i5|%d4sm#4P{ynT;Z6PbP{Ct*lbL(v=g`T;-8* zTBk+?^Xvb!Xh&XiLz6u+G#e;H6>@yyBYk>nf&aHjdQ*(X&eh926eDK-cs^$JYfPe)A@RPf14*|>JP&kk}SLl5vb<4ViaT>8vFPT%5l&^Fx=CJTrL<@sOo;t!f(PEvuslC5U%+K@sB#x(5;c8l1+HEo|2R}|d z&7yI2=AIpW?6Kj$w~aV;>y-zJ(b%kS$jF> zTS#xb#GXAXZ)Jr_SrF1#npZJwLfg}iZQzO1^M?^ael0b|I;Vz=zawP&*3 zHnz5&WOHR@b}(jVej2m69<6RnKi_AAL89X@RU6D&%ulm6*M{sHB9DyN@pq~QhlE^O zXbxwD9?9Vkb75_J8aGr*vawA3G-Y!hjpsbe*B@NLMufR=c7kbi{%IO{asPEcWtHrx zbaQTZimR*Z6CZF3JAN4SId6<8-g30RO`kwFG`Ja|~&g0y%xYmKpN=)A7 z?!NHQWfc`G*@!Wx9w(>4z4GwpUY;J3t9+%Au`>sCUzONh8hm!>R-chj)<;V$JAAWgPO?P+q zev$L9Bwn76OWr@4nwG}e9sf!X%*;WT1(WJKdgyyB z)j#0XRRjRcbzVHK(H7U@6bO2jA{txNgYb_}(3DkV%d|acvgz z0sy8h=DB>sG9?OF%9!kU_zmQ+?gz!Wf^8+LEo z-vLCk$ZdkXg{2;KPf6eZXi7!ERZ3#+e;}G-+A#U~nV7fw>Cw@7RUeV_z2g^NWAd%d zZ(J9{#HlrsC}}?A+2^xAFOqnyGIe_2*z-|!bDU}k#f-0*=L&gTqb}yKLdS|M>Tv>$ zJFf}S@WyXMiD&3*5iURE`5$J21G@${Up<_VR;D7l6{AI7?(RG+mT*ZH~K$(!;`eM{aiRl2M4q6 zi3R5`ir-i-_F0Ia)2~twuUri7K&$r(ePD$s@Gfof01}h9XKV zxLayM`=q2i$+S%C?`GW#FF!Y4;=lpkTUmgFj2#^62)h@aP#HV8m}Tyiz(8IU@!&%K zG4__+6;o4Acgoj%-}<^d@d^wwkjg1W!G$Mjr zB__DgbxCeySN0VIhrO76c}-p3g8Nl9t`DXk?84aKRTR-~q38WzC3ND{sap~84{qP) zs89Z1K0oVv1+gu3oV980K0ZDYHL_N%f5)NMEx0(DJnIAma`n>_5=yJYxEEZUI`-V$ zT!UDYo}LAGPP3wlgKDCrjAap=PfPAHA%A_@hd^w9EIAI%z;dxOOD zKjT)2)w`z@dB^qPOjdsW>_Mirx5r|%7al!{M+}{{`c~G-lP3uWHq82lIi36EB%b9P z4zFe-qS5I8?c)TCCP6kLpO{W!TE~J*WBSkKB0Cd{7}#Mx5AkY9!NS6VpGvDu$*+3G z@OtxqoIEu-o~utE>HA7JAiLnhJ_nEW=y^f{k2{4-R;a(f@Kvc~pPZaDh^;Rx(>D!5 z;QPEVPP@}RQrQ$9x8{f6Dq0BOdb}EhF!eHm_#Zw@6#bt*AH3J( zt%eq@L6mpF$wc!UK79DCKK;YJEd<=?LT2ADK<S4gw^9j0-`KX1Zn(OC-wwEJ#hN-oy*oan!I+BQARQX1@^iaG8 z1NOSE^(7|`7{YRulC+q|?Fgwe7iuEY0Psx4-QuSXi$`NVa(lqlWx>GgFjh=1^Z!GSa4WD|a?2n1t;zSj2r|L}@{ zZXc)yHBEezGDs9@+OZ*};SZLn(`(4)GOy7)YubD$ZRDG$gl5|j;1}=WTwtPpL>qLX zh;0wA0AdBt9!(@(|N4Go?8#lEkvws?1Qkc>GY^DBn=9IM0vxVh?U9vS8c$6}nIZeF zdCrk~xP=_IzK3vi1+w7u0nJTJa6D`H_!0BV)?`|ktRw6hW37$d&>WX2=#2Q)%k2DV zAH1i+mVk+1J^^?9_%PVP+pBi`9N`qw8tF+% z`M3G6-NMCBH`U?X+xL|iw(hP@^qYwB7Sa=UK6cnL|#KLMp~ z-?0Ou*0&IBBj?NcN!tLrwxtEGT*%8gQQe1!hrx$B493QZaxXab8g?MY3=->`nk-D! z7zcd5Xf|hv+}y0s-&s{%9jYT{f>QloexvVe{tu#tV6%|%X+{ZAYg2XrM0Iq;U>0;k zuigiQ^>>A6|DW3!-^&qiqQHJ}=3lH+H1alM{zRpuneKwVuw zl5&_YfO6x(Teg0<2N6+wdtzGKLZE?FXclRPLJ2N(%VTYXPMkP#6Yyz4fl9sR|MK}? z+!@BSXu+csKywFR4eqHtQW=98T?ivZRN%|X%7AQJym&Ezuy9~pAeZ4ZYW2_MDqMI% zRCfqHfPh4yLgPfm7rN|clX)&)ya>EOdU|zL;DR*3uFveBK7D#dAROSr<-dP#re;6; zz|D1lA~a`HX>ALlJk=ouEiEm6s-T=xO;zB6W*H^~*!KsQhH_r;IQ%7a_wL=B5$0(0 zt2LW8aoC;6xZ0->(1aQ107?O8l4nTK4la+>(rr#!~;uZ`p9}szP z)S5rph$f|BX=woqf>f*Fe`x7s(f{f59iL)yHEbcqBoLApva}i>#jVrD? z&%5B_)c#_p7okwv3oee9kWP-%AujB;<`RIZ(F@NfUd}I$PbX0XwP)lH1o`b*Lj~6w zvPR_m!*egs*M_Jg!GE1jZN~LTOs2<#kEz<&WHeK%w5`)UFfb4tGq&?VT$~JYR|`%g zqRyOIwImr%^=e%zVu0HNHNb^st&qCI=}6g- zt;Q5I7M8OEj|#YfLcy3$GThd=wbViaxA{wtD1EI9=7}Qy>waa-k;J*GWMNet1jqEU z2&LQkXOmfOxSR#d+FiaeNE+wGv+6MZ#?Q?6k4U+0(nFI@{_x>An8eSY;Rdg*Z(&E) z?fDCRz^moi*669jkV!(>}jK=;b6 zt!OR^g+hLuZN+YJp6PMW9CmHpELPJ^rKYAiGh&tE?xe28OcN1>a#zgFfxJsGDXshQ zowe@h%$Hh(OBbk(S&(#RLZe4_y7%?J1Sief^EE{Qaa}@@QF;N)IW-VQAS;dOO_JZEl%!#6*)0?jOdgn7CLWxT z3)=olh*Ll}-%}4+TnYytc8mo?Hb0>+5+kRMbLP?9;c(uHp!sZ&HjC3c@%YMqEx{?g z0Ch^iGDa*y?1mofBEkqLqDL>DTtS^%3gB^x{txSak;LzaKWUdvQzm6KRsC^89XSUD zDsE_cS?V>2aOd9s)kGcwrkFnBy^B_7I9PMh<>BavyF}%Ni;lzsZ=StuEHcr%dDI86TY8x%u#(1(5YT?z;9RBm z?70Nb;q?6zz`8hE&PQg2wx zMwD`>cXvR5!viM~HhasuENs_{T@BlqKFARHORgs{$S+?ORgL9V5=k6NlrJ&z7N%<# zv*c)2K$1$|53w4QsLiexU^bX7$dH1ywKYFAOxpG%jMFYo0#H~@&(WV5r$3IrbpObl;qhhOs(-Prl&Tf{2|TUshEGCo|tyC_Qazqhn{5zHMhsC}EgED+*r&KVAtQsAO!Y%Y72d zr%1Tw*Uf)fNLN^!DW4Nq&z`CCv9h`vQZ;QwK^QNghIR~(5(Tl(nH|FQR*Wq8>Pa6o zUu711Q~`be{{3HavR00%QeK>U>4xKQi$*IpzG8}`in`rFkN}&1!A0_T}pn)13VT zgbpZ~S^$w8>?@xr8qSCUhJQ%R=E|bO@3+1`z();}!`D@fk>8Q4fm~?Kv#6_EkUPC_ zYIZ`Ug@L0sQ1CGs@ z*_X0lgupQ3z=ei{oH%*X8$qbfg8cjv2>zfW{&8*&vSOIr+iN)!`XV?_(|$YAWy7s3 zt?&utK-e@Zm$Bf zXVD~w>SEs_o&eNpQc7=;Y2Nfk0yiV^tWrR)36RkGL&x4Sgqk9Aaq;qH0}0N#3V z5(xcw&BvVK@zW<7sy;GkCRIV|;jPU9L~Qis4a|_=g=<20y}dmm;wOZPlbhms0Oqmt zr~&y*?NufiF&F_&f_{1kQzIKaJSz=#KJ0e76g*J-EsOfPAIk(NbfXSI_iyvOX4OH1 zYuawij%;8Z`#8M^%mgF(Tew$eAWx#^@!{cTap+$=IIXyCQM`=INx=t@eXxus66*zn zneL&1Fy61I?F2vse(Lh&%i&a5ea%%u?)1op?NYKvDBhWSz>VAXIzV4*xH@h-v*Q3m zl{Ga9-O2)P7nWM5z8}JBA5vdb9Gbxjz$q1RjW^5PxX5BZ(`%#FF)QU2w>$Ip>Ak54 zv-~*6?k!>Wa>gme_@W7yVg(`egdYU4}g;mzg$p4YaejrnWCOP99 z7k_$c^qfcC+HpAkVp?HU>y{x}+SmNzY63&tJQvSokc5{wbZ9NYpl{!9re<_cqMDVs zvR`ZsV^VEju7XFgrd0)O?)EyWtMrF<=Hi3G4+H!Tu3N3_5|$v!&CH7?*2o>S{&s@D zv+yxd4zv#$8d?NHKg`;|Wk_&b*x#Fa>HcbJ7R#Y^tBE8^UERe*j4;`XX`FJwOvuK@ z24LR3Ey`vXqkHVpnfmmLxk?1w)t??cV-o9iR5LMj^+sYMosTqflzd<`n<37gOZoHm z()g34BuOsZ`1m*m)Ar${dN3gVrS22~A;Dx(wV42w8^_MZb^?RJte_R@5(T8pi$$*f z(UiwVRE(H2Fu$`pQwhRp&}W{KdDhU@5K3CAL@83oZ|ZxP2iVj%*(^^ZHmde}pfpZ| zNtm8DNQT_aTYcN696T^Rcg;s|;Kp8OgJxmF4>We`Qc9&P?z^cP)1Y|wvmYtDa;~SE z#av;47#Ozo-XKbev0S^{&7%ane6)W^JRjdl|+$EuO857wAUM@bX#4oF?W5uo$m4R z@e(xwJt8GwuL0hF3CAB;>JY?8t?Wr(uUE`v#ee#cD_5 zE}~VN>i8Kf(@1<^$dww?rHHOI(HgFg+nQ1xzl32;i`(`RwlR|K!VZa0s=!M%M3pd| zg%f^bDWBDk1Yp}kJ4Yq{PM_C0j|6%a;^ZDC@PJF;0|NEbqr|j;ubnA+9PX3?nqGZf z8<~IS-6iAY?8C}JgW{#(cL*xCTf$DSQ4Ui}gYabhj9ee7y0Y>zO0`r`PH|#V<#yKJ zsMnMLtE{t}z|piRp<*u&E#ATmu`B-D)=J=qfoEDSJG|CEG%Q)PXiJfyzVQK*#zWZ8 z`)JcUM|R2caR#{_9S|LD^hoP#KA+>uPAbRcJAtR~DM)I&v?)AlYDJV-!5L?%m$gsC zAMCCPa81+DCOKzM#xak0mkrHIpA9nS$dwD_eeckHGoXZv-e)O&R z$8YD{dg)bN)O6o+e;ut+?~#YovTTc>HShI;^le8I*C;_jLuTw;hDI?$?x0ocaZtM1 z*woY&`h~8On!&^!UU~Jro4pK=(24^z=qYr;hi7t<2t_AOmey_gl=RRK8?kX;BW>qu zzhxEY+j~?|gF_9J)c((tC3YV=b;ZZRqQq{GjC+r-cF=Q6%~R5GqqHf_qh!~J{oz4O zK0ZoH3-~A8B@hmtb>K3y?|M2~;lAPaCQN2wb+t|Kw_&+)pKrH9RA5OsPDDmreNEYv zzj_&t^{PK<1GgsMfb;tDCa-DH=eDkvk4Bw%$HrAQiIHjjGjd`+>|Dv_b}Y~6>P-i97o%* z+q|rMc5noJK3W6^C89dGG=CCL&$@oUZYeHEz1$tm&6ISA`jzuNe;)#orAp{ppG{J7 zW~e(#(fK!cY^jadX@!B#(C<{~ERQAr1FVT2ZYa+Qa~bNIz?F7chMcS=Dz2u3-=o=x z1EZ@ueH9Z_5{XVTsl2(_m-}N+va@9&AP|(jW-5|9@-YUmT3sN!8K&<;3%{Q%&PbdR zmBn^T@|nIgJ*Ew|J*e$myh3a^(}Ta$XGa+XAw<3kUDHAmHx3{1G_i2wSp`0ht z7j_?Y)@zI$&%Oj+@VN`q3B<$z#dHRaqEI@? zGHk6M8+=x|Kf6AyOs>Av@SL-mbG`@blRM}`Mf2&;2S&9pn(1Qp682-{w zs`v~x`&)t_ZJ*wh3ei59P|c8;Er(a;yyi~8LjTyejy@OFZ0;6Yf`)_MT zASx3JlgcBeld%HoEYdB~)7I$Y-gC)Mjsm)Lpi2=9Mt|^`M3)K!-#>1so&hRm%616{ zPGpp{?E%zs&CQUl=$`SVvNr90aia3T20`|z{kHT;^DH~mRwoaAzKyFMIQdeBx~kb7 zJI;SX6PFO@uH@NN_aoKV*8)549RHaDUh1oboS~^#?y7`rAv59%9_)0BCwpKnw25N;;gXKx1=V*+w9WP)VHN58;4ipj~ZUsfw5^O+or?npZP|%cZ zZt7L^!laR-rz+0A`|mhbmhhcw2uLsnXy%4ldwJaBRpaw6rlfTRJ8+lJa&qLjaGsu? z7*qRTIhWmBSWm(6y9Fn`>jPV^_fuhk0J=V=$P!ptoSp)OY?irkyaS?xT}yG4Fe1It z(ZX0L_82oBIqDy=58*+0(lceq|4@ha69^d}azE)Lj<<^-*hU?Rg2-_13)w=wU@SPg=YF@g7@>iv$<0 z=@2#m7z<%wF+u9Zf%6AnnFtuIWHYn4V83DSn`@V_??LlL9?8e)@C$OVge$? zixdbUG8(4e5`nSF=}|L%LEg~sy7ItO@jz5u)u=dr2#SrifyR%tR=8g; zsddv;57p@5Xl=2%w?3TI@LYfgi%ulEOglTC9p$GCPJ%c*=es=$B@>ShKcMH32ppaC zDXzLe)D?O5Wt*mLv&^J+t<_rnJScfZGUmCNK~~LIJ})!;uAZVbqXKR3hSXC3x=oT7 zE|;^Z&)6PdMI1SWggR|*W@Nv?O&Oc>FM}hQOIKGapEIeLYDUTWEO<`PSb5Cd6T($n z+{hO2y&-jC9uYLzuI0hOwU02FN1x@tq2A5x`?6{8_1?5*Q^_7&Aj_Gz+65U;OYiN% zX%Nr0PX4vH%<8+vb(d;&x1wcV7sXX>8w%zmZ3@1>glH9G9!M`kFLgsxywaq01X(+O}){<51a2D90`9I5$=3= z6*UR_#pt;N=jb?s)1YrQPj%#WA%{=xJM)7xE__9!%YXpdc8>i4kr+Yp(vDEX|jJ}Tg~U}*gr4) zr}9&7+MS(;tzK*^D-mpFigJMGmk@Edd6h|0aP!(b*FtKRNRz)3??VAb*{?uSLVDHB ztoWX%Mi4j#Hw-?Im5>#CV-pd@dFCtm*-d{jAuslaV(llE9f(X2&OfS`qH$!`anzt~ zgAcSggAgWO1W4?p+Vs52jp6TgM8_D zdQ0Mes)+DYP#OpIu;!;SM6i*Ok;AbS)?d$>@E=<)d)?&p+Ybj#4-5jLI628qI1sMt z>k4#fNJPGzKi|hdBG)uE#W-1H+PwS94PiYr8*iNY=5^+UJsCI7E1sZd-wYPB@n(zk zx#GC8|8T!_IVoW9u*c3pLDuKtJr?|-aIJNRQzC<-D+6DE2l=botWs-VyrA}!IAp5?x=Hxmqm2HQ2CNCk!(~f$-`A<#Q&}jG}>+Rx|h}aS% zER@-2z@W&^?yky7Jgw{e71(*_p4zuQ0fLL=8j?i9^M~4r03R-fnd@M?0 z@0hjb4S~sZ3F_{g%ADCgFZGHYSL@KU{o}a#2Wy<i}^*u*W{Ci2TG5he90`caCZ)SXuSl^R)Nb`RHOyPyBt^I3n&>s!R4 z&zk`4t3`h3QueiWeLn8kUk4!H{&`Q5#?mgoYVkqcuL+_VB%w)x<@PR1h#tZn=ebrk z7V2hwW@EB`*iri6AP}$sno8ci`|IdfbLT+QQuhxdjq!QQ*{*dyoPdB|-fw6Wm5$-8J#Cs`6U(1ptBa-;3HM!O!&j9C;GJi@La z47@fCYL!2_2wNT%d=hwcDGn=sZOMN_&3wJ#3T%YAp`l@-sJjMDj=z2iLI$7gocC3{ zzWyf4xf)u6VFxU+Q;BD0ybKRcD_&=9qnl{DdQn_`vaEY$z6ERPvbjBtC&?^RLrV)# ztU2j)#3TGh+Xsb#i^OWO!H9!_53}nA$AcdX5QGFo_ji^X2}okQc^HH!+@v6B+YMsP z%;1Zsi;0N|0JhvuAaTG5TL_)NO+!6$g4}=+I59Tmm|`?t2r@3Vr_gDl9_7x6&xN!B z-?w8*x|;+CjXCCYM|zV}TefVO5g&@{Way*JPk)r-r}TD1h+yaF9w#aoiV}yTlOEPd zNE?WpTn``tn9AlUI_s$Jc2y3nkqc!s%rE_H!@al2*)@9og|uP08G&NSBSzJ7>4>i@^z(9gN$e_6yiBgM}B#E^EHLy zH7>5i`9-n5eM+i7WkOb=^ZU5{+!W$;=dchzKScDxNTA!cs_--WEOs)_aJpQ?Al8wt z^!VFYYn*4U=*?kkmN!~(jNaeVIOldD_@I{uCob}Xd!3G@2-(UBvaOL)-RD=#x$mT` zNf5~}3Pnyn-Hn%!HoWCBTniARud$vGVK)nA5hZ4x3PABD^9WI%)>Dm9)7hIuS5~M# z?8@=IMOfMLKHb=l-sYpXYwoXdm^2bc1Nb~8iD-xWg_VVl`vJMgVYjVabUScau)Kw% zz6te#KMp~yU>NU?4zaqn&0pdgG6hJEQo=O>MEjss)WWD^KYFb&{@6y0X0YCE-%_Fs z3(t*Bb+`Zo=fujNlrcWkmc;Fut3e+xwb>@PS8ox^^iCC!K~x4voO^p_R8IG@FWUab zMajQ=g1ZRDmhwATiT#2ZhEf|&v=9DUFn#vrG*VWpkq&JlKu25?nE4=!2M}C;;yym= zM;$ps)ScY4u20|8cyQnv^KUEECm&1=M}z|R^|b`=e4{-&xl624v?}-0GS+mOPjfrn zC#p$%?n-Xb*o29(aX+mJsddAsJgdY@^bOyFz@5^Ifm`!C-qK#r*qY}x=ehF!azk<4 zjr>5#5X->Y_7-Z9DPOvrb(!qQ*o3c;JfK~u;@=O-8p5_<(8GN;AwelU#2AFMJi|8s zaI@S*1XAJp0AI2=MMp7oXyvp*^imE#$_BD#i)%8}XJc&Z_eIT}^4AKQhT^3aPzRnK z6O&I^{pZd+Vy{d3+;73?+0pWZJ2`KlkJWa7h{zfU$l30w!=^9ZMnTQB!7^@--YdR{O+Y#|-VNxigoazsT>~ z2c>_SPyu)u#ry0mUZVf4EF|p9wM~GL_?Jtsm|5LCH+1c-O;a~gm`EvJ=8}+VU@VNVG&3fsV~gtoVgwvf=bXiO z?^~!0w^<&#AR`mNWp-%Ol_WPx6w7B%jn8y_sj=DudUi0t+qh|-;N;{ zSNoQ}7X9fl{r~iusU4PQq#P01PkH%iD62(eaQTf;HU>dN9h{eUrah2cicA=+sT;hE z<>KPu359~PQ0RaV0!pwHqaxlE(0y)eN<#l_*SxDq?ct7yu!g*jzdh~y9j)$)lp98$ z^zkMfojw(@QS#1&EPY%|0^3s*(C+A6mG}5&dsimG!^hf1%daQ$l;$Jr3E96~N>8`j zdPLE`{XKOm6_Z0>rQ8_4+H*ahtlV>^+>}=en>|LNK|7VisS|Egfl#6I8quPUq2lE(a5LRa z&G&*{3$Bfsc{b3RO%5;WtHgXEbn@S!9s)JruPq4;3^z8gzTB@AFaDQ%VLQG2Yvl3A zoa}8iSM+xse|*!c{OkSW4)}s5o#x=xPu_Pubs?3Fui-r`Hf9@{zpSOX?q(Ey%~4(H zvGKP~J)#_k2pU59P3jNq5=BT|ja`a8n}nxNyg)4n+89BGudj|aZHS6&ElEJ7i>(M^ ze{KZkYR-t;2CAa1@W_e;|}+NivquTtf@Yj~^PeMb(>?`|@DOWxnUh3_jn?RR)IdsOi+a<=p9 z4_%xpi>|#E8>4e(>uu`Du)FReZLDHP0BnZADH4@FxolYP5wB6RohSu5zv?C@f{0gqsb)oiU% zTT{kDfmP8Iro~-8OJ{Ku3KIr^3qm-~> zW#uuwqSNjB_pYH99^l%1FExRtDYvg_-O);lU-d0Rb1elmJDPIUQf9N6an*;@a~m7Pm%ja-as zH`@T!rVxvwvX84>_^x==lXGI3hhxR|ZBvdahc3Q8r4`@s9YWhqDKVIT` z^Jjp8_I;sn>vV{6n4CdTCjUHK=;IA-6{X35d z0YN4LIAN4njlo>Ma*YV4ivCLavaxYc`p{qN8Ic~-N?4P8Qgk%jnYNFVOGyoK5C28f zC;r375$t}zFQDp|?jG-WFZCyGoW=wmg>$>qsv!l(`Ly2NM2xl^2s#-gy;hF+cC;)= zo*b(u*-Jhr2@s-xj?1I&&tgl-!jf|i^gqbBa!;0-M33Gi4?Q-%efu_3WC*d_A&7Oq z7oxMoAh@vgDhS~Q+=Mza?HVU4_7l;Ge2H5^OwK>aL<}`amq)acf4jIwsln1CnEI{$KRk%&MlrGO+pZG0wR7O zY7SYwBdqpq8zg`5YWk^9Z0Vha74v4WqEWqJP#pgY_BrT(C%Qb6X6!Tv4j0sywSg`7 z2t2@&ZUnNrs?i+S`g*$UHHRvvp)XCqPBzkt2+<=yH;#rdhJp`yMP+0}LnR_U{kwtX z*OcE;eo!i4xPMfO>v6XD^|!M0^ggXwk{vz6gByW@`{c=zM z7C<5c)he(T^}1}dwYD5ku4fFlj_B0zs?#$e9B8x~rm{XSnKJ=G|3pGk<5*i-!&Or? zr4Tv($u?U6jAM;_paQUKNu$FLG1GONr+Q5p$O1Xy{I-~$lK)$W^i?3Nx|8Kda&x8ldXTHQpVa$6{M5B)~tdkKZqTP3wwj&aY2K4+@Ip$u`BE8{H z*a({IV$c^{)mHsYBHkifV>XKfx!foC;egQ0&NDa{v&6S~*!ju2xrfi6E)L!c z;`!3SwHr31iAEa<<9`UnA?^UnI*a%~Is7;Q`jA)b;dX0y_Uu`+4_X+AY0f`%N97>l zI@iL;BLRHzjH0#o3&cx zi8n~{Qsoyt2!z@st90`S7A-ahkV;PqqnLdWv}=O%3-d9)bm=YOL0H}}`7aYRlHRrS z1(#Zy?F@XIkW=5yI8mOuQdh;}H$+CyKd@pWs<<@qZ5{BMY8uyU`9O&4Gkl8QVcv(+ zYGu>hc^#2}+Td2o@#J{S9driJ!$U&$Fxmqkjl-aznzw$oM7`*H-O$y6Xm)AHDSmzT z1S8B7EXQv3={*<8sMy2A93K$wwS;VLu(ig!u}rwBXu7<|GT1=oNkCSVfC9yw#ZO(u z9Ub&~2PH{F=)PHjA2VW9&wd(dz4C(@RXU`7Lg<8^475*3-k^j?-FJsSBdCGTD9utb z2A&R8sAOE~n)LO(%_PbDP;dC!fngV`F4O$!l82_s)I}Zlbkz2UYw5h{gqpt1=+IWf z$yT_$`tIf)@pWaOaQTXs z&%5qwF@c5DuJEWJv0owcV-Sz>9RVlp$sU$t-PWkBFyQucr}NCMclqtdVE`D33W&Pdc^yYSFpnQ$g>Y27QOQDNi*L1xj9TeSMcLrVqDC z@Zxq>BEUJA_x(Y~z8lz0trz&-h_&)=6QmaQWYM$A9FH%%?%(-@iyW^cw|n#rXLKKE zu;&h)0|tcFu9XBM3q%5RWQK%1h$%rm2$^aixh?@QGRxVE!vwdUJ-G~3gl@(Eb$r=* z|F`NKk{kMP1|W;|tM*&Tt>!=F{Ai>nQzLB!ZSt5`Q9rMh*Jc;-v32G2jz*=3yAsd6 z^`(MCBa=s<%DQxI)almTd$$k0WRg1|Sxq7Oa3u;-ep1(;q9HU}78^cARvE|?N`t_D z{Ch%eV83Rc^&3xVT+@`+`$q*$m$nFBK<}Kmo6f13u8|1I6|4O1laq=WO`$#JwNY>O zj}0iApKh_GzW3@Q*(l?GRzIZ4HHkbP# ztlOriq#aGlN?P=FePvocLE&nDQs3*L5Y^Y2yE!f+H$x*cGv2W)Y~mfefoH~q1oT;4 zy~?44NQ&+kSO4I|P1B&WUEALMvZgs>y%r$ku%H{&YdY_7T&^Q+CDI@Z7d&N}4#q}< zer;;{8)^O4#r`dO;`fo`n{|~{_a$OUJ*qNlB4Jb#;Ln*?BTY;t2Lb3a63PGa#h_ku?XqDq_3qlKS`&mj za{v`dvEL;@FHRBx}5=9!2%HeyH zmiPU*OTQDd>ul%dRqZF$A0LKBiAWdGOHcc4FTl+KaWoI2Ada~3NJbiZl7QUZ_2n5t z^qv`rxD^z|aUwj*{>jdhV;kQDh_1Ul&YG#F1q{D)%G^Ch#lw+IchEJVS))YGNI(rk;9Y{F=9}`Y47q*7r0ygaN87+)j zTiHxH%8jzqialY;%ay}{%Q&!{vUf|m$GhAbM@pIGi-Pi9tgQAdwT5QJpU#X*pWC2< zw;T!Dqpq<)yd2 z3s`ydZDYHn+T5IdP~-0e_{7FW_Ud(lkm2f+f?gIyCbRHYl+YIfBcOq1r`}CTA`x+% z*8Xm2KDHPIH*|32?AG`ZyGhi#G!ZUrI;K9*?yh(=-7bCYLU#o&d8-~{f0F5K*3T@Un`-o< z>5(kVa~Z}$KfeRnm||X%q0~AmZ=Io(;P6a?{r_ZK^)5~$rCgOz!R{M3Z-Ss-Hd-P$O|GXZ0Gs zv1d#+@1x82kB?yun%^u!bqU2I?Y<$COK5X_63@Rc4QT}6ShZo#-jqX7L^?D35dHK= zbHdB%iV;M^z{TiD+kWLb%~_PSf*r~5lun43U>6BQ@^na;m>yw>+PX&3&GpvipkN*4r5i^iRretMhA(sL#%GfI@ zX}}oHCN~{U6ljAWHquUsfOb))o$^q-8G%GIYnXL#oo*RZKY23v)~#Ehw8K#!V)}y4 zP}pHxETIM;7u7`A?fbh|<|Q%A1Jf4ZF4rePEDe?tsv*e)F1hQLL{cGvBkrFOVsyqW}dM%j~&+wjR-XwOd};J zi~3T~LV%q>NzbZ~kg&kM9Q;>)enN?$_qS(OJeRDqr|dQRS>9=A)dD9be}(}gf&v-U zwL{5eJmxk;!&gLHa5uYBfALxMBbiWCuF zrx!bU)E$4u=?-Rbr1_uDWpqkZR21SzTiV#i!DvD2wtFbY&SOxKl;(29n60!vvO@CU zVIx}Bv_5E$tUF&|`uOI5NbypJyd$2P#OSOgV}yGTcyKsQg8rq6G&;mn9H2bCGvkbH zW<-2q=8z%?hq|7_YDh7zFVj6$?+J&JK}PL4QmMh~vKB-mc0sh}koscp#^I;luAtY` zP9lhK;X>=j&AgiCe{A{e-!tI6^1&07uJD^#Qbj{}an_-{b&JP+b)o1Lk>FSFFj;uO z$G-yr7!XLr45Z$yx|5b7nkTTBPUYkWk&6xHYRilSLdc^fshmXW_T*8`>5{-x(MdosVZ ziXQd#@nPdph3ZylqlEEtS?zG);-fYn*Wbh4WY_r@E$CZQV^l-Fu(Pk zZuy6r!e9QFTX70o7x*@6!{SMI|8Lgv*Pz^wUaFTOT028VGCcua13{C}U2(>l@&UMD zoOeVtT{HosiIc8FgK!d^dKwifyTB?XPBXf)rV|KNccHGE*CnIxOT>g{^nQq#dAFDF zKMV=muZJH=2Ni4jcrvCyaQ@pJ{vTIrfQxcb;ZgJQH|Cn1olm(l{~7pB@9F4(Ym20e z5dyn~;ouPkK`im1XLO#*Ubgf9=-7-GFtmKbzw<&ZJcbDgFnXTcZqbjL`ELqRBEfR< zs0(BqHbN!eo_}hb<3h3y`?#?}7jTbEWfw`EQOI&-iRplSY4p<&z9kr5-tnO?=?z?nn*}{ zMGz3yLyw0?p3!)_4}>U_Bt!k#0A0^isi`qaq;o7mGTfM3(Te1U9!gry!r}#T^B?W$ zcfYyAT}4p9&ro4#*k;1{PI)NV+t3vQG8r^n z{G9|OVV)fv29Nh|eiv?OZkYc}O$Yww2$u{6e1p1eBuNLiMf(cZ?;v}@(eOyS9!9Tn zMi1t+?Nd)6idxE#n=e^^4Z8b@3)sy91A7g%t8^& z3OKFNwFC?*QB-N0iqi!M1PN*TR*4;K43~%IuF4UTvom6!Jr@`#c~ViP5_=R{C2XyxXAi2xI;NbMqG9VbFW!Iu=6BJQ z+R%cJn63h-RF%<4rGvw z*D|#k8TYoDOXQ7JkEW^b#v>$%d`z_1F~K2Ayde4FubFT`q~$Utcd!sn#Ak_>*t(`t3bg zcX_9;#6q_dD3WxS#)|yexePEJrxL|VcDLv$r`aZ);7~5d4zk~K(52y9;p2?{{g4`c zd^7uWl&o(0g&Bsb<+rzKMUq`0*FmjG+5+l&NNSdySD98t#!cnRwt9xRuu`N7(p zO^=H+PX&!{4)h9)H;dai-oDHW)B*~0*;>2VT+W6sI}dbv!e}t0{XhyD}dyc@rZFDqj$Me zG?nIR$D_3v-YoNx6XB}}`mrJpa$UalL%moH^wzfpmccuC$Yi>1OTI(#KV;2O_PXyZ zvtB`k`n~Vm3o3# zcAs`AL+VI?^B$Dusfqq5gO*CSMD%YsC^$g@zKrLt#SUZY*N1}3v0^{`j z^5MVRk`q-E+bW?IA3Ru_$#Zoj;fe1uySoE=?1mCk6IM{a&1;;ut4&!cn?p-F-Prot z+Tg>LYwmQ*L!h%?Zy%K8)OWJ;Ot(ah4&2nBi#yxBd6iT9jGJ-XehFXxWkI$3f;6;- z%e*1Qm`}43Cb$5`f%4Sg4#!f$2>A=$vn0r+;;P)Zdu9ZU&&8?C$;Q5qC+sCJ!({x5 z4GF)$Q0?;X7A*sV7R(zY<$NDSS_=$)Tn@msq#Mr&Tyh)ar@t&3ut0_$bscU#h4zk! zpLZAE+q!UILqkIc6e(e0g7sNWciQAZ^#J?PCl#I~8ejsP@l?*2vyB@Ng@zF#ajiSV zZZp34*Pj0S9ikxj`0(Lxgt}jhG`g|=R=l?U;`%#ZU;yfobK)IeWXEe$h0P;+3l{Dmy)~|FE;>hj2r3z;|3}6oY8a=8~~x| z&yL^5E<3juA9_+R^bb$=?{|a>O(KnMkbX&>g>INnU#EQg1@CWP)PqQ6 zuwsuEXH72p<=y}Ns?fdyJ;FslQ9hkm5tQTyIb!tIIFc`1O3$dzl`ej^LGdzMyC(;# zsecw&E^i-Z|HV`O`z@^R`KFheN&pZ?{QO zO{Hi4 z(K+Y)Jw2!TUC(v>9)HMO^jY4^>wUkr`$nk}!NF#ZGP)ciEt$|-`t{c@gjxulz7mfH zf?uSep`j+_0(%DVx>IbNurf*+35fN|2`8-m^^3)(v7smw)>4?PeSk#W#(?gK$>cO3 z$MfJu^v(XMQ5D9ZokTm`Evl=-0REKE$KfmEE3F1NP`pVQL|2f|5ubAm85^HeyV zZn@v3Wj@f^@ypjz6G6~ENP5Dw0<*LZT|?_~JR9;Z@Pq?S;TaM=N&+wFb+q;z;vKE{ zp?&WppFuz9^4@!v`2Se#AGKo3hR~6~qOPMKBaTtUiClilR--z#%k4cQ@Q^N3h1gq& z0DL(q|J%V5dq<)?2RXTyeEoKIc9-_nZR8!V@=bl)L{*{6G}Pc zWq7($l$*`549q<+bN{Y%+cW+jb7UDE+=zw4q|6@rR9>J|2M$YT4thjwbp6dGMk9iP zq-5=otpF_{mOA3)&Mcra;DMHcJz!#^Jff7Vb@m?qrAat%`VS+c?YNRL2i1_x*M(R< z2b!N8C*0V*>(?tuEc*X8D*c?wEC?qN`sM)8yp&sYM&JqWYTKSjlXAz?+Csm1qCM=6 zjt-!lMHIo}FopBTq0yWaM7dx+;-mNIJYNY?w2ZWFXUXo@dFfGM{mY&Ab_3!hFK`ps z_RN9&O=~}&BMag!MBR4Tr>JeD7t`>fI$HE536bBtC;^YfQbRSgT=lV!XT3+9p+F~^ z0*!%CL~|%Tw2G*j(S%YKU-s)o`J@l6BT*W_Em`NEu<1sGUW7FZ+IHZq2I>%@7&Pa& zPa2XvMb}9bov2y)n;m$a5ut!Pimq!TScGImCJw#%T4YnVHN7m)lXMcx2Qwt z5}bs6&nJD-1QqZUU4G&<@7?mhdD(|}XpPwPO$D;mhj@5+2=!=Jk2jD8fXtuh@!_up z_*ji!2mjoVP+%n;?jBc@z3zjips@ptVMZ$9hpB$VDD{YH+?X0j2x~Elg(Xi z%K?0d@khc)LgTr)nz8uTGZ-g-$8F6ZIIauY7!5=!)oOpiEJfMLhIVwD-- zw^3qs-fPIf0RgavP>=fch^B6016EFjCfgj11Ajd+wq?w@ zmpfq@&!cn9APGQFn&JHQ#$U-mp-@#^wpC%WMmqDIr>uxsvlW38OZ3FXKOO*DW9JPM z6BtLkJCL6MROcu2`&oOz{0CNw7*rkL9{-pM)isq|cajv?tM}vRkK|2Vo=h=D7;X2m z23T&_UJZ2p-mGN)ZRjG9aMDB$LT3Y0)+(uHXCaX}q`1ClhS`Nuo`aV2sqx=p7PY#6 zf+z)P+uKQ-6B*axk~ib~&iQV|-VMqQts+&Qs8Pt4yXIZG{Fn3J5)>MGfS;fLEeI$= zMl~3C8eWEnhr_-Fbfxc}y+5!j=!;g~e$i^AT4V^1Dh>c(mCwjYazR5bKl}Byvqq_A3EU{TYOtEE98Qqq( z2Fg~pFjuZ;6%X((P&od0C-C(g;7-f(0N&9{%cQ|iFzoK}XWSnq@35`8q{chPC~wO| zvMD2ydI0rg-zB!}OyU0PCO~3X=+*u~9L&h>_4HHTR~Gu%f5g}^NlEH(l71&C+1xEj z84FXdYPP#JV-&G^xLtz7lfk@6rPQM%D!6S^XMEBWr8p6&?f-t2U03T~pywDHwHG`6 zgEqzcccr|m#*d}VxMY7x0Si<}l~6j8(IrUkv9*WY493?lOJn+cty8CQ=ZN2%JIurq z=9B?HY-m)t{EcV1bNZcQe&z^OCdAHeI=MSMD(5E_x&O$)u9bw43G_iC#*+2eBXjKD z_rM&l5w#^+3ODv(Gb}*I`-y(A91Wl(?}L*6Gll}b*>C|}JrjyUHu;~U^&S+L+oEtb z+(RF>qMZ2tWj^rfLMY6@@bDVKrAH4lG^2*?NpoS%M0=~zTyt`=6*Vu#BO9TsXqfy) zvEVb!Gk)7X?FdM=f3fnN7QGx+V|^di;&|P34&2@I=f?RQRjV=6hX$Z{m;56 z&!hhxdL(IY+OlOOLHBfv)g@4uS}0vg4y;#>4M;Y);j}hC>-O(mvz2z9cEWH=g5K#-4C8y)lu-< z^7%8zq_oO_p@~`M#-aZkvQKyhFRdMg7+-Ku*xL^PF!X z%U+8)Y~merHfSm0535{MkV~jTf@MGWWk&0`Op#t;h)D-;`i3yMty;CJ=?(Y!HEf`N zu1J6AL0blV)r^#^FRhk967-KK57XCf;)X@i5mrac709}R`I#ygbxY1@xwhsPE@{1%~^nw|5BStVI>pCV-9HCTq9X!MH_J%%AObK8yy0oS(VxL<52F zyi^AqWj35#9iM9@2>#S55WZ7DS<}~keeI5Z#=$e zXlCh`0L2gUB5D4khXlTt4pkw>0gj{)awZ8@5 zE+XNQIVjVZUV|-3i$k#wp)+ky3OuDl4{}Jua5)VK{>Tu{lft^ZlFvM^Wfmj;%aIHW z0X5dzhK7KdV`(&+p-q2X>`nQ#7Gitr@%`QU2xoTAF|D%8wQqI-wIysC8-n)f-S`QL zbZTlURW~BE^<7H|>EkwpgMZzhWnDkaRUo(H*DH~nua$8h1RfDK@D5;X7*99jdSq>u zv_0n|nY=}ZCj%c7M}~Ih6`=6g2e}oU(X+ZkUD&H@5C-!V`If+f8eILul2v^SOThjs zHeBXdA=^auc12dG|l)$@&M4FNHNcYCy zKY~aifm(wY(pvF@y+U*N`uftqvw4jA!&lz+rd1n6_?_y9o0h*pBv|V zE#WTA1_n_n;JvRv-Lc}0^(APYuSnqKRh*mQ3t9>sy$b)n&Dhs(F~)DELUC3h=j$O* z833`oY;#zoD;!Ikn+Cj>wTkJw460XL?lRpkTO&wu2N8z8BdwAMKNJ}M=(ihmB=$a_ zkD;YFfs)wy!Jk>NAI@|W5RJ`WS_Q&%v z25JTwC=H|baAfBbxPi@?-Yl^(y501eP}*~@shG+C!9(JOpcm8)u2G;X1~otjfJ&E( z-aO?6on;j_rjy9&5~yV;g>r5yLBAMP~WKrv~fPzJO^4i~Qz*(lKsOT9b zC8gwOSBkq6fI?t6f;!759HS{7L2D*-7XSg2*8<9@i3_ z(YC-NC^2*F0GBJyar;a2K!k+E5MqpmK!f-asKk3XGRDB&H*&SubVGJ|a{v_6P@gui zJ|Y#1S>sKDoc8^1SKH#K$dZbHy|(?Qdu?Dm@ZKRf1`App!sONtfv$4?iP}A62pJl7 z4y`_ZOcsdp3m3`kg%-Xg)Bz{#iX~U~>m1fNGr=%2`5(R5rxu5MTM@Y=(1|EfQ&(@L z?!n|MWa;U+;ck0`XYN1p(Q8AqD(^dfSsBusO@dD1Y=&to}mwTiieV zY6PX7t~1x2cS{|)Ns9V#dW#eJ52G+-59X;H6^_TfPi$n?7lhHuF(xwV4fK~tKSFvo zLHqKGp$Tas}=TLi<9c0u&zZ5y}Gv7yRAjk zz954yT|IG|=v;T%2suN2Skvb}d9;UV5u~IxXGZ>6n{T-t?^}PrHs(gqIy*rTn`0zB zH@6LvfW7W16aM_Nc0rYum0!>5JFdRw>@3&x^7yB(zzXs8Y@6nT?At~b$?BGEI75gY zy;w>)orN6yGa*&W?8hZI5Q!z+L#ReLqYC)tA z?5peOL9sM1J}y&{C?utxoRLVWC$24;49vBJWrfA2X5wyJ2>>hJ$X=reN|Dq-`K@xy zK~+@cYTJTwH+7)N1YGdDt)bO`lz)TLV)NwWWDsbFBHSc})ra(Nh;uY6A>w~D=*)~A z$<%vBqFmfM=sQQ-pWjAM?yyKJ)dM2cL`o5ng1gO)l-DkhuV+kiwC<6EN_-n=@Vj<( z&WLWN$(RsKyV&;oAt2?j)_(+D>V|C~Fba(cX6+uqCq0+trQkk+Dj*qh-)mR!xrs2U zDwoZ*mE~omuoDK|LN~aJ`d;<0FG|_3>>jLk?0gBN^GFNe>g0yc>tw!LvY|&4f)QcR?*xE$|6mo_H$ie zYqB2SX+1s(5f_NP)@HJIPE47J%0f_0 zE~O(XYfhA(R*p&wsV5zd-B-8qH+K8oL?EZHUXH_;ZSr^P2HS`-4!5po8gE69Z%>0x zkgwCA(go0rzOmfxPh>DzDj1Ys!ZJRK%PG~%Yb4069rTrzWj@E%yIZ5st*yA`oLSnD ziJ7E!?%`MdvL?PuTHK#dvc&C2S+(^2hDJvCTSiHg4&sGP?Cg1t;-8|oE%PF3pozSK zaLUI5wMIbh@Q5AfZO)|wFb+`Qecd?Z0;siKJ1Q&dChOR9aF091%c<)^@9R&k z#$5Ez(2ov#*l4yrHR)22yMR8yZ-)~t5ondZ-ZzA%wZT{?D9&p#$hdrqH9+UvwDf1E zs_}nO-d>vlGNCYTdtkb8C7J*cxm&Jfk7TRyWt|wyVm3x7E>MeWrke(q{DTC$F0S|v3r_l0F7aC(#j^p$LXIX ziVDgFpaspGVk_D7;yA%DQ{jjKTl?m39w*igpLIfuaLi}^7Si*c!1uxx=6_ipG=9o} zCh!5sCK(#OJ#asYB7`)A8zMdr@OymCzj54Qt|E|UX>(BkA~$ymfM;JkBd-rXbI7Kr z1GA!462la;F`0Tt_Qulgubk zWh`0_ypz=Lp1#5T5Pba*o8C$?k!`sJgb4Fp(h)l2CgPzLR#?W_wRxusnxc8R@FL3s zV3|E@zkecuRyW2&t0n?IYAZWYK0IqpWC-G$%}@Q%GtY#m>9x^BMtnn{ z5eUEqN5>GfO$3{*Msc77w%+Zm?7nKZuBQytL?xMn;gKJr;Q;q0c7T!PQ7Kl!cKiDP|VtYqkIXMg*U&fAshu{AeJSA zx@iS4==N@lAb{v1C+d)I^`&-CbFb>wRx?Tc9AjvBbNzZ`^=(l^B`rUXfb^5f1(`z~ zo?++5I2w?cxN<)60-o7ba{FZJ&gfLevkYn()mpBj^k_lM)rNNVQw9pta)Z_HLF75n ztk{FZg_^c%4JZ0TQ0X)Q*dxJ#hZo0LBmu$o+amV~LHy89j5tYweLvES7TwZjSb!d= zc&H`ANsSmGnd;1!SU7G!IegWhXxv8UTwkH0v8N+`L-WexOD>3krorP#%MVYMeO$k+ zocf$3if7lwo~PABs~Uoky4snI?HRTo@(00f4T&So1Z0g*v)!`BC-rD$Z}5JbvU&H$ z@$}Qg-f2U(_snVG)3by-5-Iol6)!i>U0iaR>!`>* z{~ailjZi#AMErit;AV0AG?nMCE)Ya~cIa&Q!>#VX&DytT9#3&^Y&Jbk&;_yaZd}=% z|1`C+kr%2g_HJ1@U5++c9o9A-ff#8%tnN*PSr30&=XtDjAd5a>`1PIY zD#Jh>0mgjctT>8&-w{8Bw_t9^k zLn>3FG4ZdOvBQ<~^>4HCd)_vhl08W+0?U#u(O1OAr6l=2mF1$n86C%TG1*&RC6j{Q_DK#c7&4Ce8-9*YICQSZF%=lymSex zLy6YCpL%EnGv7$7%O)K#5}fC*WtZYZHu(jlI;YK|$WAC+L(W7s-bvhzk&4kb6CkojTtC`P%~u7h#(_D-Fk?rjf&Y1K8bvxsPguKWbt8tE zUa$#?@I8Xw*SBwr<1c^Mv3HylCgCp5_bgOH9cE}?VWXkX>`5ZO zJ(Cptkc{qT;2mo5I&C(O@4vI^(IU0n&~Kh8e`W5_w{fD(lO#oQ9ut}9$H)o#ox$LX z%^hivb$a^JX^rag#_i6tN;Da7w#TbbW^-c&p)q{FAdP`9W@ghj+jM-9omf(P+dq0) z>hyf~#fgt^R@3IZb4^^B(?j`}I4!NT#VpI0ia~Pz_~C&i+G1g%9u@Y2vHSVV*jPil z47;X+zzokyW3Wwz=lhn+qt+6wn}eoVMg}cx?)|B3DuoGx*k@PqnZH>w`WI#kf3pKv zJgp7Z(vEgw&<~hy^x}5*o8Y!-r{Y0k^G%uZK^>CO+i8~gMs{-jFyOq=5|i z?z7s)Q9SXcul)PFS4#2H`R|Et7gYclWcuIpLt5(WQ&pN%rzKKtv{W@*=8Hq#TI;07 zQfIT9b*Y>DTF6t67aPWwHMHQ>CP~({q;DO=;#;t=%?;F3|mqsV;OLcM? zdCMj}nqhPMn$7bWI<&E6#_W0bxVttND-9nEX;8R+C>}@2nYAAvs~#jtp_wP#nv*-u zOn8&bWk{w+c^@WA-m}%&a{35+Z0f3++w0+zPF*RV)91JDPaVNNWbL<_SM_z8!_==c zKmE$)D|HhP)@+*%nU^716$s)*@&yo#;Pxkt?1R z!riJW%My3GF!j0(dv(=UZwzY&U5OExEK-%5@eFiWy9qsZRZ73}__drtDhb2HcKALS znw+T&s?403 zi#eUiv?3?sW26&El&ji@EDlK|jqJ0KurQ>uLV9VkMxWL;a3r@|UD<9Qdvd8)GG!aj zNV^5<{q@ljY%1v^vW0Kl3o*JmwNnTR^g`zQc-PZGp3(C-tFboAceA_V-%QbU{`2hT z(qU|G<~WfmLkq)5=nAj!eHE{9WJ;zu^vd~6U5DbJyb!jF#05z!(Ckb2)YG!gU{~b^ z$=U)smJ~o|?`YP0LMe*9A5FhMc}3F7;fzbJkvzVG3S;sE&B zxA-mm{KnueTa=wq9Qyj^_s6_3<-1`pQ9t+Ym;A&*(V)Y@CSq11btd91bBoyBcO^J} z_U$B*w4wOe?Of23Tx6B$cKxl&x)GDic~QEPx{KZ0GJ@+wvtQxo*l;2Gt3AvA0@&XdFZAA@Sv!0bDiXd`!R%cV zM?HQn>ZaO99b=yxLuq^b*Arkrks~eG_WOyw)2_E|TiF+tVobga3o;8X(M9~&Ll>S< zqNDl*)4)t)SlWO3)ixKF5nTsk5h>y&i~joZxmdT1_}7O;i|Ro$=I>9(inlj${QaqP z(|`ZrX1A++fB$N`!P>$9T=K>$n4-VF?ma!8%>CDgMLyCi>=yt1Y2Wf+-c0pI(UXmi zCwX#na~+4{w2}xC^<>!IOR!_!&^+whD5_j?gP##v1lWh%(4DH`)L1i zq9$Zp^t~pqbyj%{a)Nu@5cT*3*=me&P97tb7`NQjdlqIs0eLXYZDy!`pioxe# z-mjiUY1xmxIH|kAI`FO%vp2pg=MDDF-nwvG%4@yi`%i5dTTZ$>lkC1J8S|ELj7Zwf zx~=co=xh_VO|;}fD`t>)JdkcGJMyM9Gq9@8|8R}yZMF0iAAG2LrEk2DLZxzEvGv58 z6VI-wVQz9*Tif7k4%<(-op=`ewhHS^$wr%mbrpN;WZ5*V6{7FDpy6?N2SiIsFZL73y0be>K42{}IYM;4%64`g_vz}it^cUV9pK)QdbY-h|J}CJKW(nr1bnz7X=cSQM!fP&z? zz27K>LwQLJ9J<@=zuflJK3*O*OzDb9-^zXNMhDi5n<(74n>*bo>>K9W%5~Vu3p}1< zNev&+8)i8}CLcR+*E79eDza|3E0rr62*(NI2*b0Y5%@IJ`{zYZEjI70Kg!}J4{4$! zvf@`$rt44N3AZuU<#`gK(<_{ntq|Lgt`TW_EFC}mOp)~r8HY&Qdm2Lq*wf?Q4Ht=c zXa291bHv$%W|4ntY^3RT-n7_bDfyi z#9rXpN9V>Rez855Udis<-FWZtxoP9rQ#Y7OCHJ;YSVtvpDY+jCgL-Ut9bbK+GwYM$ z(3O&~0qepm{n4e;^(rs$&ia0I)7<5mHqLowITg>x6qei@S4<`@>tq}Ia^-GeugkdH z$(L?gN0 zitvfMUNC)Q8YZ^)rP{KqA8RHw7P=P`(k@PDpDw=hBEq@hi_7ydDJ5X87x?v_{GwM*RF2FX3BRMa*1pfpbmwPk$86-wUUt+~ znZDEjg_zQ9Szv?TC-VL9p>yL`=-PPpBiXm|1?);2(X@DNX$>uW0bqRqH zan=e!R_3ZWxykrL-TS>{v++$zNlrC<^_k0kCY*)P_AIShCaF4<7b;UFn)C+`wuN!m z7b~*rb7<>c&@y-LW4#%!Kg5>k7BmtoF zEgq{{DRzlloUu;2`xfP1(gvl-gBt0{K5V$Us0qurvvOjF^t`k8RKt~0Ab7rbJ~`w7 zTZz9b0-sLCF9N}}|Lx=J?_+M!|JqrZ9$&!dg!GU@?QQPXEnWD++%x{V_UX!he<_vGh3lL$;YdTy% zJM8XT(q`Qd;#z z;mLHBCVSVyVC-G9fK)U*KxyjZdBxLQf_l~B!tS}QO#R4Xhn{TfU82rsncWzH4z`_qo^MYhrcGgH|m(l;?M3bwL^n zqOtn|*AHfE$ZR1LdwV|28gChE3_?wZ_n2uWZ?&+`JbJ_>_K|%H^-RBD?9T5{Z8zxX zA@jSEYm?r`Eh2gR1)8pUN6tR;UXQ8dNKh6URs6%P70i{`VcU|E8HSfvjfeM4*F4Bp z)MN|iViq3YvcO-Lkm$;Pv$KWZceT+yat$LlinNm?J~f`cc75S4_rgR;fx+xohqwU}i0~*P$p7iL_Y1s51l}Um{ z94VlNTj9^}?E5Z#mH8cC`f=GSnYUQ1S$NDv z+ZG;k(ale7pTDouWZm!gE<4uZ5&Ruy3jaT&&+qp@zW=Xn-|x5h|BkDF`ew}3X)2zo k@ejH> #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 15; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int a[SIZE], b[SIZE], c[SIZE]; @@ -63,28 +63,28 @@ int main(int argc, char* argv[]) { printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -129,14 +129,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..690fd14 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,15 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + if (idata[index] == 0) { + bools[index] = 0; + } + else { + bools[index] = 1; + } + } } /** @@ -33,6 +42,12 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + if (bools[index] == 1) { + odata[indices[index]] = idata[index]; + } + } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b757804..36222b3 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,7 +3,7 @@ #include "common.h" #include "efficient.h" -#define blockSize 128 +#define blockSize 512 #define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) namespace StreamCompaction { @@ -35,36 +35,68 @@ namespace StreamCompaction { } } + __global__ void kernNonPowerTwoHelper(int N, int zeroStartIndex, int zeroEndIndex, int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + if (index >= zeroStartIndex && index < zeroEndIndex) { + idata[index] = 0; + } + } + } + + __global__ void kernChangeElem(int *arr, int idx, int val) { + arr[idx] = val; + } + + void gpuEfficientScan(int originSize, int npoweroftwo, int* dev_tempin) { + dim3 fullBlocksPerGrid((npoweroftwo + blockSize - 1) / blockSize); + //Helper kernal function to set extra elements to 0 if the size is not a power of 2. + if (originSize != npoweroftwo) { + kernNonPowerTwoHelper << > > (npoweroftwo, originSize, npoweroftwo, dev_tempin); + } + + for (int iteration = 0; iteration <= ilog2ceil(npoweroftwo) - 1; iteration++) { + kernEfficientUpsweep << > > (pow(2, iteration + 1), + pow(2, iteration), npoweroftwo, dev_tempin); + } + + /*cudaMemcpy(odata, dev_tempin, npoweroftwo * sizeof(int), cudaMemcpyDeviceToHost); + odata[npoweroftwo - 1] = 0; + cudaMemcpy(dev_tempin, odata, npoweroftwo * sizeof(int), cudaMemcpyHostToDevice);*/ + + kernChangeElem << <1, 1 >> >(dev_tempin, npoweroftwo - 1, 0); + + for (int iteration = ilog2ceil(npoweroftwo) - 1; iteration >= 0; iteration--) { + kernEfficientDownsweep << > > (pow(2, iteration + 1), + pow(2, iteration), npoweroftwo, dev_tempin); + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + int *dev_tempin; - cudaMalloc((void**)&dev_tempin, n * sizeof(int)); + //check if n is power of 2, if not, needs to add extra zeros in order to make n a power of 2 + bool isPowerOfTwo = (n != 0) && ((n & (n - 1)) == 0); + int npoweroftwo = n; + if (!isPowerOfTwo) { + npoweroftwo = pow(2, ilog2ceil(n)); + } + + cudaMalloc((void**)&dev_tempin, npoweroftwo * sizeof(int)); checkCUDAErrorWithLine("cudaMalloc dev_tempin failed!"); - cudaMemcpy(dev_tempin, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_tempin, idata, npoweroftwo * sizeof(int), cudaMemcpyHostToDevice); checkCUDAErrorWithLine("cuda Memcpy from idata to dev_tempin failed!"); timer().startGpuTimer(); - for (int iteration = 0; iteration <= ilog2ceil(n)-1; iteration++) { - kernEfficientUpsweep << > > (pow(2, iteration + 1), - pow(2, iteration), n, dev_tempin); - } - - cudaMemcpy(odata, dev_tempin, n * sizeof(int), cudaMemcpyDeviceToHost); - odata[n - 1] = 0; - cudaMemcpy(dev_tempin, odata, n * sizeof(int), cudaMemcpyHostToDevice); - - for (int iteration = ilog2ceil(n) - 1; iteration >= 0; iteration--) { - kernEfficientDownsweep << > > (pow(2, iteration + 1), - pow(2, iteration), n, dev_tempin); - } + gpuEfficientScan(n, npoweroftwo, dev_tempin); timer().endGpuTimer(); - cudaMemcpy(odata, dev_tempin, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(odata, dev_tempin, npoweroftwo * sizeof(int), cudaMemcpyDeviceToHost); checkCUDAErrorWithLine("cuda Memcpy from dev_tempin to odata failed!"); - cudaFree(dev_tempin); + cudaFree(dev_tempin); } /** @@ -77,10 +109,75 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO + bool isPowerOfTwo = (n != 0) && ((n & (n - 1)) == 0); + int npoweroftwo = n; + if (!isPowerOfTwo) { + npoweroftwo = pow(2, ilog2ceil(n)); + } + + dim3 fullBlocksPerGrid((npoweroftwo + blockSize - 1) / blockSize); + + //Initialize two host arrays to store intermediate bools and indices + int* ibools; + ibools = (int*)malloc(npoweroftwo * sizeof(int)); + int* indices; + indices = (int*)malloc(npoweroftwo * sizeof(int)); + + //Initialize and allocate CUDA device arrays + int *dev_bool; + int *dev_idata; + int *dev_odata; + int *dev_indices; + cudaMalloc((void**)&dev_bool, npoweroftwo * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_bool failed!"); + cudaMalloc((void**)&dev_idata, npoweroftwo * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_odata, npoweroftwo * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_odata failed!"); + cudaMalloc((void**)&dev_indices, npoweroftwo * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_indices failed!"); + + //Copy data from host input data to device data + cudaMemcpy(dev_idata, idata, npoweroftwo * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorWithLine("cuda Memcpy from idata to dev_idata failed!"); + + //Perform a basic check in kernal, mark 1 for numbers not equal to 0, 0 for numbers equal to 0 + timer().startGpuTimer(); + StreamCompaction::Common::kernMapToBoolean << > > (npoweroftwo, dev_bool, dev_idata); + cudaDeviceSynchronize(); + + //Copy result from device array dev_bool to device array dev_indices for excluse scan. + cudaMemcpy(dev_indices, dev_bool, npoweroftwo * sizeof(int), cudaMemcpyDeviceToDevice); + + //Perform an exclusive sum scan on ibools to get the final indices array + gpuEfficientScan(n, npoweroftwo, dev_indices); + + //perform the final scatter step to store the result in dev_odata + StreamCompaction::Common::kernScatter << > > (n, dev_odata, dev_idata, dev_bool, dev_indices); timer().endGpuTimer(); - return -1; + + //copy result from dev_odata to odata. + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAErrorWithLine("cuda Memcpy from dev_odata to odata failed!"); + + cudaFree(dev_bool); + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_indices); + + //How to decide the remianing number in odata? + int count=0; + for (int i = 0; i < n; i++) { + if (odata[i] != 0) { + count++; + } + else { + break; + } + } + return count; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index acf1ae8..31015e9 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,7 +3,7 @@ #include #include "common.h" #include "naive.h" -#define blockSize 128 +#define blockSize 512 #define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) namespace StreamCompaction { diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index e3b3268..63c5d43 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,10 +18,14 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + //thrust::device_vector in(idata, idata + n); + //thrust::device_vector out(odata, odata + n); timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(idata, idata + n, odata); + //thrust::exclusive_scan(in.begin(), in.end(), out.begin()); timer().endGpuTimer(); } } From cfc43720626fdd5535f02bdd027982c1152ce77b Mon Sep 17 00:00:00 2001 From: Yuxin Hu Date: Tue, 19 Sep 2017 18:17:24 -0400 Subject: [PATCH 4/8] fix format issue on readme.md --- README.md | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 0cc13d2..47757b0 100644 --- a/README.md +++ b/README.md @@ -7,22 +7,22 @@ CUDA Stream Compaction * Tested on: Windows 10, i7-6700HQ @ 2.60GHz 8GB, GTX 960M 4096MB (Personal Laptop) ### README -#####Project Description +##### Project Description This project is about doing inclusive and exclusive scan for compaction computation in GPU parallel methodology, that will be useful for lots of applications, such as terminating rays that don't contribute in patch tracing. -#####Features Implemented +##### Features Implemented 1. Exclusive Scan - *CPU Version using single for loop - *GPU Version using Naive Parallel Reduction - *GPU Version using Work Efficient Parallel Reduction with a up sweep and down sweep process - *thrust Version using built in thrust::exclusive_scan function + * CPU Version using single for loop + * GPU Version using Naive Parallel Reduction + * GPU Version using Work Efficient Parallel Reduction with a up sweep and down sweep process + * thrust Version using built in thrust::exclusive_scan function 2. Stream Compaction - *CPU Version of a single for loop without using exclusive scan - *CPU Version of two for loops with exclusive scan. First for loop scan over the array to check if each element we would like to keep. If yes, we put 1 in a boolean array, if not, we put 0 in a boolean array. Then we call exclusive scan function we wrote in step above to get another array containing exclusive sum of the boolean array. Lastly we use another for loop to get final compaction result from the boolean array and boolean exclusive sum array. - *GPU Version with work efficient scan. The idea is similar to CPU version, except that the boolean array calculation, exclusive scan, and final result is all done parallelly on GPU. + * CPU Version of a single for loop without using exclusive scan + * CPU Version of two for loops with exclusive scan. First for loop scan over the array to check if each element we would like to keep. If yes, we put 1 in a boolean array, if not, we put 0 in a boolean array. Then we call exclusive scan function we wrote in step above to get another array containing exclusive sum of the boolean array. Lastly we use another for loop to get final compaction result from the boolean array and boolean exclusive sum array. + * GPU Version with work efficient scan. The idea is similar to CPU version, except that the boolean array calculation, exclusive scan, and final result is all done parallelly on GPU. -#####Performance Analysis: +##### Performance Analysis: 1. Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU. ![BlockSize Versus Efficiency](/img/BlocksizeAndEfficiency.PNG)

BlockSize Versus Efficiency

@@ -32,13 +32,13 @@ There is not much performance change with block size changes. I set it to 512 fo 2. Scan performance comparason with array size changes ![ScanPerformanceAnalysis](/img/ScanPerformanceAnalysis.PNG)

Exclusive Scan Performance Analysis with Increasing Array Size

-*It can be observed from the graph that the performance ranking is as follows: thrust > CPU > GPU Naive Scan > GPU Work Efficient Scan. The performance of GPU version scanning is worse than CPU version in my implementation. -*For GPU Naive Scan, it runs log2n levels, with each level of n threads, so the total number of threads is n*Log2n, which is more than the number of elements check in CPU. Moreover I need to do a right shift of the Naive scan result which is another n threads. Although these threads can run in parallel, but considering the the thread scheduling on GPU, the advantage of naive scan is not that obvious comparing to CPU version. -*For GPU Work Efficient Scan, I am doint an up sweep and a down sweep, which result in 2*log2n levels of kernal function calls. Each level consists of n threads, so the total number of threads would be 2n*log2n. It needs twice the thread number as Naive scan. In fact, many threads in each level are not doing work because they do not meet index%2^(level+1), but since all the threads in the same warp need to wait for each other to complete their tasks together, the non-functioning threads still takes time. -*thrust's performance is the best of all scan methods. +* It can be observed from the graph that the performance ranking is as follows: thrust > CPU > GPU Naive Scan > GPU Work Efficient Scan. The performance of GPU version scanning is worse than CPU version in my implementation. +* For GPU Naive Scan, it runs log2n levels, with each level of n threads, so the total number of threads is n*Log2n, which is more than the number of elements check in CPU. Moreover I need to do a right shift of the Naive scan result which is another n threads. Although these threads can run in parallel, but considering the the thread scheduling on GPU, the advantage of naive scan is not that obvious comparing to CPU version. +* For GPU Work Efficient Scan, I am doint an up sweep and a down sweep, which result in 2*log2n levels of kernal function calls. Each level consists of n threads, so the total number of threads would be 2n*log2n. It needs twice the thread number as Naive scan. In fact, many threads in each level are not doing work because they do not meet index%2^(level+1), but since all the threads in the same warp need to wait for each other to complete their tasks together, the non-functioning threads still takes time. +* thrust's performance is the best of all scan methods. -#####Program Output at array SIZE = 2^15 +##### Program Output at array SIZE = 2^15 ``` **************** ** SCAN TESTS ** From 3ae1029710ef21cf9ed54cdf538f87c953539c29 Mon Sep 17 00:00:00 2001 From: Yuxin Hu Date: Tue, 19 Sep 2017 18:19:06 -0400 Subject: [PATCH 5/8] fix format issue on readme.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 47757b0..2da8966 100644 --- a/README.md +++ b/README.md @@ -32,10 +32,10 @@ There is not much performance change with block size changes. I set it to 512 fo 2. Scan performance comparason with array size changes ![ScanPerformanceAnalysis](/img/ScanPerformanceAnalysis.PNG)

Exclusive Scan Performance Analysis with Increasing Array Size

-* It can be observed from the graph that the performance ranking is as follows: thrust > CPU > GPU Naive Scan > GPU Work Efficient Scan. The performance of GPU version scanning is worse than CPU version in my implementation. -* For GPU Naive Scan, it runs log2n levels, with each level of n threads, so the total number of threads is n*Log2n, which is more than the number of elements check in CPU. Moreover I need to do a right shift of the Naive scan result which is another n threads. Although these threads can run in parallel, but considering the the thread scheduling on GPU, the advantage of naive scan is not that obvious comparing to CPU version. -* For GPU Work Efficient Scan, I am doint an up sweep and a down sweep, which result in 2*log2n levels of kernal function calls. Each level consists of n threads, so the total number of threads would be 2n*log2n. It needs twice the thread number as Naive scan. In fact, many threads in each level are not doing work because they do not meet index%2^(level+1), but since all the threads in the same warp need to wait for each other to complete their tasks together, the non-functioning threads still takes time. -* thrust's performance is the best of all scan methods. + * It can be observed from the graph that the performance ranking is as follows: thrust > CPU > GPU Naive Scan > GPU Work Efficient Scan. The performance of GPU version scanning is worse than CPU version in my implementation. + * For GPU Naive Scan, it runs log2n levels, with each level of n threads, so the total number of threads is n*Log2n, which is more than the number of elements check in CPU. Moreover I need to do a right shift of the Naive scan result which is another n threads. Although these threads can run in parallel, but considering the the thread scheduling on GPU, the advantage of naive scan is not that obvious comparing to CPU version. + * For GPU Work Efficient Scan, I am doint an up sweep and a down sweep, which result in 2*log2n levels of kernal function calls. Each level consists of n threads, so the total number of threads would be 2n*log2n. It needs twice the thread number as Naive scan. In fact, many threads in each level are not doing work because they do not meet index%2^(level+1), but since all the threads in the same warp need to wait for each other to complete their tasks together, the non-functioning threads still takes time. + * thrust's performance is the best of all scan methods. ##### Program Output at array SIZE = 2^15 From 0cc7404ffc5b0699280c2524e10d80c288b1ec95 Mon Sep 17 00:00:00 2001 From: Yuxin Hu Date: Tue, 19 Sep 2017 18:21:38 -0400 Subject: [PATCH 6/8] fix format issue on readme.md --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2da8966..f68bac1 100644 --- a/README.md +++ b/README.md @@ -32,10 +32,11 @@ There is not much performance change with block size changes. I set it to 512 fo 2. Scan performance comparason with array size changes ![ScanPerformanceAnalysis](/img/ScanPerformanceAnalysis.PNG)

Exclusive Scan Performance Analysis with Increasing Array Size

- * It can be observed from the graph that the performance ranking is as follows: thrust > CPU > GPU Naive Scan > GPU Work Efficient Scan. The performance of GPU version scanning is worse than CPU version in my implementation. - * For GPU Naive Scan, it runs log2n levels, with each level of n threads, so the total number of threads is n*Log2n, which is more than the number of elements check in CPU. Moreover I need to do a right shift of the Naive scan result which is another n threads. Although these threads can run in parallel, but considering the the thread scheduling on GPU, the advantage of naive scan is not that obvious comparing to CPU version. - * For GPU Work Efficient Scan, I am doint an up sweep and a down sweep, which result in 2*log2n levels of kernal function calls. Each level consists of n threads, so the total number of threads would be 2n*log2n. It needs twice the thread number as Naive scan. In fact, many threads in each level are not doing work because they do not meet index%2^(level+1), but since all the threads in the same warp need to wait for each other to complete their tasks together, the non-functioning threads still takes time. - * thrust's performance is the best of all scan methods. + ++ It can be observed from the graph that the performance ranking is as follows: thrust > CPU > GPU Naive Scan > GPU Work Efficient Scan. The performance of GPU version scanning is worse than CPU version in my implementation. ++ For GPU Naive Scan, it runs log2n levels, with each level of n threads, so the total number of threads is n*Log2n, which is more than the number of elements check in CPU. Moreover I need to do a right shift of the Naive scan result which is another n threads. Although these threads can run in parallel, but considering the the thread scheduling on GPU, the advantage of naive scan is not that obvious comparing to CPU version. ++ For GPU Work Efficient Scan, I am doint an up sweep and a down sweep, which result in 2*log2n levels of kernal function calls. Each level consists of n threads, so the total number of threads would be 2n*log2n. It needs twice the thread number as Naive scan. In fact, many threads in each level are not doing work because they do not meet index%2^(level+1), but since all the threads in the same warp need to wait for each other to complete their tasks together, the non-functioning threads still takes time. ++ thrust's performance is the best of all scan methods. ##### Program Output at array SIZE = 2^15 From 7c5374e52d579863ccc9e24f16a69b8a276399c6 Mon Sep 17 00:00:00 2001 From: Yuxin Hu Date: Tue, 19 Sep 2017 23:03:00 -0400 Subject: [PATCH 7/8] edit readme.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f68bac1..434c930 100644 --- a/README.md +++ b/README.md @@ -34,8 +34,8 @@ There is not much performance change with block size changes. I set it to 512 fo

Exclusive Scan Performance Analysis with Increasing Array Size

+ It can be observed from the graph that the performance ranking is as follows: thrust > CPU > GPU Naive Scan > GPU Work Efficient Scan. The performance of GPU version scanning is worse than CPU version in my implementation. -+ For GPU Naive Scan, it runs log2n levels, with each level of n threads, so the total number of threads is n*Log2n, which is more than the number of elements check in CPU. Moreover I need to do a right shift of the Naive scan result which is another n threads. Although these threads can run in parallel, but considering the the thread scheduling on GPU, the advantage of naive scan is not that obvious comparing to CPU version. -+ For GPU Work Efficient Scan, I am doint an up sweep and a down sweep, which result in 2*log2n levels of kernal function calls. Each level consists of n threads, so the total number of threads would be 2n*log2n. It needs twice the thread number as Naive scan. In fact, many threads in each level are not doing work because they do not meet index%2^(level+1), but since all the threads in the same warp need to wait for each other to complete their tasks together, the non-functioning threads still takes time. ++ For GPU Naive Scan, it runs log2n levels, with each level of n threads, so the total number of threads is nLog2n, which is more than the number of elements check in CPU. Moreover I need to do a right shift of the Naive scan result which is another n threads. Although these threads can run in parallel, but considering the the thread scheduling on GPU,global memory read latency, the advantage of naive scan is not that obvious comparing to CPU version. ++ For GPU Work Efficient Scan, I am doint an up sweep and a down sweep, which result in 2log2n levels of kernal function calls. Each level consists of n threads, so the total number of threads would be 2nlog2n. It needs twice the thread number as Naive scan. In fact, many threads in each level are not doing work because they do not meet index%2^(level+1). Each streaming-multi processor can only execute a small number of warps at a time, so thoese threads who meet the criteria still have to wait until the SM finishes executing the warps, even though there may not be any threads meeting the requirements in those warps. + thrust's performance is the best of all scan methods. From 5fa717c0244aaa143800133d0c9398c97e42929d Mon Sep 17 00:00:00 2001 From: Yuxin Hu Date: Tue, 19 Sep 2017 23:03:55 -0400 Subject: [PATCH 8/8] edit readme.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 434c930..8bbc3fa 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ There is not much performance change with block size changes. I set it to 512 fo + It can be observed from the graph that the performance ranking is as follows: thrust > CPU > GPU Naive Scan > GPU Work Efficient Scan. The performance of GPU version scanning is worse than CPU version in my implementation. + For GPU Naive Scan, it runs log2n levels, with each level of n threads, so the total number of threads is nLog2n, which is more than the number of elements check in CPU. Moreover I need to do a right shift of the Naive scan result which is another n threads. Although these threads can run in parallel, but considering the the thread scheduling on GPU,global memory read latency, the advantage of naive scan is not that obvious comparing to CPU version. -+ For GPU Work Efficient Scan, I am doint an up sweep and a down sweep, which result in 2log2n levels of kernal function calls. Each level consists of n threads, so the total number of threads would be 2nlog2n. It needs twice the thread number as Naive scan. In fact, many threads in each level are not doing work because they do not meet index%2^(level+1). Each streaming-multi processor can only execute a small number of warps at a time, so thoese threads who meet the criteria still have to wait until the SM finishes executing the warps, even though there may not be any threads meeting the requirements in those warps. ++ For GPU Work Efficient Scan, I am doint an up sweep and a down sweep, which result in 2log2n levels of kernal function calls. Each level consists of n threads, so the total number of threads would be 2nlog2n. It needs twice the thread number as Naive scan. In fact, many threads in each level are not doing work because they do not meet index%2^(level+1). Each streaming-multi processor can only execute a small number of warps at a time, so those threads who meet the criteria still have to wait until the SM finishes executing the warps, even though there may not be any threads meeting the requirements in those warps. + thrust's performance is the best of all scan methods.