From 05634e6992286766df2d8a6899b9f07674e905c2 Mon Sep 17 00:00:00 2001 From: YOUR NAME Date: Tue, 10 Sep 2024 23:00:29 -0400 Subject: [PATCH 01/20] Implements CPU scan algorithms --- stream_compaction/cpu.cu | 45 +++++++++++++++++++++++++++++++++------- stream_compaction/cpu.h | 2 +- 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..053bf06 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -17,10 +17,15 @@ namespace StreamCompaction { * For performance analysis, this is supposed to be a simple for loop. * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ - void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); + void scan(int n, int *odata, const int *idata, bool useTimer) { + if (useTimer) timer().startCpuTimer(); + odata[0] = 0; + for (int i = 1; i < n; ++i) { + int input = idata[i - 1]; + int last_output = odata[i - 1]; + odata[i] = input + last_output; + } + if (useTimer) timer().endCpuTimer(); } /** @@ -30,9 +35,15 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int numOutputElements = 0; + for (int i = 0; i < n; ++i) { + int input = idata[i]; + if (input == 0) continue; + odata[numOutputElements] = input; + ++numOutputElements; + } timer().endCpuTimer(); - return -1; + return numOutputElements; } /** @@ -42,9 +53,27 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int trueFalseArray[n]; + for (int i = 0; i < n; ++i) { + int input = idata[i]; + trueFalseArray[i] = (input == 0) ? 
0 : 1; + } + + int scannedTFArray[n]; + scan(n, scannedTFArray, trueFalseArray, false); timer().endCpuTimer(); - return -1; + + // Scatter + int numOutputElements = 0; + for (int i = 0; i < n; ++i) { + int input = idata[i]; + int trueFalseValue = trueFalseArray[i]; + if (!trueFalseValue) continue; + + odata[scannedTFArray[i]] = input; + ++numOutputElements; + } + return numOutputElements; } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c047..6f3a466 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -6,7 +6,7 @@ namespace StreamCompaction { namespace CPU { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, bool useTimer = true); int compactWithoutScan(int n, int *odata, const int *idata); From 26359ccd61c93f447f480c0a1f8f37341c697360 Mon Sep 17 00:00:00 2001 From: YOUR NAME Date: Wed, 11 Sep 2024 02:16:15 -0400 Subject: [PATCH 02/20] IImplements naive GPU scanning (for powers of two) --- stream_compaction/naive.cu | 49 +++++++++++++++++++++++++++++++++++++- stream_compaction/naive.h | 3 ++- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..034cd72 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,6 +2,7 @@ #include #include "common.h" #include "naive.h" +#include namespace StreamCompaction { namespace Naive { @@ -18,8 +19,54 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); - // TODO + + int* outputBuf; + int* inputBuf; + cudaMalloc((void**)&outputBuf, n * sizeof(int)); + cudaMalloc((void**)&inputBuf, n * sizeof(int)); + cudaMemcpy(inputBuf, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + for (int depth = 1; depth <= ilog2ceil(n); ++depth) { + int stride = static_cast(pow(2, depth - 1)); + cudaMemcpy(outputBuf, inputBuf, sizeof(int) * stride, 
namespace StreamCompaction {
    namespace Naive {
        /**
         * One step of the naive inclusive scan at the given depth.
         * Expects a 1D launch covering all n elements (n is the padded,
         * power-of-two array length).
         *
         * Threads in the tail (threadId >= n - stride) have no pair to add;
         * they instead copy one element of the untouched prefix [0, stride)
         * so the ping-pong output buffer stays complete.
         */
        __global__ void naiveScan(int n, int depth, const int* inputBuf, int* outputBuf) {
            int threadId = threadIdx.x + (blockDim.x * blockIdx.x);
            if (threadId >= n) return;

            int stride = 1 << (depth - 1);
            if (threadId >= n - stride) {
                // thread n-1 copies index 0, thread n-2 copies index 1, ...
                outputBuf[n - threadId - 1] = inputBuf[n - threadId - 1];
            } else {
                outputBuf[threadId + stride] = inputBuf[threadId] + inputBuf[threadId + stride];
            }
        }

        /**
         * Converts an inclusive scan to an exclusive scan by shifting every
         * element one slot right and writing the identity (0) into slot 0.
         * Expects a 1D launch covering at least n threads.
         */
        __global__ void shiftRight(int n, const int* inputBuf, int* outputBuf) {
            int threadId = threadIdx.x + (blockDim.x * blockIdx.x);
            if (threadId >= n) return;
            outputBuf[threadId] = (threadId == 0) ? 0 : inputBuf[threadId - 1];
        }

        /**
         * Performs exclusive prefix-sum (scan) on idata, storing the result
         * into odata, using the naive O(n log n) ping-pong algorithm.
         * Handles non-power-of-two n by zero-padding on the device.
         */
        void scan(int n, int *odata, const int *idata) {
            if (n <= 0) return;
            // BUG FIX: integer shift instead of pow(), which round-trips
            // through double.
            int nearestPowerOfTwo = 1 << ilog2ceil(n);

            int* outputBuf;
            int* inputBuf;
            cudaMalloc((void**)&outputBuf, nearestPowerOfTwo * sizeof(int));
            cudaMalloc((void**)&inputBuf, nearestPowerOfTwo * sizeof(int));
            // BUG FIX: copy only n elements from the host — copying
            // nearestPowerOfTwo elements reads past the end of idata.
            cudaMemcpy(inputBuf, idata, n * sizeof(int), cudaMemcpyHostToDevice);
            if (n < nearestPowerOfTwo) {
                // zero-pad; zeros are the identity for addition
                cudaMemset(inputBuf + n, 0, (nearestPowerOfTwo - n) * sizeof(int));
            }

            timer().startGpuTimer();

            int blockSize = std::min(nearestPowerOfTwo, 1024); // 1024 is a hardware limit
            dim3 blocksPerGrid((nearestPowerOfTwo + blockSize - 1) / blockSize);
            for (int depth = 1; depth <= ilog2ceil(nearestPowerOfTwo); ++depth) {
                naiveScan<<<blocksPerGrid, blockSize>>>(nearestPowerOfTwo, depth, inputBuf, outputBuf);
                std::swap(outputBuf, inputBuf); // ping-pong: output becomes next input
            }

            // Convert the inclusive result to the exclusive scan callers expect.
            dim3 blocksForShift((n + blockSize - 1) / blockSize);
            shiftRight<<<blocksForShift, blockSize>>>(n, inputBuf, outputBuf);

            timer().endGpuTimer();

            // BUG FIX: copy only n results back — odata has n slots and
            // nearestPowerOfTwo can be larger.
            cudaMemcpy(odata, outputBuf, n * sizeof(int), cudaMemcpyDeviceToHost);
            cudaFree(outputBuf);
            cudaFree(inputBuf);
        }
    }
}
namespace StreamCompaction {
    namespace Efficient {
        /**
         * Up-sweep (reduce) step of the Blelloch scan at the given depth.
         *
         * @param n  the number of (left, right) node pairs at this depth —
         *           i.e. the number of threads doing useful work — NOT the
         *           length of the underlying array.
         */
        __global__ void kernUpSweep(int n, int depth, int* dev_data) {
            int threadId = (blockDim.x * blockIdx.x) + threadIdx.x;
            if (threadId >= n) return;

            int twoToDepthPlusOne = 1 << (depth + 1);
            int twoToDepth = 1 << depth;
            int leftChildIdx = (threadId * twoToDepthPlusOne) + twoToDepth - 1;
            int rightChildIdx = (threadId * twoToDepthPlusOne) + twoToDepthPlusOne - 1;

            dev_data[rightChildIdx] += dev_data[leftChildIdx];
        }

        /**
         * Down-sweep step of the Blelloch scan at the given depth: each node
         * passes its value to its left child and the (old left + value) to
         * its right child.
         *
         * @param n  number of node pairs at this depth (see kernUpSweep).
         */
        __global__ void kernDownSweep(int n, int depth, int* dev_data) {
            int threadId = (blockDim.x * blockIdx.x) + threadIdx.x;
            if (threadId >= n) return;

            int twoToDepthPlusOne = 1 << (depth + 1);
            int twoToDepth = 1 << depth;
            int leftChildIdx = (threadId * twoToDepthPlusOne) + twoToDepth - 1;
            int rightChildIdx = (threadId * twoToDepthPlusOne) + twoToDepthPlusOne - 1;

            int leftVal = dev_data[leftChildIdx];
            dev_data[leftChildIdx] = dev_data[rightChildIdx];
            dev_data[rightChildIdx] += leftVal;
        }

        /**
         * Performs work-efficient (Blelloch) exclusive prefix-sum on idata,
         * storing the result into odata. Pads non-power-of-two inputs with
         * zeros on the device.
         */
        void scan(int n, int *odata, const int *idata) {
            if (n <= 0) return;
            // pad array length to the nearest power of two (shift, not pow(),
            // to avoid the double round-trip)
            int n_padded = 1 << ilog2ceil(n);

            int* dev_data;
            cudaMalloc((void**) &dev_data, n_padded * sizeof(int));
            cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice);
            if (n < n_padded) {
                cudaMemset(dev_data + n, 0, (n_padded - n) * sizeof(int));
            }

            timer().startGpuTimer();

            for (int depth = 0; depth < ilog2ceil(n_padded); ++depth) {
                // BUG FIX: at depth d there are n_padded / 2^(d+1) node pairs,
                // not n_padded / 2^d — the original launched twice the needed
                // threads, and the kernels' index math then wrote past the end
                // of dev_data.
                int totalOperations = n_padded >> (depth + 1);
                int blockSize = std::min(totalOperations, 1024); // 1024: hardware limit
                // BUG FIX: '(x + b - 1 / b)' — precedence bug; '1 / b' is 0,
                // so the grid was x + b blocks instead of ceil(x / b).
                dim3 blocksPerGrid((totalOperations + blockSize - 1) / blockSize);

                kernUpSweep<<<blocksPerGrid, blockSize>>>(totalOperations, depth, dev_data);
                // No cudaDeviceSynchronize needed: same-stream launches are ordered.
            }

            // Pre-step for down-sweep: root := identity (0).
            cudaMemset(dev_data + n_padded - 1, 0, sizeof(int));

            for (int depth = ilog2ceil(n_padded) - 1; depth >= 0; --depth) {
                int totalOperations = n_padded >> (depth + 1);
                int blockSize = std::min(totalOperations, 1024);
                dim3 blocksPerGrid((totalOperations + blockSize - 1) / blockSize);

                kernDownSweep<<<blocksPerGrid, blockSize>>>(totalOperations, depth, dev_data);
            }

            timer().endGpuTimer();

            // BUG FIX: copy back only n results — odata has n slots, and
            // n_padded can be larger (OOB write for non-power-of-two n).
            cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
            cudaFree(dev_data);
        }
    }
}
printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - 
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + // zeroArray(SIZE, c); + // printDesc("thrust scan, power-of-two"); + // StreamCompaction::Thrust::scan(SIZE, c, a); + // printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // //printArray(SIZE, c, true); + // printCmpResult(SIZE, b, c); + + // zeroArray(SIZE, c); + // printDesc("thrust scan, non-power-of-two"); + // StreamCompaction::Thrust::scan(NPOT, c, a); + // printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // //printArray(NPOT, c, true); + // printCmpResult(NPOT, b, c); + + // printf("\n"); + // printf("*****************************\n"); + // printf("** STREAM COMPACTION TESTS **\n"); + // printf("*****************************\n"); + + // // Compaction tests + + // genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + // a[SIZE - 1] = 0; + // printArray(SIZE, a, true); + + // int count, expectedCount, expectedNPOT; + + // // initialize b using StreamCompaction::CPU::compactWithoutScan you implement + // // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. 
+ // zeroArray(SIZE, b); + // printDesc("cpu compact without scan, power-of-two"); + // count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + // printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + // expectedCount = count; + // printArray(count, b, true); + // printCmpLenResult(count, expectedCount, b, b); + + // zeroArray(SIZE, c); + // printDesc("cpu compact without scan, non-power-of-two"); + // count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + // printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + // expectedNPOT = count; + // printArray(count, c, true); + // printCmpLenResult(count, expectedNPOT, b, c); + + // zeroArray(SIZE, c); + // printDesc("cpu compact with scan"); + // count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + // printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + // printArray(count, c, true); + // printCmpLenResult(count, expectedCount, b, c); + + // zeroArray(SIZE, c); + // printDesc("work-efficient compact, power-of-two"); + // count = StreamCompaction::Efficient::compact(SIZE, c, a); + // printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // //printArray(count, c, true); + // printCmpLenResult(count, expectedCount, b, c); + + // zeroArray(SIZE, c); + // printDesc("work-efficient compact, non-power-of-two"); + // count = StreamCompaction::Efficient::compact(NPOT, c, a); + // printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // //printArray(count, c, true); + // printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit delete[] a; diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 7f5fdbd..aac281f 
namespace StreamCompaction {
    namespace Efficient {
        const int MAX_BLOCK_SIZE = 1024; // hardware limit on threads per block

        /**
         * Up-sweep step for the multi-block scan. Each block reduces its own
         * contiguous chunk of dev_data independently.
         *
         * BUG FIX: the original computed the chunk offset as
         * blockDim.x * blockIdx.x, but blockDim.x changes between launches as
         * the active thread count halves — so blocks drifted onto each other's
         * data. The chunk size is constant across depths and recoverable as
         * n / gridDim.x (both are powers of two), so derive the offset from that.
         *
         * @param n  total (padded) length of dev_data.
         */
        __global__ void kernUpSweep(int n, int depth, int* dev_data) {
            int chunkSize = n / gridDim.x;
            int chunkOffset = blockIdx.x * chunkSize;
            int threadId = threadIdx.x;

            int pairSpan = 1 << (depth + 1);
            if (threadId >= chunkSize / pairSpan) return; // pairs at this depth

            int leftChildIdx = chunkOffset + (threadId * pairSpan) + (1 << depth) - 1;
            int rightChildIdx = chunkOffset + (threadId * pairSpan) + pairSpan - 1;

            dev_data[rightChildIdx] += dev_data[leftChildIdx];
        }

        /**
         * Down-sweep step for the multi-block scan, producing an exclusive
         * scan within each block's chunk.
         *
         * At the first down-sweep level (one thread per chunk) this also saves
         * the chunk total into stored_sums and zeroes the chunk root, replacing
         * the original's broken "blockRightChildIdx == blockDim.x - 1" check
         * (which compared a block-local element index against a thread count
         * that changed every launch, and then read/zeroed via the block-local
         * index instead of the global one).
         *
         * BUG FIX: the final exchange must accumulate into the RIGHT child;
         * the original wrote dev_data[globalLeftChildIdx] twice, corrupting
         * every down-sweep level.
         */
        __global__ void kernDownSweep(int n, int depth, int* dev_data, int* stored_sums) {
            int chunkSize = n / gridDim.x;
            int chunkOffset = blockIdx.x * chunkSize;
            int threadId = threadIdx.x;

            int pairSpan = 1 << (depth + 1);
            if (threadId >= chunkSize / pairSpan) return;

            if (blockDim.x == 1 && threadId == 0) {
                // First down-sweep level: capture the chunk total (the up-sweep
                // root) for kernBlockSums, then zero the root.
                stored_sums[blockIdx.x] = dev_data[chunkOffset + chunkSize - 1];
                dev_data[chunkOffset + chunkSize - 1] = 0;
            }

            int leftChildIdx = chunkOffset + (threadId * pairSpan) + (1 << depth) - 1;
            int rightChildIdx = chunkOffset + (threadId * pairSpan) + pairSpan - 1;

            int leftVal = dev_data[leftChildIdx];
            dev_data[leftChildIdx] = dev_data[rightChildIdx];
            dev_data[rightChildIdx] += leftVal;
        }

        /**
         * Gathers the per-chunk totals into sum_data so they can themselves be
         * scanned.
         *
         * BUG FIX: stored_sums already holds each chunk's full total (captured
         * at the top of the down-sweep), so adding the chunk's exclusive-scan
         * tail on top of it double-counted. dev_data/stride are kept in the
         * signature for compatibility with the declaration in efficient.h.
         *
         * @param n  the number of chunks (blocks) used in the sweeps above.
         */
        __global__ void kernBlockSums(int n, int stride, const int* dev_data, int* sum_data, const int* stored_sums) {
            int threadId = threadIdx.x + (blockDim.x * blockIdx.x);
            if (threadId >= n) return; // BUG FIX: was '>', letting thread n write OOB

            sum_data[threadId] = stored_sums[threadId];
        }

        /**
         * Adds the scanned chunk totals back onto each chunk of dev_data.
         * Launched with blockDim.x == chunkSize / 2 and two blocks per chunk,
         * so blockIdx.x / 2 is the chunk index.
         *
         * @param n  number of elements in the (padded) data array.
         */
        __global__ void kernIncrement(int n, int* dev_data, int* sum_data) {
            int threadId = threadIdx.x + (blockIdx.x * blockDim.x);
            if (threadId >= n) return; // BUG FIX: was '>'

            dev_data[threadId] += sum_data[blockIdx.x / 2];
        }

        /**
         * Recursive device-side scan over an already-padded, power-of-two
         * length device buffer (in place). Introduced so that (a) the public
         * scan() can time exactly one call without the recursion re-entering
         * the timer (the original TODO), and (b) the recursive step operates
         * on device pointers — the original recursed into the host-pointer
         * scan(), which cudaMemcpy'd from a device pointer as if it were host
         * memory.
         */
        static void scanDevice(int n_padded, int* dev_data) {
            int maxBlockSize = std::min(n_padded / 2, MAX_BLOCK_SIZE);
            int chunkSize = 2 * maxBlockSize; // each thread touches two elements
            dim3 blocksPerGrid((n_padded / 2 + maxBlockSize - 1) / maxBlockSize);

            int* stored_sums; // per-chunk totals captured during the down-sweep
            cudaMalloc((void**)&stored_sums, blocksPerGrid.x * sizeof(int));

            // Up-sweep: thread count per block halves at each depth.
            int threads = maxBlockSize;
            for (int depth = 0; depth < ilog2ceil(chunkSize); ++depth) {
                kernUpSweep<<<blocksPerGrid, threads>>>(n_padded, depth, dev_data);
                if (threads > 1) threads /= 2;
            }

            // Down-sweep: thread count grows back. BUG FIX: the original loop
            // bound used the mutated blockSize variable (reset to 1), so
            // ilog2ceil(2 * blockSize) - 1 == 0 and the loop ran exactly once.
            threads = 1;
            for (int depth = ilog2ceil(chunkSize) - 1; depth >= 0; --depth) {
                kernDownSweep<<<blocksPerGrid, threads>>>(n_padded, depth, dev_data, stored_sums);
                threads *= 2;
            }

            // If the array spanned more than one chunk, scan the chunk totals
            // (recursively) and add them back as per-chunk increments.
            if (n_padded > chunkSize) {
                int* sum_data;
                cudaMalloc((void**)&sum_data, blocksPerGrid.x * sizeof(int));

                int sumBlockSize = 128;
                dim3 sumBlocksPerGrid((blocksPerGrid.x + sumBlockSize - 1) / sumBlockSize);
                kernBlockSums<<<sumBlocksPerGrid, sumBlockSize>>>(blocksPerGrid.x, chunkSize, dev_data, sum_data, stored_sums);

                // blocksPerGrid.x = n_padded / chunkSize is a power of two,
                // so it satisfies scanDevice's precondition directly.
                scanDevice(blocksPerGrid.x, sum_data);

                // Two blocks of maxBlockSize threads per chunk -> n_padded threads.
                dim3 incrementBlocks(2 * blocksPerGrid.x);
                kernIncrement<<<incrementBlocks, maxBlockSize>>>(n_padded, dev_data, sum_data);

                cudaFree(sum_data);
            }

            cudaFree(stored_sums);
        }

        /**
         * Performs work-efficient exclusive prefix-sum on idata for arbitrary
         * n, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
            if (n <= 0) return;
            int n_padded = 1 << ilog2ceil(n); // pad to nearest power of two

            int* dev_data;
            cudaMalloc((void**)&dev_data, n_padded * sizeof(int));
            cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice);
            if (n < n_padded) {
                cudaMemset(dev_data + n, 0, (n_padded - n) * sizeof(int));
            }

            timer().startGpuTimer();
            if (n_padded >= 2) {
                scanDevice(n_padded, dev_data);
            } else {
                cudaMemset(dev_data, 0, sizeof(int)); // exclusive scan of one element
            }
            timer().endGpuTimer();

            // copy back only the n real results; odata has n slots
            cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
            cudaFree(dev_data);
        }
    }
}
cudaMalloc((void**)&outputBuf, nearestPowerOfTwo * sizeof(int)); cudaMalloc((void**)&inputBuf, nearestPowerOfTwo * sizeof(int)); - cudaMemcpy(inputBuf, idata, nearestPowerOfTwo * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(inputBuf, idata, n * sizeof(int), cudaMemcpyHostToDevice); // Pad input data to be a power of two, if needed. From b6d35e89915d1ca89c8b9b7d00b3ef0f9b7fc1d6 Mon Sep 17 00:00:00 2001 From: Matt Schwartz Date: Fri, 13 Sep 2024 17:53:22 -0400 Subject: [PATCH 08/20] Finishes implementation of efficient scan of arbitrary length --- stream_compaction/efficient.cu | 145 ++++++++++++++++----------------- stream_compaction/efficient.h | 6 +- 2 files changed, 71 insertions(+), 80 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index aac281f..bf09198 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -15,43 +15,35 @@ namespace StreamCompaction { const int MAX_BLOCK_SIZE = 1024; - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - int n_padded = pow(2, ilog2ceil(n)); // pad array to nearest power of two - - int* dev_data; - cudaMalloc((void**) &dev_data, n_padded * sizeof(int)); - cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); - - // Pad input data to be a power of two, if needed. - if (n < n_padded) { - cudaMemset(dev_data + n, 0, (n_padded - n) * sizeof(int)); - } - - int blockSize = std::min((n_padded / 2), MAX_BLOCK_SIZE); - dim3 blocksPerGrid = ((n_padded / 2) + blockSize - 1) / blockSize; + void scan(int n_padded, int* dev_data) { + int maxBlockSize = std::min((n_padded / 2), MAX_BLOCK_SIZE); + int stride = 2 * maxBlockSize; // since each thread works on two entries of dev_data, 2x block size gives the stride to get between blocks of data in dev_data. 
+ dim3 blocksPerGrid = ((n_padded / 2) + maxBlockSize - 1) / maxBlockSize; - int* stored_sums; // temp array used to last entry per block during upsweep. See kernBlockSums for more info. - cudaMalloc((void**) &stored_sums, blocksPerGrid.x * sizeof(int)); + int* stored_sums; // temp array used to last entry per block during upsweep. See kernZeroEntries and kernIncrement for use info. + cudaMalloc((void**)&stored_sums, blocksPerGrid.x * sizeof(int)); - // timer().startGpuTimer(); - - for (int depth = 0; depth < ilog2ceil(2 * blockSize); ++depth) { - kernUpSweep<<>>(n_padded, depth, dev_data); + int blockSize_i = maxBlockSize; + for (int depth = 0; depth < ilog2ceil(2 * maxBlockSize); ++depth) { + kernUpSweep<<>>(n_padded, stride, depth, dev_data); - blockSize /= 2; // need fewer threads each iteration + blockSize_i /= 2; // need fewer threads each iteration cudaDeviceSynchronize(); } + // Between upsweep and downsweep, zero out the last entry of every block (first storing off those entries for later use) + int zeroEntriesBlockSize = 128; + dim3 zeroEntriesBlocksPerGrid = (blocksPerGrid.x + zeroEntriesBlockSize - 1) / zeroEntriesBlockSize; + kernZeroEntries<<>> (blocksPerGrid.x, stride, dev_data, stored_sums); + cudaDeviceSynchronize(); + // Keep blocks per grid constant, so we can handle arbitrarily sized arrays. // But grow blockSize on each iteration since we need more threads on each depth layer. - blockSize = 1; - for (int depth = ilog2ceil(2 * blockSize) - 1; depth >= 0; --depth) { - kernDownSweep<<>>(n_padded, depth, dev_data, stored_sums); + blockSize_i = 1; + for (int depth = ilog2ceil(2 * maxBlockSize) - 1; depth >= 0; --depth) { + kernDownSweep<<>>(n_padded, stride, depth, dev_data); - blockSize *= 2; // need more threads each iteration + blockSize_i *= 2; // need more threads each iteration cudaDeviceSynchronize(); } @@ -60,85 +52,82 @@ namespace StreamCompaction { // // This needs to be done recursively to handle arbitrarily large arrays. 
if (n_padded > 2 * MAX_BLOCK_SIZE) { - // TODO - partial scans are all exclusive... need to be inclusive for this step - - // Scatter the sums from the previous scan operation - int* sum_data; - cudaMalloc((void**) &sum_data, blocksPerGrid.x * sizeof(int)); - int sumBlockSize = 128; - dim3 sumBlocksPerGrid = (blocksPerGrid.x + sumBlockSize - 1) / sumBlockSize; - kernBlockSums<<>>(blocksPerGrid.x, blockSize, dev_data, sum_data, stored_sums); - cudaDeviceSynchronize(); - // (Recursively) scan the summed blocks array // Can use sum_data as both the input and output pointers for the scan. No issue writing over it. - scan(blocksPerGrid.x, sum_data, sum_data); + scan(blocksPerGrid.x, stored_sums); // Finally, add scanned sum values back to the original dev_data // In original scan, each thread handled 2 elements. In this step, each handles one, so we need 2x the blocks. - dim3 kernBlocksPerGrid = 2 * blocksPerGrid.x; - kernIncrement<<>>(n_padded, dev_data, sum_data); + dim3 kernBlocksPerGrid = 2 * blocksPerGrid.x; + kernIncrement<<>> (n_padded, dev_data, stored_sums); cudaDeviceSynchronize(); - - cudaFree(sum_data); } - // TODO - the recursive scan is going to trigger multiple timer calls... create wrapper function for timing. - // Actually I think best bet is to modify timer to not restart if it's already been started? But then what about end... wrapper function for that? - // timer().endGpuTimer(); cudaFree(stored_sums); + } + + /** + * Wrapper around scan (to facilitate gpu timing and allocating things) + */ + void scan(int n, int *odata, const int *idata) { + int n_padded = pow(2, ilog2ceil(n)); // pad array to nearest power of two + + int* dev_data; + cudaMalloc((void**)&dev_data, n_padded * sizeof(int)); + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + // Pad input data to be a power of two, if needed. 
+ if (n < n_padded) { + cudaMemset(dev_data + n, 0, (n_padded - n) * sizeof(int)); + } + + timer().startGpuTimer(); + scan(n_padded, dev_data); + timer().endGpuTimer(); + cudaMemcpy(odata, dev_data, n_padded * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(dev_data); } - __global__ void kernUpSweep(int n, int depth, int* dev_data) { + __global__ void kernUpSweep(int n, int stride, int depth, int* dev_data) { int threadId = threadIdx.x; - if (threadId + (blockDim.x * blockIdx.x) >= n/2) return; int twoToDepthPlusOne = (1 << (depth + 1)); int twoToDepth = (1 << depth); // Since each block is a self contained scan, we calculate these indices w.r.t the local block thread index. // But then we offset by (blockDim.x * blockIdx.x) because the dev_data is for ALL blocks, so we need to access the right part. - int leftChildIdx = (threadId * twoToDepthPlusOne) + twoToDepth - 1 + (blockDim.x * blockIdx.x); - int rightChildIdx = (threadId * twoToDepthPlusOne) + twoToDepthPlusOne - 1 + (blockDim.x * blockIdx.x); + int leftChildIdx = (threadId * twoToDepthPlusOne) + twoToDepth - 1 + (stride * blockIdx.x); + int rightChildIdx = (threadId * twoToDepthPlusOne) + twoToDepthPlusOne - 1 + (stride * blockIdx.x); + + if (rightChildIdx >= n) return; dev_data[rightChildIdx] += dev_data[leftChildIdx]; } - __global__ void kernDownSweep(int n, int depth, int* dev_data, int* stored_sums) { + __global__ void kernZeroEntries(int n, int stride, int* dev_data, int* stored_sums) { + int threadId = threadIdx.x + (blockDim.x * blockIdx.x); + if (threadId >= n) return; + + int dev_data_idx = (threadId + 1) * stride - 1; + stored_sums[threadId] = dev_data[dev_data_idx]; + dev_data[dev_data_idx] = 0; + } + + __global__ void kernDownSweep(int n, int stride, int depth, int* dev_data) { int threadId = threadIdx.x; - if (threadId + (blockDim.x * blockIdx.x) >= n/2) return; int twoToDepthPlusOne = (1 << (depth + 1)); int twoToDepth = (1 << depth); int blockLeftChildIdx = (threadId * twoToDepthPlusOne) + 
twoToDepth - 1; - int globalLeftChildIdx = blockLeftChildIdx + (blockDim.x * blockIdx.x); + int globalLeftChildIdx = blockLeftChildIdx + (stride * blockIdx.x); int blockRightChildIdx = (threadId * twoToDepthPlusOne) + twoToDepthPlusOne - 1; - int globalRightChildIdx = blockRightChildIdx + + (blockDim.x * blockIdx.x); + int globalRightChildIdx = blockRightChildIdx + (stride * blockIdx.x); - if (blockRightChildIdx == (blockDim.x - 1)) { - stored_sums[blockIdx.x] = dev_data[blockRightChildIdx]; // need this for later kernel - dev_data[blockRightChildIdx] = 0; // zero out last element in block - } + if (globalRightChildIdx >= n) return; int leftVal = dev_data[globalLeftChildIdx]; dev_data[globalLeftChildIdx] = dev_data[globalRightChildIdx]; - dev_data[globalLeftChildIdx] += leftVal; - } - - /** - * Kernel to scatter sums from individual blocks into a sum_data array. - * In most use cases, the sum_data array is probably small and this would be faster on the CPU, but for REALLY big input arrays, - * it might be worth it to do this step on the GPU. - * - * Here, n is the number of blocks used in the upsweep / downsweep steps above. - */ - __global__ void kernBlockSums(int n, int stride, const int* dev_data, int* sum_data, const int* stored_sums) { - int threadId = threadIdx.x + (blockDim.x * blockIdx.x); - if (threadId > n) return; - - // Note: since the partial scans on dev_data were exclusive, we need to do one more addition to get the inclusive scan amount. - sum_data[threadId] = dev_data[(threadId * stride) + (stride - 1)] + stored_sums[threadId]; + dev_data[globalRightChildIdx] += leftVal; } /** @@ -148,10 +137,12 @@ namespace StreamCompaction { */ __global__ void kernIncrement(int n, int* dev_data, int* sum_data) { int threadId = threadIdx.x + (blockIdx.x * blockDim.x); - if (threadId > n) return; + if (threadId >= n) return; + + // The extra factor of 2 comes from the fact that we're using twice as many block here as in the original scan. 
+ int sum_data_idx = gridDim.x * threadId / (2 * n); // note: integer division here - // Divide blockIdx.x by 2 because we're using 2x the blocks for this step compared to the original scan. - dev_data[threadId] += sum_data[blockIdx.x / 2]; + dev_data[threadId] += sum_data[sum_data_idx]; } /** diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index a6ebebe..1918e9d 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -7,9 +7,9 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); - __global__ void kernUpSweep(int n, int depth, int* dev_data); - __global__ void kernDownSweep(int n, int depth, int* dev_data, int* stored_sums); - __global__ void kernBlockSums(int n, int stride, const int* dev_data, int* sum_data, const int* stored_sums); + __global__ void kernUpSweep(int n, int stride, int depth, int* dev_data); + __global__ void kernZeroEntries(int n, int stride, int* dev_data, int* stored_sums); + __global__ void kernDownSweep(int n, int stride, int depth, int* dev_data); __global__ void kernIncrement(int n, int* dev_data, int* sum_data); int compact(int n, int *odata, const int *idata); } From 749a82ba279fe520c51d3e7be37f8823227c79ea Mon Sep 17 00:00:00 2001 From: YOUR NAME Date: Sat, 14 Sep 2024 15:01:59 -0400 Subject: [PATCH 09/20] Initial implementation of efficient stream compaction (WIP) --- stream_compaction/common.cu | 24 +++++++++++++++++--- stream_compaction/common.h | 2 +- stream_compaction/efficient.cu | 40 ++++++++++++++++++++++++++++++---- 3 files changed, 58 insertions(+), 8 deletions(-) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..865c2e0 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,10 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. 
*/ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + bools[index] = (idata[index] == 0); } /** @@ -31,8 +34,23 @@ namespace StreamCompaction { * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. */ __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO + const int *idata, const int *scannedBools) { + int threadId = threadIdx.x + (blockIdx.x * blockDim.x); + if (threadId >= n) return; + + int data = idata[threadId]; + int scan_i = scannedBools[threadId]; + + // Special case for last element of idata array + if (threadId == n - 1 && data) { + odata[scan_i] = data; + return; + } + + int scan_iplusone = scannedBools[threadId + 1]; + if (scan_i != scan_iplusone) { + odata[scan_i] = data; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..fdbf9f8 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -35,7 +35,7 @@ namespace StreamCompaction { __global__ void kernMapToBoolean(int n, int *bools, const int *idata); __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices); + const int *idata, const int *scannedBools); /** * This class is used for timing the performance diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index bf09198..799e445 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -67,7 +67,7 @@ namespace StreamCompaction { } /** - * Wrapper around scan (to facilitate gpu timing and allocating things) + * Wrapper method (to facilitate gpu timing and allocating things) */ void scan(int n, int *odata, const int *idata) { int n_padded = pow(2, ilog2ceil(n)); // pad array to nearest power of two @@ -95,7 +95,7 @@ namespace StreamCompaction { int twoToDepthPlusOne = (1 << (depth + 1)); int twoToDepth = (1 << 
depth); // Since each block is a self contained scan, we calculate these indices w.r.t the local block thread index. - // But then we offset by (blockDim.x * blockIdx.x) because the dev_data is for ALL blocks, so we need to access the right part. + // But then we offset by (stride * blockIdx.x) because the dev_data is for ALL blocks, so we need to access the right part. int leftChildIdx = (threadId * twoToDepthPlusOne) + twoToDepth - 1 + (stride * blockIdx.x); int rightChildIdx = (threadId * twoToDepthPlusOne) + twoToDepthPlusOne - 1 + (stride * blockIdx.x); @@ -155,10 +155,42 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + int n_padded = pow(2, ilog2ceil(n)); // pad array to nearest power of two + int* trueFalseArray, *dev_idata, *dev_odata; + cudaMalloc((void**)&trueFalseArray, n_padded * sizeof(int)); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); // allocate for worst-case scenario + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + if (n < n_padded) { + // Note that if we wanted to *retain* 0s in the compacted array, we'd need to pad + // trueFalseArray with a different value here. 
+ cudaMemset(trueFalseArray + n, 0, (n_padded - n) * sizeof(int)); + } + timer().startGpuTimer(); - // TODO + + int threadsPerBlock = 128; + dim3 blocksPerGrid = (n + threadsPerBlock - 1 / threadsPerBlock); + StreamCompaction::Common::kernMapToBoolean<<>>(n, trueFalseArray, dev_idata); + cudaDeviceSynchronize(); + + scan(n_padded, trueFalseArray); // scan happens in-place, so trueFalseArray is now scanned + + StreamCompaction::Common::kernScatter<<>>(n, dev_odata, dev_idata, trueFalseArray); + cudaDeviceSynchronize(); + timer().endGpuTimer(); - return -1; + + int compactArraySize; + cudaMemcpy(&compactArraySize, trueFalseArray + n_padded - 1, sizeof(int), cudaMemcpyDeviceToHost); + compactArraySize += (idata[n - 1] != 0); // necessary because scan was exclusive + + cudaMemcpy(odata, dev_odata, compactArraySize * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_odata); + cudaFree(dev_idata); + cudaFree(trueFalseArray); + return compactArraySize; } } } From 1680fb7a494487f6335cc9103e916af400f81d00 Mon Sep 17 00:00:00 2001 From: YOUR NAME Date: Sat, 14 Sep 2024 18:03:12 -0400 Subject: [PATCH 10/20] Finishes implementation of efficient compaction --- stream_compaction/common.cu | 2 +- stream_compaction/efficient.cu | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 865c2e0..cc66f76 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -26,7 +26,7 @@ namespace StreamCompaction { int index = threadIdx.x + (blockIdx.x * blockDim.x); if (index >= n) return; - bools[index] = (idata[index] == 0); + bools[index] = (idata[index] != 0); } /** diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 799e445..9b30810 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -171,7 +171,7 @@ namespace StreamCompaction { timer().startGpuTimer(); int threadsPerBlock = 128; - dim3 blocksPerGrid = (n + threadsPerBlock - 1 / 
threadsPerBlock); + dim3 blocksPerGrid = ((n + threadsPerBlock - 1) / threadsPerBlock); StreamCompaction::Common::kernMapToBoolean<<>>(n, trueFalseArray, dev_idata); cudaDeviceSynchronize(); @@ -183,7 +183,7 @@ namespace StreamCompaction { timer().endGpuTimer(); int compactArraySize; - cudaMemcpy(&compactArraySize, trueFalseArray + n_padded - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&compactArraySize, trueFalseArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost); compactArraySize += (idata[n - 1] != 0); // necessary because scan was exclusive cudaMemcpy(odata, dev_odata, compactArraySize * sizeof(int), cudaMemcpyDeviceToHost); From 1786d5ef64ff30a3ad26b5c3c11b856359bd7808 Mon Sep 17 00:00:00 2001 From: YOUR NAME Date: Sat, 14 Sep 2024 19:32:03 -0400 Subject: [PATCH 11/20] Thrust scan and fix to cpu scan --- stream_compaction/cpu.cu | 11 ++++++++--- stream_compaction/thrust.cu | 11 ++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 053bf06..fa2fe48 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -52,16 +52,16 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + int* trueFalseArray = new int[n]; + int* scannedTFArray = new int[n]; + timer().startCpuTimer(); - int trueFalseArray[n]; for (int i = 0; i < n; ++i) { int input = idata[i]; trueFalseArray[i] = (input == 0) ? 
0 : 1; } - int scannedTFArray[n]; scan(n, scannedTFArray, trueFalseArray, false); - timer().endCpuTimer(); // Scatter int numOutputElements = 0; @@ -73,6 +73,11 @@ namespace StreamCompaction { odata[scannedTFArray[i]] = input; ++numOutputElements; } + + timer().endCpuTimer(); + + delete[] trueFalseArray; + delete[] scannedTFArray; return numOutputElements; } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..521ef0b 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,16 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::device_vector dv_in(idata, idata + n); + thrust::device_vector dv_out(n); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + timer().endGpuTimer(); + + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } } From be9c734459b66b7b6c80fc8f4de06869790c42e6 Mon Sep 17 00:00:00 2001 From: YOUR NAME Date: Sun, 15 Sep 2024 00:09:26 -0400 Subject: [PATCH 12/20] Fixes bug with integer overflow --- src/testing_helpers.hpp | 4 ++-- stream_compaction/efficient.cu | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 025e94a..d76dc37 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -24,7 +24,7 @@ void printDesc(const char *desc) { template void printCmpResult(int n, T *a, T *b) { printf(" %s \n", - cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); + cmpArrays(n, a, b) ? "\033[31mFAIL VALUE\033[0m" : "\033[32mpassed\033[0m"); } template @@ -34,7 +34,7 @@ void printCmpLenResult(int n, int expN, T *a, T *b) { } printf(" %s \n", (n == -1 || n != expN) ? 
"FAIL COUNT" : - cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); + cmpArrays(n, a, b) ? "\033[31mFAIL VALUE\033[0m" : "\033[32mpassed\033[0m"); } void zeroArray(int n, int *a) { diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 9b30810..49b5c25 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -20,7 +20,7 @@ namespace StreamCompaction { int stride = 2 * maxBlockSize; // since each thread works on two entries of dev_data, 2x block size gives the stride to get between blocks of data in dev_data. dim3 blocksPerGrid = ((n_padded / 2) + maxBlockSize - 1) / maxBlockSize; - int* stored_sums; // temp array used to last entry per block during upsweep. See kernZeroEntries and kernIncrement for use info. + int* stored_sums; // temp array used to store last entry per block during upsweep. See kernZeroEntries and kernIncrement for use info. cudaMalloc((void**)&stored_sums, blocksPerGrid.x * sizeof(int)); int blockSize_i = maxBlockSize; @@ -44,14 +44,14 @@ namespace StreamCompaction { kernDownSweep<<>>(n_padded, stride, depth, dev_data); blockSize_i *= 2; // need more threads each iteration - cudaDeviceSynchronize(); + cudaDeviceSynchronize(); } // If the array didn't fit within a single block, we need to collect the individual block scan results, // put them in an array, and scan that array. Then add the twice-scanned array as increments back to the original results. // // This needs to be done recursively to handle arbitrarily large arrays. - if (n_padded > 2 * MAX_BLOCK_SIZE) { + if (n_padded > 2 * maxBlockSize) { // (Recursively) scan the summed blocks array // Can use sum_data as both the input and output pointers for the scan. No issue writing over it. scan(blocksPerGrid.x, stored_sums); @@ -59,7 +59,7 @@ namespace StreamCompaction { // Finally, add scanned sum values back to the original dev_data // In original scan, each thread handled 2 elements. In this step, each handles one, so we need 2x the blocks. 
dim3 kernBlocksPerGrid = 2 * blocksPerGrid.x; - kernIncrement<<>> (n_padded, dev_data, stored_sums); + kernIncrement<<>>(n_padded, dev_data, stored_sums); cudaDeviceSynchronize(); } @@ -140,7 +140,7 @@ namespace StreamCompaction { if (threadId >= n) return; // The extra factor of 2 comes from the fact that we're using twice as many block here as in the original scan. - int sum_data_idx = gridDim.x * threadId / (2 * n); // note: integer division here + int sum_data_idx = (long) gridDim.x * threadId / (2 * n); // note: integer division here dev_data[threadId] += sum_data[sum_data_idx]; } From 4dd2d4287211fef44d6866a8e790902268219e51 Mon Sep 17 00:00:00 2001 From: YOUR NAME Date: Sun, 15 Sep 2024 19:16:24 -0400 Subject: [PATCH 13/20] Condense kernels and use shared memory --- src/main.cpp | 132 ++++++++++++++++----------------- stream_compaction/efficient.cu | 124 +++++++++++++++---------------- stream_compaction/efficient.h | 4 +- 3 files changed, 125 insertions(+), 135 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index a8ca5ef..58d98b9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 20; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -81,71 +81,71 @@ int main(int argc, char* argv[]) { // printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - // zeroArray(SIZE, c); - // printDesc("thrust scan, power-of-two"); - // StreamCompaction::Thrust::scan(SIZE, c, a); - // printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - // //printArray(SIZE, c, true); - // printCmpResult(SIZE, b, c); - - // zeroArray(SIZE, c); - // printDesc("thrust scan, non-power-of-two"); - // StreamCompaction::Thrust::scan(NPOT, c, a); - // 
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - // //printArray(NPOT, c, true); - // printCmpResult(NPOT, b, c); - - // printf("\n"); - // printf("*****************************\n"); - // printf("** STREAM COMPACTION TESTS **\n"); - // printf("*****************************\n"); - - // // Compaction tests - - // genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - // a[SIZE - 1] = 0; - // printArray(SIZE, a, true); - - // int count, expectedCount, expectedNPOT; - - // // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - // zeroArray(SIZE, b); - // printDesc("cpu compact without scan, power-of-two"); - // count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - // printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - // expectedCount = count; - // printArray(count, b, true); - // printCmpLenResult(count, expectedCount, b, b); - - // zeroArray(SIZE, c); - // printDesc("cpu compact without scan, non-power-of-two"); - // count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - // printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - // expectedNPOT = count; - // printArray(count, c, true); - // printCmpLenResult(count, expectedNPOT, b, c); - - // zeroArray(SIZE, c); - // printDesc("cpu compact with scan"); - // count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - // printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - // printArray(count, c, true); - // printCmpLenResult(count, expectedCount, b, c); - - // zeroArray(SIZE, c); - // printDesc("work-efficient compact, power-of-two"); - // count = StreamCompaction::Efficient::compact(SIZE, 
c, a); - // printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - // //printArray(count, c, true); - // printCmpLenResult(count, expectedCount, b, c); - - // zeroArray(SIZE, c); - // printDesc("work-efficient compact, non-power-of-two"); - // count = StreamCompaction::Efficient::compact(NPOT, c, a); - // printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - // //printArray(count, c, true); - // printCmpLenResult(count, expectedNPOT, b, c); + zeroArray(SIZE, c); + printDesc("thrust scan, power-of-two"); + StreamCompaction::Thrust::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("thrust scan, non-power-of-two"); + StreamCompaction::Thrust::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + printf("\n"); + printf("*****************************\n"); + printf("** STREAM COMPACTION TESTS **\n"); + printf("*****************************\n"); + + // Compaction tests + + genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + int count, expectedCount, expectedNPOT; + + // initialize b using StreamCompaction::CPU::compactWithoutScan you implement + // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. 
+ zeroArray(SIZE, b); + printDesc("cpu compact without scan, power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + expectedCount = count; + printArray(count, b, true); + printCmpLenResult(count, expectedCount, b, b); + + zeroArray(SIZE, c); + printDesc("cpu compact without scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + expectedNPOT = count; + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + zeroArray(SIZE, c); + printDesc("cpu compact with scan"); + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, power-of-two"); + count = StreamCompaction::Efficient::compact(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, non-power-of-two"); + count = StreamCompaction::Efficient::compact(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit delete[] a; diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 49b5c25..35c51b8 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -13,45 +13,23 @@ 
namespace StreamCompaction { return timer; } - const int MAX_BLOCK_SIZE = 1024; + const int MAX_BLOCK_SIZE = 1024; // keep this as a power of 2 void scan(int n_padded, int* dev_data) { - int maxBlockSize = std::min((n_padded / 2), MAX_BLOCK_SIZE); - int stride = 2 * maxBlockSize; // since each thread works on two entries of dev_data, 2x block size gives the stride to get between blocks of data in dev_data. - dim3 blocksPerGrid = ((n_padded / 2) + maxBlockSize - 1) / maxBlockSize; + int blockSize = std::min((n_padded / 2), MAX_BLOCK_SIZE); + dim3 blocksPerGrid = ((n_padded / 2) + blockSize - 1) / blockSize; int* stored_sums; // temp array used to store last entry per block during upsweep. See kernZeroEntries and kernIncrement for use info. cudaMalloc((void**)&stored_sums, blocksPerGrid.x * sizeof(int)); - int blockSize_i = maxBlockSize; - for (int depth = 0; depth < ilog2ceil(2 * maxBlockSize); ++depth) { - kernUpSweep<<>>(n_padded, stride, depth, dev_data); - - blockSize_i /= 2; // need fewer threads each iteration - cudaDeviceSynchronize(); - } - - // Between upsweep and downsweep, zero out the last entry of every block (first storing off those entries for later use) - int zeroEntriesBlockSize = 128; - dim3 zeroEntriesBlocksPerGrid = (blocksPerGrid.x + zeroEntriesBlockSize - 1) / zeroEntriesBlockSize; - kernZeroEntries<<>> (blocksPerGrid.x, stride, dev_data, stored_sums); + kernScan<<>>(n_padded, ilog2ceil(2 * blockSize), dev_data, stored_sums); cudaDeviceSynchronize(); - // Keep blocks per grid constant, so we can handle arbitrarily sized arrays. - // But grow blockSize on each iteration since we need more threads on each depth layer. 
- blockSize_i = 1; - for (int depth = ilog2ceil(2 * maxBlockSize) - 1; depth >= 0; --depth) { - kernDownSweep<<>>(n_padded, stride, depth, dev_data); - - blockSize_i *= 2; // need more threads each iteration - cudaDeviceSynchronize(); - } - // If the array didn't fit within a single block, we need to collect the individual block scan results, // put them in an array, and scan that array. Then add the twice-scanned array as increments back to the original results. // // This needs to be done recursively to handle arbitrarily large arrays. - if (n_padded > 2 * maxBlockSize) { + if (n_padded > 2 * blockSize) { // (Recursively) scan the summed blocks array // Can use sum_data as both the input and output pointers for the scan. No issue writing over it. scan(blocksPerGrid.x, stored_sums); @@ -59,7 +37,7 @@ namespace StreamCompaction { // Finally, add scanned sum values back to the original dev_data // In original scan, each thread handled 2 elements. In this step, each handles one, so we need 2x the blocks. dim3 kernBlocksPerGrid = 2 * blocksPerGrid.x; - kernIncrement<<>>(n_padded, dev_data, stored_sums); + kernIncrement<<>>(n_padded, dev_data, stored_sums); cudaDeviceSynchronize(); } @@ -89,45 +67,59 @@ namespace StreamCompaction { cudaFree(dev_data); } - __global__ void kernUpSweep(int n, int stride, int depth, int* dev_data) { - int threadId = threadIdx.x; - - int twoToDepthPlusOne = (1 << (depth + 1)); - int twoToDepth = (1 << depth); - // Since each block is a self contained scan, we calculate these indices w.r.t the local block thread index. - // But then we offset by (stride * blockIdx.x) because the dev_data is for ALL blocks, so we need to access the right part. 
- int leftChildIdx = (threadId * twoToDepthPlusOne) + twoToDepth - 1 + (stride * blockIdx.x); - int rightChildIdx = (threadId * twoToDepthPlusOne) + twoToDepthPlusOne - 1 + (stride * blockIdx.x); - - if (rightChildIdx >= n) return; - - dev_data[rightChildIdx] += dev_data[leftChildIdx]; - } - - __global__ void kernZeroEntries(int n, int stride, int* dev_data, int* stored_sums) { - int threadId = threadIdx.x + (blockDim.x * blockIdx.x); - if (threadId >= n) return; - - int dev_data_idx = (threadId + 1) * stride - 1; - stored_sums[threadId] = dev_data[dev_data_idx]; - dev_data[dev_data_idx] = 0; - } - - __global__ void kernDownSweep(int n, int stride, int depth, int* dev_data) { - int threadId = threadIdx.x; - - int twoToDepthPlusOne = (1 << (depth + 1)); - int twoToDepth = (1 << depth); - int blockLeftChildIdx = (threadId * twoToDepthPlusOne) + twoToDepth - 1; - int globalLeftChildIdx = blockLeftChildIdx + (stride * blockIdx.x); - int blockRightChildIdx = (threadId * twoToDepthPlusOne) + twoToDepthPlusOne - 1; - int globalRightChildIdx = blockRightChildIdx + (stride * blockIdx.x); - - if (globalRightChildIdx >= n) return; + /** + * n is the size of dev_data + */ + __global__ void kernScan(int n, int numLevels, int* dev_data, int* stored_sums) { + if (threadIdx.x + (blockDim.x * blockIdx.x) >= n/2) return; + + extern __shared__ int s_dev_data[]; + + // Put the right and left children into shared memory. + // Index from dev_data based on *global* position, but put into shared memory as local position (w.r.t. this block) + s_dev_data[2 * threadIdx.x + 1] = + dev_data[2 * threadIdx.x + 1 + (2 * blockDim.x * blockIdx.x)]; + + s_dev_data[2 * threadIdx.x] = + dev_data[2 * threadIdx.x + (2 * blockDim.x * blockIdx.x)]; + + for (int depth = 0; depth < numLevels; ++depth) { + __syncthreads(); + // Make sure the local right-child index this thread will access is in bounds of this block. 
+ if ((threadIdx.x * (1 << (depth + 1))) + (1 << (depth + 1)) - 1 >= 2 * blockDim.x) continue; + + s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << (depth + 1)) - 1] += + s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << depth) - 1]; + } + __syncthreads(); + + // Save off the last entry of s_dev_data to a temporary stored_sums buffer. This temp buffer is used if our initial + // input data is too large to be scanned in a single block. + // Then zero out the last entry for the downsweep. + if (threadIdx.x == 0) { + stored_sums[blockIdx.x] = s_dev_data[2 * blockDim.x - 1]; + s_dev_data[2 * blockDim.x - 1] = 0; + } + + for (int depth = numLevels - 1; depth >= 1; --depth) { + __syncthreads(); + // Make sure the local right-child index this thread will access is in bounds of this block. + if ((threadIdx.x * (1 << (depth + 1))) + (1 << (depth + 1)) - 1 >= 2 * blockDim.x) continue; + + int leftVal = s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << depth) - 1]; + s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << depth) - 1] = + s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << (depth + 1)) - 1]; + + s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << (depth + 1)) - 1] += leftVal; + } - int leftVal = dev_data[globalLeftChildIdx]; - dev_data[globalLeftChildIdx] = dev_data[globalRightChildIdx]; - dev_data[globalRightChildIdx] += leftVal; + __syncthreads(); + // On the last iteration, depth = 0, we write to global memory. 
+ dev_data[2 * threadIdx.x + (2 * blockDim.x * blockIdx.x)] = + s_dev_data[2 * threadIdx.x + 1]; + + dev_data[2 * threadIdx.x + 1 + (2 * blockDim.x * blockIdx.x)] = + (s_dev_data[2 * threadIdx.x] + s_dev_data[2 * threadIdx.x + 1]); } /** diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 1918e9d..355c42a 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -7,9 +7,7 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); - __global__ void kernUpSweep(int n, int stride, int depth, int* dev_data); - __global__ void kernZeroEntries(int n, int stride, int* dev_data, int* stored_sums); - __global__ void kernDownSweep(int n, int stride, int depth, int* dev_data); + __global__ void kernScan(int n, int numLevels, int* dev_data, int* stored_sums); __global__ void kernIncrement(int n, int* dev_data, int* sum_data); int compact(int n, int *odata, const int *idata); } From 559de0af3fe49731f3a9ef81d97ed4cc0d44b001 Mon Sep 17 00:00:00 2001 From: YOUR NAME Date: Sun, 15 Sep 2024 19:26:10 -0400 Subject: [PATCH 14/20] Uses pinned memory for stored sums --- stream_compaction/efficient.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 35c51b8..8dea36b 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -20,7 +20,8 @@ namespace StreamCompaction { dim3 blocksPerGrid = ((n_padded / 2) + blockSize - 1) / blockSize; int* stored_sums; // temp array used to store last entry per block during upsweep. See kernZeroEntries and kernIncrement for use info. 
- cudaMalloc((void**)&stored_sums, blocksPerGrid.x * sizeof(int)); + // Allocate with pinned memory + cudaHostAlloc((void**)&stored_sums, blocksPerGrid.x * sizeof(int), cudaHostAllocDefault); kernScan<<>>(n_padded, ilog2ceil(2 * blockSize), dev_data, stored_sums); cudaDeviceSynchronize(); @@ -41,7 +42,7 @@ namespace StreamCompaction { cudaDeviceSynchronize(); } - cudaFree(stored_sums); + cudaFreeHost(stored_sums); } /** From fc1423880d14a9219dc34f35da9dca2fa9ce745f Mon Sep 17 00:00:00 2001 From: YOUR NAME Date: Tue, 17 Sep 2024 11:25:12 -0400 Subject: [PATCH 15/20] Add performance testing automation --- performance_automator.sh | 106 ++++++++++++++++++ src/main.cpp | 229 +++++++++++++++++++++++---------------- 2 files changed, 244 insertions(+), 91 deletions(-) create mode 100755 performance_automator.sh diff --git a/performance_automator.sh b/performance_automator.sh new file mode 100755 index 0000000..0c6b51b --- /dev/null +++ b/performance_automator.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +# Run the performance test 10x each, and write the values to a csv file +NUM_TESTS=10 + +cpu_scan_time_pot=0 +cpu_scan_time_npot=0 +cpu_compact_without_scan_time_pot=0 +cpu_compact_without_scan_time_npot=0 +cpu_compact_with_scan_time=0 + +naive_scan_time_pot=0 +naive_scan_time_npot=0 + +efficient_scan_time_pot=0 +efficient_scan_time_npot=0 +efficient_compact_time_pot=0 +efficient_compact_time_npot=0 + +thrust_scan_time_pot=0 +thrust_scan_time_npot=0 + +for i in $(seq 1 $NUM_TESTS) +do + echo -e "Test $i\n" + echo -e "CPU Test:\n" + + result=$(./bin/cis5650_stream_compaction_test cpu) + + elapsed_time=$(echo "$result" | grep -A 1 "cpu scan, power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') + cpu_scan_time_pot=$(echo "$cpu_scan_time_pot + $elapsed_time" | bc) + + elapsed_time=$(echo "$result" | grep -A 1 "cpu scan, non-power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') + cpu_scan_time_npot=$(echo "$cpu_scan_time_npot + $elapsed_time" | bc) + + elapsed_time=$(echo 
"$result" | grep -A 1 "cpu compact without scan, power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') + cpu_compact_without_scan_time_pot=$(echo "$cpu_compact_without_scan_time_pot + $elapsed_time" | bc) + + elapsed_time=$(echo "$result" | grep -A 1 "cpu compact without scan, non-power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') + cpu_compact_without_scan_time_npot=$(echo "$cpu_compact_without_scan_time_npot + $elapsed_time" | bc) + + elapsed_time=$(echo "$result" | grep -A 1 "cpu compact with scan" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') + cpu_compact_with_scan_time=$(echo "$cpu_compact_with_scan_time + $elapsed_time" | bc) + + echo -e "Naive Test:\n" + + result=$(./bin/cis5650_stream_compaction_test naive) + + elapsed_time=$(echo "$result" | grep -A 1 "naive scan, power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') + naive_scan_time_pot=$(echo "$naive_scan_time_pot + $elapsed_time" | bc) + + elapsed_time=$(echo "$result" | grep -A 1 "naive scan, non-power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') + naive_scan_time_npot=$(echo "$naive_scan_time_npot + $elapsed_time" | bc) + + echo -e "Efficient Test:\n" + + result=$(./bin/cis5650_stream_compaction_test efficient) + + elapsed_time=$(echo "$result" | grep -A 1 "efficient scan, power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') + efficient_scan_time_pot=$(echo "$efficient_scan_time_pot + $elapsed_time" | bc) + + elapsed_time=$(echo "$result" | grep -A 1 "efficient scan, non-power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') + efficient_scan_time_npot=$(echo "$efficient_scan_time_npot + $elapsed_time" | bc) + + elapsed_time=$(echo "$result" | grep -A 1 "efficient compact, power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') + efficient_compact_time_pot=$(echo "$efficient_compact_time_pot + $elapsed_time" | bc) + + elapsed_time=$(echo "$result" | grep -A 1 "efficient compact, non-power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') + 
efficient_compact_time_npot=$(echo "$efficient_compact_time_npot + $elapsed_time" | bc) + + echo -e "Thrust Test:\n" + + result=$(./bin/cis5650_stream_compaction_test thrust) + + elapsed_time=$(echo "$result" | grep -A 1 "thrust scan, power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') + thrust_scan_time_pot=$(echo "$thrust_scan_time_pot + $elapsed_time" | bc) + + elapsed_time=$(echo "$result" | grep -A 1 "thrust scan, non-power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') + thrust_scan_time_npot=$(echo "$thrust_scan_time_npot + $elapsed_time" | bc) + +done + +average_cpu_scan_time_pot=$(echo "scale=5; $cpu_scan_time_pot / $NUM_TESTS" | bc) +average_cpu_scan_time_npot=$(echo "scale=5; $cpu_scan_time_npot / $NUM_TESTS" | bc) +average_cpu_compact_without_scan_time_pot=$(echo "scale=5; $cpu_compact_without_scan_time_pot / $NUM_TESTS" | bc) +average_cpu_compact_without_scan_time_npot=$(echo "scale=5; $cpu_compact_without_scan_time_npot / $NUM_TESTS" | bc) +average_cpu_compact_with_scan_time=$(echo "scale=5; $cpu_compact_with_scan_time / $NUM_TESTS" | bc) + +average_naive_scan_time_pot=$(echo "scale=5; $naive_scan_time_pot / $NUM_TESTS" | bc) +average_naive_scan_time_npot=$(echo "scale=5; $naive_scan_time_npot / $NUM_TESTS" | bc) + +average_efficient_scan_time_pot=$(echo "scale=5; $efficient_scan_time_pot / $NUM_TESTS" | bc) +average_efficient_scan_time_npot=$(echo "scale=5; $efficient_scan_time_npot / $NUM_TESTS" | bc) +average_efficient_compact_time_pot=$(echo "scale=5; $efficient_compact_time_pot / $NUM_TESTS" | bc) +average_efficient_compact_time_npot=$(echo "scale=5; $efficient_compact_time_npot / $NUM_TESTS" | bc) + +average_thrust_scan_time_pot=$(echo "scale=5; $thrust_scan_time_pot / $NUM_TESTS" | bc) +average_thrust_scan_time_npot=$(echo "scale=5; $thrust_scan_time_npot / $NUM_TESTS" | bc) + +# Now I want to write the results to a csv file +echo -e ",CPU,Naive,Efficient,Thrust\n" > performance_results.csv +echo -e "Scan Time Power of 
Two,$average_cpu_scan_time_pot,$average_naive_scan_time_pot,$average_efficient_scan_time_pot,$average_thrust_scan_time_pot\n" >> performance_results.csv +echo -e "Scan Time Non-Power of Two,$average_cpu_scan_time_npot,$average_naive_scan_time_npot,$average_efficient_scan_time_npot,$average_thrust_scan_time_npot\n" >> performance_results.csv +echo -e "Compact Time Power of Two,$average_cpu_compact_without_scan_time_pot,,$average_efficient_compact_time_pot,\n" >> performance_results.csv +echo -e "Compact Time Non-Power of Two,$average_cpu_compact_without_scan_time_npot,,$average_efficient_compact_time_npot,\n" >> performance_results.csv +echo -e "(CPU) Compact Time With Scan,$average_cpu_compact_with_scan_time,,,\n" >> performance_results.csv \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 58d98b9..21f745c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -34,25 +34,46 @@ int main(int argc, char* argv[]) { // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. // At first all cases passed because b && c are all zeroes. 
- zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + + if (argc > 1 && strcmp(argv[1], "cpu") == 0) { + zeroArray(SIZE, b); + printDesc("cpu scan, power-of-two"); + StreamCompaction::CPU::scan(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + // printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("cpu scan, non-power-of-two"); + StreamCompaction::CPU::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + // printArray(NPOT, b, true); + printCmpResult(NPOT, b, c); + } + + + if (argc > 1 && strcmp(argv[1], "naive") == 0) { + // We have to run the cpu scans just to get values to compare against for the naive scan + zeroArray(SIZE, b); + StreamCompaction::CPU::scan(SIZE, b, a); + + zeroArray(SIZE, c); + StreamCompaction::CPU::scan(NPOT, c, a); + + zeroArray(SIZE, c); + printDesc("naive scan, power-of-two"); + StreamCompaction::Naive::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + 
zeroArray(SIZE, c); + printDesc("naive scan, non-power-of-two"); + StreamCompaction::Naive::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(NPOT, b, c); + } + /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan onesArray(SIZE, c); @@ -60,40 +81,52 @@ int main(int argc, char* argv[]) { StreamCompaction::Naive::scan(SIZE, c, a); printArray(SIZE, c, true); */ - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - // printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); + if 
(argc > 1 && strcmp(argv[1], "efficient") == 0) {
+        // We have to run the cpu scans just to get values to compare against for the efficient scan
+        zeroArray(SIZE, b);
+        StreamCompaction::CPU::scan(SIZE, b, a);
+
+        zeroArray(SIZE, c);
+        StreamCompaction::CPU::scan(NPOT, c, a);
+
+        zeroArray(SIZE, c);
+        printDesc("work-efficient scan, power-of-two");
+        StreamCompaction::Efficient::scan(SIZE, c, a);
+        printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+        // printArray(SIZE, c, true);
+        printCmpResult(SIZE, b, c);
+
+        zeroArray(SIZE, c);
+        printDesc("work-efficient scan, non-power-of-two");
+        StreamCompaction::Efficient::scan(NPOT, c, a);
+        printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+        // printArray(NPOT, c, true);
+        printCmpResult(NPOT, b, c);
+    }
+
+    if (argc > 1 && strcmp(argv[1], "thrust") == 0) {
+        // We have to run the cpu scans just to get values to compare against for the thrust scan
+        zeroArray(SIZE, b);
+        StreamCompaction::CPU::scan(SIZE, b, a);
+
+        zeroArray(SIZE, c);
+        StreamCompaction::CPU::scan(NPOT, c, a);
+
+        zeroArray(SIZE, c);
+        printDesc("thrust scan, power-of-two");
+        StreamCompaction::Thrust::scan(SIZE, c, a);
+        printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+        //printArray(SIZE, c, true);
+        printCmpResult(SIZE, b, c);
+
+        zeroArray(SIZE, c);
+        printDesc("thrust scan, non-power-of-two");
+        StreamCompaction::Thrust::scan(NPOT, c, a);
+        printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+        //printArray(NPOT, c, true);
+        printCmpResult(NPOT, b, c);
+    }
+
 
     printf("\n");
     printf("*****************************\n");
@@ -108,44 +141,58 @@ int main(int argc, char* argv[]) {
 
     int count, expectedCount, expectedNPOT;
 
-    // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
-    // We use 
b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + if (argc > 1 && strcmp(argv[1], "cpu") == 0) { + // initialize b using StreamCompaction::CPU::compactWithoutScan you implement + // We use b for further comparison. 
Make sure your StreamCompaction::CPU::compactWithoutScan is correct. + zeroArray(SIZE, b); + printDesc("cpu compact without scan, power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + expectedCount = count; + // printArray(count, b, true); + printCmpLenResult(count, expectedCount, b, b); + + zeroArray(SIZE, c); + printDesc("cpu compact without scan, non-power-of-two"); + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + expectedNPOT = count; + // printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + zeroArray(SIZE, c); + printDesc("cpu compact with scan"); + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + // printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + } + + if (argc > 1 && strcmp(argv[1], "efficient") == 0) { + // We have to run the cpu compacts just to get values to compare against for the efficient compact + zeroArray(SIZE, b); + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + expectedCount = count; + + zeroArray(SIZE, c); + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + expectedNPOT = count; + + zeroArray(SIZE, c); + printDesc("work-efficient compact, power-of-two"); + count = StreamCompaction::Efficient::compact(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact, non-power-of-two"); + count = StreamCompaction::Efficient::compact(NPOT, c, a); + 
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + } + system("pause"); // stop Win32 console from closing on exit delete[] a; From 567b88c1e00a1e3bd3dda0c5021e0881fbe21fa1 Mon Sep 17 00:00:00 2001 From: YOUR NAME Date: Tue, 17 Sep 2024 14:52:33 -0400 Subject: [PATCH 16/20] Adds bank conflict avoidance and moves malloc out of timed code --- performance_automator.sh | 5 --- stream_compaction/efficient.cu | 70 +++++++++++++++++++++++----------- 2 files changed, 47 insertions(+), 28 deletions(-) diff --git a/performance_automator.sh b/performance_automator.sh index 0c6b51b..0914632 100755 --- a/performance_automator.sh +++ b/performance_automator.sh @@ -23,8 +23,6 @@ thrust_scan_time_npot=0 for i in $(seq 1 $NUM_TESTS) do echo -e "Test $i\n" - echo -e "CPU Test:\n" - result=$(./bin/cis5650_stream_compaction_test cpu) elapsed_time=$(echo "$result" | grep -A 1 "cpu scan, power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') @@ -42,7 +40,6 @@ do elapsed_time=$(echo "$result" | grep -A 1 "cpu compact with scan" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') cpu_compact_with_scan_time=$(echo "$cpu_compact_with_scan_time + $elapsed_time" | bc) - echo -e "Naive Test:\n" result=$(./bin/cis5650_stream_compaction_test naive) @@ -52,7 +49,6 @@ do elapsed_time=$(echo "$result" | grep -A 1 "naive scan, non-power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') naive_scan_time_npot=$(echo "$naive_scan_time_npot + $elapsed_time" | bc) - echo -e "Efficient Test:\n" result=$(./bin/cis5650_stream_compaction_test efficient) @@ -68,7 +64,6 @@ do elapsed_time=$(echo "$result" | grep -A 1 "efficient compact, non-power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') efficient_compact_time_npot=$(echo "$efficient_compact_time_npot + $elapsed_time" | bc) - echo -e "Thrust Test:\n" result=$(./bin/cis5650_stream_compaction_test thrust) 
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 8dea36b..d142d9a 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -15,15 +15,15 @@ namespace StreamCompaction { const int MAX_BLOCK_SIZE = 1024; // keep this as a power of 2 - void scan(int n_padded, int* dev_data) { + #define LOG_NUM_BANKS 5 + #define CONFLICT_FREE_OFFSET(threadIdx) ((threadIdx) >> LOG_NUM_BANKS) + + void scan(int n_padded, int* dev_data, int* stored_sums, int offset) { int blockSize = std::min((n_padded / 2), MAX_BLOCK_SIZE); dim3 blocksPerGrid = ((n_padded / 2) + blockSize - 1) / blockSize; - int* stored_sums; // temp array used to store last entry per block during upsweep. See kernZeroEntries and kernIncrement for use info. - // Allocate with pinned memory - cudaHostAlloc((void**)&stored_sums, blocksPerGrid.x * sizeof(int), cudaHostAllocDefault); - - kernScan<<>>(n_padded, ilog2ceil(2 * blockSize), dev_data, stored_sums); + int sharedMemorySize = (2 * blockSize + CONFLICT_FREE_OFFSET(2 * blockSize - 1)) * sizeof(int); + kernScan<<>>(n_padded, ilog2ceil(2 * blockSize), dev_data, stored_sums + offset); cudaDeviceSynchronize(); // If the array didn't fit within a single block, we need to collect the individual block scan results, @@ -33,16 +33,15 @@ namespace StreamCompaction { if (n_padded > 2 * blockSize) { // (Recursively) scan the summed blocks array // Can use sum_data as both the input and output pointers for the scan. No issue writing over it. - scan(blocksPerGrid.x, stored_sums); + scan(blocksPerGrid.x, stored_sums + offset, stored_sums + offset, blocksPerGrid.x); // Finally, add scanned sum values back to the original dev_data // In original scan, each thread handled 2 elements. In this step, each handles one, so we need 2x the blocks. 
dim3 kernBlocksPerGrid = 2 * blocksPerGrid.x; - kernIncrement<<>>(n_padded, dev_data, stored_sums); + kernIncrement<<>>(n_padded, dev_data, stored_sums + offset); cudaDeviceSynchronize(); } - cudaFreeHost(stored_sums); } /** @@ -60,12 +59,27 @@ namespace StreamCompaction { cudaMemset(dev_data + n, 0, (n_padded - n) * sizeof(int)); } + int blockSize = std::min((n_padded / 2), MAX_BLOCK_SIZE); + dim3 blocksPerGrid = ((n_padded / 2) + blockSize - 1) / blockSize; + + // Calculate the total amount of memory needed for stored_sums + int stored_sums_size = 0; + for (int i = ilog2ceil(n_padded) - ilog2ceil(MAX_BLOCK_SIZE); i >= 1; i -= ilog2ceil(MAX_BLOCK_SIZE)) { + stored_sums_size += pow(2, i); + } + + // temp array used to store last entry per block during upsweep. See kernScan and kernIncrement for use info. + int* stored_sums; + cudaMalloc((void**)&stored_sums, stored_sums_size * sizeof(int)); + + timer().startGpuTimer(); - scan(n_padded, dev_data); + scan(n_padded, dev_data, stored_sums, 0); timer().endGpuTimer(); cudaMemcpy(odata, dev_data, n_padded * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(dev_data); + cudaFree(stored_sums); } /** @@ -78,10 +92,10 @@ namespace StreamCompaction { // Put the right and left children into shared memory. // Index from dev_data based on *global* position, but put into shared memory as local position (w.r.t. this block) - s_dev_data[2 * threadIdx.x + 1] = + s_dev_data[2 * threadIdx.x + 1 + CONFLICT_FREE_OFFSET(2 * threadIdx.x + 1)] = dev_data[2 * threadIdx.x + 1 + (2 * blockDim.x * blockIdx.x)]; - s_dev_data[2 * threadIdx.x] = + s_dev_data[2 * threadIdx.x + CONFLICT_FREE_OFFSET(2 * threadIdx.x)] = dev_data[2 * threadIdx.x + (2 * blockDim.x * blockIdx.x)]; for (int depth = 0; depth < numLevels; ++depth) { @@ -89,8 +103,8 @@ namespace StreamCompaction { // Make sure the local right-child index this thread will access is in bounds of this block. 
if ((threadIdx.x * (1 << (depth + 1))) + (1 << (depth + 1)) - 1 >= 2 * blockDim.x) continue; - s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << (depth + 1)) - 1] += - s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << depth) - 1]; + s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << (depth + 1)) - 1 + CONFLICT_FREE_OFFSET((threadIdx.x * (1 << (depth + 1))) + (1 << (depth + 1)) - 1)] += + s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << depth) - 1 + CONFLICT_FREE_OFFSET((threadIdx.x * (1 << (depth + 1))) + (1 << depth) - 1)]; } __syncthreads(); @@ -98,8 +112,8 @@ namespace StreamCompaction { // input data is too large to be scanned in a single block. // Then zero out the last entry for the downsweep. if (threadIdx.x == 0) { - stored_sums[blockIdx.x] = s_dev_data[2 * blockDim.x - 1]; - s_dev_data[2 * blockDim.x - 1] = 0; + stored_sums[blockIdx.x] = s_dev_data[2 * blockDim.x + CONFLICT_FREE_OFFSET(2 * blockDim.x - 1) - 1]; + s_dev_data[2 * blockDim.x + CONFLICT_FREE_OFFSET(2 * blockDim.x - 1) - 1] = 0; } for (int depth = numLevels - 1; depth >= 1; --depth) { @@ -107,20 +121,20 @@ namespace StreamCompaction { // Make sure the local right-child index this thread will access is in bounds of this block. 
if ((threadIdx.x * (1 << (depth + 1))) + (1 << (depth + 1)) - 1 >= 2 * blockDim.x) continue; - int leftVal = s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << depth) - 1]; - s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << depth) - 1] = - s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << (depth + 1)) - 1]; + int leftVal = s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << depth) - 1 + CONFLICT_FREE_OFFSET((threadIdx.x * (1 << (depth + 1))) + (1 << depth) - 1)]; + s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << depth) - 1 + CONFLICT_FREE_OFFSET((threadIdx.x * (1 << (depth + 1))) + (1 << depth) - 1)] = + s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << (depth + 1)) - 1 + CONFLICT_FREE_OFFSET((threadIdx.x * (1 << (depth + 1))) + (1 << (depth + 1)) - 1)]; - s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << (depth + 1)) - 1] += leftVal; + s_dev_data[(threadIdx.x * (1 << (depth + 1))) + (1 << (depth + 1)) - 1 + CONFLICT_FREE_OFFSET(threadIdx.x * (1 << (depth + 1)) + (1 << (depth + 1)) - 1)] += leftVal; } __syncthreads(); // On the last iteration, depth = 0, we write to global memory. dev_data[2 * threadIdx.x + (2 * blockDim.x * blockIdx.x)] = - s_dev_data[2 * threadIdx.x + 1]; + s_dev_data[2 * threadIdx.x + 1 + CONFLICT_FREE_OFFSET(2 * threadIdx.x + 1)]; dev_data[2 * threadIdx.x + 1 + (2 * blockDim.x * blockIdx.x)] = - (s_dev_data[2 * threadIdx.x] + s_dev_data[2 * threadIdx.x + 1]); + (s_dev_data[2 * threadIdx.x + CONFLICT_FREE_OFFSET(2 * threadIdx.x)] + s_dev_data[2 * threadIdx.x + 1 + CONFLICT_FREE_OFFSET(2 * threadIdx.x + 1)]); } /** @@ -160,6 +174,16 @@ namespace StreamCompaction { // trueFalseArray with a different value here. 
cudaMemset(trueFalseArray + n, 0, (n_padded - n) * sizeof(int)); } + + // Calculate the total amount of memory needed for stored_sums + int stored_sums_size = 0; + for (int i = ilog2ceil(n_padded) - ilog2ceil(MAX_BLOCK_SIZE); i >= 1; i -= ilog2ceil(MAX_BLOCK_SIZE)) { + stored_sums_size += pow(2, i); + } + + // temp array used to store last entry per block during upsweep. See kernScan and kernIncrement for use info. + int* stored_sums; + cudaMalloc((void**)&stored_sums, stored_sums_size * sizeof(int)); timer().startGpuTimer(); @@ -168,7 +192,7 @@ namespace StreamCompaction { StreamCompaction::Common::kernMapToBoolean<<>>(n, trueFalseArray, dev_idata); cudaDeviceSynchronize(); - scan(n_padded, trueFalseArray); // scan happens in-place, so trueFalseArray is now scanned + scan(n_padded, trueFalseArray, stored_sums, 0); // scan happens in-place, so trueFalseArray is now scanned StreamCompaction::Common::kernScatter<<>>(n, dev_odata, dev_idata, trueFalseArray); cudaDeviceSynchronize(); From 497637be23238f528b2187d9198c295ddb1a3fda Mon Sep 17 00:00:00 2001 From: YOUR NAME Date: Tue, 17 Sep 2024 20:39:50 -0400 Subject: [PATCH 17/20] Adds initial radix sort implementation --- src/main.cpp | 29 ++++++++- stream_compaction/CMakeLists.txt | 2 + stream_compaction/efficient.h | 1 + stream_compaction/radix.cu | 103 +++++++++++++++++++++++++++++++ stream_compaction/radix.h | 13 ++++ 5 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 stream_compaction/radix.cu create mode 100644 stream_compaction/radix.h diff --git a/src/main.cpp b/src/main.cpp index 21f745c..40bdc95 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,9 +11,10 @@ #include #include #include +#include #include "testing_helpers.hpp" -const int SIZE = 1 << 20; // feel free to change the size of array +const int SIZE = 1 << 6; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -193,6 +194,32 @@ int main(int argc, 
char* argv[]) { printCmpLenResult(count, expectedNPOT, b, c); } + printf("\n"); + printf("*****************************\n"); + printf("** RADIX TESTS **\n"); + printf("*****************************\n"); + + // Radix tests + genArray(SIZE - 1, a, 255); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + if (argc > 1 && strcmp(argv[1], "radix") == 0) { + zeroArray(SIZE, b); + printDesc("radix sort, power-of-two"); + StreamCompaction::Radix::sort(SIZE, b, a); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, b, false); + + // Use std::sort to sort array a and store the result in c so we can compare against it + std::copy(a, a + SIZE, c); + std::sort(c, c + SIZE); + + printArray(SIZE, c, false); + + // Compare the sorted array from Radix sort with the sorted array from std::sort + printCmpResult(SIZE, b, c); + } system("pause"); // stop Win32 console from closing on exit delete[] a; diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index e0ec27c..81e7a15 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -4,6 +4,7 @@ set(headers "naive.h" "efficient.h" "thrust.h" + "radix.h" ) set(sources @@ -12,6 +13,7 @@ set(sources "naive.cu" "efficient.cu" "thrust.cu" + "radix.cu" ) list(SORT headers) diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 355c42a..c76a5a5 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -7,6 +7,7 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + void scan(int n_padded, int* dev_data, int* stored_sums, int offset); __global__ void kernScan(int n, int numLevels, int* dev_data, int* stored_sums); __global__ void kernIncrement(int n, int* dev_data, int* sum_data); int compact(int n, int *odata, const int *idata); diff 
--git a/stream_compaction/radix.cu b/stream_compaction/radix.cu new file mode 100644 index 0000000..6c23a4c --- /dev/null +++ b/stream_compaction/radix.cu @@ -0,0 +1,103 @@ +#include +#include +#include "common.h" +#include "radix.h" +#include +#include "efficient.h" + +namespace StreamCompaction { + namespace Radix { + using StreamCompaction::Common::PerformanceTimer; + + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + const int MAX_BLOCK_SIZE = 1024; // keep this as a power of 2 + + void sort(int n, int *odata, const int *idata) { + int numBitsInInt = sizeof(int) * CHAR_BIT; + int n_padded = pow(2, ilog2ceil(n)); + + int* dev_idata; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + int* dev_odata; + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + + int* bit_mapped_data; + cudaMalloc((void**)&bit_mapped_data, n * sizeof(int)); + + int* scanned_bit_mapped_data; + cudaMalloc((void**)&scanned_bit_mapped_data, n_padded * sizeof(int)); + cudaMemset(scanned_bit_mapped_data, 0, n_padded * sizeof(int)); + + + // Calculate the total amount of memory needed for stored_sums (for the scan step) + int stored_sums_size = 0; + for (int i = ilog2ceil(n_padded) - ilog2ceil(MAX_BLOCK_SIZE); i >= 1; i -= ilog2ceil(MAX_BLOCK_SIZE)) { + stored_sums_size += pow(2, i); + } + + // temp array used to store last entry per block during upsweep. See kernScan and kernIncrement for use info. 
+ int* stored_sums; + cudaMalloc((void**)&stored_sums, stored_sums_size * sizeof(int)); + + int blockSize = 1024; + dim3 blocksPerGrid((n + blockSize - 1) / blockSize); + + timer().startGpuTimer(); + + for (int i = 0; i < numBitsInInt; ++i) { + kernMapBits<<>>(n, bit_mapped_data, dev_idata, i); + cudaDeviceSynchronize(); + + // Scan operates in place, so we need to first copy the bit_mapped_data to scanned_bit_mapped_data so we don't lose it + cudaMemcpy(scanned_bit_mapped_data, bit_mapped_data, n * sizeof(int), cudaMemcpyDeviceToDevice); + + StreamCompaction::Efficient::scan(n_padded, scanned_bit_mapped_data, stored_sums, 0); + cudaDeviceSynchronize(); + + kernSort<<>>(n, dev_odata, dev_idata, bit_mapped_data, scanned_bit_mapped_data); + cudaDeviceSynchronize(); + + // Swap the pointers so that the output of the current iteration becomes the input of the next iteration + int* temp = dev_idata; + dev_idata = dev_odata; + dev_odata = temp; + } + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(bit_mapped_data); + cudaFree(scanned_bit_mapped_data); + cudaFree(stored_sums); + } + + __global__ void kernMapBits(int n, int* bit_mapped_data, const int* dev_data, int bitshift) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + bit_mapped_data[index] = !((dev_data[index] >> bitshift) & 1); + } + + __global__ void kernSort(int n, int* dev_odata, const int* dev_idata, const int* bit_mapped_data, const int* scanned_bit_mapped_data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + int totalFalses = scanned_bit_mapped_data[n - 1] + bit_mapped_data[n - 1]; + int trueIndex = index - scanned_bit_mapped_data[index] + totalFalses; + int finalIndex = bit_mapped_data[index] ? 
scanned_bit_mapped_data[index] : trueIndex; + + dev_odata[finalIndex] = dev_idata[index]; + } + + } +} \ No newline at end of file diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h new file mode 100644 index 0000000..b20f00f --- /dev/null +++ b/stream_compaction/radix.h @@ -0,0 +1,13 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace Radix { + StreamCompaction::Common::PerformanceTimer& timer(); + + void sort(int n, int *odata, const int *idata); + __global__ void kernMapBits(int n, int *odata, const int *idata, int bitNumber); + __global__ void kernSort(int n, int *dev_odata, const int* dev_idata, const int *bit_mapped_data, const int *scanned_bit_mapped_data); + } +} From b3a035bf5abdf18ad0ac1646c243cdf75242bef5 Mon Sep 17 00:00:00 2001 From: YOUR NAME Date: Tue, 17 Sep 2024 22:04:17 -0400 Subject: [PATCH 18/20] Finish radix plus small fix to efficient scan --- stream_compaction/efficient.cu | 6 +++--- stream_compaction/radix.cu | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index d142d9a..4dc8ff0 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -17,7 +17,7 @@ namespace StreamCompaction { #define LOG_NUM_BANKS 5 #define CONFLICT_FREE_OFFSET(threadIdx) ((threadIdx) >> LOG_NUM_BANKS) - + void scan(int n_padded, int* dev_data, int* stored_sums, int offset) { int blockSize = std::min((n_padded / 2), MAX_BLOCK_SIZE); dim3 blocksPerGrid = ((n_padded / 2) + blockSize - 1) / blockSize; @@ -70,7 +70,7 @@ namespace StreamCompaction { // temp array used to store last entry per block during upsweep. See kernScan and kernIncrement for use info. 
int* stored_sums; - cudaMalloc((void**)&stored_sums, stored_sums_size * sizeof(int)); + cudaMalloc((void**)&stored_sums, std::max(1, stored_sums_size) * sizeof(int)); timer().startGpuTimer(); @@ -112,7 +112,7 @@ namespace StreamCompaction { // input data is too large to be scanned in a single block. // Then zero out the last entry for the downsweep. if (threadIdx.x == 0) { - stored_sums[blockIdx.x] = s_dev_data[2 * blockDim.x + CONFLICT_FREE_OFFSET(2 * blockDim.x - 1) - 1]; + stored_sums[blockIdx.x] = s_dev_data[2 * blockDim.x - 1 + CONFLICT_FREE_OFFSET(2 * blockDim.x - 1)]; s_dev_data[2 * blockDim.x + CONFLICT_FREE_OFFSET(2 * blockDim.x - 1) - 1] = 0; } diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu index 6c23a4c..b64ee82 100644 --- a/stream_compaction/radix.cu +++ b/stream_compaction/radix.cu @@ -4,6 +4,7 @@ #include "radix.h" #include #include "efficient.h" +#include namespace StreamCompaction { namespace Radix { @@ -44,7 +45,7 @@ namespace StreamCompaction { // temp array used to store last entry per block during upsweep. See kernScan and kernIncrement for use info. 
int* stored_sums; - cudaMalloc((void**)&stored_sums, stored_sums_size * sizeof(int)); + cudaMalloc((void**)&stored_sums, std::max(stored_sums_size, 1) * sizeof(int)); int blockSize = 1024; dim3 blocksPerGrid((n + blockSize - 1) / blockSize); @@ -57,6 +58,7 @@ namespace StreamCompaction { // Scan operates in place, so we need to first copy the bit_mapped_data to scanned_bit_mapped_data so we don't lose it cudaMemcpy(scanned_bit_mapped_data, bit_mapped_data, n * sizeof(int), cudaMemcpyDeviceToDevice); + cudaDeviceSynchronize(); StreamCompaction::Efficient::scan(n_padded, scanned_bit_mapped_data, stored_sums, 0); cudaDeviceSynchronize(); From 78fa343b39d969d53583b64a40fbc4fcbfc106e1 Mon Sep 17 00:00:00 2001 From: YOUR NAME Date: Tue, 17 Sep 2024 23:30:41 -0400 Subject: [PATCH 19/20] Minor changes --- performance_automator.sh | 113 +++++++++++++++++---------------- src/main.cpp | 6 +- stream_compaction/efficient.cu | 2 +- stream_compaction/naive.cu | 8 ++- 4 files changed, 69 insertions(+), 60 deletions(-) diff --git a/performance_automator.sh b/performance_automator.sh index 0914632..6899fe1 100755 --- a/performance_automator.sh +++ b/performance_automator.sh @@ -1,24 +1,24 @@ #!/bin/bash # Run the performance test 10x each, and write the values to a csv file -NUM_TESTS=10 +NUM_TESTS=20 -cpu_scan_time_pot=0 -cpu_scan_time_npot=0 -cpu_compact_without_scan_time_pot=0 -cpu_compact_without_scan_time_npot=0 -cpu_compact_with_scan_time=0 +cpu_scan_time_pot=() +cpu_scan_time_npot=() +cpu_compact_without_scan_time_pot=() +cpu_compact_without_scan_time_npot=() +cpu_compact_with_scan_time=() -naive_scan_time_pot=0 -naive_scan_time_npot=0 +naive_scan_time_pot=() +naive_scan_time_npot=() -efficient_scan_time_pot=0 -efficient_scan_time_npot=0 -efficient_compact_time_pot=0 -efficient_compact_time_npot=0 +efficient_scan_time_pot=() +efficient_scan_time_npot=() +efficient_compact_time_pot=() +efficient_compact_time_npot=() -thrust_scan_time_pot=0 -thrust_scan_time_npot=0 
+thrust_scan_time_pot=() +thrust_scan_time_npot=() for i in $(seq 1 $NUM_TESTS) do @@ -26,76 +26,83 @@ do result=$(./bin/cis5650_stream_compaction_test cpu) elapsed_time=$(echo "$result" | grep -A 1 "cpu scan, power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') - cpu_scan_time_pot=$(echo "$cpu_scan_time_pot + $elapsed_time" | bc) + cpu_scan_time_pot+=($elapsed_time) elapsed_time=$(echo "$result" | grep -A 1 "cpu scan, non-power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') - cpu_scan_time_npot=$(echo "$cpu_scan_time_npot + $elapsed_time" | bc) + cpu_scan_time_npot+=($elapsed_time) elapsed_time=$(echo "$result" | grep -A 1 "cpu compact without scan, power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') - cpu_compact_without_scan_time_pot=$(echo "$cpu_compact_without_scan_time_pot + $elapsed_time" | bc) + cpu_compact_without_scan_time_pot+=($elapsed_time) elapsed_time=$(echo "$result" | grep -A 1 "cpu compact without scan, non-power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') - cpu_compact_without_scan_time_npot=$(echo "$cpu_compact_without_scan_time_npot + $elapsed_time" | bc) + cpu_compact_without_scan_time_npot+=($elapsed_time) elapsed_time=$(echo "$result" | grep -A 1 "cpu compact with scan" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') - cpu_compact_with_scan_time=$(echo "$cpu_compact_with_scan_time + $elapsed_time" | bc) - + cpu_compact_with_scan_time+=($elapsed_time) result=$(./bin/cis5650_stream_compaction_test naive) elapsed_time=$(echo "$result" | grep -A 1 "naive scan, power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') - naive_scan_time_pot=$(echo "$naive_scan_time_pot + $elapsed_time" | bc) + naive_scan_time_pot+=($elapsed_time) elapsed_time=$(echo "$result" | grep -A 1 "naive scan, non-power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') - naive_scan_time_npot=$(echo "$naive_scan_time_npot + $elapsed_time" | bc) - + naive_scan_time_npot+=($elapsed_time) result=$(./bin/cis5650_stream_compaction_test efficient) 
elapsed_time=$(echo "$result" | grep -A 1 "efficient scan, power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') - efficient_scan_time_pot=$(echo "$efficient_scan_time_pot + $elapsed_time" | bc) + efficient_scan_time_pot+=($elapsed_time) elapsed_time=$(echo "$result" | grep -A 1 "efficient scan, non-power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') - efficient_scan_time_npot=$(echo "$efficient_scan_time_npot + $elapsed_time" | bc) + efficient_scan_time_npot+=($elapsed_time) elapsed_time=$(echo "$result" | grep -A 1 "efficient compact, power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') - efficient_compact_time_pot=$(echo "$efficient_compact_time_pot + $elapsed_time" | bc) + efficient_compact_time_pot+=($elapsed_time) elapsed_time=$(echo "$result" | grep -A 1 "efficient compact, non-power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') - efficient_compact_time_npot=$(echo "$efficient_compact_time_npot + $elapsed_time" | bc) - + efficient_compact_time_npot+=($elapsed_time) result=$(./bin/cis5650_stream_compaction_test thrust) elapsed_time=$(echo "$result" | grep -A 1 "thrust scan, power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') - thrust_scan_time_pot=$(echo "$thrust_scan_time_pot + $elapsed_time" | bc) + thrust_scan_time_pot+=($elapsed_time) elapsed_time=$(echo "$result" | grep -A 1 "thrust scan, non-power-of-two" | grep -oP 'elapsed time: \K[0-9]+\.[0-9]+') - thrust_scan_time_npot=$(echo "$thrust_scan_time_npot + $elapsed_time" | bc) + thrust_scan_time_npot+=($elapsed_time) done -average_cpu_scan_time_pot=$(echo "scale=5; $cpu_scan_time_pot / $NUM_TESTS" | bc) -average_cpu_scan_time_npot=$(echo "scale=5; $cpu_scan_time_npot / $NUM_TESTS" | bc) -average_cpu_compact_without_scan_time_pot=$(echo "scale=5; $cpu_compact_without_scan_time_pot / $NUM_TESTS" | bc) -average_cpu_compact_without_scan_time_npot=$(echo "scale=5; $cpu_compact_without_scan_time_npot / $NUM_TESTS" | bc) -average_cpu_compact_with_scan_time=$(echo "scale=5; 
$cpu_compact_with_scan_time / $NUM_TESTS" | bc) - -average_naive_scan_time_pot=$(echo "scale=5; $naive_scan_time_pot / $NUM_TESTS" | bc) -average_naive_scan_time_npot=$(echo "scale=5; $naive_scan_time_npot / $NUM_TESTS" | bc) - -average_efficient_scan_time_pot=$(echo "scale=5; $efficient_scan_time_pot / $NUM_TESTS" | bc) -average_efficient_scan_time_npot=$(echo "scale=5; $efficient_scan_time_npot / $NUM_TESTS" | bc) -average_efficient_compact_time_pot=$(echo "scale=5; $efficient_compact_time_pot / $NUM_TESTS" | bc) -average_efficient_compact_time_npot=$(echo "scale=5; $efficient_compact_time_npot / $NUM_TESTS" | bc) - -average_thrust_scan_time_pot=$(echo "scale=5; $thrust_scan_time_pot / $NUM_TESTS" | bc) -average_thrust_scan_time_npot=$(echo "scale=5; $thrust_scan_time_npot / $NUM_TESTS" | bc) - -# Now I want to write the results to a csv file +calculate_median() { + arr=($(printf '%s\n' "${@}" | sort -n)) + len=${#arr[@]} + if (( $len % 2 == 0 )); then + echo "scale=5; (${arr[$len/2-1]} + ${arr[$len/2]}) / 2" | bc + else + echo "${arr[$len/2]}" + fi +} + +median_cpu_scan_time_pot=$(calculate_median "${cpu_scan_time_pot[@]}") +median_cpu_scan_time_npot=$(calculate_median "${cpu_scan_time_npot[@]}") +median_cpu_compact_without_scan_time_pot=$(calculate_median "${cpu_compact_without_scan_time_pot[@]}") +median_cpu_compact_without_scan_time_npot=$(calculate_median "${cpu_compact_without_scan_time_npot[@]}") +median_cpu_compact_with_scan_time=$(calculate_median "${cpu_compact_with_scan_time[@]}") + +median_naive_scan_time_pot=$(calculate_median "${naive_scan_time_pot[@]}") +median_naive_scan_time_npot=$(calculate_median "${naive_scan_time_npot[@]}") + +median_efficient_scan_time_pot=$(calculate_median "${efficient_scan_time_pot[@]}") +median_efficient_scan_time_npot=$(calculate_median "${efficient_scan_time_npot[@]}") +median_efficient_compact_time_pot=$(calculate_median "${efficient_compact_time_pot[@]}") +median_efficient_compact_time_npot=$(calculate_median 
"${efficient_compact_time_npot[@]}") + +median_thrust_scan_time_pot=$(calculate_median "${thrust_scan_time_pot[@]}") +median_thrust_scan_time_npot=$(calculate_median "${thrust_scan_time_npot[@]}") + +# Now write the results to a csv file echo -e ",CPU,Naive,Efficient,Thrust\n" > performance_results.csv -echo -e "Scan Time Power of Two,$average_cpu_scan_time_pot,$average_naive_scan_time_pot,$average_efficient_scan_time_pot,$average_thrust_scan_time_pot\n" >> performance_results.csv -echo -e "Scan Time Non-Power of Two,$average_cpu_scan_time_npot,$average_naive_scan_time_npot,$average_efficient_scan_time_npot,$average_thrust_scan_time_npot\n" >> performance_results.csv -echo -e "Compact Time Power of Two,$average_cpu_compact_without_scan_time_pot,,$average_efficient_compact_time_pot,\n" >> performance_results.csv -echo -e "Compact Time Non-Power of Two,$average_cpu_compact_without_scan_time_npot,,$average_efficient_compact_time_npot,\n" >> performance_results.csv -echo -e "(CPU) Compact Time With Scan,$average_cpu_compact_with_scan_time,,,\n" >> performance_results.csv \ No newline at end of file +echo -e "Scan Time Power of Two,$median_cpu_scan_time_pot,$median_naive_scan_time_pot,$median_efficient_scan_time_pot,$median_thrust_scan_time_pot\n" >> performance_results.csv +echo -e "Scan Time Non-Power of Two,$median_cpu_scan_time_npot,$median_naive_scan_time_npot,$median_efficient_scan_time_npot,$median_thrust_scan_time_npot\n" >> performance_results.csv +echo -e "Compact Time Power of Two,$median_cpu_compact_without_scan_time_pot,,$median_efficient_compact_time_pot,\n" >> performance_results.csv +echo -e "Compact Time Non-Power of Two,$median_cpu_compact_without_scan_time_npot,,$median_efficient_compact_time_npot,\n" >> performance_results.csv +echo -e "(CPU) Compact Time With Scan,$median_cpu_compact_with_scan_time,,,\n" >> performance_results.csv \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 40bdc95..062d011 100644 --- a/src/main.cpp 
+++ b/src/main.cpp @@ -14,7 +14,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 6; // feel free to change the size of array +const int SIZE = 1 << 20; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -209,13 +209,13 @@ int main(int argc, char* argv[]) { printDesc("radix sort, power-of-two"); StreamCompaction::Radix::sort(SIZE, b, a); printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - printArray(SIZE, b, false); + // printArray(SIZE, b, true); // Use std::sort to sort array a and store the result in c so we can compare against it std::copy(a, a + SIZE, c); std::sort(c, c + SIZE); - printArray(SIZE, c, false); + // printArray(SIZE, c, true); // Compare the sorted array from Radix sort with the sorted array from std::sort printCmpResult(SIZE, b, c); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 4dc8ff0..024982b 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -13,7 +13,7 @@ namespace StreamCompaction { return timer; } - const int MAX_BLOCK_SIZE = 1024; // keep this as a power of 2 + const int MAX_BLOCK_SIZE = 512; // keep this as a power of 2. Tested and found 512 is optimal. #define LOG_NUM_BANKS 5 #define CONFLICT_FREE_OFFSET(threadIdx) ((threadIdx) >> LOG_NUM_BANKS) diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index e1332c1..d28fa12 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -12,11 +12,13 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + const int MAX_BLOCK_SIZE = 256; /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ + void scan(int n, int *odata, const int *idata) { int nearestPowerOfTwo = pow(2, ilog2ceil(n)); int* outputBuf; @@ -34,7 +36,7 @@ namespace StreamCompaction { timer().startGpuTimer(); for (int depth = 1; depth <= ilog2ceil(nearestPowerOfTwo); ++depth) { - int blockSize = std::min(nearestPowerOfTwo, 1024); // cap at 1024 threads, hardware limitation. + int blockSize = std::min(nearestPowerOfTwo, MAX_BLOCK_SIZE); // cap at 1024 threads, hardware limitation. dim3 blocksPerGrid((nearestPowerOfTwo + blockSize - 1) / blockSize); // note integer division naiveScan<<>>(nearestPowerOfTwo, depth, inputBuf, outputBuf); @@ -42,7 +44,7 @@ namespace StreamCompaction { } // Convert inclusive scan to exclusive scan - int blockSize = std::min(n, 1024); + int blockSize = std::min(n, MAX_BLOCK_SIZE); dim3 blocksForShift((nearestPowerOfTwo + blockSize - 1) / blockSize); shiftRight<<>>(n, inputBuf, outputBuf); From 54ad2083c3f7dcc2ca3fcec17cb0681873594a07 Mon Sep 17 00:00:00 2001 From: YOUR NAME Date: Tue, 17 Sep 2024 23:54:50 -0400 Subject: [PATCH 20/20] README and images --- README.md | 124 +++++++++++++++++- ...Compact Times for various-sized arrays.svg | 1 + ...Prescan times for various-sized arrays.svg | 1 + img/downsweep.jpg | Bin 0 -> 81397 bytes img/perf1.webp | Bin 0 -> 5274 bytes img/perf2.webp | Bin 0 -> 5990 bytes performance_automator.sh | 11 +- 7 files changed, 126 insertions(+), 11 deletions(-) create mode 100644 img/Compact Times for various-sized arrays.svg create mode 100644 img/Prescan times for various-sized arrays.svg create mode 100644 img/downsweep.jpg create mode 100644 img/perf1.webp create mode 100644 img/perf2.webp diff --git a/README.md b/README.md index 0e38ddb..0ba3443 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,124 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. 
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Matt Schwartz + * [LinkedIn](https://www.linkedin.com/in/matthew-schwartz-37019016b/) + * [Personal website](https://mattzschwartz.web.app/) +* Tested on: Windows 10 22H2, Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz, NVIDIA GeForce RTX 2060 -### (TODO: Your README) +

+ Downsweep +

-Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +*Image source [GPU Gems 3 Chapter 39](https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda)* + +# Background + +In this repository, I implement several variations of a prescan algorithm: a CPU-based version, a naive implementation on the GPU, and a work-efficient GPU implementation, as well as the built-in Thrust library implementation (for comparision). Aftewards, I build upon this prescan algorithm to develop a parallel stream-compaction implementation (again, compared against the CPU). + +Prescans and stream compaction have a variety of uses in data analysis and computer graphics. To learn more about these algorithms, give the [GPU Gems 3 chapter](https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda) a read. + +# The Data + +Let's take a high-level glance at the data, and then follow up with more detail on some of the steps taken along the way to optimize these results. + +A note on measurement taking: all tests are repeated 20 times, and the median data point from each test is recorded and plotted. I chose to use the median because I found that there was significant variance in the results from test to test, and I wanted to lessen the impact of outliers. + +Before recording any data, I ran the (automated) test suite with varying kernel block sizes (for the naive and work-efficient algorithms), and used that data to determine optimal sizes for each algorithm. This way, we can compare apples-to-apples. For the naive scan, I found this to be around 256 threads per block. For the work-efficient scan, I found this to be around 512 threads per block. In both cases, performance was minimally affected by block sizes ranging from 128 - 1024. 
+ +## Prefix Scan + +Perhaps unsurprisingly, the Thrust library is unanimously the winner at every array size, ranging from ~1 thousand elements to ~16 million. However, my work-efficient implementation isn't so far off - about 2x slower than the Thrust implementation (in fact, each successively slower algorithm is about a factor of 2x slower than the previous; a fun coincidence). + +Interestingly, the CPU scan is actually faster across the board than the naive scan! This is attributable both to the greater algorithmic complexity of the naive scan (it has to do far more operations to achieve the same result, albeit in parallel), and to its excessive use of global memory. + +Originally, the work-efficient scan was also slower than the CPU scan, by quite a bit. There were three major improvements I made to get the time down: +1. Instead of using global memory reads, I switched to shared memory. This required completely restructing the scan algorithm so that it could run in independent blocks, and then use those independent partial scan results to compute the full scan. Also, instead of doing the two phases of the scan (upsweep and downsweep) in separate kernel invocations, it suddenly became advantageous to do it in a single invocation; this way, global memory only has to be read from and written to once. +2. Shared memory comes with a caveat; bank conflicts. To address this, I added a variable offset to where data is being stored and read from in shared memory. (see images after graph for a visual!) +3. To accomodate arbritrarily large arrays, there's a recursive step that computes and joins the results of different partial scans together. I was using a pinned memory transfer here for some data that needs to be carried over between recursive invocations, but I realized I could find out a priori how much memory I would need for all iterations, and pre-allocate it. +4. (Bonus) Because I hate my readers, I tried to inline all math expressions in my kernels. 
Just kidding - I did it to avoid the use of extra registers, not to hurt readability :( + + +

+ Prescan +

+ +According to NSight Compute, adding a variable offset to avoid shared memory bank conflicts decreased my "excessive shared wavefronts" from 88.7% to 12.2%! + +

+ Shared memory access before optimization +

+ +

+ Shared memory access after optimization +

+ +## Compact + +Not much to say here - since the compact algorithm depends on the prescan, most of the speed of the GPU compaction is due to the optimizations crafted in the scan itself! + +

+ Prescan +

+ + +# Sample output (2^20 elements) + +``` +**************** +** SCAN TESTS ** +**************** + [ 27 36 43 45 26 29 25 32 2 46 49 26 19 ... 5 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.6881ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 0.505ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 1.11667ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.898528ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.435392ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.340064ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.294304ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.264192ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 0 3 1 2 1 1 2 0 2 1 2 1 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 2.191ms (std::chrono Measured) + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 2.0626ms (std::chrono Measured) + passed +==== cpu compact with scan ==== + elapsed time: 5.9357ms (std::chrono Measured) + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.717088ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.518976ms (CUDA Measured) + passed + +***************************** +** RADIX TESTS ** +***************************** + [ 147 231 53 245 206 59 5 137 97 116 69 136 169 ... 
10 0 ] +==== radix sort, power-of-two ==== + elapsed time: 22.694ms (CUDA Measured) + passed +``` \ No newline at end of file diff --git a/img/Compact Times for various-sized arrays.svg b/img/Compact Times for various-sized arrays.svg new file mode 100644 index 0000000..4b2e055 --- /dev/null +++ b/img/Compact Times for various-sized arrays.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/img/Prescan times for various-sized arrays.svg b/img/Prescan times for various-sized arrays.svg new file mode 100644 index 0000000..47433ee --- /dev/null +++ b/img/Prescan times for various-sized arrays.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/img/downsweep.jpg b/img/downsweep.jpg new file mode 100644 index 0000000000000000000000000000000000000000..88be7ff7ab58a060c91697d96e6a10b57a35c0cd GIT binary patch literal 81397 zcmeEvXH-;6v*?}~a?VkLNRphREdDU;z}>f#RKs1@Cb|X@`~~C z(DU+%@d=0t2!d4rSQg|bPg&4E+J)?7!TyAq&IrMT27&#cMf;V12-#2mA+*2p56a>P zpAcr~pKTLYeQyp>2JfIGzTc65m7($(7P$f7oR7s^P6L1-Hvzo!9AF)MRuee^+yQVf zFfcGNa4<1(2(huS2}$vBaPUbfh>1ywi75zi&eMVS*D-9rD0)!ZV8W0SFFaS_O2#gSdoC1f9foz9Jfn1_O&J)P+S%QHIf`W>M zj)94V4XVLk3n2g$_D3NB06~GkpeRsObTkZ97!E&JNC-o@NW_aOqisRV;C7o2jU+bv zx$GrIQk_;yegXH8J~?!zINi^4U#vU?$#N<}ndRg4tQFdxcnXpC>(AS4SKfIw;I;5o zktHGbZdGFY;K!Z3>eoYyy9T!2VM+NlZ-$rlgp~~KeC{O|)OL(4?-K%0D9A0!IZx^0b-%lZF491D2mGAjbh5*m<9XfE2K+-&L7ED_O$njVCc8x-XJEWXv5@M|1c_a4BPe`yX&FO^Qp=2;rA2B^lGu_zSyZjsp z^dSL-LzmCy`71q2s~-uBw(;)x)da1|-&0)O0MM?0lWS}nY%kTu;uihh7vJac+m{__ z6sPf#8X6wnlWax(5~j98JCqv%X%zvgK`MtrdPqQbr;#-MamHOGg6)S3r|rsEmkb_3&&8F*Glr5wGkt{SyvWDi}xT=>4FZNy1Js&{?QPihqj zsGLU5lpj$SkfT#g)n^3E*TiOD>hYL<(Zy&`Vfm&xI&>!K}QBjBQli9gCy58JV z;rZfPI;wce8e4EbUxaI>lFKTkZctm0?lBV_afv#5q%vzkY|FL!aR#H{QINt)U{qOW z-iWhJ(rM_<^}Lzc8m`bi;j4IDq>UcID2GWZP;Vq~=XmO`<0F_q7vqlS<0R*|Q=w7N zd8w4AFLzdWX6&}(Uai)V4R=_3K6UmlfLHqoie`^%=DFn0n3M)2mAdscxTci8a+_SK z=u$`9l*&E%KA7?!g9-m~KA2Qn3>(jrv?ue{gEHJ6pNhhKW?GykX&Rl36^H$#GLM%J z8Eh=9{Jfo4b_~^u{O`Ug4jq?ZpRzq1o$0NQMo1NGp0Jd-RJ17h)!tKBtQP8yf^#xu 
zw_C6D?&EgvoWN(Yo%YR-Tb7z!mPxxa1{%E0)!)3>tgk2GeSKPb=ZZv#*=A6E4MHy5 z+Yqve1dzSE`rWnUltYpuS`%=t`tj{$Y|ini&6BOHD&t|?uxFDTS)5e{*lV0&aeIDy zh(prNlCLMHnYl>d^ZlUbpFbuw?A;UX9-oX%WomDlFq!bKo&31+kn9~}lz_W&)IG$- zs`;AaB=G8P;buBU7q(A(r4bAfEm znw!}S>$ysx33f%sEDI04u|Z7sSV;uwGPfuNl^o(CfiBZh8>_iqsxM&^M%Ss`ZVcBD zthU&e*E;R-_m}Tv=AOz91}U`pdpC{YIftS%oSMi@oKo+nX6`9+AF9WkM6@6Q)x0<2 z?7{o)e4n3F*iXnocC&)<8`zpA@MiL-He8+?Jn<;19A4#pI83g6O|j1bHrX{oHzcL( zRK9}*RwXn)8edIBWfmId zms07~9o?TMz5k8T(YU(_h&-71 z7`8bs(eVwTbA&BD{Gi`|js+E6G+-vp|KVYfMqAfv z%D&{MErbG&$21?D)7Kwu-=A}AYwBBsTU24eMADw*2(oWizo+8aTBg%fK+Kd!ID8-K zJ?6~&0RF*lX}vU3HM;QAOU_rFH#}Z3zuP0|H#dK}^kQ?Yq5Pod>bs|-X?qetM(y>N zrps{n9l%s0dg7$u`%}zYV9DTe3(1V}TkGc?PUSBoqU~J+yoE>h#`L2`k-%c{hxcW* z6(wu$$EwpR?vA0xt$nHu$joyYsOXW~xn2uyzO}0e+8DLlXvZNKp@+Jyef)kB9QID` zd{o21R1&&bQ0Nuy_1X|KBtW!a?twicN1d!Yx^x=5f4$d)v%_{TF!Ni;4k~9}O_vcFRtrYLYg_Hjaw@kMIDsPgO^xL^zLex^ zLZ%x9;1YWACeOy1l0<*i+Gs*J-nIFpnoM}28F6T^xicEQe*N)$kHK-0X!LX|Jo@P- zQ0Wn1uIW?LaoH^7jd^%a)%2Sg3v00rbJA;`{D;lSMe9hQ$AMDmspe=bB~32=frgz< zU!pE-b9`-ssl~n-)1koOH0Y~OJ-xl?)z_CUmf_&rL^q!#!HyRke8weuI`;gKz?bS$ z;y!zBeBqA>drpVltG6e7f(Ch|*6+7$Dk_~c=q$w8xT@{9kW(}FRGL}fbT&QLBCWi1 zX`;TK+w1WJ2NIBd6KEM#KRUD!d2(T3=k9}DcSz}6z?zd^vYbXYbh?A-(Mg*BacXmDs;&*`DmP&(H|f9kOhv670dSnHjSiK}B-{%De* zHDEJ0Y_?HXT^Sn~I*>OJlzFmT&mpXvtm{nY@HD>W9lZJTp18)4X-}1C^AjI-^t=h( zbQ3OLkJ>9EH}SZ_Dt!uEDp#Jj%=I5h;7a8M!ms()w^!jGygN}%JM|N$9r+kd>6Lgh z8MkY7*=2&qxf9QMBEih*BySVXtIQ*<;f3vK+{}RSA#+gf301XMmBD-7!g99Kgt3*D zp{578HeA&;{WI6T6`QXvoKPVFjn#FNx)X7dXTnrRQ`ao zd??vOXmARNT-ylTJL*|EX+D`rxt$`WJ3S=U@OBdiWOvb1+C=S2Bn=_~n`$Hw zTDjwpad-v4O5&SPQa^_WbwI_!GtpP8r|y+@8t)K$wan}HJNYPBtzPte>d~>desPizz?|8?-iIoEczfQe~-bsIUOE0;V#GV%dd=(nyoiskBD77W1+8wvufnR%l3|(g&<6HIUglRH~e<0ReX3*>I zcTUAS$_<|-?0tntnQ|J}ImDmt9hEp{$Hez@&7`e-R>PTLkV%EOC(+PK1gw??`Y?!e*tEZy)Iyg61}WFxi*kL{Lz}n5Fz~(TQ2{IzWxd2?5f>? 
zl-ZEfDfy7)CFi0s&GudQYjC(ztJiSY3o*N1pOQmET_oTr)q9u#z8PCK`w5Ir;MpoJ zp$j(XzItzp1dcS30NN8DwfgM*7oU7!PiWsMd!E`)#5A{jsy&@hixA>c>utx0)0--kMrr9UF9I&#S+-o&ToMcb602qO`2KYBAn!ZBP7L zv18@wpy|7owoEId`B3}KF z-?&!GD7bW?W6H_dVoQJbuqCIuCF5G+(hfZDqph_@^z_E%C90mIn4{`ywC;>VvuMG^ zDc^?+MsVC`LC$@v#z?>n+*B8Ih+;x;>?Tjj*tef$RMJV*PAWSDOjmwDm*~Q;gg>ZA zEj;v|XxY&Z%%J^{u3Az_O!|#9wb=KGO%mTTO0_MkFB2^@3gVP%d;B(rex9Wq&TM-) zjuqQ7&Rs%SuwcgIWZr|*!+r79$^eHieomEs0S4F7JevEYcd-@d6{1e)?bc3*b8Ur6 z8}EcuKl1?>@xZQrgxMnlfu=D-sJU&_82ALI8m?}RY!NJhdw2y@I@S0!2B=xNj9E%t zlZ+`jG(U0^BlRoI7=G}@#j6hY-J2GM;ti`!C-c?nejkZ9Z~U(Ai}kpZ!F~64dQXeI zhI`jWIm6>PajYf|u>#Ch!CzAac3-b2f1SyeZ`qiDp9ZZot_DO0#w@d11vIYjDwWg5 z`yR1u-aK`@XafkRPZnBO%Rjg6NA+(ul;53vW= z9~d4L4$5bjoH~-MpE-qnYpL)H>ef9T zp80IPl2{@+(@?$~ltHJq9!M4>hy?JDT6PEVQK{$3YO5LDrqd1k>q;rs!nRaK#~z`k zjl(A?UOoTN88}tb-=TLh+A=30b>#T^`it&FQ2)mjk6s@{2|;K3=QuoQYy@?6SKq(g z5!PPQ{$zd%J7y5loM`g18$L}KJ&kDmmL`=xaY}wTGaEaxhXk&dFQ0}U9MOWS$ndNA z=b%-vX%qz~{e_RAer5+Zb1I9;y1G6z4I0pLgeoV)?o(Fxl$P$Dl+S?gbD^hR5uesX zl51YXrUlM|+d^o7V%mlAmb4w=!)r)jL+Ptz=4ybZ0#-|{AE#vnh#p#wKXU4*M0b@t??h{CD3pS$lKtYr zNKLj;s?-(hrlTfB!5ZEVZ;pEmPsjHQ_2c_pDUM&otRDqE@fkP`Xi;1Dm&`El zK>|}PlSm+Jhp@jvN3gl3M#G{f%Cej^Q{ntWR_r5-lmtj zhq;_&UleE_2wL$rJk-?fJ7M6S;Qit@bvHB-ZbwzXcYl=fc}{~_z%}NK8of9Awf+*C z{efXK{jA!1TVvyXv8xk1r=69c8cdy{p9pW0d~Kdu9g}<-5VtzDr*Vv*T|2cGw9~R% zrD3SLa@%M4umw4W+@K8pX=UUGt&AeFDjF!??S%yA2CrW(XJxRBa|FQ;d*zv}^vnhb zIj_fjUWRvmJ-eLc@(o?pQWYo_CbG!+7{e$MrgbD#Te`3zZcmU=B zz!kJ#{%wM-oE86V5%T|~>|8P!XKe##V+F?zmHz(yAw^mc3!GbIzsrZ}EDJLRFW<*= zmIWZ->_K)wfC$h7Yydbo4n@ELa0X#s@FNNcf*%185(4a+=-Ao+C_f)2nuoK4m5qmiv$_XZ8St9f4F76I?4hS2B7aXjEqo8}%y5>0!%;Q>kdAcasAZ*+%JZ-E&1fX}* z)#e8=)_3AF%vp&F+>Rc+{cn#S=H>3Jr(mQ9cJ)KJXTtw0M#IC-<}VmD3ujL~3%kGI z@T_dWdN#hEDjv#u8tRrVF3x8SV*G*r3o)j>i@TqUvxD6qlEVK!c;!FPU`=#u8(Rx6 zXHO8s^tN&L{2OwEKhS?A$F{VSb8&WY|3$cj-`kf}`m+G60$g=LoXHx~)5TTO%hSW= zSH;0{1{LW)VR0;7JVAx}Pk3x_itYa+`T2?@IqT?LRiWp3oU<%(FL*f_ONQ z{Cb>2po0J){=ZPr1Hu3oz;o8%cbolpOFcShu?M{-5HQM|NZH=g(^ZU{8{xraaTfQ@ 
zW#t0r_O)>3=H=qy2Chi?x>{H{*?7`h+JJUx36`zK78ZI3YY7%ZK@A=aS2-Jd2W3BZ z8(qJvdRBf;R-)D{Qj%C#q|Te-1e@Wsb#V5yardCN@vxF$kker}EA|xwOW-!17WBSw zXM~5CuLR3^1u+mlLvypxpObhxNw64cXwiFu#$YE0PkKHs5iUMXetr&mTXze%jgO1F z6Fo1NAPd-}yS1&Dj=bWJ2Em*J%a4Ki`1o-7@N>Di+i~-Xii&dc@Nx6;ae@?_9ybx5 z7QUPak4s>G=ZrWQ{aNiV@^COZ7YEo87X5iw^5AgX9b7#fToCjYpo_s%g2l_r!CFj6 zSV-7b*ou!+RG43olb6@pic?g?Qkc`yM#xfF*j9j-$6AyHi~f&sTexay{C$sq%m5tD z4faY;e+5KnXo$(XSb4#15S}XXlHVm}?P3LX>-tBmz8IgdC9kz8pU8j5z?HN4{cB_3 zu(;1e_{+G!v3|#aVg&`v{pX~evGc#5es}VZWc*uPzs2>BB=C=je{0uoas49+{3GJu z+Vxvp|40J=i1@d5{TA0hlE6PA{;gfV#r2OQ@Q;XpYu9ga{UZtdBjSHoyFmNSA66k7 z1ZWEK0c}3Wc^o4ZIXQDJ9Zf|Qb%k>S7LKyI1HuIg24=z$p6)tIvh+sACiLjtU|=RD zXn_JlFfFV+Ts3qR&!Ptb6$LqZ50K~#{*gc<*xvB>xWWNWWi5L8KMMZEJE4`UI~e~5 z(E{oDtwECw2;Tu=XCF`3GyG=|Cb4ungP~`^pd{`fgCItk zUP8}!cD06EpTX@Q9PaIP7Lo~52jMVp2OA#{9t2@VXD_$|2%kmulE7^&z%Wsivlu=` zPa7+H5I!@`;kfJR$bm5USKt^pcE7-uzrdbgY$Qkv$ho-QJdYNnXSKRa51O<@=#_1J zoNYWkIkiAjorSwKy_^f&)dF!70KSj;tQJ6YCR=(?$U?lLLPDH;TxSumf29Am@DHN@ z%l&L_zmt5`KA$rP*@vIFpOt^&Tynt3VlaXl_V6dp@(}>k-UR@P@t-)xEYKEu2LNh% zei;whnZEpD;&ZUE;yM%PpXvWf@F(Se4gAs{*I9pm%8p+CdvGcJnNY33U{No3dJoW2 zYD3TYAC>suuJ{XEzwp7KV`FRMZi4{3G61#A0bvJDH^SQC+!X16_^)R8->miv4`<+a zy#@i|+*5$WoC_f6Apu|;9{?0$Yyf8Z7%YMOIdAHi2B6Uz01WAtzw12+gXQPfzg(a( zU>54(a2~J>$m!_OTY0&AeAUsgRDZfA%{>H6bDKIrGYX-xuGIZ8K@dm4{8RrgStWepm(6r z&=hD6^eMCk+6sLS9fHn6KS6iFV^OeSWUz}cPM8Qx4yFk+f!V^`VS%vwuq0RxtPIuw z>wpcwW?}2FZzyOe#3=MAoG9Wb$|(9MRw!;LfhZ49Qc(&~YEary22kcuKBJtX;-X$a zWk(f5RY5gEwL|qm4MR;r%}1?9eT_PVx`?`mhK5Fl#)2k{ri^Bc=7{Ex_5dvdtsLzo zS|8d1+AcaeIwd+gx&*o=x&^u?dKmg6^rz@8=>6!6=m!|s7<3qX7>XDs7zm6Ij3kU= zjAo2}jAe`?Oae?MOfgJNOl!=Wm@$~Sm<^acn2VT4ScF(CSQ1!zSdLh?u~M+guwG+L zVtvKN#%914#n#4lzz)Vv!G4DQ7JCl+8x9c;8;%@~DUKITG)@7|OPn#Buei9l%(&9H zCb*us(YQspuW+Yv5AcZbxbRf)tnq^I((r2V`tdgLG4Yx3W$>@#`{5_ySK{~JuM=Pr zFcZiTSP%pfq!H8;j1cS+5)<+gY7)W;9}pH3b`q`-p%XC^DG*r`-648H^onSK7)E@F zSdQ3=IFvY-_%-n&2^t9ti3*7$NhC=rNjJ%7QX*0TQUg+7(lpX0(pfSX88ev*nKM}o zSq0fJ*&#U%xeU1tc?5YGc|Z9+1vP~1higJn}ibG0zN(D+M%2>);$|))oDmE%@ 
zDj%xHRIjPlsY$5CsjaB*Q&&)rUw~d_D(9$T;xY9hLc}25MOGYbA z>qwhG+f2JkM@%P4=RlW0*Fv{OPf9OC?@XUU-%kJe;)RRK7ricKU+lhk#K6L!&k({; z&MypSN+e-<;W1?0p=l9HJZu zjvS5=P8?1dPH)Z<&N(hJE_JRDu3D~5Zbohs?pW>)?o%E?9%r6ho(WzeURB-@-Ui-n zJ~lopzBIl;eq4S<{viH3{x1UT0yYAV1x5vl1vLck3APEIfX)4dV=Fjkt{hjNThl89Nx)nV_2(nB<%6 zUQ@W1cx};C&@{|+*o?)@*R1n8)ph6VP3HLK7Uq>VP;VICD86xQp=pt0v1_SfnQr;n zO3o_T>XWs!b%OP(jf72{&9be8ZJh0jorGPy-KxEmeUkmUgPcRE!xu+o$4tk4CrzgU zC#18Xb2%IXegj^QAVN4H+FfW}d|mon*O4unm#Gh4 ztX_A$=Da1n)4UIS416km34Ia1A8xYUjJUb%r|6gGkK%9T-*)Tbt=qTe0%QWR10jKy zfo(yTg6;$@1}g;@-p0D^bh|5rCnPRpC)6;s;SSB6;5!Ru#XNq9_7L%KI94jQFpe0MPOXW-J$~eoi%gM?kp8?Ojo~=E<@qD;Kt)i_`q_V7v zsVcpixcYt#q~>PL=UV&P**fF8zIxUAwg&Nr>PD`{f)^KGq&1N?#WZ6z-)Tm+__gf5 zbbYzrYS+5ZX5KdO%IMW#yLNl`Yqi&J-zdI$)gjZ-@>b&Qi%zl5`gbDlYTpaJulXSO zp}I@3tGZjLyQW9Dr>Vj~DMO;VvaD z(=8XR@U6U9Rb1^|ySBFY$@SCGde{cxM*1e}X4RJT)`!nVpBKNle?e|P_)7V;U`Jr5 zZC7h|dJn#Lw15AA@}TgW$hVF|!^7nxpJVjnl#|OR4X0|SlSnxDcWyt=?UDxH1-_r# z1^qL@KnR8cUcqywP|gx68tQpMLqkW$Ku1Tz#Kgh^&-eOkg1}%fR1{Q9G&D?HEKDq1 z{Iher@WIo+!1UvW^F8<1dVl7SXXkc-FA6&_2zb!d&vUz^knO;$%a4SP z0iJ#a_6LQ4=cS#U==Dbd_~HXYfq+MP5rD5bFbE0^6P*AZ6%7R)Jn0Lp0HC20UL+#M z;2lhqVIUzLA~Q&$(6->y_cj!9i`8+jxWum;KTzdmTSG2<`?>7h>NhHuj7);eA#!@2 zLO$88aiLZMcM@#!FbM$&*f<&n3Je_u6xG=ga!?rIMQtJ!UW?~CB&0HKw_{t0`BAg` z82CPipnZ|GbdS3vKbKQM*2X9&plcP%^w;CG_P4_mLK^*!mn<*05u6{D32#?4pUbMUVQyE7w$mX6)p3kN1XM z9@l&0KIymIiS)WT={(W-6Q?)FFm+W3{&7!qv1F;f++M}2*fkbWKrOWGe_?y*tLaD; zW5bY$a;Xi&EBShf$e};l>|J0OimUL4?Cs$Mjk_Df|^U1V_tb z1TR~4dS&4jJN&{)Bgn7z-*%&E_1L7@DRu8ovPl6Z*_vCW3(NB!PIJL7e_6FsU9g(S zSZ)01a#DHfQd2h5a8a8=L;By1dlCC9>AO>mN}e#K2Ka-(2lB@pj%o^>xrfpn3UXpH zygZP}lqb_Ea`yV7531l6^d{x+mli#P$&K>ew>{o#C4?SxITa@4NPWTNS_xVz-X!82 zQUYZ+@-KA#Z;HHm;NwD4tX8l{bzyJ{Aou3yWRz@im{!hy`h?eRnhXz17+iQ%5>c^@ z)f}k%h}i>gLYO`Tfpz^0<2$Xf$qBv1?Y{a%hLaLSCqZ_`EUyG=5+v|bEA)Lpq4xaS zl27IzqfVMhwFmzi+x6(_h_6wttXsSb>8oF+cKQ3Pv6>8n>LS%6n$&+YrtkVxP3b3z zk0eIT523=>mE#!J7YW?a2=%m?8^U*-`-^N%_%Ge%Ao*h0BFGw5K$@G0Dsc2uOU=MZ 
zE(BHZe@Zw0W`e>hgxN=F$uDi6V3Wu6*^&E9dZ?Ev+62%ud57AwAo?!0cgss-RoI+q zH@Fz<>~#~iu*Ur{pSTMcdKo?>Gsq0OL1k9 zc667Br7%eh33XH?AM*C;ndWHB!s>aIns~x`i#;gq$J?Y07r(kUNMk?MrMxM+Sg0y< zN-j94s!exrC)rc#>$yri0e1nXpK9_yy?K~BY`eWzE|k7tGAOMMTwM5NTQTRl@cc%} z0RFXPrhywzc!hR%#;@j*-a0N2rF!3p2*)7oRGVgeq)IKGc-5SfIt7z_!*Iz%r$!d` zw9+#D7pa&1PfLATE&kM`Z+d`~SJz*&?aD*RRR6T@d#YP!`W^4WW`0cYx_Ki%UpS$-s3TZ8A1+S~{&cM3bu5Q^ns(J%LK0K@ zEJp=hLV)mcs6AeqqlT+%p{a+O243QXp&?Ot$s6NSPxqqj=(UqcH{*}HloQWN2@YM> zs10vEcK;YgTrn}-wVt>o(*9A)PSW#nek*IuCc0~kX>m45AAz2#90jatdYUHja^CiY ziM}F+eAz1#j+=XkhAbw?OK5MX#eTP|t(9QJl z(Y4otY=z|5G9;44Yp}_p#o4a~vp<>5t>bw9pzPj5?Y*I0Lb1bTnTbL)_2l`di)iR1 z8JAzNVuoX05Y}i$?VS*FeDiKZ=bmb5pG0+G9!aPH+n~RAl3U`ly+EP)=A4U+aNG5% zNe)`>HsfLJl2yJ4dP? zua!!}&&+_1A0mzx+r1pevoheg{>b}z)_kdgLaKpG^v$?uE=~^B9pWr>76&sO+hmj0 zX}t?qZU=jo*yQ;>eF&S7 zH|d(Y)>qv4o8%19F9dt_8aoR$W+78Nc-fUXm7Z^1KUA6&*Gqju^=u7A($>7)#PSe%a~E_ zHPKdk2iq`qZAFgMtbPA&1Db{<-iotO#=8gZR%oB?^Eiw!R#EggsNNlFd_~rP+*J^aXw=$3L2LCr*>hbdYCtOa`%uvOtsH>I^@w*3D^sP1!#>vfe z*sU>0=EMsE)2ESVMNKT=BUCp0^LeMb#te^FyfHwX!VJWB`IebhB&& zw&4vz*89zinM*V5=ImT_Qw2lvnf_wkZRL^cL0aQvOU_-yDRHW;Ry5H)zCtbLMFi6} z#Y0(ef{(t!C%1vk-eUABZ{Bnz6MA z*6#EfFG^{Mv5(gIl&QJsajpkN1a~+S1XShugZ`IE*sT^w~%1*}>?ZC~ts4Z+KySYfgb|yvdpBjfTD` z6;GA*W1Ffo>UulAMjzEa6qi+4sogJK!Kn9oOS(f{`XwMi;`Ay4SW)8b0%G&5tC>i0 zINMvWGAB^UNgWq2eM7fDt*C41Fdx2Eo-giQD%zsA=ZszI_Sl{6k>#h?1Sy;+LJgd*}e>S*2nRwEL0J)=IbzD#>$e0h(xsGhzs=RvY?-wSc${t!gJ{IW7tj?jnig3dU_!YI)DHIH1VXNi7O3 zvG=7lJK3i=1KQEXOI(-&3}anfT_G>)Bk1U=7IDUC{BeNL{n z)n1?~b`lbJXSAkBf02dvRmv_k`W?46RR--A%BQzsVOJ?I-+4aJ&}gtc=By@DiFjRm zS(1a(DJ-;FPimL=bOAizXIOK)@e0|@7x7z$$i9E=5lyPO^aS0^wDtA4fq><6o}@vm zlC#3~cyzrmIZ=p{gO`%mRS?~p;-7R4kw_xhfn28B_&*|L6 z+&x$MrD?(bBCczpqQTDB6(4yM-qJSTL?+d);%j$AYRID6U+dbg-XWeEWiG1G)#~T$ zn@}xmAdvCUNcCq|uwdKGV7>psCUaSdEt7S#e5!-=v8HxLBUT@N{%LkK!h=`JTfvH@ z*Bkzb)2DMrHR{DTqq_e02gMS7IQVqMZ|8NMGOGC&5wxN5g_z9&kg_FA+$_>!o7Dcp zy8fielA~Bf(GhlGv&!glCzIaRectsBo@6Kf+Cc9S6r+x+Kr?+;X7y-cjGPcLmX^K- 
zQkhB!5ImsfRr0#7l|C73tgwtJGlq|xb3b2uN$AVH+C{5vHKlUyBLiHW*g@(=MrkXm zH+A}aiTwziBCOe0uKsAr{uam#3#$_3VKEfwgd;+mHjia%%>}ocpFq!R#qh0 znirifJ*p7We^yEC>%@Df`{IL$>*F<39P9<6tU0q)B|g!wIX_a@z^GgCn0RYn1$heB zXmfD;hd!-b2s-2>)TE{`*4;I%*xEJu%H%KBSrf1tX2tO>jGQi0NTR;oC|6JaOMH$> z;VyynRVEn{6AIsZ1cO51WbrRvkm-yEKeD~ydQ(U%6P53)EpVZJu`R*pzEOg7K5Z=E zHwrjT_T=k}YU&raCeIRYV4ujCka?97dTnnqnJB1;hQ!F4z|o2yGjR5w;)LF#*CvV6 zhYx&yYh{lnw6a3eS|(QY+*7A!Mb8j({zd$|eq$6KLu>80AFu>Ww|!krJNmL5?Z?y$5MBfCs_yWOD57_Om!U^>t{ z9c8U6?qF_%1mC#h)yswbR7vAWMZmtafpdAjSoctg=-T7#XRON?rYa4|ESk31UtwEc zbDVvPSzv9l6@o($6P*pNzg|UuMX^h9;ADJ-e*WWC16jAa!f&r%lkHlMDjkU3fuM1{Z}34M zkxNj^qV|C^kSo8bRixTKPTb6hRbDo6(^Z%e^o;u;T-=Rs|Kx^Zc3`O2+!(Wi<^k{K z8(vnmy671O`XVYQcCy<`w1(hhS0M_o2(z4|gZ>ioxGBNP1=ap0Cx0e|rO4@bZ-wNX z6niiE7th4pk9_DuU;L3_n{iGfFq z7mNb&A`*?#aYB$c9%qg=s6AbSml{>rN6a`AxMwy}p?*i8UCJovkqwUMN11 zMH<74o$JbwsrefI{_5Ltnew3mE#0Y%M;jB%^kyl0JcK&7T~W@Pv!0>tra6Xd9GWNXFt;I{`{*jxiO5M}gXb%A%amrkkR43%j6=DuRW=l}d(eI+yB7MZc#Av062U9ia(vda?;6 z?bTxp(Gt~LPnk!X!cdIg7I}!N>$;)EsUiqW9v&X>h3U!?;4nkOswq<4XGdfvd^%O_ z9=s2RI|`Y9HIVB*el1FN82GjlPN}S7v~u}EVB>0Dym*C_hM@KR9c}qV&DM8rD0f1x z$lx|s2MWg;+*Z3hUwPw(fSX`VjSnlQGhKr2lKo`Yqq|jCVJ*Ugl_G~vg<6dcco&a( zhz4H0l+#l3kEtMQmP>C zSg`3Nw7s0%F?EM~6~4AUlCMu`u3sF){gP#z{4k`7-BNm6PB5iEuB7_3eHoqq zpv25XKgaf2jpkgQ;7avXZ$+6pUUeah=?Ju)>4_J`JjFRF%P*!VwHlle`N86Tv?nw&v(*Rpq8GnR=aXwS3 z>01^_-INEE%vRYsQKXul?^$n5C9*Ga&XBamSs5Z1M0DZ>XkC$GlCZ_AYh=1Q+NuoY zly=|x-W9wPFWGl{Z5KaihiI#kTWu+X8_=1Z{h*j8SRq{L4{xY1G>VL7P(4y z#2LTJ+Xt+a#U3iV$Bf{u;Pp9v0F~pTELaPUCF*e-?a0<4)|r$*YLIr z_=+#Av00@ri*{R4bnv9KF)#?esS?n-ly2&1=%r(7CPzVGKX!9$k8rj6QT^>g&9?*^ z%`245thF5)f)PVuC81;65Ayi3b<8%>K0ThBqXMc@j8uE zYx94aEuuAA9`Z=BB3i!e52dlMMpsrlcKa0+I%D-aN^Uq9Ka^LpkABiu*hj{e5%0+4 zRep;FD*{n~;VV{g)G?^a$R_KmuJH;L=MmPU1n5=-778&U7*z=<_0xOah(|SAks5cu z9q|dTr{x;Iov(EEm7$P(+Lc}(veow6HrXDvUWyM-y$itd1C(6{e4l7}z^d*tD=*m# z82dAmaWmU%bk#UWujVwvkm%q zIN*ty4sT5H%jzZ&LSi2i9C32llo?UN>G2pO@H1ag7ar zO%%S#3I)V4RxVbFhiZGs-NpK3YT`(C?CgYLzeGr=-X9Zr9}%y-gh7H4$I76^;P5cJ 
z7nK!2^#o9%L@Gt5dBhslg9_H~irEP|pT@q#(S6vOeg6hc2o#oe*UQavFEPepG>W#O z%0l-poS}%!PnNYfCo5N4gCSyAEDXvhJ*6Hjp3Bu1j4Jcg$rF{Osi$90zstPuJ$GR4 z*^}WZ_+%LSk52||GH$Oi@21bMF(%4rM8h-;7P4Zyi)D2w!qgXMBnLd_421+VGpq?d5a0j@osuLCr08{|Fv|fV2BH>?m za<$#Wj{mTe8z^b-JAuoTkgylU3s~7Nj3iB{fQy!dk$p-GLF~=dhAKbe-*RCF7%W{# za8Pu^tT5weUq5K{bdV!w~&GAT30iFqmMd-cy+!vexciS z-8GQml|;Fl7nTRQY5KoUWPs^0_ZW`wuceymrkqEMc1!!lvQEjwB56I9suB{r%#hDd zql&_o*4o%Fj&5VH{wXrFDBEZ<(xjgaZakCvV)j@e%>VlQ8mS$R*SltuuKCA~Uqio; zQEvE07V4&meO!DxeR-)|M2gVj@km|6UD438tN%U`J8Ea0U7B;|jmbTf&?_j(>{?H5hK zo2>_tID6+^(wueq?-Cg?DpD*B(=nKfk}~_=_8A#;X0&=++1V4q*@=NC!=LYJ^}<58 z?Vd0C6Y2h}$xoq?MR`Wbh(gcl^|Z zGwn#c_3tmLzhUh^uVDlHxX;T%V|=X*&gkpUQPkLrQuy5%$8~14tyQsN&E^T#edN8E z(;SKHO!xZlzuo4vI@9Hs|KI5{o#pZ6t}?C)%k}WRlrW3jYxg#ZA9}G!rWTNI8Tj2G z%J1+5!c0|ZKkHrUu8+#X>U8If9gSd@`bBFCKG3-T7mG1^+pM)qk@Mc2)bwgM_Pv1- zM!{!8%sK;Y?qX@aX^bRNgpXiha)rl7@-w&4ElHQx$N1z2cu0WyrEep6eEfh0TlD2=TQQR=*V6454XOG;$iY9h ze%1)^ZA_c!dko3{L`HjB{cRWT-R8UPdxH(-B*}j91ox{L$WsG#*>uxM`E5@=Gn#eb z1O^iVn|&m9VK;L+#@@^|l}27C`xKJY;gIW8&7MkQlvAZ`!9$9QQ*_Tu3mS1JtXI0x zzCrd`gmOon0y>5yqSEJ5!qk#4YSjDi67RwwaJr-3q_^=(crsf{YRuZEh}OOeh7;)l0%JX$}Cqdt244 z&X)!KD6J+UHcbv9LBd+$^ z_mmfpdVM77`9~bS2P{nxk6lXMl9}B5*kkr1>)(lKn=WbS#RUaMM3HvW8Y5{6+pI zS}i|Ams-k?peqahR$d=6R_Z2#{XKo|_lmTAYWSUIvj~d&E_&L^r??G2JEEP)J)1mbnosS!T zgoOQFQ(>YJi!FG@zW;}~vyO{v$=Y?}?(W{W28SfLLvV-S?oNQ<*0_7{#tH5a+(~eE z3$6*00J)trb7W@DH}}kUzkBb$``5d6^{Q2COFi#f(liE|12tE`&5#&HuU2YB`Rw6R zOBo;#y7Xdz{L40pSXnMBiM9-WMcsX?0VV7r*yJif?|p9k z?*{NMjRnE7Z=uUKb3_=^R!l4a=H>3fZ)2kIJ@tjbCEjg#0h?aas`oSgeTyxdj-Vvf ze$E7Qj?;RVGw@01IkrMETFykfO>~zoA6L=q#@87Waw80A5ccZv_c+cy(CU@P^4kCR zWBZqe=_h#g0bRy2dc>-uiJaYGh4=B@6*7lKad*{1B5g#wfC2`fuYWViU<(tC*IK=v zl_;S%71^sICo5TMMo;zyvi0w=w)u&@y&~cL?C}fbeWbTV>42u0;qOd08_tO{yH`Kc|qo@pf=&v;%eFh2z>R2tNbJj z)dna2M`z(5n*aNR9zb_R8gn47cS@CQv(GZvRevB=(U+9Pw$b00RB?&q3n&-<8-PVz z!kHRVNek!knvb~%kcHcCs4DPwjq)C{#}WhL55N%62eQFPPDP7MvxczU$_jqh|JBL) z{Z+EVYlVD>f{z-)KadM&{x;u1sCz?cEYE$6zq{4CmK+ku{-*Xu*b 
zdBeucdgWVj{%G#HC{N5M5;FVpzYJsXv3o5$VJBSFH&YB^8gsWcsQK6rTM^2^$JF_H+8H4a~{+Rc^#>EjrKLBs8pocY+kvmBS`IH3b5SFrj8W+-6Z@@X98^i|>w< zNwgGGu7Z6vP;B+%OfZlpGJftrQF;3Z)u%e zrTBS|*q&~v0hz%y1%kFJ*gW&l;GYBw<77Q@W{?X0_Uq7Br$JGm$}8K4z`$GIt+p?x z|M)Ev#gy0;$#2)E&0<*|_C=7j)hc}R`G~Y9s#O)bOV|$pnUqkmj`~MMbWsIZbeQ#s z18%c3ZyNL%-q0nkQ&Z7_B6V$h9VOmfS~QWXH!S5 z@k}&;6ZZlEg?JbP+@UH0Vbz%sDJQG(23%K~LUh1q8DNhu0tBLUiE*>A%1qVjunAPj zyuFbhjTmg}RjLJg3c&pkk^@kt+q*iyH+W{9jJqS?iNCTH-6At3qSd_)C9*H7!j^{s z9f6o;AQt1~5fp1}&%|BHhqs&TqAuCJrCq4E0GIDPvO-|c<{sFS`8-xzP6bXRWS#mT zgC|0$6Mm*uB~}5l@_@S+2Sr8+3JqF%E!$<(CAav}nOS4#sJ+yyg5sdQ@{Sox7vHUb zh1?NnI%a(`?mqCQujgg6=}c;t#p4vFNyrC5%)3%*5#vrw;Z`T;azTtHL}QgdABui zkn(gdlj;)4G*6eMg9&)%QBi$Tkb%0(*J5Idi)yrnekYcsU`jnYM-?K2rpUP5rT>{8 zvp$us(OYNTLh+O6_!{4_XPs4bXe|P|k$mrh%WAlyvyNJPF{4C#TSU6gS|JZ2tXk0# zN&#|^t_>8xmQF4{Z7;=DrVPu$z&WYTYOope>cvTWI{j7RT-LvM21?(C&kD| zHjgPyY9Ev!nd)|OYh%L)s5OcQjqUYnw}r#)=wcU+?8;k!yN~!V>I{WV0sWiQrW&qn3;g zk2V9@Y9-(Z%Qsyg>d?SUdwE-smqjS^Lt(1nqs;5+mDy*88JgyPnq?0k0J&DLkB}$p ztE{35TiI;id<^w-F;HW*KYq)BXibzhw8bY~Rc5n5{< zG+9Z8C};NGlA9jM0o_)aC(-5CmDMjV7ry2~Rvy+WlpN=tXF=ETpjB-FBb zWf4ydIo5^l-ynURVMLs`_S&uqjpI}kO2K!5RwL4YmB}=2=!H?2=Yhaj2I{ z346zTp@l^@X6)7e8Bmhes&Vr#-`H3Erbf5aoJtbiuy19u2%noDDhOdzQ|BFf38^p|2p7R z1N&DvD|J?@sOoe>jY@^WbcS8J{Yk6ZZR+ewv`Lp;L}A(#ZIP7_&IT9~(Mg^q6?bcs zV{1J(Cg$Kv3#mQ33}g@8m7sPL$1kS)d+NakhPQHKiIM(qV9% z%{;s*v4m@IvUt@SiEF}@?*gzHEw%3}a!uB(H#FTly=bs+keEaB1ZgEA`A7?(O$o#U z>@dmJw$oo=NuaG&*?9niYGuaJX%z8w)Kv5$-{QS&LS|#C9jOU0BBUa7LkT*@nz1^` z^_#@|hjm8I!RGekB(B#4F)QIN&+DCcsY^_kKcP6VEOWhBb#RT+a;c7Xg5g&Y&k)b< zrcB^d15TXU#(edrE5fme4Yi9#i{|FXWaa&Gr0h$MFzo#vW-AM-L!Gtb9k%Aan4y$C z>*q0j=Qu6dWaF&#JR_3}KJ?vQLBaw9fZK{p=rm{5+I#=euRbMqssi)b>QnBIp3Go{ zo;Q^mtnePVw)*j!wtYiI-k6tI)$y46ig|>lTX<@865(N9uIy=0k;%m4SvHPbVUgPBXAlG}3U^ z_RR58ENx5|p1hsk*(1Oj)pNb6Wm^%PeCN^dd<}cA-P2(!r&K(6gcf(wmk(5e;)Wm| zfGe>JeXKF0O(e#Iu+ynS_Y9+yo7QTfLUqjBGS2Bn{N-*J@8V@K(}A`A&+?{JNEo!{^9s&R9k%9kqs1t04-D19L`psFE zEx8GS%%&vET~({~5{V4Yw}vAM9%EM2AqT^SuXAyQqCCKpgZWQv1Ptw>V=m7%)Cx{P 
zVxk)YnG6PV7;SLtWH44}h9lzHdM+RD&@7c($W zLudL7vHKN^bIC$A(2{W@kLLl>)Sy(SukIYa?%h0IsEm)248)kqpZ(x-aFuR4n!E5bGHDiZs zlSDTr?O`s&@bn{&^YqL-UcRNQsGRi575|zwkvXWaXd{8g9Qi?oAwFoe1XD(~lvLdQ zb6jcKGnbYANZuB>g<4O-}ZY4PxdU>Y|4}n1n`L5s=c_ZE$1hA zr6!1Y0cEToLWzDs+Ly0j58v|nzS;^LtT8{d+qj!KtJYNRyI=3f=I!hMgbBgEE9wn1 zbgsH8)Sv!P*+7D03C#-g+?|vsrP@A0f5Vg5HuSI9 z1nLm~lp$zzRK(Q<+f{yIGl|f)Ef-VjkFRBue`_OSGMa$ryk+;Ax`wvt2LMf+6!(<~ z4ndy#!2~l!9FBcml)CCOc!WgO@|26tupNSJea|>@A=&T8FVGr%ZcA-@ReD5}YLnIh zWb0HD?WMNGI;dWRBrafU;h{7`^(@Wu>4=W29{`n?XD_cFy?+4qJiOTDACxxm{sSI| zjzv$_wiD%2TjG@mujA&hJW}$xe2ZD268qSiF6M6n^(+W0gK|T`T28ZNo`o($7zZ?s)8@)>Am(dT5M~&L5e-J3pVVNxCq}Z7; z`^{B&4ZzNIeO1TzBsSNOC=oJ8E*Aqz9OiZkh%DQJ^HqFc2GB!+!ZmYkT8>!QqsW}u z5>70dEJ%HJTIl$0r35PEkDdrBYO#gWricq7c?DZMJj`{~)xYTyw0BqTVPLMpjv=ct z|0pM_0$zMC)YUZ4J0+7YY2%iREH+uTS{vK2?gQ(Ujgq5R=3=R^1&I}zBM0OLX(&L* zoD^uNaj`(yUghch04)N!zY$cBo3yt3Ufjt;&|DJpAZC(1A%i=ngS(~-7)}mWgH99= zXS=9O1WF>V3>`o+K*oe&`np&#Q6Lj}8~`Y0lmn>71B$3b0077isQ|P92%ZCH$nw9z zbBse%tR(eS3ET`CNl08q8{oNIS@po6GfX&zuA?9kv@JYTh`z;_W|}5XU7Q$Aq*11` zUpWi^)i119tL1(3=)&k&_^tI<75ep$^)(gS-zjE!Lyfv61QFyql^YfebK1T`a!W}> zbag7$xSHR?wU%v^&YAoknk}IjM7Q8Bf(Zj4bmOo`-<e0l>{Tx;%5g0Ov z=)ng1+Qs=Mx~Vvpr>y+%@d|-u_`X1=tx*uN1nd+&)+i_&M<%c>1K=rR5DF;e32ej7 zrO7Rp?3jBbt-C!EdrDm|pLk<#@XvY}NV=uD$JxHNBN;Ss7G;_!?`=a08sp@Yg8o1?UEIL8-<87$owJIIsJ6GVgFHWsfL;6=UY7; z)-S7`*JHnNevnnBA-#%}biPjYZN60d!t(`%Eo zQh6gZk`kpM+vbmQmSvCwrAs-UZY!!=PCwRUqn;o+b6@s}uYXafk?lpXE`*v!dC3}k z_0Gn1;*uBrR6eduExl_qyGdx&`}dzInS2j&Sv+UlFA?6rF*=)V`IZ7X5oye{qa-u? 
z+E+2SU3KS3eV@IB{?D80In#gkv2ntEC#rEqm}P07$B+7nwSdbLP_%ocpwuH)2{Zu$ z7;$k3JlkjL@_u46r7UU=k}=kJyp=l>O5R<-p0zUPuS~g|em&x;L*B1=1orQ;5Il;9 zX3NH3J1Oar?Fr9*&f2ykTTY`;ErI+>z|J%ITJ4=Yw!ffAai0Lmm86Y!GUzX9N|~y^>x%F#*vz+xw0=5kC%i3@HuzaE5C|G0Q*t$(QA_11pz!O z8bdj*BqGpt`bo49SSJ4A_lQV)!8Ec{I%odb6IhO*K) z%yEo+*XFi8z_YY{B^@s0-Cn?W>kq&qQUukUJfbOM|1fdHpi;ZQhrFl^9E>k;4q=)j zmRTfk`W}cikqSBWc8AaRST3EkmCe?@;(q`#H<{lu0NgkOQ~K{d?WSYx+^j4z%~w>O zzSH&Xx5L3h7Xt4WwWQm8jK1%=Vq7+k{{X}rn3wPHmnE>ve?y_WnqhwCHv|#jg`3TM*1A)~pSSdDvIyhj=#zW8wzbJ!UU@j} zl&B_>-b}M1<)J3ZbHYPQG>iL_d1MB_g2!HK)RT6zS|Jr1XeDImwWo_GV@DJ<+Al*l2Jd z`{CL=P?o0cA7=djn_RA`j0+c77|~??WGwfrcdGVQUc}<``j^Rx!t8;V~d1m>|g%zx^}Pb5D9F=)q^VQP24aaZKPQ!P;^p zf&eD0&)I{MfD)i&q7W9H<=a0b#62V-NdGh;m^eoXEcq8Ra>~DLH9DhwVLQ%bn>Vsr zSb)xpYW0M(3yMW8R3DWH*hkx%RlK1i2EX!hB_A{{gKQx_sjN9ORen_+x3qX8uviv? z#$8rc99&{pO#g=xqy?#vc*1{r=|8yQsd-L-ppcQIZOU^ZSX$ z=^!s}%<5bk$G7-nVnFH(Gt3WwqK{9EglNf~tmT7@0he!MPpw)MT(XOM7u6ki<{De7 zOxG>iB0od;98^Wz5fLfFow29B=??%glmD@E!(7tG^)mD!4Ud<7sI~aedamiRDcB5F zSw5-wY}zkR8G;lvD4xpSXP9jqnd{w??w_(Df7Hr}o0(|8u8q43=(~Ik7rt*D7@*SS z{Iy%`OTri2C>)aY4M)mSf}^t-&yP1H*0GvjpI0wWw(fp!=axrlTiic!)@Am%CPEiQ zSbVeGj%onbg+9RVHM&Jkj>2tv)L`K1TB2VOT6&mT*j^xuaD97Bv6cquaH^ay`BbB` zo<(-Z>b727>i2d2z>Ab9R~#0ov)D*sg}z59RnPjpxRR{sui<>Qa#4QA+#7223{R>< zCrdf#2oBCf3_ca8S`~aAZHqT*8 z`Gfdk0^&fQIrgPthu?3G>>={nJ~2re$B})w?=4jJKRkCbdY`y6%jrK@?)+4Ivyh3j zj9wr9q^@e`w`M(t8xCsSHXF_ZfIZ@rwNYWF^5S=@yr<29ac8FC+Ei~6K2s*0_`Fhq zGlSz)0GLD&7ET&1!|2S1R{ian?H{o<0!gP;m78&sX~)qmw;2I!*+Oomt=;#E zRG^2v&{vY(SHW%GdcQ z2X^oYPVuE@^=xrEvmWRWgIUnMT(AI*77dugc0(JhlwtE;Jg~Z-O*tKEf};L1OQ=1Y z(;H}1q_i>%hn0cSu&0BzQ`(B$Gfpn8HbU)r zeq*SO)xJuCx0fGU;6Aa|MZO*%_S)8e#T}ScI%`|E}IRMH7~VYS8F)vmZ&D zG@f*f3#JKeTYTvT5fPlBa}JK9enYbjG4UwLT_*-@#bmBd)jGd>nF}q+e~~DRL>D-^ z4a9Na*D}~cYF3)o@Turn7}*>63Q_~~@-$dk&l7O7 z7KzE}z8bvR_V(DfE989hK9~|7!1mp&aZP5cy_B%seZG3{^!hbW;6<&R3BO)dp}0~_ z**^MJ206SRd}A-CkXqb1nxuQ7886}8;yM2rC9Q3+08LHu>t1{u^!BKV>Z=g+EopPh z-HEFuiY)eN+t-|nu_fdQrB$R`>!!O`1>Zs$1B{861<@Oqyqn@;%!)f4Sw8?5^~mDF 
zDmF_4l8ZM)$voBg9gg79+9JL;7N#|S^ljPD0;ys3a_i0?Sl7$w+y`Fi;KTa?A>AQU z@f(OgtB&KpR^HS;MCpkv#QYNIn7r7r^fpQNKDOVT zz?yZo!8P2wI%zcO2p0?$Vv9Bj=??L-#Pk&oIbRwatsaVL4MKUaM1YK{V?kJZ(cilh z=g~mnV4X3&lUNNItPmZ!#x|&W#bf?kXev8xBB{&;#D4oK(oj){;SSHRTn6a;Dc*A8WF zD9o^dJM5W|1?^bF_{kYihoJ$a5-Z1TY9|X`yvMpqBHm|NBuA1Q+x+k~Yk*^&L9Yl2 zZ6XodD8xa?zobze%H8pjm4UTi5uQtNkyMM9!c- ze)*!JbnM0bees3KUU@O)VM88_wpsA6|HcDhqsnf9Hh+mxczM9A}FK2auL#}E8YB^ zc^}6+BQ>5etjbkz>}_Vd?nl?ByJ3YFh8mlh+l84=c{`w(IgLT)lMyoHeQHn4RY$#6 z7j&)Ert`3K!z4u>ChEINJVst$=!kiWZ$r&52*k7Ld0s`-?`08k@{C}+`cVS?bS>oW#! z#M3Y3xW||DvW!`A`}X5!swSKO?K(Q2jusFNaIRbcJG%EGEjH-Z&M?aQ*cQS}%XD8@ z*2UZH=_mvcP*|Nyp5b1?0K(UkTu$4J#7^$$cvG^TqOv&JfP&KD#M^fHXT+JJv}p3u zkGHfyyZ3g^sUTxtx){M#(n+NGdj9Of5R3hSU6X4&y=FTS7&8;G9f{IvIL4^1dbM&x zs_&-~y~IaQXX<8;j5#39@qHx@WVNGiH&BZeaE(ssL_=riluezC>VQ+#Cn~t$vp@ZUJ1XU}qw7JI(cBjDYm_w)Uy!> zeNE)~XnPvpgjvh(H#o+KEkS!%H~y8PBlT;Ty%)l$-zFf1oH+B872gV_nl*8?_A24Y zS)W=~V|4Q@OwrjXAbDjaW^O@!_&K!lQnB*$oC3a~{jsl^3q<`q+)oFyc%Ro+&CK4Z zzZNOxK&96UKPNNyfU&3M!kmUwE0{h$X1C?wIbX7FT-f$A)dfin3+j>eiqj`E<{G0}dn1riK5wgUiSx$l=UJM$?w;u*!YxmMaz=y(ReiUyb*;cA zm1m36>-=djq6LV+MG7XwR_L&-v!n&d=;u>Bx_+%-nT>tfN61};+oL&dU2wG0Cpeha z0^Z+gzC}{6Ec{j~uX)QpLde@%3k$Ua&%2rWOvPA5QlYNzy)Sis3^+g#03QHV!=&@l zA(+ANAWNL8e@E5EM^<^_d-Q$H;OJ2pDr*fgdN-ckXOtfRk)7Pf)uA7Nol{$vLA=_6 z?Vrzjl0W8eveVg7=v`YnqiXY1qaaFGX!&NO&f>p&*Nyl*6AlcZ)2aV$DZt zFp$CG@?K6+jXwu+ia=IIVz7+0204ZVI4j@<&Uee5Bdy|{jV-VzimkccdB{HJ>CrG!W@Skq}{3@Y}~iTfP2iK2EgS;3&oBc zGEktJdd>HZ=iP}*&(ddY>1o8LJ(k9y!qR91+V=&IMn3>H+bAxq4HTupk8LZa(h>Y` z0BUJ}3!Y!*-R}R#fc!cb?7ea-EJ zEELq2^V*^;YSN|GaEf@UUph}n-;7?)ZVPKNKCfq_-@w7p6TQ%;S=Z+UxUHm4>lh_ye$u-1`GCW4OAbf@%FkT>k{i7TwXbqA<7J zvwM=_a3BayVy@keBv{{S!yBdg>R$2TM3rCVqW7$2C=`$4YiXv<2-s3Re6XSp)i|jj z*kfg%+G!+NsGm)yG$SbD2cV(As!rYi+fkQgvVU2M7 zc4F}gWYE#yk1Kk{*l@Om@qLjb`=r0;oQ}Fb|NReuOEBwcqE*hxlZDpyf)~|aEyyD}&sOOv#X}$4o`E-rY++Etd z?rQ&)%LQYP#s5jv?KY14rnuN{VyvT)Z8$QbY0w~>6!KmTEhxZ^B|(k_mldx-96&g< z26b5Bk_#>#mzzhTag{CBy=lw5nC1VfTAuy+DBa~!;4D_E1vy|}pm7cqT3Fp+=;!L4 
zw0|F5U#qb;*0{w^;#$+XWy;QC7327#(83?v!!Sr z@7+rS+V1jJ198NN9JM3km9_VHKJpCzlJ<`oka9ur2jNL^d@Z@~nDD#>_Z42}XKKx_ z`R36~(jKR+dQpY)&i%!9bs|C!)V9@~cpqp`Xa^!RleJxEUT~}eQ{D&df2X+Gy0%KQ z9(8ZqfT%b+&VJhZzTE5EpJJOyK6d-n==Xo-$R3KBp^lRqugcyLJl7zzOw^#mrmZU{ zOLTn7GRqdeTOc~njT#^cLO`2B>l9*2+bhJdDh}WMo6oOD0B>2z;txO?MwUdol3LZ= z-(m3oo87&}8)HNNo6m0)VJ*%@Z5b;8FgZFQ78K?dWMTx<_IE2(%Q2~WY`F4=E*mWw zD0h%0JN-k8L%e@jqzxTI+=G%@8^wLwufD{M|M$XslS|*xoFN9-kqChmffcr7oIm`V zBxtnjYGBL(x2xtva>PT2iidySX4|eKsE&(HAkp%t)Wj+)F|9+EE1B1mcTT{!!NFYS zgUwXtgxnGXnK}Hx%Rgkv&%a^$i(&4Q{tJ8TQvs#sze{#RcfY4zF7i)dVK|xX39kL> z{~UMkbl)i7@J~k?{{0cLre>BFfhX_rrmQi-ij}6hPRov6WEfU z#@Rmu_rD3#fA2TqzI`r3!7RTx{8|jKC_K>q6qwM_(3%#Mwn}6T{Fc_gIsB^r-QgEf z2Hb)lxaCzCCH%KtawZ`wUB6Dlz4x_00BjVzksjG4dcWCg|IZy_Z@7QH*D;ETKuk+X z4YB$CJ@1quHopzVD8JbJPMJd9f){421>-fPB>tMs@9$y$)eBl5oDFvQGN7zeEhtL| zKvl*{NaVLNn)I0w#5}kyf2fzz6{USG9~L|o)0|}I#4(NrNrX&-D!?ICJ5DT0UP^aob z8l^O%D5a{4=qinr(AZ)50T2{Wzn6A@d&XaR^(OjsQ8U7i?<-ol$Vi6n$NmIo9D;&6 zhT7~TIdX2(!c`dt&x+(hSt=TrbUFGpOtwy%8s51?xy1pwYK9!D!Z!pGO|JD>z5Nq{ zQLTkyZ}!ct#l%N@+oKXZsqaK&du;AXvqvy9b0lFp8RZ;Z3P1CvG|ea?8;V!Rv*1mG z$(aO83JytP-UH-ZA{a|#k{taW@mz}irtlXlBfYZ5z=r7A_DagD3xz%^OYc8OXlq=? 
zpN@OQ)ST@Lgn5tV0$I066&ZPY^0 zkaq(hkP%81>7&6FCMq=b>pH-K3dY-ru7gO4KcJ+|SMUgXXbVp-zSW!ian5^G>3mZE ziWaCVI9*SNB`!en!c<>}Nk*vnOXhTc4EJspgV&HtcuWM=nOm zVzBITpZFc}!KQCe;e~_h4#D{DVqH?KOV;8&@KDI;5^G)244;xAGlcwr5DbOfK)hT= z9Tw_RQJO6I?@eeVo0tlqA(Hrj0+KO6b{;L%S(Zch?O;S#w@5dH`~(@zf}v(%5!zrR zbRjE46=yn6_E(Kpbz6_ie(JKf^_3?~cU?nY(ellgP&k*;2Xz;MTGg^#+2436>#1DS zV$f;0S(q~}P{~a_i@ldK;yqW&j1y7MW!4sX&cxY!2t}Q0TR@X;FS*BgLPe}d^$cRu zvQ13&Xd!29nfp@=7rjr#ok~!6cdWJ}B^wH(NfhGNS~pf?^9Y<`DLI zAfOw#MjmWk-RASvrVNrP3uI7icWm)V{^1|G4EUag4Bq&iWm5M8&lgcV&g)rP1D!rf z{Qw}8EuTlHdl8>40#TU8HP&Y0W10GbxR(FqOVj$# zzBG3NGYaiT4C)aiu%9__dB50`K}R&g#1C>JyQJ^VM2KMG5E#aY@P>6gV);Sde1BtE z^ipNRJZ@PzajxK;yzV5c)6PM2IIa3Bjy#kF*|e%KWa19VZj2%lJ}Fsj)_8AjQr>fM zDu~S0u*{F{6Oe73iYcv&?Xv&|1=Cps z_P>4P#j4|yl$G9%v1<<{;aL+bcS5A+vWDzhpd{1=Knz>n;>I(821k|DvmMBPn{GPy zvF=$DF;F=ek*Lh~vO#7}Q-(tnJj*UDQy}YsLf#=Ig(5~iUew;6DPrP$D)3+R-}~10 zi`3!ee-@h?-x`jxm>?NY$x+8elhNi8SJj8ETn+0WD5G>J^jafzKT*sfJiQ zI{6Wr%95tz6*4TgV7DmmvY039~~wmTHpNBm`Jn5Jx0a?D{{YecyI45R~AX zlm&k#9VO^On}ojY#T%aqFvn1gYLyb0hF*_Sr=s^%cePcYn*J!MLk(0ncn!C|7Y@xuVMyRESdoGG{S9l)^GaiDrz%Ar+f!9X zp0uyU-X?nB;YKsby%ho<9ZZ=9CS+O*SB#=bmpQI;T!2!hOtu2tf@eWVMnbi*)r?A` zMoP?tm-CdE4@2>~y*s)A*8$O98Og&DnZQ4<2CBB@H9QOhJ`;AVN_WYk~Zyh{OU7KBB8dZ)AH zppVDut++3Q{FdJV(JJIJds^(sUFOa_hm2R#S}sGuL`YOp?HnNVbl3oF3}-`?NDHn2 zPt1Mb+G2nT(b4^)*E!n>*T|MGw7eS8&RWvN5tJ<*M|vXT7Htw3odGhNg-#En8Z}8T zAPxu!b+8Q%73;o9%_O5T!=uCd6%wu>&GM%-daJzVzjI!}t2IMrX)~Ao^_B7>USSn8 zmkJ*4xHG{(ak@+6XM$KZ)L@KMPD~w-gOyXzbWZrNkpOh2=jZ2L~!W0yKl%~J$C z3)xdlGd)wc8$(3L!pmnYTvqkDc7+^u^^MlFV>)|c6UM6Cyw@*2r1?)#f}etFA*{I^ z^>s<+iQ}6M0WqLW|KJRcvPT2+>m8NlQV)ra{2;8|jNMDe3d5;2k9*UL5x%V2Zum30Ku?bthoWDP0|oT$UcJ zZeKtOYywHj?5*<#g7e=iD#^lzrh6t+M-Ng&c@8M9&ij!BOen*J&^V?^dZOlzd3_QBWR+7aZ1YdB%XYBy*WT z8h1SLjBd5?2bt&Hse3CV*?D{MBZ;>-W_$0y$TZ}$ZD-^Wm5+Y}s9@KKbl}OTh8+YT zIua?uM_?qQKj)cwGnG#63>SU~j{0EJxk+gR1K&mn5Ld<|LvWZRgMA5Zs$|yveb)vD z9r}VQ2VE03>!ExS6h~IK;V~IP*zPEg8C7Hkzv+O81Jx4%npC~{u$_h=>#}CYm^_%$ 
zntKsXwYmD3?Xr7yVugSICdS%zAHSJw#CQuH9ldT!QJ7TIwPxffD{Tbqo+eR(JHr_e zg67#cN=`ZymSv!aU&1+&5ZXI9iF4QL{N9Jxly>#*J)LQ^DZ7gagAecc3w>j9S4#P% zOleWg@x?|M-1jkrs^^TQX$I?<5r@Uj<7`wD6pC=}PXqJ990JwuWNNRr?llbO>N~5q zkX&?b_C|bVdfFV8wD}|Yz;&fLTZO9vHf)l(bCO9xMD)<&ekTLi2SUuA?*yTk5#ry- zP{iS3ULzB{(`<~i(Qet^3mU%Afhtt+@qbAl#2u*-?~L23<`z-$^tlg*7T|J}m!-It zqGXD*B7$aXe5qxAF?iGh!m_$J*1^YYud1|+sqGB+&`lz|+EpigmK*0g*O#owgu?rQ1t9;yl<~>?S z&ktU8k6$q}*Y6^NuU1QB;UMEMvzE{~($vq^Wn>jzY~9H>$E`9PjvhhN zDsteI<{hYFA}db)ZtGoO#9i#q(8Om%FXc% zaoQ9zFv4Vb(Iq+D;V6n_A7W|XC?ji#w1-dfU`K@ISBakXGwL!9KA=}I&$kO%Aa%ZX z?ANQp+i47U(40uV@R$$}Qz4s#B`OMn1whS+0hMnXs~cM+arIB6Dmu#8{)%jZ%v$Fc zs0hNdgWCU~W7paFYA`FT8UDKocLX$5-uE0_N&MwSkI{PeiYzL<;Sbh)p1gw3Vv>Eg z`Xq5Rrq-D@ODkOi)dj;R-Ofc|#FBZB)l6*l^#wJmGd@~bPvRD9mM8w37d&kktY?Q^ z85ku}mm>a^Okc&0b9(zexZ~b`k>_(X(h^GNv!xU-pD;&cB$N7#Iroe(3_BuC6jH^^ zodUGPBTfzA?u}P6sMib^+TvY0Ov+PwEvybbY-zW+(#u#Rl?Bi`6fZHMu-G~&c?fER zmCK~5j7tqBTH!w~H0M>7&Qurqb&@ahfMQd)#xj5-Db&ayQV9iP=u_fjy%5zs0wFBg zFV$1MbS}YRql5#BqtcS%n4Su7$^|OeSuKL-J!NbRCFI6YFUf@c*!qKcoH}5q==vWk zaM$vdXc1RSsaaO|7)3_FhK$|qM-hNh;3RiBzwX!@jXH(TAU77PqalkKuUiFDBdV3DT&C_x;zbMCTncu};)O#uM@K3UQOFwNkBk;7Mw5_MIV7$@ zW{ND|{RgjUN|=BuO33?IV#`;kWe5-XPeKTy9&vF5C{|QxHTW%$fUqE#`aE$k#9BKX z0}IDth$}oG+8POq)L{4y3|R>^Lxylm;6x+UWWEq-5#&t(GTZ|8j%p1vJD$j5{fuiZ z7nhQ+4f>2J{;5RtvH&9zoVg)z#hHs#)Y6tn6c5#z!AkW&xs;1?Mj3oB-;HoNS#(sZ zSt!3iAht2Xdy@U7C?kF44BHKV+lFpa314JT(2|4f8AE)QZ@*TEi*q75c6xN?stNX-jhbGHw3ZLgev^It-q^1~it#Sbw|$6j4YwaA z{;mY6q~7f%-Pd2x6l^zW>tkKt(VAx8&MfHc>2>l3(i`P#tvdiGi`_{keX8uZ2qz~@8f9;@=x9GWmb_w~~ z61fFCro6;bZH1i7&>*7|!=Z#S+vS~++ps}kX+(NAfQ(1I>*Q|0X?~Ig_K^F%^7Jba z&AwC4;UkmckQLE9%^XXS`MQI4#~g2~;X}=XE$+!Ao2Ujg6`v@tBJBW|zy=9%WvG?B zxeiEYv>=^nUH{kK{K*{ZrFIGzNFaDTj-PZ7(@5%~$uBe`%<&2CnNq?e4FbwbeP0>6 zNOH7!-aTF6;NCMZi(i2*8Jge7tF`@qkoSKQsXytP_rJ{Vl+e}mSJ`8k5LYarW`6Vb z4rkgMCB{1DRZ>3W7e)=7G27P!Vgsyk{f4&k@(AEXAu54@9Rk*ub@Zer(AZT!RcX!v|?(PuW-QC^Y z-62Sj1QG&$tJCM4?(aJ?&X0R9zZh!(yY^ah&c)h$zVCB)k7?#~id3IUY!FkqGC7zs 
z)rzGA8@Eq-scR-S;B#_RNpMnEth1l~v{{_T?(wzY5tf(@qDbA7{~I_{;~_27B-v}g zaujWfvHDmPsI^AHhW0H0U~7O60Quz+*MZ}wtR^_$RPTP*(ET^X^uMcp{~`uH;!K^o zl+gO83yKz&eSAIqvpgdb?9DjchGY?%4Dk&rvEEwuI7E#`Z2{_|Uk+wbVD?*SU1(P@ zLIdW23URzAoc_=z?Kl#bqA5Aw|6Jeyu;wYarhHM^?_?&H{kRY#sI<4!I8>{aZY$)o ztFtHDDO7MAOyLCS44gwdtOWLu72kGt$62$eW7|UX3+b0Vs0*&t#BES`6J5FtO2cD= zIO<#$C6|id{Hpg9zqSQGaHUiJueXK#T2nh4zv$jkKf(jQcieig2XY*{QSS(cUm?3| zgB)8QZaQ2lxJx~AO`+9dv3u0hl%;CHaulZD0KL^V#^yS$$ETF&Zp+0aRoR<$WGTG1 z3+EU`>3FB;=l}qX6*(zTRFg<2%F|=)+{{PNl4cp#)}x5c%3+n1M)nR5Fiea@;2)YV zs-Y$|@pOZ_8C%np;Ktf)NN^3PIT~TBNI#`BB}ZO>tf~(l1#HWNlH7b~5+h9>=8f$T~rbNXE7=p_Cod!4cbwxj#&xA5D*cLdm^5^+{jgh=r@E(yI)omP7SH)%NG zrv#Oly?1uQl|HW)$$d_C^j)_-K^>mhct}eYAJrWpKD)oJBVY*(pS*doEKE~%;QOH; zwkL(UM4|w;4i1!PUL^D@Pwye6g;<_^>iM?ASpRWQid)8A)c{2zsQ~raHWh?@|J`aay^dXV_sDKwLZ6@@6s!VCKrN4#;#rpgxXF!dk zaw_Qb)EB&(Sdny9w3GztQV9S#`!e97ihQ~WqjR?JoFv4ekONMN3%>i*Nps;fDTEC+ zSCWCMafZtqS*M#Ud(igyNP|AEJn77=o1nOVLA5IDs)G`qdort(0uAAw?REHSPI9cw zRP}WGtg$o=xof&4XL}!^ju;K<=8Ae@rjMt`cCxj`DeThNrWK&Rno@&aE8)=<+W7L* z3%Qpcn*oxM@sh9y4umQ zjDwEJ!XY0UT2#5KcmMor|3wQuoTc#Rb`a!Hh!+I^E?g|1EAm$)a_o!#oA$L3sE~ov zkH+oGh-04~UOyOz4-tIceBkR&rYF)z)>nXsGuAnE7K-Ogr8|{H%+Hntp_2UsBdzO&&v4INT-qJfL;L z;5xt`rfX}q^~7l;apdG{V7E+dyJ@3v-Pr0^^hT*^dV-B##=_v`(0Cbi1hpO#OCXxF zRgyZ9N1J#Vb%v;`2$oYqGxuY{`CA!6gh_-$8_u4tCx?#<-_fu%896%6(avj61J3E) z#7TcH<)gvL;JbF`^!Rj7l|QM5MHvJcC}XO=!qwr!1l;6bBfK$o&D!^T=5 zjS$=%Vaa}>UL&3em8Ut8rkwGPNKCez>x10@G&I*x6n&#TX_WPG99ZFi`>c-4O!+dD zWhtfL;`TqZKT~#>?>&#w*pQfQSmRBcul6$u$vz&5GAd~i3~EX`$DO- z6hivAIVuncU67-CHLVuwliXXJvj#g(yZP2?E*joekF$xxfhascWuHAM%wsQf=SkR9 z6v{0bz^N$az_TzIHuke#ItR(k`;=L5NQ}a&#;k07mQ&{@J!;QxA8N9EN|;EMO^j9s zK#>PCLCRSM;j!9&F1g_2nC>NYX!gf|HEkcA4#yG>7rd!@wq}pQ8ZN1RE|_B zIT%^S5d92*l2F!tWk5CJujQ4TKKjmoUdiT4!^=DeJc$U*8=!_FW6WEmA)^tF$ch>T zmNvRTMOn7w!ky62CAa70qoxz~SXZL?G-nuCmQO5c+Tm}S034PzInv7xK{PANH9VnR zht_>}efHN0j~H?}So78PAt@*>d2r{RToapS*c98fq-Ux;Lwk&K$*YHE(mUYJt`XtWqjIJS%~UK<1sO;d%5q zFJ*`r`4pi@amD3i#a-nd9Plqh5-s?!ww@P#sQ#gS+j#MFSpzg3;7D~cK^3!X+LSAY 
z={Gz>Lemf{1xtJg+`8o+MfMKm_E*opI+3@xCQM8H0QXst(oeP&TYmL>Mc0j^iQe96 zCp(rsbFri`z1cbB+0P~or=QK*^X1&~B)eR>?2;1@>l#lPro{DF(GY}rD>B3h{Gc^j zcqsbywC$&C9TnEeaD_ycp0KAv2KwQ+bq||&3=6^pRsc6n_3xCZh(yjY_sewy7;HHE zEe*+H9ft$PQ4X|kq3OH3sL@onygrj3TKopsp#uGYlXCS5}&8`k2ox@ZsH8PY`+vj!v|mII3b= z_Es4T6SpBE5@2ExDn#;Jf{X~jVn(%bY5Lvw zjBOTzT|r|-g1&khZwtij#^ggSalEl_k7@%NL(brhbRIQ^rKkHrCs6}6U^}erQ-OlQ z8`me0e`Bdp`hw1&P#=nqzW^1 zzGQ>;{j9Lkz&eSyaj)>&@~sXu;)=Kq1v+-yvs(rWS>;A~U+D?RG`V!^>UxEJPs zl^Ag6Lz@zt3^_h`Lb68`46-27@xdtswIDi#fBIk@2pv#iJ|&)Jm?#d@WJ%qJJ!g89 zbHKf!bC`!HsgYq+qaBgG$(Qm)sM+D+oea)%in5cMW$X-nK7jbxOB7D&I&|8*`8c}8 znsXdK-92^TrgXv)rH}dJ^?4o^j;fbAd&-0Wm+9sSH`m2I0iq*ehO-x)tZb9+x@}us znk6sk*@Dc*69FCmW`K;+oJg;D!~ko{Pco**UTyK#;ZsigNmL84e7xlwiDMA5w8Q_hY z;-#lP*_`4Uk>i`{ueJ9^Loj(3P4`{1+0>Pe910(iir=T9V7MlzuBWbsGYsO-QPkp% za>B&O?#GIeN+1}8-Vw>6kBXrB7)5mdnB#9jXE$pl{D2M)oOx^O;2aZGbAdzFkh%4c zkylArv^x2GVeI%8+kr!%^l1aqe+e1Hz^R@ZdTenXl9p=UNzK1LmcKtv!xlYZ^02i94^h<) z;MjoH09b#p2Bpo^PfZ^mCSk)f4r7iJa~C*sGOXLg8dHAjrOilu2LkXRO#vPZQ))Nb z=r)N4H=Pyk%9|UVA{A^W<_i=2=xuv=DIu}AtNX>LlDPxIdRhGLYEZ4e(H5DE&P0Ad zV*C*u4Y5sLJzOdsz`H8n{``R6^X5{PaE9U`d`C8i!;P4LKyaxo-$z)OB_+J)%ZYT8 zwj?)w#5l|=KJPGlynE~7N35%fA{kzBmtd5=)JD{Kd`Su)2bt9t5hbKreB6M*iXH-+;9M2Y2l(RZ?vpSZKW@mdH&{1U4xrbcm+tDEN>B2PhhN zcU6;$A+}It7j#8d&Tc-duGw>*h8vRw5z@P10;``Kt0E!WcI7KI6R*`lY@>r%^PgCj zA~|?FDpmmXz5z+w9$lq}oBWVy_6TEKW3syLJG5vb7x6jiCvWxG$!Cf!b{# zsxA-^f;@Lnf^bd8>#hQc+N<`-+JBtrld3V6{nCZ9Mm6!uMT*8F&mg7wq9n5JL#te& zRBL$?qA$*RK&w1;6@wA$#Z)`{xR^3dHe%c|CQ|M9^mR+}+Bjnl@`&{p85?a&8guLpgj?7eip=&;y0pDV7Jj6Bg) zr${QB_y`eMOwlYjg)_h-{S}*n031Hy7SV}6Jb=TE!vYujc6~jAW5%06i%2q`YP5M& zva4pJnW6kI71UTAI>UTBB2Znn65WX!8Uw4rCf*I8cnymeI!7;xKi=?d=#i@|q26rD zByS2qYNYqOKc9+>uXg=C6|^_c%WN#}swZjlsXn`%h+o6l1>LIrHxKQ?YxRov3MV?W zEGC6dYYToA0lU~opB6pUG6(e0Ut!x&&&%!m9U*2;n6@2<0w(L!hrA366uDgRKpKL) zfZ2oC)cI#J!H?VX<&M7VEI;;K%)ap!{?OOElzR83L)fo1-6rrHU*tum)R$zq>5$o! 
z-a_sE26Cb`+A1O0(|M!8=9D152wA2!h=yS+KnhqEdeY%yt!v+GO8vmjNO@+xq2X2e zmFuA3#(hRe|2^FvOd4zdVa+G$^0hKOr)i6O)k)QRSR0w~b)Lv^vlEllEu$Jt;o}&a zt)m311SwKP>f`d{H?Ybc$1J#p{#=#vy}qfRHvQUmE9ie{m+!I~E;YX9ylAF9vLCmF zPVn*)SNBrbYxQi#jx>hmHmd`XJ2w;Ap$N*kR2a~EpM0i9n zLc#EQ9<1}eP3dyZwlO{^4n|lF@fAJ~sM!-c0Uf3EvXGUzj@Y z;1M`v;cd1v7tc1~g6*Ng_Xsc3NUrA?t}+E*Z?2RenjL&0ioraltQXfGs8Ur?f4A);7s zvbEQ$QFm5Bct4bFt{*^<$}|Uf>!n|fb2dwH+D?v8lEg!@B;ndl>3?$<*;(jD`AC^6 zPvjJ13*23rV$!pJ@uIn!9H$glp3Hh-9@-5&EQFmJt}Cc$?y)5*80wTH=EAE&T*P|c zpl*ZGL28I{9zQ3(RgT|X;f>@ZYHuAh>Rjfx)kgYg!H6-+(mu`l>B3h0**lfd^Bz9_ zQd;dKqq9zCx$OWyF{(aI=#Zz(xJti2WnfxW(j`e@1J_6C#&t=!>F6blF*74Nift8^ zPz>>ntY9EKuByq*Yu>aesrSi5dq}|moHzZv5*#Tpc0s<+^7bPv2nU6+s}Aevxc8&b zHY+Rch=9$pfhu~uSGt&_!(>Nb-}o(p6NADI=^JIcGg7?nn>BWlRVqqs!@2#upRtAO zs%PG)7YwbP;u_N=uuF?YJ|jIpm8DSXjyIMVOdC6(6HGFGu7&kj!_DjRC=C-L&b6a{%=)J1iYHi4>w7e~7a5AftNeoSW3FI0MMl>aW=*+1No4TgJQ{$*)s|1XSzu{EW8g*1YMW;%kR z0KbdBE-oDye#-1Lvt%(HRWnXg;J!KxJf{pUcUm`~JC($3|nka>1ap*AFdO8C%G*($BdAtH}V?Dgg zPeDPEHW2sSWlaAimO~SoY-GDLo7BK)X49GwL@*NpQJ&Mo+y%|TnA^0ND712GZl;6` z*&I2gm#&r42sJ1^jo2io_yfKS1(E@Wl?5Th@$;XHt_q&0nqSzW2|VWcazD+|pZRhq zj7$G!DBoKbKn+$%nGN$Gz=ExSTETq z&d`{O#i~;t*16m&;LO~R{Gu%W*Bjpg-+1mXWY0fB6Kn5aTkikzP$bbY}H0|YFGzNQOpsWTsziL=x-n#fxm%pN_?tp4N2+$ zj(b8o7KeeCpsaP42#K;7d%rTstU^M6+_z#yETSYY6j z&15*7KkAF@`(p&~td)XEN5e38r;{fP-M7%~&bqA5Y>RQ9WGTf8N)8BY_>ngb>`lu$H=;IaAz~sU7wF~?i z2nygxRsJ`5lKbb6{XLt?N553ed>ea3y_WSu@zFm-<^1`PxpQBD^6=*NCv(h`+_UTp z|HbeTdw9PzO{*h@!`f$C_5T*4gU0#r(dyT$8ZkY#+55s2`DAzX>%+*|w`DWiAr@ha zehM2^{J;Bo`$LcefMv0L2;{+Jo>4>UD1rj)k?lBoCtp?*VsQCzX-D|{NOVtSHY9Rb z>^jC_I)})H+)y&lRiXzG>w>J>*Udu@$Y)1|vo4OM6tO-3+7&hn*8a>Vf7Pm@{U41z zfe^%eptE1w@ZU)gYM!81!M;bo3&L9S$LEV_u$!{xMzcA;@)9TXwieNX5&iX^sm;c< z9S{E71+y^nK|86E`M2fiQZCuNd=8T6q+uU>3>G_e?M!1iu&?RkCb|Pla`vTGsGe<2 z%GA}d^Uf{d)EeUU`+tOIU{oFOICD9e2yV)KLMu@EB#T@!(Hw`0jZjX(z(LPeUiMaQ z<_g^k8l4480KI8Wpc>uqo7lbG5jTYn=U_;xLDJ*A+`LmlR-4^7p8ic9ArskMv>}C{ zIxi6su#5xm%dJAm(GK4`A^V24*D@+#uwLGTlL2Z?HdPgMV* 
z#Tk)$+;foRK$6gj`oWbhIb}oqZR`5*PHTv&<-gPtXvM~gGy0GeLSjE zv;wc7v=^vGM$M%DB>Qkg20IBKDG!5At3K8^8{`5fh9$Nc7S2s)#VHX8P+$u|-e(o; z_$&CV!r->+;_Risbq=`aL>tS3YteJ15%4S$Fd#1ku&6tGs zXEPFIsOk2;*-Np*ZyghQ5`XzNiWD!42fXsSj{jZlwGez`Jli~`*nm-}aULsCv~Yc|NJY6^Po zLy+=f!3siN@LFI~pJ-fRU(=yZ@@HD^U{#vvp{*m#wbn*C5FR8e#WGGa1*~N zWpKtSZh?}oV;tm9A*AGuNTlTwBKCes{yFYvuZG{m z%b;l{*j=GmrfuSN)Rvs}^ew7eXl#B6 zof9=B{UkVb6Bi|ukbBk@>?2UmUg0}8Wkjj=UM+cSsre{Iv;kby=EW@u1STE9l_@_=)UFa2b>gH>8)v|CLSkM{F74Lgp`#hc(P9r$`%V00e)5-HIH_^t>~*EL?tfH~)v` z%{a+69R^C&1<+AcRD+dyK_1VzLz<2&$AYEG;zLkDi)Z2ph454M;ym9YCciIJTRUW; zR$Obfc{PB?$2#2BJI4B8E2LBVC@YY5ph~M8E$dNovOmXmF#hHfefQoTut$h^4{Zlw zHe@EIO{lEeM_g+`XJzcCi9&*0PtX!>;TKP1l0I!DgR4oeeVc>rR!nF7*a*C{g(8@$ zMHf~m=qB<+crJx7UWl^t4!*-W*Xpa3JOxOD`TdoyQ_%w}iKg?j&lei04sBhJBkd@2 z_R;p`_=$r<@0w-iGh?77BUtGY4%RoMti&i%xB2o(zLL`VZ}#ybnYmFXh`jcmFO)($ zydx4$U9Bwb%{T7qkm4EsmR!VSr+CSv(`MgH!9*XcFj14T%Jg%H;w>v8gcw~oJ{c{b zJQ*QAdB7I5lAp#+D6V0>yHuP(Vq7uXgtA}lp52UYdR=2_mXVEWIdz=2V@vcYP|+-U zvo%qX>5d15Jnkksj`GO%3BYdfp~PYvR8IX+QKlH-M!-B?mkhZ4>3%epw>$7H=X$=T z!Ci*1di72ZPPNvW;^)y9*_#F)_U-H8bqVO@Q137l7j!RKY=AUwz7yM9k2mf%(cR2HI)@Z zUCxDsIbzevHi)G#Itg_n{*b&{mTqignX*}^u;7;*tX$qpLr~qk{?Lvy)}M4qo350s zb=J=F!jSoKH0onDVUC7ix{I8LpqD0&6Ab|&S&p5s&Xh{fDvuPlC&zZ-7cc3T4hl5c zhr4iRLS@IFi5-KqX_wu#Qq2+-K8P{aigmv&t5iDV8unnA%EpfK)crDFou4~2BP8ab zteJYRU67ezjtxf-0WiS+3h`xdyIR2E4C-m+IBkp*LU6IPAgv;}y3(cfBu0AAM!Q)+ zXR%Z8x)XZyumLe5Bls{T{aWaeJGd3G-wn4B40KD`WcR(SF&ee8_UQ zM0?_omOI(qiFF(INeRii$;)L{Y>WMZBdcts>lGaghfH%U6{~y|I)B0fMc_(R%KmZk z&KFhTfstOEo5*y}svkVZF9MXz=e8G9t0k(M|65TT*7(ecfoDxx7vEfBuf!k4%TS%= z9f@J5M>GtRi{#>cG$qz5_x6kiOVkxPNadi4Q&l&OQi2h(TD?Zsw5T4r)4gSu!94Rg zgXaOWt>!q%m1gJgZ6I*H9E*mk6h6x-(mFXJbl)HRhMQXz4HJ+1v(|ZUxqdo?@VgGf zlBkX!%SW`4PRYl_2Vhs*tZg8&pxerD#K*tj`3ux*{zksam6s4(H74$l0TY1+<` z+@?>8uvA4Zi9=}M9@K2wA?R|v)v6Yyi~lCWv6udEa(th$X^yu$Z~1P-cF@Lq z*~zD}U07^AI7;FlQv77tI#*vY3cH|@JC{>xEMxXdWi8$l;Udvl zXLVjj>v|mWkz8aBRNG84toLV=ZBg!=8;Ov**%oHu3V9S_^&uSwl9CJVhll&mPcxA= 
z5piIzbd17;!J3sQAt^!zoE9g|ily)2-XrZZkbZ+>5m@4v`rf5*XD=H!K=i zFzXzUXg<&)Z}vERBO5Umhg0U@Fhg;qSMfZ1K3dSU_H3H8n|dg5vQ&!NH`<9ATB9B+dkiLO8U6@=Nkw z>GJsyvvuGoZUuWhS?s>{GI7#w1%!{^XQZthyj`OmYEO2m#CVZI^AGDHlPN9XQ|9*T z@a8+q*Z*7B`0^VtbA<8zf!kBXqQu9+}nR5JDxWv zQj(wgxWja-Sz+?zy&1^+Ms}~FD?bYN;MgKL)bOXW+%lc&?>wM?s{l6qEIb4LCzHCc zB=hp|*R$l>cWW<1PP-SuezLZj_j(yxY+z<($X}V2ia7Z3jBmM?OW#^^c9=3OP9O+Y zJ-kOEDao_~&SaXYltA19xR8v_3}|A{{IAL+wuDm(vhDjW7AhN>s*h~>tZ+8$8tl-5 zZPdybl2q{GB^1$_KARPjOP)cN1k-qTlY>@T;R+77!xI|_5+k>yzm&h!DLSggf*)nG zyr;42z5So~N`xUYn@vV_$wh4|vdSVPSfK?5Lh5`JVi>ZCbi4wRGi3c>oyG&M>M;vA zK1bZ@JaVl;&IF&Xtdq+{s3g_HLi?Kba9zlcBDO64w#3~W$dJN3!y9TSp$t(X|0gB?+ z0qW+j2Pkp^3w}c3@B(XPWGzlzuomn{1G`u$wM!cxzXipFsKdfE#l_vKja<6Sfh^h> zD2SLZr7)LA5Tmjd^t(o<_IHiWnbNRU1sDhWZ^SFZ00GUhBPjMQV~1NVtw(j z76D>Pc!E$LC)%2b>EaJKu=~{(n^zqHm)|ZOc$#(SuGcj=&x;sv9ny;$Vx9~XNXtav zWlINun>u|Ajl$RLwjJuaJ{u%nBA{3)+wE2ge99F)9M43Hs!H`3sLil_ZrTL}s8UwZ z5|n^Pmm2=`kjeU9_<7aYCdTki@}*XWH5QG!Z%xh~V>3KYk&%pJP&8#9A(c@w!_OCW zjYR|@^Bu359Uaeg?}o4wG|^Q>J;pNx*gjx@dq5-H!|gBAK})c0AS!J1HEyGw2I^v_ z(zw*FS{|{fvhBO-%#vI3oDNwP2oE|*%K1+|bIYf-t_Nb<1jq?3(UluvAgb2O7<1AxW~rn%n)L$}##rr=OpxlQ zDgs3%plEVL!qZRY^2>u=(KkQ_SgM$+<240;PaZWph7}BaR?SYF^EVEAR6OqC z=_sMVIT77_b(ny;#x%@@o~tK3Z8>t@-4>%PAt6!B5Nxz5|17s7O!!9(QX3k3JaNIA z=KO*na8iwwEYu+qVh(G#BRyQYECoH70uLNnt^-J*jM&)dCix{XON7fxu{TBpD#9bA z^C1mvs}o={Uzyb;X*~e1yjay`&&^ilqMC>(o8Rf2c2e5(xnUGSMRf*?g7hErY_1^HTX9i5F_u|3HVNTt zhvydb8xRnTo8Aq=vX#j=A?)+$GSQh8inKqOi)3qNBIjXD2DK2D6$Nw~m5L&n3a4j0 z`LPE+HaoZy%!7N(HQTMZB0bw@-xjpVx745%{?hQQ*kj#0=gNyYe6rOc{_sn~Q?)}! 
z+j*1gYy&U4xq~-}KcpQAfUkYno#_m*FT4H?&|R@!5!r(JEm;DAT}Esk6nUV*_VvSM86WS# z-UT9d%vgOjSg4Tw{m&Ut1}lZ=zW>*=!bP42O!zxz(j&z9Cnm)+2a~?md!Mv&rD*L6!YR(!qSZ7ANuTHAs z%j{1I9(1%}3N5y}5ULtFPn9Ur1T}-{7g7{p^o$jTdG@#cJT21qch-K5V>#= zmN7wQ@W$Rh&Z#Dd+r-0?(bh}Wa#K*gUpal8lNuLrx|pZbokHrPhG=F&V#REJ>Tiu4 zYQRhh5W5O`PwOx;O%9DLIxw%MRi+E8z-wd_?rv@8M6{#XHVIjH<}F7}GGACWSH(kn z=UM7$cx{uz;4yIfT`sp?q4+%)&(VWA{I}))CF?2hGBPWtHhNS`+dEHyX*X7etQfJq z;M@Jj?1^(%BHWioT$ZSbst6FL8jU|B9SC@`@`^=!a#ac_6u{R(oQW_{F?RuLHiA7@ zS(^a#RAUX|MwI2af61ahBvqPS%6}RAr9W%*sA49cbp8}w#n(wTv#t^Hz?g=0Vx(GR zQ*}2JyIo*Z;Gcjr6#{Mui(LeG7XVc!uWMqO2ekzoN+lsjHbW}Lx{DL};9MD~H;^HE zaP(|6vDi`E-;y1et#5ef#8BhseJ{t(_yiefhe=li31CB80&KdV|H0k?hf62(pR@5x zz4dQGXr4PL$09n!`$*J~Wn+PB%7(%E>~Kmh5Do(@M-rFl8}$&Ahv)p(vuCOBzvW1u z{=C9J;z$0^vHx&Yk;eA7S)X}b3~zq=McGJvMS?Q)k{XvIfbNg^B7F!mok|Wvvd09! zS_`AknjS}lN6F6sb@(QEp*OX1^b>pPq+}3lEbH7O@c7ddeCbI+s^-vrJ^6Cy(-RWG zOH>0ywH{Yat}2y;ZJrTPbZ`j}?0xDfKPxeybut$lr23hi40D5Mb%`WFib*~z`_79# z_}Oaj0z?AFR;4kT-D-$UhksvfzxHu;;SCzJ)iw6yT5zmZO3%UCDtqO(xKA)-Cl*Bb z*908>tuNvs=aKN>}FucfECz0YhUWHD~UAizRb4Z(80&#HV@4 znyo7tWF+GxSJovNrRR-hmgml5sJ&t$v1aU2i6t*osSbN{6Aj;vB}h|b`Z?Fwgt;R7 zKUc!)8hmS|;AHq0*VT}d7|Q;9)QNFcXOj12%NP+XS#l*^F>;xtR8Wb$K|=x8ka7U$ z66Gwe99+pK~{P z`FTWdnbz_ifw>#sKU1B7_U5JIX`(iyQ`-C_Q6VER^9@Q3(xJl!_ zj*DSS!^Gpf4@l9Fmpf4%MSXfwNdeNG&P;{NP>c^SrI6-wDEgilRHy7V{{dPR)Bgej zmc|xXbIM(n0b%iFRp@2$XAKU?Fi{$8j;&!5Hb)GK{XNG9pPKVuR?rAsL>RZgh1N>j zJ69>a1W_42JlXZv{r-Jhqz8^@IOpi-n~rF*Ijyo5F_7Rg26pmJ)n$R_aKkn zKKG|w$l=}Bl^kqwtQ+>J;!1IZUHZxLVs)@mD%IDEWF;`8y#^nuAkX$hj60OL>3OO zvq?$6bt5qtkbkR>uBOc+Hr!=-q_)L-fz>&>U!uS|rGy3I@$W1~Eujr+`8@UBjhlpQH5l-YFNGIMZy;N!V6%V0 zfEIOoXNi6@Y?tID{9YE5#s1^rca!I^AuBzsoq>_I@npw9k#PU5cKLsq+*O+3DW z#KST#3)_RN03yePJRlQ_*%E&Q{%9KfO@0yGeSjhhvA(P3@YZ06zuH9NJuE0 zpbo&mU62_$6$)^ck_?m z*LLzfOrIbaI-oDIwES#Y_bnuh@2cHruV@M;6D16#6j9tzQPMVhJ6a8!Vg1gGkd+i7 z-(+htrRcMfcTC9;R4kWwh>NVdn5OWryx~Q4-rgY_ zYjwDt3qfVHDlQC>iC#D)Fx?K7Cqs?Q@%dxR+&AFxTyt_@c*(+KL6_ERBcW#OSL3w# 
z^Fqr-$p)yA7)5^DKfVX)QN<_5hA3lhQ<_@wi1)_~Yw`3&rIEs*vqTyY7O}W@^Hb>A zsl4Z+XZOfm`$F*H1EJ(|j%|xl%zL{W`o*bKQwm>YL}F@IP2jM>8*7_}jpaeP7zFnb za0kvL;?br4S%i9Bmuu$lFIImEaiXHXCh_1@*2h~!eParz?J(PC1E(ZOVs;pi*8gKa zLK4Xd+DRt*0+!xLcX16MXwHQOMg*$bmZ_@;wKMx*9i)P--Vz){X2Y}O$|{SI-xo79 z_|h^WBVyOT(mmommkwjDK}EQ06LcTfS-z}?_!8accvQrav)e)O9ju<)U`&|7#37qE z&#`H~ERxGt0H2VLidMPN<#YW(VA&-cN zH24vuhn3L5h0}#5E>u_R%cDszk}eVeZb){nK(IanYdl$J49MUJ(ocv10KNhW9ZH~5 z2ar0z5aVt_o;RyuFIONuT{Zwm>sYoFy-`6&#Gy!0h*mCtyo;=}<~lPrC{H~uYno{B zXuR)~O@=fB(rV>kUf2RLyqZ?aQm;pyfvqZzi(TuR5h+VYtQ#>1qKJb)t*Isi&WO5i zGJVH9BgHfuyAa7Y0^B&GsvJO`ljbRsWf{$DMGjv0D%S5d)<9~1gd`Qlp&!Pz!DBqf(Ez_S9{~|2xD@<#?7-tYG^TZ zAQvFy+krT-9Y6M-hbE@Dy6i1j!{^VDnboYag)MzjLUpJux0LD4bbB^qy5+z~o;DX= zCqz4vr;;R7!sLHPb~VM- z24Y)|NBh*YXX-a3dF8Rtw1PtzXS$hipT)2-ohg#8I(g;M+q95!=UQ_Dt9aN#Zi#NY zR9m6Rm7=pE@J1l^(LE}L=3*p7tPC*l1!edS&Jd2AW^LEid|^g#M)=xNJ8jOmjX1*E z)n(3dt`EWv8E9#g@LAY4e_+ZzaM0KwFk$n|JN}Bn^V8FV>hw})Wu|~i7y}}aGiugo zLx}ibQY-K`q|h`l1)wDU+%PJ{lJxoNp8!}%>OPp1N~**(9(KcU)tR3d@`41CXSxzY zGZWtjqMPG(C1vh<}B+?9~!c2jQlvHsqlUn^sDg{ z++A_UIY4S^wN-VIC`R&jH;=}~TF3Nvu*BYZ{- zm}v}Ary$!*tX|#ctb!5%ppzAj4As{K$QZ|~(V#le`F)+WpH9z7&bajPqDT~`|Nc2t zR?ifn_I31{hn3#hq5d*SCV_>(SIQ_(KTV;8URK67fy@q`-sp(x%6|JThI9%oD2F_w znI)1+KBCg_rX?OwIN#7^vwPm3=2CB_@f+a!1>El<<%isYdR*_F@qWoi+9-|To1gWS zGr`8IHRJnfs|2G%ihbktVKEQ|-U&Q5jxFYz=pwU_h_02VO_Qg^CYCI+u^50DD8iba z#7a>df8ZP=$xo#=x+_a>5AIVt@$)dY`l>G6r1FQ7;#YzgDml%X%hN)e$%4$;kf~`oDpD2~VGiNOK_8qQIYVm^TXU0^rO}n}wx}t%x1Wl-pmxIpe;u(|s6Y&j27X=; z3PB|Hy0gEv)Yfv3@RV2SG4W&{%=@yoc7d#wnjY;Ow-Jg4)(q=Vpf8~bF2R=baMv>hDiW_*V1@j6zQtTI4Dr_Wc8CPdf}oaLhR3+VRi6?MCc zP)mNExMUGzN4_AJpbpye`_`I{=a8Pwob<_bEGhebuc;r< zaWtGB9%2H;5;K-mT)j8TgD_net*sIh90#k2QnmAi&A- z<_N3rx@DsZ!L7O{WuSRQV(90AF2Jg&JorR8Pyw(-10P(B98lv=@MsPBv5{eGd>RIo zBhsj)sZ-tPf%sZ8%ya}S_Y+)2POkp(tfD-PJ71fQ`qBYL5D#preGRu%!r^+=)6jow z^cz4_Dk_TO9~uK(Syl(pz7+onJ}6YWTWa(%pp2`!L*1DYbp}Aj7E+l|a18*q(QYui zpF4K=9$Uf%7W`3-<^e6cwixO4)G0WvaZ@+;ES+Qh33w$rt#Gp-MhCWXGopMZF4|IM 
z)#XFn4()R!ULlR#%kvO{A<961u2^!iCKqIX>|k$SV^++JjO0LF#bZnooChx4E$iZq zY3L}++2gyr=I<5(z$O7x%AxBt!)GxLF4dCd_3!0GY{Th%8PIAlJuqfc{5n%JxqBsi zSHTAf(WHE$(lwpd(jPlGXWpXtlG4i94HkT|wbXS13ypNIQC(mlRt&-9yDcs2N2R7^ zEu~@oBTvoN15N%{^zFr`T*-qTjB~ivN=BSpgAIK|BWHihn&~5%jHXp^SISh2PuYaI zqdeZ+DgS_3+ZR~m`_jk$%kn48Wt5*ae8>?WvjZ!29X&QSDE5}68F%4wPs z>4@^cPvP4}i07`8bV26`->XK?(ey%y4RN^ zqr_B0d`ed8cqnJ>fNK`Qsa~XG+@w~C7tPO?5(Z!u10Dg8QV;1P_4(CU00m?r&sp!> zgHwV;E3akK;a$mCE5ExH+$ZCS#Dm6%e^ejRZ~$ER8E}C^EQlrD)14WU*=m|JrDA!p zsd&n*lxmWgDTO=F}0}~i5r$Fq2Jbzw`gS7e5Kb6KviWbJrcMkH!l;X%@S?N`% zwltUeMm^C;x;mT@oe`y_{S|>7t#%2ztz5onA*I>=d*aY#ae6H~?@A^%^04#eC&SeJ+oQJXkq}Er=X&4$q9_CE1sZi)A{66fw~LQg;zh8w-aXqHS)_YL z32P!E(nQjQ9)y>!Lhpwkt@TjbP4;G+B8=@A#QLv*4Jq5*QrKQ~D2RwDOS83bZ}yh+<-u1!{zO(@xMN)V zB{OXZRRRoNY~B;ndvKeP zwRS2?bqMaSgA_AZ_ID!)#00>6IfLK_i;=^d|52L2F7`Hs3~8WrCYYAPVB;&C@((zg zmstufYn4UmOo~>CnZQ)!FB^&goTKYDV-W+mM5a;8*XrqR17!EHczQ>s;S(Xb`5rTpMDEDAU%kEv@USY&Ngq7dV zcr-@m{ca|g8Q0c6v#iTt;&YnSPxwFWw}-O#l2p&7E~vTid$sgIl4vgyK@%ic@HCcY=F>;u@UN;KAJ~#odZ~ai_)I zDb}JbTH3CgcI~~_+UM+b?%DUb_de&}WIl5a`MxhBV~*eZ7I9Mw9J?z01`re-Wd0(N zd>65DHh~S}+i;qE+OaZ9dhr}3VXYe)r8~JtZ60sXlc=F-MKgeRo35i^88taan4+Sp zgfe@2uUG$}Ys2g~T@E0e6`03S*Yq9w)tJrJ$?VzbkGtG!TlpLcIz|hhuZqo6`<{9q z?efaN(lmxQr;r27)e*rBsWxf!(bZ4u{nXie@FE|^6C_^;Q69<{{O{SMgo(Xdh~>A? 
z_W~B%?j1Ka#qv`O`KcrDf}a7?=l%8CrB6k?CJE|}s5pga2HW-On}S^a4vDiA3IZXw z9z3tQ<&{*qwu9XUtDfm-Uykiz^!fB9vQ~{)*0ux~G44VS5>m7aT z_7lcwC@zr)CM*Q0#h1mmE#?ls-fW)*K^$BcR~Budg8mC z!?9VdDZ`6UKDDAMq!>T8eEtw!snDj*4<~__hcz?jK{E1MW%Q~5JgqZ7)3dKiDm9vI z@HiffMGI8pkCd`0cvL{W%;UFJg_8$V=_%-hw&RjOG$T!m=Hk*K1J4&IdxYcNjxkse z{?UfB8un|ua_dte^+nMG$LH28&6cX$Q(^XNM9NSY8VYeAQgb)&i#A=;fA|N>8aOU} zOXW`FI>_55|3nfycA{IFXbWBxjeLkCRY$_lXU#(<{YY%%)w;alw}TbBo2T#F)Xt^d zCo=~uZ9O7s6hD7*kSwm5E=d(I%lB{GSWdq_q>wu9&$kseHk^7m_jFAEqwdfVVtbce zz^st_y%yq}`e)0OJzhM7dJi%>OL!iwI4vE>HWVyLltG9l^zQU2gy#l-11BDThV+)0 zfr^c48PnrsyD-94`$+xAlAYAEhEy722u(K| z5oJ+gF#xX#+1k{)-Udx;2{zK)DL!>IJ3}?tix5#u(v)F^Zym&IpxGz`a!OdvsGek$ zu*#qmu3f*M%{23T7-({8i*n+6Z!&#_^N4>Y=(I}Txz-)x1#K7~EkEB>4Q5iw;JxEoEG zAom&lP_Cf?*n5n|hmAGQGROGLyOX=yY2!z1yWZfB+Q7gz8!LzX^yG@5lGd+0t4?0i zFXE4wKiFji`sM1I13wzGMwF7IX#w=`obU*|U+i8hrQOJ(uFZW7{EXwvgzJSXK7eh=X4rqHJ;? z-ws}L$;cu=ysr;U8>b-835W!fY|N=ZlV-GxCIm<{$BzL1?P1QN_h}H8myJ7!g+k+1 zkx1Xy%w(NNW#<;mL>u|sVe`v z$HX8Z_+ui6{#nJq!tX+ln_9)+>Np>@%{9`X?6iG_Ts<}YxX8PyQeB z2Ho1}u>W}-HgH5=X*6VX?d|Hc%NcszP3PRb@~hHZA&PHIq#BiC&L&?*up(RE09Fz} z@_C-Psmh<<{7#O_|0G9W0iZZ@<9|F0v5RNYq&=+qvgdJw1l#wztA&HluY`qbR&t<= z-;h`I(jLgPI@g3PwD}2vJ8QHf7Wfku<+OP7v>VHEK9nkA)CHvVtAWqQk@WTDjVI80 zf^u*i+IQvQOt=8`{zR^G7}P<_%OL)~!p`^cpj&2X!%1Fs&qK6{>*s@J&D8B{KeWq! 
zwm>}&fi1w)Z34je7N4r$?}`5WQ240!77h0cPH0s&CAO7v`{GLU+vq zXTqP3bx#GX)TF&j@mEm5>Z!3+j5y%sfDg3^$d01H8U1N7I!uUvnG3*3ko=FPw)_0A zW7UX(+yOuR*}KL6?ras3V1PD$HMSIkWRvjM(D*;usa<_&;{1|zHYDn#{I5pvUOmlG z3xUoZz+iAdMDCxXfEaCDjxjM2&~sw4XWDzi$5`Qve#m51cW+G&6pA~L z#OgV#IjcEmpVO@?#8+hGjKEHX*l9qHd52`IpZ@4V&Cdua{56h-4>^{KKp?4ee5zm+ zzqLM;-vHM-Kl`oy>uVFgwbJd!kJ;iZPRX$Gdn{S17hQ9DEaUQMnG*<*kPd-$py`6b z#&7e^6Cx|1XDI>ckUs(je2$nIgTLzTzZex3o5emal{n~xpu_M@@WNgtl5JN|+?SzU zZX{o+5uAn#99j$^d=2J)auEL+bk07_8p7>b9GuH`V6#WiG~bv!CBT~fAG%rkSK)&C zM_eJMYW4U2s@udM!m8mK5!$JU(7sLi=Nwe~uTqs*JmEYx5iOC2k=W={Zx#5H>%^P3 zth-&e*L_BfAzl}RF4QtAs`E*#O|-DMEWF87?VstUtv+f(?XI;*2t}IW)_(X$W*H$$ zz`Xijk#v1!|#^+t<9i$Z@jk6HX0Lc|^Qc z1pG{Ze>Gb#Gak@EmdW)P)|>l3_3>>5MYCt5|L|F}KTjpYEbZ#omH8lq0K6Zd=)C`@ zbo+>HhG)pDkrMJ7K;Ql+q`Gi$8u5v(BP8l9Qs)9uxNyw=b9Vp5bp5>wupz3z1?#^N zYY0a@n?Ys`n1l}|CJfW}XL0_=Opec;q(3^#6(STHu=`1> zwx$GOQkFAn9pkp!i~S8%b?N{w$E;QP0+yeH+Ely$QH?#l5yAC)5xEoMXwBPhcV9nX zxlCCso}jx5S3)4{-UBsmv(&F)cU%O%T78kN3*UdWO*q`PWig}R~(5QZwwy(STX?63JRmE*8459N;0015raH5v55UXecc4fp860aIhSd2Hb9pyx{nxz3J!o+)@vC=(c zqFxvN>_nmkUWLNz;;)r5GUPR;1m{&JczY6{_PQOM3<{`r{4>OhxU@OPGAgUgr%BKG zM=8)1MP){~Fl#bc)ad2T;7+sX#Z-A>=rs;OBs#4c(;k(T(TA9BNIb+c#Y@b9CEzsa zGcRMeW^jwR=X;V-dd?E1k@099ylr`?HWXw0EawW*>|`k;y6`yZUAV_ z3bSNNgTkoouwr*?iX>U$drck+fdpe%fjxfj3_2@JsvXw}CseDi@-g$~2surPklF41 z$M?S|-o}I9Dc+TUxoxqXo%klF$U8fcND+caM-5oIYMUu>pEhG{A=*+N>c zX?a2!i*!peyk})wPJ>@uNqX19Cj-1;$~`g==C`JiA(a_H@{+xj`ty~TsRX7_OoC+0jGuf2t>fy~SE3)_s9aee;gNJ(IZg4T+^&VBq0R8&6r76puCz&7sprGpSu(;y)~5VW*&g`# zlp**`Tx|I9e}a48|2&&cda^jlOj>`sJPz~=)M=4zW(l2xkW)PWh9_zbrXv9g=maB1 z3j~6mdb8I|@Ro$G2oT%o%JU23;MEoD`|q#iwwE4{q_d0VHtjN4mujLCqh`0Hu|?Qm zXn+93BHBb)5(Ltc)LC^3Sb2wFQ9j@!C=FVAMhRE;AmK+~cz#L#>D(OY-;r)W0rwkK zDOe=jGDsfUM~iT@2$9iuRK0RiUB7t&@7)P+=dfDhb{(onoL!SseZn?x6=%M#5rLe6 zswDORhzoj3Xf#6{W5q9)SJr?qv=qEEyfbk9aUxOi;eL6+i^S0k@-=*_J|I2kO%44E zmm#yHGsx?8)>bXFLY9`#@~*nsolZ0Lk`_p`k5EBgM=b@#jFWLe>ukKBJ2s=KHjT`< zzSbPbK7tn%^-mt9Zz|}yI7K*15P&4exBKvjROOBmK-lOS`JEOlWF}Ic84~;IfN8Q#EcLJA)F!)B|(VxXGo 
zsDyaEc9Anm@-TiyD~Rltwv743=LD~eJPWRlLQQS}%ugo-%g%L*82l$!!W;xcwTxh> zK%*2AV*|RqT=zdT^&R~CjOw0TW(oTioB|~2qnwg8w$KTW%EGe_sirC!6W>I2NLBpw z?1WZviDi6K=X6vxeNb%Osf@xvMR@rx-zt;KyMCOcgWo9c1XZoBEFUI7i%C}0M?#Qc zL2|D9sG%37=o1yT4jJSzpm2 z3@6@6f}yjJBQV4we(Dqb;fSa(?O$Gn9BCy5O~@HN7OXNL1OtGQCq7-k#SvY!v(%jc z%_;~*5ZJBsGDKyp1(F)8p|~XktnwhdEQB}WPXs$#_aL;(DFdf()qDH;`@3E&mgm9J z%xprg8X~9zJV;TZc)^-0D0L38uCA!IZ0dG)v`5$oGx)OCH#B#Fb-w{pm$$ke{<0RD zzWMz7$r8>D^0+Rvu=IHoOI(4h771~EWw zxGdd2;u8s;WWK0%=50#KG4F(Aq71wdb$;!`5S%ELqLRm~O%71)IMbs=ZF^UYRNCw1 zClF&TN#9h`WdBKPEaiVE{!n?qukvct!l*<7NWGQSK}}Vm`^5iwoFb}?tmQ3bf&qXU zTOqq!=ip4(zek)-$3|A)&hS#xb}UPQT?x9d_sIRut7#~WBJYcIWGhGO=dA?Jd-kXD z&fi=wqlwW~qmvSjfWqZfkw{X`MG0`?)dDexpd|z~`;rFD#v)@~SED&iyHL-t%@P@L zTE~`F6jQe;_Ahwt^jh|)#m1^nQTtsvLJqc4gB<%Gz|)|^XdVSgoe62jXZRSdSl{!h zHckw0&>NHUwT9A#(T_}@$1XZQ`_RF%GIa-}-Xk_4-G>jy6l*2>J1j>{@@cXxiiPvS zC~4_2xO-Jp`;qWDeQzU@w(#1G*}9%G+5i<83R>E;bYlJc)@wyFARYroEwi2Isl6`g zwDR?uQw@o&gY_F|T%(yvo}_V(=n7Psbf|HGx1iNRZ666Ikg%WM*+($%B>q%lRQM}B z&$a5e7xkll7s>1%LoL~*+ErU0h(h8`dWNGfGzb!WJ9TayRaj}he-KKhnx2wZT0G!C zY`J)2_4yJ1vbEDUySa)@z9jEc-6kh6&x`y7r<~RycZ(dI_DSPDf=(lZLpD9vf^ZbN z;%PCv>UR~7Lrc571;<{Go*qnZ?_I6=G~C!>3baN9Kumn3t(qCIYx_<0A%tW++r}~D z=rVg1dzU2g>3pG_4f|D*wmTs2A^s47rYe%EcVrpVaIujui(`7GJivu}ut-l~dZ``{ ztm(6v{**K(NJUS!kpWf76`zOM>Lw8hv5k8^(8SRc(2R(X#fsCW$j=Vnf`X`^Gu!RM zn#-l%UR=qu9>p~@DT{o}&97srB(YX8(rHq#9v`L-;XU$OdEsO<%Sq(phjrcL;9@ zSDAMPcaCd1#4CwK=c&xXrgBvk;>tKEJMlwX^jPZF_gd#+?P~Q z+6@z~pAU_c1?)u^l*i=>CKo3;8qlz-^V!p=vz9fvdsylpWEV3Q&{LwK_1Mx=F(Z{* zf6`kAKYG8?I~#G$kv8^K^$Xf(-QN31!<4|v_K2wH@1&lu(TJ=Z^P}nSN*eUIRD*_} zw60LBh`4*nNp|ol3_oL48`>0Q$CC9#Qol*m!1%t_Znw#r2WE^Rj0TmwPLFC9&b4@^ zNPBqqgTE}?w`!uF%Yoi0?TPzxoC`;v5(LZ0XPiWCRKk2(K~eS<(*|=-EWG4y{(d>S z+zUQ0S4zbm)vUc$V~2vo-ygd0mSCZZj9agW`PtAyab?OQ6~i_uCDqSD8OBGQ2K{#) z67)EVC=JY3gIQy(GVm&7osfV?*qJVzntX~%!?WrxoUAi#s-+;^WTDt{ugo|6IAdhD z@}dJLa^O9z`wbC$&k+*6ZKWte!#vn_kMhzT=}a&^oUHvgRWMU&x*q1ZgwPEasfg|B5wuCNrkpx4)GONAiM3k`i|??E^G{yjXK=7KZ}% 
zs02BKnBGbU^#Q>064S9VOS+_NI$}`k+z5a-8{PXn^Z8xN*y7lD#EspR+Aph2@b6Zc zf(O(G4a>dyq{$sMe50eu-Z*v~PJRj!YQoL8!z{8iW3Q%%wMsBu82TUfXKcN0r!#e$ z%p=Jc{mJQ@XxZt`Oa~Uv8Vi9Fy`B$pX)93DFU>AdyH#mtn-BYRmS?Jz5IXJ3W`^#& zo%6HFZLr)(tJ-sU7=JxNE0s6=>6597`qb`YeI?1B@A`Q3x0{!*4L3AzD2Fsym%q%G zc(bjY<0QKaQUILcuu)oz>1jYge(WY*U#vWrGJ9N8J)(4O6lWnT`9h1EioZXu*t5%h zj!B%5jW|QHA!Mu~?c*K8yk(KOjX!o)=;HCkEpQVK1~y; zz6e)TEy}b)4_?c&-)oJxLl-riFFSj;5t>eCDNg|UeYIcLk|cZ^Xfq}fB~dIT5IoQ&{n z5Yl!*zRZp$VR)CgfJn}xeeL`W>@sr0#hv=i-xYnpo7@KwD!x~8?;wlX0VF*(Yzh$f|QvCwMx)HjpD(-m` z+*BB*+uWS=bh}?Si@OpnY|{A)k%-UQd3vK6w*{{UXY4pzVSQa_x*}sd9XtKp9)0YR zFdgan6YdJD)jafbTn1DHMGV}=yDQ@){6iQjq`rM{-Mca-&Z60hEJh8PYrAb_q$$<4 zD91`WByUnbMX5cO`a&sl%C$|+6!|#|c2@XlFH&`&ttgxBGsakcs+J0)iA0x)Xiggw zpT6tt8tD8bz@bNeZ~0ackX>a>kiq*o>j z&3koq_&e)@(AAe(nIcKuj;y)H9!|A{g~eo1P4Hfp{x@C;%>i4$P!DOgdJxDRn=5|f zW)LLgp#f~+xz1Qa8_Q{U#AGK_X=Kkpzp>GjJwZN{?m|MH@+M9+!-SyS+E_(=dJ}1( zvA&VSfJP~c?bFz|0dUtil=1R7n1CmthAX1}vvUlZWQP@V5ne?F=5GM2VkCkJz7Lo` zMT+EXiC9rovym&jz8^WLyPr&=wM3uYCS}tJwLRv=BXwd9$y6xI1(aPX1l#N|Y+JL{ zAT_`CVTFmx^iW9m{*o#X2oyzV74+$n;S!f0*6tQNZ6>Ba_`VE3b+mTeB=pe?du3Ce zy|P;Bl`OJAIYN=dTb(@hZq6|S(0*<)r;SI^44dvr!jbHkG_j*Vs=;F*QDvWA+y)1{ zwizd=G~gDRP3wBpE9yA`ETxRk;ui$8W~9_sZMEi$E4SI*esR@ntFT=_?r7W~ZBabw6hS;8S!PH`Z3Kir5L#m~`$et!jiL$ijWYAwx8&^R@P*Rgi zwbiN{-`5nu4K=5i(j^b^@L?(9z1aKOvs4{=9^K`~X9Zzzf7e#hd5KunH@=Kq-Ff@p z6)GTvHxUrwl@Jiz$o;-6^k2&xF&9vw6cfm_u@g0poSA>Xb$!~hHpKtpAv0&9tm_M0 zmO^V9ULFn_KN6K`@p;8fe4$39MQbhDxU?UGj#^vQsdwd$@|pyIDU3XnHrQxAT}kvQ`@QyZ9Qd#dXV1R!R9(;`$VmeMz~rSs%TDCX&fY9irsVYCO|%88!QuBEpx>7w}UJ{prXBtO?7z_Fbx7%E?hXy7%<^9;>ltrkeA?X z!9MAs^F!Hi*8G(ARu=}l?4EejI7&AMo8R4M$~$JJXe*pizmD2-nv}t|#-oQg!_YgR z2p+s0A!BeNSVhszW)rTYM|*oU7ArgR`NT<6mR6QE-R*Et_{BEe%`DOiy%w_cM#{?T zA}FS>nJ(Lpb?o;-hy7~nhB~!n_l3F@DVnJlg%hu-(8y!AnyTKho_8kNC|u8Q?qRj% z8+I!8q&$F;1NLe0js~zXP>A^}ieozM3YuAEtg^s1*Mqe#%XQUGpDdXg8O5a8DpY_( z;jc;h!VNt6=j455@gNy8esU!X;ZbNtImY%noYpUx?}G2x&A$&+YtvMm=I_+LEO^(j zRr2^6oXU3OJ(K2$nti}fnO@5Ph@6;5fngFW!ONT`vjNb&t!9>c`W-=EJon8^ 
z)7=-C3Jcp*V$}1*B#cFNW=+ZHC8BxEpa8lo$bfqHIuThrGJX>6XI!1#Yykc1SZoX4 z2@moeF%{8R!wUVYX(Bo?G&&M8(dri3!{T$rlmx|1%nv-2vXnK*S>-t>&<}!HDKY?% z*PJmHF?w`0Eie%r33;k*Z@GjZTy4So)Gdy&{T>w{_BH~0bX1@00w#Jpq=9lqwac)n z{L>f+oJYTuqb6ac4w3k)dk%Zn{r8Pu^B-@#Yp5&#eW_)e+FaeXTTpb>Mj~2h8A-`X zKibj2tc_Qog*_mgg6aiFGc?3YD92-vEZD-0f9_+__+89WprJ$HITxpFR-Wz#x8^q5 zWBMSCHC9Q>xCmT+A%*a!3EcWgoNh>Ui&|>`+iDE5DhB;)A>`}EPMvH6IO`>4(Xr7} zTjq7%?An_qCCf*Hgz+6U?P1gtQac@#=dVaiVly4&^_ycGtovR;238WQcXsDo&G||c z>&-!;;Q#;;rK&t=*6qS2{BlVpxb<=I~UAo+EaAGp#Yof~pfNjYUiw z1b4!)h!TaJU>`Nf09oY8-M_@Y(Wxq$VB;z-di2a-z5BBXq{EO3gJY$^ph1>cBtjN@ zs=yOfXoBi6?hwq|vj7YWXp2x_d1=4np?B#%zFCg*)yQujq0H8X*lu%5jEKm#O%`bl zf1--Xs1wUb$#_I7Oa>Aru*}IbN5WrJwf~GXQ-&Igljk2-g@tK{<2#v(iSJ%P+n56J zQfN}OHY@{Ym%YSbgLEf_q2RB4)I0=6#ujNsNy(!aX4PyQw(Qp8IPWwu2Z1e};ftDD z#VR|pXdMGFQQ4)d`7J7<4?9deq=PVuWotNT`` zSYpS5bk9c|EEM)*HJmSX9P31&+i$eNcZr(i40Y`8er z{%XmrAIwk|36ptQQfn{|*^_N}-r2RvvC*u!E3bYklhbG%h9HpbhPNyq{REn%i%|Ki z?kx0&o#!k0?O2bOh(C;t)$i(&X*NS2IOMC!$j&M(z}fqC3S_amvt$LsS>Tusje@ww z`hK!rML3GNDPJ~J3()OS1GVuNg$H#fGVM? 
zGLBIp*bo5Q6o44JKv6&xjQSt+E`C=i`iI_yZ=1jHSoK})`DJ;!T9KNzBl}ingppe_CRg0T+=)8y*Fpb#$4Pd~< zO}FrM(!>A#sBdHVjrYSWpW&=p>GR(JRq132M+P-6zIRM?UF-e(xxD+8Q{SF_VjBD) zs_T%w5vZu(;_Z4@8sl4W`Rw7>Q8!<_)U5rR*GXEG>uVBryVUkw5Bq=A8=FcO+q88Z zd`S5(@1A#@a-Y#)9cZk|>Igx#o_*Civ?N({yb}x7H&4d^T*=a7LZfs%39+q&BQH^q zlsCcLe7jDdTGPUEGyG_jLz6rx&eyr!KRvawQeg7z8I$b)hsOWK(3CyPlwTVx59YJr ziiVK$w@i{JD7C>~<+$EP-Wme2v4r8043z$=;TriR9sa3m=HswAYy?PM@Ro!A3mQ)4 zoP@m#N4$Vo(Qkml^B>v8vp%lF zwrGQ{2oIT&;czq&4`v)dahK@Vw3w;riN^LuD&*59%cqb@z5Lsk3(CPMb z{Y4-C&A1Jz7FrvO;4!E>K8Z2+kGbBiZQ8K*kqR=@t=xt846BjPD z)+5J&y)k{I5R9;B73-5!DJ77A(IP~Kwdv9+?J)xXDd(2|XXTuB#?9JLd9b@LmxCEu zW6LDIMydyqxbXH7;dzQ+UNSsXOQfZ7$r>Bz&|E%2;R*~=dwZ$37iwyr@97sYsVbYe zUkZJF8Avk=ZGSnJPS*A{uBK{HsPkJY@vr)|IP`Shk0#=~RcmL2(yy|~ihFuW!~f@c zI@4gh@#V9Uw0}p$Q?dW7Q;fgTdG=r5UEn$= zUcgw&?)mnKcZjMpA|nv@!u$IpKmiRB`%F){9g`NCd80>oUW#qiFdaOnd`cA4->Z+8 z>=QN%K=XzLV^AsQ#RbzNsTq&MG_Z|kTt4dU%a~e}dpaJHd6pmi>Ct+>>bp|Z|Gggk zyW^vi+}FU}d`2_iC3Nw4u-&I}s6LnW+>9WA`@w~G*oBbb>8-@+_2h}*0^8^SRQGjU RSa;%iyDGbD&HQi6{|f>17OL%GguzlI6XEf{vdvKnAU$=@Bh7S+q{-u?cGw{rMlaU>+bC2 z1$19+0>19by4qW)-$8`du+p#G}}@jk(Tby_LS$`eNI*EvNzj5O!XDUt!zbwlTC@vu(9q zooy@Gwr!iI0>!qqWy?Q=ACKBqX^fcd}=9r~QplFa+y#BC-FB*tU(swA~%- z-kyDjkqJ98HL1Z+g3{j7biZOn7_B3c6Y;Xg_q6{%1jJbg?5?`^+O5fI-@BYYacdt`MKMCB<@ThR;DOEjCwqK zO(f=S?F|f6qhzpn*DVL)9$in*a(6=DL=Uw^5H_36LIqNAR^ipMFy>7`0(leUqZYE-A(j2+G@1mepqpKY*hAxgl{dh@1$u9N4TWfH0{ zP$-$S3?Ks?MPdOlF`bijXM#kFxn}>~?u5 z*BoOrtJwXYzWMRQlO586r0N_S{2LIpv6QO3P3F`p#JR=g_a*=v3p5;+y$iIM99TOM z*PzbF1o z?c;E9TPGS%n;R$~telRb6AkpXg|;`7d0R~8V~P*2O;>^Hu8P};2pz6${KDqBYXR}@ z_P(`aiLv&9^%Frartmep+swa?DLj@9&e|>*MY^*w{YpO;G3qU30<5ebDsUI*r+M zQ^5*gKrM*vCMU)le-S2IBWk*@i2BZVScPo%|93<%mmaz)sR8)r|<(rMEu7kd(jFsI!rwS7O zwCSXm-{CCX5Z#~>;@?@SiB z(XA0Y*};ngo{tf@TW!z}DzaTl;eH=-*!zx7?z(HtWqE8@>*KoHD$BfU?SmKa z@Rs^VwKx5x;$3lcO4{r_zOAz&7cF|rvQ%i*Cw6lt?YEz*Gb0!Jy!p?cFWkH5swh*d z)EKq)`7MHm5~Y$ejYR+NHd#AreyCRgz^Wxn?oCGMfA_Jv>TD3jfj?pTbn)-sYo6|Y z?ZQy#ulD?U?fJF#m$u!xx#~Bv^5DU%1_qfQ{Hp1sgKj79wU^8{btpF7R|fW|`HL4D 
zARfNumW0Ei@Wef)^{fgwX3;Jq;!1MEf}JZu3ag~ve8b9Iut#chYJRA(3a?!GAxV3xO;4JyA}>%7wJ;HFRU4qdn9(p3ylY9tYIQQEwWQwP zG_6jy8&%q*!ts7v~BnnR2 zvlP8hjkJ0ST6|O3(ppQY=iV%8VF4)V1u25sUOi}O&@xcmD|D%Mk4{I`Rh!Bg-3$FC zN|%}ORy1Eft4t4C^nUXI(DTLrj*wXQ>eX)mxN*b6z+lW8W!)lhiKq(uo9w-V&Imb! zQKju|bjGrfg$x-@WrcuY>jp}~j15kKE-52wU~>4$?Y+FB^Rhc+@wz$LA2+@)G+sSu zainJgqEv5Y)zZLhz-|v^_FevB&4oDVCv4P2B7i26`n>tiiOX6PiqBS7 zU|_JZ=0f~@k$SZ9(u1UuwR$s(?{*lCuo63<$|1Q_)-k5sIm2|*5$TEs1JZ3B!IH=F z;q&bMnv1;Sl{;`)9-p`y*Pp!SnT%d5ZT0Q9kHc6L&L9d#>8%E~Z))U)YJn+-g=oYW z)UI{L2|?6?Z!OwdH~j#@m&A`mL9yE4$FXYaBj<|1vSGix0m(Q@6m7r_l$|Dw;y%Yi z`Oa%_x8?TPk|j;oj+?9Yd0N z({g|)ms!D;bakZ$s2a-4ta>L-jP7n4a$IkdR+sXIrcnSRt*kMaaCD)Ty+a14q(Do< zKIzJ!toL|GDhu`7^P%04&M*oljHGF#A1w_YI8f7iD4&>k*V>1kMNVfDcH>=ZA9@lt z!+X?u_8n1*=a)02Y>-yE9n^GyI>2xBexdPPB?z@@79X>cXKtff z>n6ydZD@H1JRE9i$XctDvCAu^iq1gar|~grGpDssnUlH#oR+9I07d2_Sc)i%$$V64 zZ-uxo-4;HNXeJ$B-Y}HMDe_#kA>EWQ;J!RO)vNb|H0D9}MBcQd+w1*-0b%7GP4Wc0 z2BjgBPq&Ay8!&NQqGVC+(@UeN5&|@)KsyX6dPrtClYbgMXrBD*oZA5h@Tq%y2!l@Ih0^9u5kcU z&1T(PiJL!q2veA^QNfS&0dHnhlhwEK#Q9k zhs;wautAtBOrTGS_NQ-4oi**Zwj>$cT<*+ zAQ+d*#%T^-osLUhJ?u-VA;&0D3I9UYg6?Cu0dKAKOGsxUrJI8h&;b%oP5UT@(RJvs zic`}{N*DojB&?){V>p3A^4T7M6j~Av{Zq0%;OJh4Z<>>R_3#VSGOV1k6dVJWLWe-O zarFUF0NP_v9oH^bC8U^Jq13eZ$SbW4MueCVS%|2%6+(|xwOmNRY><=u6*tSON&JY_ z@vvm11QMXA*xtSR07Ck{_8w5U_RwKYCA0uULVHgTEcaiOZpCI2&Kd_-vw#-IEL+O- zo0tVE0T`!(;S&VozBc7Bov#-O?}+`U555+U3kgP-J*kSD5Dx#+jrkP#uU z1dds)} z_g5R_ok}_blsBlOe*EUz2RcWZZeu}c>50544q6&y6?}ULrXMo$^${c*h#Vo1!CC64 zAD_AQfx9Cgg=n`EQOFuu9|?@KD=D;Y024vr@*K3C==nT%p`tqK$A7MUDDMDyIIX&a z1fDVz;;2c67h-P)v#wF4DK?^h9OJ*%KB%HNT3QLzR2o^ry$DQ{5Wr5IqZc zfgx~Y6kKia!91qbE2S`R9>HpAE9xhwn|_hRT}}KOu)G9jzbxeY1pLbd>02U%x2Cd;YsNxI3f{|QO4@>#cobilJozo`6oswI?~QAPTreRk^4R?@ zhwB^S;QPCT$#`b*9$S)DPKr}s57BF-e0|=WR5z}TMJZCJB`@{(wFQ|<&6J#NhV5hV zBP9hAg;Zoq^Tc^Rg_VLc>oV?e7>cO7?9-X`RB<7$$b)ZurzeHF+&<7re?BW2f%A zh^(bDsWE!OJ9js{aVhiAXzj7lb+Lb4&osNg!I?Cdw0wwm&Jj=@ov`30(sj;DWx>lc zCAYJi?@3eGwq_HC8|OSHH2g>XEXTBMk8EpPtg6HI`Ge`=Pm+SO_7yF(T?nSwO9!5K 
zjc*xMre}AGcdVbjdUK)HGtJI#KUa>d(B*0t+!PPCrqJv&kH~_q4+JI%H}=kPV1J+U z#J*02IgyMO$bC!Sxa>sVN!x{p`qKi9Z~k&SsZ5DOZv`4pT}O9PJGvMsB+LWr$G^RmkZK_B8EiTSvlf{d2SvxkI8M|q_5Y2d9ydxK=sGH2Ra6UPY znGyRdWA(a^UdREdtOD4b%GzYXO{DOFUZx1nyDtUw`2PztvwMaX%Mq~MQLt@D^Etp% z+G=;`T!ETOHt(0w_ZOrPMyu}t$a00h0cqiFCHMT!51DwBA2rZO+M_1MVJWQBsl^@K gyuyjc-eBCejU=i6w&%7N*k?pc0DyvX zp0_YH{a989L@V(MDVjo$a69wz-zByKJc~ zz09!d+}g-I$E!@snT0PPz_^;{=?)^qyZ9dJOum4wMxKrSzwh_^{|`wWA1NJZ#0sdx zqYNf`EP+DAgfIj}M-Hes?8GXe1qF~SQW-5IL}FKsj7aI^B=wOID3;@Fdm}ezQLR$? z@yn&3P{nDrQUtYN52pqzi25kV!QYN+qUiQY}>YN+h$OEH_jY6@+FySY}-!9)>~U|ZQI7!a0CD>ByGE| zux)$yY}@K=R_<=wvr*Z$ZQFJ(*#?TWZ3>mUgnMz4ySt6cKl9K0laLHsfHgQolJ9{H zP-#?$=80sj{|wuQZbcqW>4PsRHK4#3Z~<$FN6y2`HFg z)1}XexV-nZzV}E&+HmBB;fCfB72JE&KXD&n7{kI1`&tRO_oxwT5N`ca`;eB9(q_4f zIfR;Yqccj&rJZOWI!nUX5K2M}d8=bYB0`j~*fxMx_oWDxs2lUaoW*8c6sTY^=JH=f zmV|9X!Q@2kl5blHCju~HG`x@bfM=)!Y0q@JMHy6EL zsOA_elZP4Pj2oVU#_-Ew*BDJ=&`J#(8ol#{ZUazG*Vx0#E`o-aL#Bg{ezx<4ZUCeZ zuOkcH0Lb}n0Ni-q=>mRH&rH~jz~SM1H$(8wlW+WC+vxA#D-iu1*M(9Aq0)1vgW@<- zfi%~&kx3l;!DLYC5NuS>$E;f&-SURLL#Ko3vY}P4k{wMHRJ@0y0-fc-38zN;qICK`9wGpWX?{8ticWkh}A0PJyosNW_ zxt2u^no;Y2@+Ls=_mlbu84&%=+;G9~&SkLA5>qwgSQrX-D^Pbjk!-KC@wJfhB>DCo zi;M0MKnPxK`zk}CzpMAmHS@xBTg=hh0Z8VDwadkbCA0C50JRs-9Q-6RqQ6f;`ILP2WLGM6E?2U0ZrzdZZzvnDOPDaH-qHBx%nncjzUHFdEbVtp_XdUI zRn{$4_eT?v#OPTmw@W@WN6_Vd}fG(rEvBjRfV$$&1YU}JTsT}NG`UaqksJki+N){G&)YypH=48fikG^ zbx}aa0_>&;&5G_eS;wf-8m|h~TfdZ%?`&m+QHE-uI6CXk$Gc*CP#V_mjR(|`j)TAc z>v(N{s>IBOYSZ<75TA(*bhpbK>sz(4*=GSjw6&Bg9HC0se&)}gO#C<9;Gfi$gElnU zsjnV{tcZmv+#xSmMl~LK#U+)@ZeD*7ret23_-|_#neFd6QlHj>UyHk@^#8 zRNA%A2a+^PSy5(Y{_Jo2Favm7ZRY%!jM=AJBEtpPqbR3skWY5*+9C5ML{9q55*OiNtN z3Suti-Y^6WG;U$G8(*|b z_*DdIL2SF^Wif~(SlwS4fHT#B0Xln?U~N(#%c87ITci$!#tm%Jo};z=P7&Nmb=vhe zyoU3+%(w@)lkfQ8&;g4j6VvS7hj-RqML+^|a!p>(ykW7a;+?fs`-S#)DMEnXZu@N$ z!hO_e2`^iO;R+*M$t2>qV5L&kcTB2o0z?xEmTvuvA=IrZT;?F*rF|bh?;3)4+K055 zX}FnkZ<=OHa$kW0+{Rm{~{=sV#<6ZO^&p6IlJkQjrV0EiF4l4irdlwzHSdJLq9TR%s zmoEuCVtn^_`U|~2I`{`QqQ8$06ogD1>t4VTQ+4C_;8w184&`pQFY0w!_u$^jMSssl 
zy>9#-+=quT`r8SDcMJls-TOVbtGOc!4iw1m%3)!B4uq>&u;C=w3M9J+QNdE(1VGqK zuiIKA*+q_?6t-7)Coo>^Mj)Jvdfgre(Erb$NLyghlz{=s+RmS^CzI9=uU>tEu=(eW z1>lbx8JO?c1;;02zz=)a!pQu{Zf}#ByBEJkE?YjnUkIC5O3&33M#gd0i>1)jfq8ux zG`)IvOk0jb~C9soDYb_iEzm&XfZ$1^^Hf5O~D| z(3;z8@j)|LRdwBWW@c7(UVZ%AM;92}rgnGXF~f<-9`HS4Zga%k#+-{V`wSkX3hO#| z?vciD;TM1GYL^!-oQNzwz8tX#Hk$xV+H)G7$aVbu8gS;wv(IK6mc*yGG6*&XN^F#> zAmS16<7@JkwOGs8nss^0no%-OSH4wf#w{h4P%mb<#^~=s4&< z)Y_5%PGO>UkUJE!-?u3HGf;`?|ymRp;& z^l9+tnK$2j8*FXFB96VaROUvMZ>{n+WunYlmcfx^a@6?vQ9d40Q&E0RNEM?5$%OJB zDa>dZZk{NL2;QTppV1~+526_`%cOmCWE>+Fk^V8+P3`~w-4Our^r@+a z2E9d5Brc>rCw(c0RTpNGDV}bK-OQk%%xT0Tnk|9?*ff$L=@pfLNvlvxT+5MUa-fT3 zBr~H?d<_OmW$MbfSN6l7yAxKnrXG4$03l`cs=nb4SAFy52899vU}a{$tEbtsq&cOi zP6-5`fCMEII!baZ4P=ZWgVmy4sBpoAE%KEB z5}-sRRFreI%-BADGi*W6yNqWqP@8)hBr zH0wuW@wL=c%7H%w1#?8zd_<+;;=y-^k(A{(SQu|vYCwHx!s2+E`l^K#XQc5kyO~JS zC3bs!ZIlq~`^&3oTSo!{lU8>puI|dqThK?q(eV{W__j;TdQd${;~Su1GFL~mo7jb& zAvbNnTE9o+u*FtDA>+9fO9VK@%jH3VcpPHxs5E?9x!5Rvl1%W_B3{u#Cad1Px}|%P<|| zqUox;+x@;ta0D||jC#d1Pz!=ax!ZujpwNA=Fm$xmm-)#?r19!xCSlQw8t_)L;W+YL zodO@bTHvD&aHOaYDUG+J6r+aH8YltBvl}pH!5gHp1t?N8K&Q)|heFf}3eax}m^1HS zK(76xq|4Whs&f@E^O8u(Wufq{7G^E^miq1{NUGTe^tm2e@R=7K84Xwq`cBk`NZMOf z1;{Csqh^n)Z0bY45{BNAY)-SLQr>!A=;8sG*Yh$r_7P=eE<`PPI8n*pT~vGR<)#Q>KIlUNTxD*S5B)kc3)?eA-0 z)8m|GfXS!+r%bb299y%VD%SjH5T>}Nl{jqTmddl_4#V_nK<(h7uLu0{<%xSLJR%i- zHRo=lzp+O+Q5I2cttTY8WgAt0|+7e%^qKSAkspXGDuAdoTV~*tWRSI#$eE(%5FkL zJdn&FZZc7p96d*um#FZFRQT1LyCoV7Xb?%<1f*0SX=7z96xb;_u%U|Fqe?%FR;xQp;4JozV?{X@Qwmh<`@#`+-P1oZBFYJEsw9wAP+d!^JS>MJrw z-#XVD4S7cDmHqHysUvMoee-Zba{ z3D-Owq8LHfmq&D5^VE`rX&^}4wWQ%B15n6b2L*H63sOHI4*H1ZDFJk*hY!7*b@7p0 z7u*;?9fDO48Qn>ERyq{QYF_`hvbWYeR5p80GSFp6%pbu>TZiaKd;Mu8W}0l4w?wZh zJ)~+}Nx)*Tos1C|jsu(Yg6n@(!S?|!;KC-VV9E-bp|Q853%v!*N0Kf#rt`mcHlC)6 zhnWplHb;o*5F~(Q8n1r+-w5lsv<`v7yOY+*v@e{B^y+v=yW&4IlRfMtn*7TYi}um77j zUt9ukfNugX>eLvn9(Odr;|b@$$?lXfm7~$Pu3AKPxxbb&GBB_jzr^$)3*&%`RvY+V z7o2amL}gVc6>Wv`y2UHIQv(PX5{dz67DCTxnmzxv<_tG^={g2DoD1pHbap32Rh+I; 
z3qE*0N0c6s*eV3a2e7cgD)+q&3c`WNuTJNG-EcVAsSDWNNd_t}fvb6zG@N9F9m0c3 zpc0b_$q!=jX_DFTob67AaGkJ8aK`5Kjc^$+NE9{0dLraQl}t`x>@A)D zb;H><5o!-It840^szEgHbCUy-#%U4O5RHr318nhS!VHx}X*Z$yn&wZ=o)tf(aqt(* z{aGhMvYck1MF{332Ym5P1Cu$VKCSh?b~F;zKR*_wy<{UYP!FLnc-cc4AER}MPF-Im ziSVkw4dWyHSojeMEz(>Rb$p7w7U#-DAmMaXJw3=tt5C>r|2WY`8$!e#(>p5oUpok_ zZb377n<=o0Wtuhmn>s^+FUu2Ti){nQy%y(+ zzS3Z~7??&3{I8u5v0F(%Zgihz2K^u=677;NIuT5f+YVfUw zyN%FUE^PpDHGqNtH3=*@+?)tBL-;olx|KaQ=EJN54R*0B&n}h67BB)E2D>OEUvq=J zW)G7xwg4)j5;E|=25WRg=&J-B{$LLM+3we?%yrT4!EH_#{T|#7jJu8gc3|9X^fwd3 ze2qgILV8agVV>9`8Er_#f>8keI3s*xVd78kAsh9uSgwY%w{llb%TJ}8m<}nb4TRH5 z_29C(VA5IKz4h`wWTPG<&1K>e!yb8rBN_}Nd~90?HYiSDMykhv~mbb-wKe7ZnWD2#T^DzY@F2Szn6`6pm*Al7wXMAE!J>X{? z=}E}hSlwBkgq$9H{P6IM z5_8YI{{kuMv=2$2s&D^(?dQ}gZCml?zvwKOrcS_9nT_U&$8}G-~%kS9EdN7JjYn}(+d>d?9 zQzW5;MglKmYDWcgt9Ypg#Na;w;a%-k`k@oVgcSvx>NkAFet%D*9Opo?JkcBt8=w7v z5>-38W63;22GR$%(N9_`UddKw&&OT~Ye3tXJuAA^ST@?U$hx(pejBUa=D$ea>P~>^ zuZzurJ_Wn#Q;mi#pt7;Uj_u5n`n)0-)dPOU-s-7UR=E^Y&weQSRDTKorL-EI4(zN7 z?-?gLvKJC)VS09KPXoMx@XYCBvcPOWU@8$?s*)_G_JmWcNle!!1k`w4cpr40H=31Z zpv!oZ6fa8lcB>uL*eLZlN~v0C&b0KFsl}mI$FT63Frsk^8t)G0Lq|Pw7%I!-2eN@z z)Bvl!q9HrBGg}9}f~EBR&qG;DcA5Wz3D~hDBK}|4&lx!6P`?PW7#2PoW}{u|%}2YO zZlKF_u*(?M7L`P}76Y5AKt043oBZx3r<5>WIh6!i8uJwzy7_0Mb06s>31y*e22j!m zoTrKa2JU&OUroR{=w~R58`vP+uLtS`9b$h1Yb;;WspB`p- UMJ1%fdO)3jb&8VH9eFE~LU-|l_y7O^ literal 0 HcmV?d00001 diff --git a/performance_automator.sh b/performance_automator.sh index 6899fe1..d6c5419 100755 --- a/performance_automator.sh +++ b/performance_automator.sh @@ -101,8 +101,9 @@ median_thrust_scan_time_npot=$(calculate_median "${thrust_scan_time_npot[@]}") # Now write the results to a csv file echo -e ",CPU,Naive,Efficient,Thrust\n" > performance_results.csv -echo -e "Scan Time Power of Two,$median_cpu_scan_time_pot,$median_naive_scan_time_pot,$median_efficient_scan_time_pot,$median_thrust_scan_time_pot\n" >> performance_results.csv -echo -e "Scan Time Non-Power of 
Two,$median_cpu_scan_time_npot,$median_naive_scan_time_npot,$median_efficient_scan_time_npot,$median_thrust_scan_time_npot\n" >> performance_results.csv -echo -e "Compact Time Power of Two,$median_cpu_compact_without_scan_time_pot,,$median_efficient_compact_time_pot,\n" >> performance_results.csv -echo -e "Compact Time Non-Power of Two,$median_cpu_compact_without_scan_time_npot,,$median_efficient_compact_time_npot,\n" >> performance_results.csv -echo -e "(CPU) Compact Time With Scan,$median_cpu_compact_with_scan_time,,,\n" >> performance_results.csv \ No newline at end of file +echo -e "Scan Time Power of Two,$median_cpu_scan_time_pot,$median_naive_scan_time_pot,$median_efficient_scan_time_pot,$median_thrust_scan_time_pot" >> performance_results.csv +echo -e "Scan Time Non-Power of Two,$median_cpu_scan_time_npot,$median_naive_scan_time_npot,$median_efficient_scan_time_npot,$median_thrust_scan_time_npot" >> performance_results.csv +echo -e "Compact Time Power of Two,$median_cpu_compact_without_scan_time_pot,,$median_efficient_compact_time_pot," >> performance_results.csv +echo -e "Compact Time Non-Power of Two,$median_cpu_compact_without_scan_time_npot,,$median_efficient_compact_time_npot," >> performance_results.csv +echo -e "(CPU) Compact Time With Scan,$median_cpu_compact_with_scan_time,,," >> performance_results.csv +