From 777ef67300079828cfce5423bd30d8b80f60388c Mon Sep 17 00:00:00 2001
From: John Marcao <jmarcao@users.noreply.github.com>
Date: Wed, 11 Sep 2019 15:17:04 -0400
Subject: [PATCH 01/19] Update CMakeLists.txt

---
 Project2-Stream-Compaction/stream_compaction/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
index cdbef77..185a604 100644
--- a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
+++ b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_75
     )

From 6e1bb53488f0d5af8d8ecaa6f85ceb720eaa7e1a Mon Sep 17 00:00:00 2001
From: John Marcao <jmarcao@users.noreply.github.com>
Date: Wed, 11 Sep 2019 15:18:11 -0400
Subject: [PATCH 02/19] Change to timer for recursive calls.

---
 .../stream_compaction/common.h                | 47 ++++++++++++++-----
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/Project2-Stream-Compaction/stream_compaction/common.h b/Project2-Stream-Compaction/stream_compaction/common.h
index 996997e..40212d2 100644
--- a/Project2-Stream-Compaction/stream_compaction/common.h
+++ b/Project2-Stream-Compaction/stream_compaction/common.h
@@ -60,23 +60,42 @@ namespace StreamCompaction {
         void startCpuTimer()
         {
-            if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
-            cpu_timer_started = true;
-
-            time_start_cpu = std::chrono::high_resolution_clock::now();
+            // Don't update the timer if this was called recursively.
+            if (timer_count == 0) {
+                timer_count++;
+                if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
+                cpu_timer_started = true;
+                time_start_cpu = std::chrono::high_resolution_clock::now();
+            }
+            else {
+                // Timer was started while already running, do nothing.
+                // Some implementations may prefer to reset the timer.
+            }
         }

         void endCpuTimer()
         {
-            time_end_cpu = std::chrono::high_resolution_clock::now();
-
-            if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
-
-            std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
-            prev_elapsed_time_cpu_milliseconds =
-                static_cast<float>(duro.count());
-
-            cpu_timer_started = false;
+            // This is the last end call, safe to stop the timer.
+            if (timer_count == 1) {
+                timer_count--;
+                time_end_cpu = std::chrono::high_resolution_clock::now();
+
+                if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
+
+                std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
+                prev_elapsed_time_cpu_milliseconds =
+                    static_cast<float>(duro.count());
+
+                cpu_timer_started = false;
+            }
+            // Outer calls are still pending: decrement and do nothing.
+            else if (timer_count > 1) {
+                timer_count--;
+            }
+            // Timer count is 0 or below, which should never happen; clamp it.
+ else { + timer_count = 0; + } } void startGpuTimer() @@ -127,6 +146,8 @@ namespace StreamCompaction { float prev_elapsed_time_cpu_milliseconds = 0.f; float prev_elapsed_time_gpu_milliseconds = 0.f; + + int timer_count = 0; }; } } From 1a723827e2e1601333bf3de9dc8526569d7fbe4c Mon Sep 17 00:00:00 2001 From: John Marcao <–jmarcao@users.noreply.github.com> Date: Wed, 11 Sep 2019 15:18:29 -0400 Subject: [PATCH 03/19] CPU Implementations --- .../stream_compaction/cpu.cu | 64 +++++++++++++++++-- 1 file changed, 57 insertions(+), 7 deletions(-) diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu index a2d3e6c..445cdf1 100644 --- a/Project2-Stream-Compaction/stream_compaction/cpu.cu +++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu @@ -19,7 +19,13 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + // Exclusive, naive, sequential scan. + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } + timer().endCpuTimer(); } @@ -30,9 +36,19 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + // Simple stream compaction w/o scan. + // Fills output array with non-null values. + int oidx = 0; + for (int i = 0; i < n; i++) { + if (idata[i]) { + odata[oidx] = idata[i]; + oidx++; + } + } + timer().endCpuTimer(); - return -1; + return oidx; } /** @@ -41,10 +57,44 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; + // CPU Stream compation with scan() function. + // Create intermediate buffer. 
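            // Worked example of the map / scan / scatter pipeline implemented below
            // (illustrative values only, not taken from the test program):
            //   idata = {3, 0, 1, 0, 2}
            //   map   = {1, 0, 1, 0, 1}   // 1 wherever idata[i] != 0
            //   scan  = {0, 1, 1, 2, 2}   // exclusive prefix sum of map
            //   scatter: every idata[i] with map[i] == 1 is written to odata[scan[i]]
            //   odata = {3, 1, 2}, so the compacted count is 3.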
+ int *tmpMap = (int*)malloc(n * sizeof(int)); + if (!tmpMap) { + throw std::runtime_error("Failed to allocate memory for tmpMap buffer!"); + } + int *tmpScan = (int*)malloc(n * sizeof(int)); + if (!tmpScan) { + throw std::runtime_error("Failed to allocate memory for tmpScan buffer!"); + } + + timer().startCpuTimer(); + + // Step 1: Map + for (int i = 0; i < n; i++) { + tmpMap[i] = (idata[i] != 0); + } + + // Step 2: Scan + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } + + // Step 3: Scatter + int oidx = 0; + for (int i = 0; i < n; i++) { + if (tmpMap[i]) { + odata[tmpScan[i]] = idata[i]; + oidx++; + } + } + + timer().endCpuTimer(); + + free(tmpMap); + free(tmpScan); + return oidx; } } } From b2d9232855d71cfe879f091e035475578f499ff7 Mon Sep 17 00:00:00 2001 From: jmarcao Date: Wed, 11 Sep 2019 18:19:27 -0400 Subject: [PATCH 04/19] Fix Copy/Paste error --- Project2-Stream-Compaction/.vscode/settings.json | 6 ++++++ Project2-Stream-Compaction/stream_compaction/cpu.cu | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 Project2-Stream-Compaction/.vscode/settings.json diff --git a/Project2-Stream-Compaction/.vscode/settings.json b/Project2-Stream-Compaction/.vscode/settings.json new file mode 100644 index 0000000..a796974 --- /dev/null +++ b/Project2-Stream-Compaction/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "files.associations": { + "*.psharp": "csharp", + "*.cu": "c" + } +} \ No newline at end of file diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu index 445cdf1..e41a3b6 100644 --- a/Project2-Stream-Compaction/stream_compaction/cpu.cu +++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu @@ -76,9 +76,9 @@ namespace StreamCompaction { } // Step 2: Scan - odata[0] = 0; + tmpScan[0] = 0; for (int i = 1; i < n; i++) { - odata[i] = odata[i - 1] + idata[i - 1]; + tmpScan[i] = tmpScan[i - 1] + tmpMap[i - 1]; } // Step 3: Scatter From 6f3fc1690e58cd7e55ebd30e97e1ac236ac49a58 Mon Sep 17 00:00:00 2001 From: jmarcao Date: Wed, 11 Sep 2019 19:16:11 -0400 Subject: [PATCH 05/19] Clarified when to start measuring cputime. --- Project2-Stream-Compaction/stream_compaction/cpu.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu index e41a3b6..1a26920 100644 --- a/Project2-Stream-Compaction/stream_compaction/cpu.cu +++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu @@ -68,6 +68,8 @@ namespace StreamCompaction { throw std::runtime_error("Failed to allocate memory for tmpScan buffer!"); } + // Exclude the above mallocs from timing, since they can block! 
+ // Assume everything is allocated already for us timer().startCpuTimer(); // Step 1: Map From 07cfc592f22288984850c2f6d0901b6f963eb397 Mon Sep 17 00:00:00 2001 From: John Marcao <–jmarcao@users.noreply.github.com> Date: Fri, 13 Sep 2019 11:31:11 -0400 Subject: [PATCH 06/19] Implemented naive CUDA Scan --- .../stream_compaction/naive.cu | 74 ++++++++++++++++++- .../stream_compaction/naive.h | 2 + 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu index 4308876..283c28d 100644 --- a/Project2-Stream-Compaction/stream_compaction/naive.cu +++ b/Project2-Stream-Compaction/stream_compaction/naive.cu @@ -1,5 +1,6 @@ #include #include +#include #include "common.h" #include "naive.h" @@ -11,15 +12,84 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + // Allocate buffers on GPU and move data in + const size_t ARR_LEN = n * sizeof(int); + const int NUM_THREADS = n; + const int NUM_BLOCKS = 1; + int* dev_odata; + int* dev_tmp; + + cudaMalloc(&dev_odata, ARR_LEN); + cudaMalloc(&dev_tmp, ARR_LEN); + + // Copy input to odata buffer + // After each loop of the algorithm we will swap tmp and odata + // So that the final result will always be located in the dev_odata buffer. + + cudaMemcpy(dev_odata, idata, ARR_LEN, ::cudaMemcpyHostToDevice); + cudaMemcpy(dev_tmp, idata, ARR_LEN, ::cudaMemcpyHostToDevice); + + // Algorithm adapted from GPU Gems 3, Section 39.2.1 + /* + 1: for d = 1 to log2 n do + 2: for all k in parallel do + 3: if k >= 2^(d-1) then + 4: x[k] = x[k – 2^(d-1)] + x[k] + */ timer().startGpuTimer(); - // TODO + int* INSPECT_TMP = (int*)malloc(n * sizeof(int)); + int* INSPECT_ODATA = (int*)malloc(n * sizeof(int)); + for (int d = 1; d <= ilog2ceil(n); d++) { + std::swap(dev_tmp, dev_odata); +#if _DEBUG + cudaMemcpy(INSPECT_ODATA, dev_odata, ARR_LEN, ::cudaMemcpyDeviceToHost); + cudaMemcpy(INSPECT_TMP, dev_tmp, ARR_LEN, ::cudaMemcpyDeviceToHost); +#endif + kernScanStep<<>>(n, d, dev_odata, dev_tmp); +#if _DEBUG + cudaMemcpy(INSPECT_ODATA, dev_odata, ARR_LEN, ::cudaMemcpyDeviceToHost); + cudaMemcpy(INSPECT_TMP, dev_tmp, ARR_LEN, ::cudaMemcpyDeviceToHost); +#endif + cudaDeviceSynchronize(); + } + std::swap(dev_tmp, dev_odata); + kernInclusiveToExclusive<<>>(n, dev_odata, dev_tmp); + cudaMemset(dev_odata, 0, sizeof(int)); // Set first element to 0 (identity) timer().endGpuTimer(); + + // Copy back to host and free memory + cudaMemcpy(odata, dev_odata, ARR_LEN, ::cudaMemcpyDeviceToHost); + + cudaFree(dev_tmp); + cudaFree(dev_odata); } + + __global__ void kernScanStep(const int N, const int D, int *out, const int* in) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + + if (k >= (int)powf(2, D - 1)) { + out[k] = in[k - (int)powf(2, D - 1)] + in[k]; + } + else { + out[k] = in[k]; + } + } + + __global__ void kernInclusiveToExclusive(const int N, int *out, const int* in) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N - 1) { // Modified condition, we do NOT want the last thread working. 
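                // This thread would write out[k + 1]; for k >= N - 1 that lands past the
                // end of the buffer. Dropping in[N - 1] is intentional: the inclusive
                // total is discarded when shifting to an exclusive scan, and element 0 is
                // set to the identity by the caller, so simply bail out here.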
+ return; + } + + out[k + 1] = in[k]; + } } } diff --git a/Project2-Stream-Compaction/stream_compaction/naive.h b/Project2-Stream-Compaction/stream_compaction/naive.h index 37dcb06..d136397 100644 --- a/Project2-Stream-Compaction/stream_compaction/naive.h +++ b/Project2-Stream-Compaction/stream_compaction/naive.h @@ -7,5 +7,7 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + __global__ void kernScanStep(const int N, const int D, int *odata, const int* idata); + __global__ void kernInclusiveToExclusive(const int N, int *out, const int* in); } } From fce71261e78af215f9fef8a68510e08344d748af Mon Sep 17 00:00:00 2001 From: John Marcao <–jmarcao@users.noreply.github.com> Date: Fri, 13 Sep 2019 16:00:41 -0400 Subject: [PATCH 07/19] Implemented Efficient Scan and Thrust Scan --- .../stream_compaction/efficient.cu | 95 ++++++++++++++++++- .../stream_compaction/efficient.h | 7 ++ .../stream_compaction/thrust.cu | 1 + 3 files changed, 100 insertions(+), 3 deletions(-) diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu index 2db346e..7b1a3fe 100644 --- a/Project2-Stream-Compaction/stream_compaction/efficient.cu +++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu @@ -1,5 +1,7 @@ #include #include +#include +#include #include "common.h" #include "efficient.h" @@ -12,13 +14,76 @@ namespace StreamCompaction { return timer; } + int nextPowerOfTwo(int in) { + int out = 0; + float log = log2(in); + + // If this is true, the number IS a power of 2 + if (ceil(log) == floor(log)) { + out = in; + } + else { + // Not a power of two, grab the next one up. + out = 1; + do { + out = out << 1; + } while (out < in); + } + + return out; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + // Efficient algorithm uses balanded binary trees and two phases: upsweep and downsweep. + // This can be performed inplace. + + // 0) Correct length to be Power of 2 + const int N = nextPowerOfTwo(n); // Returns 'n' if input is already a power of 2. 
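            // Example: n = 100 is padded up to N = 128; the tail [n, N) is zero-filled by
            // the memset below so the up-sweep and down-sweep always walk a complete
            // balanced binary tree of N leaves.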
+ + // 1) Initialize Memory + int* dev_data = 0; + cudaMalloc(&dev_data, N * sizeof(int)); + cudaMemset(dev_data + n, 0, (N - n) * sizeof(int)); + checkCUDAError("CUDA memset 1 failed"); + cudaMemcpy(dev_data, idata, n * sizeof(int), ::cudaMemcpyHostToDevice); + + // 2) Upsweep + timer().startGpuTimer(); + int* INSPECT = (int*)malloc(N * sizeof(int)); + for (int d = 0; d <= ilog2ceil(N) - 1; d++) { + const int NUM_THREADS = N; + const int NUM_BLOCKS = 1; + + // Kernel Call + cudaMemcpy(INSPECT, dev_data, N * sizeof(int), ::cudaMemcpyDeviceToHost); + kernWorkEffScanUpsweep<<>>(n, d, dev_data, dev_data); + cudaMemcpy(INSPECT, dev_data, N * sizeof(int), ::cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + } + + // 3) Downsweep + cudaMemset(dev_data + (N-1), 0, 1*sizeof(int)); + checkCUDAError("CuDA memset 2 failed"); + for (int d = ilog2ceil(N) - 1; d >= 0; d--) { + const int NUM_THREADS = N; + const int NUM_BLOCKS = 1; + + // Kernel Call + cudaMemcpy(INSPECT, dev_data, N * sizeof(int), ::cudaMemcpyDeviceToHost); + kernWorkEffScanDownsweep << > > (n, d, dev_data, dev_data); + cudaMemcpy(INSPECT, dev_data, N * sizeof(int), ::cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + } + + // 4) Cleanup + timer().endGpuTimer(); + cudaMemcpy(odata, dev_data, N * sizeof(int), ::cudaMemcpyDeviceToHost); + cudaFree(dev_data); + + return; } /** @@ -36,5 +101,29 @@ namespace StreamCompaction { timer().endGpuTimer(); return -1; } + + __global__ void kernWorkEffScanUpsweep(const int N, const int D, int *out, const int* in) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + + if (k % (int)powf(2, D + 1) == 0) { + out[k + (int)powf(2, D + 1) - 1] += in[k + (int)powf(2, D) - 1]; + } + } + + __global__ void kernWorkEffScanDownsweep(const int N, const int D, int *out, const int* in) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + + if (k % (int)powf(2, D + 1) == 0) { + int tmp = in[k + (int)powf(2, D) - 1]; + out[k + (int)powf(2, D) - 1] = out[k + (int)powf(2, D + 1) - 1]; + out[k + (int)powf(2, D + 1) - 1] += tmp; + } + } } } diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.h b/Project2-Stream-Compaction/stream_compaction/efficient.h index 803cb4f..acef929 100644 --- a/Project2-Stream-Compaction/stream_compaction/efficient.h +++ b/Project2-Stream-Compaction/stream_compaction/efficient.h @@ -9,5 +9,12 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); + + __global__ void kernWorkEffScanUpsweep(const int N, const int D, int *out, const int* in); + + __global__ void kernWorkEffScanDownsweep(const int N, const int D, int *out, const int* in); + + int nextPowerOfTwo(int in); + } } diff --git a/Project2-Stream-Compaction/stream_compaction/thrust.cu b/Project2-Stream-Compaction/stream_compaction/thrust.cu index 1def45e..642fc92 100644 --- a/Project2-Stream-Compaction/stream_compaction/thrust.cu +++ b/Project2-Stream-Compaction/stream_compaction/thrust.cu @@ -22,6 +22,7 @@ namespace StreamCompaction { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(idata, idata + n * sizeof(int), odata); timer().endGpuTimer(); } } From f636acea9c30e77af613fd685898f3a74d792879 Mon Sep 17 00:00:00 2001 From: John Marcao <–jmarcao@users.noreply.github.com> Date: Fri, 13 Sep 2019 16:03:34 -0400 Subject: [PATCH 08/19] Fixed heap corruption 
bug! --- Project2-Stream-Compaction/stream_compaction/thrust.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project2-Stream-Compaction/stream_compaction/thrust.cu b/Project2-Stream-Compaction/stream_compaction/thrust.cu index 642fc92..a28ccf6 100644 --- a/Project2-Stream-Compaction/stream_compaction/thrust.cu +++ b/Project2-Stream-Compaction/stream_compaction/thrust.cu @@ -22,7 +22,7 @@ namespace StreamCompaction { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - thrust::exclusive_scan(idata, idata + n * sizeof(int), odata); + thrust::exclusive_scan(idata, idata + n, odata); timer().endGpuTimer(); } } From 460b6656a7d97e8dd7169ea043242ee4ad02378a Mon Sep 17 00:00:00 2001 From: John Marcao <–jmarcao@users.noreply.github.com> Date: Fri, 13 Sep 2019 16:55:14 -0400 Subject: [PATCH 09/19] Cleaned up some debug, increased size of arrays to get betetr grip of performance. --- Project2-Stream-Compaction/src/main.cpp | 6 ++--- .../stream_compaction/efficient.cu | 25 +++++-------------- .../stream_compaction/naive.cu | 25 ++++--------------- 3 files changed, 14 insertions(+), 42 deletions(-) diff --git a/Project2-Stream-Compaction/src/main.cpp b/Project2-Stream-Compaction/src/main.cpp index d016553..51ee56b 100644 --- a/Project2-Stream-Compaction/src/main.cpp +++ b/Project2-Stream-Compaction/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 16; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -38,13 +38,13 @@ int main(int argc, char* argv[]) { printDesc("cpu scan, power-of-two"); StreamCompaction::CPU::scan(SIZE, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); + //printArray(SIZE, b, true); zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); + //printArray(NPOT, b, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu index 7b1a3fe..b74e3d5 100644 --- a/Project2-Stream-Compaction/stream_compaction/efficient.cu +++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu @@ -42,40 +42,27 @@ namespace StreamCompaction { // 0) Correct length to be Power of 2 const int N = nextPowerOfTwo(n); // Returns 'n' if input is already a power of 2. + + // TODO: How to best comute Blocks/BlockSize? 
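            // One common answer to the TODO above (a sketch only, not part of this patch):
            // fix a block size and derive the grid with a ceiling division, letting the
            // `if (k >= N) return;` guard in each kernel absorb the overshoot in the last
            // block. The names below are hypothetical and only illustrate the launch math:
            //
            //     const int BLOCK_SIZE = 128;
            //     const int numBlocks  = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;  // ceil(N / BLOCK_SIZE)
            //     kernWorkEffScanUpsweep<<<numBlocks, BLOCK_SIZE>>>(N, d, dev_data, dev_data);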
+ const int NUM_THREADS = n; + const int NUM_BLOCKS = 1; // 1) Initialize Memory int* dev_data = 0; cudaMalloc(&dev_data, N * sizeof(int)); cudaMemset(dev_data + n, 0, (N - n) * sizeof(int)); - checkCUDAError("CUDA memset 1 failed"); cudaMemcpy(dev_data, idata, n * sizeof(int), ::cudaMemcpyHostToDevice); // 2) Upsweep timer().startGpuTimer(); - int* INSPECT = (int*)malloc(N * sizeof(int)); for (int d = 0; d <= ilog2ceil(N) - 1; d++) { - const int NUM_THREADS = N; - const int NUM_BLOCKS = 1; - - // Kernel Call - cudaMemcpy(INSPECT, dev_data, N * sizeof(int), ::cudaMemcpyDeviceToHost); kernWorkEffScanUpsweep<<>>(n, d, dev_data, dev_data); - cudaMemcpy(INSPECT, dev_data, N * sizeof(int), ::cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); } // 3) Downsweep - cudaMemset(dev_data + (N-1), 0, 1*sizeof(int)); - checkCUDAError("CuDA memset 2 failed"); + cudaMemset(dev_data + (N-1), 0, 1*sizeof(int)); // Set last element to 0 for (int d = ilog2ceil(N) - 1; d >= 0; d--) { - const int NUM_THREADS = N; - const int NUM_BLOCKS = 1; - - // Kernel Call - cudaMemcpy(INSPECT, dev_data, N * sizeof(int), ::cudaMemcpyDeviceToHost); - kernWorkEffScanDownsweep << > > (n, d, dev_data, dev_data); - cudaMemcpy(INSPECT, dev_data, N * sizeof(int), ::cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); + kernWorkEffScanDownsweep<<>>(n, d, dev_data, dev_data); } // 4) Cleanup diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu index 283c28d..60a1c61 100644 --- a/Project2-Stream-Compaction/stream_compaction/naive.cu +++ b/Project2-Stream-Compaction/stream_compaction/naive.cu @@ -20,51 +20,36 @@ namespace StreamCompaction { // Allocate buffers on GPU and move data in const size_t ARR_LEN = n * sizeof(int); const int NUM_THREADS = n; - const int NUM_BLOCKS = 1; + const int NUM_BLOCKS = 1; // TODO: How to best comute Blocks/BlockSize? int* dev_odata; int* dev_tmp; + // Allocate our arrays cudaMalloc(&dev_odata, ARR_LEN); cudaMalloc(&dev_tmp, ARR_LEN); // Copy input to odata buffer // After each loop of the algorithm we will swap tmp and odata // So that the final result will always be located in the dev_odata buffer. - cudaMemcpy(dev_odata, idata, ARR_LEN, ::cudaMemcpyHostToDevice); cudaMemcpy(dev_tmp, idata, ARR_LEN, ::cudaMemcpyHostToDevice); // Algorithm adapted from GPU Gems 3, Section 39.2.1 - /* - 1: for d = 1 to log2 n do - 2: for all k in parallel do - 3: if k >= 2^(d-1) then - 4: x[k] = x[k – 2^(d-1)] + x[k] - */ timer().startGpuTimer(); - int* INSPECT_TMP = (int*)malloc(n * sizeof(int)); - int* INSPECT_ODATA = (int*)malloc(n * sizeof(int)); for (int d = 1; d <= ilog2ceil(n); d++) { std::swap(dev_tmp, dev_odata); -#if _DEBUG - cudaMemcpy(INSPECT_ODATA, dev_odata, ARR_LEN, ::cudaMemcpyDeviceToHost); - cudaMemcpy(INSPECT_TMP, dev_tmp, ARR_LEN, ::cudaMemcpyDeviceToHost); -#endif kernScanStep<<>>(n, d, dev_odata, dev_tmp); -#if _DEBUG - cudaMemcpy(INSPECT_ODATA, dev_odata, ARR_LEN, ::cudaMemcpyDeviceToHost); - cudaMemcpy(INSPECT_TMP, dev_tmp, ARR_LEN, ::cudaMemcpyDeviceToHost); -#endif - cudaDeviceSynchronize(); } + + // Algorithm above produced inclusive scan, adjust to exclusive. 
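            // Example (illustrative): the inclusive scan of {1, 2, 3, 4} is {1, 3, 6, 10};
            // shifting it right by one slot and writing the identity (0) into element 0
            // yields the exclusive scan {0, 1, 3, 6}, which is what the kernel and the
            // memset below produce.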
std::swap(dev_tmp, dev_odata); kernInclusiveToExclusive<<>>(n, dev_odata, dev_tmp); cudaMemset(dev_odata, 0, sizeof(int)); // Set first element to 0 (identity) + timer().endGpuTimer(); // Copy back to host and free memory cudaMemcpy(odata, dev_odata, ARR_LEN, ::cudaMemcpyDeviceToHost); - cudaFree(dev_tmp); cudaFree(dev_odata); } From 2d87850c41c3bc07e2c6ea7cc95a61353f155bea Mon Sep 17 00:00:00 2001 From: John Marcao <–jmarcao@users.noreply.github.com> Date: Sat, 14 Sep 2019 11:26:07 -0400 Subject: [PATCH 10/19] Implemented efficient stream compaction --- Project2-Stream-Compaction/src/main.cpp | 2 +- .../stream_compaction/common.cu | 16 ++++- .../stream_compaction/efficient.cu | 60 ++++++++++++++++++- 3 files changed, 73 insertions(+), 5 deletions(-) diff --git a/Project2-Stream-Compaction/src/main.cpp b/Project2-Stream-Compaction/src/main.cpp index 51ee56b..362f4e7 100644 --- a/Project2-Stream-Compaction/src/main.cpp +++ b/Project2-Stream-Compaction/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 16; // feel free to change the size of array +const int SIZE = 1 << 10; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/Project2-Stream-Compaction/stream_compaction/common.cu b/Project2-Stream-Compaction/stream_compaction/common.cu index 2ed6d63..bc0d481 100644 --- a/Project2-Stream-Compaction/stream_compaction/common.cu +++ b/Project2-Stream-Compaction/stream_compaction/common.cu @@ -23,7 +23,12 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= n) { + return; + } + + bools[k] = (idata[k] != 0); } /** @@ -32,7 +37,14 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= n) { + return; + } + + if (bools[k]) { + odata[indices[k]] = idata[k]; + } } } diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu index b74e3d5..f64fb65 100644 --- a/Project2-Stream-Compaction/stream_compaction/efficient.cu +++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu @@ -83,10 +83,66 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + const int NUM_THREADS = n; + const int NUM_BLOCKS = 1; + + // Sim8ilar to CPU implementation, except we use CUDA kernels + // instead of for-loops + + // 0) Correct length to be Power of 2 + const int N = nextPowerOfTwo(n); // Returns 'n' if input is already a power of 2. 
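            // For compaction, the element count also falls out of the scan: with an
            // exclusive scan, count = indices[last] + bools[last]. Assuming the padded
            // tail [n, N) maps to zero flags, bools[N - 1] == 0 whenever n < N, so the
            // single value indices[N - 1] read back at the end gives the compacted count.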
+ + int* INSPECT = (int*)malloc(N * sizeof(int)); + + // Prepare memory + int* dev_odata; + int* dev_idata; + int* dev_map; + int* dev_indicies; + + cudaMalloc(&dev_odata, N * sizeof(int)); + cudaMalloc(&dev_idata, N * sizeof(int)); + cudaMalloc(&dev_map, N * sizeof(int)); + cudaMalloc(&dev_indicies, N * sizeof(int)); + + cudaMemcpy(dev_idata, idata, N * sizeof(int), ::cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + // 1) Map + Common::kernMapToBoolean << > > (N, dev_map, dev_idata); + cudaMemcpy(INSPECT, dev_idata, N * sizeof(int), ::cudaMemcpyDeviceToHost); + cudaMemcpy(INSPECT, dev_map, N * sizeof(int), ::cudaMemcpyDeviceToHost); + + // 2) Scan + // 2a) Upsweep + cudaMemcpy(dev_indicies, dev_map, N * sizeof(int), ::cudaMemcpyDeviceToDevice); + for (int d = 0; d <= ilog2ceil(N) - 1; d++) { + kernWorkEffScanUpsweep << > > (N, d, dev_indicies, dev_indicies); + } + // 2b) Downsweep + cudaMemset(dev_indicies + (N - 1), 0, 1 * sizeof(int)); // Set last element to 0 + for (int d = ilog2ceil(N) - 1; d >= 0; d--) { + kernWorkEffScanDownsweep << > > (N, d, dev_indicies, dev_indicies); + } + cudaMemcpy(INSPECT, dev_indicies, N * sizeof(int), ::cudaMemcpyDeviceToHost); + + // 3) Scatter + Common::kernScatter << > > (N, dev_odata, dev_idata, dev_map, dev_indicies); timer().endGpuTimer(); - return -1; + + // Copy back to host + cudaMemcpy(odata, dev_odata, N * sizeof(int), ::cudaMemcpyDeviceToHost); + + // Get number of elements from indicies + int num_elements = 0; + cudaMemcpy(&num_elements, dev_indicies + N - 1, sizeof(int), ::cudaMemcpyDeviceToHost); + + cudaFree(dev_odata); + cudaFree(dev_idata); + cudaFree(dev_map); + cudaFree(dev_indicies); + + return num_elements; } __global__ void kernWorkEffScanUpsweep(const int N, const int D, int *out, const int* in) { From 5be306052002ef3f89865ad7d8d8c2a4e060dc4c Mon Sep 17 00:00:00 2001 From: John Marcao <–jmarcao@users.noreply.github.com> Date: Mon, 16 Sep 2019 22:13:49 -0400 Subject: [PATCH 11/19] Skeleton of application and how data is loaded and presented. Tested and verified matrix multiplcation via cuBLAS library. --- Project2-Character-Recognition/CMakeLists.txt | 2 + .../character_recognition/CMakeLists.txt | 2 +- .../character_recognition/mlp.cu | 257 +++++++++++++++++- .../character_recognition/mlp.h | 65 +++++ Project2-Character-Recognition/src/main.cpp | 250 ++++++++--------- 5 files changed, 425 insertions(+), 151 deletions(-) diff --git a/Project2-Character-Recognition/CMakeLists.txt b/Project2-Character-Recognition/CMakeLists.txt index 09e9198..8809cc4 100644 --- a/Project2-Character-Recognition/CMakeLists.txt +++ b/Project2-Character-Recognition/CMakeLists.txt @@ -22,6 +22,7 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") endif() include_directories(.) 
+link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64) add_subdirectory(character_recognition) cuda_add_executable(${CMAKE_PROJECT_NAME} @@ -32,4 +33,5 @@ cuda_add_executable(${CMAKE_PROJECT_NAME} target_link_libraries(${CMAKE_PROJECT_NAME} character_recognition ${CORELIBS} + cublas ) diff --git a/Project2-Character-Recognition/character_recognition/CMakeLists.txt b/Project2-Character-Recognition/character_recognition/CMakeLists.txt index 7446175..bfd956f 100644 --- a/Project2-Character-Recognition/character_recognition/CMakeLists.txt +++ b/Project2-Character-Recognition/character_recognition/CMakeLists.txt @@ -7,5 +7,5 @@ set(SOURCE_FILES cuda_add_library(character_recognition ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_70 ) diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index 5a3ed7f..f1d37cb 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -1,5 +1,9 @@ #include #include +#include +#include +#include +#include #include "common.h" #include "mlp.h" @@ -13,15 +17,250 @@ namespace CharacterRecognition { // TODO: __global__ - /** - * Example of use case (follow how you did it in stream compaction) - */ - /*void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - */ + struct MLPData { + int input_len; + int hidden_len; + int out_len; + + int* input_layer; + int* hidden_layer; + int* output_layer; + + int* ih_weights; + int* ho_weights; + }; + + void intializeMLP(int input_len, int hidden_len, int out_len) { + } + + void loadInputMLP(struct CharacterRecognition::MLPData* mlp, int* idata, int len) { + + } + + void stepMLP(struct CharacterRecognition::MLPData* mlp) { + // This MLP flows from input to output with no feedback + // So we will work in steps + // 1) (CPU) Read input data and copy to Device + // 2) (GPU) Each Node of Hidden layer computes its value by sum(input*weight foreach input) and compares it to activation + // 3) (GPU) Each Node of the Output layer computes its value by sum(hidden*weifght foreach hidden) and compares it to activation + // 4) (CPU) Reads output nodes and uses lookup table to get result. 
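            // In matrix form the flow above is (a sketch of intent, not an implementation):
            //   hidden = f(ih_weights * input)     // f = activation, e.g. a sigmoid
            //   output = f(ho_weights * hidden)
            //   result = argmax(output)            // index of the brightest output node
            // Each product is a dense matrix-vector multiply, which is what the cuBLAS
            // sgemm call further down in this file is used for.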
+ } + + void matrixMultiplyExample() { + int input_rows; + int input_cols; + int ih_weight_rows; + int ih_weight_cols; + int hidden_rows; + int hidden_cols; + + // Allocate the matricies + float *input_matrix = (float*)malloc(input_rows * input_cols * sizeof(float)); + float *weight_matrix = (float*)malloc(ih_weight_rows * ih_weight_cols * sizeof(float)); + float *hidden_matrix = (float*)malloc(hidden_rows * hidden_cols * sizeof(float)); + + // Allocate the matricies on the GPU + float* dev_input_matrix; + float* dev_weight_matrix; + float* dev_hidden_matrix; + cudaMalloc(&dev_input_matrix, input_rows * input_cols * sizeof(float)); + cudaMalloc(&dev_weight_matrix, ih_weight_rows * ih_weight_cols * sizeof(float)); + cudaMalloc(&dev_hidden_matrix, hidden_rows * hidden_cols * sizeof(float)); + + // Work work work + + // Free memory + cudaFree(dev_input_matrix); + cudaFree(dev_weight_matrix); + cudaFree(dev_hidden_matrix); + + free(input_matrix); + free(weight_matrix); + free(hidden_matrix); + + return; + } + + void matrixMul(const Matrix* A, const Matrix* B, Matrix* C) { + const float alpha = 1.0f; + const float beta = 0.0f; + + // Create + cublasHandle_t ch; + cublasCreate(&ch); + + // Do a Matrix Multiply + cublasSgemm( + ch, + CUBLAS_OP_N, + CUBLAS_OP_N, + A->rowcnt, + B->colcnt, + A->colcnt, + &alpha, + A->dev_data, + A->rowcnt, + B->dev_data, + B->rowcnt, + &beta, + C->dev_data, + A->rowcnt); + + // Destroy + cublasDestroy(ch); + } + + Matrix::Matrix(int colcnt, int rowcnt) : colcnt(colcnt), rowcnt(rowcnt) + { + this->cpuAlloc(); + for (int i = 0; i < this->getLen(); i++) { + this->cpu_data[i] = 0; + } + + this->devAlloc(); + + } + + Matrix::~Matrix() + { + this->cpuFree(); + this->devFree(); + } + + void Matrix::cpuAlloc() + { + cpu_data = (float*)malloc(rowcnt * colcnt * sizeof(float)); + if (dev_data == NULL) { + throw std::runtime_error("Failed to allocate cpu_data for Matrix!"); + } + } + + void Matrix::devAlloc() + { + cudaMalloc(&dev_data, rowcnt * colcnt * sizeof(float)); + checkCUDAError("Failed to allocate dev_data for Matrix!"); + } + + void Matrix::cpuFree() + { + if (cpu_data) { + free(cpu_data); + } + } + + void Matrix::devFree() + { + if (dev_data) { + cudaFree(dev_data); + } + } + + void Matrix::copyCpuToDev() + { + cudaMemcpy(this->dev_data, this->cpu_data, this->getLen() * sizeof(float), ::cudaMemcpyHostToDevice); + } + + void Matrix::copyDevToCpu() + { + cudaMemcpy(this->cpu_data, this->dev_data, this->getLen() * sizeof(float), ::cudaMemcpyDeviceToHost); + } + + int Matrix::getLen() + { + return rowcnt * colcnt; + } + + ImageFile::ImageFile(std::string filepath) : fd(0) + { + fd = std::fopen(filepath.c_str(), "r"); + } + + ImageFile::~ImageFile() + { + std::fclose(fd); + } + + void ImageFile::readImage(Matrix* m) + { + // Format of Image File + // filename\r\n + // num_pixels\r\n + // pixels_0_255 ... pixels_0_255\r\n // Note leading space and list is space-delimited. 
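            // Hypothetical example of a file body in the layout parsed below (values made up):
            //   3
            //   10201
            //    0 0 255 17 203 ... 0
            // i.e. the expected character index, the pixel count, then that many
            // space-separated 0-255 intensities, which are scaled down below.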
+ + int bytes_read = 0; + + bytes_read += std::fscanf(this->fd, "%i", &this->expected_number); + bytes_read += std::fscanf(this->fd, "%i", &this->pixels); + + for (int i = 0; i < pixels; i++) { + int tmp = 0; + bytes_read += std::fscanf(this->fd, "%i", &tmp); + m->cpu_data[i] = (float)(tmp / 255); + } + + return; + } + + int ImageFile::getExpectedNumber() + { + return this->expected_number; + } + + Perceptron::Perceptron(int pixels, int outputs) : + inputData(pixels, 1), + hiddenLayer(pixels, 1), + outputLayer(outputs, 1), + ihWeights(pixels, pixels), + hoWeights(pixels, outputs) + { + } + + Perceptron::~Perceptron() + { + } + + void Perceptron::randomizeWeights() + { + // kernRandomizeMatrix + } + + void Perceptron::loadBrain(std::string brainfile) + { + // readFile into Matrix + // copy into correct matricies + } + + void Perceptron::saveBrain(std::string brainfile) + { + // Read matricxies + // Output to file as a format to be defined + } + + void Perceptron::loadTrainingDataSet(int expected_result, Matrix * input) + { + // Load data and store the expected result + } + + void Perceptron::train(int iterations) + { + // Run the machine on the data set 'iteratations' times + // Includes backprop + } + + void Perceptron::loadDataSet(Matrix * input) + { + // Load a data set to run + } + + void Perceptron::run() + { + // Run the machine on the data set. + } + + int Perceptron::getLastResult() + { + // Get the result of the last run. + } // TODO: implement required elements for MLP sections 1 and 2 here } diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h index 2096228..83d6bfe 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.h +++ b/Project2-Character-Recognition/character_recognition/mlp.h @@ -5,5 +5,70 @@ namespace CharacterRecognition { Common::PerformanceTimer& timer(); + class Matrix { + public: + Matrix(int colcnt, int rowcnt); + ~Matrix(); + + int colcnt; + int rowcnt; + + float* dev_data; + float* cpu_data; + + void copyCpuToDev(); + void copyDevToCpu(); + + int getLen(); + + private: + // Memory-Management + void cpuAlloc(); + void devAlloc(); + void cpuFree(); + void devFree(); + }; + + class ImageFile { + public: + ImageFile(std::string filepath); + ~ImageFile(); + + // Reads the data into a Matrix object + void readImage(Matrix* m); + + int getExpectedNumber(); + + private: + std::FILE* fd; + + int expected_number; // TODO: This shouldn't be here. Rethink this... 
+ int pixels; + }; + + class Perceptron { + public: + Perceptron(int pixels, int outputs); + ~Perceptron(); + + void randomizeWeights(); + void loadBrain(std::string brainfile); + void saveBrain(std::string brainfile); + + void loadTrainingDataSet(int expected_result, Matrix* input); + void train(int iterations); + + void loadDataSet(Matrix* input); + void run(); + int getLastResult(); + + private: + Matrix inputData; + Matrix hiddenLayer; + Matrix outputLayer; + Matrix ihWeights; + Matrix hoWeights; + }; // TODO: implement required elements for MLP sections 1 and 2 here + void matrixMul(const Matrix* A, const Matrix* B, Matrix* C); } diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index 11dd534..50771da 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -1,152 +1,120 @@ /** - * @file main.cpp - * @brief Stream compaction test program - * @authors Kai Ninomiya - * @date 2015 - * @copyright University of Pennsylvania + * Character Recognition + * John Marcao, CIS565 2019 */ #include +#include +#include +#include #include #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; +using CharacterRecognition::Matrix; +using CharacterRecognition::ImageFile; +using CharacterRecognition::Perceptron; + +constexpr int PIXELS = 10201; +constexpr int OUTPUTS = 52; + +std::vector parseDirectory(const std::string path); +void testMatrixMul(); int main(int argc, char* argv[]) { - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. 
- zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. 
- zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - system("pause"); // stop Win32 console from closing on exit - delete[] a; - delete[] b; - delete[] c; + /**************************** + * TODO: User Input for training/loading/saving + */ + const std::string IMAGE_PATH = "..\\data-set\\*"; + std::vector files = parseDirectory(IMAGE_PATH); + + Perceptron p(PIXELS, OUTPUTS); + + // Begin With Random Values + p.randomizeWeights(); + for (auto &fname : files) { + ImageFile inputFile(fname); + Matrix inputData(PIXELS, 1); + inputFile.readImage(&inputData); + + p.loadTrainingDataSet(inputFile.getExpectedNumber(), &inputData); + p.train(100); + } + + // Now Run against data set + std::vector correct_guesses; + std::vector wrong_guesses; + for (auto &fname : files) { + ImageFile inputFile(fname); + Matrix inputData(PIXELS, 1); + inputFile.readImage(&inputData); + + p.loadDataSet(&inputData); + p.run(); + + if(inputFile.getExpectedNumber() == p.getLastResult()) { + correct_guesses.push_back(fname); + } + else { + wrong_guesses.push_back(fname); + } + } + + // Report Results + int correct = correct_guesses.size(); + int wrong = wrong_guesses.size(); + float accuracy = (float)correct / wrong; + std::cout << "Run complete with accuracy " << accuracy << std::endl; + std::cout << "MLP was wrong about the following files: " << std::endl; + for (auto &f : wrong_guesses) { + std::cout << "\t" << f << std::endl; + } + + return 0; +} + +std::vector parseDirectory(const std::string path) { + std::vector ret; + std::regex fileMatch(".*\.txt$"); + + // Directory walking adapted from https://www.bfilipek.com/2019/04/dir-iterate.html + + WIN32_FIND_DATA FindFileData; + HANDLE hFind = FindFirstFile(path.c_str(), &FindFileData); + if (hFind == INVALID_HANDLE_VALUE) { + throw std::runtime_error("FindFirstFile failed!"); + } + + do { + std::string 
file(FindFileData.cFileName); + if (std::regex_match(file, fileMatch)) { + ret.push_back(file); + } + } while (FindNextFile(hFind, &FindFileData) != 0); + + FindClose(hFind); + + return ret; +} + +void testMatrixMul() { + Matrix m_a(10201, 1); // Input Values + Matrix m_b(10201, 10201); // Weights + Matrix m_c(10201, 1); // Output Values + + // Init matrix + for (int i = 0; i < m_a.getLen(); i++) { + m_a.cpu_data[i] = i; + } + for (int i = 0; i < m_b.getLen(); i++) { + m_b.cpu_data[i] = m_b.getLen() - i; + } + + // Populate Device + m_a.copyCpuToDev(); + m_b.copyCpuToDev(); + + matrixMul(&m_a, &m_b, &m_c); + + m_c.copyDevToCpu(); } From c41e388543a974a26b8e42ae456c56eb02d56ea6 Mon Sep 17 00:00:00 2001 From: John Marcao <–jmarcao@users.noreply.github.com> Date: Tue, 17 Sep 2019 00:21:52 -0400 Subject: [PATCH 12/19] Partial implementation of backprop. Should probably break it down into several smaller kernels and store intermediate values in arrays. --- .../character_recognition/mlp.cu | 229 ++++++++++++------ .../character_recognition/mlp.h | 17 +- Project2-Character-Recognition/src/main.cpp | 2 +- 3 files changed, 170 insertions(+), 78 deletions(-) diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index f1d37cb..f88f4e0 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -1,9 +1,11 @@ #include #include +#include #include #include #include #include +#include #include "common.h" #include "mlp.h" @@ -14,72 +16,6 @@ namespace CharacterRecognition { static PerformanceTimer timer; return timer; } - - // TODO: __global__ - - struct MLPData { - int input_len; - int hidden_len; - int out_len; - - int* input_layer; - int* hidden_layer; - int* output_layer; - - int* ih_weights; - int* ho_weights; - }; - - void intializeMLP(int input_len, int hidden_len, int out_len) { - } - - void loadInputMLP(struct CharacterRecognition::MLPData* mlp, int* idata, int len) { - - } - - void stepMLP(struct CharacterRecognition::MLPData* mlp) { - // This MLP flows from input to output with no feedback - // So we will work in steps - // 1) (CPU) Read input data and copy to Device - // 2) (GPU) Each Node of Hidden layer computes its value by sum(input*weight foreach input) and compares it to activation - // 3) (GPU) Each Node of the Output layer computes its value by sum(hidden*weifght foreach hidden) and compares it to activation - // 4) (CPU) Reads output nodes and uses lookup table to get result. 
- } - - void matrixMultiplyExample() { - int input_rows; - int input_cols; - int ih_weight_rows; - int ih_weight_cols; - int hidden_rows; - int hidden_cols; - - // Allocate the matricies - float *input_matrix = (float*)malloc(input_rows * input_cols * sizeof(float)); - float *weight_matrix = (float*)malloc(ih_weight_rows * ih_weight_cols * sizeof(float)); - float *hidden_matrix = (float*)malloc(hidden_rows * hidden_cols * sizeof(float)); - - // Allocate the matricies on the GPU - float* dev_input_matrix; - float* dev_weight_matrix; - float* dev_hidden_matrix; - cudaMalloc(&dev_input_matrix, input_rows * input_cols * sizeof(float)); - cudaMalloc(&dev_weight_matrix, ih_weight_rows * ih_weight_cols * sizeof(float)); - cudaMalloc(&dev_hidden_matrix, hidden_rows * hidden_cols * sizeof(float)); - - // Work work work - - // Free memory - cudaFree(dev_input_matrix); - cudaFree(dev_weight_matrix); - cudaFree(dev_hidden_matrix); - - free(input_matrix); - free(weight_matrix); - free(hidden_matrix); - - return; - } void matrixMul(const Matrix* A, const Matrix* B, Matrix* C) { const float alpha = 1.0f; @@ -118,7 +54,6 @@ namespace CharacterRecognition { } this->devAlloc(); - } Matrix::~Matrix() @@ -210,8 +145,8 @@ namespace CharacterRecognition { inputData(pixels, 1), hiddenLayer(pixels, 1), outputLayer(outputs, 1), - ihWeights(pixels, pixels), - hoWeights(pixels, outputs) + kjWeights(pixels, pixels), + jiWeights(pixels, outputs) { } @@ -221,7 +156,19 @@ namespace CharacterRecognition { void Perceptron::randomizeWeights() { - // kernRandomizeMatrix + // Create an RNG via curand and then populate the weights array with those numbers. + curandGenerator_t gen; + + // Create and seed generator + curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT); + curandSetPseudoRandomGeneratorSeed(gen, ::time(NULL)); + + // Populate weight matricies + curandGenerateUniform(gen, this->kjWeights.dev_data, this->kjWeights.getLen()); + curandGenerateUniform(gen, this->jiWeights.dev_data, this->jiWeights.getLen()); + + // Cleanup + curandDestroyGenerator(gen); } void Perceptron::loadBrain(std::string brainfile) @@ -241,10 +188,21 @@ namespace CharacterRecognition { // Load data and store the expected result } - void Perceptron::train(int iterations) + void Perceptron::train() { - // Run the machine on the data set 'iteratations' times - // Includes backprop + // Training: We want to run our data, calculate the backprop variables, + // and average the results over many runs. + // So what we do, we run the machine and then add a step to collect information + // about the weights and activations of the machine. These will then be stored off + // in a seperate array. + // After running, the calcualted weights can either be fed back in and re-trained + // or output to a file for the user to recover. + + // 1) Run the Perceptron over the input data + this->run(); + + // 2) Collect backprop information. + this->backprop(); } void Perceptron::loadDataSet(Matrix * input) @@ -255,6 +213,28 @@ namespace CharacterRecognition { void Perceptron::run() { // Run the machine on the data set. + // Step 1) Calculate values of hidden layer. + matrixMul(&inputData, &kjWeights, &hiddenLayer); + + // Step 2) Apply Hidden Layer Bias + // TODO: Would be nice. 
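            // One way the bias TODO above could be filled in (a sketch; `hiddenBias` would
            // be a hypothetical Matrix member shaped like hiddenLayer, and the element-wise
            // add could be a small kernel or a cublasSaxpy call):
            //
            //     addBias(&hiddenLayer, &hiddenBias);   // hidden = kjWeights * input + bias
            //
            // so that the sigmoid in the next step is applied to (weights * input + bias).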
+ + // STEP 3) Apply sigmoid function + sigmoid(&hiddenLayer); + + // Step 4) Hidden layer now populated, get output layer + matrixMul(&hiddenLayer, &jiWeights, &outputLayer); + + // Step 5) Apply sigmoid to output layers + sigmoid(&outputLayer); + + // Setp 6) Store the result, ie the brightest node in the output layer + // Output layer is small, so do this on CPU. + outputLayer.copyDevToCpu(); + result = std::max_element( + outputLayer.cpu_data, + outputLayer.cpu_data + outputLayer.getLen() + ) - outputLayer.cpu_data; } int Perceptron::getLastResult() @@ -262,5 +242,106 @@ namespace CharacterRecognition { // Get the result of the last run. } - // TODO: implement required elements for MLP sections 1 and 2 here + void Perceptron::backprop() + { + // Backprop algoritm runs in two phases. + // From the output, compute the deltas that should be made to the jiWeights + // Then, from there, calculate the deltas that should be applied to the kjWeights + + // 1) jiWeights + + } + + void Perceptron::updateHiddenToOutputWeights() + { + + } + + ////////////////////////////////// + ////////////////////////////////// + // KERNEL OPERATIONS + ////////////////////////////////// + ////////////////////////////////// + + void sigmoid(Matrix* m) { + // TODO: Optimize block utilization + int threads = m->getLen(); + kernSigmoid << <1, threads >> > (m->getLen(), m->dev_data); + } + + __global__ void kernSigmoid(int n, float* data) { + int idx = threadIdx.x + (blockDim.x) * threadIdx.y + (blockDim.y * blockDim.x) * threadIdx.z; + if (idx > n) { + return; + } + + data[n] = devSigmoid(data[n]); + } + + void activation(Matrix* m) { + // TODO: Optimize block utilization + int threads = m->getLen(); + kernActivation << <1, threads >> > (m->getLen(), m->dev_data); + } + + __global__ void kernActivation(int n, float* data) { + int idx = threadIdx.x + (blockDim.x) * threadIdx.y + (blockDim.y * blockDim.x) * threadIdx.z; + if (idx > n) { + return; + } + + data[n] = devInverseSigmoid(data[n]); + } + + __device__ float devSigmoid(float n) { + return 1.0f / (1 + expf(-1.0f * n)); + } + + __device__ float devInverseSigmoid(float n) { + return 1.0f / (1 + expf(n)); + } + + __global__ void kernUpdateHiddenToOutputWeights(int n, const float* jiWeights, const float* outputLayer, const float* hiddenLayer, const float* expectedLayer, float* jiWeightsDelta) { + int tidx = threadIdx.x + (blockDim.x) * threadIdx.y + (blockDim.y * blockDim.x) * threadIdx.z; + int bidx = blockIdx.x + (gridDim.x) * blockIdx.y + (gridDim.y * gridDim.x) * blockIdx.z; + int idx = tidx + bidx * (blockDim.x * blockDim.y * blockDim.z); + if (idx > n) { + return; + } + + // Leverage architecture to our advantage + // Uses I blocks of J threads + int j = tidx; + int i = bidx; + int rowcount = blockDim.x; + + float lambda = 1.0f; // TODO: Make this adjustable, set it to E/10, where E is ??? 
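            // The lines below are modelled on the standard delta rule for the
            // hidden-to-output weights:
            //   omega = expected - actual output        // output error
            //   psi   = omega * f'(theta)               // error scaled by the activation slope
            //   delta_w_ji += lambda * hidden_j * psi
            // with lambda acting as the learning rate.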
+ float theta = -logf((1.0f/hiddenLayer[j]) - 1); + float omega = expectedLayer[i] - outputLayer[i]; + float psi = omega * devInverseSigmoid(theta); + + jiWeightsDelta[j + i * rowcount] += lambda * hiddenLayer[j] * psi; + } + + __global__ void kernUpdateInputToHiddenWeights(int n, const float* kjWeights, const float* hiddenLayer, const float* inputLayer, float* kjWeightsDelta) { + int tidx = threadIdx.x + (blockDim.x) * threadIdx.y + (blockDim.y * blockDim.x) * threadIdx.z; + int bidx = blockIdx.x + (gridDim.x) * blockIdx.y + (gridDim.y * gridDim.x) * blockIdx.z; + int idx = tidx + bidx * (blockDim.x * blockDim.y * blockDim.z); + if (idx > n) { + return; + } + + // Leverage architecture to our advantage + // Uses I blocks of J threads + int k = tidx; + int j = bidx; + int rowcount = blockDim.x; + + //float lambda = 1.0f; // TODO: Make this adjustable, set it to E/10, where E is ??? + //float theta = -logf((1.0f / hiddenLayer[j]) - 1); + //float omega = expectedLayer[i] - outputLayer[i]; + //float psi = omega * devInverseSigmoid(theta); + + //jiWeightsDelta[j + i * rowcount] += lambda * hiddenLayer[j] * psi; + } } diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h index 83d6bfe..b398ce0 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.h +++ b/Project2-Character-Recognition/character_recognition/mlp.h @@ -56,7 +56,7 @@ namespace CharacterRecognition { void saveBrain(std::string brainfile); void loadTrainingDataSet(int expected_result, Matrix* input); - void train(int iterations); + void train(); void loadDataSet(Matrix* input); void run(); @@ -66,8 +66,19 @@ namespace CharacterRecognition { Matrix inputData; Matrix hiddenLayer; Matrix outputLayer; - Matrix ihWeights; - Matrix hoWeights; + Matrix kjWeights; // Input -> Hidden + Matrix jiWeights; // Hidden -> Output + + // Backprop data + Matrix kjWeightsDelta; + Matrix jiWeightsDelta; + + int result; + + void backprop(); + void applyBackprop(); + + void updateHiddenToOutputWeights(); }; // TODO: implement required elements for MLP sections 1 and 2 here void matrixMul(const Matrix* A, const Matrix* B, Matrix* C); diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index 50771da..b168e5b 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -63,7 +63,7 @@ int main(int argc, char* argv[]) { // Report Results int correct = correct_guesses.size(); int wrong = wrong_guesses.size(); - float accuracy = (float)correct / wrong; + float accuracy = (float)correct / (wrong + correct); std::cout << "Run complete with accuracy " << accuracy << std::endl; std::cout << "MLP was wrong about the following files: " << std::endl; for (auto &f : wrong_guesses) { From abe079387e96d61dcdce3a1398626a3402adb109 Mon Sep 17 00:00:00 2001 From: John Marcao <–jmarcao@users.noreply.github.com> Date: Tue, 17 Sep 2019 23:00:02 -0400 Subject: [PATCH 13/19] "Attempt" to work. Backpropogation is not working correctly, so learning is shot. 
--- Project2-Character-Recognition/CMakeLists.txt | 1 + .../character_recognition/mlp.cu | 472 ++++++++++++++---- .../character_recognition/mlp.h | 60 ++- Project2-Character-Recognition/src/main.cpp | 64 +-- 4 files changed, 441 insertions(+), 156 deletions(-) diff --git a/Project2-Character-Recognition/CMakeLists.txt b/Project2-Character-Recognition/CMakeLists.txt index 8809cc4..0d94e63 100644 --- a/Project2-Character-Recognition/CMakeLists.txt +++ b/Project2-Character-Recognition/CMakeLists.txt @@ -34,4 +34,5 @@ target_link_libraries(${CMAKE_PROJECT_NAME} character_recognition ${CORELIBS} cublas + curand ) diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index f88f4e0..2e2e52c 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -9,6 +9,8 @@ #include "common.h" #include "mlp.h" +constexpr int BLOCKSIZE = 1024; + namespace CharacterRecognition { using Common::PerformanceTimer; PerformanceTimer& timer() @@ -17,33 +19,145 @@ namespace CharacterRecognition { return timer; } - void matrixMul(const Matrix* A, const Matrix* B, Matrix* C) { - const float alpha = 1.0f; - const float beta = 0.0f; + void initCublas() + { + cublasStatus_t status; + status = cublasCreate(&ch); + if (status != CUBLAS_STATUS_SUCCESS) { + std::cout << "Failed to initialize cublas: " << status << std::endl; + } + } + + void deleteCublas() + { + cublasStatus_t status; + status = cublasDestroy(ch); + if (status != CUBLAS_STATUS_SUCCESS) { + std::cout << "Failed to destroy cublas: " << status << std::endl; + } + } - // Create - cublasHandle_t ch; - cublasCreate(&ch); + void matrixMul(cublasHandle_t ch, const Matrix* A, const Matrix* B, Matrix* C) { + cublasStatus_t status; + const float alpha = 1.0f; // Factor to multiply A by + const float beta = 0.0f; // Factor to multiply C by prior to result. + + assert(A->colcnt == B->rowcnt); + assert(A->rowcnt == C->rowcnt); + assert(B->colcnt == C->colcnt); // Do a Matrix Multiply - cublasSgemm( + status = cublasSgemm( ch, CUBLAS_OP_N, CUBLAS_OP_N, A->rowcnt, B->colcnt, - A->colcnt, + B->rowcnt, &alpha, A->dev_data, A->rowcnt, B->dev_data, - B->rowcnt, + B->rowcnt, &beta, C->dev_data, A->rowcnt); - // Destroy - cublasDestroy(ch); + if (status != CUBLAS_STATUS_SUCCESS) { + std::cout << "Failed to perform matrix multiply: " << status << std::endl; + } + + cudaDeviceSynchronize(); + checkCUDAError("Failed matrixMul"); + } + + void matrixMul(cublasHandle_t ch, const Matrix* A, cublasOperation_t aop, const Matrix* B, cublasOperation_t bop, Matrix* C) { + const float alpha = 1.0f; // Factor to multiply A by + const float beta = 0.0f; // Factor to multiply C by prior to result. 
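            // cublasSgemm computes C = alpha * op(A) * op(B) + beta * C on column-major
            // data, where op(X) is X or X^T according to the CUBLAS_OP_N / CUBLAS_OP_T
            // flags. The three integer arguments after the flags are m, n, k: op(A) is
            // m x k, op(B) is k x n, C is m x n, and the lda/ldb/ldc arguments are the
            // leading dimensions (stored row counts) of A, B and C respectively.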
+ + //assert(A->colcnt == B->rowcnt); + //assert(A->rowcnt == C->rowcnt); + //assert(B->colcnt == C->colcnt); + + // Do a Matrix Multiply + cublasSgemm( + ch, + aop, + bop, + A->rowcnt, + B->colcnt, + B->rowcnt, + &alpha, + A->dev_data, + A->rowcnt, + B->dev_data, + B->rowcnt, + &beta, + C->dev_data, + A->rowcnt); + + cudaDeviceSynchronize(); + checkCUDAError("Failed matrixMul"); + } + + void matrixSub(cublasHandle_t ch, const Matrix* A, const Matrix* B, Matrix* C) { + const float alpha = 1.0f; // Factor to multiply A by + const float beta = -1.0f; // Factor to multiply B by + + assert(A->colcnt == B->colcnt); + assert(A->rowcnt == B->rowcnt); + assert(A->colcnt == C->colcnt); + assert(A->rowcnt == C->rowcnt); + + // Do a Matrix Subtraction + cublasSgeam( + ch, + CUBLAS_OP_N, + CUBLAS_OP_N, + A->rowcnt, + B->colcnt, + &alpha, + A->dev_data, + A->rowcnt, + &beta, + B->dev_data, + B->rowcnt, + C->dev_data, + A->rowcnt + ); + + cudaDeviceSynchronize(); + checkCUDAError("Failed matrixMul"); + } + + void matrixAdd(cublasHandle_t ch, const Matrix* A, const Matrix* B, Matrix* C) { + const float alpha = 1.0f; // Factor to multiply A by + const float beta = 1.0f; // Factor to multiply B by + + assert(A->colcnt == B->colcnt); + assert(A->rowcnt == B->rowcnt); + assert(A->colcnt == C->colcnt); + assert(A->rowcnt == C->rowcnt); + + // Do a Matrix Subtraction + cublasSgeam( + ch, + CUBLAS_OP_N, + CUBLAS_OP_N, + A->rowcnt, + B->colcnt, + &alpha, + A->dev_data, + A->rowcnt, + &beta, + B->dev_data, + B->rowcnt, + C->dev_data, + A->rowcnt + ); + + cudaDeviceSynchronize(); + checkCUDAError("Failed matrixMul"); } Matrix::Matrix(int colcnt, int rowcnt) : colcnt(colcnt), rowcnt(rowcnt) @@ -54,6 +168,7 @@ namespace CharacterRecognition { } this->devAlloc(); + this->copyCpuToDev(); } Matrix::~Matrix() @@ -87,17 +202,30 @@ namespace CharacterRecognition { { if (dev_data) { cudaFree(dev_data); + checkCUDAError("Failed to free dev_data for Matrix!"); } } void Matrix::copyCpuToDev() { cudaMemcpy(this->dev_data, this->cpu_data, this->getLen() * sizeof(float), ::cudaMemcpyHostToDevice); + checkCUDAError("Failed to memcpy in copyCpuToDev()"); } void Matrix::copyDevToCpu() { cudaMemcpy(this->cpu_data, this->dev_data, this->getLen() * sizeof(float), ::cudaMemcpyDeviceToHost); + checkCUDAError("Failed to memcpy in copyDevToCpu()"); + } + + void Matrix::copyMatrix(Matrix * m) + { + assert(this->colcnt == m->colcnt); + assert(this->rowcnt == m->rowcnt); + + memcpy(this->cpu_data, m->cpu_data, m->getLen() * sizeof(float)); + cudaMemcpy(this->dev_data, m->dev_data, m->getLen() * sizeof(float), ::cudaMemcpyDeviceToDevice); + checkCUDAError("Failed to memcpy in copyMatrix()"); } int Matrix::getLen() @@ -126,11 +254,13 @@ namespace CharacterRecognition { bytes_read += std::fscanf(this->fd, "%i", &this->expected_number); bytes_read += std::fscanf(this->fd, "%i", &this->pixels); + + pixels = std::min(pixels, PIXELS); for (int i = 0; i < pixels; i++) { int tmp = 0; bytes_read += std::fscanf(this->fd, "%i", &tmp); - m->cpu_data[i] = (float)(tmp / 255); + m->cpu_data[i] = ((float)tmp / 255.0f); } return; @@ -142,11 +272,22 @@ namespace CharacterRecognition { } Perceptron::Perceptron(int pixels, int outputs) : - inputData(pixels, 1), - hiddenLayer(pixels, 1), + inputLayer(pixels, 1), + hiddenLayer(pixels / 5, 1), outputLayer(outputs, 1), - kjWeights(pixels, pixels), - jiWeights(pixels, outputs) + expectedLayer(outputs, 1), + kjWeights(pixels / 5, pixels), + jiWeights(outputs, pixels / 5), + kjWeightsDelta(pixels / 5, pixels), + 
jiWeightsDelta(outputs, pixels / 5), + jiOmega(outputs, 1), + jiPsi(outputs, 1), + jiTheta(outputs, 1), + kjTheta(pixels / 5, 1), + kjOmega(pixels / 5, 1), + kjPsi(pixels / 5, 1), + result(0), + tr_runs(0.0f) { } @@ -167,6 +308,9 @@ namespace CharacterRecognition { curandGenerateUniform(gen, this->kjWeights.dev_data, this->kjWeights.getLen()); curandGenerateUniform(gen, this->jiWeights.dev_data, this->jiWeights.getLen()); + // Synchronize + cudaDeviceSynchronize(); + // Cleanup curandDestroyGenerator(gen); } @@ -175,122 +319,243 @@ namespace CharacterRecognition { { // readFile into Matrix // copy into correct matricies + // TODO } void Perceptron::saveBrain(std::string brainfile) { // Read matricxies // Output to file as a format to be defined + // TODO } - void Perceptron::loadTrainingDataSet(int expected_result, Matrix * input) + void Perceptron::loadTrainingDataSet(ImageFile * input) { - // Load data and store the expected result + // Load data + loadDataSet(input); + + // Update expected layer for training + for (int i = 0; i < expectedLayer.getLen(); i++) { + expectedLayer.cpu_data[i] = 0; + } + expectedLayer.cpu_data[input->getExpectedNumber()] = 1.0f; } - void Perceptron::train() + void Perceptron::loadDataSet(ImageFile * input) { - // Training: We want to run our data, calculate the backprop variables, - // and average the results over many runs. - // So what we do, we run the machine and then add a step to collect information - // about the weights and activations of the machine. These will then be stored off - // in a seperate array. - // After running, the calcualted weights can either be fed back in and re-trained - // or output to a file for the user to recover. - - // 1) Run the Perceptron over the input data - this->run(); - - // 2) Collect backprop information. - this->backprop(); + // Load data and store the expected result + input->readImage(&this->inputLayer); + inputLayer.copyCpuToDev(); } - void Perceptron::loadDataSet(Matrix * input) - { - // Load a data set to run + // Mostly for debug + void Perceptron::updateCpu() { + inputLayer.copyDevToCpu(); + hiddenLayer.copyDevToCpu(); + outputLayer.copyDevToCpu(); + kjWeights.copyDevToCpu(); + jiWeights.copyDevToCpu(); + kjWeightsDelta.copyDevToCpu(); + jiWeightsDelta.copyDevToCpu(); } - void Perceptron::run() + void Perceptron::impl_run(bool training) { // Run the machine on the data set. - // Step 1) Calculate values of hidden layer. - matrixMul(&inputData, &kjWeights, &hiddenLayer); - - // Step 2) Apply Hidden Layer Bias - // TODO: Would be nice. - - // STEP 3) Apply sigmoid function - sigmoid(&hiddenLayer); + matrixMul(ch, &inputLayer, &kjWeights, &hiddenLayer); // Step 1) Calculate values of hidden layer. 
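+            // In index form, the forward pass assembled over the next few steps is:
+            //   hidden_j = f( sum_k kjWeights[j][k] * input_k )   with f = ReLU
+            //   output_i = g( sum_j jiWeights[i][j] * hidden_j )  with g = softmax
+            // i.e. each layer is a matrix-vector product followed by an element-wise activation.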
+ if (training) { + kjTheta.copyMatrix(&hiddenLayer); // Step 1.1) Save off hidden layer before sigmoids for backprop + } - // Step 4) Hidden layer now populated, get output layer - matrixMul(&hiddenLayer, &jiWeights, &outputLayer); + reluActivate(&hiddenLayer); // STEP 2) Apply activation function + matrixMul(ch, &hiddenLayer, &jiWeights, &outputLayer); // Step 3) Hidden layer now populated, get output layer + if (training) { + jiTheta.copyMatrix(&outputLayer); // Step 3.1) Save off output layer before sigmoids for backprop + } - // Step 5) Apply sigmoid to output layers - sigmoid(&outputLayer); + softmaxActivate(&outputLayer); // Step 4) Apply activation to output layers - // Setp 6) Store the result, ie the brightest node in the output layer + // Setp 5) Store the result, ie the brightest node in the output layer // Output layer is small, so do this on CPU. outputLayer.copyDevToCpu(); result = std::max_element( outputLayer.cpu_data, outputLayer.cpu_data + outputLayer.getLen() ) - outputLayer.cpu_data; + + // Inc. Run Counter for Backprop + if (training) { + this->tr_runs++; + } + } + + void Perceptron::run() + { + impl_run(false); + } + + void Perceptron::train() + { + impl_run(true); } int Perceptron::getLastResult() { // Get the result of the last run. + return result; } - void Perceptron::backprop() + void Perceptron::updateBackprop() { // Backprop algoritm runs in two phases. // From the output, compute the deltas that should be made to the jiWeights // Then, from there, calculate the deltas that should be applied to the kjWeights - // 1) jiWeights + // 1.0) Calculate delat to ji Weights + // 1.1) Calculate iTheta ... Done during run() + matrixSub(ch, &expectedLayer, &outputLayer, &jiOmega); // 1.2) Calculate iOmega ... Done by subtracting expectedLayer - outputLayer + calcPsi(&jiOmega, &jiTheta, &jiPsi); // 1.3) Calculate iPsi ... a little fancier + calcDeltaChange(0.01f, &hiddenLayer, &jiPsi, &jiWeightsDelta); // 1.4) Lastly, calculate the delta to each weight. 
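+            // Spelled out, the terms above are the usual delta-rule quantities (assuming a
+            // sigmoid-style activation whose derivative is evaluated at theta):
+            //   omega_i  = expected_i - output_i
+            //   psi_i    = omega_i * f'(theta_i)
+            //   dW_ji   += lambda * hidden_j * psi_i
+            // For the earlier (kj) layer the error is first pushed back through the weights:
+            //   omega_j  = sum_i psi_i * jiWeights[i][j]   (hence the transposed multiply below)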
+ // 2.0) Now repeat for the kj Weights + calcOmega(&jiPsi, &jiWeights, &kjOmega); // This omega is done with a special function, unlike subtraction from last layer + calcPsi(&kjOmega, &kjTheta, &kjPsi); + calcDeltaChange(0.01f, &inputLayer, &kjPsi, &kjWeightsDelta); } - void Perceptron::updateHiddenToOutputWeights() + void Perceptron::applyBackprop() { + // Average over the number of runs + float t = 1.0f / this->tr_runs; + this->tr_runs = 0.0f; // Reset - } + // Scaler multiplication functions + cublasSscal(ch, kjWeightsDelta.getLen(), &t, kjWeightsDelta.dev_data, 1); + cudaDeviceSynchronize(); + checkCUDAError("Failed cublasSscal1"); - ////////////////////////////////// - ////////////////////////////////// - // KERNEL OPERATIONS - ////////////////////////////////// - ////////////////////////////////// + cublasSscal(ch, jiWeightsDelta.getLen(), &t, jiWeightsDelta.dev_data, 1); + cudaDeviceSynchronize(); + checkCUDAError("Failed cublasSscal3"); + + // Applying backprop just means adding the deltas to the og matricies + matrixAdd(ch, &kjWeights, &kjWeightsDelta, &kjWeights); + cudaDeviceSynchronize(); + checkCUDAError("Failed matrixAdd1"); + + matrixAdd(ch, &jiWeights, &jiWeightsDelta, &jiWeights); + cudaDeviceSynchronize(); + checkCUDAError("Failed matrixAdd2"); + } - void sigmoid(Matrix* m) { - // TODO: Optimize block utilization - int threads = m->getLen(); - kernSigmoid << <1, threads >> > (m->getLen(), m->dev_data); + __global__ void kernCalcPsi(int n, float * omega, float * theta, float * psi) { + int idx = getGlobalIdx_3D_3D(); + if (idx < n) { + psi[idx] = omega[idx] + devInverseSigmoid(theta[idx]); + } } - __global__ void kernSigmoid(int n, float* data) { - int idx = threadIdx.x + (blockDim.x) * threadIdx.y + (blockDim.y * blockDim.x) * threadIdx.z; - if (idx > n) { - return; + void Perceptron::calcPsi(const Matrix * omega, const Matrix * theta, Matrix * psi) + { + // Call a kernel to handle this. 
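+            // Element-wise, psi should be the product of omega with the activation derivative
+            // at theta; for a logistic unit that derivative is sigma(theta) * (1 - sigma(theta)).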
+ assert(omega->colcnt == theta->colcnt); + assert(omega->rowcnt == theta->rowcnt); + assert(omega->colcnt == psi->colcnt); + assert(omega->rowcnt == psi->rowcnt); + assert(omega->rowcnt == 1); + + int n = omega->colcnt; + kernCalcPsi<<<1, n>>>(n, omega->dev_data, theta->dev_data, psi->dev_data); + cudaDeviceSynchronize(); + checkCUDAError("Failed calcPsi"); + } + + __global__ void kernCalcDeltaChange(int n, float lambda, float * layer, float * psi, float * deltaOut) { + int gloablId = getGlobalIdx_3D_3D(); + int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z; + int threadId = (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x; + + if (gloablId < n) { + deltaOut[gloablId] += lambda * layer[threadId] * psi[blockId]; } + } - data[n] = devSigmoid(data[n]); + void Perceptron::calcDeltaChange(const float lambda, const Matrix * leftLayer, const Matrix * psi, Matrix * weightDelta) + { + // Call a kernel to handle this + int blocks = weightDelta->rowcnt; + int threadsperblock = weightDelta->colcnt; + int totalthreads = blocks * threadsperblock; + + kernCalcDeltaChange<<>>(totalthreads, 0.01f, leftLayer->dev_data, psi->dev_data, weightDelta->dev_data); + cudaDeviceSynchronize(); + checkCUDAError("Failed calcDeltaChange"); } - void activation(Matrix* m) { - // TODO: Optimize block utilization - int threads = m->getLen(); - kernActivation << <1, threads >> > (m->getLen(), m->dev_data); + void Perceptron::calcOmega(const Matrix * psi, const Matrix * weights, Matrix * omega) + { + // Transpose matrix since we are multiplying the other way + matrixMul(ch, psi, CUBLAS_OP_N, weights, CUBLAS_OP_T,omega); + checkCUDAError("Failed calcOmega"); } - __global__ void kernActivation(int n, float* data) { - int idx = threadIdx.x + (blockDim.x) * threadIdx.y + (blockDim.y * blockDim.x) * threadIdx.z; - if (idx > n) { - return; + void reluActivate(Matrix * m) + { + dim3 fullBlocksPerGrid((m->getLen() + BLOCKSIZE - 1) / BLOCKSIZE); + kernReluActivate << > > (m->getLen(), m->dev_data, m->dev_data); + checkCUDAError("Failed reluActivate"); + } + // Softmax involves exponentiating each value, summing them, and then dividing so that the sum of the vector is 1. + void softmaxActivate(Matrix * m) { + cublasStatus_t status; + dim3 fullBlocksPerGrid((m->getLen() + BLOCKSIZE - 1) / BLOCKSIZE); + float expSum = 0.0f; + float invExpSum = 0.0f; + + // Prescale down + // TODO: For some reason my values are huge at this stage (10,000~) and exponentiating them + // is impossible. I scale everything down to make the system at least WORK. 
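+        // A scale-free alternative is the usual "max trick":
+        //   softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x))
+        // which is mathematically identical but keeps every exponent <= 0, so nothing can
+        // overflow. The max of this small output vector could be found with a tiny reduction
+        // (or on the CPU) and subtracted before kernExponentiate below.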
+ invExpSum = 0.0001f; + status = cublasSscal(ch, m->getLen(), &invExpSum, m->dev_data, 1); + if (status != CUBLAS_STATUS_SUCCESS) { + std::cout << "Failed cublasSscal: " << status << std::endl; + } + + // Exponentiate + kernExponentiate << > > (m->getLen(), m->dev_data, m->dev_data); + checkCUDAError("Failed kernExponentiate"); + + // Sum + status = cublasSasum(ch, m->getLen(), m->dev_data, 1, &expSum); + if (status != CUBLAS_STATUS_SUCCESS) { + std::cout << "Failed cublasSasum: " << status << std::endl; } - data[n] = devInverseSigmoid(data[n]); + // Normalize + invExpSum = 1.0f / expSum; + status = cublasSscal(ch, m->getLen(), &invExpSum, m->dev_data, 1); + if (status != CUBLAS_STATUS_SUCCESS) { + std::cout << "Failed cublasSscal: " << status << std::endl; + } + } + + ////////////////////////////////// + ////////////////////////////////// + // KERNEL OPERATIONS + ////////////////////////////////// + ////////////////////////////////// + + __device__ int getGlobalIdx_3D_3D() { + int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z; + int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x; + return threadId; + } + + __global__ void kernReluActivate(int n, float* in, float* out) { + int idx = getGlobalIdx_3D_3D(); + if (idx < n) { + out[idx] = fmaxf(0.0f, in[idx]); + } } __device__ float devSigmoid(float n) { @@ -301,47 +566,32 @@ namespace CharacterRecognition { return 1.0f / (1 + expf(n)); } - __global__ void kernUpdateHiddenToOutputWeights(int n, const float* jiWeights, const float* outputLayer, const float* hiddenLayer, const float* expectedLayer, float* jiWeightsDelta) { - int tidx = threadIdx.x + (blockDim.x) * threadIdx.y + (blockDim.y * blockDim.x) * threadIdx.z; - int bidx = blockIdx.x + (gridDim.x) * blockIdx.y + (gridDim.y * gridDim.x) * blockIdx.z; - int idx = tidx + bidx * (blockDim.x * blockDim.y * blockDim.z); - if (idx > n) { - return; + __global__ void kernExponentiate(int n, float* in, float* out) { + int idx = getGlobalIdx_3D_3D(); + if (idx < n) { + out[idx] = expf(in[idx]); } - - // Leverage architecture to our advantage - // Uses I blocks of J threads - int j = tidx; - int i = bidx; - int rowcount = blockDim.x; - - float lambda = 1.0f; // TODO: Make this adjustable, set it to E/10, where E is ??? 
- float theta = -logf((1.0f/hiddenLayer[j]) - 1); - float omega = expectedLayer[i] - outputLayer[i]; - float psi = omega * devInverseSigmoid(theta); - - jiWeightsDelta[j + i * rowcount] += lambda * hiddenLayer[j] * psi; } - __global__ void kernUpdateInputToHiddenWeights(int n, const float* kjWeights, const float* hiddenLayer, const float* inputLayer, float* kjWeightsDelta) { - int tidx = threadIdx.x + (blockDim.x) * threadIdx.y + (blockDim.y * blockDim.x) * threadIdx.z; - int bidx = blockIdx.x + (gridDim.x) * blockIdx.y + (gridDim.y * gridDim.x) * blockIdx.z; - int idx = tidx + bidx * (blockDim.x * blockDim.y * blockDim.z); - if (idx > n) { - return; + void testMatrixMul() { + Matrix m_a(4, 1); // Input Values + Matrix m_b(2, 4); // Weights + Matrix m_c(2, 1); // Output Values + + // Init matrix + for (int i = 0; i < m_a.getLen(); i++) { + m_a.cpu_data[i] = i + 1; + } + for (int i = 0; i < m_b.getLen(); i++) { + m_b.cpu_data[i] = 2; } - // Leverage architecture to our advantage - // Uses I blocks of J threads - int k = tidx; - int j = bidx; - int rowcount = blockDim.x; + // Populate Device + m_a.copyCpuToDev(); + m_b.copyCpuToDev(); - //float lambda = 1.0f; // TODO: Make this adjustable, set it to E/10, where E is ??? - //float theta = -logf((1.0f / hiddenLayer[j]) - 1); - //float omega = expectedLayer[i] - outputLayer[i]; - //float psi = omega * devInverseSigmoid(theta); + matrixMul(ch, &m_a, &m_b, &m_c); - //jiWeightsDelta[j + i * rowcount] += lambda * hiddenLayer[j] * psi; + m_c.copyDevToCpu(); } } diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h index b398ce0..b6ba0cf 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.h +++ b/Project2-Character-Recognition/character_recognition/mlp.h @@ -1,6 +1,11 @@ #pragma once #include "common.h" +#include + +constexpr int PIXEL_RES = 2; +constexpr int PIXELS = 10201 / PIXEL_RES; +constexpr int OUTPUTS = 52; namespace CharacterRecognition { Common::PerformanceTimer& timer(); @@ -19,6 +24,8 @@ namespace CharacterRecognition { void copyCpuToDev(); void copyDevToCpu(); + void copyMatrix(Matrix* m); + int getLen(); private: @@ -55,17 +62,24 @@ namespace CharacterRecognition { void loadBrain(std::string brainfile); void saveBrain(std::string brainfile); - void loadTrainingDataSet(int expected_result, Matrix* input); - void train(); + void loadTrainingDataSet(ImageFile * input); + void loadDataSet(ImageFile * input); - void loadDataSet(Matrix* input); void run(); + void train(); + int getLastResult(); + void updateBackprop(); + void applyBackprop(); + + void updateCpu(); // For debugging + private: - Matrix inputData; + Matrix inputLayer; Matrix hiddenLayer; Matrix outputLayer; + Matrix expectedLayer; Matrix kjWeights; // Input -> Hidden Matrix jiWeights; // Hidden -> Output @@ -73,13 +87,43 @@ namespace CharacterRecognition { Matrix kjWeightsDelta; Matrix jiWeightsDelta; + Matrix jiOmega; + Matrix jiPsi; + Matrix jiTheta; + + Matrix kjTheta; + Matrix kjOmega; + Matrix kjPsi; + int result; + float tr_runs; - void backprop(); - void applyBackprop(); + void impl_run(bool training); - void updateHiddenToOutputWeights(); + void calcPsi(const Matrix* omega, const Matrix* theta, Matrix* psi); + void calcDeltaChange(const float lambda, const Matrix* leftLayer, const Matrix* psi, Matrix* weightDelta); + void calcOmega(const Matrix* psi, const Matrix* weights, Matrix* omega); }; + + static cublasHandle_t ch; + + void initCublas(); + void deleteCublas(); + // 
TODO: implement required elements for MLP sections 1 and 2 here - void matrixMul(const Matrix* A, const Matrix* B, Matrix* C); + void matrixMul(cublasHandle_t ch, const Matrix* A, const Matrix* B, Matrix* C); + + void sigmoid(Matrix* m); + void reluActivate(Matrix* m); + void softmaxActivate(Matrix* m); + + __global__ void kernSigmoid(int n, float* data); + void activation(Matrix* m); + __global__ void kernActivation(int n, float* data); + __device__ float devSigmoid(float n); + __device__ float devInverseSigmoid(float n); + __device__ int getGlobalIdx_3D_3D(); + + void testMatrixMul(); + } diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index b168e5b..b7522cd 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -14,42 +14,53 @@ using CharacterRecognition::Matrix; using CharacterRecognition::ImageFile; using CharacterRecognition::Perceptron; - -constexpr int PIXELS = 10201; -constexpr int OUTPUTS = 52; +using CharacterRecognition::initCublas; +using CharacterRecognition::deleteCublas; std::vector parseDirectory(const std::string path); void testMatrixMul(); + int main(int argc, char* argv[]) { /**************************** * TODO: User Input for training/loading/saving */ - const std::string IMAGE_PATH = "..\\data-set\\*"; - std::vector files = parseDirectory(IMAGE_PATH); + const std::string IMAGE_PATH = "..\\data-set\\"; + const std::string IMAGE_SEARCH_PATH = IMAGE_PATH + "*"; + std::vector files = parseDirectory(IMAGE_SEARCH_PATH); + initCublas(); + + CharacterRecognition::testMatrixMul(); Perceptron p(PIXELS, OUTPUTS); // Begin With Random Values p.randomizeWeights(); - for (auto &fname : files) { - ImageFile inputFile(fname); - Matrix inputData(PIXELS, 1); - inputFile.readImage(&inputData); + p.updateCpu(); - p.loadTrainingDataSet(inputFile.getExpectedNumber(), &inputData); - p.train(100); + // Load files and train on those files + for (int i = 0; i < 10; i++) { + for (auto &fname : files) { + ImageFile inputFile(IMAGE_PATH + fname); + + p.loadTrainingDataSet(&inputFile); + p.train(); + p.updateBackprop(); + p.updateCpu(); + } + p.applyBackprop(); + p.updateCpu(); } + p.updateCpu(); + // Now Run against data set std::vector correct_guesses; std::vector wrong_guesses; for (auto &fname : files) { - ImageFile inputFile(fname); - Matrix inputData(PIXELS, 1); - inputFile.readImage(&inputData); + ImageFile inputFile(IMAGE_PATH + fname); - p.loadDataSet(&inputData); + p.loadDataSet(&inputFile); p.run(); if(inputFile.getExpectedNumber() == p.getLastResult()) { @@ -70,6 +81,7 @@ int main(int argc, char* argv[]) { std::cout << "\t" << f << std::endl; } + deleteCublas(); return 0; } @@ -96,25 +108,3 @@ std::vector parseDirectory(const std::string path) { return ret; } - -void testMatrixMul() { - Matrix m_a(10201, 1); // Input Values - Matrix m_b(10201, 10201); // Weights - Matrix m_c(10201, 1); // Output Values - - // Init matrix - for (int i = 0; i < m_a.getLen(); i++) { - m_a.cpu_data[i] = i; - } - for (int i = 0; i < m_b.getLen(); i++) { - m_b.cpu_data[i] = m_b.getLen() - i; - } - - // Populate Device - m_a.copyCpuToDev(); - m_b.copyCpuToDev(); - - matrixMul(&m_a, &m_b, &m_c); - - m_c.copyDevToCpu(); -} From 05dddbb95fcf6a7bad4c7acfc88dbb6b444d0774 Mon Sep 17 00:00:00 2001 From: John Marcao <–jmarcao@users.noreply.github.com> Date: Tue, 17 Sep 2019 23:38:31 -0400 Subject: [PATCH 14/19] Some cleanup and build fixes. Results from MLP still HUGE. 
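applyBackprop below is reworked around cublasSaxpy, which computes y <- alpha*x + y in place; with alpha = 1/tr_runs the per-run averaging and the weight update collapse into a single call per weight matrix. A minimal self-contained sketch of that pattern follows; the array contents and sizes are illustrative only:

#include <cstdio>
#include <cublas_v2.h>
#include <cuda_runtime.h>

int main() {
    const int n = 4;
    float weights[n] = { 0.5f, 0.5f,  0.5f, 0.5f };
    float delta[n]   = { 4.0f, 8.0f, -4.0f, 0.0f };   // deltas accumulated over 4 training runs
    float *dW, *dD;
    cudaMalloc(&dW, sizeof(weights)); cudaMemcpy(dW, weights, sizeof(weights), cudaMemcpyHostToDevice);
    cudaMalloc(&dD, sizeof(delta));   cudaMemcpy(dD, delta,   sizeof(delta),   cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    float alpha = 1.0f / 4.0f;                        // average over the 4 runs
    cublasSaxpy(handle, n, &alpha, dD, 1, dW, 1);     // weights += (1/runs) * delta

    cudaMemcpy(weights, dW, sizeof(weights), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++) printf("%f\n", weights[i]);   // 1.5 2.5 -0.5 0.5
    cublasDestroy(handle);
    cudaFree(dW); cudaFree(dD);
    return 0;
}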
--- .../character_recognition/mlp.cu | 27 ++++++------------- .../character_recognition/mlp.h | 5 ++-- 2 files changed, 10 insertions(+), 22 deletions(-) diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index 2e2e52c..80f30b8 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -127,7 +127,7 @@ namespace CharacterRecognition { ); cudaDeviceSynchronize(); - checkCUDAError("Failed matrixMul"); + checkCUDAError("Failed matrixSub"); } void matrixAdd(cublasHandle_t ch, const Matrix* A, const Matrix* B, Matrix* C) { @@ -157,7 +157,7 @@ namespace CharacterRecognition { ); cudaDeviceSynchronize(); - checkCUDAError("Failed matrixMul"); + checkCUDAError("Failed matrixAdd"); } Matrix::Matrix(int colcnt, int rowcnt) : colcnt(colcnt), rowcnt(rowcnt) @@ -373,6 +373,8 @@ namespace CharacterRecognition { jiTheta.copyMatrix(&outputLayer); // Step 3.1) Save off output layer before sigmoids for backprop } + outputLayer.copyDevToCpu(); + softmaxActivate(&outputLayer); // Step 4) Apply activation to output layers // Setp 5) Store the result, ie the brightest node in the output layer @@ -421,6 +423,8 @@ namespace CharacterRecognition { calcOmega(&jiPsi, &jiWeights, &kjOmega); // This omega is done with a special function, unlike subtraction from last layer calcPsi(&kjOmega, &kjTheta, &kjPsi); calcDeltaChange(0.01f, &inputLayer, &kjPsi, &kjWeightsDelta); + + // Old way did not work, lets try from scratch... } void Perceptron::applyBackprop() @@ -429,23 +433,8 @@ namespace CharacterRecognition { float t = 1.0f / this->tr_runs; this->tr_runs = 0.0f; // Reset - // Scaler multiplication functions - cublasSscal(ch, kjWeightsDelta.getLen(), &t, kjWeightsDelta.dev_data, 1); - cudaDeviceSynchronize(); - checkCUDAError("Failed cublasSscal1"); - - cublasSscal(ch, jiWeightsDelta.getLen(), &t, jiWeightsDelta.dev_data, 1); - cudaDeviceSynchronize(); - checkCUDAError("Failed cublasSscal3"); - - // Applying backprop just means adding the deltas to the og matricies - matrixAdd(ch, &kjWeights, &kjWeightsDelta, &kjWeights); - cudaDeviceSynchronize(); - checkCUDAError("Failed matrixAdd1"); - - matrixAdd(ch, &jiWeights, &jiWeightsDelta, &jiWeights); - cudaDeviceSynchronize(); - checkCUDAError("Failed matrixAdd2"); + cublasSaxpy(ch, kjWeights.getLen(), &t, kjWeightsDelta.dev_data, 1, kjWeights.dev_data, 1); + cublasSaxpy(ch, jiWeights.getLen(), &t, jiWeightsDelta.dev_data, 1, jiWeights.dev_data, 1); } __global__ void kernCalcPsi(int n, float * omega, float * theta, float * psi) { diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h index b6ba0cf..87091f9 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.h +++ b/Project2-Character-Recognition/character_recognition/mlp.h @@ -117,12 +117,11 @@ namespace CharacterRecognition { void reluActivate(Matrix* m); void softmaxActivate(Matrix* m); - __global__ void kernSigmoid(int n, float* data); - void activation(Matrix* m); - __global__ void kernActivation(int n, float* data); __device__ float devSigmoid(float n); __device__ float devInverseSigmoid(float n); __device__ int getGlobalIdx_3D_3D(); + __global__ void kernReluActivate(int n, float* in, float* out); + __global__ void kernExponentiate(int n, float* in, float* out); void testMatrixMul(); From f813c196a2453ac7b99779842ae683e7101860cb Mon Sep 17 00:00:00 2001 From: 
John Marcao <jmarcao@users.noreply.github.com>
Date: Wed, 18 Sep 2019 00:48:09 -0400
Subject: [PATCH 15/19] README

---
 .../img/compact_cmp.png                      | Bin 0 -> 23819 bytes
 Project2-Stream-Compaction/img/scan_cmp.png  | Bin 0 -> 17432 bytes
 .../img/scan_cmp_ohne_cpu.png                | Bin 0 -> 32171 bytes
 .../proj2_compact_data.xlsx                  | Bin 0 -> 11778 bytes
 README.md                                    | 93 ++++++++++++++++--
 5 files changed, 86 insertions(+), 7 deletions(-)
 create mode 100644 Project2-Stream-Compaction/img/compact_cmp.png
 create mode 100644 Project2-Stream-Compaction/img/scan_cmp.png
 create mode 100644 Project2-Stream-Compaction/img/scan_cmp_ohne_cpu.png
 create mode 100644 Project2-Stream-Compaction/proj2_compact_data.xlsx

diff --git a/Project2-Stream-Compaction/img/compact_cmp.png b/Project2-Stream-Compaction/img/compact_cmp.png
new file mode 100644
index 0000000000000000000000000000000000000000..beb85112f6ea22d6d3614a5233da2c00283128f0
GIT binary patch
literal 23819
[base85-encoded binary patch data for the new image and spreadsheet files omitted]
z@A=sSt((KS-!>OtzI{{7H@rT~Y0*>GHpXX0ERJ1DiaKtrIQY`XR#3ebBxo0qk~~K6 zvKY{ow^4?pu=H*)i`4gVioaaX{^UlX<-A(i@3&gyW&$P3m|M!+X1>9I&KZ_+D2Efd zs-cq?%XBO@{zLClobN$@`rE=|jja72PfNPFLIG-s@FK?s6Aq?rjy2Is&r=D?(>^nn z46N}O-Wu5Iv`qlMcw28cP=$`w%rK{-+wuOAMU<3oWHI|% zF*z@*EOuxBX3p38%42B}<*xZamIHlLZb|-Ur9r#T1IRUW)@{}o2=Zz&$mECsbF;<< z?<5IX1O z3%x?Jf;&E6%opQ=PomQ8uVZ+x`V3OjE}#o4!&A7cV0q+_+W<{38-zkqy1fp!0wkh064>fgCxjrv?1XZ|rU*g+YUJc0 z{=Va|*w(1rGkfAu&rX{m&60h(NZ&?)?eHP{6ad7-29rJ!c(x<;6d*(SJjA4_IlDKnIfU14ZSE_c z-3qW)nGD~5?n9QH@Kp#?P>kCgP7Y8SVeu^q3y^e@$gSjVS~7 z?&Pff$vOpP_t&9o8E$6u`3gip{O{#nAx7b}C7UYgV}C~oTAYElf$r?p9~XQi&+_Nb z2WY;d7pzkMKqO987za`)Yw>r3s%(L!Ifu` zw!VEHiPsjx?_SO|cvK{k9o1~Tkdo)p(!Z{n!8;61V>Q%2L zDmnvzWG%}-8A9lLSSCLD``b%OAWeB%G58)rFKA?rT=n`fMT0xGm><77mDXUH&SjtO z&)oo-Y}kT$rIwAVb;E!w3~lEuWi-c%s;4R zc&>5nG9Gc^Ucc9=6S@nm<-Bd1cgVfan08uB=$#PUp0FpjLbOS6>1UU0 z#eevZx!=z-t#z)p=<;v&0oA&CnJ(Ru+R*r|*V;2Dg7BMCttzwLND?rSELfOzJrA=* znWP)2a$e725|TxaNZuaAU$da8d#L9Lm=s@+v+<7iND%G-Laa zlVjy{#3_UKDd978EE$R`@ux5IBQ*<=*1+qp0l*7;-z^4Ju@NIVCaql^x){cxK5OOx zG$I=r1%*+sQFQ=@LASm_m_7ewv)Zvnu^K?>4`yxFzy-|s$VAL&e5A?1Yv)Psp5{DI{aIg}Kb=~#^E%Q$wh!nwa7?2`#BP6YtWc}i z(gu_vtm}rCRBxEa#8*P{r}uq^%t7z5RnWr>zPPyf*EQQJv%e{mLQtj2%`RoQ^0*Fw zd-)B=F}%5=HcQ0MenTlNp-zKHo-^F??H7h}Cz`kZlcChy>;m=bl^b%)mp`jEE_&zr zFIK_BBoxK1!vrw9rOq3RK*PU;AokEVwsO0`E3^$YK^UnCbZE*LK* zp~x`GOmm)yOG3e)SWI`YS6(JTBp%w?NM<#B?aN0*SPmOAJC33=zYrd8J10Y;zi_ETP?+T*XT4x6#l(kK7R1rE@Bj}xiLd&pZ z0sDlVmv8lt@~y)wcEuo`DO7}JUMU)frJ!k`3K<=Zu2o1-9Za9dVso`NN`k72!l&-522(Q+iG;TZxgnuuiZvgLRE=|0qg&~WY3d6Anr ztA`menn?Tn?BC4!3K?8Fd`Ne4^gH#AHQ%ia*^qFihz%&SsDG`Ev@jdwBRQU+tnY_* z97iS6Ap(h%_31|r02@*32Pr~?;UBI46e%Y(R8`UJKM$&71;8r z&uqQgOmuItW0hOO|1e|nLQb9lAHCEC2MaUr%#ejobx41@3DTb?-5AZ>E}jxP!j5wt5yaT< z$m_PmI#?)~ZDTqG>il#$=RtK(+&{t@HPqu>4>LgNQ?apHD7S#>%%EH1xO=wvsK%a1 z`j1Qpql#-1+qbmJ$5BUS+tC@fW)|P}`xy_6?J*NeH0Yfs$C^=1h}eg&0zjur9JaN= z&KoED^l6N}R-O+a2D&6=n!*cqIWpX8P1=+9g5HDceOA7S#-+fcp#71zl8 zGi}WLoV@93+1CVkWIZLON$}qgO2lG2tcgyJZ!9a4vsBrj(W-sH+A+ax;ejzf=>dDi zf+uUS1p@yh#6>|9VjFYy%doqSsOgu>u~j4@7Uy{+1A?}mYxcambCNVPfl6V66F(Kr zd7rYhQ4<=I^%8Fv3Zy)5#qMXcT|Odjl$FPqVCo7#T3X}cU6YTy&9GT*@Gip)6wyRy=BzDIEXj9B+%JhwB?#j{+*BILOl zGnnS#gZ@t8uNQ-eF<*~TY9t+JfjiI}@yublCV^83z)G>1w&g$0>Eb#fd3VGhhBE2( zvCT~jH3*di(smM2V9xub6bFrFsmA467h%VZUV0h}f}OHAm;mfy5{ors0TJd8y;c;m>i(OgBmO5XiI3Z)wP8{u#C!sLw$XHl2~6a)(#COrlu0c zBJok9$)fknj69FF={II%)2@FE2lmFB8#G>3+Vpq2McU_}d0dhT)NbI7luVO*2-AU> z`6`o41aCTP>SVnx_X<~Z`5)NDOH%cotXJwE2zt#!;XYpL#HoL}SIvBGAMt4zJxfd@ zxaowgvVCX=3eJZ|o0aEbw2XF9Zr8uUsO+&pwesv>1nX=8GC2O1s1w^k5ciC*{*qX+Mm=aEq%I~!FgnJUrdP@t3t z_#QIlqiIyV4V_+YgM$!-{CpZDzu*}-3+ATSu9VO`3$*tFTElp~HRumD=VNs%RdH(; z1Mjxq*I!l6}Q~G8fdly)!|2xlGg` z87cgA;9Gw?rbBbsdQl~=A3W!|V@`(EcHQY-0PTjUMCyc*KRh*7eDE9-Uxf)aIeLI* zb@J&KiyK~h5OTb;fNhfcP2lI|u#(8p%#Gy=JthcQ_Ub^h9E0 z>a1e+m$Q8n%vZfkq+*w6ibVQ=jop0NFJb$Mkc0M!@UOzQkn> zzZ_hqtRTV|(8c{k=RihA=E)^ry;h^y8hqEd)=`#pWPrl8y}6X}{i%DoX1;hNw^L`* z>8PbWh@_|4W2Z*wY0O{9-AktSV$diW^gxd_mzsg%?SQBuYE~@sTKbAQ)xz@ zHasG_S8g@Gsqu6Zs)&4=4o|zfY7bCb`vaHNQM{Yf z1XY%C{5%8U(*dCR%+CP(U--Gl4HAkd@h0ZNIf~F&xmzEUa7{B{8+)!aG>QHVpk2c8 z1r23nzPaxT0*cvgFR6jsgWS!^g(~jFVDQZQZDOD=m$b56yu>g%o~{1tx}D1n!Ju(3 zZBhe0u|Ky{h@0~?LSw$##*AnDG-Tme3?Z{-$QSuXHPj@S(=SPN7r$B;VWG|Ys`+~p~w zO=4b))r*7JzB_rkhuK#86-mT1W3#}Ii9m&V+P?J`=u8YV`;O`dm=En0&P3Syl!dN9 z*+=7qo(P5p4uPAHymC-l8n~+EtQtAm#0Wr-^sew8#KmSgOSq584dNP;jP@4 z%ki)_F19Dios7q$>dnPjlLLoU>O#)fQwyZ(_6o8J38jbeogl@nmbtVoFMF%abKV9ckmz$b2Fe1 z@}bCm7&}3gyUlTQ9AvGP{p9?3xQP*h{8WPOS_vJaE>7HBMQ7C515k1nK`36?C8Ol`7L5(a} zqqghza_JU%%tB(fu`-s_J@~6Dd?eNej)ycV}!6on^LDm)RE!-CV!j=^vnv`TXDpr 
z&B@O}P1mTR3rEB}^aFMS^L3eb)C_`;GYTzAF%2ebSq~b`A?VYW>0;mWt=>zwIUKKU zIk!|etqLFX zBg4WFvss1YWzF)SzxdlG$4I9XN8b7SJ9Rdy7tb{i1Ki7oZ`!Ew1tO;@OF@o3qrtR~e5xJ+NzTsThHlHuucJ*{pEXq!vc8Ysq z6H!j;jHRO=R%4eB%7~k-Ek-sL_)Xv%xBvcnS2MG+lEHx9o6h61{e!+pUOsWecE%_% zlbGCsh@HaC;-GkVv%CqxB{|?Ej2w#`H!V24P>GttKjD~Dg*m#t&g^9~4dG=@ie_k= z^kpi$H+{b8#pZ|;IrPgxNk$vjSzCFE(}W}1vs2Gz|B8pzgMr|mi$;TE7gcSX?MQG? zc&-1O!zS`(TJi_}<+LeIn39RvRV&`2y*7XVa#2s*N; z4d;ucH_H_fq#D?u`^+yf-ChrEh1!Yxml>Cf7RKn2!9|sPc5C~mnSFFlW|dWjO8)5z zC=HChit6WCQDQuO@!=GYdyM6RmjU{$*Fa5X(E1xI5hyF1I!0(9@iaaBJOn`aCbi&P zvR77obh)(f`KYLYrsu)#I|k@1BASqE`t6Q4#Ksu?#tX#A@z#xM=U;nSoRG?VIjv zi)<{<$#`Zg>lN!fCSoB+%ZEG-4S~C`zgRDD%650&E8t1??PCf3+qKK|Gv<$_>V;(V zUyKtBUOY`^KD^VVh#Q;WZjE?j=VP0JrSzl0zDHm82a-K66fe4uvrx<0iWHbs&+~7H z>+)!AWlr{sA(y`6mo2-&(3A^>XDd9By%8|C%`Nzfi>=kvG5c|)@Zb2+2lZG#X$-GF6#ze} z0_^u)5KVyelGx+RWzzMZEEs=%;-{9&6a-Qw5WZY$jh)v6=F;bp^t4lGV}s_aFW8a~ zo1~ymBD^HJk4?7nGQa2#d^f(%`1+h&wein*Ch0(N%7*KE-;s)NMh@TZSo|4zTz3tk zd2Urf&E3vC>-au^PNEcua_C<^A5&pfGvA=$<7lyUa>e+-(6!$9S!k}({XI;!EHQJ+ zOe=7ebrkl3Sa#BK-exd&$v<2p1-IoNAAdF$E*c7ANmMqV*Ld=_k6>fH0#NK9uVQku zr&gmFUE@OqZbRiUmwsx-Ee+UU__3e5D=aNDQaV3G`d4GM^h8O@N#+|KU*7BH3W_SB z4d*)2?YAkHa=a(@oN7O8q6rsp6_X^Gl)wXZ&uw_sMutItF8PsVOksfd;C}8-q zrI}@9`Sa10OzUme1=fl8vDSj@&zsdB;1xErzw2%LdS~_~IWfxOmj)-!MPJTZ^=?9GT#^B3pdb#`qxZKP?Zmie;4yE~-5ifm1Dv z8-}TJ-^*CKSad9qF93+2W&0r+R-ra2L1J{*G=7|$e8tT(5E&pYU zoq@!@hWAEhyj#wpeS~0}uXVbr*l*%yQTM$8 zD;padkks_%d~bVa#{$EW5%l5s1)eYUBs*y|%u3Nl~!Bsh;>Z5*5Fe#l-FyyyiSqiUz+wyMqCh}6G?_Pq_7 zfi^qI2T+VVAh5z%V-S5+UiUfNu=!x}hJ`whvTorD+l1D}F`Vm8W^Lh$E0A92;7i!w zIrVXKZ`V2(_uV6c;3=HV=U}s>1hSZJHQsm$AnY32Nt4UOWFAnRPxm8BJ)|}?!NoL2 zHqL9YZ(}-v?aBDNMX9@CcTWrEzeRKpy~NfZdZN5;qZSVGMzmsFPDnE-S{<&#*?8{) zVAti8K!)!Ww-ajzqknAUBYj50)S8F#(E=V$P>>(VBpPK+#9ap?2Goxnx}17 z>lW4gSwgQik~F0s2TcAE*1uhwF^Fk6OE+kHvj5o1HLG?3pUdf_#?Jr1S2mN&;OzVD zagkY-L92}2RnIoz)PV213fC42zvVERR6e7znVANMaZazf#x50{=Fz%?$;}#{?=6|F z#Rnx1&sW;#wR&&=RJo}c&xPsaI(`$AW_w&mmBJv|om;w;g7|VF(vGT;58S>ITsLrb6aK;QY1+5koAB+Hi*jGj zA91g*F-#tBCJ%E@^Acho5QWih|ulY{laDvOyvYMb9BGI6cpEW;@q zfj1KZZ01}8h8Hh-ope+8_ohdhwqt{zxMi1Fx`*iDx;8d&w@lhH{5gpII{jy{^Hcny zxK;VlQd;JRIWe1k6DNPo+T%YRRgrdFFICBl;Bq~k<~P)Rh1%@});wsgEwwJ@J|tP{ z6>U&0jl4X!I)AUzgLk+ONx=MQ1_|hs6)u}t#oDs87;QMONF}C&d%}&z8o{MW0?Bi` zdC?kFEizu&`Jx;1@_YpbR8f>QFED`Q>Ew0)Cr_RfrS2*k0ZGCJ22)?nn%rIX%8k;> zX$Cb(+SUBGQI}yi2fle1#dO@xa@3D2qiK^PG_X{tAWwBCfRt|XQgX&aK9&-cn_MtT z*K{DBN10?`2b7o*q&jw5j#Wm=n!7$F`v!hSb?wK7YFDMO|Vvgog@LI5-u&Vi2S-f+qDvNh!zUGdu!kK#iZ`{ zuQq1YQNDvW7YnaR871MlOQ0Kv&CfHQHp&zmId%qrf!X>_bn|D^hhP3TCvLaC-&>_< z^k&}Xy0fFR^Rpaj9~Js)Kdsp}to{sZ_FDhLuQa6g>$0{bXh6_C1k@g_Tl6$dHiL8m zUHAE+$0wIe%Ie__ZsqPJ*>T`QsPq-C!m`-DUeh9yLtCT!YU){%U;jOBH^X?v8jBvx z-be#P1d_Q!sMTyF{Qr-aI|^p)>XHoi!zCL#|R2U?tFz+ z1fD9R1p}&{2%7)d`t+6gu%K|X3wyNSR?1mQp{^~1L;YVS zL~$44@Ww~jQRUbV>OIy8L`*F#KD=dbvW2X{=FUYIv+<|%J4F5#}FkjfmI`zPQnW=!k%1X z>O)yhMa=RhotGnTYmSBHK^4#E3>&qcJq^32?Pk6d_FY_u-Wb4})7mlS&VBk9+9vU* zniO7AERE)3Bs6|g1sM}cW{E#NHU9fiqWr1vaMY_fvY5xO`CHD5vW16YZ0{)E-mOsS zHjgkcq?9p2m~h^1K<5rGydi>N|Q6~yP45);zR73G3i~mN)XzAqp5c?&b^MwZ%>I6#i*(+h3pHq z6sDr;ofQ@Cn?Tt9flfeW`h-FaF>Sa%ky{R?j^!sy>(L`%V&r+Bny=kE#IUC+Rn4oBDX^Y64d7zBh#YMCo;cT1G{1K(_&wfqH7qR zd2=C;9-7rT51+CI ztp+xCDsWVXa=d)hIxI;Aj~#Kt`rI&-io*eAk>)mo}w zP^O!{TT02yo_4c1wkd@b>fU^!yWtJa#;Fh*HH~azg3~S7&kD%v@3SCY)2TGWsE_m|yFdBKB zC*Vv36UYu5;w`3*@vuEzi&`|QNb1id;s5l?<7xtBQ{_dp&v!w_t(e#P7!032aJVJr z(lIP0j2?=lXzFaIR3+>=24?8_(1GL+nE8g=2h;ww}J&u+wlaoXsBsQTefi9x~D?47eO-On`gKQ9$XJm%0H9(eNS4y|C%_w zqjK|jG!XnNgG>iHqLaQ&B4Rq5ie!~FGgP>rtd)6#+S=cOTh2#=vgn@BNl}qxdgD^Z 
znSQ)vO9WI|kzd%dKVv5L9cfiZZgMq0xDajWNo+h9x6niw_R4WeH1LhlJSFK*l7QEj zK6xxN7v}?e?lzbIaNw8B)pUB_`R7_}vbY)-C#wVWp}{dcZkFQZ#+UvN4S;-MpbdCy z^2B!JU=0g^vMMUGnUX-cT_>rZEN)hp4hZ*+pjcYp&hb7xFq<6S_Ld{SV^XUs$H>tI zGUx>XFh#iH*#7p})N`?#S$CFrl*8WIF(878&(NZ-+K;jWMdi3%;%GFgtw2ux{zLoA-7SjtoaDcef2YO=LW9G@*DjGU^k6 zsErFIZJaB2JD6k|!+5E>v{Bs%k;B(`lHw<4W+;|x28ex0K?IZUx-E))zaVA$aXq&& zNb%8r_OCFHFJ2){0*59Kei>)jc-iTtW|R8XJ<0UcY$hwW%nG{Gym? zvr)$h7h&xkHWjM+apE86U}8x=8rJ%b0=MW>OcTfUz&`_HRECvishxzb_mVzt!Sh60 zMG)&1=)W9|3WrZXvX&oNlh|WxVywP&?K^81$LXhB$iBrFTJJ^zwStSnA4NavcYdh1 z0`idq5x!~r0yDiE6|1%xcztRtt`2agA117p)fdxCkeP!_x z8y;Y0n?7vOY+3uG8$Iwk3>n_}nW%?t-o5Gvca)rMJVNY*7b7;+i8Z4@4M(UfY_4WmCOU(H^~C#CWX|lXQt8YnDc(-XP{J|EPR34WJz|db{;{ zZ(wv~K z2`y!WaejZpfBe4K-j(Th=6&(bNltmy@*(-6#K^5wa!UM{+78D7DH17ODcQjzo)4w`T!z8E8#EGAhsc0jdA3MKjrj{#)JbMK1PWe8>7l`48U}c<>+@Hz zU*nNR|A$I;@8HI7mLq|lZaX*~<`EZ%JCeT&JZ#hJ828ntJ5gTrk7fdNk}%zfnbk)6 zM7BZ6C&lvpUZY7Ep|1^2csJ+ck|(ftI-)A1ynm}Nkfr?shjF90KvqLSn!AlsjYRi; zzhb{~5rcTECm_*7nN;dBII$_M#D(qCpCYDCD?Pz}@@9sDm1arJ_icc^yzZ%s&F3-J z&6Mt!wvnfIoQ*t@C;syV|5#bV5KG=+nk?;u)TS|c;?s$+!sm}npq-SS5#Oxr;#R(} zHM;&3v$(nbTox$RQsr{@9d&hYlV>M%?f zMP&hiAX0xB2wNW*`YLA_H;2(&P04CxG0%G08=YQwblrM&Po?gsn@_up<;m4D?(;oX zB$2pW8R~85eKVv55@k3)$ay&JZ#qYj_ylNEnD}FlnY}}g6VDruxruL%LoqpTzXbl& zG;i!Y-Kr&zeU1c+vEgRNN_VRnpPjKf&FBY>eOrFic1{wU*12&D#{-^G({55fGw}_~ zX+3p757TKoo`@?SOgpPec^b(Gr~fKAF~9=ovBx0p6xgo4HaWQ%IO<%<7JY8nwhXZ=$12A?T z$XLfMY!|~6NK%b2A#F=?5{B-;+X>&i{|i1%RN&g^o?Ec*pVwkya;1>HBK^k4 zg%R^w1Yj7;Ps?U`K~Vqs2I%%d%CWSzgWJhN+0)-i`T+}G=e2@T3--TKLY*GeuBIL~ zMB8D?I!qhhhqgtqZx@~=VtS8-0i+ghcU|kc83C#a-*b{iso8-dd~^z(5Uxp5RVPB- zY74d+F;l)k1IDW-DLPUMs8iRgPk$2Sla7@8zTJtx475 zbgk0d6n2*@#mT}So2#@UuwqhHTX{U5i5pe+hV*vZK-SmJ|iRifLfez;RaW;UY zmARZ0HM*%g=KWRpFtd##L~dE(W*_at_64ixWx)$ko%T=8M}ZZQBCNpp<-C1RjnM7T zU~be96z!3Fd72A-uKcm{ZnmJ+w)^*mR|eh>LO@8D63?U#E(CU`YT<=S3tnI_wl(0{ zu?_9m6n_qQRuYbU3(QwMoQQ{M-VcB78$gu-SR4-j#^re;hW*z}X6zIYP%LEVm?e~j z`XvqanA-g|Ydrj1;7i+n!P+)y^Ne8}NU$fmXI&wqQSDr* zdiU$wmRuTUFAT5)YyR=uu;T#dPY!nq@(J6-+@1p44|x7PCNg5|+fo*&;VmHk3Lp)@ zBd+Br#QC>ZIvEaJa+dIbc`e5ZpF8r(IsT)EP^L%@$F|d-BJ%>*$#9hY<{`~~eF1x#hG2z%l7J*^L!j(}onxE)N>RcclOFn&v%?L&=f?XDX!2}v1# z`K{a{&t>uG2=6Hyz*Jfy4tVRh_?PXxVwvpSAYW)@!mN5k>M{jFIpE3vIF^)cCd@!g zQl6B@u7LdyDD)SJ+F|LHdBnjif6C9wLktyKy|CcbJon@Pay*=Y4&ol$d$k4GCv}>w zcHQC+%B=ufaKq+J@ddF zuarCQ9t7Yuw45|?vi8OVZZ`r;^(EjAtd%X-g_FuXWQ9PUL7tkf!;@aP0)EVYpowgv zS1!P)_OBry+1VU?S8hiDon(8i7Q!MNEQNFQbw-NRQPV<~3ODXx{&7A2kJR$#JQ*qX z%``m**6mjk9TGkpr6kEn*gv)*<+DN#VsaAp+1vdjt}oY(_=7+QgJhR}7GwCjvRW2J z<_FmJtZd_rzr&9`$B@oQIqP6hf(VeC33P>HKDEQ;T_PF)-P zeASem@XQWOEyidZ{ux|?pETRkPCAyA{f>j3?ccyos49*f4s5B|bC$Ni72l0KABk2p z|ACuLP*D#o#n#k~qCRO4@>4q_3M=keL&y>_tW-N&^Y|bBek=Vl<1&Y8H8&q{WE|&T zisH@O>o~I#)Ux~-jP-9e zCI^dT%?RUkwIT=FSKL*+9}@wduR|LCg75}AQJ75{ZGK(GrOWB()F!z-f)j>G=( zdf}Bly}#$57~pXe0LNzl>pH->#?1g61BS`JV+e38@sI8bPRU5ZocQ)%Z^(f{;mh)+ z^81lU(Em=vOfSpt!<32tbpV(+^2yuFt{$lxe(>XeLIn&KrT6$f3&|8Az+T_SwWF6! 
z{2z9mK{{SJneOu<^O$m zI&1<6%NZZ1K z0xKZ9>6%?BJR$-Y7xxU{cmS!7uAz`Z3=U2eR`h2U~OyzpX(?8ostG#D_=6k=| z{^mt6Wr(^M4_zq=zwYO++ncVC|FDU@k}d6b{JQmok+QzFk5G?iLLeZK4MwxOGM2KWQJ5m=;_aOu;&~Ja zc{^?G_sFmEZtn4TfoC1(jjS0Cj3I{%mP#}ELj8gttLI(ibb^8Q9-?FgmV%YWLoutR z?2kQspURg<(vGem^g;|}qmx3dWijn0XeF!O`U`c+lRkm-MT;o(c%sTrv+LWmNLPDe zJO3!k$6M`Zag-X57pnvJdu1=IAPrFGVjv^M~(189c#vl=ni7<}uTv?A#{IG5QWu1 zZ4=&vOknS3R2YZ%sJe2nCN{z|Gq>4@kdGB&9b-O&b(%etCyuQnC5O@1^m5pOlG9cv zWaIMpA6;`+o1r{hd9p;r?mwImLY9Yi2nCrTLSqytavZCxP|HXw2`ETtNA{IGIpLxh(k82q@x2{g3FL1KwK!|M&l?-05^XM>;IMaNu1tZB zY!Vn9C8L8upZqAlhx>QP`Fl|W#y%!gyPuG0CnXu}V`{@x*#myH6H_+2%S`PQSTl-J zC_3b)KGR58rK;?)hH|VHO)6F_36n=tnW9P1IpY(1pLphcf>Nc=zK;}O!duSCZ&P4e-ykS8PLGlc0&&>zRxF82#accR zBiD@Oe(nd?rlfoBKpu7(ZB;<3HH#pbiHM!Y9(K!MQR{Nzq(-8tU?NREm<7w5uO=i| z>AC4~Da2koGi}B|`NpLC1XmPWIt78j!FtZk=j&gk*v5I+ZcgOKbf(|J;4K z1Eg8G0^q`RBrXZcOYyob+fR;CFQvn6V4i{@&97F__(_b>vo=nu*+Omo36EH-gigY` zthr$5tcpzQ0W`#t#$w4nune_FEkdkvV7Aw2V#e3P)!X*-qsLJnCS?6o|H$gbVa3pk zku;>aey_X$?U(5}-iwXRa{CK0u>#h65XxA;zPaGZ=c!%mFEP~F*aBPTYY4F&ZFW4lL(R4z3%$jUMMds~S zM-cLqtdb6^?vCNiI=G|YP!C>ma>~%{hJ@X{6>P+iS8C~t`Zgg1_8YH9mLIUU9%adN zZlgTW@R8E_$RmgiSHH7Xw!0_#Zy^GPFtn&>p-Ie#lr6tQl_Y%`3%1`!cQa5u3GJ zmL%%n#z$!JbpGTHChLPlOxu-6lIM ztX=jXMhE+Wk%T`2IRXgn9=AOhF$W7S1ik5tCVBM+f;@x1p{F8?)tsT7>NU7Y13RVf zM{)vTse^mU>O=w!QL<>IgQdc|s!3jm8kepz&PQtYjOHtAN-2T|M)E)9)T!yEl{h&d zSbm>%lxA^D{^B5J3)u--l(ZuDeSS!{p%rCozaxK0pd0=|JDV)pK^T2@V9z6#Rq+A1 z7bPCG&=uBfci|Y(gAQboubpB5qg$)z?2^I*G3j!8s_qk=ZN{r^Ot&LPY%D3g$Vy(A zF8_;jfhVg=6Sg#y3dzM79hkwu7?kyKTeR;Tt!FD4y(8jx^Wo>4)#^gz!)7wx5%${q zjdpcD|1H=ewb|`04yy|)`b&eWWEEh% zxl~ZzhhK<576g#zFS7Hc30??Nz> z%X~^upNASPv+)fXN>TX+%AQ$(1dC%s&}E2rSWSE3xwP(m_G zBk~zl9;pGHi}Gmy^mIhOETom7rFbDnPB=YI!5pzJZbpXPNOo#I)b!XKZSQr^#CLT# zdnMt6fD(sY?Ms+YF$vGNZ3(TV`Y*>IH23%FXBv#wWM|a=-IKUFB&dgXw#hbr$ysl~ zs|Y>IVf#WYz0tEfI}v0!XDB34+7d8-@EF!(4jG@YkCC03is(}3rpx(dIL7d0X5et? 
z4vt}u=IP{RjRq~zRxJ5sPSG=1i#dn0DoU{RU(aKn5!5_$t@!b-F&CCj;~nH5mYiFy zjbYCt?3j-)SY0j9brQZop8hezj-iT7Ce7vzmu||oHa@E3>=nf6#n;RqJ z{_Y8HdSOzGC&Up?!K0&x0hp2Y*R<)18ro^77pG;iKjs?wy^-tF5UnyTJi6}a3o!@M zBBLI3$I7xXWgQsi-%^Q0T@lI+2aM8XNZ=X`VEI?eP9lxEcr|%>=v^gU#Ymt&@f2XW zXS;IJZ#II(%pQX45PYAdxT1bhhC{jtnZngk$ zyqNz$TtD#>zlsJYu6>br*1=tmFUSl#-J00=xmlwcgK~_79?y?*M)FEN2Qm+3OeS-7Omc94 zqLXy@46wA!D=5Gt@KOjG$%)uBKZArA~DeYJT0*vQD{ zND}%f^OnX)UoJlM(`JmV4S`+;o%grQW-nR6Sz-vmWE^jk@2KpGYkrasNcVokW8~`* z>N`s^D>;3FzxjTbZJQ~Sqh9SSg|oJ?juNpFzalJ*{}FP}Pe>}U&R1^uO`)02hRztP zf_6RPO;j?f0M3`$m@SFus3tA7MR7W25HW(=biP+YKZ;a3zvtpRB1sR?0-GuVZcF(Bg_@O3Q+4f}QG4D>DCSlJoB)xB_ z3;S~~s)nw+FgW6GqCJL6Pr^Y-V9DE*JNoYEEi{s#7%Lle4>;edJ}Y&FtZk9+a`I>a z`*YAfX64?%3po|*Syz^~$&Ye4yLewe7^dXOoba3wknYdvO_o(z;bbjA7&vRqD2S?5 zrE52%Xe?!$A?_~g!GMt=giqFLRu%-9U?k59dq>OQ)&%$J6~nxLPgD$dy?mJMPRhlYcn@KZcm(SuH>l#G76>k-hYP&hz3 zB}UZ7%IvUm7?ZytC`zFdybAmp-v+*_vRNRMJ9?`^RXHvUOog$JSWARtqM8e`j?};$ zrNeaVG#BK<3EJ;a1$6KJP^yy;e~Xmivo7Y41Ht zO5!MG8y4b34&83j87>0)q9`GyW+4{hLx_#R1S@nOm!KC2a|n}qz;Yj{`>_(E3W(rr zK80Mn)A-y#$&4SE3E^W+_Me)kSN1>WX$s+((OheLZF&%Gvz(WO{w_?0$hPH}HTS=N zsMI_HL93%!YNtH#4Gb6a)(S+H*}uypBxl=6At&1op-qs*ibJ>uRfZjgNq1Cit;Y0t z7rr3B1i~)tVxXgpQ43Vpo);LS?gLG7GhMEtf2GM*cvk*hZptGa{8uVoBPtNAL#x{k>*b&IFS@x9|`s5Ah~l!mey(r6wGIvPE)U#g7uCJY%|Y4~6}c@wjn!9SwWg-)MgJ%gwJ?7)5iBW-qaN&JQttSj zQ|fVyKS#RUei#9Rm@7lRyrtHldtg<6`GPbd;fxd@Javj>V4H%N`&|UBL=e#2@C6_sxv4Q~YBXykAGfY|%OhmS@^JQ@3c+59bf483x1M(G&MPH^D-*fCf*HY(3r-3bQjz==Xv#enie+Uy`IBMgJ}64MnD!jVcSK0Zdq zB-Z>SviF{17YD!GNZ%I|Vu`VKqo00#Vn@u9sC5SuE+UK~PfzSnnB4QGQ4TknOqWE( z0g9iT5u`BT?U#%f3Dka2w!&OMRza{v)JDmR8K89pf0DBySY*6g9a7Ew>oFqmpdjKx zHQ0*tv=uNDt1po2*WBQ4n){g}`!q>Yu><_D*xg0>wy0lI7cmRuL!clcf>P6FWS z=Ok_C<88^^cI$?k#MC3z8@@oU04FE;qqQndcq~qet_0_OHXeoCNANgZoup+llAqXf z=rbM-(xe&1eJ4lzLjhQ>2r9#s$r-y!?Ku;|XCllh?!$7_x@qZ(yYXTD1BFis>%DK;p15-J+Z%(tVX6Nu^fMdTO^UKyO(F`(_LJ! zeUB;lPl8f`tgu->LQOU(qn-tQ5F%9w#>!`Spu=#F4ivI)O`IGwlXFjCV(G%hC$-5I z)WsIqJ8l0gk~Sg&Nak1m&A%LyT@5^HXzMUtPP1y6guS zN)u$ovy-^S(oJrW|5a1Ni?nOF$eaR@1D2Hgfq#|$>T}jir41qbEPw!H$8^yJ5oAet z7RTIy9K@xiV=@WxbF#%d)srPGlldvo93UYk$(A`uQ}E<}a8Q?Sd66_8`O0K<_#3 z>jDF$)74}^RF6^2)zrsijNKi8FlMAUDRiL!T_f0R37|Wn2?Kb&$-+!^|A$5)^HOp!GOQ!SqAAE-gR``eK-h)H`r%!N`098urUhl4A0kxC+j`Sbj=b&L; zrvVj4@b2l6=^{+zdJn z#OxffUqk``uu{2pjXB7lnVD1w6h2~yS99JAs9=7Cb9bS0vera_(-8LrQkIb0kWR%0 zmoi18@A=4NpL~RyL-Xt_({i>8te!1WJ`!a1iwO-O|p86hU*k`~Hhnfmgwy6&GU% zt2hjBXT_I{J3)Jg7%MKo#B2ay>Ou$nXSoPD9r2PK(`de2vs)*#Ht;%Sqn3r^o5T1} zsAUr6>&VckAJbA(QK8HpOSF<>DyN6eKf}n0cW?nv?8`mSI2eV>MpNSTgKMVPAFn5@ zvqlW1m|q5`Rz5tdc#^it@_c!zrStL`gY%76{t^pU0v2wBcogY#F6V)1pc(1+%}^2K zQ5h)#gV(osZYYWpW1^3LE;2Qe!Bcc&mMWiI_6kb1Pdgx*y0Fjll8vUWY3lK7o4X6g zUo>#|u>nQ808kRS(WS4+WciYtJULKfxnLbU;8g!fztR6=`KdZpcBzn!2}w!8ru)BWsADs zQ7M=RiGjzsbW%ASc#DghM*0GUr2+?ea(E9f?Ttbdb}di542+D$q(?c}K+t zCqp}rnTFVbik%z?j2MOCu-vZ6d3Kgmu<=R2GUfOoVc0D>tr?>k1QExL4)6UUTrJq<9L`}A=8}_nk9IG7fj%D-N05EXlm`NCge6czJ9=fb)oKgL&%H`p~gx^k@U6!@S6|C zYR8%|0n_S;tT_G`>Z%(~^PEajt>}3Q{a)PF! zo!A}*V)9=$&;;GvsbmROc$ST)7+&4$E6~6<{gDyqOfi0dN-=m%+<_wiio%? 
zl~Y^PyOPGjuMz{RWWrNtLRBbZ^vbw!XnFWSn7k>zYQ0jbD|)x(fEfXWqq3mN2KMaz zxUn?v*~I)kB>oUi$-~JMjDCAP`nS`gNZ8J=juiH)UrL3)RL zvLA}l1%bm{GOjrF1vHBi>%^tz$c=}J_mH}2f5JP<_flCwbKqN|)Nf=I$6}%0I$%hW zIx_rhS0nV*vM!Yks>Gnn_?pc62>D{wn|oEv(VKF+4GK((yGEj{A!f7Ib-?^b<^Y}b z_v*xABitaf&06&y)rrLPgJwI8aEJn$ZIqT!ZQb~iA&Rz2L^hP=6Y zGPR6$sB)uy&|;3p4|ShLw1n{}-a^Y6SBEOoTs>!u#x_b&o$@K{g^ut}pzsab09I7_D)NHr?>jI*>2SxB$~DwNPT0 z6QPT!B@K5k5GF=DL9Th#(KSnnRQ|igVwI8_0TZp^Lpa3k$o6E}<@d)zP)bRX%SA`S zQFfruN*%MMC6`FBt^vV=uzp1cX$i#PpYoc_l7EI;uO#N*L%G+wT<;N6g4B_vHuLUCK2tn@Ua^1;Wh862=Xe zGbLwniz|ggV?3+|jjkm74;^P7VW?}Cy8C>=kfae$_@NOztfG`A;-(=y0-HdfDL%{=H^(iSZbl0CXj7E+M0AiM8C24&nWyW)k}rZR3n0AU5o#Bj+j8Ebe#EjT+w@bot?7lUpA|Xd!(< zK%6gKV&VBf2}VasQ@_p3xQp=5-W%XepoUbCc=-smeG>_hM;L$rZvsd_=$7 zo4lX|N-=x8lA14Rw_i-dXFQvaI>2C9erX~S$=4(_ii$hX-H0y0n!(b{qhJZHc(GhR zua{Yuxv{8xmkV8R0&F#!8wLG z@`V)Y`Ga>YS?3`x$Oem-}fVW*Y&M`E(QF8H2iI_%M=sjXx$3_n4&4vR? z=Z6;TM(ZoW*y2zPXwpU#)-TCIeDCDPS(iv}*I-74==N@Pu5Wm(hROP+qrYZuhyQPV z$cC>>j1LhE?D{dxk$cre6U+U0vONgX^u=YVi{?dXzebu2>>N z5wKts5)DZ{uJxztnS~1NoFlWbjhkI0v&@ZUCr2NC=drS<2YNMLiir9Ox^)gM^Ol8H zHU+q(mUqk>1qCBqW_<@ymX}VVfrmM>Gl0X@c8isprrM+CFO5fLH=CPJdIZ+1AG>cx z4t}0;qb^uj%qBhUG`KX}Kf`;yY#f^{-`4qiLq5{jvxzn3nYO5&Yz6ZQbsm}U5Q5#Br%qRwfDIRgN*XrFukKTiPSWRaJjNw zvO*@lC?9nFWND7j0<2(-lRbD-*6*%%L@Vir-nm=NXfj_EwDjs<*514D4sT|)nOr>D zfz80*3(6D@^{pP6vNWgM^DyAj;3mP6VO3Hd_SYzIH7W6Vaq&oXW)4IMAI@WoG#Sdbg7a zhWkfiXp4Yr_j;{{t?$r@!`+mW$TP&dOa~cTkw)w(DaO#jq#1fQi+t9j!6Dyz_c)uq zW2%oFRzBpHv~Z?0yH8_yvv8qRPYS0#!QH_R3O2_BKDQ7`{$u(4L2UB3gjHV2*Ar{I zUkl=a@1x|$$9F{s2(ps7=uZZ#S~ZL`2h!NdzR~b@XzD%ZPJ5JN`Hta{YoMY$6cpgv zVAF1+85JJ`i$X4qm2#_RsBJeYw5h`pTdYNnx^ybAELLFDeq+Rk@ z^nU=KR>o$RwoPw}gteUkmc5nTekTZtvu6sgFpMOyS{X8YrV5XsN|m?wF6RAMJQt%O zzJ9WpF@t>a5Du*h{Nhve4=6vVNRj`qSR4lQVCHYw~M-BGQqM#huen@f!!Ha>K$G*TuQ zN+aafVwI{IGF{-iIt5@Bp!4M>#D2t=rg(?bV~7zLV(mo-0^Cc0QXCzICe`df9V_s1 zr8wv+k9)~31D}05F}MkvS06@f=_EU~)x)Zh@#OJo%9u6%F(9s*5WcN`lnTddyOz=%f-3P$XbIvC~T?z)flfyB{R)sobdNdn7&jHP+)92fAUMGR4@=JkiEf zQPNYaZ`ihxC0nOaztpk3sp_xZ@r{^=pez3{NR8vd;QWv=o9l^3mx=?t9Z5$fuZx|D#R*SSLno4WR2SD;9X>Ht`&J1u zltI6ATQ`vu0|9cME^XpM0UoPKz>eQA%(d|Fx!UHX`va-8t*GVnTqA3%4hHAUN{)E5 zBl#CQt&V0Zf(5DPE|N&JmE?Ay*B z{5CR@;8opaXfl)R3UwL|j?rvJvmGvL`43Oy*Jf&(JEdF>UI7~_9el|2@Fw&3r?Q$m z%>JEMn1nbs_Z-a0rqLB8U93LdBdj&CRyrCq#XYP`-!siGy)sNt%9p1!zf@5$qArjj z@gT3NEGpwqkJZGGnr5f7;6m4a9N;umnH@uA9dGq-v1~o&o&y{1ciVHw%rA>e zLJ_NP)lYCJi_KJ=Y!E*#i}h7xW|gSKx~l%UB$*ahE1|ATAtCZioc%Y0!mg^Lmn7L@ zd1mN}YyHsEcgaFUx$P$>Cd2FMX@xolMDee82gH(>K9On$`A@`_tPq-^Ft@@4**FWc zUWQJ1`8?IeB^3G^Y13Rn(?8dX!{>Ewg$7AVMk&} znvykeAPxE0xx#qPRDTY}>%}Gwc6}~MnwUmz`9jr`vrv?tZ>7rEH(0corVGQiW@9Tz zrbi>r8TUg7oCb9S9ag9dv;S?*x==yXCQ=&vO+QeyPA%y@=7df8Oh+jo&ZHR4bX@2{ z8jWtu1YY}VUKnT>&_qqfo6*R6azDWBiwFw=E5;D}FuV^LM^8%AF~XegS3m$5q|tG` zK*SP|^uy%5=Odm)aP`W*wO#+pV9lJ4s7sF03kf_=Y>FE6&l}TtfPY}z+0XmHIcOi7 zWZlr1*jh7@(CY*RNOIioRpsWcY!_rEX5>!tKX{uL$$R1s3%)0dHz|UGMW~y(Tq`L< zyio6)0f>c53;iY?qbs^r?qBoS;0_PKlNUhLNcA9non#9=ZEXz6>?2Q!0qgl@3>uU_ z#~KGSru6tB2E{m$#$SX-1N#8MQ!Q>?1MV9j%JBGrFpi5?<#xX z@A>FT0$7NE1R!FuwpMcXxH#4xzjtNWJsJ%s$as+fkjSN9Pz|Xx)z-32~HF_S= z_sGqVD|wRujptla&ge_n^kCn*{_7&qH}+FNv`mK^^y5Gs=3Yct;ZA#&i|3fj)9CNK zE;J+∩9kf-q?gFAdPyrX98bP-YSA#xn#*LaSDl>(BAZE#A#jHDy@U`GO65QS6X zAa;3X!2s3K2AY}%3E-pzM4F6KmGpH&9Y3R14pnqD6ThNYnk7>nGZ4;y4~DFx&ypsm zsw*Y^Swo*Syp#fA9_|Hn*C^>9-UB)j)X<@DQN<7Rwb>hYK!jws$FJns^tqugeM1@1 z?$TloyN4VVj6M6h)zL=M$UJ|5vIk)nUJ}2T*+WLF2_0Q}@3x2Id%0!EJr&gvT52n| z2Py8c42S~{)^?Cp&c}H}Sjee?d&3jJT7$U5BEoeauJkxAOV2{MqGa0{_=y6d5voEt z0wNUyl*haV%oPY3`a*O-K|LgCx7OrL>LI}&EXF@RC)~2L)s^C@PvE^frhdt$Etl#= 
zGme>bdgT8c{S&sm$xWMAqWhFtFX*K5>sW-;iQbQ4jN2z&&j=e%GN{DE!b-7 zmUr9t8$(o&#&yQVCf6^Ug3rN9@*Y)A7v6t+=IFpe0ZGGvf%Ovq5=s6zb2yk7S{gF^ z@%-bsai}^HhR2THj&;wEXz%>Uwi!jfvNmQBw+uAMh{vz1Ka^8qVUBIZLjx!0yppHN z%}Wxn=8d2H0D;tY83hNd-6NWy9GN34+Xkjth^sjvg^4`36%^cLdbwRVoK3bRyNM(t zbjBuXgJjQn)5GVIfm4az<{VV49x|fw;i!giR(A-rj}nVnUUsOK1F(H_5O+crUlWPM zI2Yex_P_?kG=4YY@Li<{?-T7L%fjr#C-7fM#-z!UFT*B_z18y5lXdrGCLudSN)Y#U zFKU%pXNnO(3a46CF0LZ~TD93#Ef5m8D}PKtvT&}O$;3^#;XZjo_4cio$sPIA4psvL z=7bW3&^ENHpjGw@bCaZwhsuQg`Cfmb1@S{F(7e&%b7@`>UQ=E}IXW=#`8aa7=J_Gj>7jU1MAG3n-(w(I(H+{RuKI^w#8D9$wecT9Lg(5s*tj8 ztA!sk^MUz*%LBzW26_->#$l@)!>Bmdx);f(VBcB?y`|7;{7j7`u6SF)=lh&b_NAVY zIX6L5CsYMb{@O*adqA@OktTbFotPn$*Zui)?;^OJ*ZXq!u1mYRsqUSG6%4t8!tqR$ z_RG@~){1uf^XdK>``$F=S!Z|qERprJK1_Q|R{CqDCYt{2=M<$A3VG*Ao z`IIEI>a=`a|AZn;*Z0lC8Gt{dn^B-U_5%>$uk_xo50yWPpAFl?3>UZ=STSswcZ$=LpGFN!xLpJJT- z#jrf0=DCG<1@UF8nU)&h>N}r|NE@NH%u?eiL+%Yn394GdQ`(5I9Q-Q73>SFn=juj6 zS-80_iV_xDuoWD_d$HXc`0Q22byCfW*{qMn7?WsGfU7dwc6 zRW(j>!#wuTw4w2`uIba!Ug13gLE?Cr;SgUpT#um0nfvkR2EoE3VlDfHU_5IGWt6B{ z9LJ~4rM*XnKo?eOLyOyr=ot&GrhIfBByI5)-4M~5yP0Gz zeYc4%wg2I&Xss2^ zs;5Q3*%PvLA{>UXa8a(AvdIiEN_v~hmQ1@8Pe!+COTkacybv{xs3vH*{5}1Pv z7K&{B++arxCb&(5kz z&{XqN*t(KNcyf-|a@ubM(6x*MwB}P(CLB~IWcQQ4stMa}!!uf9V7P2wuYGFU4*t0C zF~|r%en%gppf;s7yc~WsMc6Hlr6>g;tH;1X9Oa-e-)!q@=4e{n_9>T+EZm_cf^m}3 z4{M#4dnEBewYNDXc$!(JNv{ddj%7ar8P)s~DP^NC1%<+RH@DSYof&MyJ2F=DJG5Qw zu2~o8CbDUDtQ>_#CmgX%t6OK?L>#Q@V4-*}jX3E+Xr`EC_xNr)Z94A+B-RQ197`Hy z9~3^d&D0MxL}|7t4&41N%pGK=LCe^3nbJhQ>k8#I&&55f1f{@v{LVI_h^RIQt{ryN zTon}t8kopXpD=&B{ShYLv`UPa?K|fUok5e?)-RKs@sVU)VX{s+$p)F7Y@y}D6EZYJ z*fR5}9bXUVd6J7EuG?gm*@lL0}mYRL4sN*(UvM{O|`?pjMgs36k4# zaIY}SR> zGP4BM*|51!z7Luxb`+M6=0MyLk}t0mQ5iV5{s3glijtBwo0lDO*06EkuVRed-apbZ z6?6s{ILg7HlfxR33g@zSnw0Q9Cv!D*C^6>-XyhEECLQkBBpHaAxPAiP+9CM_2e(Pt zo)lI&F8Qy2ns#BSW}ox^4OpLRbhUw#6t;SNjpu4J_L|G$+yhqu}HHowFx#H2mgnF%bn#!{ zY_wS*v&k`TqT_XaplX1&11nFLw)KyS#KknyH<;^gsKfcRaESwM zc)*$ww-&lYU3@E0MB;{bX}4D}gzALF47^IZk*JgC+3~xf)7_1(lRl(Q!ah*=&UzXU z-d_(_xWhskt3!`Z%8xX3@`JXGHmM zli(H{qkKe)t@7CtDiZg8{!;=mOyw{Nt!^5#;6i$F@NN(MV5E@hs5{HIJA(tF*$x0q zb;k~As2*Tfs<3QzM^Cw}f)EPDT>F6DWw-}btaRo`O1PVG#%=J^&=S9dSpKI1YIxrJ z0ajg`)B>A1iG7%9wz`l*u9V(0e}@G(07ZDFeGDHQcH0wf6^Abn^&MFWvy$Oibs&5w zbzCVk=TzLhiGXNQ?d=lX#QWQsbdTPv)f2$_X6r_oX=&B&U2N%sE>P>e`%LM&mAygk zLmrLm!sS+;Ic;k?-`D5AjVv=xQ+@rf>-0-fM)}8j>FZkP z3!3PfTKx(x78CRE;3VkKYY*AZ(e~QXP^e5Snvh2i=U^M@seq)9rMI_{GMdY!THYfMFbmEjDm^=0j>ZR{EJt?dkdP5ZT~0pYD;u6qW7#qwF?fo zr#Vbj1=d3(@PTRG+)IyoOBor)!Eya_VlNXbEopZZ{gzp*iGBXI|1OfK(sz0e{GZO_~V z9o5g8-Elb$yaJUFk%o@MGnsk-52=A41|)@wP9s#l6LAiN{3*b+U1J#l*(d7{PQWu- zga6_N!Lj|)I!;X~P2RQAD;?ZeVA0`jYje_yA1*azFwz)?da-lsd3QOfD3xdB+yhVy zT#M({>1f=auyCw4h-6V%i*s%1O1-NLKhTmuaelBr$ipwv3?)P&4XrZK-Y9I1I3ZF( z_#7e1Cow=fQb5C;-w+T@-r0Z7I)_F6<}R4IF!&= z*F$|(fKmS4^>l4){^xnGF89xt9{bUHkr6TQ1md0uaT$+uSr*Z^R8Lg-vQifiTW>6t zum80g7*ZN@I2%8@io?4h4y4j&+^jGPkVhg#i@{ zS9iwrfjrV@Ls;8GJ`Equz|IFs1hP~}B3?;8$MP#nqYh`~WcL!z#`_aP3tZ#%_X|mH zzz0gITN7=FVcDjW_Ry*FyA@@KkjlOh+@9JxOCpxeG{m>o1l=ay7u0a-Srp!SRjwtZ zhke{=-@_mu7gEe4%2feKe$fyN%9EiUG;TC=WvpBH}^ z6y8BeAMhbf^I9`MkQgkdRtDeCT(mRtJa1lhU7fqhdC}i#8nckEK36{@{3~(-!0BFd z>EGAO{_F4hSN}KFv$7I@HSpJBzJCFK^|`N{__xx&-+{juIQseAK?F^4&!$_zpL+mTB>^03a)g&gN^Z`Xb|@cW72PXoEc ze;D}leDFK;_bt+&&{48~K!4vb{chp+O~aoSo`HW@_{$dJcl2MA@1IyOuuF0VNK mzr+6;K>r Date: Wed, 18 Sep 2019 00:49:13 -0400 Subject: [PATCH 16/19] Oops, wrong readme! 
---
 Project2-Stream-Compaction/README.md | 95 ++++++++++++++++++++++++++--
 1 file changed, 88 insertions(+), 7 deletions(-)

diff --git a/Project2-Stream-Compaction/README.md b/Project2-Stream-Compaction/README.md
index 0e38ddb..5b63243 100644
--- a/Project2-Stream-Compaction/README.md
+++ b/Project2-Stream-Compaction/README.md
@@ -1,14 +1,95 @@
-CUDA Stream Compaction
+CUDA Number Algorithms
 ======================

 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* John Marcao
+  * [LinkedIn](https://www.linkedin.com/in/jmarcao/)
+  * [Personal Website](https://jmarcao.github.io)
+* Tested on: Windows 10, i5-4690K @ 3.50GHz, 8GB DDR3, RTX 2080 TI 3071MB (Personal)

-### (TODO: Your README)
+# Goals

-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+The goal of this project was to study and analyze several common algorithms in GPU programming. We specifically look at two common operations: stream compaction and exclusive scan. The two operations are common in GPU programming and graphics design, with applications in raytracing and performance-focused improvements. The two algorithms can be combined to form powerful tools. The algorithms work when serialized, but the parallel capabilities of GPUs allow them to be (hopefully) sped up.
+Scanning is a simple function: starting at the beginning of an array, step through the array, adding the previous cell's value to the current cell. Stream compaction is the process of removing unwanted values from an array while preserving the order of the wanted values. One example of this is removing all zero elements from an array.
+
+For this project I implemented four versions of the above algorithms.
+* CPU Implementation - To measure the performance of the algorithms in serial execution.
+* Naive GPU - A simple GPU implementation with little thought to advanced algorithms or memory.
+* Efficient GPU - A more focused implementation that takes advantage of the GPU's parallelism.
+* Thrust - A library implementation of the algorithms, for comparison.
+
+To see how each implementation compares, I ran each with varying block sizes to see the response. The results are below.
+
+# Performance Data
+
+I first present some charts showing the performance differences between the implementations.
+
+![](img/scan_cmp.png)
+
+![](img/scan_cmp_ohne_cpu.png)
+
+![](img/compact_cmp.png)
+
+Looking at the first chart, we can see that for values below 1MB, there is very little difference in the performance between the 8 scenarios. However, as we approach 4MB, it becomes clear that the CPU is underperforming compared to the GPUs. In fact, as can be seen in figure 2, the GPU implementations have improved rates with larger data sets. This comes down to a fundamental rule on GPUs: overhead is high when working with small amounts of data. Why go through the process of offloading a scan operation with only 256 elements? An optimized CPU implementation will take advantage of pipelining, caching, and other utilities to greatly speed up the operation. When approaching large data sets that can be worked with independently, we can see a huge benefit to the GPU.
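+
+To make the point about overhead concrete, here is a rough serial sketch of the two operations described in the Goals section. This is illustrative only — it is not the project's actual CPU code, and the function names are made up:
+
+```
+#include <vector>
+
+// Exclusive scan: out[i] holds the sum of in[0..i-1]; out[0] is 0.
+void exclusiveScanSerial(const std::vector<int>& in, std::vector<int>& out) {
+    out.resize(in.size());
+    int running = 0;
+    for (size_t i = 0; i < in.size(); i++) {
+        out[i] = running;
+        running += in[i];
+    }
+}
+
+// Stream compaction: copy only the non-zero elements, preserving their order.
+std::vector<int> compactSerial(const std::vector<int>& in) {
+    std::vector<int> out;
+    for (int v : in) {
+        if (v != 0) {
+            out.push_back(v);
+        }
+    }
+    return out;
+}
+```
+
+With loops this small, the fixed cost of launching kernels and moving data to and from the device dominates for short arrays, which matches the behavior in the first chart.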
+
+Looking at the performance of just the GPU implementations of Scan, there isn't much difference between the Naive and Efficient implementations. The two are roughly the same, and the variation is likely due to noise in the test environment (other processes using the GPU, latencies in CPU scheduling during observation). It is obvious from the data set, however, that the Thrust Non-Base 2 implementation is the most efficient.
+
+Lastly, looking at figure 3, we can see the performance comparison for the Stream Compaction algorithm. The slowest implementation here is the CPU compaction with Scan. This makes sense, since the scan operation is still serialized and takes up a lot of CPU time. This goes to show that some improvements to an algorithm will only benefit systems that can take advantage of parallelism. We see that the Efficient GPU implementation, both Power of Two and Non Power of Two, performs slightly worse than the CPU implementation without Scan.
+
+# Example Run
+
+```
+****************
+** SCAN TESTS **
+****************
+ [ 42 16 18 8 38 28 13 0 26 5 30 4 48 ... 39 0 ]
+==== cpu scan, power-of-two ====
+ elapsed time: 0.0021ms (std::chrono Measured)
+==== cpu scan, non-power-of-two ====
+ elapsed time: 0.002ms (std::chrono Measured)
+ passed
+==== naive scan, power-of-two ====
+ elapsed time: 0.04096ms (CUDA Measured)
+ passed
+==== naive scan, non-power-of-two ====
+ elapsed time: 0.038976ms (CUDA Measured)
+ passed
+==== work-efficient scan, power-of-two ====
+ elapsed time: 0.089536ms (CUDA Measured)
+ passed
+==== work-efficient scan, non-power-of-two ====
+ elapsed time: 0.086976ms (CUDA Measured)
+ passed
+==== thrust scan, power-of-two ====
+ elapsed time: 0.001696ms (CUDA Measured)
+ passed
+==== thrust scan, non-power-of-two ====
+ elapsed time: 0.000608ms (CUDA Measured)
+ passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 2 0 2 0 0 0 3 0 0 3 0 2 2 ... 1 0 ]
+==== cpu compact without scan, power-of-two ====
+ elapsed time: 0.0025ms (std::chrono Measured)
+ [ 2 2 3 3 2 2 1 3 2 2 1 3 3 ... 2 1 ]
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ elapsed time: 0.0025ms (std::chrono Measured)
+ [ 2 2 3 3 2 2 1 3 2 2 1 3 3 ... 1 2 ]
+ passed
+==== cpu compact with scan ====
+ elapsed time: 0.0055ms (std::chrono Measured)
+ [ 2 2 3 3 2 2 1 3 2 2 1 3 3 ... 2 1 ]
+ passed
+==== work-efficient compact, power-of-two ====
+ elapsed time: 0.317888ms (CUDA Measured)
+ passed
+==== work-efficient compact, non-power-of-two ====
+ elapsed time: 0.545888ms (CUDA Measured)
+ passed
+Press any key to continue . . .
+```
\ No newline at end of file

From f1e5d9bccc7f3be5506a43c996a221d74a2339c1 Mon Sep 17 00:00:00 2001
From: John Marcao <–jmarcao@users.noreply.github.com>
Date: Wed, 18 Sep 2019 00:49:58 -0400
Subject: [PATCH 17/19] Wrong readme... again. I need sleep.

---
 Project2-Character-Recognition/README.md | 95 ++++++++++++++++++++++--
 1 file changed, 88 insertions(+), 7 deletions(-)

diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md
index 4503fac..5b63243 100644
--- a/Project2-Character-Recognition/README.md
+++ b/Project2-Character-Recognition/README.md
@@ -1,14 +1,95 @@
-CUDA Character Recognition
+CUDA Number Algorithms
 ======================

 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* John Marcao
+  * [LinkedIn](https://www.linkedin.com/in/jmarcao/)
+  * [Personal Website](https://jmarcao.github.io)
+* Tested on: Windows 10, i5-4690K @ 3.50GHz, 8GB DDR3, RTX 2080 TI 3071MB (Personal)

-### (TODO: Your README)
+# Goals

-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+The goal of this project was to study and analyze several common algorithms in GPU programming. We specifically look at two common operations: stream compaction and exclusive scan. The two operations are common in GPU programming and graphics design, with applications in raytracing and performance-focused improvements. The two algorithms can be combined to form powerful tools. The algorithms work when serialized, but the parallel capabilities of GPUs allow them to be (hopefully) sped up.
+Scanning is a simple function: starting at the beginning of an array, step through the array, adding the previous cell's value to the current cell. Stream compaction is the process of removing unwanted values from an array while preserving the order of the wanted values. One example of this is removing all zero elements from an array.
+
+For this project I implemented four versions of the above algorithms.
+* CPU Implementation - To measure the performance of the algorithms in serial execution.
+* Naive GPU - A simple GPU implementation with little thought to advanced algorithms or memory.
+* Efficient GPU - A more focused implementation that takes advantage of the GPU's parallelism.
+* Thrust - A library implementation of the algorithms, for comparison.
+
+To see how each implementation compares, I ran each with varying block sizes to see the response. The results are below.
+
+# Performance Data
+
+I first present some charts showing the performance differences between the implementations.
+
+![](img/scan_cmp.png)
+
+![](img/scan_cmp_ohne_cpu.png)
+
+![](img/compact_cmp.png)
+
+Looking at the first chart, we can see that for values below 1MB, there is very little difference in the performance between the 8 scenarios. However, as we approach 4MB, it becomes clear that the CPU is underperforming compared to the GPUs. In fact, as can be seen in figure 2, the GPU implementations have improved rates with larger data sets. This comes down to a fundamental rule on GPUs: overhead is high when working with small amounts of data. Why go through the process of offloading a scan operation with only 256 elements? An optimized CPU implementation will take advantage of pipelining, caching, and other utilities to greatly speed up the operation. When approaching large data sets that can be worked with independently, we can see a huge benefit to the GPU.
+
+Looking at the performance of just the GPU implementations of Scan, there isn't much difference between the Naive and Efficient implementations. The two are roughly the same, and the variation is likely due to noise in the test environment (other processes using the GPU, latencies in CPU scheduling during observation). It is obvious from the data set, however, that the Thrust Non-Base 2 implementation is the most efficient.
+
+Lastly, looking at figure 3, we can see the performance comparison for the Stream Compaction algorithm. The slowest implementation here is the CPU compaction with Scan. This makes sense, since the scan operation is still serialized and takes up a lot of CPU time. This goes to show that some improvements to an algorithm will only benefit systems that can take advantage of parallelism. We see that the Efficient GPU implementation, both Power of Two and Non Power of Two, performs slightly worse than the CPU implementation without Scan.
+
+# Example Run
+
+```
+****************
+** SCAN TESTS **
+****************
+ [ 42 16 18 8 38 28 13 0 26 5 30 4 48 ... 39 0 ]
+==== cpu scan, power-of-two ====
+ elapsed time: 0.0021ms (std::chrono Measured)
+==== cpu scan, non-power-of-two ====
+ elapsed time: 0.002ms (std::chrono Measured)
+ passed
+==== naive scan, power-of-two ====
+ elapsed time: 0.04096ms (CUDA Measured)
+ passed
+==== naive scan, non-power-of-two ====
+ elapsed time: 0.038976ms (CUDA Measured)
+ passed
+==== work-efficient scan, power-of-two ====
+ elapsed time: 0.089536ms (CUDA Measured)
+ passed
+==== work-efficient scan, non-power-of-two ====
+ elapsed time: 0.086976ms (CUDA Measured)
+ passed
+==== thrust scan, power-of-two ====
+ elapsed time: 0.001696ms (CUDA Measured)
+ passed
+==== thrust scan, non-power-of-two ====
+ elapsed time: 0.000608ms (CUDA Measured)
+ passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 2 0 2 0 0 0 3 0 0 3 0 2 2 ... 1 0 ]
+==== cpu compact without scan, power-of-two ====
+ elapsed time: 0.0025ms (std::chrono Measured)
+ [ 2 2 3 3 2 2 1 3 2 2 1 3 3 ... 2 1 ]
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ elapsed time: 0.0025ms (std::chrono Measured)
+ [ 2 2 3 3 2 2 1 3 2 2 1 3 3 ... 1 2 ]
+ passed
+==== cpu compact with scan ====
+ elapsed time: 0.0055ms (std::chrono Measured)
+ [ 2 2 3 3 2 2 1 3 2 2 1 3 3 ... 2 1 ]
+ passed
+==== work-efficient compact, power-of-two ====
+ elapsed time: 0.317888ms (CUDA Measured)
+ passed
+==== work-efficient compact, non-power-of-two ====
+ elapsed time: 0.545888ms (CUDA Measured)
+ passed
+Press any key to continue . . .
+```
\ No newline at end of file

From 17c8a3f62bf0a6905ef9e9a4b24df9fc8c685d2f Mon Sep 17 00:00:00 2001
From: John Marcao <–jmarcao@users.noreply.github.com>
Date: Wed, 18 Sep 2019 00:51:12 -0400
Subject: [PATCH 18/19] Revert "Wrong readme... again. I need sleep."

This reverts commit f1e5d9bccc7f3be5506a43c996a221d74a2339c1.

---
 Project2-Character-Recognition/README.md | 95 ++----------------------
 1 file changed, 7 insertions(+), 88 deletions(-)

diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md
index 5b63243..4503fac 100644
--- a/Project2-Character-Recognition/README.md
+++ b/Project2-Character-Recognition/README.md
@@ -1,95 +1,14 @@
-CUDA Number Algorithms
+CUDA Character Recognition
 ======================

 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

-* John Marcao
-  * [LinkedIn](https://www.linkedin.com/in/jmarcao/)
-  * [Personal Website](https://jmarcao.github.io)
-* Tested on: Windows 10, i5-4690K @ 3.50GHz, 8GB DDR3, RTX 2080 TI 3071MB (Personal)
+* (TODO) YOUR NAME HERE
+  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
+* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)

-# Goals
+### (TODO: Your README)

-The goal of this project was to study and analyze several common algorithms in GPU programming. We specifically look at two common operations: stream compaction and exclusive scan. The two operations are common in GPU programming and graphics design, with applications in raytracing and performance-focused improvements. The two algorithms can be combined to form powerful tools. The algorithms work when serialized, but the parallel capabilities of GPUs allow them to be (hopefully) sped up.
+Include analysis, etc. (Remember, this is public, so don't put
+anything here that you don't want to share with the world.)
-Scanning is a simple function: starting at the beginning of an array, step through the array, adding the previous cell's value to the current cell. Stream compaction is the process of removing unwanted values from an array while preserving the order of the wanted values. One example of this is removing all zero elements from an array.
-
-For this project I implemented four versions of the above algorithms.
-* CPU Implementation - To measure the performance of the algorithms in serial execution.
-* Naive GPU - A simple GPU implementation with little thought to advanced algorithms or memory.
-* Efficient GPU - A more focused implementation that takes advantage of the GPU's parallelism.
-* Thrust - A library implementation of the algorithms, for comparison.
-
-To see how each implementation compares, I ran each with varying block sizes to see the response. The results are below.
-
-# Performance Data
-
-I first present some charts showing the performance differences between the implementations.
-
-![](img/scan_cmp.png)
-
-![](img/scan_cmp_ohne_cpu.png)
-
-![](img/compact_cmp.png)
-
-Looking at the first chart, we can see that for values below 1MB, there is very little difference in the performance between the 8 scenarios. However, as we approach 4MB, it becomes clear that the CPU is underperforming compared to the GPUs. In fact, as can be seen in figure 2, the GPU implementations have improved rates with larger data sets. This comes down to a fundamental rule on GPUs: overhead is high when working with small amounts of data. Why go through the process of offloading a scan operation with only 256 elements? An optimized CPU implementation will take advantage of pipelining, caching, and other utilities to greatly speed up the operation. When approaching large data sets that can be worked with independently, we can see a huge benefit to the GPU.
-
-Looking at the performance of just the GPU implementations of Scan, there isn't much difference between the Naive and Efficient implementations. The two are roughly the same, and the variation is likely due to noise in the test environment (other processes using the GPU, latencies in CPU scheduling during observation). It is obvious from the data set, however, that the Thrust Non-Base 2 implementation is the most efficient.
-
-Lastly, looking at figure 3, we can see the performance comparison for the Stream Compaction algorithm. The slowest implementation here is the CPU compaction with Scan. This makes sense, since the scan operation is still serialized and takes up a lot of CPU time. This goes to show that some improvements to an algorithm will only benefit systems that can take advantage of parallelism. We see that the Efficient GPU implementation, both Power of Two and Non Power of Two, performs slightly worse than the CPU implementation without Scan.
-
-# Example Run
-
-```
-****************
-** SCAN TESTS **
-****************
- [ 42 16 18 8 38 28 13 0 26 5 30 4 48 ... 39 0 ]
-==== cpu scan, power-of-two ====
- elapsed time: 0.0021ms (std::chrono Measured)
-==== cpu scan, non-power-of-two ====
- elapsed time: 0.002ms (std::chrono Measured)
- passed
-==== naive scan, power-of-two ====
- elapsed time: 0.04096ms (CUDA Measured)
- passed
-==== naive scan, non-power-of-two ====
- elapsed time: 0.038976ms (CUDA Measured)
- passed
-==== work-efficient scan, power-of-two ====
- elapsed time: 0.089536ms (CUDA Measured)
- passed
-==== work-efficient scan, non-power-of-two ====
- elapsed time: 0.086976ms (CUDA Measured)
- passed
-==== thrust scan, power-of-two ====
- elapsed time: 0.001696ms (CUDA Measured)
- passed
-==== thrust scan, non-power-of-two ====
- elapsed time: 0.000608ms (CUDA Measured)
- passed
-
-*****************************
-** STREAM COMPACTION TESTS **
-*****************************
- [ 2 0 2 0 0 0 3 0 0 3 0 2 2 ... 1 0 ]
-==== cpu compact without scan, power-of-two ====
- elapsed time: 0.0025ms (std::chrono Measured)
- [ 2 2 3 3 2 2 1 3 2 2 1 3 3 ... 2 1 ]
- passed
-==== cpu compact without scan, non-power-of-two ====
- elapsed time: 0.0025ms (std::chrono Measured)
- [ 2 2 3 3 2 2 1 3 2 2 1 3 3 ... 1 2 ]
- passed
-==== cpu compact with scan ====
- elapsed time: 0.0055ms (std::chrono Measured)
- [ 2 2 3 3 2 2 1 3 2 2 1 3 3 ... 2 1 ]
- passed
-==== work-efficient compact, power-of-two ====
- elapsed time: 0.317888ms (CUDA Measured)
- passed
-==== work-efficient compact, non-power-of-two ====
- elapsed time: 0.545888ms (CUDA Measured)
- passed
-Press any key to continue . . .
-```
\ No newline at end of file

From b170f679c0be8ddd3f10387f678e46b311cbb9d1 Mon Sep 17 00:00:00 2001
From: John Marcao <–jmarcao@users.noreply.github.com>
Date: Wed, 18 Sep 2019 01:34:36 -0400
Subject: [PATCH 19/19] Sad Readme

---
 Project2-Character-Recognition/README.md | 31 +++++++++++++++++++-----
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md
index 4503fac..e15fe15 100644
--- a/Project2-Character-Recognition/README.md
+++ b/Project2-Character-Recognition/README.md
@@ -3,12 +3,31 @@ CUDA Character Recognition

 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* John Marcao
+  * [LinkedIn](https://www.linkedin.com/in/jmarcao/)
+  * [Personal Website](https://jmarcao.github.io)
+* Tested on: Windows 10, i5-4690K @ 3.50GHz, 8GB DDR3, RTX 2080 TI 3071MB (Personal)

-### (TODO: Your README)
+# Goals

-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+The goal of this project is to explore one of the many applications of parallel programming: Machine Learning. We start by implementing a Perceptron, a basic ML construct that uses several layers, each with varying weights and biases, to implement a character recognition machine. The Perceptron takes in data in its input layer and then utilizes parallel algorithms to perform the matrix multiplications and other operations that transform the input data into a guess at the output. With enough "training", the Perceptron can detect what character is written in an image.
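+
+As a rough sketch of that forward pass (illustrative only — the layer shapes, names, and sigmoid activation are made up for this example; the project itself drives the matrix math through cuBLAS rather than plain loops):
+
+```
+#include <cmath>
+#include <vector>
+
+// One dense layer: out[j] = sigmoid(sum_i W[j][i] * in[i] + b[j])
+std::vector<float> denseLayer(const std::vector<std::vector<float>>& W,
+                              const std::vector<float>& b,
+                              const std::vector<float>& in) {
+    std::vector<float> out(b.size());
+    for (size_t j = 0; j < b.size(); j++) {
+        float sum = b[j];
+        for (size_t i = 0; i < in.size(); i++) {
+            sum += W[j][i] * in[i];
+        }
+        out[j] = 1.0f / (1.0f + std::exp(-sum)); // sigmoid activation
+    }
+    return out;
+}
+
+// Forward pass: input -> hidden -> output, then take the largest output as the guess.
+int classify(const std::vector<float>& pixels,
+             const std::vector<std::vector<float>>& W1, const std::vector<float>& b1,
+             const std::vector<std::vector<float>>& W2, const std::vector<float>& b2) {
+    std::vector<float> hidden = denseLayer(W1, b1, pixels);
+    std::vector<float> output = denseLayer(W2, b2, hidden);
+    int best = 0;
+    for (size_t k = 1; k < output.size(); k++) {
+        if (output[k] > output[best]) best = (int)k;
+    }
+    return best;
+}
+```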
+
+The perceptron I designed has three layers:
+* Input Layer - Accepts formatted image data
+* Hidden Layer - Intermediate layer that reduces the number of datapoints in the image by 80%
+* Output Layer - Final layer that produces an output based on the weights learned by the Perceptron.
+
+Unfortunately, I was not able to get my Perceptron up and running in the allotted time. Since I cannot discuss performance characterization, I will go over some issues in my design and some lessons learned.
+
+## What is Working
+
+* Forward Propagation... sort of. My machine is able to take an input and feed it through the perceptron to form an output decision. There were some changes I had to make in my system that deviated from the traditional models I studied. My node values were particularly high, so much so that the Softmax equation applied to the last layer would fail due to overflowing float values (e^1023043 is too much?). I remedied this by adding a step where each value in the output layer is scaled down such that the Softmax equation still works.
+* Matrix Manipulation - Using the cuBLAS library, I was able to set up several functions and calls to perform a variety of transformations on the low-level matrix values through my more high-level classes.
+
+## What is not Working
+
+* Learning/Backpropagation - Right now, the system can go through one learning epoch and then apply the deltas, based on the error, to the weights of the system. However, during the second epoch the system diverges and my float values overflow. I am not sure why this is the case. Thoughts include inverted operations, inverted matrix indices, etc.
+
+# Challenges
+
+The most challenging part of this project was getting the complexity under control. The perceptron has a lot of moving parts and a lot of equations, and getting them confused and mixed up is easy. There is also additional complexity with the introduction of the cuBLAS library with CUDA. The library is incredibly powerful, providing several functions for vector and matrix operations. Part of the challenge of this project was understanding the library as well as its API and properly using it. I found that there are a lot of math concepts that, although vital, were lost on me.
\ No newline at end of file
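+
+A side note on the Softmax overflow mentioned under "What is Working": the usual fix is to subtract the largest input value from every input before exponentiating, which leaves the result mathematically unchanged but keeps every exponent at or below zero. A minimal sketch, not taken from the project code:
+
+```
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+// Numerically stable softmax: softmax(x)[k] = exp(x[k] - max(x)) / sum_j exp(x[j] - max(x))
+// Assumes x is non-empty.
+std::vector<float> stableSoftmax(const std::vector<float>& x) {
+    float maxVal = *std::max_element(x.begin(), x.end());
+    std::vector<float> out(x.size());
+    float sum = 0.0f;
+    for (size_t k = 0; k < x.size(); k++) {
+        out[k] = std::exp(x[k] - maxVal); // exponent is <= 0, so no overflow
+        sum += out[k];
+    }
+    for (float& v : out) {
+        v /= sum;
+    }
+    return out;
+}
+```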