From 96e40a5712db4ba821733c0b4798e71abb5f1aa0 Mon Sep 17 00:00:00 2001 From: tabathah Date: Fri, 13 Sep 2019 12:37:37 -0400 Subject: [PATCH 1/7] Stream compaction part complete --- .../stream_compaction/CMakeLists.txt | 2 +- .../stream_compaction/cpu.cu | 55 ++++++- .../stream_compaction/efficient.cu | 145 +++++++++++++++++- .../stream_compaction/naive.cu | 56 ++++++- .../stream_compaction/thrust.cu | 13 +- 5 files changed, 253 insertions(+), 18 deletions(-) diff --git a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt index cdbef77..6444fc7 100644 --- a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt +++ b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_37 ) diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu index a2d3e6c..dac6329 100644 --- a/Project2-Stream-Compaction/stream_compaction/cpu.cu +++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu @@ -18,9 +18,18 @@ namespace StreamCompaction { * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); + bool standalone = true; + try { timer().startCpuTimer(); } + catch (std::exception) { standalone = false; } + + int sum = 0; + for (int i = 0; i < n; i++) + { + odata[i] = sum; + sum += idata[i]; + } + + if(standalone){ timer().endCpuTimer(); } } /** @@ -30,9 +39,19 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int idxInOut = 0; + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + odata[idxInOut] = idata[i]; + idxInOut++; + } + } + timer().endCpuTimer(); - return -1; + return idxInOut; } /** @@ -42,9 +61,31 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int* temp = new int[n]; + int* tempScan = new int[n]; + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + temp[i] = 1; + else + temp[i] = 0; + } + + scan(n, tempScan, temp); + + int num = 0; + for (int i = 0; i < n; i++) + { + if (temp[i] == 1) + { + odata[tempScan[i]] = idata[i]; + num++; + } + } + timer().endCpuTimer(); - return -1; + return num; } } } diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu index 2db346e..1e268e2 100644 --- a/Project2-Stream-Compaction/stream_compaction/efficient.cu +++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu @@ -12,13 +12,102 @@ namespace StreamCompaction { return timer; } + #define blockSize 128 + + __global__ void kernMapToBoolean(int N, int* arr, int* boolArr) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index > N) return; + + boolArr[index] = arr[index]; + if (boolArr[index] != 0) + { + boolArr[index] = 1; + } + } + + __global__ void kernUpSweep(int N, int d, int* arr) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index > N) return; + + int powDPlus = pow(float(2), float(d + 1)); + int powD = pow(float(2), float(d)); + + if (index % powDPlus == 0) + { + arr[index + powDPlus - 1] += arr[index + powD - 1]; + } + if (index == N - 1) + { + arr[index] = 0; + } + } + + __global__ void kernDownSweep(int N, int d, int* arr) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index > N) return; + + int powDPlus = pow(float(2), float(d + 1)); + int powD = pow(float(2), float(d)); + + if (index % powDPlus == 0) + { + int temp = arr[index + powD - 1]; + arr[index + powD - 1] = arr[index + powDPlus - 1]; + arr[index + powDPlus - 1] += temp; + } + } + + __global__ void kernInclusiveToExclusive(int N, int* arr) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index > N) return; + + arr[index] -= arr[0]; + } + + __global__ void kernScatter(int N, int* idata, int* boolArr, int* scanArr, int* odata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index > N) return; + + if (boolArr[index] == 1) + { + int idx = scanArr[index]; + odata[idx] = idata[index]; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int* dev_arr; + + cudaMalloc((void**)&dev_arr, n * sizeof(int)); + cudaMemcpy(dev_arr, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + for (int i = 0; i < ilog2ceil(n); i++) + { + kernUpSweep << > > (n, i, dev_arr); + } + + for (int j = ilog2ceil(n)-1; j >= 0; j--) + { + kernDownSweep << > > (n, j, dev_arr); + } + + kernInclusiveToExclusive << > > (n, dev_arr); + + cudaMemcpy(odata, dev_arr, sizeof(int) * n, cudaMemcpyDeviceToHost); + timer().endGpuTimer(); + + cudaFree(dev_arr); } /** @@ -31,10 +120,56 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; + int* dev_boolArr; + int* dev_scanArr; + int* dev_idata; + int* dev_odata; + + int* host_boolArr = new int[n]; + int* host_scanArr = new int[n]; + + cudaMalloc((void**)&dev_boolArr, n * sizeof(int)); + cudaMalloc((void**)&dev_scanArr, n * sizeof(int)); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + bool standalone = true; + try { timer().startCpuTimer(); } + catch (std::exception) { standalone = false; } + + kernMapToBoolean << > > (n, dev_idata, dev_boolArr); + cudaMemcpy(host_boolArr, dev_boolArr, sizeof(int) * n, cudaMemcpyDeviceToHost); + + scan(n, host_scanArr, host_boolArr); + cudaMemcpy(dev_scanArr, host_scanArr, sizeof(int) * n, cudaMemcpyHostToDevice); + + kernScatter << > > (n, dev_idata, dev_boolArr, dev_scanArr, dev_odata); + + if (standalone) { timer().endCpuTimer(); } + + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + + int num = 0; + for (int i = n-1; i >= 0; i--) + { + if (host_boolArr[i] != 0) + { + num = host_scanArr[i] + 1; + break; + } + } + + cudaFree(dev_boolArr); + cudaFree(dev_scanArr); + cudaFree(dev_idata); + cudaFree(dev_odata); + + free(host_boolArr); + free(host_scanArr); + + return num; } } } diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu index 4308876..f42ecc3 100644 --- a/Project2-Stream-Compaction/stream_compaction/naive.cu +++ b/Project2-Stream-Compaction/stream_compaction/naive.cu @@ -11,15 +11,67 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + #define blockSize 128 + + __global__ void kernNaiveScan(int N, int d, int* read, int* write) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + int start = pow(float(2), float(d - 1)); + if (index >= start) + { + write[index] = read[index - start] + read[index]; + } + else + { + write[index] = read[index]; + } + } + + __global__ void kernInclusiveToExclusive(int N, int* read, int* write) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + + if (index == 0) + { + write[index] = 0; + } + else + { + write[index] = read[index-1]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int* dev_read; + int* dev_write; + + cudaMalloc((void**)&dev_read, n * sizeof(int)); + cudaMalloc((void**)&dev_write, n * sizeof(int)); + + cudaMemcpy(dev_read, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + for (int i = 1; i <= ilog2ceil(n); i++) + { + kernNaiveScan << > > (n, i, dev_read, dev_write); + std::swap(dev_read, dev_write); + } + kernInclusiveToExclusive << > > (n, dev_read, dev_write); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_write, sizeof(int) * n, cudaMemcpyDeviceToHost); + + cudaFree(dev_read); + cudaFree(dev_write); } } } diff --git a/Project2-Stream-Compaction/stream_compaction/thrust.cu b/Project2-Stream-Compaction/stream_compaction/thrust.cu index 1def45e..6183414 100644 --- a/Project2-Stream-Compaction/stream_compaction/thrust.cu +++ b/Project2-Stream-Compaction/stream_compaction/thrust.cu @@ -18,11 +18,18 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::host_vector host_idata(n); + thrust::copy(idata, idata + n, host_idata.begin()); + + thrust::device_vector dv_in = host_idata; + + thrust::device_vector dv_out(n); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } } From a1a88bb1fd83dad44ebeef0b4a5177dfbcaa47e8 Mon Sep 17 00:00:00 2001 From: tabathah Date: Sat, 14 Sep 2019 16:39:37 -0400 Subject: [PATCH 2/7] performance analysis and readme --- Project2-Stream-Compaction/README.md | 99 ++++++++++++++++-- .../img/arraySizeChartFull.JPG | Bin 0 -> 30923 bytes .../img/arraySizeChartZoom.JPG | Bin 0 -> 36573 bytes .../img/blockSizeChart.JPG | Bin 0 -> 26446 bytes .../stream_compaction/efficient.cu | 32 +++--- 5 files changed, 113 insertions(+), 18 deletions(-) create mode 100644 Project2-Stream-Compaction/img/arraySizeChartFull.JPG create mode 100644 Project2-Stream-Compaction/img/arraySizeChartZoom.JPG create mode 100644 Project2-Stream-Compaction/img/blockSizeChart.JPG diff --git a/Project2-Stream-Compaction/README.md b/Project2-Stream-Compaction/README.md index 0e38ddb..ddf8108 100644 --- a/Project2-Stream-Compaction/README.md +++ b/Project2-Stream-Compaction/README.md @@ -3,12 +3,99 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Tabatha Hickman + * LinkedIn:https://www.linkedin.com/in/tabatha-hickman-335987140/ +* Tested on: Windows 10 Pro, i7-5600U CPU @ 2.60GHz 16GB, GeForce 840M (personal computer) -### (TODO: Your README) +## Stream Compaction -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Approaches +Stream compaction is the process of removing null or otherwise useless values from an array. Naively, you could just traverse each element one by one, removing it if necessary, but there are other methods that can be used to reduce computation. One of them involves using a prefix-sum scan. If we create a mirror array to the original which has boolean values (0 or 1) representing if the value is null or not, then perform a scan on that, we will find that for each value which should be in the output of stream compaction, the mirrored value in the scan output is its index in the output array. + +So now we should think about how best we can perform the prefix-sum scan. There are several approaches I implemented that accomplish this: + +* CPU Naive approach: A simple for loop which keeps track of a sum, goes through each element in the array and adds its value to the sum. +* GPU Naive approach: We reduce the number of computations to n times log base 2 of n by looping through each power of 2 from 1 to the first that is greater than or equal to the size of the array and summing pairs of numbers with that power of 2 distance from one another. As this adds up we get finally get the final sum adding the sum at the halfway index to the sum at the last index. +* GPU Work-Efficient approach: This approach further optimizes the scan by imagining the array as a binary tree and summing pairs of two iteratively until we get to the top of the tree with a final sum. This is considered the upsweep phase. It works, but it only sets the final index with the correct sum and neglects the others. So there is next a downsweep phase where from the top of the tree to the leaves, we take two nodes and swap their values with the lower index one adding its value to the current sum of the right one. By the time we get to the bottom of the tree, where we have each index in the original array, they all have the correct output for the prefix sum. +* GPU Thrust approach: Simply invokes the scan function of the Thrust library. + +### Performance Analysis + +Optimal block sizes for each scan approach: I tested different block sizes to find the optimal one for each GPu implementation and got these results: + +![](img/blockSizeChart.JPG) + +* Naive: 128 +* Work-Efficient: 128 + +Comparison of all scan approaches: + +Here is a chart of the results for all approaches with differing array sizes. + +![](img/arraySizeChartFull.JPG) + +And here is a zoomed in version of the chart so you can see the comparisons at a more detailed level. + +![](img/arraySizeChartZoom.JPG) + +The CPU and Thrust implmentation appears to be pretty stable until we reach a size of 2^16 or 2^17. Though Thrust starts out as one of the worst times at the lowest array size, which leads me to believe a lot of the computation for Thrust involves the overhead of starting it up and using the library, and is unrelated to array size until we reach really large numbers. Since the CPU implementation should have O(n) time, I was surprised to see the slope was so minimal as the array size increased by powers of 2. However, the CPU implementation is really only doing one calculation per element so maybe the simplicity of just using a for loop has so little overhead is what makes this implementation so much faster. + +As for the Naive and Work-efficient GPU implementations, both seem to follow the same exponential curve, with the work-efficient consitently about double the time of naive. Both are far worse than the Thrust or CPU implementations, which only grows more clear as array size increases. I think that these two implementations are worse only because the task at hand is so simple. The overhead of invoking multiple kernels, performing multiple loops, and in some cases needing to swap buffers seems to be more trouble than it's worth. As for the difference between the naive and work-efficient, I would guess that the extra kernel is what causes the doubling issue, since almost the same calculations need to be done in the two implementations, but in the work-efficient we split it into upsweep and downsweep which need to be completed in sequence so it all takes twice the time. + +Output of test program: (2^8 array size) + +``` +**************** +** SCAN TESTS ** +**************** + [ 11 10 31 32 43 9 48 31 19 2 32 39 22 ... 12 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.000789ms (std::chrono Measured) + [ 0 11 21 52 84 127 136 184 215 234 236 268 307 ... 5946 5958 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.000395ms (std::chrono Measured) + [ 0 11 21 52 84 127 136 184 215 234 236 268 307 ... 5870 5891 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.052672ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.05264ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.090496ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.091072ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.095808ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.103232ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 0 1 1 1 2 2 2 3 0 3 3 3 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.001579ms (std::chrono Measured) + [ 2 1 1 1 2 2 2 3 3 3 3 1 2 ... 1 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.001579ms (std::chrono Measured) + [ 2 1 1 1 2 2 2 3 3 3 3 1 2 ... 2 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 5.07698ms (std::chrono Measured) + [ 2 1 1 1 2 2 2 3 3 3 3 1 2 ... 1 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.08976ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.089792ms (CUDA Measured) + passed +``` diff --git a/Project2-Stream-Compaction/img/arraySizeChartFull.JPG b/Project2-Stream-Compaction/img/arraySizeChartFull.JPG new file mode 100644 index 0000000000000000000000000000000000000000..34d1a891fb1ec279073dacfbd69f17fd26ad6e4f GIT binary patch literal 30923 zcmeIb2Ut`~)-Jpekf?}~vt-FRLyHQ6NX}Vu&Zz;R0R<&XmYlPI4^Ua<6U)!f1*6v-ktLm*)t5&_Wb_<_?F93I*NlHrsNJvNk81WB) zFWwoF09%*>fUGRQ1ONa!fPy3p+(1YuDo><X2LPkQ|c$Sp>{VyU1aHnOH3_wGO z*obQ}L;}b*f4%cv{`vWdz)u8zBJdM|p9uU!;Qv7cP~Rc{;z8gcJcak~P=1#`L`M7~ zBXjToKxE_}@*my=ddPRYetHxC4{FWNF8@T}Cjvha_=&(z1b#q(i-S{8h(l0_lZT3v zSBRHWh(`eUOWy&23h)GwLX?CrViA}-JKGDfv)ekc85r9cny?w!*|38R?AbZlIM@LZ z5ZKYj$ce5l3TFA*Cmhf69Tl5~coATio2- z*xb0;>>SP5IRym;**Un_xwu#nIarT~e zb8!}>c5$&V7BV$3Hzp73IM88-17vV?r5s^pev7?EBv&j=g z68k=!tQ>-@oIEPO4KW)BH?Ii$f5|7p{vC`T8v3to`QL?>n252FkkR+H+d2MPy|RhJ zKR4Re7Gemh?bpsBlAxLQG=$IIocvx6?Ok_l4O#ko){u#hUM_Rf8K}MnmZr~yz<08S^07}F;BP!x# z^84ZBpFT)8kWo-?qM>78Vj(hA-T`hPAtT>FK}JPIK^#jWc_HotD7dJ1?{kXZyr*b@ zM(u#dwwT8=D7*N5?0pXXh7}-}{9GAphE} z-+T67`h|<=*9{aDWE8aT{X)9ohDgY`D5&>2Z{8JGL^E)>N6qDnj`t`cy|fvFhFfVL z-_UUolYo|Ik?!Do*M8~Q-`6p}|5nd_@7N#xngXzqkr0E2j0=E(OUWR{v>X5JWF__X z3wewT86yL+*vKZpm=W|hQbfF>j38zKm$U$&aAQyk-2_#ND#TBCXaGq9PvPI4kc}l` z`NG6w>{Z|7@vcsd`#^cEmO-~P#j~lk=R~`bMLI7|+iN{fMGo}hbjIqf9QeatGK(lY z+y8tfsfmxFadi9U@__L^_d~priaxZ~soa3$D;sjD$iv6BEpC)%Uh%A9Zf3}@lt3Ys z4^E1dpAU%$(jC`1`|8P8vY3A^eoE(ku#RVztEddl7+PI!ryz{QayJRXswGi>LMw?4 z?!g`KGT(2^?rRDXQDiwiY$rQ&sE}KK_zwLbh9U1#;il!wrzvX^Hm0qu)k+&x3tq(* z^cqv#P;o*u?|D6DC$)p;u6&nnr(PctLg(_~0EO#%vq*EbgJGvXa7VgW{n=0@It**m zl}lqLdpbBYcIO@jDT{-Dhn)HNOJSZWF}x95=brj?7FHDgD%b69u7J>oOh~uhwMHZW zo6dH3b@Z*2*DpW4pR!HIqh2(+7Z@oOI9;oOG;k22GIjPA^s$5|D|WoMx~Yz!1QcKK zs8>uU#3?Fendn0}#)Jjuma?ve9>pjNk@ow(v9zJk&mV>HjSW4`1a`E%c?J`8_d`#T zs1kirY--7$1Rk-8Wf>i$-P{y=APZ`POhQa7lt_u$i*wcCfKqVMt(uicQ3>u=*!@wL z{_>gbv|wGKRS*|*YE@p*P9oErsI zf<`IA2dXbcE%(74gALp{TO@js-OQE*XM+{?Yj)k=3Mm`wiv`*N4}7N-;SrOM`^jx; z@mD5StT7+SWx8As&W&OC>6x2xTG5w&x)xCyMcKpR_j;0a7&oX_ZsRPU6hqn)WY+>; z+T?25Hk1t?pOfsAQ0dm^&@vU2qB^310@KhNka726xgWrCnpj0i!A_QaqZ#jrJcGi{|u?-}fFD1AbG!1Go`s{Sc+O6H6L3TQumlTeJd zY4ErNN800*n@N^`DwaouihPo$gaK(wVz zu|#_vo=RZ7q1E|z3dUGZcf&BxnkfcnDk&83m#7gh=69nk>})IJG~*RsPD&#yv8o^W zcBm?)-d>&azJ)?R0G>U6FlwoNR=Ij z0~Ju|0*iNAur=9<5|>r43nJq@9oJnaovK1+ zW2c3jY>rgTkq>)QY>ty?!2t{NL&;%MMQ2xRq-eFxS0T%#MK(@t_cjQ7P!evY=V|s3 zJU>-*pRj(X7*)&}=5<{dRz0xSv`BCeGx{k?^fgJwE0dfHNQ2G;lx=dXd&AQk56NZc zO1-ky`P@jG*Ty2{X0RKpZ}A6?I>#D3mWWDiY+;TADnAID2(7IJFq3Ywr)GoaK0 zsM7&o1-d>!?sdDf2KZ*k=4?7z6rnh7vLaeYfO+%&XEKvFg6o`e2dZ&UT26D1ULn48 z-jr0Hd6$=Q?ub#>I*-?52L@<}IcZnibWEZo#c2*R$fUKA4Od|51tCW#{gE zo0iPRcv&BLLr9K1onIWVW1Kpnq&MUOb^)83XU1aKd^0!mdx6}xT-r>v zaz`A+@0Jrm9b1u!S+4F=W9=)DQi8cN9N+@++N`L_W|vnc*|A6)r8u^oOz-db_ukt~ z@IR)X(v{rm)4+PbnURqZFdb2|)7?yyQh`)0flvA{q%DeGJ}U%N1V~FroEpQ9o6b&} zI>O5t&a&Y^QxS{@i4~u4GU{pw4$P0i0TQv^)VVkqtonQar9wsDRDMqe4nVY_=2u)0 z5B;ml`Q?Xq2Nk{IL5=L!*LDY)+uhe2ki9KPC?NCm@jt=&zgKfq{1wZxyWxPPB&c@) zC0bDe!26F&vThp7X7Bt}QA$Pq0imFd{j;|j2RCLa`1a6!Kcj}ZAtzB9bv~QS%T?l~ z&P8+XV^U6jARw!mmJ!6%+w$Mu2K+Nyi2qI5+C9=u(2M<yc#T>Kvzpnj!|yx#eK}`y-?Q zv^Cd9Fj3HRwG-+*3CuK9WZ?El;(xXu!A~vfw7j&E z&eQ8Nu|hgBi6rh7RKP)v!$uQa@Us_f1_m-}CU|C@6!t?4BpigArk}6J@kfd?Ywa4A zHoV2U$)qkSwbxY?jx}4X!wGgU&T5jg&wcB_CfZ%Q5e zp`Rp0TFI}*cnt)-lDQ~3AN)=DZgKUT;6Q&xQvw_arT9bqZd`xM^)vfVWd2ii2Fb-f zHa2qRxdWM$?k6A9npC9eMpz-FNGGJ6Qx&ws+2j3~GFk6nwP5V>P6cj*o%8v{+pEl| zfy?EiB6jS_Pce&ER!%ky@6~(XeRDTm8{daAe@)#hbqJSYhR${7;>;|3V=H&PV1)x+ z(-|2@sW7i>=s48;H^FL9q(zH~T@VheP3iYRI$~%|t%-Ee3h`k9KC~G@jKFWHK&v?1 zq)5B{c3Py61XCYn9Cw@Q_->WLs}84=wMyL#AFk=vs=Hcr4FLzSg3t45^uL-&GFL3g zUVUz+A?_4qnoG^q+Hm!o>bQ?59*kR4YRW(tOcqOfLK~%K{!b7t{~K%hcUf?z;#`k9 z*Ek38_5F~KXtmp*d%n+<6lF_0*kmPK1d`=pOzdypO@=4Tjrc>Jv-i%I^yvBO$hY>; zyEKfNU1Y^WD~(3ZjLwt1Rj1W&-e7GWi#>9?a2CaGk(N~q8&JDAL2M8jk-z?@Z4-WJ z&$2hQB`VLAmk>={jn_8X*;DcJs7rMdJqRUK1#&m|ddrihGv{9JYA0`Vc+{z7v|Cj5 zMM{`H{*JDu4JzbNn3YhTL@(IN6pgckf4QDQep6O2BWNc?g*v0^DN5EF14BKcn!Ix_|?>^s=L>FC0PO>ANl#(8#(LZQTzDV zv;%@!Zm`rRba}w?O{N~JR&0fEtPcxBd?8H)@-6s8Gvd-C`nY?@hCbewwV~cMjH)Xt zwqpZlzn~Z=eYFVez)LiCI<8WaaAH%;C+^~a9-2*x7Zn0LH2lo5b#Cgzx1Le3Nmyib zL&VjMKWrD*uu7FR>0YFUKXO=BSZn?bA67*TqhEF=F652Fvb^0DGE_nI+&3Q6W7!Zj z)9!YlY1LDrhF2Ta!0 z)n6AEv_c7Sa9o0iFwAgL|u=-%8((Y04vw1fP)xY$_)5Z$1MpLnQ66hHv%n#I1hmrA|1Hzq-cEm zQJhz+MVMG5qT#?~l~8h|dn(uFBZXdv5%@#An9F?QP0BB8IfA)gqwh05Hl_T)ANm49xc>XbY@1eOH_(r(9=^InB2%|HM8 z#?-br%RY$<=YhN?$98(@q$8Kq3iens9#R)ujGLWN=B$Q=o40E`_^{8;iAB$4{C!B7 zbdBJnwl3{havV8SOR#IpD#nTIqsx0XY?&^rA5f#;4=O4?w&|L2U0I0e*P2G`t3xv! z+X7b<*KZtLszfWG=plB5{k=_*aKNA94^c(}kXUJgLVGUQ&lP_YR4RUE_y38^f07bZ zhAuTez2a_l<9g5*iNgdcbhyM$ZOO2}qR{;I5-2#Gtt0`M*Qk&k)VZ9lWmj5U3Adxl@ zTb|`>X-AM|a@Y;uk*Epfd~$*(#u1O1vAI5W(G2+r*7vD@zI>gyx7Pqda7S|q=47y( zN@w>9iy{OEdBI}rt9|Q~{!!W`d4p+f$sG%a4L$10k(~-FtS!@OzLwGKpc`cbXpVa= zr8LOnLBsn2sZJ(#c``0mxBO;xD@rFHEwbc!REQ90-5{sE`+ap&FAj{f_Y1sH~&mboJfx2E`KL{_pMoy?pTmYR@XUjX(|0CK)mhTt ze*H!YUh|!*ddX||GFP#2gcbFbzD{7|FxZF)y)g1A>HA_i=BTlklxY@XwAJwq+^lo& z$3BSE6NP*z+oj}Y6n;DXna0OSdk%S6GSw_E(Thu#?~7+E-G_U{laFi^+-|qJZX7;d)si}mUxY!O!z!#dHA^9X^zpT z_f zQ_pbs{R^H5+P!7cK%Ue)T9#^4qbbQev3+Q9tXXk5T)VR^xdLR4Xiibu-guwtMx1kM z>qfRusD|!DPPGi$bgG)rz7s!bR~j2T?QcJ=3y2`k-kFKDLDuL4MSnvMm@9hKA$R*} zw(KGiJ+f*;y{CbTn}?Q_r#j0!u8f=^DPp@g+Os~uSv9%jG3^xHK+vYAQbc*2uV1-= zd{2@w@+OBDdJ#!cp2fbUYp9K@>0#m?x*0g|ln!0s^eZEBGG9VY`GPL`q)zpeNTEe8 zyF2YIt_uiP{2c<6sIiKd?5@=$E^T7Z2V^W<=o-3thi9N$VQNn=;t>a|%fmOar1=+%$*0< zlaRe($S-T!X*6uR>1?B^CA=c{%mhIu%fj$k@sJ3ol&{JVd`%9*;E4^T&c?y6`Om*Q z0zV)B6P$lmbC{TuACYQnnng?9rDbbtYxNO}v8>g9OAv<-v z9TsOKOXbP73s0!5tl`st_@l-cdUb0?d&pGinpu2yQot_ z#{rK;;pT9Tf8mDYC6SVFy;Xi`sZq4lZFSlgemLZv&xZG%yTNra%a(lAXUa3L?8hEn zaK=a|1j)~RMo#qrHvV?t$lnSo`RDI3_n&aX0SwAEDeLjI>dO1t!&pcBTZdz7AtDmj z;YO#PYH&cdhDgUH6w(NzytcG^^-{<=k;cunX(JB#okzyW^vN#da&*;@a*U|}chZXe z@FJy{*h?#<^zLloQz-1IEMhszasN&#qo}KMYEE@KI4)8L$_P_@R!_0u8wSs}D_N>{ zNv2vq)aZW%@&_TpHMT~>i7cbG>e}?khU6B=Fy1?}u6xDhpIxe(Wm(vpU+VCwwSEh6 zocK0f3)0%v-#)(hG}46P0Gc1Eo(x~=X%s?iQd?a8fAB;oOy_XN^q&q_TtJ0$WKV@l z#56;9NmfjPnZ5U@_!d*-pY0x-HeL~jT|KWJeqv_rpxwV(Jb(4g;kexd9Mz_NX_`%5 zm3*t5#R6{}N6_$@NDIduY^QEPk=vqH>d!}}POp@!B9vW1yWaAc-H*Ce11K{Gw) z!(;w8V}kzq;rnZZiicX{xHFs2)aiq^mxCh$^ibke2s2&h^Y|lIX+Z66HkW3Yjv5Q&=ab zQ0&=3ai4@BQjLJpx(3yda`wco^>x>{Ynj4`^L@@TH+*3GYW#NZTNGe!Ah z0V0%;Lkvn6u~*+!VqA?P!7U%;?=AL}*CSLbNsQrIXKJ7QO<1C#3)U#ex{mmwE`!41 zL{xrJ*91?Em+4v7OqP{gLM}cY;+)FV{Kvy8hdH(?T|1r1!KkMdjmb7sYyEX~;2UfU zeEA^+3{>|f9JDEjh8c!<#wt@fI=Y|u$FQurKQ=4m7V?u{(}4pa)=M=9y3q+KF$J5h zxKkV|A8(7~%XlrRl&BJIZFGv*fS-|~Tbq%X#^r5v8TZhqH#w2rmOM2I0_F?q0Q^~gzG1>~6e5|}Z*OR_9CIP`lL43Pz3ifi z~N%L~lo8gUI8l5sec-sFDj0ihHTJpJ zDICCg*0%2!F!yEtO7djZj_u|{s=Gm6jyMd^v*OI(*n?sb)(6kzan+YR*}AsdM?dc z?pD@pi+QVBv#`T&j-QdD_0rwZQr6%=kuE=iP&_(fUujy@USc4B@Yv-1(<&a-Nxta( zLf~I(@He5qdJFiWCqJ+#X&ApPDGSzYJVjR0RrL)AHoK|Zd+awX^Zcb<1q6YE0S|0- z`NawA#mX3K-ux41_Si_P?4zy!&M^-ucQY^15^umYW6#PR7%; zrXQPGkvEp11o2K42xA^D!zR@ex1ZYa!=z5KVeL=jV0tnTI+%5A5MznL;UfeUggNC8 zB{5Cq{Wfe9MAv;H=HrA1z$LMo#=jVDw2w50Y-)T;h^F77P*oc)}E@L&_- z3JU(t*#EgR^anljBUSTnk{$GT#_3W_2An@M4BsiY-s*n>2PolyuJw6t8%DZCR)=`~ zdt6KpNQIZbDVVu+$BuaPe{Z~NeBbO1q?D(LZxJ$V z3s`WEq9F(XN=UYIw4%-^15>19oN~?`g`g0GKIF0dHsk){-w*otAXeA0{Z}|Uq@PB_ zhEMewdJkRsTrcL?+^o;?8-(+o=B;m0%e!!Q(zHKlcp|Mj`j)8~&4RzqwP8l4L?TA?z~UGaSn>V@5R2a-oG;%Pk-Piz8i_(H-MCX&>!Cy_TNbjyx(_c zX&B}|AJ6#zBXID8Kb$wo4dF+zigs<>@I$|h8on5FVR{tpp0geLU?^GBu~|2Po>f;bk5&x4nCee58Xl^mLwND>(t~0-yQg4S~&4#9henaGR8Dt7H068hF`b%n8&uFkQ5Sk7LP?J;-@R`3HU9i3VQJ;;h zKCurUOMR)eC@wvm_EEW(dwQeJ07Le$T{o#L=|yeUF(2FR&bCd;(xGYqEwP!Cug>k< zr6T#ZYP0A2ZCtdntOT^B~6 z?QngkOLck0-zV1}TnBu?NgJ0rVe`7earDLWs$tX`V4WR9afZ0D^zfoBq%ST>);n*Z}WyhrzEtP+d z8^TprmfAxdCo@bnWKA$OQGcsRE74r0X78Mr!LGW$v10mDs(JldTBIl6D}j zJT?0=%`iEhcjekhho0V7zty;fM$Pr%X2;GcLrNA1;ri>w1G&N_<|w1Z8rmT4R++)l zO?e5*BFlAew2F_+4*3!8PUC3N%EzMKxrfC;UpmI+sx(;AHH^rkUCtP43-LA<_Sh|S zk7V}s-ZB~4Ae3FF6sli%I~lPS?1?PobW=-^N;5N2BjSTTBaibYe*nLdAK2-QE58K? zS~1D$z0gjI$y0R?PR1rWCwba!J+>+J_o_FJi-T0i#(65jol>%0(0a&CCI-f!4!pCS zr*{6!7THeK{aBHZS;`TF2eS#mImf;KP9N<1Y1FyXExOY&|HKIT^!RdIJOk{}DqcNl zMqzAl)C(<#iN2|{REsLksAXKQKQWek6}47axHAY45s@J_`88iZ{ucM~@BeEZNh_tN zt|C7AegPA3!S{tM%Mx@_=_JNcwgR)4$JbVv@?@5ENdc@_GGd8A@#tsEzUQkon?`x! z(+PWJBfGgiZ7S;D%65;1zs5vCmO?4zx>8R09H6)_cP^cwR)}NC*H&$p^Fr-#AcE8- zG!=Bdbf7gx0SwvLkL#IChwoL5Lf@Rwmz|y`G_~fM%;~RKoxEgaUmB_3p*-T={d8Yi zBtd&sjU*zOCXT+fb{q$1@RB!{!&`!R>k z;DApF!%+boFhjUsps}v8JN$+(KX@arLbvM6hqsT+vMD;n4nMYKF(@=aVz`ECIAUAx% zEUVV5Ga<9urjalFEarv1NZ{0Yso}tz;tbpIRW@U>#x5tkO!W_Pl8v70H|9;@eAt5V zG-e5hPxhCyDXNg_4yo^1#zM}nAsv}^ch?KYm8+n%H?zlbsu4C=#j-I2wk#LaX-8mg zHLeuAj{*mtg*KR_vFnOM(321sae-;o30H$ww{&;V&bS?e)}}j!fDPBaV}Hb8XV_7i z8)ZfD578(cfL%_<=aib4^d}$=LxVcyFCPuYWz6yqal&}9Q*n=*VZ}o`O>>x_{547H^yOq~wqB6MH{4(w%e5Z_lV} zOs_We9%C~1%`Z_D&u?z;#BGK_iqs+GR(oeJE8u`DHXIN^7%KC@^~4LbQ!j~wXUwB1 z=@Rb3;OaKVllq9KS9}W)x;)x(mkXhJJ;>+B?1y+>-dD0@bjkE1NlS^Dm*q2d*)$69iK83{2OIpo|qdwS*1`J$$Ly$`U!~^9JG2b*1 z9O3$X%N2h9R+#8ZW=pgWu|uHEK5Zs}i|f3nYfk)cd(wv-FElpj&!a4x4)$_;VTm*B zhkfD*9{y{k3ml-jCUo_98P0InuH?7XK7aTvM_b#&-CpuNbh9&N`6|3gr~~K9KKz^N zb!uQ-XJF9r$EFI7(sPa>?`k@s@Gy38Kz~>NHJ3`~^{GW>vqy2-9ONypZh;_GsUt`P z>0BAINO4fvdD&;w(|;}U5e^JUVC^rRZoJy`=s4Z;XM=D_sGrKBK|Q|eL)O`MJhsm9 zRJ@d9{x@Z{ zPg5b@BoI%oQ(pacP%VYmDGtRZXsbB1AobEbKm2MrLG|1{Uys8^U30!)VD>(ABIp?N zftPK@xsu*nu|l*1zH80C4H)tui~Z1}aa&N3E5h7;l0;k6#paN(BMO_N4MdZWz(iYk zaIAJOUl1y{_T)heCFV!OzWVnwq#w+Qzh6ix;5l$(P@!_R%T%DElA9K2pq2y&&d9r3UEUv1on%CMj2czXo4QF@DDGcJrDv2MC#zUS zNG;tAHb9Pg2O$|Kk@NQe2U!*|fE&ZyTU1t-SrNP6qPK_z`_JBD6WqHKXmiqcPx(sZ zXj-eAtM3$?peDQFGmS?;BVv!E1$n)*8aA&~mw-W~z4J)>eP#@1%_DqqCU=ybuQ(!O zCwPF0&AnJ!dFt?ogyBP389vm%lfb_TOj?Y{gv$F@mg+LxdS(6((|BH-F_cnN%ERu! zflF^h$TH|U#(6|N{B1V}Q9^N%mRr=krgz9z09T_xUWXPttrKYf4MKa)V2FvbjRf;BXzH|0N!PN7|aA4yKF{9wXsS2b8K{UwE zU!cK(vf_jw(CADF%nPzI1P3@SVGI{dsIaGqqcC*HH3+hTumRye7FHcreELHTl==vc zhJT@{YZbDHXh7~B6hRzJD*jNq&XN%hs5>@YoWOx78EEU)dFm>Z@Pe!#ikLRX*Kpt+ zpb z5+re7f8}qXga;|C)rVfff$nUW!3k9U$Quh9X7(fgls`aRn3EBH;>L`K&eFf&Y3dbg z3WV7;Ln3++>%$*FX8u^(c|;Lb{0(xng`jTZF5diI!vOhHr-}bdpG6R(d5#F0Kqv(R z^!^SU04>9ykWkaDY(e&&n3&WxR>_*8e@TSnXSw(RViy5nI1b)pWLu(6Ma9 zrD)UIHDs^zyN-(DgC+#6;6R+r9%L0%yaal`iWq;p3&f)Id-fkX?L49a2MRBY5C!yu z5pBQr#D&iPZScPr_&&Z+lfQ3t<9=%N|Aa>W>kW&-OUezeUrokFUhc0`?BIt9=CrW3 zlgECIm@eD-xy|L27l8=lQ*8(PQdhtXUWs>h&1wSmz6ogqReXJZ&gO*BCI!&W1mr+v zcp)$x+TIUCM%YKyaYbF+9(#pzbA-Ag>93d`;HMly+&fcmJoDJxLrf=x!Lr!LqNvLU zowj~Aptf}Pay|nMJue7AuI;N zyjUA{Bu)+<;f{USYSU6vMpuPV6>K+`{hXS4?^hk5L22iwe6tAV+$*jyVxD1nHOPl7 zyTTmKs*+sW%_SB?x#ksPFOl5vj^x*RzRk)JuQ+uJ)A@5zN(iA2+THW34a%0lyc*hpFGt;NNetSz*Wno zT)gPW>G<8w#I5yjH>9eHRUOgkf^AU3IEqplb~Y3614>yd;<`B_N8{7MwzU55BK*W9 z$!nn3mZRApT-EYXSedEKZd#CkQu9vaW6{LHst?4~1 zjN0=u0kHGu#Y$Fh(K0zXxaA;PR}n&JyByva?%fn>HEbAoKakp=Kw$7u2GwT;AwU?@ z1;Fn;@1P?Hkg_XT9x;_K-#9(3mL;+Xld>szXfRM}?Nb0ITg&;-vxj4G8J1OjhUHi- zO&Ce%cW5rxyE}czvDd#KiVB!)jp^NQv~n5F1(ST4-=Q#e^w%!*k))=%dVeb^NC?ncaj?^S#GSq(oA79GT+Myk#z7=VfNt?5Fi^sy``cfzO0Kebh-7VCeql-jvE)a zZ*=yMv=)F)b_ldPyTm~j`-GL6!cO;gOu@CGnV%9ki90Y9ih-^L#B`4S{>leZkJURN zt5fEj`FXxv(Gkb@5P=e>f!|uH%~?)s$&x)Vrbn_h zq`D}>O)@4xbripzgq`R|wUStED$o16llL_%Gq*xS3{FF^3u?%2PZU`KYpnill(_W% zd&Ri{x*~nwWEi^M^nJ!m12_b}a&O8$dI4^*GP|7!@p7OhrXP7-&2m6%o!g@nKUGvu z`pCyVE9q>Mf^0aT^g2&${Iq{BaicV3&35+6b?w|C+|s5bfM=Syv$m2jLYlV24GSXH z=`<2OU$Hi$aI^CJ1a|B}Ze6yLa^AYyiVcxr%b}|^3&m|d95)cguYQrIaC=m6t6y1@ zFNJ8>nr&)nK<~*Ox7;TC+QhD$4Bjt~LB(RWaTj^QD;&-@i8N& zliR)v0fXvg7QfX5>mQ&dP5I9bwe4?5x?!tSxYJ^z>d8L@PKzVUtfAaF8nd!am^u|v z#gh$JlR27_oJY*Hw>rs!fR4eCF)>bU%~a*;04sV7f=ovYn78zQg3GmI_Exo0`Or@n~OgEz``WJg_MC zlbR&8$3_>{$cLh@c$p)=AB>d}k$st9(|59)YGp}X!%xV{ca=8TLMVVuc;BaG#kWI1 z_ik(39Ub}WTqm)LFQ5%!g2{7LWGROb5{+jro};p(5?^ zK2erzs~g?O1E!h#Vgfw*pLI+92N5~kCgY5lx|79O_(LWYt> zbl8*HSMJXCFtW(W7)*pjjhyhBl44nT_9W&RP4f;abK5ew4puk}IBYy5_Np*k{i;_tM@ri&?^HCrNpY$+l*04Cq*!O&bO|<4qswV)`#f=mZQLZHn_hG#OwPW| zg?PgS3(H1XKB40RJNBr!`;IW%;zkVBh>&ZzRrU>*sr7t?db5Vu7irF)U8%hBkYX}7 zNQxRm=VJ0U`6Hiv2)~_1ugntEbCk|NYw*gc6Pqt0jL~AivTm(amS}2D%69fetR`)J zBz^qj8=J-o8n(p5XT`;bL&955D1BV%RI3*CA%g(otqgWHw$D%B%K?lfVlF^`@fyxSW1hAyIjCTc)X>9BBsg5ykbx)J*zH}2lF z7I!3Zpm6Z(jqVqkUboc8smEAgWb@a>utTh5Awf04rS~`MBoG@Y8jcI!4h+CgO0Y}h zq>As(#y(Ne+~R1W13w|x^U2SY?i23>Hj-4dszObQwY(CQG*gh@4R$)viZ^dh6WLO9 z@*ssSp+$(4$0$`uynWr> z9h}UZ?TQR)DuYyU%ZUNQXNR9d_T@M{Oh16X5G@aDfsGqT{9Ea=6COiEm`t4f=@eTI zBHNZT5=KUOux^=3B|vpbUk8Ze&y=FRa3n?lY)tW{==qdt=8UF1tsQL@l7D@}hjur% zAvyV8hYmS;hLHi3D!z_>`RXc*tmO1ig&t-C3v{9x={1HT`9FJY@W)pO{~6g4#_8F! zRyI(PSZl*p^0HgY+?es8lVmM##pDalS0AAu=Eyf4%KUGfne$k@C*AU;SoOFW!+YK{ z+jciEaFJa(BCXrDmuxD7Lyw(@(vDvEJ=mq^f7A1D#gFjJ-KdicaPQXRLd3!8RCsgu zzfBv;^6QdzqK%+_n#U6LWHZZ`#WL5_HR4_0D`ib^jlQ;d4k^uPvYdLD6n4|WE)7~A z?Q(J!QW$r{kdKfMbM%2u9N%SQX#2v5{v(Yf1XXAzwSFKSQ90jp{|oBuaQez=MyG-% z@0HYXVSVTmzL25MN3%=BrPMw;C?Jr){0Mf40-CynqyBzkc z{?{n${{!DM-(GI15_6gg8#1^_E0G=x&h55}56LR4{gl35 zGE~+}6;X|ZLizG-fpE@fV-$*bK{AJ7{35%|>(sXDy6V*W9R&%Tm~v8A0>Q8DU-#rj zb0q*rEr{Q)wdvqsrN&&pRDv{>zYSl5Z!1pnSyDuO9JV`R^;&XvZn*={8RXr4l4|tdmg1=B3~($T^jZdaOP%4Y3umloZ`%swq=~5jX@*MW;fg+GsRNawU}+# ztW+&t`|RBB<`O`GUN@ie4z_2_l>|e|WGk8`AG~yNNww>a6bKgOqN6()2UMb6N7R$g zt)%LMHA*8dLaVZ|k9gaHGy6U*kYA|R%^10ea2&Vz`f=(7BHo2m$8}mv#qV~a=`4nZ zw1pqX-wRJ8!tdAmu#w$^#{M|NpHbB=!BQ;OPQo6t9}KRCy=uA;M7$%8pc<=#o7RO9 z{4s(!@prnmErDJ_d`C}&Kt~te4DPBWE?71~0xJzsU!>`zRI4KSOIu#~c1R;L1TBLQ zKg8#DjRrYHtP@2@i;a4A7@G;MxscPsz+t{#TGnrM&@XO$ICGc~%%NE7tZzL`HtzAN zt;4yu%}3^bArS1nls#TJ}6kKrbONV(r*t5@5}WPqgK3c_3n}0-E0clx<;R zW8o%JjY}N9`&#e0r{Au_eKoQapL@+qAEDy6(2`Fk4$od5gpfAr)Wv$Iy!V8Zfc^0r1?BiiTb=TR8=#}5>!o%{B^rt+cJch z@h0%tIO8JD7ulgszztm6V2Aj^w+>)EV;+BoY zs+BFwjfE){<1McWDv9g7>hxxc7<`g)XJngtP-%%&lT{BZbvxJ3D!JrnH#A_)KE4O( z$c~2G9C_Hh;L>B7EfxLatL@XPr*VXl1{0!ucQCIxnA-mXf70#W>{+ZYv{on4p@ArAT&+*eA`pop{(^Xwv^{wjaBlIhD9=P*JT22~(gM$OyVE+K< z!kr;0H**sJP*4Dv002M*5aEOYcvuZl<)JJPXbvF4>Ts|h9wjAz`wMFU+-cq<1yEqs z+pudfSOG{k|MJVX`p?f#4*cZ6PY(R#z)ueRsxB72r0yXp-zkZ^L|ASKVljNTq_{o8v9QetBpB(ss16&-Of_lm3Y-(;JO21dvL{Dk{LX=*eSAj#p?t!tHxvYnS@iUJnDn=fbMuIQs z#l{;}8^K|GRb~?BAI2!$AMsk^h&tB_{I1NXY2h*livD z^7v=Q_W!-vwlWukh1&i?9IXDA3anR{$lbbiON9N$2mVJ6{PGP<;$ZiEmrvN8l>ZMt zKOgcBG5IIEezNNy;=n)D_@{LJWY<5$fq$s+PwDy>W7lsO*VqOIY+Ydp7CHqy08o&S zkdY8kkdcs4QBY9Pu+d={0}T)B&TVW$JYph3JVF8za(XHfGCDE>LTWZ@ItE5&7G`2f zc5ZeiE_xhazXOQaNH|oS zlE`9z6pi6Av#Rzks07gNIVm zGO}`yo~fv+scUFz8NDzzF*P%{aCCBZadmU|@O$ka5E%3(I4b&GOl;iy_=L3djLfX; zoZP(9vhs?`s_L5B=9bpB_Kq)|T|>hoqhsS=CngsbmzGyn*VZ>S4-SuxPfpLyFD}2) z1qUGfg{<$C{T*G{FuLFo5fKnkzR?8-?+Pmj*oa6}oX9wmN+<^Qcd5Bvq2k_;OfC6* zi-ucyAJ5QX5DlM}XMyhE8)?5#_Fp5+_n)HdJ7K@kH3{5CfP+mQ0yZEHTuKKse1QLV zEcHvRDHMs1N1bG?>tz-(68z61*>d=j!Oh%i1|sP8t})vr@KMDuaPmc1~Bu+0*1=;}@i zmq7?!BaI9l5fg+0+6&+-ENv*T&ZTT70c!3#F!=N({0jmUu$uLu(SM^QQh9H;Z}?eV zFzuYf^NRinX~oN)NWahXrmqqvMqbKiZPbc3X0jBr@M|NLHv7*yEHPnwL`gZ26A+$z zF3t*G=|Ta>D^1r}hpJaE@8-m_EtHjhu1V-IkyVX@m*DU)8aVlYc_w?HT9ltFmQ0s? zn(R?>kv4mYO(GlLzVeyU$7{2%qF8I4y0FW|fH+$EtgZeraV`t}lI@P+vh5ZM^6hH1UNb$| z=Czq;6XHv&+Va+&SVW&`Ll63i@@jE%&*K(5nD|k*eFT;fNK7xVx@u;ObYjXe{UlICP#?S=xwX;{B?nscm6Js4VZ5*;rJ<@bG)6&*FeMMqT#3Uz(5&kt_ z$+EJ{Ex{U-Hm2x#mG+w#Xd>l^^NfOhOXFFbv(-rD@2*J)Dm>|CT+^)^A`tTO$+nKC zqP$}Uuf?SsD@N*NxJo+=5abf{Co^3!KT>r!%y`ghh7p2-+5 z%4%=N$;!pbJJO<{>{i09?HBRi52i9*y7f}5EIVMi6jXK#0nz+r<+Kq--yEa@aITyj z@lX%odBak^W29PDU7;=`B}XSUnc@EWBN0`dX=-NG!B~vA|G2K(Zfd1_D7N&0A8mgU zl9aORi#0;gFo((hAevzaxtpookgE+lxga3R0W9h3D~Bh8tbjd(vo^R z*XdvdMz>rB1{dsZi#*7;bag5D^0YS~oU#SQ9=qt)5(4ksQ#dKy|5XLqr726kV4Wbr zxnZKrGv~UDL*Yv0W|vDSG`wN6*Z-Z6O|q@A6<>=jPOIO9G?GS|4;sUtTzYJ=%|miS zkds?`+20a=f@ntD?4+2?B$iwY_S;ls>%^vmZn76cJmvEh?xT|l*;CpZaEIgu&QQgm z1&s@Kihw1!B!ngzGM+zrOPG5?mIG&&Qy&Kf!0{kDD1avNTNQ{64(SA4T*Mvf@2CG- z`EcuJbF}}poGDU#GUm+spo4%9dzSxeVfV^Ib8y_sR@K#LnATv%B6!k;Kl8q-O2QS> znvs;$a?|y7_XYd8(ytY@cEoZOJ$We5-_sNY1^mf>s{-=BwO6NdF5I*|KqDg_I`RL= zyo$woOPpid9LCq20Yh^riX=#W05O3q_ zxNC;*6?m=RZSC_f(>R94*eX7`BOM9i>qDCs%mDlk7kJ?}v6Ew(%U2~-E|+i#i;ZOW zhZS-*aSr=F!<`A5Kw|2aNSaFB)VBQs-k}CIF8N)Wr0w$-Pt-FbUOB*Y8h!BR$ zWW)GZC(G`n`8M>AmrD_2HTnT+je61UPBPq&?|AxC+(Z_2iR}<|50b`sZ4N}_3tKRx zRrh`TXE>5mRP>>=!ZWcm-IuUkPmYkOcqAb72}01SU=pD#V}$^&gB^!mhHD;$Wn`mt%z7qNf{I<>Ra)>YcfVl6=m%fjih>(%jP^=h%w9NL|{v!VzdU7nxfg+o~g+;cpX`M9@KR7f!iMHFK(uJ z)LrJ*9UjcK%ALx2XD0lv;;rs0{a4q>zuBB0tq7wspIT@9vv5i_ne4MkcG?MMY)Pd`#wCtiIVTY69$6H#Mg5JfyL zoVLuU;aGW2@y=o_ZDg{`W8@X4Hdz5$=Vt>W*p_K)%28w8iuE<&L@(ouqPsQHy_Z() zI>=7_HpYXy_cz9Pif`3pr1scS#>`MfEz;dZiBAkrvog?5S{g7Z?eMozSB*3rTnb;` zx_7}F!+CHE#8Irzk%X^MJ=NzJ4D6YuMZRK^Fw7ot`GE9q2|a#a(EoL?RgwN8(p=<1 zF)g_bol3{Yi>41n5x;sIMFbt%yA=)mk2Q6>pC6(iO_{BW<_A*nlZq794etpbo--O6 zdBrx_kDTibZ+jD#gb?1NqPnc=y|L8#Aa4?ehqEY=$AL`n>f+fCZq8OV5-I~*Aemfk zw4;}9=O6hrrME0%OPo*1a1Pf5IPY&a>DXV6{>JMXFvXLx6#OV|yKNgVG;&TTR%sXnCMC zRp62VAt9i+#B+7%`q+F})0`G-w9OoN?n$DvNgSKWKI=drmifl$+TZWweOCKt`r67! zR9HTX=?ulHLkco z9{N|8b4xN4(N{xIU~UWw5Pypr{OtZuVh*NfhgQ|8cYt-em4f>_^3H3SOK)D=W-XZTYfF{Lvod*NGcX47|B&GzsrVsRLAW*^`)KE85WxFyQbkfMa>XG z&I7&ZMtfre&5z`m1D(k_1(dG)S9D|JjmYD(v?m^%8A#Nei1sf!So-aUl^kG&u=6Iq zg{aVTu7pd9F!Yy3tmyfDm!>*S@?|3_r@7Ij_aJZOyvv<+nDg-c*NE?8$=mK6HK_Dm zHq*Zg+PA>X@3iNE`vVFGU?HI2=rj8+$r3pKi@R6xS1QfygaQ`Q;ynY1@06qfTv%~{ zaHl9->Spnrf2c|Jo7u!)G+Qv%$w8APi4dR-kFk{qS8{jexol6&m18P8EvwJ~hx%v) z8qWHOGJ!+O^9%1E$)!w5hiC?m$q{R7E=x(@u14IQj~ub{yj(4CGZ`&ht)5wsS3q$n z?zcJMPaOCAh?xQ@hZ`7#S%Ghs;Lj>*D51`_+lBCwB!-?5GcudpBM^iW>Z+~&;)rXy z6cKr;i-PS=jPTct7ZHng;!KO}BkvdwTNY!3c0Ub;ZXWe7gcmQ{p3}=gdIpp=3GU7q z%ABBI(;TiH6b1Wma0uY4OA%Yd@S+|I=Bvh7glAM4Na3nwN$P-GS+%!?P|L$gSeL9X zDOZ|4Tq`)otg*wk%1m2*n!NkI9IdNjofuuHtKl>Dr)bMOOT2q8wUcZmb<9ms4$`r| zQqNYMAi3HNIdZv1Mp@8SQPMezPD%nlb8EZL#QZ@?3Xcv6S4rX@Eu=^1Y;B~K@t(Qn zl5g$_GgoSKik|YRc*DnHA%A7o2!vC@SpwoczDtO^AXdvPbna-4eQl}4RPNcB{Emk(v}6%!rUcI%M@ z$*b3bf$dG^eE;$a_hmWVr!NYW%0s<_KSV2!sB??p_tUB!>ePl)2j!St+E9AGc#33A zp&Ft!l-5)gtPv&cJi$ToE+3C_K5|s?fvR?^eOulGQ^)K%Nz$W=dVXc;G)7t=2`kaA zZ`fA_AsmxgxFuU-sQ%tQUm$0_uewu!e9*hcz_%$g^n#CzK= z(RihY%RcS5F1!aW-f{}C!0%j0iptF+MH)NOnECTfD1q3g7o|S%Ei?cok)8jClhQc z{;mPw-5@vkBe{T$t-^L2ggLFlY^X-AqrXQE1$;WMndui+V@^deij8&T-6n{({a#%p zfxAjYbhNb&tW*5WcGdXE4cN4AyVHZEAQiJMMu>MZINd`?E$~zxO~ZX6vn&PGen=nF zx~?USyjgK&iyU4XDf#`u{hzjGzX{3T*kW^KJn61D`kP!rD8Ma&^H=Kv_vKo0gw2ZQ zYpCn+#%V8U+b-q&#Vc+c{Lhm_W&JPVQY<TWi{_Ev>34>Y=-7X_blu0hPO++&Y&hanpiuUGdJ2qC%pUuHoz? zLkgo|0Ih7dO&6KZ%Q-fW6x0m~GV6ari~ph4H?_P>TfbyEGbI^yX~%LE74O(DB%ZI= z7D+gJv%|wIeL-WeiailQ3qil`buio#Qb`QoliKX-A;Z(kQ$ANK9wc}PmszgHZ4n2= z7rjB%E*fwbe*?K+hZc@XQ&|;`s%~YUBSr3WTO}roJUCrW(uuou%z3jmA_r!VYWncB zl^P-xhE3}ecklauIXR|yvsf^WbwHx*h*1<_9UHsgR~w#TiY9*dI^TllCR6Gz)2L`N zKQ(*cn6loSzGW4dCra=k{Z&V-_!n>I*-kBmzkT{@00o8(p2BHj74}mETOY^soQ5+8WxDO^ zsa}M7I{FhWt&v7yN-m`eo%HK#Y&|Ac(P|?)=>+#1 zO#10~{V#AE{nSRDJsK)vr){TxoFm>k6S$^R<>Pb~Q&dscl%V-Fk8`h2v*~WfyhT&! zs^yL0@cj}~6BFdeH3rqkeToMCOS)kx@klXAY6BOqsGp|ea9n$gCp3YAnO@|}e3*3Ys>I%2KC3;YBhR_#{sa`Dk`M6_C_Z<{) zmVUN*te@NY>@4P;5Uc*Wlhg0|J36MCe`Fs2Ml1h|cu0R>Q~s>c|GoqE+%3{}p?+v@ zd#8xqx{%JLx}wsq=t74Xki{rpTj>W`m+XlKaf1 zmZ=lhLp`69WnLyRL0&@OYYuoD!sx{sM4u$FQrK!f$kvxkK`Dz<8rkFB8jLe#CK8_& zcZ>M&=~}^+_t~T2rJ>_$mc}Z}P)OF)f0@m%u}#Y` z*V%zg^1qRIg%Z0(9M-C?Rk3Ly)G9Q>Ur3grG}W_@ih$HUkV$CW?cgAbH+n9`6Ct#6 zd{roYdJ^M3=m;Y1>Jx@*xc_S0A1|}mEnv9iw3Bs2{cc$RGe)`RbY9V_%?AKQSwXXW zly#47xiZ3|n^dVcP{&555A|gEju+|&^cZmAW`4SPnDcV^A$P{p=4@n5GT*edN;%R~ zyi`qZs-e}<@?xf;hvAz<={2u1P)y@V0-t`}mi;VRmPGyz=w;SS*i)$=<|R?7XsA$m zTkwp?D2l(2kW*Fk0X*u?9B+!q^WH*qC1dO=p;sUKCqYB}uc9DNrmgmpC#H^!D&)u$ z3kzq{^IsW|XWQdG)ovc@q~&Z6mFu=vmSgMKJ$SXJLeP9!GKyo40Zyob0%k%v$&jIe zUDoC8sj%t`zTK&CmsKBOCWZ{-+Zm+um!wbO{}+c`{)ZLH|FiDlr>AAiEm>Q4Dh1xw zQAi-!&EBbs);XY?ZE$H~;>7lw2$~0dQojf;S(=)=H;KVEnqgiP#MWp;5hrr{?lcr& zT6faLXJwZ49cOP)=faEhVev@8a4)-oFv@JVsBaDLiVA?r7Dyo@$9l=$N6Nn&0=66y zY5t=avt|FatK+@GVvTkDKzciK(fC@^^?C^J#GIUEwnn$BGs_!X`I7O8Z}xdZ zHZMnaYurczn2Dy{Ot$>S{g)1kdbpOA3oflU9SkXiRpRu&>{+v>mzD)g!UZiz1dIB# zDg0jC(1G98vCusA4{YGj&zSgsIijD7mToK`4H*dq#ecX3+pq_|mE~)fSX*niW(1GP4z$Eb6IL}%uN2={ReZoTur_#jczNhj zXDS}}2$=mRU5&p$uJ^dg7^z|GCI!YT}O9nwW2Op|C>Z zK~eTidR}b!XA)FDADJ`tOYosU31J}0b}n--gxqSk#o*m=zUn0U!QsPMlJeREiZ}ZD zO(9m;Nv*>8q=0Ck&gHo0;{d{)1=;6P;XEX&D$n5*3ptKOLs@2~Mw_@rp3qG7ZD_LM zZ2!eLGDdZi-l;DW$}mIc=KKV>W=8?ENpK?xM};7b+5?xOTJiu#3zD-a%!u#;#WR^r z5ZYuIy_#;+%=@vt++zMx>W13-gVTDJ0(q&dPZMl1gph+X(LgAm*MOopymMSfxWlqE zfolY~imlCl7<96<#!`@%mrQav#O|(gu)JSXVa$A=SPGM7*@kSBtl-R^=ekWE&vOSk6F`tvj*N&Z%B6m^rs`^kn>0h%Uz&?nyj4 zmF-!nAl?JT{m@>qOy1elT+R5s)mVWYTQn+ZE5BFMW3zGTY;m-qRhRyA?}4_N&Ufs? zq6`R+%1#SI%+oq`Ef=DTS=s_2|{(xfz(Gqbk}=}cC}tZ2zb&OG38<>2wz zKBBSPBq4i4NMGsbH{?T!&?d}yVrFU}qvX&O{u1_Nh2@{V)<64{z8wn%j+)u8=AV9p z?=N8XHw7Cs2Om)iJGRn~+O+rTmko}nIf$DS6nKdy)@cg`M5v)aBhoKBDDt3-1%jJ> zhAX-`YOIqom}C;x7}SAYw)_-g6iRM**f$PkKigC!xnm=!tZwPbIqYXhh|f|gME z1&=9~bi%{Zrq9cig|hM*Ro{x!n(+m8>=#$%LIIi(>>&HN(#(Rnh1I@mK4Pwj%EAnS zrDgx2S zbDj8=!}%=>qj+A+XI?!A0`cnRbY_KdhA@-)efDI&w=7D3`)#B3*XdR#Sz`5Ic7b#Z zF32>*q3saE8};u;oJMSDDQ7d4uDnTp6q!wKe&@WZ_1UJYU&|%t3wM$ieSv%An^@T} zdbY;p7K$-HFYJgseDC>&!z7Nv?GnpC<(hFpz)72(>yRh*ROzZ&f(qqK=7e{^>%08S zR>-{Lt2j}1Regyr1664NHz$OciP~S45Dgi?ttg_vBF5gG-K&YEzsq#E zgIBr@^V*z}Ql;<$7b9>UENtBlE7k2n4WNPLtWp?Uj^4*fdS_I_on%06Zl!V%7##Po z@`^Pc1H^G5vDQs7D+clXuy5fSnxA*H{}fVJ zE4O0Hg&Pe89?)E1K`LB%$JOjJDU20ik)8*!No1Oyn%8lgu#wo4z4~wXk8po~FU)rv zr5kc8<{RUQT{TayeDy*=pGyWcv*hdD=;Oji{bR?e1sm4ZYrOS&8$8?$o#79kfBCx0 zg)C#ABBKT5W?H!R7R|0-Rz=dD9?00^o*wX6(%MG)W+ozXrL2jKs)6q6@P`&S)iI=5 z-izPaHGM9}6{Mw*BiYeiZqw&|DnL0hxK8X_nd*L%Pi#>YLTb3lEP+Efu*}rnGvhEp zS9hhtKJO)9A>`GYG466O(NDXI?JlU!R+#kiTrs}XPH>t0)(6bkU`C<`xoU8)Hc_`E z&|$H#{~IRdN9C5UuaIT_(S3Z*)L$51Df*a9aY3$AN;jO-!9`yA#CU7+q|(o5wg0H= zmz4zZo2}8_BjCM;0x$N*-WTnBib=fW5p{HQ3-^Lnarhih?Hs`-qMX~z zeJO0JUOilOdxPXj9QNtvi^*{|#n(YR3TS(+XHZ~D=;GdVQU~^u+cvP>)PGWRp+fC1 zfv55u-~e*`lRpCJZa!pyr2MZl!<>y37>2r$@7cJ<_HCJsKdjV+<>WQ~1`Q~_`}!JpX#v;c%{CL?Z}NI0&n+Jo}L|*@q%ip|@GZ z^hTa~@ogUNeLrP=Ch zG8#rp_g25{1pd9$;WfK_OC0$<)AWA``1~lP{|!i}+B8BrT-Z}qb(JSna++70ITH|% zsfU`PMnP+Utl0FiAemn_P7SoajnP(`WAb-)F0ajA@`J$uwHBq9GQt*-mJDSJZN z*Z#D^DPnEi%4{<9@*S5v4?G>lYnt=?N2&NOT_0HMB{2P-XZ{LB@iq){M>3^3^4ZK; z=NuC<(iyApUVO{?`Ri2wUkR2E+-7g>iawxVW~Lga z)W$YPZ|>QJlF__puQaIQJoaE>DjKJ8^%8$w%(o79+_C?$UuQ{`?6bzf&dZeSgM1zq zlAMbYoYlj1oc27+I1F-q(Fv`NCuPHwhgQ}@xO?Re-r2d=7QMLYsf0QCwC+RE(e=q+ zl@y{L2Jl6xYYa+jzMqFXDU#7R4>U4sCu8FF5|@~vLAe0892J@mCCix6FvS=s=sn&_ z9%7hSHh!oS)nf+~DQcekXHS^0a(d5*KQ6}hJJ~z;b`DPS@MSYH@>*pXPbZT;F>q58Z|e%;)zAl;CN;^HZe zMHvY*mjm`IrE~gcC%mI(v)DuBy2ZD}oIgQ<*rS&{SXBn+YAbljj~}nrsD+gB1dwCD z>{p-h$5qnlkoadalGM~n3Oll?!bRllTKu;g*RGk!Om*>ltgS%q4sNu!w_(|=3WLWq zcSCw*&f;#nA{ixTk6b3~b82i>2N&P+Id2@PmARff*_e3}YThw9HkGyq1%N>LLF2l_ zoy$hu_bWy=5ZF=ZTF#C~48<}CuQGkJI~CQ=Y+bI(QYy^VVph&vV9s=1=>5OX1ungQ zkSCw9p=VuS&e}69V-5esuJJ`Uw!G0tdio75SU`B=f#%TURG+20>X4aWZYRa8aV`)Z`%Er9MnJPed@1{u%>$Lc;NAM$eSHktog79Rm3}xkv|e8y!3{uf|%{Qj=wWuQ(UqW|0!s z&zu4TugQ3R2d5iR^UX8}cg-7w3C>h3Z&x`tOshWu`P`(S#kKL%FBdf3O-XFOEXsK- zg#Z3?1j+sjgS}ydgI>AcR~5Xa6;f!i136?sD4PCv#wt9onc3>>?lwL{=x|`u5y2HX z1G$rhC!1zTE`OcMV4O2szEOO3KAO;9UpvHekT}6gdFu56+*0T`-YB$xQYlqYr(0$H z)?dH3!9Mn`6`eaPI8OYE81@qG*Ou z*I?o7u8Dw}GgDLCv*X2NVMceO%(0=h1Dj*ziubmbNJ~$!+Xo$>B}+qB@!V#<(UuDQaDv`Lh^$Uud&?f6(2#{Z!5 zV<2V-Pw!cBoVd?vabkXOVSMkxzRk#36nKN;2m($ewrlA~$*ZqrAB|kUB-UU^Si;ds zxw-Ep-d?f!dm63S7TrJnbw;j|n*3#*z@|#FrlZ2<4`}9l1P*V=cVsurac~EdHqDl`W?DZq`I-r$ao` zJ!rI^^DbGs)_c>iLk5b}qT8ys`||abDr%=^lXb{=ARi~aFpnTOVZvkjSBq&N{IA(F zzAI2*Mio-1^ZYKbO5$cZB3jNrBzQ zp}>9K6siN3Cf5hXm#s?|6^|0sgXeZP7{m%tjx5$<^}$}b4_1V|10OnyG;I%o&e1`+ zR%RZ3rH%!bAKL>9yxm@cKB?^7Sn73rCdMV?CqO=EAby&EfeF&;^lM4E7e{VgktDv? z4LgXqT0XxYp7!-hbu`c8!lL*3K(l~NEV_PO+VKen%HH&(a~rcR;pB?D8gT zhNR22W9d@{fqI2GTm#F*DM1C>*gNb9%4RV^jZv2zCS= ze7bs19QjspO~{oH&iU5dKDQNwXp-4>#5SwiCPNw?;XEn&QvHGPf)|0}=c?g_Y4LE$ zdWdyG>dmWs^4dt)36uz#Yns|fkxhEYrA43nnDMihJWczwVN^cNE#}2F$qcafkbNQp z(rSv&rHdnUWln4l(jnNMQGz4l4|A({BKk)E86u3;wXmbO3AyyIp9%WzygQwahdhM6K*xQQ>v3(?aa^}zSI0y9_dz=} z@vy0e#V(g7VZra=9)nA#bkG~?+Tw)t3GE92p|t2j<$4gEozcw`Su z5+w6ry1HW){AsTwHc3RUsqKZ#kRqhQ>7Bo7<37qJBNQ-Qy#paqmO~isQ9K5PSrbx;8a_Btdd#PmU06=PMVP)>Q#y4Ad{gZ=X#iTA zy6Q_!+9hMiuAqgYZTXBh6+=ka<51&Xy$6iP&`cA&uC0PMvt217hYV|^5#%}xcI zCkrTUYFiR-T^H{$Mcg&dE(jOPF<)xc(u4wWFfGFeYyNiK!+gsO9AF;Ua;dpxA&z2j zW46*a)|UtcKKY)D4P7IHu{Vq|Ka0ywT{8x@PMU4dfYXTIn-M?XuxJY5gp?noZM0-& zfWk;?hG$jAyCh#yn0q5VSWFgix=WD;I@t(pi>1j>M}fdMEomXh4R^>3fN+shc($m9 z`ldlsu$PITiH6&qn~!VFAm*!{=*20$u_KBi@%HyGb+Ifpu}1xE(1i_#W~q)P z74JN3pq4yVk{NY0ZKPtqvPUV0owV++I!ih@@PGmsk>Z-5%MtK?9Os4s6o>>}jt&7R z5-3Bl5jm^2FZCgEcTT_BUYtOIXzm0k@H)?NSW*0J0yYTo1^PPrFcFC9HFIW8W^GmA zjnjb)gWSmWWGE2MYaw5}!Rd8# z9~Nw_U1*9?iE9^P0lBPXh;K{JjVRqDhHr($LE!zia6xNBR}^tLns* zaJ|#pk^)f`s8H{rnR90kmw)A%064?Q=xx@+)%5MJLsFOpve|Un#tImp!lpjZ`eXj4YpIVM;S4Z0biqk{(x7w z_yW~ps=?h{JcX%846aoFDsnj7?3?d7bbI9~ zaX~_LSg(1^>=TWsPTUD0@ooBnvcrW?VC3sm>Irrz#0gGYavZtlb}4JwCz|JiRssE~ zg<1d_M>^p02XBee@gILvp^@4+N*z9Y#nlc4wjrTT1H#EeJsN8e^)kySEY%mbA1JfNx zv+P&fN~d+8i#>vyQzk72c$E4I6~nVTI|%e_LTbRM2xX{ChY6-THEg*b@D7#cr^-C- zYC*()EAx2p0+5k*QOaQ! zuXS{8+nO8lsuFy*0DF_p5A~2reA!GWaLWY-6U3V?Zb5v zsHD9A5_u9d=l@M!f4q3W2x``aNoW5aRn&mbIz)pFPv(=UqR{!2eTj zFn|fx{W2HCWx>~sSfNf&K}#zUH>I}w`B32Nr0ETUwfG)Pt@%^@cmbG#noZL|{pR9( zgIQn_bBZ~bm`w={|0shDe{+GraK0o*akLNG0mDF5!b|5&%r_3+K`=B71%4EtzmX-N zT}))CL)NB$5QfBp(;;_zxgl;aRWgST^Xt_5Px<1ZfW(yI)qc~0%#Rlmgme2T=HR2| zJ;=E}gel@&k*EJ+!4V@BuyAX!B3FnbJTR;#5YZ(yzn8 z`j+0K*a~+oWG)V~m{-K+2P-XJy&RyqW>BV!58QxZFtDf?R)K$&UHl~%v$|ZWSspDy;*-kpQ$*M?30U^U0cg%JqGAfBecYP%N|XmBu)T`rAf9bd zz$;j6+!hLmF2b@e+T^7Gsb7^W!I>LG2!JeMky@A&gh z-^wr7y|&t0LL(re-6i}a_@&)ZlbA(+3N#r<&}38EIFgE8jUUx=Y9@U@tv(1ryr(rX zMUfOR7m-;wXDJT}7}~k#1l}I#zD))0l>G|F#kA0!H-DRAft1t%ZqR(PA|>bE6-nbc zA#F)Fdh5C@qA@8l5enr}LXA%UYola#URUSZ*juJD;isp_0q&jp@3}te$G9P&r5B+f zrX>9GQbMLp;;_nin=(Cae^czVj=QqE9UoMm%88KsmE_QjmrppEJFC=C7`^6yze0%1 zWcMkT#1Edcis&?J>$0F$EY^m#z=qw#1iJ&5lY1V#X#L@BdJS-Od(0#(eR9r>Q;Mg` zrul5gRe`l^>L$&4s!v%8#bl8Kg@S=-egOGhBkfRnSy>GM28(M8UpFEy$x{t zNbQWK0gr%*^&og)-v;cQb*?u{CLuH}+|8XI8N*H^hB?a}utsXi16KpvSBA&IlQRSm zXj4WLv)uv}Mr|7Vjd{Y@W7e2D5hE`m*GNW6eY&Frb;4x|&AE}3I|LBu5$F%`y3EGG zQ+hOBj}E7vmL_(3S_;s8j2Fr2L`t5CHaTSyK=c7fqs$N-=5&YIvYLWPT3NG6b#1zW zY7gZ;b#zUK3y8hb+^V&PB#Yh@4kcb-Rbi`yDrPk}ly*uP#8_FuqwR1xq$)43l#DUTX+OPNDgqVqH1 zmIRqgVzK#TeIeevQuh$z?v-A|wvZ#)P-F;R}XH<);FT$0Ju(GQI%2lzWB_3 zx5_F46S^5<%e%R%ib+SV4~QQj5biV|*Q((RNa(_o-TAPt7RvcU0nx3bA$GO7h2A7b zZGGC08XcCj((Se!veZCWBA2w=G_vUCncp(&{0$dDz#@@#p`owFE~Q(%uF+Fz(%YeD zq@DxFGqCe+xF;BD!;du@$A<9ZgF8q$MEY89Co#d>p*#yk3(&u_@zjjDpwq2sVmGgw zE1%Dm6t4XC8a-jV<5Rad^_rmqTn#N99i8dBq_%iVF zzf`dpcaY4k6Uyvi#Y(0^Bye&@$>$_P5k3Eae%44eRKC8GvU|!yp#AdghZ?(9;46?^ zLus=SCAb==GG2Z|<~!F-4Q9=xW_G9h>X*;gCagX^?|fWbFbnN84<`7 zB~VcmNZ+u?(>^H*M_^|B^4#N&`Kb02jk}L4zdnV}jj{C3`bc*&KZbxY+Mef-(PT;$iMRo$iz@%429UFLyt zDMkvxbKFLVDS&7n5@0j8g->kUn?W!R&uB|~_(3^i&+1h;3h7%e`%c`kah{d2XgU5z zwn;ctPoHq`J4iGqIr>Uc=gpqGv}hcy)o??Bj7ip7tm@}ar#?$at5#Wyti97i=G2_* z{Tk9vOvPbwm-YR7MIM*CLSY9!*$!F5(c@&oi$1(r2jp?KIUz*ZktWoDT+H8Ma*6UeW9@pF#opzC%0jUHfsu z{z}Ji`{1=*_P+eZudI8l>^s}#TS+j_Jg5eY^k5HF)kwD^%3UN0t}kbQ^nE=+3JmyI z5M&+oPxHSh>Sq_YO7zxk^l?lvgC7m3%C?Ae<-7D6nwGOH?|O=p?qVO%P7-7AS%A{d zmjN;{3Lr@`cdA9#rvUVC(CvsdI_*ZdmFC*o)6#7 z$qn1q&aCRT+wgSeu$Yn|jbNwMZ9Dgk&&OG9i>z#3#7nB-%Zbd^5UVVD5*M$%ccBNz zIV{H{7(j=+g^fgI`IT@U0#&@^G|@Zwo(mQ?$iZTX(;L>+0p2NVna3lUzMkE0^bm}B ztXa5Z?bNB{1Oz_`HVBroOIlNLbNf(>rey6km1@Q=rwW1 zPIp%-kjsg8y=W0Wz2$9}=;FL2*x|8}1LUyVJ|Es8Uft<>m?_ic|Ayth4=%hMtq(v% zJ8~kgKJp|cQ^74Zj-Z(E;(29gT@5_|h(G)K6v67GSM0KHadXyjMJrbG`s*ZLP)_IV z0t8IKWfhv%Nnx{){E+3VyHWX6YDn*K&|bGMPE*7YEoHvg5Yi#;i^!1z(XCDL-ioz6 zv^K6%TSt^z{Xhr*LD?fRy4_+m+tfD0?6T!kl>^9(z?Pa&%j!|5ud0$95E-4qy^#P6 z#c7oI$%iOKz0`hur}Z;odE!Ai)2r&}NFQXcJZ!<-sR0VwfP2(GRs%fHXGEF%D9FP+f<%~R6A-^}eTVBU7tXN*1u(}dVTQVibrF~iV8gbr* zaQWa<=ba4?{A97{j5M#@o#UG}>KgyFBOTw&88XR!78%JS(;*-BIm=7EBgWW;5_5g1`&AY~whQ~V$J z{|y*#p=Fc+a8ZgDfEH7DWTE-xNyHRS;wyXP!^Gy&L(w3V>g&yuU*5igtLnM6WzE;F z-MtY@;->Cc2aB2DC<2&r^Qs7oVRY7di<&F`M)6e636mN#x(8Lu#u$Dnn4IG&7=Y!U zb!6|K&3E|w`_x63LM969*;y+R%4Rm%D%CdMcbW?bbRtl>k}F!QTQFK%@6ra#&4;V| zSGD%}yq_ZMiXK=R@)3au#~%yNxOB4Elqddw<(*el6Yau-vS5i#^G2mvVpf^;G%h*W7Jgb>c~U;J0+T%2{* zIydKLE@s{}Yt76%duE=!*PifR#Z}K?+f<(nfya7O;rD%0aqmKU?dqy(lG<{G&n(uZ zWmz^<^WWMhcGtLNzZcZBSk4RJEb0xaKBtaLP6vI-It@zocRNmP==1R^8U?c151<#s}$zZ0BKlQ`rmKqfuD2Oz~$ znjjbHthY;b*o;9|e}#19xTkc>Wf7^7Zecdo!c%1!Y-~NUR~SAdWB2={`rzUN#}dgk z@P|}=<7qz+;x>kFyF7o2gf}4I+*Uq0oC8;y`L|f)LKYVExxQ3{!zo-w`5qdBV$QtR z{WiEE1k2;<_FDHp>r&LZ7hkTDypiGgcO4a~gZOPUn@JqXYLZV4Y;Q)|YDz}=7zXgs zCdVge6S|GFXC@8B+S=tgZ0OM+i`RaRJ2t)=6QiQUt7wuNmh8tGP8*_F1v^FdjsstT zxQ6u#s{4np`&lM7$UJ2N#KGax!eCy)HGu5<~=NT_|I zZ=JBLCGy};@dz##c-IB5T-)6mGdXvOG6&;InlpY|qXd5i@jEIrtAl6)al&A}8zEWN z<#Cp9l8e7?6L$P>b1p@pHy|G@VKmL>s95s!I<3+>Fe@pvg7<2C$_=!VXY@v@+^e~H ztW_X|m!}5dzk72Vg0IozCkr_}&Du6kL?0f?QroMoGXN7ki(XeNdtbsRTfMYB z)SMYHZvZ<)`Z*!~%*W8(Bj$(v)T}`ZfGDC2j0^x?%}1@7wtrOqe{fh?(nY;<4MPpO zs{EJr2l99%*x9p{do(p?sR6%udyAah`r|ES>iCa^^F-Gj!=HFALQfwlYJc$U+j1|D zrLB=E%zxy90#KSl= zdGyD()^~|n5^Q~UjeV|}>qp3OeZ`mawA(y$%15FCx9yZn^MfaDb%kCtu-Nq4Yk9X) z)460H02Eg_cYRmnkh`NX#4)F-ecunN0Q&SyXTf3LgZh0xE~K5xP5dqDi72CiYlYmSPg)rs-2HupltX zo63v7o1gCYQ1K+_j5tu5IKKoq7}Y!0qGuQDQm~1EjzT{=Tk2V`;fS>Mjz=T1qR+kZD4&k&_?2ro z>LOfAIZ}aZSEoAJ+20OV`v&3Wu-u>3CG zTbSbdeBr+z5%^xL&SOt+Q9QbbbPyU0agDIybb>>LMZr($?Dn}%4VNIcta9>SEZcdf zJ|rd%BWkLyyH;g>?VCxE7`i}R2zTnLr zZyE)AV5$F$kqXEMpAkc>Nd!oP1T&4&f#ZJnI8pa-csREUeAjmJ z-sFyv`4zsttb(Es_HijR6G86JGxgcqWT&mJ1{t26kAb5~xm>La3P+nG#)5V+q&5^N zVBYOrY+_ee*VIc%d?{9XWajG_qlY$LqL-tV@Jv`#ads1Q!84@+7kMcRp<7}JJ=&3b zku=maWD>N7Z*0bGbjwSK!?V}^s^^N(1>^B)eLy0{$9=rEwmKCa5;-_>96>dleC<+$ zQC}ZO_$Y5IAo8v5Fwy#ccFikVAw^>0yT3^Gjp<1JZ`HD;^JaW{nuUw*$1CL+XORP= zAA#FQrtJW<#SJ`+2GNJtmFG}=IS)Xy?!ft?;YoBN_v~FiH9Ad#83}EC0bPsPK{GK2 zpj{zzrhM6$TbNf4Qmrwkn9TeAtejMJvJ2a@2#YU!)qml9}P;p2_*CBr6cpD~I(({6<^>-O>U znxOGL|2|*JjVXK;M5W@@33C2jI|_SuXVtJdvU13^+gPMlV)6onR=jZGM5=KX#Gy2; zk0iz520^Cc9=b{*Sa3m;o6o%JmPKE9v+CsDBaYWgZrJFXO1!UOSAQChMx>f z5Dza?=uHiQ3+(}_H#r<*pDM=I7SW9SQc#ggV0~xpLH)!`?s|cXVW`EhZQ>2tJPuQ? z;L{biWCNTZ7d9R$s`jCxmNBn-uM@9;UsmyOJ8oJRm`}ImyKwaC;h}X?bptXb7^qt})vSY7LSJx%k?m5mi_$>x8;rp@FPSY{P zA9Xm?uk;ZKEVvjSC0=PCQw3|xkVA+@d`s>q{7HC`+A~z~@ApRj%f>N4j{e)*L=VRP E0Ry;0p#T5? literal 0 HcmV?d00001 diff --git a/Project2-Stream-Compaction/img/blockSizeChart.JPG b/Project2-Stream-Compaction/img/blockSizeChart.JPG new file mode 100644 index 0000000000000000000000000000000000000000..9aab80c468c8b3658ad9a1106186f6ba58869d01 GIT binary patch literal 26446 zcmeHP1zc3y);}}|f*_p=C`h-GLr7nwL0Xy-X^?gR6%=U@1S#o86eI;{Fz62HW+>@_ zneT8zx$oT<_xs-048H^G%-Lt}z5ZvNz1Lprzd;S5CV>k|a*A>Q8X6kl0sa7}sS7=_ z9#AU)P*DNc006)RFwsN-bWp<7x~TvJK>B_^Hx1xo-I8kT7R98e?x&n3VE zpk4mu%87jTIYZzKfinco5I95N41xcL2w+8E{Om!Hf}X;Q2+Z$tK~&_=GR6&l0EmkE zA^-3ua6?Y;I`by}A6m_s%4Z0iA#jGk83JbroI-$)hgVpHM_7bc;1aK(h#;>B4=?aX z-vNLYa1&4f??g@EliRquI*M>}+q-a?S~!?la+y2WaeJ6La`STWa06lx4@XmTTT9nV zW|r1advTWa>N=K7Pz!MuT|pHd6-OCM8>oVpv!$ljEiH2|TXSIx7Kp_8b7CGM9(Im) zmae9kJnZh-yNGy*GyYt=2q>R?&CMtw=4@dlqH$C1*An2FIODHre+RquHuYtZcqym zD^n{$GYbnrPBT+}3r=2M3sX*03w~Zs3x0l!>w>&D_^f!%7=LZv!u)&dj&9EPes0^s zoZIrArJbd{s|$z*FE=CiFP;8ZwFyG>z0p5}AJh>nBO>E$Y3gcu6FekN9w#S{Fek5o z)^CrP>jvKqG48)8C&qmO#;K0}YkU4rp(P<^VJ>2R(su{vUz*pnbo!4y?R!uOFx2*o za-jT+0M-hc93CE?829N0{~CdxzX2@{yzjexf_Gl}FZi4_@~1#PgX;{gKZU@bDtu;L zXK?)~1pZXvGwb?a2G>dX~#HbXVvoC>gmJudrTY=U}J4#Ldsm#>c|O&UR7>8V)Y*IlObE z`1qu3w3M`LfBJ*^01)G%C885xpfLjI#Aq1AXs9ND0n9UEftksZ!sVCFu z-eBf8a}K~eM@~UWMSbNe3+pvD0YRbbH-trGZpzBZD<~>yYH91}-qzDMx3ILbwt?Pt zado@z?&0b6@X_PIpeIj*qoQM;$Hu*gPe@D8$jr*l$<2H7wzTYBc|~PaLt|5OOKV&E z$DZE4{(-@v;gPB7nc2Ddg~g@i&8=_SJG*=P2Ztwmp#d1bsP$d5ztD>q)C(OG69W_H zL@zXScksX<#>BeJi%lY>j$`U{k&(|ImsC3PRY^S_6Tij=nVECA_oN$eXn zf%K3?0Y2v>+8t1U7$XX(#Zs09WOtQe2U8Ts8?1;cbw?Y|ko#nuq-V9CA@e`XX2^aR zf7vOy6i}BX{fCE^u7(O(Vvl~~?$aJ2n_F0Kdt^)nnt} z@Y-HKNq-OD0agOOVvNA!yw<(S6=A>>!jWqbKK zRm{2|+X|vHn@|8wZ?mSV3s*TwLB!XZRb$RKkIvOh5)d^-*Ha2d$hKmkupbo;nVM-e!^Wi%vNw+RBY*&5kCP7)xs zpH4sFwtsNNMtvzR*WB^m8gBgLH%VFxyEde2=lR0$-vV>~7q}l)wj6y_XXV zJjztG;noRErB6O@k)-ajFuW=ZHMm)_5%lCp+-<4}L0nTy9(RGML~q8l{Ze!k@8Fmk zPO1*sTDgnej2*xH%_pPdqhB~eq`sarrxb1x-Y#;N>*nw=E$|5Z)}&rG7F!h)tFu`e zf=S$hea=q~ufqkdRbwZ<9&{v=Ui%o|$WU)dvp`yoYgAJ0HVW|Y@2q9T=w(kC@KJWM zENV&nQ_CrSs)Ss*<9ay7`*H2*<_^7Ww4!q>=hV zoqRbikx_44*#mBQ&ybZ-S%v=etmFw+oN)d27tiF%Y5Xs?Q$bJd+&q|`srOnKct6DZ zZ0}w~>1CY^Io2-OByUx&cvr3`Fy!Et-@B`fcJ+aVCx>qwGg1p}RNdHe`n-f46Oz~g zcX(v%F8NPUHC^&I9*xIP)tgCTFqcp*1@~)iyV1z1 zBkG*7up~+Mg;ZS<>z_P`^BZ8~k+<1>n}bblIhQDfVS$-uwZw+G%j4d@+^4$e>|Bq~`tu zi*lhsawhV<`)^ibc|(X3%`pk3$hg8&vm%*$ULp_&S!ntO!+R0fj-WTzw@sEW3up|u zzO-|fuk7V6_oTR1^6>(tJEnETCF)$$seB{;M*LSpV@5LP0x*}jhTj-C*?n+p-!2Sh z#lTVCNP3r)7pmlaETd7Q@LBznFAt1URV@|3zWdcbRe#?_#g?7wYSh)|b62ehhRvhf zFoJv2{CIZi@TCgx-mmiS<6gD5Q%?_57ACtzL)9@S5d)`b*!2?J-fNg%J3MqHe&0V^ zJv$`A`T-zHEzD!ZNTJ8?shc7VZ`?ZA8j*uDJM;T93Ogen0z6XYVIu0;`yxG^T0v^n zwS9uhFWuNNCOVk6T>5eV0>n;HKrpc2AVvjs${RBoUKWXs=NvfSFf@icU?v(NwJ>H@ zm>sw})3-f37X{;2c_-AB>nL*#uzbU(O6Q`#t%j}hbZS%uU3p@4>FRB+`ERwKKNC2X zth_eA#PCycqFLVEuNU6+3(EB|_LZp!&2=${o4U*$__?-S3SMMQC3Ht#2xa!rS&<6w z4EqW%@8pS~%CP&e#v#={c|Y4$;=)6{!At$OGINHFr0@dyp^U6Rl;T0mmP^PXnh_Q$ zn!o!LzVb2GNY5ccE!kU<;I&D7!k5f*0g{A2<}CSO@T9u?NOfoY5CVmUNXi#eQuP_flFBN5fKsq;guvzt6ay8 z_ZzrVQC(4mkjUE1h$Er6=joJ^O2s@0V0)lfq5x8lysLw7_~Za%$-`THoKda_h0q0a z#`U%ipG;|&WIPr`DcYid22;V4Vj656Djm%T`NC_HUR`LivI~$Sgz`c0e)n%e7iOuJ zu?y&?mewVpfKd7$BA|R${0y0YzY+9yKjTv3V~}gzPQ?K|J+xD<-(S0dnif(VDm3J1 z{u{63IB+~ji+MxC6Tn`cyg)0h@_4SNjEq!n6$k}N3;ii3d)B`jA7N;)P96`A9 z?H&t*oX}^(WuJMjL{89a`A1AmK-cMXGiSTNT~!92D`P1 ze1n15;Aimr;Ii~9H)2(Nw;Fb^PI0`;imrcF`~MYWbnL6A*^OvapiPmV_zd*i%mOy z;>W{v1kD&7d;h+qs?Rz!xW>!(ZHHD^>T^m=FRPb0q1%D+X0;Rv29;>ouWUB3+grby zh7V*97Ox4|6l9i`#ltZd;OtPGgejpCO7-O-%-zlzEirMSN{O^C+x3N3CmSw$^73=< z3cfTM_#M<5E(k-o-ve#}!x%4K-27N1HS)RIPy8tr)uDxEPAb89cA0+HsoeE`yQJdr z1tPLY8PbBhSAk2EWHeGE^2R$C_hEW=R&-K3FBQMy#@^7GuYYlK)QKraPs`pVS>Xl_ zM#w@&n%_B4;s3lNgPAR=3BXnq&q1dd9TGrXvoA`qP%i>kJ5uY01Au@l_?+dheN)ma0U`@!%9! z?iF(%?`9v1hJ&L{JAFAKl>5+=`{+6BC5txhD>W=_N`Mf^Obgy2Vm|FoopzB<%fOtN zx5vE#*8%TO)!tBUZ5Cw;cCkF|*B6S#Xi0GHK*dig;}ioEK6bAX#KxLDXdIv2X?6olODa>`)SYlNv!s>y}QkNejJX3(&ghF zyd8XeXxXx)YH0sihoNeS+pztboPUOuqNUwi(luBm5`wJi-E@D7{mgRKfam(#@MvUw z_&O{I1?Z=4I0SYi;p{#>jP}0Kc78w*(zk~Kb_QQQNz7MW?wi<8u^jDBPNq(@ky#Pc z-Pd)W=%!Al>q_f#?LIe^4NHJRYJ4RE%5u`Gml_17Uy-GW9X`LJVejI84}4m0(1Tf1 ze>-psAu}Behx2tGGjo23sRVRNGY$HWLg@>ILLHZ$GtNz0X^6VC|-e-e-Up9)ULz&hMjMQ|+}itic*5q<+7ax0{H`Jw=BQBmnc zt6ns3KBSv6+Uzn@M@+SFp8lo1BSkl?8o9Z?bf8PmR-F}dZAI7)X>EUzFz>oXRzw&I zm{pxJBC)eS*!g1Zl6$YAK=rdc%#d*6`0m<+-^{3YCiL=C^QY2`KwcgXK$qK}MIZ2r z&}c$;Jbv1Vq*%f(YWV)mKejU?9A1jyf3Ia`r(qg$O+!Y^U`e2ReKPQ#9%N6;!2A(3 z>r2kkK#M3-P9tXX{2NRwHFdvBBxr0+LcN0wO1V?`zS7DSTt%hrrHgi@Z3Qn3$%vd7 zXXZ!n8j5f+x7@Xf$UQ`r%AqBR5k<40M z)8#t)Nre|qhCiO3wd9ThfCma67T!sw63+iT&g7csLr1dXkKVS|VtuT9#eNu%F-;ik^(=BH1QyD=iF~4J;Ew|C)gA7DktN*uP!u5f z$po=|jVRDn69w>mf+41}<7ke$5%GrmM00UpL#AfnM39xM47+W;byH$pbqh6hdyD^VADw~=H6`L=}u_U7sul28C%3)m{8 z5CuFiIbw5}qQl+kO+TUv+Q~}6v0*7nOnF#=EH$9}3VC9}4TpTVvsn&K5DO@PVYfvU zc{O}rr3L~{_(Vq!!P)(yk`F~HaYr%)&jbYoXP|&bEMV4Zn+OH8^}sf@VBpfhWs~6u zEC~Yx=tRH2SN`bgR|XW28ibUJ2BXL6$O|dpja^{vOo}L=Sslbg6>NGDEE5SzuODBvgtG9?5CosTggTLdXbDhwMhJz&THh6C)MJ1Ol0RwD@(3kBOIlRvsF zanceDDF?O`oPJz~NO=NlfPquo7WmK}jh=M+0@!Ij*roui!43?-9OA-esUZ6akSV;A zRxrSl0x3t@3>%O3VMq|a18P+yTlg*tSbK08gcKb2LAON)dC}oS1sN1@sT~D0b%R$< z&|3tHL|A|-*!&ej4Hr)u!vRMDC}ckm0xriY_k+#=`_}-bTi^!g*0}NJeXyMPg`@%) z#zQUG@1Ljj#F1EU_NUt_oay&9zZI`^t|-5xqcb!+=BneF`=Bg)jr+P)9Y#@F1<~p{ zr48+Hhr=C{DV=ZFka2tYDb0~1X%CAnw?g4*r@aJ@0LUaKQeZ%W_@dq;%X>Ybs00~qi8LM#ByYuL+OGzEm;6|Ym{Lze{M|I-GY!Ulj9aC9S?FdrP z#UNETbOBkL-xWwBJwv!uZ$K^evKm1L zWSe-E-Trk$ca(jZNtj8%s;k*bbEaH)Jwx*zxQeI_`=gM^-w0V?#BWyB=;-K&=@uA) zjYoZxm7lTqN3q++KeBaf*1L}m3!ndp2rf+Hgs+F#@6aiYjNVER6?nMnG=^t+>D}a- zBtVGE5DK_yhfgnZ75UKQ%iJm2BhgpLDLI}O^&)cVc?X7l*(Nl5ifS~ib8X%>jkpCy za*xdUt=*paNI7%aB@oRJzEVFkNMzhsuvSzSU%1W`MiYGxo~`60l=Vt2_ZgFO0Io3x ziTs9Jv3i={1=<{D;#QfFnC+QDQZ-Bid&ok8$=)Ig*wnJC0AnO2u!(eV+&an3pI!T3 zg^M=UKyrEaf%{WCkwZmnCoo{3{qdSW!dq%P(e#M-`<~{`&4t-Qg^N%0V2s1Zwh#nG za}|+@<#5$wFUEHsD8Q*M(|uc)bc57jKBx*&NVsoB)-#th|27W5WOa0 zrupT%^(gsoMFEXXhY!;IvD59tU~`EjF_3(heVyj!!uMmwMt#JeQ^q4@hz2UbsQ_5I z?rH15_oegbo3Ke63XohVs<+IOLM{39ro4+Q)S&oC$2SafWI>D(OeUQSoWxwGqC$ zYi2}d;>!?%MX$2Zrt8$R1D5c6Iv;ThuM?MEQ0C{sj4pKwIR`A|{D}F~)vlCMQe zku_O~K8ey2*Y4wi_-A*^Awq@9WXwY^hnhbnBbFo6x`VcztuBlHBkGh_6_^Rl)E8_% zS|KnW%x91kO@IomJ{nP4EtB;eZ+)`UNjBjd-5A+!Yo9$IcjLy66?~ia*tUzaM*(i8 zIQz{~3vGgO%&EQER^wSl9RFa+bCHkfKKZw1W7K8M?)@@7{@sURWt(um{0Zl)z%|I{z(HK1H}u@t3;kO7 z6eqd7oUI%hQyAkjF@NUU$jL|yyJ!YDh9nJceYf5%?LevBOfe#T)>Pb@1r(41g$aV; z3$Y(!Y%uVw19q?*_swK8{Wl@k=BzmN|0859cVF3Njg)=a{zRaB^ph6?Z49A{<2!9+ z((JC1?cq#yxqNYJfj~b0`bM~Y-N*7wX>Gw_civomf$i~KAvY92r@v$XFJ!zqt4akmcIDqhQX7FDa22x|{D{d8e!CJsJ4LreTU#$`3Ow5*l^4 z)>mX_CytPh*5^ukKybkzyMu{T$LZ#2}rqkkhIUdsG$!g@FY%1jRww_RE8t$7&S4wX~=XJ-l5DD2F?7ry`X`mgA*k@_cy;*AL$cA z6vi~d8CHO!5#pniBi6$t6Ylpo14}T)q+>F05ZidY!!sci=-YA|rlh$^zoYskjTi;k z(VT=Emffl!uu7E6*G+bSZ4xzAKSqpPW6%RVkc(3Lsh{>rH6+R*$B!8_i;wDpiHg+L z_xbD6YES@@JnmQu3h?qQ^8mTIHnJ2zjSyypc-B_q)Kt@atF-%l8Vu} z?ZPe>7~$;J@t3DnA{=OKC~h#=ZQb5|P6abG-zoIAll%A$Db6*Sf~;(#InW`=zHy|O z?$P(Cmckiia_PnXHW`zus~H=QLU#+Dy!jeQcB~)^R})mKFAGIgvlQ7j84JAWzAlB$ zRV?Cp_kk7!&agDNbXz1mxJ;E~JV~_iLOZQ|(93k%(&buxQ84Pqbgs2gzcqL*PHmI6N5-L_v4t{R)?|u) zA!We`9+qO8rC(SU!cL$(EN|lMo#Is@Ypzq12Ys_@rxXHRsa^y7e&@S4@tY#Q^Xh(g z0yvG_&n=SAU+Er(!ba4xGPEtCw&maMsr&d{@L1m;*p2tjHyjgL_7b_>Lcz9j|IM24 zk{`B2*(+LJ=?=5d3vHugdH$8Ju~l}P!fjp6ctOTwg&s;^SNIO&LeT>i{I{r~X zue{|1SL)j5ZrS5&TlQu)iMjPah=q3-wn|uTzQM}wh;bDsv+XCH&rPp*PVO#U{qmFU zxsZBC+&Pa{DM>n6%t?!lYsAuwIqm0ou9K+FdkTOg_3s3N^zUqdpGoT9`2*qK$p$}X zKfV(PBz`1*f6jmW#vu5f;5|vMoYf8{o_9BuKU{7#{P!;>oJ{cpA1_q^A`!HI?LiyNOTZSET_UCln8wu0)E zbBpet-%8ui#DB5{r;wS7^bym!8tQG>Jnf4oaOPvR6&T54xy+tOQhE0*E{rL-hMP0q zx%@(ecM#d9v7_tV^X=tP^JBZ`rh=CF4+m_0MEW=>-#ssYi@%|U^ed(hPt9#~e9aNh z^jOC~2J*3HJB8nab*i71tf$X(U;DRdG$sTQ3b98b7yr zH-L?6PUkqo;j6fIcjCKk=Gy#lIKCVE4`xlSO$Of|F>X%W=|5!LWPw7X!)g^XhmxA?x5B!(t4K2xhT5H2WMf zADF@>Z^ztipKQ&hKv3|3bvPu^d0eDV|KW;AZ1%l`K>z!$LztQ`>=yhVeY+z>TN->n z-8{sn$vU+jSg}b9;Nz9VBKyM(+X++hk8*O7Bp_YouhW|+^vOR+EeoJ=o>d=u2F?j< z;CGgDDmM#=fT_#BL9F{pr17;BF!HzZqgY0rKqP83+K`|6xlw6~XqS18HWMIw_kAM8_^Ba~gd=M3ChF zhb*-4YV&(7t4_)BsaR zjOjZ4esd04=l;8Ojb6B&{XK4ST;w38#-&I63iR3<*J=HDRHYJUBD)4Ru?us;FUVvr zl}unNYr69fF`s8Y->fa4c;COC1&D6p?T{1+W{VB?CQMHoF&b52-c&jF<`dgv_nCYE zz!?Mxw)0PK3v^EA>f_%2ut)Bd)I^w16^LeazVcwB1sboQCWYL(LpbjzQpKLIsnU{Z zYPWj~(YCzrkzQJ&KpZtKJ(f>`PaAUFQvsxSE&ay8{*@43k-ttn4l*_FI)6NTUi3U` zOy2&RUbB50VLwNYXS}n7Qv)|(9p#Z?Fu30F-kLC)HMnC!Enr|v#lL1DBBUkbVQBu| zlLZbOb4fb4)4QL@G&K#fdgl=58%CATB+O-N{3kFjS$8G+kI_GWp01K(D}mdyW}IsT?HMUy*S3Dd?DLE!s zIk!l}JJIvQNsN~->On_jmf!LO>)B`A1~>HSF%QqZ_ATL}Oq?Wfu}lhoC??ogSBtP#hYUOiSTX?_8C6TsEslbHB>3g*}rpYrnB4Q@U9wH_fU(m5V9VU5O1!2Q>4v^~ql=eq>;qHL46 z`hH^3`+BoH+%O+Gvc#QkL#p;T(wyx2qO==@JMJ}u)4uQ@flYN=sg+`m(;|IsZ;(r`HL zf+5?7f+ruaRnj;gKTz{DoOLTa&!Y5DzC>tXoufJfY7uMoscqHyvm?;X@*2IU`^RWv zxl%=KJJ3}xIQOO0N$CnZb^|k0s^fCKpV(xGHO}ioB~}bB75aIF%O>z~MXRST4pwFZ z9liMuym9Gj4a4W+|Y#4=n!3boX%p_4I&*J4ytzncq25x;pVNBX7Uwx zW^!`(gvda$1gp2QrX)NcckbMdY||!i8jaNZW+U>Mc!gZVc33bupZ}wE(wO2s3(9C* z(x6uLSmRsgB_^c$1;nM7wuwres?_H;ZYkunE za==8o>`3Q;7uHFd?ODwjzuu(qt)S=Cb+=GM?~us8|@uULp2O>^BwX`wA)($aJ&3pY^neIo38$%@Sm;xKT`f*)G;+xZ(Cad zUc<3Qg1GbVNTVdPWEN9*AWB|_GR!?Pv?Vnq=AUKn=<)IKiCXzFr==y#<2!J}FUnIS zOyAh6Fk16s?bNKF&ip<@{W2A?@w)(3zZZd%?DnNw>+~Hr#bm3YDo-vjKq}<+h^Nh@^>}ZvBPkKY0D8 zC*^;%<~v2SJ$)N`-rYJo%JNgr58HBGejmwo`x7}{ zu&iX;%sVNdAD`2C(qKRNn_8d^S9c+2(M`Ss17-|!27Vkf`sxpDnvosny|l?bj*oX_ z6cl(S1!lXuo-fdUlr@2HrX+^%;Bi6jZ;*U^%R{hI&613)_@eiwuMANwT2VNctPDqQ zu%RVLDIF)##-n4Hm2CVfWgH}XIng9wEwiFeSvGHYY*a8Ip0igsYS)CHiBKkocSGTl zQ=*emCaD^hkt#f87-Xzk!V$(3;|@z;LP^`HZqpIgIlNMnR%vT`oL?svmtN8@b8%JE zFfuN$vK}O$fd|Ao9mz#>fDm_&(Sd+O+U&Oo6fjB0U;>J2DBuHZGLhlPkm1{|*V}2D zmvDxkx{qXY*~&VS*Xepv+pe*};mEG=k!lh4yEkAK>zypiYc7LL2%F4y%0qT~b7*#F z_Jo0lv74D-P+48#1*hj&MQGcfZazff=rX81&x#Dp29v&cjLTp3p$$_3FMPUqXBVL1 z?FY|h`qrl3p9@|VRu%4NN`T?-hwU2V$1j9y+FTAZh}(+9YA}_yb}-&&Cd8rDSqNGe zyxu#Ho9kY{JbPDYuRNS&w9b?gck(hNyC0u0bK;=3Ea145hfhSm!su(Q)O$06b?2>a7HM)$MhvhUXO~b56>A7@LcwNkL6`YidH`{_7%D^ zt-xu&j#XAVVADgDtXuB?+-FduDMC6{7(V#!?q`Y!?kBD}rF0t8hdpvtZQkWcRFBjn zUJWHpUQEu$eBIW9V=W7K+Nb0$J1=diUM+SY+Zr|!+^N3XBVHeEPS5$#A5Wg4!Gmf= z|EZlQPYFfBaI^ZlVs8%ABJc$n; ojg^ivn5`K`-Je0!pe?d{{oY?a?Yo|q|3>cmbdmoK83Q%^Z-H+b#Q*>R literal 0 HcmV?d00001 diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu index 1e268e2..123b9ac 100644 --- a/Project2-Stream-Compaction/stream_compaction/efficient.cu +++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu @@ -17,7 +17,7 @@ namespace StreamCompaction { __global__ void kernMapToBoolean(int N, int* arr, int* boolArr) { int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index > N) return; + if (index >= N) return; boolArr[index] = arr[index]; if (boolArr[index] != 0) @@ -29,10 +29,10 @@ namespace StreamCompaction { __global__ void kernUpSweep(int N, int d, int* arr) { int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index > N) return; + if (index >= N) return; - int powDPlus = pow(float(2), float(d + 1)); - int powD = pow(float(2), float(d)); + int powDPlus = 1 << (d+1); + int powD = 1 << d; if (index % powDPlus == 0) { @@ -49,8 +49,8 @@ namespace StreamCompaction { int index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index > N) return; - int powDPlus = pow(float(2), float(d + 1)); - int powD = pow(float(2), float(d)); + int powDPlus = 1 << (d + 1); + int powD = 1 << d; if (index % powDPlus == 0) { @@ -63,7 +63,7 @@ namespace StreamCompaction { __global__ void kernInclusiveToExclusive(int N, int* arr) { int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index > N) return; + if (index >= N) return; arr[index] -= arr[0]; } @@ -71,7 +71,7 @@ namespace StreamCompaction { __global__ void kernScatter(int N, int* idata, int* boolArr, int* scanArr, int* odata) { int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index > N) return; + if (index >= N) return; if (boolArr[index] == 1) { @@ -84,10 +84,18 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int pow2Length = 1 << ilog2ceil(n); + int* idataPow2 = new int[pow2Length]; + memcpy(idataPow2, idata, n * sizeof(int)); + for (int i = n; i < pow2Length; i++) + { + idataPow2[i] = 0; + } + int* dev_arr; - cudaMalloc((void**)&dev_arr, n * sizeof(int)); - cudaMemcpy(dev_arr, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMalloc((void**)&dev_arr, pow2Length * sizeof(int)); + cudaMemcpy(dev_arr, idataPow2, sizeof(int) * n, cudaMemcpyHostToDevice); timer().startGpuTimer(); @@ -103,10 +111,10 @@ namespace StreamCompaction { kernInclusiveToExclusive << > > (n, dev_arr); - cudaMemcpy(odata, dev_arr, sizeof(int) * n, cudaMemcpyDeviceToHost); - timer().endGpuTimer(); + cudaMemcpy(odata, dev_arr, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaFree(dev_arr); } From a6fa8cef8b30e08486fe36bc4e81259d3a382bf2 Mon Sep 17 00:00:00 2001 From: tabathah Date: Mon, 16 Sep 2019 16:17:38 -0400 Subject: [PATCH 3/7] fix to stream compaction plus untested impl of char recognition --- .../character_recognition/CMakeLists.txt | 2 +- .../character_recognition/mlp.cu | 197 ++++++++++++++++++ Project2-Character-Recognition/src/main.cpp | 13 +- Project2-Stream-Compaction/src/main.cpp | 2 +- .../stream_compaction/efficient.cu | 8 +- 5 files changed, 213 insertions(+), 9 deletions(-) diff --git a/Project2-Character-Recognition/character_recognition/CMakeLists.txt b/Project2-Character-Recognition/character_recognition/CMakeLists.txt index 7446175..01edd01 100644 --- a/Project2-Character-Recognition/character_recognition/CMakeLists.txt +++ b/Project2-Character-Recognition/character_recognition/CMakeLists.txt @@ -7,5 +7,5 @@ set(SOURCE_FILES cuda_add_library(character_recognition ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_30 ) diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index 5a3ed7f..a06e429 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -1,5 +1,6 @@ #include #include +#include #include "common.h" #include "mlp.h" @@ -23,5 +24,201 @@ namespace CharacterRecognition { } */ +#define blockSize 128 + + __host__ __device__ unsigned int hash(unsigned int a) { + a = (a + 0x7ed55d16) + (a << 12); + a = (a ^ 0xc761c23c) ^ (a >> 19); + a = (a + 0x165667b1) + (a << 5); + a = (a + 0xd3a2646c) ^ (a << 9); + a = (a + 0xfd7046c5) + (a << 3); + a = (a ^ 0xb55a4f09) ^ (a >> 16); + return a; + } + + __host__ __device__ float genRandom(int index) { + thrust::default_random_engine rng(hash((int)(index))); + thrust::uniform_real_distribution unitDistrib(-1, 1); + + return (float)unitDistrib(rng); + } + + __global__ void kernInitRandomWeights(int N, float* wtMat, float scale) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + float rand = genRandom(index); + wtMat[index] = scale * rand; + } + } + + __global__ void kernInitZero(int N, float* data) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) + { + data[index] = 0; + } + } + + __global__ void kernSumWeights(int iDim, int oDim, float* wtMat, float* idata, float* odata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= oDim) { return; } + + int row = index * iDim; + for (int idx = 0; idx < iDim; idx++) + { + int wtIdx = idx + row; + odata[index] += wtMat[wtIdx] * idata[idx]; + } + } + + __global__ void kernActivationFxn(int N, float* idata, float* odata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { return; } + + float x = idata[index]; + odata[index] = 1.0f / (1.0f + exp(-x)); + } + + __global__ void kernCalcErrors(int N, float* target, float* output, float* odata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { return; } + + odata[index] = target[index] - output[index]; + } + + __global__ void kernEditWeightsji(int N, int jDim, float lambda, float* hidden, float* errors, float* outputSums, + float* partialErr, float* wtMat) + { + // for hidden to output weights: + // delta = lambda * value of hidden node * (target - output) * derivative of f(x) (where x is the sum before it went in f(x) or is just the output??) + // derivative of f = f * (1-f) + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { return; } + + int i = index % jDim; + int j = index - i; + + float x = outputSums[i]; + float fx = 1.0f / (1.0f + exp(-x)); + partialErr[i] = errors[i] * fx * (1 - fx); + float deltaW = lambda * hidden[j] * partialErr[i]; + + wtMat[index] += deltaW; + } + + __global__ void kernEditWeightskj(int N, int kDim, int jDim, int iDim, float lambda, float* input, float* hiddenSums, + float* partialErr, float* wji, + float* wtMat) + { + // for hidden to output weights: + // delta = lambda * value of input node * derivative of f(x) * + // derivative of f = f * (1-f) + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { return; } + + int j = index % kDim; + int k = index - j; + + float sumPropErrs = 0; + for (int i = 0; i < iDim; i++) + { + sumPropErrs += partialErr[i] * wji[j + i * jDim]; + } + + float x = hiddenSums[j]; + float fx = 1.0f / (1.0f + exp(-x)); + float deltaW = lambda * input[k] * sumPropErrs * fx * (1 - fx); + + wtMat[index] += deltaW; + } + // TODO: implement required elements for MLP sections 1 and 2 here + void mlpTrain(int i, int j, int k, float* odata, float* idata, float* wkj, float* wji, float* target) + { + float *dev_input, *dev_hidden, *dev_output; + float *dev_hiddenSums, *dev_outputSums; + float *dev_wkj, *dev_wji; + float *dev_target, *dev_errors, *dev_partialErr, *dev_tempwji; + + cudaMalloc((void**)&dev_input, k * sizeof(float)); + cudaMalloc((void**)&dev_hidden, j * sizeof(float)); + cudaMalloc((void**)&dev_output, i * sizeof(float)); + cudaMemcpy(dev_input, idata, i * sizeof(float), cudaMemcpyHostToDevice); + + cudaMalloc((void**)&dev_hiddenSums, j * sizeof(float)); + cudaMalloc((void**)&dev_outputSums, i * sizeof(float)); + + cudaMalloc((void**)&dev_wkj, k * j * sizeof(float)); + cudaMalloc((void**)&dev_wji, j * i * sizeof(float)); + + cudaMalloc((void**)&dev_target, i * sizeof(float)); + cudaMalloc((void**)&dev_errors, i * sizeof(float)); + cudaMalloc((void**)&dev_partialErr, i * sizeof(float)); + cudaMalloc((void**)&dev_tempwji, i * j * sizeof(float)); + cudaMemcpy(dev_target, target, i * sizeof(float), cudaMemcpyHostToDevice); + + //cudaMemcpy(dev_wkj, wkj, k * j * sizeof(float), cudaMemcpyHostToDevice); + //cudaMemcpy(dev_wji, wji, j * i * sizeof(float), cudaMemcpyHostToDevice); + + dim3 ifullBlocksPerGrid((i + blockSize - 1) / blockSize); + dim3 jfullBlocksPerGrid((j + blockSize - 1) / blockSize); + dim3 kfullBlocksPerGrid((k + blockSize - 1) / blockSize); + dim3 wkjfullBlocksPerGrid((k*j + blockSize - 1) / blockSize); + dim3 wjifullBlocksPerGrid((j*i + blockSize - 1) / blockSize); + + // initialize non input buffers to zeros and give weight matrices random values + kernInitRandomWeights << > > (k*j, dev_wkj, 100); + kernInitRandomWeights << > > (j*i, dev_wji, 100); + + kernInitZero << > > (j, dev_hidden); + kernInitZero << > > (i, dev_output); + + // input -> hidden + kernSumWeights << > > (k, j, dev_wkj, dev_input, dev_hiddenSums); + kernActivationFxn << > > (j, dev_hiddenSums, dev_hidden); + + // hidden -> output + kernSumWeights << > > (j, i, dev_wji, dev_hidden, dev_outputSums); + kernActivationFxn << > > (i, dev_outputSums, dev_output); + + // calculate error, lambda + kernCalcErrors << > > (i, dev_target, dev_output, dev_errors); + + float* errs = new float[i]; + cudaMemcpy(errs, dev_errors, i * sizeof(float), cudaMemcpyDeviceToHost); + float sumErr = 0; + for (int e = 0; e < i; e++) + { + sumErr += (errs[e]*errs[e]); + } + float lambda = sumErr/2.0f; + + // update weights + cudaMemcpy(dev_tempwji, dev_wji, j * i * sizeof(float), cudaMemcpyDeviceToDevice); + kernEditWeightsji << > > (j*i, j, lambda, dev_hidden, dev_errors, dev_outputSums, + dev_partialErr, dev_wji); + kernEditWeightskj << > > (k*j, k, j, i, lambda, dev_input, dev_hiddenSums, dev_partialErr, + dev_tempwji, dev_wkj); + + cudaMemcpy(odata, dev_output, i * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(wkj, dev_wkj, k * j * sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(wji, dev_wji, j * i * sizeof(float), cudaMemcpyDeviceToHost); + + cudaFree(dev_input); + cudaFree(dev_hidden); + cudaFree(dev_output); + + cudaFree(dev_hiddenSums); + cudaFree(dev_outputSums); + + cudaFree(dev_wkj); + cudaFree(dev_wji); + } } diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index 11dd534..26b1154 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -18,9 +18,16 @@ int *b = new int[SIZE]; int *c = new int[SIZE]; int main(int argc, char* argv[]) { + float *xorInput = new float[2]; + xorInput[0] = 0; + xorInput[1] = 0; + float *xorTarget = new float[1]; + xorTarget[0] = 0; + + // Scan tests - printf("\n"); + /*printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); printf("****************\n"); @@ -58,7 +65,7 @@ int main(int argc, char* argv[]) { StreamCompaction::Naive::scan(SIZE, c, a); printArray(SIZE, c, true); */ - zeroArray(SIZE, c); + /*zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); @@ -143,7 +150,7 @@ int main(int argc, char* argv[]) { count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + printCmpLenResult(count, expectedNPOT, b, c);*/ system("pause"); // stop Win32 console from closing on exit delete[] a; diff --git a/Project2-Stream-Compaction/src/main.cpp b/Project2-Stream-Compaction/src/main.cpp index d016553..1058dc6 100644 --- a/Project2-Stream-Compaction/src/main.cpp +++ b/Project2-Stream-Compaction/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 14; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu index 123b9ac..8e090a8 100644 --- a/Project2-Stream-Compaction/stream_compaction/efficient.cu +++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu @@ -47,7 +47,7 @@ namespace StreamCompaction { __global__ void kernDownSweep(int N, int d, int* arr) { int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index > N) return; + if (index >= N) return; int powDPlus = 1 << (d + 1); int powD = 1 << d; @@ -101,15 +101,15 @@ namespace StreamCompaction { for (int i = 0; i < ilog2ceil(n); i++) { - kernUpSweep << > > (n, i, dev_arr); + kernUpSweep << > > (pow2Length, i, dev_arr); } for (int j = ilog2ceil(n)-1; j >= 0; j--) { - kernDownSweep << > > (n, j, dev_arr); + kernDownSweep << > > (pow2Length, j, dev_arr); } - kernInclusiveToExclusive << > > (n, dev_arr); + kernInclusiveToExclusive << > > (pow2Length, dev_arr); timer().endGpuTimer(); From f840647d03c84f29166d66faf9bd3d4df275630d Mon Sep 17 00:00:00 2001 From: tabathah Date: Tue, 17 Sep 2019 00:59:25 -0400 Subject: [PATCH 4/7] somewhat working neural net --- Project2-Character-Recognition/README.md | 27 ++- .../character_recognition/mlp.cu | 129 ++++++++--- .../character_recognition/mlp.h | 5 + Project2-Character-Recognition/src/main.cpp | 217 ++++++------------ 4 files changed, 193 insertions(+), 185 deletions(-) diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md index 4503fac..22bae0c 100644 --- a/Project2-Character-Recognition/README.md +++ b/Project2-Character-Recognition/README.md @@ -3,12 +3,27 @@ CUDA Character Recognition **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Tabatha Hickman + * LinkedIn:https://www.linkedin.com/in/tabatha-hickman-335987140/ +* Tested on: Windows 10 Pro, i7-5600U CPU @ 2.60GHz 16GB, GeForce 840M (personal computer) -### (TODO: Your README) +## Neural Network Implementation -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +This project's purpose was to create a neural network which does its computations on the GPU. I created a multi-layer perceptron with one hidden layer, so in total there are 3 layers (input, hidden, output). We evaluate the network by feeding information forward to the next layer. To process each new layer, I performed a summation for each output node on all the input nodes multiplied by the corresponding weight between those two nodes, then ran that sum through an activation function. In this case our function was ```f(x) = 1/(1+e^-x)```. + +We want to find the best set of weights so that the outputs of the network are as accurate as possible. We do this by entering a training phase. First we start with random values for the weights. Then, provided with inputs and corresponding target outputs, we run the inputs through the network and compare the outputs with their targets and find the error associated. Then through backward propagation, we can go through each weight and update it based on the results so that next time the output is more accurate. + +Once the network has been trained adequately, we can run new inputs on it and see if we get some good results. + +Using provided weights for a working XOR neural network, I was able to verify my code correctly feeds forward and builds the network. I've been able to produce my own fairly accurate weights for XOR, but they certainly aren't perfect and it already takes 1000s of iterations of training to get as close as they are. + +``` +Ran 13101 iterations of training + (0, 0) expected: 0.000000, result 0.071486 + (0, 1) expected: 1.000000, result 0.930205 + (1, 0) expected: 1.000000, result 0.923021 + (1, 1) expected: 0.000000, result 0.063928 +``` + + diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index a06e429..c5ae05a 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -24,7 +24,7 @@ namespace CharacterRecognition { } */ -#define blockSize 128 + #define blockSize 128 __host__ __device__ unsigned int hash(unsigned int a) { a = (a + 0x7ed55d16) + (a << 12); @@ -36,8 +36,8 @@ namespace CharacterRecognition { return a; } - __host__ __device__ float genRandom(int index) { - thrust::default_random_engine rng(hash((int)(index))); + __host__ __device__ float genRandom(float time, int index) { + thrust::default_random_engine rng(hash((int)(index * time))); thrust::uniform_real_distribution unitDistrib(-1, 1); return (float)unitDistrib(rng); @@ -47,7 +47,7 @@ namespace CharacterRecognition { { int index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index < N) { - float rand = genRandom(index); + float rand = genRandom(N, index); wtMat[index] = scale * rand; } } @@ -66,10 +66,9 @@ namespace CharacterRecognition { int index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index >= oDim) { return; } - int row = index * iDim; for (int idx = 0; idx < iDim; idx++) { - int wtIdx = idx + row; + int wtIdx = idx * oDim + index; odata[index] += wtMat[wtIdx] * idata[idx]; } } @@ -80,7 +79,8 @@ namespace CharacterRecognition { if (index >= N) { return; } float x = idata[index]; - odata[index] = 1.0f / (1.0f + exp(-x)); + float e = exp(-x); + odata[index] = 1.0f / (1.0f + e); } __global__ void kernCalcErrors(int N, float* target, float* output, float* odata) @@ -91,7 +91,7 @@ namespace CharacterRecognition { odata[index] = target[index] - output[index]; } - __global__ void kernEditWeightsji(int N, int jDim, float lambda, float* hidden, float* errors, float* outputSums, + __global__ void kernEditWeightsji(int N, int iDim, float lambda, float* hidden, float* errors, float* outputSums, float* partialErr, float* wtMat) { // for hidden to output weights: @@ -101,8 +101,8 @@ namespace CharacterRecognition { int index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index >= N) { return; } - int i = index % jDim; - int j = index - i; + int i = index % iDim; + int j = index / iDim; float x = outputSums[i]; float fx = 1.0f / (1.0f + exp(-x)); @@ -112,7 +112,7 @@ namespace CharacterRecognition { wtMat[index] += deltaW; } - __global__ void kernEditWeightskj(int N, int kDim, int jDim, int iDim, float lambda, float* input, float* hiddenSums, + __global__ void kernEditWeightskj(int N, int jDim, int iDim, float lambda, float* input, float* hiddenSums, float* partialErr, float* wji, float* wtMat) { @@ -123,8 +123,8 @@ namespace CharacterRecognition { int index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index >= N) { return; } - int j = index % kDim; - int k = index - j; + int j = index % jDim; + int k = index / jDim; float sumPropErrs = 0; for (int i = 0; i < iDim; i++) @@ -139,8 +139,19 @@ namespace CharacterRecognition { wtMat[index] += deltaW; } + void makeWeightMat(int n, float* data) + { + float* dev_data; + cudaMalloc((void**)&dev_data, n * sizeof(float)); + + kernInitRandomWeights << > > (n, dev_data, 30); + + cudaMemcpy(data, dev_data, n * sizeof(float), cudaMemcpyDeviceToHost); + cudaFree(dev_data); + } + // TODO: implement required elements for MLP sections 1 and 2 here - void mlpTrain(int i, int j, int k, float* odata, float* idata, float* wkj, float* wji, float* target) + float mlpTrain(int i, int j, int k, float* odata, float* idata, float* wkj, float* wji, float* target) { float *dev_input, *dev_hidden, *dev_output; float *dev_hiddenSums, *dev_outputSums; @@ -150,13 +161,15 @@ namespace CharacterRecognition { cudaMalloc((void**)&dev_input, k * sizeof(float)); cudaMalloc((void**)&dev_hidden, j * sizeof(float)); cudaMalloc((void**)&dev_output, i * sizeof(float)); - cudaMemcpy(dev_input, idata, i * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(dev_input, idata, k * sizeof(float), cudaMemcpyHostToDevice); cudaMalloc((void**)&dev_hiddenSums, j * sizeof(float)); cudaMalloc((void**)&dev_outputSums, i * sizeof(float)); cudaMalloc((void**)&dev_wkj, k * j * sizeof(float)); cudaMalloc((void**)&dev_wji, j * i * sizeof(float)); + cudaMemcpy(dev_wkj, wkj, k * j * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(dev_wji, wji, j * i * sizeof(float), cudaMemcpyHostToDevice); cudaMalloc((void**)&dev_target, i * sizeof(float)); cudaMalloc((void**)&dev_errors, i * sizeof(float)); @@ -164,32 +177,20 @@ namespace CharacterRecognition { cudaMalloc((void**)&dev_tempwji, i * j * sizeof(float)); cudaMemcpy(dev_target, target, i * sizeof(float), cudaMemcpyHostToDevice); - //cudaMemcpy(dev_wkj, wkj, k * j * sizeof(float), cudaMemcpyHostToDevice); - //cudaMemcpy(dev_wji, wji, j * i * sizeof(float), cudaMemcpyHostToDevice); - - dim3 ifullBlocksPerGrid((i + blockSize - 1) / blockSize); - dim3 jfullBlocksPerGrid((j + blockSize - 1) / blockSize); - dim3 kfullBlocksPerGrid((k + blockSize - 1) / blockSize); - dim3 wkjfullBlocksPerGrid((k*j + blockSize - 1) / blockSize); - dim3 wjifullBlocksPerGrid((j*i + blockSize - 1) / blockSize); - - // initialize non input buffers to zeros and give weight matrices random values - kernInitRandomWeights << > > (k*j, dev_wkj, 100); - kernInitRandomWeights << > > (j*i, dev_wji, 100); - - kernInitZero << > > (j, dev_hidden); - kernInitZero << > > (i, dev_output); + // initialize non input buffers to zeros + kernInitZero << > > (j, dev_hidden); + kernInitZero << > > (i, dev_output); // input -> hidden - kernSumWeights << > > (k, j, dev_wkj, dev_input, dev_hiddenSums); - kernActivationFxn << > > (j, dev_hiddenSums, dev_hidden); + kernSumWeights << > > (k, j, dev_wkj, dev_input, dev_hiddenSums); + kernActivationFxn << > > (j, dev_hiddenSums, dev_hidden); // hidden -> output - kernSumWeights << > > (j, i, dev_wji, dev_hidden, dev_outputSums); - kernActivationFxn << > > (i, dev_outputSums, dev_output); + kernSumWeights << > > (j, i, dev_wji, dev_hidden, dev_outputSums); + kernActivationFxn << > > (i, dev_outputSums, dev_output); // calculate error, lambda - kernCalcErrors << > > (i, dev_target, dev_output, dev_errors); + kernCalcErrors << > > (i, dev_target, dev_output, dev_errors); float* errs = new float[i]; cudaMemcpy(errs, dev_errors, i * sizeof(float), cudaMemcpyDeviceToHost); @@ -198,13 +199,14 @@ namespace CharacterRecognition { { sumErr += (errs[e]*errs[e]); } - float lambda = sumErr/2.0f; + sumErr /= 2.0f; + float lambda = sumErr; // update weights cudaMemcpy(dev_tempwji, dev_wji, j * i * sizeof(float), cudaMemcpyDeviceToDevice); - kernEditWeightsji << > > (j*i, j, lambda, dev_hidden, dev_errors, dev_outputSums, + kernEditWeightsji << > > (j*i, i, lambda, dev_hidden, dev_errors, dev_output, dev_partialErr, dev_wji); - kernEditWeightskj << > > (k*j, k, j, i, lambda, dev_input, dev_hiddenSums, dev_partialErr, + kernEditWeightskj << > > (k*j, j, i, lambda, dev_input, dev_hidden, dev_partialErr, dev_tempwji, dev_wkj); cudaMemcpy(odata, dev_output, i * sizeof(float), cudaMemcpyDeviceToHost); @@ -220,5 +222,56 @@ namespace CharacterRecognition { cudaFree(dev_wkj); cudaFree(dev_wji); + + cudaFree(dev_target); + cudaFree(dev_errors); + cudaFree(dev_partialErr); + cudaFree(dev_tempwji); + + return sumErr; + } + + void mlpRun(int i, int j, int k, float* odata, float* idata, float* wkj, float* wji) + { + float *dev_input, *dev_hidden, *dev_output; + float *dev_hiddenSums, *dev_outputSums; + float *dev_wkj, *dev_wji; + + cudaMalloc((void**)&dev_input, k * sizeof(float)); + cudaMalloc((void**)&dev_hidden, j * sizeof(float)); + cudaMalloc((void**)&dev_output, i * sizeof(float)); + cudaMemcpy(dev_input, idata, k * sizeof(float), cudaMemcpyHostToDevice); + + cudaMalloc((void**)&dev_hiddenSums, j * sizeof(float)); + cudaMalloc((void**)&dev_outputSums, i * sizeof(float)); + + cudaMalloc((void**)&dev_wkj, k * j * sizeof(float)); + cudaMalloc((void**)&dev_wji, j * i * sizeof(float)); + cudaMemcpy(dev_wkj, wkj, k * j * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(dev_wji, wji, j * i * sizeof(float), cudaMemcpyHostToDevice); + + // initialize non input buffers to zeros + kernInitZero << > > (j, dev_hidden); + kernInitZero << > > (i, dev_output); + + // input -> hidden + kernSumWeights << > > (k, j, dev_wkj, dev_input, dev_hiddenSums); + kernActivationFxn << > > (j, dev_hiddenSums, dev_hidden); + + // hidden -> output + kernSumWeights << > > (j, i, dev_wji, dev_hidden, dev_outputSums); + kernActivationFxn << > > (i, dev_outputSums, dev_output); + + cudaMemcpy(odata, dev_output, i * sizeof(float), cudaMemcpyDeviceToHost); + + cudaFree(dev_input); + cudaFree(dev_hidden); + cudaFree(dev_output); + + cudaFree(dev_hiddenSums); + cudaFree(dev_outputSums); + + cudaFree(dev_wkj); + cudaFree(dev_wji); } } diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h index 2096228..52b75ac 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.h +++ b/Project2-Character-Recognition/character_recognition/mlp.h @@ -5,5 +5,10 @@ namespace CharacterRecognition { Common::PerformanceTimer& timer(); + void makeWeightMat(int n, float* data); + // TODO: implement required elements for MLP sections 1 and 2 here + float mlpTrain(int i, int j, int k, float* odata, float* idata, float* wkj, float* wji, float* target); + + void mlpRun(int i, int j, int k, float* odata, float* idata, float* wkj, float* wji); } diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index 26b1154..0f1906e 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -11,149 +11,84 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; - int main(int argc, char* argv[]) { - float *xorInput = new float[2]; - xorInput[0] = 0; - xorInput[1] = 0; - float *xorTarget = new float[1]; - xorTarget[0] = 0; - + float *xorInput1 = new float[3]; + xorInput1[0] = 0; + xorInput1[1] = 0; + xorInput1[2] = 1; //bias + + float *xorTarget1 = new float[1]; + xorTarget1[0] = 0; + + float *xorInput2 = new float[3]; + xorInput2[0] = 0; + xorInput2[1] = 1; + xorInput2[2] = 1; + + float *xorTarget2 = new float[1]; + xorTarget2[0] = 1; + + float *xorInput3 = new float[3]; + xorInput3[0] = 1; + xorInput3[1] = 0; + xorInput3[2] = 1; + + float *xorTarget3 = new float[1]; + xorTarget3[0] = 1; + + float *xorInput4 = new float[3]; + xorInput4[0] = 1; + xorInput4[1] = 1; + xorInput4[2] = 1; + + float *xorTarget4 = new float[1]; + xorTarget4[0] = 0; + + float *wkj = new float[9]; + float *wji = new float[3]; + CharacterRecognition::makeWeightMat(9, wkj); + CharacterRecognition::makeWeightMat(3, wji); + // testing values from spreadsheet, + // make sure to change j and k to 2 to get rid of bias + /*wkj[0] = 10.1; + wkj[1] = 0.9; + wkj[2] = 20; + wkj[3] = 0.87; + wji[0] = 41; + wji[1] = -54;*/ + + + float *xorOutput = new float[1]; + + int i = 1; + int j = 3; + int k = 3; - // Scan tests - - /*printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. - zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - - /*zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c);*/ + //training + float tgtError = 0.01f; + float currError = 100000.0f; + int count = 0; + while (currError > tgtError && count < 15000) + { + currError = 0; + currError += CharacterRecognition::mlpTrain(i, j, k, xorOutput, xorInput1, wkj, wji, xorTarget1); + currError += CharacterRecognition::mlpTrain(i, j, k, xorOutput, xorInput2, wkj, wji, xorTarget2); + currError += CharacterRecognition::mlpTrain(i, j, k, xorOutput, xorInput3, wkj, wji, xorTarget3); + currError += CharacterRecognition::mlpTrain(i, j, k, xorOutput, xorInput4, wkj, wji, xorTarget4); + count++; + } + + //test + printf("Ran %d iterations of training\n", count); + CharacterRecognition::mlpRun(i, j, k, xorOutput, xorInput1, wkj, wji); + printf(" (0, 0) expected: %f, result %f\n", xorTarget1[0], xorOutput[0]); + CharacterRecognition::mlpRun(i, j, k, xorOutput, xorInput2, wkj, wji); + printf(" (0, 1) expected: %f, result %f\n", xorTarget2[0], xorOutput[0]); + CharacterRecognition::mlpRun(i, j, k, xorOutput, xorInput3, wkj, wji); + printf(" (1, 0) expected: %f, result %f\n", xorTarget3[0], xorOutput[0]); + CharacterRecognition::mlpRun(i, j, k, xorOutput, xorInput4, wkj, wji); + printf(" (1, 1) expected: %f, result %f\n", xorTarget4[0], xorOutput[0]); system("pause"); // stop Win32 console from closing on exit - delete[] a; - delete[] b; - delete[] c; + } From c5f21f4afd15494c4f73f6af620ed8bb17a2dc0d Mon Sep 17 00:00:00 2001 From: tabathah Date: Tue, 17 Sep 2019 13:41:15 -0400 Subject: [PATCH 5/7] best shot at char recognition --- Project2-Character-Recognition/README.md | 3 +- .../character_recognition/mlp.cu | 2 + Project2-Character-Recognition/src/main.cpp | 82 ++++++++++++++++++- .../src/testing_helpers.hpp | 2 +- 4 files changed, 86 insertions(+), 3 deletions(-) diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md index 22bae0c..83f9979 100644 --- a/Project2-Character-Recognition/README.md +++ b/Project2-Character-Recognition/README.md @@ -15,7 +15,7 @@ We want to find the best set of weights so that the outputs of the network are a Once the network has been trained adequately, we can run new inputs on it and see if we get some good results. -Using provided weights for a working XOR neural network, I was able to verify my code correctly feeds forward and builds the network. I've been able to produce my own fairly accurate weights for XOR, but they certainly aren't perfect and it already takes 1000s of iterations of training to get as close as they are. +Using provided weights for a working XOR neural network, I was able to verify my code correctly feeds forward and builds the network. I've also been able to produce my own fairly accurate weights for XOR: (This had a target error of 0.01) ``` Ran 13101 iterations of training @@ -25,5 +25,6 @@ Ran 13101 iterations of training (1, 1) expected: 0.000000, result 0.063928 ``` +Unfortunately, I was having a lot of trouble extending this to character recognition. Training does not seem to be working - the error is huge and doesn't improve at all with further iterations. I attempted to debug this and started getting "CUDA grid launch failed" errors. Upon looking this up I found out this has to do with the TDR of my Debugger, but I can't find the place to change that setting. diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index c5ae05a..9e95acb 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -141,6 +141,8 @@ namespace CharacterRecognition { void makeWeightMat(int n, float* data) { + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + float* dev_data; cudaMalloc((void**)&dev_data, n * sizeof(float)); diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index 0f1906e..a2afaa4 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -7,11 +7,37 @@ */ #include +#include +#include #include #include #include "testing_helpers.hpp" +void readFromFile(int idx, float* inputArr) +{ + std::string fileName = std::to_string(idx) + "info.txt"; + if (idx < 10) { fileName = std::to_string(0) + fileName; } + fileName = "../data-set/" + fileName; + + std::ifstream infile(fileName); + + int n1, n2, count; + float x; + count = 0; + + if (!(infile >> n1 >> n2)) { printf("Error reading first two lines of file %s\n", fileName); } + + while (infile >> x && count < 10201) + { + inputArr[count] = x; + count++; + } +} + int main(int argc, char* argv[]) { + // XOR TESTING + printf("XOR TESTING\n"); + float *xorInput1 = new float[3]; xorInput1[0] = 0; xorInput1[1] = 0; @@ -89,6 +115,60 @@ int main(int argc, char* argv[]) { CharacterRecognition::mlpRun(i, j, k, xorOutput, xorInput4, wkj, wji); printf(" (1, 1) expected: %f, result %f\n", xorTarget4[0], xorOutput[0]); + + // CHAR RECOG TESTING + printf("CHAR RECOG TESTING\n"); + + i = 52; + j = 10202; + k = 10202; // +1 for bias + + float *CRwkj = new float[k*j]; + float *CRwji = new float[j*i]; + CharacterRecognition::makeWeightMat(k*j, wkj); + CharacterRecognition::makeWeightMat(j*i, wji); + + float *CRoutput = new float[i]; + + tgtError = 0.01f; + currError = 100000.0f; + count = 0; + while (currError > tgtError && count < 10) + { + currError = 0; + for (int f = 0; f < 52; f++) + { + float* tgt = new float[i]; + zeroArray(i, tgt); + tgt[f] = 1; + + float* input = new float[k]; + readFromFile(f + 1, input); + input[k-1] = 1; + + currError += CharacterRecognition::mlpTrain(i, j, k, CRoutput, input, CRwkj, CRwji, tgt); + + delete[] input; + delete[] tgt; + } + printf("After %d iterations, error = %f\n", count, currError); + count++; + } + system("pause"); // stop Win32 console from closing on exit - + delete[] xorInput1; + delete[] xorTarget1; + delete[] xorInput2; + delete[] xorTarget2; + delete[] xorInput3; + delete[] xorTarget3; + delete[] xorInput4; + delete[] xorTarget4; + delete[] xorOutput; + delete[] wkj; + delete[] wji; + + delete[] CRoutput; + delete[] CRwkj; + delete[] CRwji; } diff --git a/Project2-Character-Recognition/src/testing_helpers.hpp b/Project2-Character-Recognition/src/testing_helpers.hpp index b28a8d2..dab93be 100644 --- a/Project2-Character-Recognition/src/testing_helpers.hpp +++ b/Project2-Character-Recognition/src/testing_helpers.hpp @@ -37,7 +37,7 @@ void printCmpLenResult(int n, int expN, T *a, T *b) { cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); } -void zeroArray(int n, int *a) { +void zeroArray(int n, float *a) { for (int i = 0; i < n; i++) { a[i] = 0; } From 8749fd9db4cd3dae2d9bf2ce1b890a11d4b09ea8 Mon Sep 17 00:00:00 2001 From: tabathah Date: Tue, 17 Sep 2019 20:17:36 -0400 Subject: [PATCH 6/7] small edits --- Project2-Character-Recognition/character_recognition/mlp.cu | 2 -- Project2-Character-Recognition/src/main.cpp | 5 ++--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index 9e95acb..c5ae05a 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -141,8 +141,6 @@ namespace CharacterRecognition { void makeWeightMat(int n, float* data) { - dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); - float* dev_data; cudaMalloc((void**)&dev_data, n * sizeof(float)); diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index a2afaa4..75c0cc3 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -119,7 +119,7 @@ int main(int argc, char* argv[]) { // CHAR RECOG TESTING printf("CHAR RECOG TESTING\n"); - i = 52; + i = 1; j = 10202; k = 10202; // +1 for bias @@ -139,8 +139,7 @@ int main(int argc, char* argv[]) { for (int f = 0; f < 52; f++) { float* tgt = new float[i]; - zeroArray(i, tgt); - tgt[f] = 1; + tgt[0] = f + 1; float* input = new float[k]; readFromFile(f + 1, input); From 9316677840b9ac62a2683b128262388d8877c7a2 Mon Sep 17 00:00:00 2001 From: Tabatha Hickman Date: Tue, 17 Sep 2019 20:25:30 -0400 Subject: [PATCH 7/7] update main readme --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 3a0b2fe..265ca67 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,14 @@ CUDA Number Algorithms **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Tabatha Hickman + * LinkedIn:https://www.linkedin.com/in/tabatha-hickman-335987140/ +* Tested on: Windows 10 Pro, i7-5600U CPU @ 2.60GHz 16GB, GeForce 840M (personal computer) -### (TODO: Your README) +See these readmes for more information on the two subprojects: -Link to the readmes of the other two subprojects. +* [Stream Compaction](./Project2-Stream-Compaction/README.md) +* [Character Recognition](./Project2-Character-Recognition/README.md) -Add anything else you think is relevant up to this point. -(Remember, this is public, so don't put anything here that you don't want to share with the world.) +Note: changed the sm value in CMakeLists of both stream_compaction and character_recognition to higher value so the project would build.