From 2787b87a1a5e12e0651dc26f6c134e4141274b50 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Mon, 26 Sep 2016 15:36:39 -0400 Subject: [PATCH 1/6] Part 1 & Part 2 --- stream_compaction/cpu.cu | 48 ++++++++++++++++++++++++---- stream_compaction/efficient.cu | 57 ++++++++++++++++++++-------------- stream_compaction/naive.cu | 53 +++++++++++++++++++++++++------ 3 files changed, 119 insertions(+), 39 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..190d913 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -8,8 +8,13 @@ namespace CPU { * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + // TODO : finished + if (n <= 0) return; + odata[0] = idata[0]; + for (int i = 1; i < n; ++i) + { + odata[i] = odata[i - 1] + idata[i]; + } } /** @@ -18,8 +23,16 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; + // TODO : finished + int num_remain = 0; + for (int i = 0; i < n; ++i) + { + if (idata[i] != 0) + { + odata[num_remain++] = idata[i]; + } + } + return num_remain; } /** @@ -28,8 +41,31 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; + // TODO : finished + // map data to 1 and 0 for non-zero and zero. + int *tmp_data = new int(n); + for (int i = 0; i < n; ++i) + { + tmp_data[i] = idata[i] == 0 ? 0 : 1; + //if (idata[i] == 0) tmp_data[i] = 0; + //else tmp_data[i] = 1; + //printf("%d is %d\n", i, tmp_data[i]); + } + + // scan + scan(n, odata, tmp_data); + + // scatter + int num_remain = 0; + for (int i = 0; i < n; ++i) + { + if (tmp_data[i] == 1) + { + odata[odata[i]-1] = idata[i]; + num_remain++; + } + } + return num_remain; } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..7c63ad1 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -4,31 +4,42 @@ #include "efficient.h" namespace StreamCompaction { -namespace Efficient { + namespace Efficient { -// TODO: __global__ + // TODO: __global__ + __global__ void kernScanEfficient(int N, int interval, int *data) + { + // up sweep + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + int cur_index = index + 2 * interval - 1; + int last_index = index + interval - 1; + if (cur_index >= N) return; -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); -} + data[cur_index] = data[last_index]; + } -/** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. - */ -int compact(int n, int *odata, const int *idata) { - // TODO - return -1; -} + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ + void scan(int n, int *odata, const int *idata) { + // TODO + printf("TODO\n"); + } -} + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. + */ + int compact(int n, int *odata, const int *idata) { + // TODO + return -1; + } + + } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..0009eb9 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -4,17 +4,50 @@ #include "naive.h" namespace StreamCompaction { -namespace Naive { + namespace Naive { -// TODO: __global__ + // TODO: __global__ : finished -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); -} + __global__ void kernScan(int N, int start_idx, int *odata, const int *idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + if (index >= start_idx) + { + odata[index] = idata[index - start_idx] + idata[index]; + } + else + { + odata[index] = idata[index]; + } + } -} + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ + void scan(int n, int *odata, const int *idata) { + // TODO : finished + int blockSize(128); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + int *tmp_data, *tmp_data2; + cudaMalloc((void**)&tmp_data, n * sizeof(int)); + cudaMalloc((void**)&tmp_data2, n * sizeof(int)); + cudaMemcpy(tmp_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + int loop_times = ilog2ceil(n); + int start_idx = 1; + for (int i = 1; i <= loop_times; ++i) + { + kernScan<<>>(n, start_idx, tmp_data2, tmp_data); + int *tmp_pt = tmp_data; + tmp_data = tmp_data2; + tmp_data2 = tmp_data; + start_idx *= 2; + } + + cudaMemcpy(odata, tmp_data, n * sizeof(int), cudaMemcpyDeviceToHost); + } + + } } From 11a2cf38e53e3234b567b48a77d39e581005a5ff Mon Sep 17 00:00:00 2001 From: BowenBao Date: Tue, 27 Sep 2016 22:29:51 -0400 Subject: [PATCH 2/6] part 3 & extra credit & readme --- README.md | 140 +++++++++++++++++- image/process_time.png | Bin 0 -> 17419 bytes image/process_time_blocksize.png | Bin 0 -> 11001 bytes src/main.cpp | 197 ++++++++++++++++++++++++- stat.xlsx | Bin 0 -> 8434 bytes stream_compaction/common.cu | 10 ++ stream_compaction/cpu.cu | 47 ++++-- stream_compaction/cpu.h | 6 +- stream_compaction/efficient.cu | 242 +++++++++++++++++++++++++++++-- stream_compaction/efficient.h | 6 +- stream_compaction/naive.cu | 28 +++- stream_compaction/naive.h | 2 +- stream_compaction/thrust.cu | 23 ++- stream_compaction/thrust.h | 2 +- 14 files changed, 655 insertions(+), 48 deletions(-) create mode 100644 image/process_time.png create mode 100644 image/process_time_blocksize.png create mode 100644 stat.xlsx diff --git a/README.md b/README.md index b71c458..ae3735e 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,141 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Bowen Bao +* Tested on: Windows 10, i7-6700K @ 4.00GHz 32GB, GTX 1080 8192MB (Personal Computer) -### (TODO: Your README) +## Overview -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +Here's the list of features of this project: +1. CPU Scan and Stream Compaction +2. Naive GPU Scan +3. Efficient GPU Scan and Stream Compaction +4. Thrust Scan +5. Optimize efficient GPU Scan +6. Radix Sort based on GPU Scan +7. Benchmark suite + +## Instruction to Run + +I made a few changes to the function headers to add more flexible benchmarking capabilities, such as able to return process times, change block size without re-compile, etc. The only change that is visible to the user is that they need to pass in a double parameter as reference to be able to receive the logged process time. + +I added a benchmark suite for testing the run time of each implementation under different parameter settings. Also, I inserted a few tests for radix sort into the original main function. + +## Performance Analysis +### Performance of different implementation + +![](/images/process_time.png) + +Here's the test result for each of the methods. The tests are run with the block size of 256(which is decided as near optimal after testing on numerous values). For each methods, I ran 100 independent tests, and calculated their average process time. + +We can observe indeed that the GPU version of scan has a better performance than CPU scan. + +### Performance of GPU methods under different block size + +![](/images/process_time_blocksize.png) + +The tests are run with the stream length of 2^24, each method is tested 100 times and recorded the average. Observe that the performance starts to decrease after blocksize getting over 256. + +## Extra Credits +### Improving GPU Scan +See part 3 in Questions. + +### Radix Sort +I followed the algorithm in the slides, and implemented a radix sort method based on the GPU Scan function. One interesting note is that when checking bits of the numbers, numbers with 1 on the first bit are actually smaller than those with 0, as on these occasions they turned out to be negative, which is the reverse case against situations on other bits. I tested my radix sort function with a special hand crafted case containing negative numbers, and with a random large test case. + +## Questions +* Roughly optimize the block sizes of each of your implementations for minimal + run time on your GPU. + * (You shouldn't compare unoptimized implementations to each other!) +See Performance Analysis. + +* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and + Thrust) to the serial CPU version of Scan. Plot a graph of the comparison + (with array size on the independent axis). + * You should use CUDA events for timing GPU code. Be sure **not** to include + any *initial/final* memory operations (`cudaMalloc`, `cudaMemcpy`) in your + performance measurements, for comparability. Note that CUDA events cannot + time CPU code. + * You can use the C++11 `std::chrono` API for timing CPU code. See this + [Stack Overflow answer](http://stackoverflow.com/a/23000049) for an example. + Note that `std::chrono` may not provide high-precision timing. If it does + not, you can either use it to time many iterations, or use another method. + * To guess at what might be happening inside the Thrust implementation (e.g. + allocation, memory copy), take a look at the Nsight timeline for its + execution. Your analysis here doesn't have to be detailed, since you aren't + even looking at the code for the implementation. +See Performance Analysis. + +* Write a brief explanation of the phenomena you see here. + * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is + it different for each implementation? +One problem with "naive" efficient GPU scan is that there are too many threads wasted(after being checked that their index mod interval is not zero). One way of improving this is to assign the index as the divided result of the original index by the interval, and compute back the actual index later in that thread. With this improvement, we can save a lot of useless works done by threads, and note that waste grows exponentially with the number of elements in stream in the original implementation. + +* Paste the output of the test program into a triple-backtick block in your + README. + * If you add your own tests (e.g. for radix sort or to test additional corner + cases), be sure to mention it explicitly. +See Output. + +## Output + +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 35 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ] + passed +==== naive scan, power-of-two ==== + passed +==== naive scan, non-power-of-two ==== + passed +==== work-efficient scan, power-of-two ==== + passed +==== work-efficient scan, non-power-of-two ==== + passed +==== thrust scan, power-of-two ==== + passed +==== thrust scan, non-power-of-two ==== + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 1 ] + passed +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed +==== work-efficient compact, power-of-two ==== + passed +==== work-efficient compact, non-power-of-two ==== + passed +==== work-efficient compact, power-of-two, last non-zero ==== + passed +==== work-efficient compact, power-of-two, last zero ==== + passed +==== work-efficient compact, test on special case 1 ==== + passed +==== work-efficient compact, test on special case 2 ==== + passed +==== cpu compact without scan, test on special case 1 ==== + passed +==== radix sort, test on special case ==== + [ 0 5 -2 6 3 7 -5 2 7 1 ] + sorted: + [ -5 -2 0 1 2 3 5 6 7 7 ] + passed +==== radix sort, test ==== + [ 38 7719 1238 2437 8855 1797 8365 2285 450 612 5853 8100 1142 ... 5085 6505 ] + sorted: + [ 0 0 0 0 0 0 0 1 1 1 1 1 1 ... 9999 9999 ] + passed \ No newline at end of file diff --git a/image/process_time.png b/image/process_time.png new file mode 100644 index 0000000000000000000000000000000000000000..07efe2b70ba3de3858fa4e8cacad4412f1818d20 GIT binary patch literal 17419 zcmcJ%byQrSN#u-cY~-@XsSB6-iNqvLT{P;KLJ(ck=HL5Go_kuZ@s^&uE{dw4D$To_E~;J?gP9 zFhxKR`yu`Qotm5eZW`zlZcp0&S;WoCd{Jp>X|wBHbM@V-T}^Z;VY%v1bINq{{M-(h zkXB(wAdNTNmS|>bveE>_kKNFO(1eif(SX)h2q8J*%a_quq5?=-kBvkGs7M}%1iGQV zAPaSSJG&rvwmCWq=XDA<|DwO3__<9^v31D)%Vn`p{>G&D$lRPKAR$680+qYB=-nJL z@c%am(j=ys2nZq?Uw0k>U%iMF1fCK+2BRP#kco+$0v}9Kdr1%wzG6IrAtE3|l9Pe% zn>~4G^zDDr=Y-aN0Qqw?B_aKUG>H|H@d zgz*SpmW@k#!oAEc9*-c0uIeP|_~w!6J+B^-c%Pjv+zl-Mo`d1e>xka`fiAYO!)`i` zKo1D>yh^G^8}odycF74iiG5P{7Av=1%XpY*V1rh7#*)l{^99u5kWyQamE=2jN7$~d%PfldyE9pQ3aK?J9qv@|aTm;>a<$@vS|lhkcS%jHKy>Z6 z*<*CdbSlQ_sH-PT%O`WG!*?|Y%yAU#EtAic%q6UqiLB{!Wf`q#AL$#PH zuY|*L@R~mxG&g#Dn@mXLbUFB9Olm4aKQM6eHFX!C9#@6%vP3geraOc@;a(1ug^}kc z4b%9Wy3JIzs_iOGyJPReof=pwG|76js;1FZj0aU~Px(2sKYh8#;%Xm$gRCN$n@}EW zCs?h%s~zi4VB1&N$w%F5r0>+UHA`YG%Fz$O(O{AeQ*#umCIZ!yvE0kBU1hPjy-EF^ z+z$GKz@E|od||`ER6n=OT!4sZWLft`&o;+dzaWiB_VDzvyuZ9nA2Y9Rj3YzBZcrRP zdbYp=)`0(CojY&-~IqnKDKD&ABv#4$t zKFuZ@6vPwCf{njQ06KC2eTl=k;LR^a7Rr$4ND>pRb^G>~?M1u`4*REq1ln*8bMes4e)f`HHFT2y0~=WuuL|yg8~{6 zTg)2cC%cGhe2)Hk3T*0n>O$q3R7K{VV%-7u}gU zLV+sH!QjOg_Vj~Id-|3BsgQa>8TP$AB~_i*mZmW}C0>tDh&oYv^>rP*@zL`06U5g^ zIrmi=($_SX;dq@sK5CgdO~fmdbeJvF&0pr4{iT6iVSpM;7`R>gEVH z+|Pz&OwU-qEPk@IAdx-fo3&j|)a{o0^}_|opPHF&zBJa zoLPf3Gj$agd#@p zP@X1b_T9Y;`bhQ~X3|beXIoS2dF5io_q)5z59I>uHm0luzqASUU$Ndrs4$jA7f zsMGxzVT|uiC&~?=5no?r?b5=Ya8F`jl#$+W=f9?m6Rh;=O1#Cb%|Qa^6NI8(`arl4 z{kA1rA%)#^rqCCBhe))6tKmzs$7G=Cw(mbkr*t{k>b#C;QmsPIWrw1Db}W%8eZ35w zAiowbhwYt;u)ruDK3MOBE9%V+Ki&MP-6xxyJZXjHdwi?tgXtqU2sEtkJV-cerszkQdzusCTqTV&72W@B}u=~ZequcruE4f@-~aBuo2aT<>(K0YS+ zV?c+0yUT_w%I&77%paU&nu%_-Tpn;bcx+T0eJe$A-%`tNjZqC#?$!5-2}nh^s>=!& z#D9=b!mll*#x*Q)+<7BrO?2s7Xrh7sq|5S7ZrJXetALTm_=Rgkvq4pTjD-~cB|qOd ze*F%C4!(V-b79!fF3>sJDC&qx@;X8Un%# zp4Z7UUIrRhyB6k8s~fGSP@oEW3Wownk&E}H){SZkWpT{yBN2Z5;rD_f0wBq|*VhMq zEtcb@oEg|wuE2~qXR`T76(8CB*H&dp77%9&MSlQ*EulDzoez09~ zt$W)Q!0_7auP$2m6r&yQwc@d{5)>byPDjOwxbO49S|8LHI=9%K7F^=OIi8J1Du3lh z!G`+3b!6(tW4o`C8aT(_U_4YrOgp5%OfT}{enyn0nCFlGN)_40Q{&3tpz1Rn;Q!wT zKz7P^oUFLs6Lu9Awitpv5MS8Vc5^Lr^*UHe`DD~UeT;rtlJ__#OS z{9&0f(Hi-2Rrr&4nXRps=)ba%fy5DDqZr=|Z+@+}IRb`t)WD7D%J9#I{;zFGf=aTd z7K=A%^mhZ}`|L=U^@O)GAx}o!@2BCRQ>h)f zaX-ebDZhIVa7W{9P|Z{o-c$Pr4Dbtgq>E^ODhQn-{EIg?V%WSdxswO`tTapbaRK<4xTPctKCoy{91SUz#qZ?78AWQ z?dd;2m>n&+J%T&(7BEJh2iQSRgjaaGna3+NHfCgC*ZoH@DiHVQY}9Y8!+txKC=bvl zxnY#!P0sgUqQ%O7Kt?oG0b2d4uiVAA!MP^*rw1eGh7QupvMQ6-%defx$h7(Fk}&#XH)+ zi*XNb0HRCBKoA`Ovl)KfbZ50N$ZSGW=DBsVjt4y^U8%IZM|z1~k^}MNZ;LsRk5f># z1(<9oIC(AS3mQm;Yj+khieHNwIp09vWO@9H)GBOGXv5#nG+BZsziZ2br=Jn9XN>K2 z!ej-+ufP9ZG$RZ|hnN_(Z)IM8DEikYMExP>&7>M$&ji#?2tg*Tb8zpeJi(oz4ZB(0 z051mp8qIxula^|AuL7!{8C1znV7m8{88$iCgQuNTKy%oa6kBOdgfZ>&QHx=J;LRuY4s1Ntlo-w!aa);dZ zW8*bEKdFiM;L!S=w0c>xG2DJNNj2;NEhs?l&{3j$NIoR>9hJ1PwR0tL0kG>euuAe3 z4`RDQ<8%4*nq z8uTC%IcOPO0nsEH1r{RgKjdz!p+`azh1wZw%)JR7@z0 zEbNeJ-I2LPkYW&@Yv%?z6Sg9dJcJ#Vy~KI&Jj3=N-7`nY3;cVOkHTmpkj#f*F99Yt zi!twA4Lw9^46@EBTHtSlbKX&s#D2Z?39t$kJxv^3^Pc8-(Rz0AQm{AOg<_MIK++Eq zvj#N8@gv=`t!*`w?3cW(tp;&f0#iozd1xnbii`;;7;Tp}h(;}icGz8ODzXi|#jq#F z_v>%r3Z*cgzVyU0>D4?(asUQHZ|Bk4E8Mfv0t5Wr+Uy2{QA`V+KyA;TNQs(7Y^QZ3 z#MGC+=}-~|z?2MhiS~Pa698W(w>FmNRBESLj{x}UP4_~!hsW|^5(?aCUJ2+|SkJ?H zM<3~loCe>c8}PMEjr)hRz&N>$&6>5{HNaSCWaq4(VKUG0*#g#&$iq@)`VTCyi@>%E ziThfHRt@Yfbz`)F;Pf746)U+Fo8c8%yrbTs&4#qB3Wr{8&(r>5Br+Yy)=cit`OKf| z*2sI0004o(mtU$ko#P96*5V_&6+JmkKXMN_^ehCMK+)NVUer3xTJykF|RzeZC}k6MkZG7vmxrHs=Nj?C*81fq0WRR^p5Ev z0-{R(w$43LV=v61S}>>AA>`(=n{^?^U<-x$BB!COMVmv-94X^t}I~@04W0-=ai$=w?8b4T!XeGwHyVkPJ-XE$^ zcketjgRiVp0Z;yMI~a~N`PdIjb?y%s`P)JX<;It(fz<8m?b1_Iie4bKWR-qN(59w5Bwao6(aK681w?v2uDiWS$^2-Wj+ za|tYQXF3>oXwkiBR-Ru9nPf!uLj>n1p$0DUF8&d)xz1WwN(;2Cxwv4=7j&uMcWAp< zYv;^_ECvo?lx+Ud`R#Zac4;TR_1?{~%`5%w_#aZOf^IT1)-yqiy=TrD<%@Ao0PMO1 zSrCz_Q`*J=`|>F2HtOjSj!{lhjR7tB5!Rwj?lH;=>RHLB|99ISx zz>i?3L%No6w6|U!`|3$OF*{(Kw2vB6_ngqF*?S_L{DK2Z>RRftG%7tTOvl@_emcT{ zkf$TSMlcvz#x_yWN}0)o zz7V8wrAlp~2?Buk{kUR#2gF@V9p3VBTtNv8>3Z(ckM3dy>EHkZGZ4T~sQMX(!crkl zcd8CBhyo&T0hP8I=oRS7{y^Xt$|cIHB6bQGSmhiWD>%09v;lt2dRd*{86@oHUxS_@dcm7euM07OCEQ z;_x!Rw#7KZ;8!=M`cufXpfN{;r*DeCbo{PO@OX%rpAiUcq#=c+)a)+PM|YQb&RFYg zB$N~K7oGNir<*Gs|6H$2L-+~Z-!2YKoFX#noW%@<=}jD->#DEPg?<=eUyB(M-{zmJ z+1+p#;XPp2!&5W)0p8F&IX8_|ozO_$Ii&o7bkpn<$@>;FMPu$YupKLuZRPPoe}DRTG(eK^%VE}QW`G=m*#-3#3cv=`o*02Tm=HOm(OL`;n0|K%$dYhz0=OG9SfnqM>2;L~~aLpWmJ> zqb*$Z2e+YJ^e#_XYM69feVIk;9R<^7VEJEj2GTX=fTIA72BmK{yaKB#nY+dnj)T@V zbL9jdl1P6(+v2%0K`Q-As#|JMLY-9OefVP?n?3=rLjm!}uu;VkGer+YdQ{~n*@8I<`j8K2qjLywkYLnGSpQ0$_=0y~mFcQbKC7Ri3h6*}+nMkK<3fX9vM zo(I8SwNbq10Nzh2Ci2JOzr|#Hin$N93sD7gKRr8L_;}Tn;k>u|EgLY^AK!^|Z0do% z){TfK`oSiN%dh9rzdXt1RZt+jqOM-m!6b4%Lsj@`%ooq;&%?HrYxe8A`EDxjWJaCd z^$;sIx0!tZmp!q)`NuVA@V{(*QmCgipE1hww5ltwLd*Y+iAlr?1y0RWbhFx`Q>9q} z)!d@=`+KM!1Xv=`E!si#b*E9n7NMVV1#`{WjO!)SSl+G-?N@Lhvg%X8i#^-u6ihJ& zsn?3rh0hds2l_w%7IY8w-3Vri4hpaKwlM%zJ(=fEg0pGw0whZYYPHFH#xCm=(%uc< zZFa>O1K8#slpfYc<0%+f*OU!)?9kHx`13uH)F70ZQw{DZo7kU|z^DMIt@3aDy=-i)#2C zfV!PVIPtZb>hHrv`+-6I+N!?Y54r?7o!u;CHbDzOc_4{$PhzMV`UM;^i>L5*L@A<` z>pG6lxj=Af$p`~40UeqIDU5W72WHM$8oe;B(4qcrt?seNDW|HMFaY;2R~f2+1E8UP z1v=U@D2(SYNI*Cm=0Nj#ZmQs9>v zME5ops&+SS%isyG>o+`-g<7vP~tS__1QoxZSh=V7*ySkWd>}` zteEhsriQd&&e(=h$SSCVd%5+tcS^!zx_>Bw$Hnd%W9GXb>R;AL$_$!%Ivk2w)qHNU zPYZ1GBVTucv_Ldkqf@+{_}T=_TcpNFx?R}uJP>E}lt01RHB6L$P?6~AbiZRUmlL{H zs3mFq&(7DwibMu%5&8VM>r_nj{8JYm+$MX6jPKsF;hU2@P^C$`YhKea^BOOdTiye3 zPP|2XiZ^KTM`2Cwj$M-0$OffxB-LL$V5JJ!+?BiP*h}}V+40SZA6nl=SD6A7s$u!; z+~K)%ri%lp&1129O$i8VkU-q%U#_ETHRW%N2b6qvHc`NuPa>Nug6L&O!?G>q15D--Y!uV99F+`2{?3Vag!_uowR*ee2q%hy`)vh38YLP= zpS0E#TZF;A6Lg6%umO#)ez}$bZZvNm4k&m`J4j9untK#Gv^Lp@JhVp6^ft6cJ{_-g z-b5IjcExjcWV@-fO*3Bszzl%PKaEiT2BR_Kd)s0IQoG4D#j?W3XV^8vKG`u9ot}X( zha>cwsgao#&8x!!a1U!}d$P}8%7)UEHRM>o>CIoNuGi{p_W0XnJmRDsZ~4vWsA5x^ zKk=6B#^!PubXPs8hc%)r9zWn;E?S~j|685&+J~;`IZTCumsXNSQ)6E+tFis36$MiL zqn=9aQI(K+)oA7xk^qE-f$NFy|? zex-?krRjwAzxZX#?mdMy+7%%dNSX}UjX$%3gGU?7kU~4^epmd_Y0)D<=YOh_bOBBU z0{369{Nyn0`42C`{u)+wWy(Gq!|uQGN|6jiVq>#kvu`LQ<;vi3lZ?k;kx*|mw;Ij@x-0r{9Z{A)lUeFg@Icb2YzBZTVJ-uv=K03@Pl<=m_ zf1MpEXv0DHoWD^4=E{cIpC(gzlLPGkiM??u8)^5pp$%hMnLJz2IBGh?2Kz^weflhG z@$|1P%bO~1v8Cm)t5y0m)ckHwP~SBs-kKKoUB^lc+YCBV5&JYNUwv<1BTH`&iOA`b zaMnbWJ%ug{VSxMqs1RN4)T0j_Vu0`-55m8BK2t1lX=9FchurFJ!=uR+{qyshriOxX zk&DrhU~oF$w-$#5qw<}iUWH$Dxf;?=mnY*47$spCr&7~`WJyupr{w%P2A z)J@@W_0KexlP=}P$;aX+jJo6Vx8W3q;hFZ= z!Vc=iUIB3;;+1a-KNT%clbTR~rM%R0uc@k;{+Ooksi50lqmc{eTPVc~nr-Ry=_ud(p3#5WPsjzGLG2bZF+klCW(;V=>W&5 zR}moApENhj-26{jN9r1n87qo&qx$Xb?HBC8mZLPU1HOOz_6?VY)ccw}p+rY? zUS7T=@);qwnubO&8Y%Ah^mM5+M?8v>%@GH{ylkVgT3i)o^Aexso&w$Un~;@qdBk8fs?3{pcSotMM&J4 zMNJqls89rp@2OonbBrvq8PxWX>1I)lEj?pJ>lyDT$xHG)31((yxeO~St3+d5)6J1w zrh)D-LT&Q2w6v0`mX^A!m%+42QO#sh#XCsnzV2vlZr)|L$CMD(DI>ze*uHcv2FGR- z;?W#N#%>OEc)gncV_EO|Wp|DCYT|0O#jc@v)t+z+l@SI;M$HnfciJCqi~@G%8WW_a zVO16v76|VUVT``9efs#UiC3L7S5@cFRmFre=|8k}#V1Wm2xvp|dQ_r-eWNlqz`lwn zvL1h8Fw?v>avP7$&l6qrOkxBzKEN=5SLTB#O2_+u08i#^`02MZ-+G>|FW%3yGXPg| z9~Ls#baF4d7@ycYEvMLuu;L&Ex@6W({X#n6XtqYB(Vp8(C_VDU#d~^g4l?#}rp8t- zb!%&@PXFSo8MzkulNe6l+IdnmVhrrR*J;hDWa=qDO+!3U;D!clwJV3dbATbt&u1WR z(hja9%uvKe0)Yx%8r1w&l1nx6=@;Belr_m}bO^fi2pth@v^$6HgEU<1KVmDA?!@t5 zJzpnpHXQnN^}I@JADVlDU4v?Nx*iiAqU3KeaJrIE8CI8ukReZF$@ARKC|PYM!m7O( z^-6hnB>8A1rAT_Dp|;$9p(QToPZdUVAFtz^mQzq!N)nnA-~9}yvekqa%{96_R~d2Ho$jJ4&Cd^%o-Wm4 zZ$9~7v{qrrN)EUp%CYdrptZ8a^s-#`#CZK@rY7D;*I2`occhX@U%thsdHpw#eo79quLeblKBo9PH(m;;RbK0Y}ZPN8Sx0BYmC%KixeNn{@k84X=p1~ z0612k9o%;wrqGswxkxN{hZ|a(2}&bEnt!EjC(KxO-?kH=V%FT>gw7yy-r|XY};NJ6&v)!0)?jxbcU|M)C#!6z=hP*DIYeo4JhYu6UPn3UPZ8jy)ZVbpa>Sp@Y5t zN&ao&=}P0Fp>f$1LQT$bR)eaB;noz%#+=Wtt{l>Y_`j(5+3v>)-$}a7n<<%Lq3Xn~ zPS@L;T8_Qv#_{rhJCN~D1&Py%GuOy1d_U+ZzL6U*+bu9F(kRi4ns;rHjh0yYJrYD^ ziSvOlAtpz;04}JEZATKYvFwL}`9^xC$^z4#@we#Dv}=#MlLU*;0_^S^`}zBV@ktpN zU`h}T)n`7RnCHwrBp>6O=R-7OfuR{)AMH8zxKubfzvNCN-kG9^#cdpFU>1$S)A{0d zEt9&lmT)qoCa=ue^T}n02UF2Im8#TwC?rE2N+vz?fdhae+G;NgOJJ98mC23xD>4bE zhhSU4bAY>YY4{##^9euH{&EPOLVx{nXCjm?VL(anK(~A}9k($XNJ-z1dRJP;66jaPRpRXON z+_%tX7pe3b8(8{{=8Li&6YI`@qmP~`)p^V5!?oO2OQCM`e{k31{ga*r2*;t^4TDcW6j2jP2O+CPi9PkDo1p+n_t6FwIg9N4$c zWOXVnhDmYV2MvjafbYhPo*)M8xp_3asIeEiXe(JGOiydj4JUfzF%%n2Bhv?um66<- zSGSe5uuo(wn?Ld;PE(=M%X%VUNj_1tYw)whXli{kQf)0uKPJ8u@-{)0ejQM=39a=wW(epGCdgpA{S(vA7s6Qg*I9KBO^nP> z03OU{V5b>$GmjLBy2Y~-*AY10IIs6p%-@03liOxQ1Ir7}Enly;OCWa+V`O%`z*(K9 zR|08ukl;^dt==pZC|&@01|D*gNaOhEFpltStr1bU0J*))Jy>}ykXG`F$2TNsTz}8| z!np)rp<<$$8NwY`9D|77xNlojaXy{N6By>c!c8vVVrJ55wh;=T4yiZBeZONXFVk&q zDRfp7P7bKJ^4&%BCmo9n0a(^S|Bixt%So?J2K_R7uLw48{k1UDcC>Z*jH8Sl3DW%D za%PjTN4@%(Y!BfyM`fRZizyV`+z`nR+{U6yo$k3UBZ}|RVcb$b;n|1wKXwOw0jH{ptXr?>Mx5qEJ!wW6ITsmG(12JWr8mjZIKI4Hcy2&*4b!ZyuU4fRZWS z?!*IA*~~jBOkg|>{`hQNR3#dV*fd{^vr47K{a~b>!KRBJa6gZ!ot4ToD1@43-~33)lB+Fg&V{{ zo8$4d*#PtJ9r?25+@z~jWF<2CI33>J@d)WVe##)=aQA}~*66aLp$-?iC~a)@TowGU zzzT;*UH&@0#_6cTCaY#Io9CId&bin4GLkO%DLEB0 zl7YaMX(V)g6FzAwUEwVz#Xu3#?`LnLsb&$G`Qu~h%rIkZMRsEbk08ZA5MBwM1ReJO z+|pPQ-PzmNh@GrX;{{oofOxKo;QJ(?R`V*>JN;!O{=4~%F79REYzmDB4?S}Pjc1+y z;JetWP0HC#KW3z{B+P65X?1EUHu4H$7Oei_u;v7a!IE-};$Pud*Y3oqXFALfXmbc% z=jgn`eY)Uzs4DJ;j9tK)d%eje2`B}$x-QQ0!4CzTPui=Gb)L{Ht63)h0W7`~_-X&o z)8vCg>n?!WtMb~n$(YZujF))5z6*<1i~L!km1a_8?wVkx$;GZb!@Y)AHZ%#K#m`a!sTy^BJyZHknV3WRb+(YfOu1T6~ny@ks$55OMAxkE|I3(h zJAm~CIftXo13|^Efh9v|!Ho}-BZZvq)mSt!DfgZ=by^Ib&TWlH2jf0-Hk-wfP8i3$ zT?+rwM;N6=e_$|Vyi>)ia<8)RT0PGy1aD}gNFB9NhiQFt_^?sZ7H38=*wcdXpl?Fa z07e5~gr!J@^+Lxz0r5}DOX5pd%w*~8g{ixH+O`)GaKm1#-bUEs+&dUdjon|2r|>1M zy}ven?(Hoej?_qJLr*g{sUZ*^UCPRh6m2MMM607##tN^4eDq1`=w%-VxZ>?ba{1*6 zfwO=RAV2G^A7;O)?1fo~?@0Uc4LAUpIu_yeW#LR?D+qMOW4C{~F{9}V<9d55a_TV* zY)mnG&AT}hv%BpeOt0hVor@Sn4M7&|fP#_|(`cIbqma)LmkCFV(S`=$Sf{4Ni~KtMrXhg! zZ<3(^_gTsSZ(=`SFh74lD`0a`97<=9X9fk5H4bL=(x`U#Gg8}d!UM_$`xoTS`|2WG zUDB2}b zH+Xv3{rK+sMD4J29ilS{*m~jgb+x>m`~qoGj7XAZQXsGOc`A8IQQD^~xMutFjhegG89nIS{AbrEMjaVNU`cXBQ_a&@2)=;xK4C;_ zIgq4?qVSW|;B06{y)`~JyuDbK)bng*m{k2>VV*%)AN%ivF-$Jmg*K5c)j308Ha$~~yd(N18Bi=;?syf9!V(k2V z==9lt1h>rNO9FM?N%^=D7I~M^fcFN}u*hepu*aq4WYE9-`onlrxgfW4e8aG;_krCS z>pF{@>BaTJgjy4?_wef3#$c)@pQ1nBe5uLOZB$r)c=x*i$!=!vr8Fc8Igh($cep-w zrqyAAURXr$(oxM_1;!^P&O5vlX%Ug%uL`c{iQlw;GR?W%kV0)s+$GM)x(&;vRccPT zWwO79csn$@qLH4cJTp}ktbrFFp|oE9G1Z3Oo({Wu3++xSvR=YENSn@Acm@KS>LirO zA{Ohi*#l4{SwTyXvsJd{om#lL07Iajx@+%}NA|Ikb8>r4IoLrm&HX*$M`Uxts`jUx zRcaD+dhgPV3~o||k%4Sn5)3HM?ELXYx7g&hfqh0glyda>!|-Fi1~)(?ez(5lnzVic z=}`4L3Y5&aI7bn!B=K&(+~i5hsd zCqk{dRE;jS=S$xuGToh36M8CId20B zS-!Hqq`$9Z5wZNA7P$n)3U*|pcQ{!H8H&|C>E%bNAmnID3^jT!;@I*cmf)uIMR-(U z_jRYcpY`spzlmu>n(4A$_mb^X;k{ zlArZeoZ`2$zkwu@s&eyo(Pi+{!QW0#u)mo*9*eJp(hGqsCm5N(eF{B(cVrqNRCw-j z#D3uO_Qg~ffj$QR(5n&JGn1?z#;b^HylYxQMJ@9zq;x&7uf^eWs7qbFDE$ua$UsTH zUub(u7yR)$!m9#~`)(5H3h*yP_!Ri!BZ|sv2Y{DodOZyrvD!nTFk@GIMNYxeoZe|> z&o^VP*p-#ok9-_c9aDv^{xs_eDTQ;0+^$TAz0eQ!_(6pC1e~D!k&4dRiOt-NeTP>^ zDq}Vx$T#@Do7j~j<&M4G`^I0+7jS4?DE;v$0ojnG({~6m%5yT<-;s2D%iG-P#$cuK zi^S@b#Ox1!P}rDQ#`LZBGu3l$Jxg{#nf>fSRcqPt=FKC3I;BVfbq+?*d7EK+7&}K3 zl~lcOG5LlUL*1Fr^CE(V@>9pO383_23Zb$`1iDW4v_zDnq36G7c2pDob^xr0%6(q> z*OC880n>jH+W+6Ho}`ckyKG9A{kWZYND$tjm|}iyd+`&$9{at#BSEBKq%x3lkAV!- zQlTM&QNDUhai8DnZ>8IPOUBPq(J+6{Zrf+Deo5hiLP>Q&psh2&s`NI!yX+I#lrK(K zBP9gKie~Z*!9F=iInwDBf(j;(*S~--lljL_KJ@+L&0sO1;APEcw#I?oSp1waj^%xs zrW}j|MnuVy7p6E};W??{+CbeGPyldkA`>7^Zb}{%5r$1^`<$HrI(5;qP)`4c1DAIq zYcP3Ms>~2xwD3^EJ9dgi4pu}vuW>1sPx0>elN1H;IONJTJF9aYA>jjCHMBMk10fJalVRlG8bjsPd@6c}|! ze4C(P>UwMThzt?ffCnMuJie)N2U244rpoM*BXGB9JOyoSh1jELlN>1NhhU0TI-Nd+ z*EG}=Q)VOp+M3&k2 zB#OY(obNidc4^%8UN>(e7|>MSsveH5sI+Cg@1uc$Cq4JJ?D0siAx$?p-+(-g{A*G2 z0@W06OmOS-nSP0hnlnQtvjr$SJ_sPH+!oba{CRkj3%IOgc58zCx$ye5o>5|E=cE`jUJqMqb(!8&Zj`+M_=!z*D z`yuWtozK*lbLql80)B+(*LqQfeJ`^`SQm)kw2ylK`SZ!IUkI%`0Ud(T*<(6I^YkE> zU}d{kcTnHd2&Is?#roZ@$MRHKrwp}eQR?z&t%>#fZJ2Sr;Xs;|by!~@0ik<&)rp9# zE4Sh*er^?^9;}$7Y8zuESyEZs!tjKX+!DBCDR~J$+vxG=cpZajpw`+x@cC!`w&xPc zwZ@3I0G4EfQ3+~BZMrFR;#+8?^&|_+$9KO+Mu4{(B*8&l?2VFJX^y>i(A`g5D(RWjO}9b0LNFvm}@cVY=j=A-xNq!?O+Mwl>vDtvP-`SgoVHH`;rhC*rnZg z80zr_r1ZGJ=^`mu^csH7tnR?71fG6<7vl&+e|%pYn}hQItwi?!0XEzbg~!N@loBBJ zF`e&tkjXGjq2W`3A^6vB2>U?5<<6YL^bkL5H4&@>=7D zg~w(T zj264iNY)UmrKKBN$!t`ZK?$z&b{C|zJa++DHP>+tTj*Qg>q)F#w7uH1!t@!+Ue-wV zF7ZW=>NvfS@L{=0v#73aYeH>M@Hh;aoblpqh-;7Nw#ilXY9kz9jCKz*c^kVQd1A-6 zihCwTH2O%xsLLchBkTB7ewt>oY_u%%M)cO&u;cCyuO?lD zQBRq=)1QXvX`+kZ*rFT9WzosHZwAe3uEGJsC&CY7R7J!P2jOnQ#piDhkV-8-An_0=Vpzs43KXav z{K61!n`wMSrBPBU9WMjHo}?*sXU6MF`gVohPTdEtUGXj+;Y$2{IoYHo%Nl!C>ArJf z+S?Nw{)IQK;lwiiheo~4`;Dhd+H*xxd;lA)nPbhi1XS--8QY$9{1wzne|xSs4VRAz z;8t3VqPh};B?Bj3ZKw>?uWqkb=8~V%IoOmZzdQ8XZT8l>E1}S{IvC@lZkjlF!8kEt zEStL?WF0fs6d$rI*Kvci6m1HFuaBk9q}&ct*bv?wpAMs47vVWRh*shoU5S7 zYaYq|5?{Dpsy=5`!l%2k^w~Ai`ilF3gBVBX`2y7OFL0wwZ^uh zNccOtjx`<487DLais!P(QcwaDpQWhj5gcP?aQJ1tMsQy~4~5&8 z`WLv_KYoAN2hs>At-Q(5XV3>)8_m6`TX%evD|OM;lbf#4MX8&?t~X=55O2GvX68G{ z1{Bof`Z`V3O9^5E!sBXa2x3Q0T=_~ix!dJ%8*3yH{FrGoO!f6p0gDddxkDb`g7*zd zHWS#j6@bYSWFEN>G<)P6Wl~1-s<6e0rla~luaL|Ut@LX6Szdkpjw6HjhAsO@K9VI! zCRew_MPhHNw>xsWH&Uotg-h6st}8MB;)&mOGmxP#F)>d&F#o3^5vN~t* zf@U*LaQXOpM&~x#dVamNZVoOKfl_Grc_!&f!fEYlX`qymJ7XCy*zcLvm*3i&%a+@A zuhq{?^4zmEW?MHu5w|K4)wPOHkAo$3H=AxQ7$xe)+D$})d0os6fm;ilw+&bBYQu(F zp{y2{5nRqpKLE}#MiTeg9;=dcZiG6*>GoySueJP3!2Vzb}<^?q|gzYbAJNWS3#KlRq0D!i6g_uO&NYDmyI)3ytr` z&RuiS;;yyki8z|!N)UX( z5TC#y=5_=&C0Z}<+r+>Sk3I>^O^DcB@OXC3mND?X3P7MYbOhoB>J?gEfMCD;o(#q)hRH%Bi%8*$>-L;AJ z_#F4Ozz8xlZQu=gpcukc;LwN3##NB30}2J@kVh-9OL;oJmP&Yr)z27)MUJM<;`8-r zoioUQfetj$kn>*R2{GL9iOT&iW2EP3oPJ+R;cw-$K^|@5&tNjB%S$sj=>DBXq$`gh z&oChF)P~g;x|zr%s`AL4+@xapDfBe90V8Q;!;Gn11xeA+^8TkXP)3+Qoj`FbP`XLK zv-7qq`ziUcqzW}Xi?j0t@O={H8d&q`MSWbN@O=?58TJ3MupR@(@BHV*4e%2e2s!Cu cfzNNeC!nr`IJ*VIz!L;%afSC~qK3Zz7s|T=egFUf literal 0 HcmV?d00001 diff --git a/image/process_time_blocksize.png b/image/process_time_blocksize.png new file mode 100644 index 0000000000000000000000000000000000000000..da3975d53528749ae2ceaca83516de26b8e61cf4 GIT binary patch literal 11001 zcmdUVXH*kyyKbzAG^GfNfQm>HLPtOdDn+^if=B?R_ZC7Ah!mxFFoA${5D6kRw21WH zgh=logd)Ae8Q%BZBH#IO_S*aGz1H`G#hRJSJTrH>?(4qp$N!14EZN0t7eOEpncQP3 zRS@VjD+qMz*@e@WKT$pfVaAW&W?$=<7Tz%jAyV=Wj6 zL|%RLcdFhd(+~vW2$7R|pzf@@fZmT}=y91@WzA75o3hlW5|j4$vGrETBTy;k7PyAa z+=;!YJUjVHGx);8>koa=0luH3U;Es;P3LCV64((CajLQNhV})zwKqPxbT`A^hs$(5 zW+rA)*%U$StTk5@&nS*mA`%bWKRy`BC`*(WSzL`@imyc29QH0x)Ki=Sfk>I`;zuGp z56D2Ems{^9MnyrOj!PxvAdmnN<_rkb>O}y7Bw>G@zW!f3&1~ps-h;lY4Dl z;HBy=%H&RG?JA}9p$-+RL7k<b+vgGKfS=&4SjU(1A0`{;LS)aaXP<^7Fg(YNn5Lih*R%e zn#!;7k4e3$;EK>eYy7D~;8zcmmfl^g<4lXoUA8@hGIpR%G{}nYAgbUWjQ+m*%k5m; z@G|6KsgA0CgA*u^?D-zNqKSWOkEWt2zvfkb_5?%Z+zWqQp8ULQWf@#Ye!}I0rVGUn zmSB{*>|((o0-2u*)`K-u9|qJfXi<6kYKA8C{LDubJS`Pf<7Cpy5qr`Y^02&1yX{=F z3`edu<%l-NJnh8WHftNsQQ6R6gdReHq(v3H&WEiq0Ban1x3RzFym4Sb!^ndM?xqMd zxJKz#4GD|b8V5K`Twie-w7laGX~>%&zqklC{^m^GJW=6_j&xpa*&+31c8xO*Rljc? zii+xmFb(iH?<)4hXFAtgX8P=TjX%0g(q~|(VZ_O7NNd#CQh&2t%Z7r&hO=rRK3no( z<#&j~He-kAkr$dU0!fOLWmALtiqIG(7+9vU|Ghi5I6mireD2C=529E8)vRJO`1)pD zLr+AatE=HR5*F?4!cfi4F4Z5xRQKId=T)UDZ!@*(h0MTu=t?SZ*j#qpqyhP)#Y)k7 z%5*6AWN`~du!N?wE=^l%?PH?eDOqmSgAVcR(^jYPSE>m$NTh)5R;Il2t8qZ^zxJd}@yam%xbp;taywF0}B=EwWzeKZD`Fv!;}B zk@(ugnH&nK8~O^K*)|tAy3)wHTtF?g(N%Qo4yYxmT;CEQE4# zSau-&jQn`*X-7n3(s`Su=B6uKJuRv~Iw1}oMKdm(I`js;iqePRk*}BaT)hW6vR@}X zc`1L8&n*x z*W<0Y^-VuWPorq~X*tM4&JnNw0V-GiBm01*hpxN=$EV#}GS+3_{Sc?4uJfxc z@9mVI_QbvrU!6%i6k)eaNcwDJOqj>iN^VDZtb-SIUhO_?g+5r)2_ym>sd*(Ntd9@X znj`33Ryz=dzol?ZSi5Y++qv-d@+rWLJJXtZyO+kl2|V2e%R7Vd6>E;1gNi;*Uv;OX zZxLfdnDY@Hi1IIZ`|>9mDv1x__HNN!8y^vl2Kiyu=%GTo%qJ24cbtJ`xk&pZbRxlQ zWV8nQ>74wuH}d29_~^gPBxk>XCQ#Q`DSR9wZe%RLyZa3$<#&EaJ4)=R0`b)>6VV!fIA?Vhe+u+^`YRj+`XFM<00NnkLy15j)S3Surws%!6T{V$y5FKIi?gRvkQtw3U1 zG+x)OwJpooCpP_+Xfibtbwj*5%rphYu=!I`U`uPb`)o-F-|>auGUZ9$#YL3#VBM2O zm@Na`L5iJE@PVt}%}yDYw_J2k*{cjs0~D3Q5?(zPfM!&5e2Vt^uN~39w6lAP>J9`~ zmKfBBkkE4f=o;$?CkaqA`kt+xJSMu}Yb{ z7QOK>pHZkQ?99Nn&P7_=Kt1R2CqN`jFS(XP)j zH{Zw$pfTN(zD&b9s8I#}a*p--4M*Z7++pPcAJwq)L~sd{a(JF^4oeWZ8P|rZ`b#`U zXTyHut1y1fSVUOZB(C%Q6I4Qqd_~e5g4CK5+7Dc=UdPCugeeoKioH`Jub`k%MDh4D zTK0`p5kjS9x;1X$l)1FI1x1+A;lX|pBO>ov?(41b7%uHWTiNN@LN{r0CJ9T9X{Y*T z>4Jg$Cs&!}nKX3IP3B&A+u^toT&4Tj{3tY}Gp_5}@I16dU%(bPVuB86b*}|j2@=bO zG8QgyzXE7RQJ-<%*hT84E>c5#`>yj`FFEtxD9g8e7!PJ1NziQL%q`cSs4Edu2$`r4 z9f(YLd2oa76>~fqO_Lwq1lgU>uMc6!PpeKU#A@tWq)^yjUYMHq8nMofOS_$ACOx6d zyFRbQsBMdWcYv8VT))_y?3}?OTz(He7q#G~pIZEd6=4))z)T847QUp{J9qwCYUp0L zAw5E{$Z9~NmOasJQAB$wFKw8Fqm#O{4G0lau=Ca0GGVuk`Md@0B>eN;x?uVZI!0A) zbm6K95@I**ql1sDt(B(Yf4etp1$;|h+u~5!{#ZzhLXx;-OsS=TZwG@#ax))G#%WjY z7`3Hy(WcC{44b-VHub%~6f>=*;0OYqV{aW|jag#T8|h^^Gqz>!QHEvt>+;oeu77-T zbOVqZJEa|yR@RT4v*SWYM#AT5juR&ZjyC1y`lY#H^nvj6wQ&xqq4}FN(nra`qvYBG zMa$M1W1@4#P~SS+zlm3M5c$j+;usv;rrRytA0`z2jtp@d@(ip$z@i^H>XBL zn|{7i{k=2{9qmnEOF4oE;x2wtATtVRQH6YA|ISCf1q5dsygv{4yBtX*ZRI)<{;7;? z)GaltR1-Q%D<1=fWz-J`=U7vUW2^T6gx2#jmj?U4{ywyy@`JP@9zyEN_-=(?}lr2NL8ht1L#EcXazEaQkcXBPB_*2 z;_82d=pg(eIBSFCB4=|IJgW0JbO+&sLxt~5-+2?(-u#g;-{29u7Sn#$Fx@1_u_`2w z=Vhdt7FD$Z$}(tRwbx6EzIH~m*D(TH;FCejCx9@Mk^SwN96LNOccjNBsxb*d)}%;D zi0E;mGTZ*ICxu2Rp(NME^|szAhSMkP#p%FcAU;4VC#u@%h}E#UcpR= zsY8lazW(V_dS61#X4hGRe*rZFW&)v0#s7=>nj3@+KYQ*tHnNw^-f(z?KiIfzaS_Il zB?PpxuNPbhlF2OI?W^U97^!!D!GV ze}X`cGCr^O)zvMm^1Tz*7WBaIOT{&wG}gfL47q_5ahYxRDbicalJgJi_XzcxQT}Pk zlhmhxrZl7?Ph+J9wxT$|V_h)zCkprQ_p*C@tez3&e0%IpySF)iP^SWvj)|jdX;&`E zC(=JxDd>l?VjE-j3SQJQ6%(I6@r~U0JluiS7+J622}}W;|D!V)&QK=`m(15V{||LD z--FhBhFQH&@Frv)R|AdJ?z62Y?iRX*_s2b&J1@1Ys#r+jeF7}!VCX$z>D{Z3*OuGW zS2j8uU@di&$z3@jC*HUen(^_KR724?-|&R>!G{ca>JyiDfXlP*@@tssp(dA(F*m13 z%#wzTdC1ZT!`yQo^!of~aT|um1)@6j9v=+Y_n{ev**y3>*vG6*{bJ;Cum&d**4*4b zoOM#+w!f8dht2EQz<|fJiWZy`&o);2$sc{TfscGUSod(OnRPMoe(5xAy~#{&e;1AI zLR~AeJ9f{kzGZNAS0dVMXk)WAD6SW0w^KzMA}pjdzsIKO#nG*>6yMC=q-0N?v-Lit zU_h78!saP+#t}g=zfPEZKHXs=0@Pl*Xw}&pmDsUSE0s#OfygCs% z5yme+1=h!}qO`R=mY#?X<$2Vt%VSKIm+m1-ZChn>0hVq0G+j(5ty>aI4YAcsaxibV zU8P@n+uT2t3t5{X^7vJcdjT6jK>Ba`ZrlyxS})Q*q8+gz4yxb6q8K4g(yj%qZjh{D z-u+q#&ZhL+pb1SO!2A1^k!_Q%O3e%xVnW~t?P_i+n1dB8QROeK`{=F3CCINxbkTVO zg$i2rJI}p&+Pp3D56ns>F;wlI^SRQJ#Z&_PTIqbpE zYtJpbyM4_7y}=g}dnYKGjb`-{t;=K@mD8t0yMtHhyR4z8j3UE0U8Z|D*jCNr+mzj# zZuq@ETqXQ#%=zW*T9^lGSqUr$qBH0)g7hD_(mR5T+hYtc)9T*#+=?7*w6y2Ip|TGR zFcExx8@xpYy>NTnyZrf?%-C}P9#3%~cE8t>xwTOf35*Qyc##@8wzM#88+Kav(=PjC z-&T$v+(RG6LD&w&xmoI@F^tZATJH!((@gs)0@RPU6|oe!_W>c2=UPOyAN-TAz%!ei&`ov+?2veJoXF;A?0DU(nC2$# zr9wT@6`UzW-}T(21W3LHb{cRWkVCw2gR=WnVvv5gY2{9JlRl~DhPBS%LCKH0L6o+; z7_Zv0M}rAuOJ{;DQSZJ<%6Hk{jbmN+*@ZNV*8caw&jURvl>7wSmv#%9W(17cm<=vp zjg9i~;W=!`(TBv|(JWhT>Z;O#QKVT5%0>)(h!{ofuG?iEal3 zPY;{w1pfMVojh&oOSIT8ZWFain^O`ANY6Ek*rHDn&-A4oyRUSIdt5{|Wol+g(R&d} zOoj(niDt`&KZ z*XHW2ub!Dd%RZ#tP`kR$anv8R&42gdrQhTe9-iwxqtv>yphnB;=rMU-n6D#zEW~ZG zPm@E}ejTsY`MIvl>rNe2u-S(7zI$aJD+@uk``p5rzx#KgQZdWMfsc`%`(J>DX5t$F zh+@Xml@;q}41|FE3$IdI_7Zr?Va%NVC2ZgK{p#T_)<7{l_ofAJj&#+zoYVh2<(ru0 zhBe@jz&z{T{8ddn&aPiP#&K!7dSD}w>%jeMD<=uxm+dWCr!8aVVVKcoT^?g^RM)X# zW}mZGCi)T9^cCaQ2Qs)NR=S7{g!mfw7Ae{W;Eb9SRLl9ty1f01tbv3KK1q`cS0niB*hwChDxxmObwEJcL}Z?8<3u<6`v_%0=Dc=5g`$r{YJbU)9@g~#Q~q>QpP>jKPC)T!Z=c-wnFR);~Vv`#6E}HUVyR`(m;JAy8JBw znM}5{=^^y-pLpq!j4rb^fNA>SyyKjJ3wCl7u?0pwU5eWS?qU&Nk5?~Ib~BqvLy908 zJ)cXKvu?(n2FzFfYfZe(GC3pB6I*%6f0{w=rJ;E;GoP<(wEPjhDp=^6KN#KHkCYAW z8{Gb01)$*;nBkJ%WhXoDIP4N)xodJR0HWR`02x-U|S$kX?usPTV&D|v^8|Z_R@8VuZ#Z6cU>Jz+jbc12deDgK#ztBL)-xm z0;1%=Z#DB`GbJ8oVa+ZPQ*`(Lx~nm*abIx;sQmGOQ80lcRuhLGzWRFZb^jC-ZpI=t z0e)i6Dz>?eEXP14F&T(XfehoZP)9H&$di22EP?r+OI*Z-q}&|`i8f*j^FS2 z#6Wq%w}76O-OfPyNEG@jEquovR$Voa;I64SMNb&s?e%*^1gVVYr$_G5x}sF9_o{;F z+M|c)TXSLLaUr*P_yjvIBOd}li+YG>-@rZ+Oc-O}-s*aH4z=T~R?4=x)g?AFI6A%4 zc;#f}0dF1IaO60khEm%CXnZj_ysP-yVvK8YtJyu3Z z>SF~FtFzwNYJjA)zJGRSdhIOIs)SdfLN7p zUd4LnLL{JJ)2)wdw*q!ujicp+VsseY)>Hj|ZDa+?N@x{~ z8(i^N(G+3cn-_DO{#H6bTMQZgRWEcVDY5J9G1T`O)Hv)tP@x%^T@pQ5U>3_cQ0ZFQ zL<%ggW2+_^V0-lFuS^|3^L*6IDIJ7HNt05%nLVwJ~v3d=&)zQHK^SicxXY#(tR z_@kGjW2w1LbvMXajVBoC2!hD5PJsCZxlN@@7#pW!mkwv#8>{sS3(Z)L%_K7=nuYVh zhNzcOFjW>!4Q_f0&)MR3YbU^mKVry(KUXvCPN^7)7Q%xS^?CRZJI{Bdw4!I{QMvCK zAP0@&lu{p;s}eL7Ct(yGjaGM7>-7*#*Qr4-v41OvrqKFBP{u}H2stM4{*CFI-l^m2 zQ!;&p)i!Olb^MzZBiPczbK3nHGF=zqh<>9@jtl;>Nuvb+C7SQs0$$gSWi+4|O(evH z@ZOnd;&36yq=fwL#QJ|G+@I-YzKTpoHwl_9rR8`s*hdFszD{fM3&cez$j9odh-?4InU+puVu zX%M`>*_6AL?GK4?7@q6`^s(N@eJrr1J{NnN6>0Q+K!Q0|xU8fM(9tK9jDJsUxAzL( zU-ydv4;b>3XHUD4F{Tz|ogsgRQQS@TKoa#0F(C)Y^4cs787!1oC>ZB`GPVLEz;-5fg z|)&X#HaZ()_b&1uFTL)6Hk>!JPNe`@s!fb9E{<{w+z4 zvJ*3+w|d1yyggYR20zDs1{SpKWVI5sPsltFHM%mFqm~|~G1Nu_r%`Cx`!V{gG@w-9 zA88{t1!s_V@*xxiNvS?fzc{A4ytTwUgkjD?QO@?T^UH7~&Z%d8&vu}J*HZG2ILZ@H z)K{c|N4V42qBX)Vax|&mxSvzxiswbxt`sLLBYDHCl@(H9LOUcg#ACjk!( zm0W?yLSl>^t7`ORwWusPXTIJ}cK+zFmaM4RKwG{w_UN6m7n)Cn(_InBJU_)Ai)ax) z3{|#3P&*5QPb9U01C7z#x zo~E?q9gSM))c*Su-W`5)W${-7Mn3G%*4@{1Cx-$t;daWAc4$){eYs#3E&+N`yuU^bY`EDhV!!LNuA*9mb+Aj-J#PYNjdkuI zhf0dUQ&juz+O$z_YKK0wUQ1aujD^fXZrqu(Y2yqX{n6tMMO})+N+o;Gep2}1>7L3a z{Dha~Dp4aggoLUODJvoLb?Y_oG+Eb!DXenA=w)B|UFxepbG;zCUZ3$AjJ^;p%-P-&20zOONaY$zt*FC+TU*7m{<*S>if*e^ zRC-`>*)It&qR)42(aWR#!Epe;Yjkj#_2YVT_ivGis~cz$i`XvkM*Of{S>-gg9NY)S zjwbpsF)^0m54Q{~41paBUzcH;2lcq?m8)6adBvxp&1pTY0jdN)x!WJhnBsgwo9cyQ+E?uck)uZMnr zqLYrg#wRhY+U{to#hkVT-y(Di7K=^XpEuDqNB~BpN535hTOIH;R43rlX~NnKhYi^T}A+ho$U>JfWOQR?#`Pd#|H;?HOxu#Sgwle9cN40 zerk!s@0!qa#xD-J-y@&T(7zA%pO&}mN*lPXU6Xb19!tvet}^8*p%$iOzmOGxzru=m zZBcHcBr_L%t&jMykOQ*in^vH4i(hT4m7H9>{`)W9T87Uav93mI>gI(!4F>irSPhnR za#M$Sg$G+uOWiw%yjDk23)62*M18tfX7k0jIB$SE|53rEp@$0c;|xFzd`&1o1MTAB zT-GK+4Qo+dWzMv)l||R?=6@%@FHgB#?t31REZNsfQ&X}@F-yJ;ruRR z{ysh`a{~!*#>pt-$%e{Gdo95Z_?P+>p@55q!*zDf+LHN4$wGKLgWFzvU?Acov3`)f zgfP}5{krsi-aX3I&)@%EFl@jXVN#kEmTr-EufO#Y*qEZzhj2mDWA)9w;JkQ1_jI4q zODyC(ui6nQ#%!!x#AlMu)|-&$9v{KuNTmt2{}r*-s1F$KN1lOpv9(|fGygNOw*|u( zGHJp1#{pJ4L&o7%#&d~Oi(IU%5@x7bkx+_oOEg4dB_w>m8&eZO%Mmp6lC) z#3qNy$cs!1juC)F3?0O3q!U;ve%!h+{>_(!NWU#X+#>M&SgB__b@`X1?#aBD!1g83 zXZ@~lUwZLT(I@2pSswBK-lusP+ZN6&JIMqBy?zCoYzX%xLT0WTK;J100Tc!#C#@`n JdHC|p{{g0!V445` literal 0 HcmV?d00001 diff --git a/src/main.cpp b/src/main.cpp index 675da35..7b111ec 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -12,11 +12,132 @@ #include #include #include "testing_helpers.hpp" +#include +#include + + +void bench_mark() +{ + int run_times = 100; + const int block_choice = 5; + const int input_choice = 4; + const int method_choice = 7; + + int blockSizes[] = { 32, 128, 256, 512, 1024 }; + int inputSizes[] = { 8, 16, 20, 24 }; + std::string methods[] = { + "cpu scan", + "naive scan", + "eff scan", + "thrust scan", + "cpu compact no scan", + "cpu compact", + "gpu compact" + }; + + double result[method_choice * block_choice * input_choice]; + + for (int i = 0; i < run_times; ++i) + { + printf("=========Running %d round ... \n", i+1); + for (int j = 0; j < block_choice; ++j) + { + printf("==================Running on blockSize %d ... \n", blockSizes[j]); + for (int k = 0; k < input_choice; ++k) + { + printf("=============================Running on inputSize 2^%d ... \n", inputSizes[k]); + int idx; + // generate input + int SIZE = 1 << inputSizes[k]; + int *a = new int[SIZE]; + int *b = new int[SIZE]; + + genArray(SIZE - 1, a, 1000); + int cur_method = 0; + + // cpu scan + idx = block_choice * input_choice * (cur_method++) + j * input_choice + k; + printf("==test on pos %d \n", idx); + zeroArray(SIZE, b); + result[idx] += StreamCompaction::CPU::scan(SIZE, b, a); + + // naive scan + idx = block_choice * input_choice * (cur_method++) + j * input_choice + k; + printf("==test on pos %d \n", idx); + zeroArray(SIZE, b); + result[idx] + += StreamCompaction::Naive::scan(SIZE, b, a, blockSizes[j]); + + // work-efficient scan + idx = block_choice * input_choice * (cur_method++) + j * input_choice + k; + printf("==test on pos %d \n", idx); + zeroArray(SIZE, b); + result[idx] + += StreamCompaction::Efficient::scan(SIZE, b, a, blockSizes[j]); + + // thrust scan + idx = block_choice * input_choice * (cur_method++) + j * input_choice + k; + printf("==test on pos %d \n", idx); + zeroArray(SIZE, b); + result[idx] + += StreamCompaction::Thrust::scan(SIZE, b, a); + + // cpu compact no scan + idx = block_choice * input_choice * (cur_method++) + j * input_choice + k; + printf("==test on pos %d \n", idx); + zeroArray(SIZE, b); + double time; + StreamCompaction::CPU::compactWithoutScan(SIZE, b, a, time); + result[idx] += time; + + // cpu compact + idx = block_choice * input_choice * (cur_method++) + j * input_choice + k; + printf("==test on pos %d \n", idx); + zeroArray(SIZE, b); + StreamCompaction::CPU::compactWithScan(SIZE, b, a, time); + result[idx] += time; + + // gpu compact + idx = block_choice * input_choice * (cur_method++) + j * input_choice + k; + printf("test on pos %d \n", idx); + zeroArray(SIZE, b); + StreamCompaction::Efficient::compact(SIZE, b, a, time); + result[idx] += time; + + delete[] a; + delete[] b; + } + } + } + + // print result + printf("===================== RESULTS ========================\n"); + for (int j = 0; j < block_choice; ++j) + { + printf("======= block size %d ===========\n", blockSizes[j]); + + for (int i = 0; i < method_choice; ++i) + { + printf("==== method %s ==== ", methods[i].c_str()); + for (int k = 0; k < input_choice; ++k) + { + printf(" %d input %f time ", inputSizes[k], result[block_choice * input_choice * i + j * input_choice + k] / run_times); + } + printf("\n"); + } + + printf("=====================================\n"); + } +} + int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int SIZE = 1 << 16; const int NPOT = SIZE - 3; - int a[SIZE], b[SIZE], c[SIZE]; + //int a[SIZE], b[SIZE], c[SIZE]; + int *a = new int[SIZE]; + int *b = new int[SIZE]; + int *c = new int[SIZE]; // Scan tests @@ -89,35 +210,97 @@ int main(int argc, char* argv[]) { int count, expectedCount, expectedNPOT; + double time; + zeroArray(SIZE, b); printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a, time); expectedCount = count; printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); zeroArray(SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a, time); expectedNPOT = count; printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); zeroArray(SIZE, c); printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a, time); printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); + count = StreamCompaction::Efficient::compact(SIZE, c, a, time); //printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); + count = StreamCompaction::Efficient::compact(NPOT, c, a, time); //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + + zeroArray(SIZE, c); + a[SIZE - 1] = 5; + printDesc("work-efficient compact, power-of-two, last non-zero"); + count = StreamCompaction::Efficient::compact(SIZE, c, a, time); + int *bb = new int[SIZE]; + int cpuCount = StreamCompaction::CPU::compactWithoutScan(SIZE, bb, a, time); + //printArray(count, c, true); + printCmpLenResult(count, cpuCount, bb, c); + + zeroArray(SIZE, c); + a[SIZE - 1] = 0; + printDesc("work-efficient compact, power-of-two, last zero"); + count = StreamCompaction::Efficient::compact(SIZE, c, a, time); + //printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + printDesc("work-efficient compact, test on special case 1"); + int test[5] = { 1, 0, 1, 0, 1 }; + count = StreamCompaction::Efficient::compact(5, c, test, time); + printCmpLenResult(count, 3, c, c); + + printDesc("work-efficient compact, test on special case 2"); + int test1[5] = { 1, 0, 1, 0, 0 }; + count = StreamCompaction::Efficient::compact(5, c, test1, time); + printCmpLenResult(count, 2, c, c); + + printDesc("cpu compact without scan, test on special case 1"); + count = StreamCompaction::CPU::compactWithoutScan(5, c, test, time); + printCmpLenResult(count, 3, c, c); + + + + //bench_mark(); + + int testArr[] = { 0, 5, -2, 6, 3, 7, -5, 2, 7, 1 }; + int resultArr[10]; + int goalArr[] = { -5, -2, 0, 1, 2, 3, 5, 6, 7, 7 }; + + StreamCompaction::Efficient::radix_sort(10, resultArr, testArr); + printDesc("radix sort, test on special case"); + printArray(10, testArr, true); + printf(" sorted:\n"); + printArray(10, resultArr, true); + printCmpResult(10, goalArr, resultArr); + + genArray(SIZE, a, 10000); + StreamCompaction::Efficient::radix_sort(SIZE, b, a); + printDesc("radix sort, test"); + printArray(SIZE, a, true); + printf(" sorted:\n"); + printArray(SIZE, b, true); + std::sort(a, a + SIZE); + printCmpResult(SIZE, a, b); + + + delete[] a; + delete[] b; + delete[] c; + delete[] bb; } diff --git a/stat.xlsx b/stat.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..b838afaed670eb6b2382ba0f1d9f5414401125a5 GIT binary patch literal 8434 zcmeHMg;$hY_Z~{Rhi;?=>F(~3?jZ%~7-DD!0TmD!>26WFyIVqOBn3qzhb|R<zuvU`=0YWXYccD9d#5`Am9!F0{{Rp0BjDj9W9XnfCN+kfEa** zY$osS2DWztgY+RD_TCov{asz?3s8~S@&L$)`Tw^6;T0%P8rACKAyB+f`6mB~OKrJU z219TkIz+_rK&rd{#o%jOvz$kdZgZY~C6LP}aT2Z}db=FRcQo!;>*m@BeKOdlixU$* z+-+b_lAa!N-MQmL?y*^5Jl zglmH>IIm4hMTeJqK4=-P5{kTeHBInwl6c1H5qqHJ$ww|#)ELIVWl~?uVjTu!J$4)2 z7O(46^-gcLX8u_#TjW)qsDuu@m5H@GZJoljngW!yub!R0HXTfGzSk9G9u&~U_B`fP z=ekvh#1X`0mLn6+0G(O@=s3sJlUDG#y$$Y{CMnbyJ?y1l`O3UbOx?dmW!IV$pJ1m& z&gjbdp?D}qL0eRq0~+#jc%)gRTYKM@I>@bL68#gZ;^i&yoN&AKq+5<)q?>^$IC}rX z@V0p1-n&h+h=Ze)?NGj(I{?7#Eeb&AZ?tSM;JJH>z?uevb=U}6g1qcqyt(iHc>YVr z|6vaP<72XM5SJP`k-^Lc%>+Lmaq2{`p%xso~J9TLD+md z;yKEiOY_x7*6u1zA1TxjyuEKkiAh{c9Yrdc9&S3QX|if_S&1|&r?+1bRox_ZLd~cP9nDfp;P3r}YMzbOP!kB$!!-pf#7`oC`M1=Y$Ny4R)Qcn;o(uyDn zB>)4---Y{kp7^^P9L;_F zWE=&D^-WFm8SD=?Z(HP-&hO+xA#{@`6qu?KxDdePml=k8mpzK- z7{~439%@1+oO=qdaW<*T)M;M}N(M!N>7$30=|b6I-kSyKZ3gglu-w-->|IO8oTI1k zho44qT^hvds?^ONF`{p z->9fBSGu@WOg8UHsV5EHe`omQc0z=qx;j(J>wSXWjE`-0u1MV@Wzm15N| z$Y1=fjaquSE2e~8RNF_jcv51B_2AIp=zM(7YA?ijj20=(SG2Ojy;j7x{PVL`{%H?f z@#LCk0cndCdz7P(ww)$AV4fLFWHkkHA88rts(N?~n8`upa~zey_%};s z3(g;a4PP6drai!waft-p%!Jz%88TwdRjV`00hUZT%nn^z)~n%vWlVq)5bSB{DIhhA*jK=^OYLr1BY-bT?e zDwyc)!ny1NFOA$FKJqk4fw=_zm})o=hWaWt#D594k;>|e4a7(*;@koL4mNK`YcG2{ zJ+Rj!HwW(@#}hcAM-9QqC^!^O;N>?>YC@`7hu*{bu8d&n5tSngFTcszs~`-}7-uPV z1MSw=&$$#zbZfngS_vb{d-iS4MF6npp~%_s_R;!hdbFB>p8JIPsc@NN$I2=J!k7%j zoz1bVgCs^_`Re)@%<2r@&sHrd5Zxrch46;}yqKMghgHxsw#XQ?mIK#fwl5&8zg&~b9Z`F2bO2zG{>OdeHv)nk?OpA;e|z%&Sdu-H z@fa#$U>Di7G>*65H^Gnb%&Q;Xy1ZCnvdd1QZfM+lV93u0Yo{VcW)?luWh*L9m2;O& znvy}m?Kq9cWUAk$n`9kdxLdKwlWdqreADqVcQr5|-~@a$)sYsUoQB;CO96H2 z97|@!E~GI{r-V9-un7cdt0ct|+GDz1VRL>{U&;;fB6J;_cI|cE zL^<48wl$;IHO%=nANy+MoVr@BUS)ywiP$hcZ^M{B#W?db3yx^El9C?3#8`8t3$h@)jcYmR%a>_wy_;%xuWFYg|S+Q*0ma{i9LE7@-9}bt$`$~u}pf=PR1FqMg1{J?N|Rd zLWl3ovQ#9nllq^yPX*z&yG7p5@|w^O$bOEe`9TnQldeBznr$BK3lFrdOLaF6!^9yq zfb_uM!ecbly>7?~v?{sZiAoXVM4__h?wZL^Foz21#vuffSAww_Q@(l=`cyjAwCK6n zxGyf_>X4h9X=%WONu^oeu?nJ$? zlXS>765KL~yX~rS658K9G5y@0B3fl{?hc|WN*Ng?D|I=(AAfMwPh}=WEVtchuG4-G zJg?kMHSQ0HXr(C*gq4wg;DlD0M2z%micay-Ivjg57JkPV&lA+2!yS7ytgjs@Y>OdxW{t3KxjKSPgu#yokomaZnZ_ zWf-ECIie2d*vfkKQkddJ1|fj?oDbPcTbN`%Zd9*KU4EZS!Znencbvugu{A1puL(GV z>!R+|hu`m&qISAX3zZk;*d2{|X!}9CccjcSfdA2y9j}-mS zb=)nUXss{WGeKI{*O=?ZniC|M3T8ftZ5O?#XCd;Ahc$1OZB`Y8vxr4Ix@}@HJ&sQ? zC!TsUq=c8{CQhNv;?JbnM0NMg)b?GwO93O1(Af<*o*~Qy?_f24aA*}N`y>-Xj zd~%4~MSRvaWOzz9Uzp4QKb{Gcd-)1E>52D2yQ>vkBUTI@9#0d&1CAq>M@kLQ`)ey_ z!mFyr#?4%PJ;-3n_y_n~`dIsFU-rFF6+rcYB51CsaqfKv+w%O+>!f z#Yzj@sIz(cj*s`9@@b-Gy`(u)1G3+sh#1=eQ~OR^LjXqP;6#hX-J^z>ul^8qZ|nXI z#%Zz>MNM&oL;{S8kR5P;kH6kvh>QJi$w-mDne!44FobST8l$7Fc11)Tx?d|G2yEqbL=QA@ z5ha`*P*K}*+3SK2r}h`yQtik1OdL)6b$aUs%&LXwq}7<0;V6?aC}||OZaAf`SK%vG zcMprb=HRHep^)N>sXD%Z>+U{>uD~h0G3b-V4_8|rDn8-5(9jLcrz-iNT@hD=r(bus zkvG4L&k%^o}P2$D4D_D7pO8lk; zLK&Iw9z?@Cs_`e4@AVW!DUHnd6| zC_kUuqwQ6iB24twR)o`hLoUmBAO}Ex48%G}dw|B~m_YX&_puQVm88bj1wK4Ge~3+s zg-nHvN4sfJpC@K3Ahtw;q}*3#=5S>-S4;SC#c3P;>WSyi;v@mP8kM66TXKGOt{MUjRoQ2OhsMVq|(C&`IQ)R8F0$;UReLwD)cXOB4BwTQ#AE_|vzb zt*&2Q`a@^4+N2NwUPA)_$bSd$j~kq~qrE-YoBOx!w=6O#X~b>mM`De3jfV=K+*pAY zb;r>NL?g@2^J~U2^%j)EPS4M2)Eai^Ijlf9QzyHbx%~2j@vJ6^D2{jav){eLW(&_K z)0k0|3k?blG1jr+EaNwPMM)HJN1kpvo^GRXN?Z@w82WgTslLAPY0%ms5fguoQ{JFT zWxX9~Q|qB-VYvOia_zCQEYAruvEV*NVrOLjvAKuwz0OpfCA@-s8`AHYx&oi&UVJ0X zvI}T(r)`GeGBy?voiEF=CrSjcI~Big?Dfv=#r#O-+XjhFg*w~oyGiz6Wlvr>SYE5s zsXofWlnB2kKAFG@CQ-^nJDYByS~fF*2|BC-JS`y+ zVf<)F{Q>heM}8)S@1b8NN@`j+oZq~P`+d$*REHBxcH%tw`|IbqvcTejdOgmtGWn$J zc`Fei0C8a9t3odW2ISDEKsQ)**xrl@Njev?SO#Po!h`of>vGkYIn*b-T5vXa&D4y4 zbjrBwZcn02!lN{*YMyKN5RJY1T%FhyoVksJJ2j!*D92qjFL#YEg&I4b$E^+u&DVI+ zolIK7J;BjKB+mll>#SEVX=$GYt_GzEw?s#8`)!_A+SY1!=J)*Ot;pM`YO1t@10R&tYUEBHizkZ-SjXr0^C?y=8oRv*^*_2T zP&+ppkXDV^8*RvAi5!JT@4rp1O^s_?`#zm@^GnRt0dJ0F5hJz;+z}##COdaq9WQqe zZ*E(6FZ&-?7KA4IufBvJV=T-CbM2U3=6Ay0CrC3+#qu zOw^!ii=F|Sx))*<1n5Pn5LM=fQP*s}-EkX~a=VARQUG|s9C=M|95qUC5T;m*xm zX&Rqv0YKR$gX+8lZOD?O{y@gW&TbP(_KLL2eCeBM7uuJ8o}CLN6eC+q9T4}m8=tPY z1^iD5A8z~2_{=n(ILH`$h;e@qjPfhB%32kZ;3)!kgosgszv9l?!{cArA@KH}mIae_ zU*f@ugri*3;jB=JuIS)|m)ocqomN{T!5SSjOKkINnVys1I+B+j)hdDWjGv zs3#ff7P#i~H(QBTPxUWTYzV>aT__#ag%(rskVndD+fzK~F$8B)w}EUW{RY}}xD|yo zmxrEy8aU;%O-b!_QI{##FY82YTuLuPsy`%W#XQ??v;x%@7T!hEe)aOp~LCrj;=MV$Ca&7Gy0*amGC6X}6(v0Q+B zm`hret?0#s^MxX}G78z&AcGaM^*XiZ)-cMAlMu_ac>k#A(J;@EmdH}LyO{Tcuw}aT zHESSEn_X{Wq~2Lyq(hKOo4+%LEe;N z`FPrQb{y~^TZr-MU;{PyD&{+s>(Z2EJi^`~hK>3=`{A9>f$Uj80C x|MUR>FjD{k|A?eNoB#a?{Hu93)nCm2b0F%dqak(|0Kh{0!VtmQoc71R{{s+vh!Ow* literal 0 HcmV?d00001 diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..23174dc 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,10 @@ namespace Common { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + if (idata[index] != 0) bools[index] = 1; + else bools[index] = 0; } /** @@ -33,6 +37,12 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + if (bools[index] == 1) + { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 190d913..e285031 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,5 +1,6 @@ #include #include "cpu.h" +#include "chrono" namespace StreamCompaction { namespace CPU { @@ -7,14 +8,22 @@ namespace CPU { /** * CPU scan (prefix sum). */ -void scan(int n, int *odata, const int *idata) { +double scan(int n, int *odata, const int *idata) { + // record time + auto start = std::chrono::system_clock::now(); + // TODO : finished - if (n <= 0) return; - odata[0] = idata[0]; + if (n <= 0) return -1; + odata[0] = 0; for (int i = 1; i < n; ++i) { - odata[i] = odata[i - 1] + idata[i]; + odata[i] = odata[i - 1] + idata[i-1]; } + + std::chrono::duration diff = (std::chrono::system_clock::now() - start); + //printf("CPU scan took %fms\n", diff.count()); + + return diff.count(); } /** @@ -22,8 +31,11 @@ void scan(int n, int *odata, const int *idata) { * * @returns the number of elements remaining after compaction. */ -int compactWithoutScan(int n, int *odata, const int *idata) { +int compactWithoutScan(int n, int *odata, const int *idata, double &time) { // TODO : finished + // record time + auto start = std::chrono::system_clock::now(); + int num_remain = 0; for (int i = 0; i < n; ++i) { @@ -32,6 +44,10 @@ int compactWithoutScan(int n, int *odata, const int *idata) { odata[num_remain++] = idata[i]; } } + + std::chrono::duration diff = (std::chrono::system_clock::now() - start); + //printf("CPU compact without scan took %fms\n", diff.count()); + time = diff.count(); return num_remain; } @@ -40,16 +56,17 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * * @returns the number of elements remaining after compaction. */ -int compactWithScan(int n, int *odata, const int *idata) { +int compactWithScan(int n, int *odata, const int *idata, double &time) { // TODO : finished + // record time + auto start = std::chrono::system_clock::now(); + // map data to 1 and 0 for non-zero and zero. - int *tmp_data = new int(n); + int *tmp_data = new int[n]; for (int i = 0; i < n; ++i) { - tmp_data[i] = idata[i] == 0 ? 0 : 1; - //if (idata[i] == 0) tmp_data[i] = 0; - //else tmp_data[i] = 1; - //printf("%d is %d\n", i, tmp_data[i]); + if (idata[i] == 0) tmp_data[i] = 0; + else tmp_data[i] = 1; } // scan @@ -61,10 +78,16 @@ int compactWithScan(int n, int *odata, const int *idata) { { if (tmp_data[i] == 1) { - odata[odata[i]-1] = idata[i]; + odata[odata[i]] = idata[i]; num_remain++; } } + + delete[] tmp_data; + + std::chrono::duration diff = (std::chrono::system_clock::now() - start); + //printf("CPU compact with scan took %fms\n", diff.count()); + time = diff.count(); return num_remain; } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 6348bf3..1c0880b 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -2,10 +2,10 @@ namespace StreamCompaction { namespace CPU { - void scan(int n, int *odata, const int *idata); + double scan(int n, int *odata, const int *idata); - int compactWithoutScan(int n, int *odata, const int *idata); + int compactWithoutScan(int n, int *odata, const int *idata, double &time); - int compactWithScan(int n, int *odata, const int *idata); + int compactWithScan(int n, int *odata, const int *idata, double &time); } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 7c63ad1..d665f0c 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,29 +2,191 @@ #include #include "common.h" #include "efficient.h" +#include namespace StreamCompaction { namespace Efficient { // TODO: __global__ - __global__ void kernScanEfficient(int N, int interval, int *data) + __global__ void kernScanUpSweep(int N, int interval, int *data) { // up sweep int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index >= N) return; - int cur_index = index + 2 * interval - 1; - int last_index = index + interval - 1; + int real_index = index * interval * 2; + if (real_index >= N) return; + int cur_index = real_index + 2 * interval - 1; + int last_index = real_index + interval - 1; + if (cur_index >= N) return; + + data[cur_index] = data[last_index] + data[cur_index]; + } + + __global__ void kernScanDownSweep(int N, int interval, int *data) + { + // down seep + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + int real_index = index * interval * 2; + if (real_index >= N) return; + int last_index = real_index + interval - 1; + int cur_index = real_index + 2 * interval - 1; if (cur_index >= N) return; + int tmp = data[last_index]; + data[last_index] = data[cur_index]; + data[cur_index] += tmp; + } + + __global__ void kernMapDigitToBoolean(int N, int digit, int *odata, const int *idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + int mask = 1 << digit; + if ((idata[index] & mask) == 0) + { + if (digit != 31) odata[index] = 0; + else odata[index] = 1; + } + else + { + if (digit != 31) odata[index] = 1; + else odata[index] = 0; + } + } + + __global__ void kernFlipBoolean(int N, int *odata, const int *idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + if (idata[index] == 0) + { + odata[index] = 1; + } + else + { + odata[index] = 0; + } + } + + __global__ void kernSortOneRound(int N, int *bools, int *indices_zero, int *indices_one, int maxFalse, + int *odata, const int *idata) + { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) return; + if (bools[index] == 0) + { + // false; + odata[indices_zero[index]] = idata[index]; + } + else + { + odata[indices_one[index] + maxFalse] = idata[index]; + } + } + + void radix_sort(int n, int *odata, const int *idata, int blockSize) + { + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + int *cuda_idata, *cuda_bools_one, *cuda_bools_zero, + *cuda_indices_one, *cuda_indices_zero, *cuda_odata; + int *bools = new int[n]; + int *indices = new int[n]; + + cudaMalloc((void**)&cuda_idata, n * sizeof(int)); + cudaMalloc((void**)&cuda_bools_one, n * sizeof(int)); + cudaMalloc((void**)&cuda_bools_zero, n * sizeof(int)); + cudaMalloc((void**)&cuda_odata, n * sizeof(int)); + cudaMalloc((void**)&cuda_indices_one, n * sizeof(int)); + cudaMalloc((void**)&cuda_indices_zero, n * sizeof(int)); + + cudaMemcpy(cuda_idata, idata, n*sizeof(int), cudaMemcpyHostToDevice); + + for (int i = 0; i < 32; ++i) + { + kernMapDigitToBoolean << > >(n, i, cuda_bools_one, cuda_idata); + cudaMemcpy(bools, cuda_bools_one, n * sizeof(int), cudaMemcpyDeviceToHost); + scan(n, indices, bools); + cudaMemcpy(cuda_indices_one, indices, n * sizeof(int), cudaMemcpyHostToDevice); + + kernFlipBoolean << > >(n, cuda_bools_zero, cuda_bools_one); + cudaMemcpy(bools, cuda_bools_zero, n * sizeof(int), cudaMemcpyDeviceToHost); + scan(n, indices, bools); + cudaMemcpy(cuda_indices_zero, indices, n * sizeof(int), cudaMemcpyHostToDevice); - data[cur_index] = data[last_index]; + int totalFalse; + cudaMemcpy(&totalFalse, &cuda_indices_zero[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + totalFalse += bools[n - 1]; + + kernSortOneRound << > >(n, cuda_bools_one, cuda_indices_zero, cuda_indices_one, totalFalse, cuda_odata, cuda_idata); + + int *tmp = cuda_idata; + cuda_idata = cuda_odata; + cuda_odata = tmp; + } + + cudaMemcpy(odata, cuda_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(cuda_idata); + cudaFree(cuda_bools_one); + cudaFree(cuda_bools_zero); + cudaFree(cuda_odata); + cudaFree(cuda_indices_one); + cudaFree(cuda_indices_zero); } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + float scan(int n, int *odata, const int *idata, int blockSize) { + // record time + float diff(0); + cudaEvent_t start, end; + cudaEventCreate(&start); + cudaEventCreate(&end); + cudaEventRecord(start, 0); + + int loop_times = ilog2ceil(n); + int totalNum = 1; + for (int i = 0; i < loop_times; ++i) + { + totalNum *= 2; + } + int interval = 1; + //printf("total looptimes: %d, total num %d\n", loop_times, totalNum); + + int *tmp_data; + cudaMalloc((void**)&tmp_data, totalNum * sizeof(int)); + cudaMemset(tmp_data, 0, totalNum); + cudaMemcpy(tmp_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + + // up sweep + for (int i = 0; i < loop_times; ++i) + { + dim3 fullBlocksPerGrid((totalNum / (interval * 2) + blockSize - 1) / blockSize); + kernScanUpSweep << > >(totalNum, interval, tmp_data); + interval *= 2; + } + + // down sweep + cudaMemset(&tmp_data[totalNum - 1], 0, sizeof(int)); + + for (int i = 0; i < loop_times; ++i) + { + dim3 fullBlocksPerGrid((totalNum / interval + blockSize - 1) / blockSize); + interval /= 2; + kernScanDownSweep << > >(totalNum, interval, tmp_data); + } + + cudaMemcpy(odata, tmp_data, n*sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(tmp_data); + + cudaEventRecord(end, 0); + cudaEventSynchronize(start); + cudaEventSynchronize(end); + cudaEventElapsedTime(&diff, start, end); + + //printf("GPU scan took %fms\n", diff); + return diff; } /** @@ -36,9 +198,69 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ - int compact(int n, int *odata, const int *idata) { + int compact(int n, int *odata, const int *idata, double &time, int blockSize) { // TODO - return -1; + // record time + float diff(0); + cudaEvent_t start, end; + cudaEventCreate(&start); + cudaEventCreate(&end); + cudaEventRecord(start, 0); + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + int *indices_cuda; + int *bools_cuda; + int *idata_cuda; + int *odata_cuda; + + int *indices = new int[n]; + int *bools = new int[n]; + + cudaMalloc((void**)&indices_cuda, n * sizeof(int)); + cudaMalloc((void**)&bools_cuda, n * sizeof(int)); + cudaMalloc((void**)&idata_cuda, n * sizeof(int)); + cudaMalloc((void**)&odata_cuda, n * sizeof(int)); + + cudaMemcpy(idata_cuda, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + Common::kernMapToBoolean<<>>(n, bools_cuda, idata_cuda); + + cudaMemcpy(bools, bools_cuda, n * sizeof(int), cudaMemcpyDeviceToHost); + + scan(n, indices, bools); + + cudaMemcpy(indices_cuda, indices, n * sizeof(int), cudaMemcpyHostToDevice); + + Common::kernScatter << > >(n, odata_cuda, idata_cuda, bools_cuda, indices_cuda); + + int remain_elem; + cudaMemcpy(&remain_elem, &indices_cuda[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + remain_elem += bools[n - 1]; + //for (int i = 0; i < n; ++i) + //{ + // if (bools[i] == 1) remain_elem++; + //} + + cudaMemcpy(odata, odata_cuda, remain_elem * sizeof(int), cudaMemcpyDeviceToHost); + + delete[] bools; + delete[] indices; + + cudaFree(indices_cuda); + cudaFree(bools_cuda); + cudaFree(idata_cuda); + cudaFree(odata_cuda); + + cudaEventRecord(end, 0); + cudaEventSynchronize(start); + cudaEventSynchronize(end); + cudaEventElapsedTime(&diff, start, end); + + //printf("GPU compact took %fms\n", diff); + + time = diff; + return remain_elem; } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 395ba10..c3ce7ae 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,8 +2,10 @@ namespace StreamCompaction { namespace Efficient { - void scan(int n, int *odata, const int *idata); + float scan(int n, int *odata, const int *idata, int blockSize = 128); - int compact(int n, int *odata, const int *idata); + int compact(int n, int *odata, const int *idata, double &time, int blockSize = 128); + + void radix_sort(int n, int *odata, const int *idata, int blockSize = 128); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 0009eb9..5cc01f5 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -25,28 +25,44 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + float scan(int n, int *odata, const int *idata, int blockSize) { // TODO : finished - int blockSize(128); + // record time + float diff(0); + cudaEvent_t start, end; + cudaEventCreate(&start); + cudaEventCreate(&end); + cudaEventRecord(start, 0); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); int *tmp_data, *tmp_data2; cudaMalloc((void**)&tmp_data, n * sizeof(int)); cudaMalloc((void**)&tmp_data2, n * sizeof(int)); - cudaMemcpy(tmp_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); - + cudaMemset(tmp_data2, 0, n * sizeof(int)); + cudaMemset(tmp_data, 0, n * sizeof(int)); + cudaMemcpy(tmp_data+1, idata, (n-1) * sizeof(int), cudaMemcpyHostToDevice); int loop_times = ilog2ceil(n); int start_idx = 1; - for (int i = 1; i <= loop_times; ++i) + for (int i = 0; i < loop_times; ++i) { kernScan<<>>(n, start_idx, tmp_data2, tmp_data); int *tmp_pt = tmp_data; tmp_data = tmp_data2; - tmp_data2 = tmp_data; + tmp_data2 = tmp_pt; start_idx *= 2; } cudaMemcpy(odata, tmp_data, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(tmp_data); + cudaFree(tmp_data2); + + cudaEventRecord(end, 0); + cudaEventSynchronize(start); + cudaEventSynchronize(end); + cudaEventElapsedTime(&diff, start, end); + + return diff; } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 21152d6..1a4d276 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -2,6 +2,6 @@ namespace StreamCompaction { namespace Naive { - void scan(int n, int *odata, const int *idata); + float scan(int n, int *odata, const int *idata, int blockSize=128); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..aaff222 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -5,6 +5,7 @@ #include #include "common.h" #include "thrust.h" +#include namespace StreamCompaction { namespace Thrust { @@ -12,10 +13,30 @@ namespace Thrust { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ -void scan(int n, int *odata, const int *idata) { +double scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::host_vector hv_in(n); + + for (int i = 0; i < n; ++i) + { + hv_in[i] = idata[i]; + } + + thrust::device_vector dv_in(hv_in), dv_out(n); + + // record time + auto start = std::chrono::system_clock::now(); + + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + std::chrono::duration diff = (std::chrono::system_clock::now() - start); + //printf("Thrust scan took %fms\n", diff.count()); + + thrust::copy(dv_out.begin(), dv_out.end(), odata); + + return diff.count(); } } diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h index 06707f3..07e50a1 100644 --- a/stream_compaction/thrust.h +++ b/stream_compaction/thrust.h @@ -2,6 +2,6 @@ namespace StreamCompaction { namespace Thrust { - void scan(int n, int *odata, const int *idata); + double scan(int n, int *odata, const int *idata); } } From 268fe0094e8b695c03d56ae1c6c8967b2712b8dd Mon Sep 17 00:00:00 2001 From: BowenBao Date: Tue, 27 Sep 2016 22:30:14 -0400 Subject: [PATCH 3/6] comment time test --- stream_compaction/naive.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 5cc01f5..c8966dc 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -62,6 +62,7 @@ namespace StreamCompaction { cudaEventSynchronize(end); cudaEventElapsedTime(&diff, start, end); + //printf("GPU naive scan took %fms\n", diff); return diff; } From a3d6b04cc3b8d83ac41f5bb48059ed4bd6834452 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Tue, 27 Sep 2016 22:31:31 -0400 Subject: [PATCH 4/6] fix readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ae3735e..adcd574 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ I added a benchmark suite for testing the run time of each implementation under ## Performance Analysis ### Performance of different implementation -![](/images/process_time.png) +![](/image/process_time.png) Here's the test result for each of the methods. The tests are run with the block size of 256(which is decided as near optimal after testing on numerous values). For each methods, I ran 100 independent tests, and calculated their average process time. @@ -35,7 +35,7 @@ We can observe indeed that the GPU version of scan has a better performance than ### Performance of GPU methods under different block size -![](/images/process_time_blocksize.png) +![](/image/process_time_blocksize.png) The tests are run with the stream length of 2^24, each method is tested 100 times and recorded the average. Observe that the performance starts to decrease after blocksize getting over 256. From 64467801cb498d9076a2db50107cef5a57527ddf Mon Sep 17 00:00:00 2001 From: BowenBao Date: Tue, 27 Sep 2016 22:34:17 -0400 Subject: [PATCH 5/6] fix readme 2 --- README.md | 118 +++++++++++++++++++++++++++--------------------------- 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index adcd574..cfb4f4c 100644 --- a/README.md +++ b/README.md @@ -82,62 +82,62 @@ See Output. ## Output -**************** -** SCAN TESTS ** -**************** - [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 35 0 ] -==== cpu scan, power-of-two ==== - [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] -==== cpu scan, non-power-of-two ==== - [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ] - passed -==== naive scan, power-of-two ==== - passed -==== naive scan, non-power-of-two ==== - passed -==== work-efficient scan, power-of-two ==== - passed -==== work-efficient scan, non-power-of-two ==== - passed -==== thrust scan, power-of-two ==== - passed -==== thrust scan, non-power-of-two ==== - passed - -***************************** -** STREAM COMPACTION TESTS ** -***************************** - [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 1 0 ] -==== cpu compact without scan, power-of-two ==== - [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] - passed -==== cpu compact without scan, non-power-of-two ==== - [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 1 ] - passed -==== cpu compact with scan ==== - [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] - passed -==== work-efficient compact, power-of-two ==== - passed -==== work-efficient compact, non-power-of-two ==== - passed -==== work-efficient compact, power-of-two, last non-zero ==== - passed -==== work-efficient compact, power-of-two, last zero ==== - passed -==== work-efficient compact, test on special case 1 ==== - passed -==== work-efficient compact, test on special case 2 ==== - passed -==== cpu compact without scan, test on special case 1 ==== - passed -==== radix sort, test on special case ==== - [ 0 5 -2 6 3 7 -5 2 7 1 ] - sorted: - [ -5 -2 0 1 2 3 5 6 7 7 ] - passed -==== radix sort, test ==== - [ 38 7719 1238 2437 8855 1797 8365 2285 450 612 5853 8100 1142 ... 5085 6505 ] - sorted: - [ 0 0 0 0 0 0 0 1 1 1 1 1 1 ... 9999 9999 ] - passed \ No newline at end of file + **************** + ** SCAN TESTS ** + **************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 35 0 ] + ==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] + ==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ] + passed + ==== naive scan, power-of-two ==== + passed + ==== naive scan, non-power-of-two ==== + passed + ==== work-efficient scan, power-of-two ==== + passed + ==== work-efficient scan, non-power-of-two ==== + passed + ==== thrust scan, power-of-two ==== + passed + ==== thrust scan, non-power-of-two ==== + passed + + ***************************** + ** STREAM COMPACTION TESTS ** + ***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 1 0 ] + ==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed + ==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 1 ] + passed + ==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed + ==== work-efficient compact, power-of-two ==== + passed + ==== work-efficient compact, non-power-of-two ==== + passed + ==== work-efficient compact, power-of-two, last non-zero ==== + passed + ==== work-efficient compact, power-of-two, last zero ==== + passed + ==== work-efficient compact, test on special case 1 ==== + passed + ==== work-efficient compact, test on special case 2 ==== + passed + ==== cpu compact without scan, test on special case 1 ==== + passed + ==== radix sort, test on special case ==== + [ 0 5 -2 6 3 7 -5 2 7 1 ] + sorted: + [ -5 -2 0 1 2 3 5 6 7 7 ] + passed + ==== radix sort, test ==== + [ 38 7719 1238 2437 8855 1797 8365 2285 450 612 5853 8100 1142 ... 5085 6505 ] + sorted: + [ 0 0 0 0 0 0 0 1 1 1 1 1 1 ... 9999 9999 ] + passed \ No newline at end of file From 3f7b1c69af5dc1b50fdddcfc9a70fd0d46629875 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Tue, 27 Sep 2016 22:39:30 -0400 Subject: [PATCH 6/6] fix readme 3 --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index cfb4f4c..509d799 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ I followed the algorithm in the slides, and implemented a radix sort method base * Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU. * (You shouldn't compare unoptimized implementations to each other!) + See Performance Analysis. * Compare all of these GPU Scan implementations (Naive, Work-Efficient, and @@ -67,17 +68,20 @@ See Performance Analysis. allocation, memory copy), take a look at the Nsight timeline for its execution. Your analysis here doesn't have to be detailed, since you aren't even looking at the code for the implementation. + See Performance Analysis. * Write a brief explanation of the phenomena you see here. * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is it different for each implementation? + One problem with "naive" efficient GPU scan is that there are too many threads wasted(after being checked that their index mod interval is not zero). One way of improving this is to assign the index as the divided result of the original index by the interval, and compute back the actual index later in that thread. With this improvement, we can save a lot of useless works done by threads, and note that waste grows exponentially with the number of elements in stream in the original implementation. * Paste the output of the test program into a triple-backtick block in your README. * If you add your own tests (e.g. for radix sort or to test additional corner cases), be sure to mention it explicitly. + See Output. ## Output