diff --git a/README.md b/README.md index b71c458..a70589f 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,69 @@ CUDA Stream Compaction ====================== -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +###University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2 -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +Ju Yang -### (TODO: Your README) +### Tested on: Windows 7, i7-4710MQ @ 2.50GHz 8GB, GTX 870M 6870MB (Hasee Notebook K770E-i7) +![result](doc/1024.png) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +## TODOs finished: + ### 1. naive.cu + + __global__ void naive_sum(int n,int* odata, int* idata); + + void scan(int n, int *odata, const int *idata); + + ### 2. efficient.cu + + __global__ void prescan(int *g_odata, int *g_idata, int n, int*temp); + + void scan(int n, int *odata, const int *idata); + + int compact(int n, int *odata, const int *idata); + + ### 3 thrust.cu + + void scan(int n, int *odata, const int *idata); + + + ### 4 cpu.cu + + void scan(int n, int *odata, const int *idata); + + int compactWithoutScan(int n, int *odata, const int *idata) ; + + int compactWithScan(int n, int *odata, const int *idata); + + ### 5 common.cu + + __global__ void kernMapToBoolean(int n, int *bools, const int *idata); + + __global__ void kernScatter(int n, int *odata, + const int *idata, const int *bools, const int *indices); + + ### Modified the main.cpp a little bit for display. + +## Performance Graph + + ### Scanning + ![result](doc/image001.gif) + ### Thrust Scanning + ![result](doc/data_29123_image001.gif) + ### Compact + ![result](doc/data_6317_image001.gif) + +## Analysis + ### Thrust + As we can see, the thrust::exclusive_scan is rather time-costing compared with other methods. Even if I used device_vector to store the data, it is still the slowest. 
+ But since I did not free the device_vectors, the non-pow2 as second round's speed is much faster. + I think the reason is, when calling thrust functions, it will apply for some blocks/threads inside the GPU, and will release later on. + Although I tried my best to avoid any read/write from CPU to GPU, the scan function still cost some time to arrange for some place. + + ### Unfixed Known Bugs + #### 1. When using multiple blocks, sometimes the result is not right. I think it is because __syncthreads() doesn't sync blocks? + #### 2. Since I used only 1 block, when the SIZE is more than 1024(which is the limit), apparently the result is wrong. + #### 3. CPU performance is much better, and sometimes the calculating time doesn't always rise with the SIZE. + I think this is because the SIZE is still not large enough? diff --git a/doc/1024.png b/doc/1024.png new file mode 100644 index 0000000..c17a51f Binary files /dev/null and b/doc/1024.png differ diff --git a/doc/128.png b/doc/128.png new file mode 100644 index 0000000..4f9081a Binary files /dev/null and b/doc/128.png differ diff --git a/doc/16.png b/doc/16.png new file mode 100644 index 0000000..e4057f2 Binary files /dev/null and b/doc/16.png differ diff --git a/doc/256.png b/doc/256.png new file mode 100644 index 0000000..ca4fd76 Binary files /dev/null and b/doc/256.png differ diff --git a/doc/32.png b/doc/32.png new file mode 100644 index 0000000..ec22e71 Binary files /dev/null and b/doc/32.png differ diff --git a/doc/512.png b/doc/512.png new file mode 100644 index 0000000..9e7b183 Binary files /dev/null and b/doc/512.png differ diff --git a/doc/64.png b/doc/64.png new file mode 100644 index 0000000..5c4b56f Binary files /dev/null and b/doc/64.png differ diff --git a/doc/8.png b/doc/8.png new file mode 100644 index 0000000..eb5e5d4 Binary files /dev/null and b/doc/8.png differ diff --git a/doc/data.xls b/doc/data.xls new file mode 100644 index 0000000..8b434a8 Binary files /dev/null and b/doc/data.xls differ 
diff --git a/doc/data_29123_image001.gif b/doc/data_29123_image001.gif new file mode 100644 index 0000000..b077003 Binary files /dev/null and b/doc/data_29123_image001.gif differ diff --git a/doc/data_6317_image001.gif b/doc/data_6317_image001.gif new file mode 100644 index 0000000..04a6eac Binary files /dev/null and b/doc/data_6317_image001.gif differ diff --git a/doc/image001.gif b/doc/image001.gif new file mode 100644 index 0000000..3e6fc9d Binary files /dev/null and b/doc/image001.gif differ diff --git a/src/main.cpp b/src/main.cpp index 7305641..83808fc 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,14 +13,17 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array + + +const int SIZE = 1 << 10; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int a[SIZE], b[SIZE], c[SIZE]; int main(int argc, char* argv[]) { + // Scan tests + printf("Size= %d, Non-Pow2 Size= %d. \n", SIZE, NPOT); - printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); printf("****************\n"); @@ -42,7 +45,7 @@ int main(int argc, char* argv[]) { printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); + //printArray(NPOT, b, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -115,14 +118,14 @@ int main(int argc, char* argv[]) { count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); expectedNPOT = count; - printArray(count, c, true); + //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); zeroArray(SIZE, c); printDesc("cpu compact with scan"); count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); 
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); + //printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..93b2530 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,15 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = threadIdx.x; + if (idata[index] == 0)//If this is 0 + { + bools[index] = 0; + } + else + { + bools[index] = 1; + } } /** @@ -33,6 +42,13 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + + int index = threadIdx.x; + + if (bools[index]!=0) + { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..2b30508 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,15 +1,15 @@ #include #include "cpu.h" -#include "common.h" +#include "common.h" namespace StreamCompaction { namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** @@ -20,6 +20,14 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int i; + int count=0; + + for (i = 0; i < n; i++) + { + count += idata[i]; + odata[i] = count; + } timer().endCpuTimer(); } @@ -31,8 +39,21 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int i; + int count = 0; + for (i = 0; i < n; i++) + { + if (idata[i] != 0) + { + 
odata[i-count] = idata[i]; + } + else + { + count++; + } + } timer().endCpuTimer(); - return -1; + return n-count; } /** @@ -43,8 +64,23 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int i; + int count=0; + + for (i = 0; i < n; i++) + { + if (idata[i] != 0) + { + odata[i - count] = idata[i]; + } + else + { + count++; + } + } + timer().endCpuTimer(); - return -1; + return n- count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..0b1754b 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -5,36 +5,214 @@ namespace StreamCompaction { namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ + __global__ void fillempty(int n,int * start, int fill) + { + int index= threadIdx.x; + + start[index+n] = fill; + } + + //Refered from Nvidia + //https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_pref01.html + // + __global__ void prescan(int *g_odata, int *g_idata, int n, int*temp) + { + //extern __shared__ int temp[]; // allocated on invocation + + int thid = blockIdx.x * blockDim.x + threadIdx.x; + int offset = 1; + + temp[2 * thid] = g_idata[2 * thid]; // load input into shared memory + temp[2 * thid + 1] = g_idata[2 * thid + 1]; + + int d,ai,bi; + + // is >> faster than /=2 for int? 
+ for (d = n >> 1; d > 0; d >>= 1)// build sum in place up the tree + { + __syncthreads(); + if (thid < d) + { + ai = offset*(2 * thid + 1) - 1; + bi = offset*(2 * thid + 2) - 1; + + temp[bi] += temp[ai]; + } + offset *= 2; + } + + if (thid == 0) + { + temp[n - 1] = 0; // clear the last element + } + + int t; + + for (d = 1; d < n; d *= 2) // traverse down tree & build scan + { + offset >>= 1; + __syncthreads(); + if (thid < d) + { + ai = offset*(2 * thid + 1) - 1; + bi = offset*(2 * thid + 2) - 1; + + t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + __syncthreads(); + + g_odata[2 * thid] = temp[2 * thid]; // write results to device memory + g_odata[2 * thid + 1] = temp[2 * thid + 1]; + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + + //Deal with non pow2 arrays + int N; + int count; + count = ilog2(n); + N = 1<> > (n,buffer3, 0); + fillempty << <1, (4 + N - n) >> > (n,buffer4, 0); + fillempty << <1, (4 + N - n) >> > (n,temp, 0); + + //Start Timer + timer().startGpuTimer(); // TODO - timer().endGpuTimer(); + prescan<<<1,N/2>>>(buffer4, buffer3, N,temp); + //END Timer + timer().endGpuTimer(); + + //COPY data back to CPU + cudaMemcpy(odata, buffer4+1, N * sizeof(int), cudaMemcpyDeviceToHost); + //Should I do this???? + odata[n - 1] = odata[n - 2] + idata[n - 1]; + //Free buffers + cudaFree(buffer3); + cudaFree(buffer4); + cudaFree(temp); } + + + + __global__ void naive_sum2(int n, int* odata, int* idata) + { + int index = threadIdx.x; + int i; + int * ping; + int * pong; + int * swap; + ping = idata; + pong = odata; + __syncthreads(); + + for (i = 1; i < n; i *= 2) + { + if (index - i >= 0) + { + pong[index] = ping[index] + ping[index - i]; + } + else + { + pong[index] = ping[index]; + } + //Ping-Pong here! 
+ swap = ping; + ping = pong; + pong = swap; + __syncthreads(); + } + } + + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. + */ - /** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. - */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO - timer().endGpuTimer(); - return -1; + + int* data; + cudaMalloc((void**)&data, n * sizeof(int)); + cudaMemcpy(data, idata, n * sizeof(int), cudaMemcpyHostToDevice); //COPY from idata in CPU to data in GPU + + int* out; + cudaMalloc((void**)&out, n * sizeof(int)); + cudaMemcpy(out, idata, n * sizeof(int), cudaMemcpyHostToDevice); //COPY from idata in CPU to data in GPU + + int* bools; + cudaMalloc((void**)&bools, n * sizeof(int)); + + int* index; + cudaMalloc((void**)&index, n * sizeof(int)); + + + timer().startGpuTimer(); + + Common::kernMapToBoolean<<<1,n>>>(n, bools, data); //map the bools + + //TEST + cudaMemcpy(odata, bools, n * sizeof(int), cudaMemcpyDeviceToHost); //COPY from idata in CPU to data in GPU + int i; + int count = 0; + + + for (i = 0; i < n; i++) + { + count += odata[i]; + odata[i] = count-1; + } + + cudaMemcpy(index, odata, n * sizeof(int), cudaMemcpyHostToDevice); //COPY from idata in CPU to data in GPU + cudaMemcpy(data, idata, n * sizeof(int), cudaMemcpyHostToDevice); //COPY from idata in CPU to data in GPU + + Common::kernScatter << <1, n>> > (n, out, data, bools, index);//write to out + + timer().endGpuTimer(); + + //COPY 
data back to CPU + cudaMemcpy(odata, out, n * sizeof(int), cudaMemcpyDeviceToHost); //COPY back to CPU + + //Handle the first + if(idata[0]!=0) + { + odata[0] = idata[0]; + } + return count; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..55f7bb5 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -5,21 +5,71 @@ namespace StreamCompaction { namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } // TODO: __global__ + __global__ void naive_sum(int n,int* odata, int* idata) + { + int index = blockIdx.x *blockDim.x + threadIdx.x; + int i; + int * ping; + int * pong; + int * swap; + ping = idata; + pong = odata; + __syncthreads(); + + for (i = 1; i < n; i *= 2) + { + if (index - i >= 0) + { + pong[index] = ping[index] + ping[index - i]; + } + else + { + pong[index] = ping[index]; + } + //Ping-Pong here! + swap = ping; + ping = pong; + pong = swap; + __syncthreads(); + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO - timer().endGpuTimer(); + int count; + + + //Create 2 buffers + int* buffer1; + int* buffer2; + cudaMalloc((void**)&buffer1, n * sizeof(int)); + cudaMalloc((void**)&buffer2, n * sizeof(int)); + cudaMemcpy(buffer1, idata, n * sizeof(int), cudaMemcpyHostToDevice); //COPY from idata in CPU to buffer1 in GPU + cudaMemcpy(buffer2, idata, n * sizeof(int), cudaMemcpyHostToDevice); //COPY from idata in CPU to buffer2 in GPU + //Start timer + timer().startGpuTimer(); + //Do Naive_Sum here! 
+ naive_sum << <1,n>> > (n, buffer2, buffer1); + //End timer + timer().endGpuTimer(); + //COPY data back to CPU + cudaMemcpy(odata, buffer1, n * sizeof(int), cudaMemcpyDeviceToHost); + //Free buffers + cudaFree(buffer1); + cudaFree(buffer2); + } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..c7088b5 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -8,21 +8,42 @@ namespace StreamCompaction { namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + int* a; + cudaMalloc((void**)&a, n * sizeof(int)); + cudaMemcpy(a, idata, n * sizeof(int), cudaMemcpyHostToDevice); //COPY from CPU to GPU + + thrust::device_vector in(n+1); + thrust::device_vector out(n+1); + + thrust::device_ptr here(a); + thrust::copy(here, here + n, in.begin()); //CPOY from GPU to Thrust + + //Start Timing timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); + + thrust::exclusive_scan(in.begin(), in.end(), out.begin()); + timer().endGpuTimer(); + //End Timing + + thrust::copy(out.begin()+1, out.end(), odata); //COPY from Thrust to CPU + + cudaFree(a); + + } } }