diff --git a/README.md b/README.md index b71c458..0e468b2 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,42 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Yash Vardhan +* Tested on: Windows 10 Pro, Intel i5-4200U (4) @ 2.600GHz 4GB, NVIDIA GeForce 840M 2048MB -### (TODO: Your README) +In this project, I have implemented the stream compaction and scan (prefix sum) in CUDA on both a CPU and a GPU, and have compared the performance of both of these side-by-side. The parallel version of Stream Compaction run on a GPU is a useful tool for many applications like deferred shading and path tracer algorithms. Algorithms like scan (a.k.a. prefix-sum) are the basis of many algorithms. They are specifically designed to run on GPU architecture. -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +The scan algorithms were implemented on - +- CPU +- Naive Scan on GPU +- Work-efficient Scan on GPU +- Scan using Thrust on GPU. + +Performance Analysis +------------------------- + +Block Sizes were varied from 16 to 1024. The performance peaked around 64, 128 and 256. I selected to benchmark the performance graphs on a block size of 128. + +### Time(in ms) vs Number of elements in array +---------------------------------------------- + +![](img/scan.jpg) + +![](img/Data.png) + +The size of array was increased incrementally with a magnitude of 2^4. The transition from 2^16 to 2^20 showed a significant divergence between the thrust and naive GPU implementations. Also a difference between the Work-Efficient and naive GPU implementations was observed. The Work-efficient GPU Scan could be optimized much more by using shared memory, which will be reflected in a future benchmark test. 
+ +### Scan Results +---------------- + +Tested on array size of 512 + +![](img/resScan.png) + +### Stream Compaction Results +----------------------------- + +Tested on array size of 512 + +![](img/resCompact.png) diff --git a/img/Data.png b/img/Data.png new file mode 100644 index 0000000..7e8a6be Binary files /dev/null and b/img/Data.png differ diff --git a/img/resCompact.png b/img/resCompact.png new file mode 100644 index 0000000..1c0d9f8 Binary files /dev/null and b/img/resCompact.png differ diff --git a/img/resScan.png b/img/resScan.png new file mode 100644 index 0000000..b109058 Binary files /dev/null and b/img/resScan.png differ diff --git a/img/scan.jpg b/img/scan.jpg new file mode 100644 index 0000000..b776a1c Binary files /dev/null and b/img/scan.jpg differ diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..c8709e7 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_50 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..54e9132 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,10 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + bools[index] = (idata[index] != 0) ? 
1 : 0; } /** @@ -33,7 +37,12 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + int i = indices[index]; + if (bools[index] == 1) + odata[i] = idata[index]; } - } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 55f1b38..fee9207 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -9,6 +9,8 @@ #include #include #include + +#define blockSize 128 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) @@ -27,7 +29,7 @@ inline int ilog2(int x) { } inline int ilog2ceil(int x) { - return ilog2(x - 1) + 1; + return ilog2(x - 1) + 1; } namespace StreamCompaction { diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..28c5f9a 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,15 +1,15 @@ #include #include "cpu.h" -#include "common.h" +#include "common.h" namespace StreamCompaction { namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** @@ -17,9 +17,15 @@ namespace StreamCompaction { * For performance analysis, this is supposed to be a simple for loop. * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. 
*/ - void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO + void scan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + //TODO + int sum = 0; + for (int i = 0; i < n; i++) + { + odata[i] = sum; + sum += idata[i]; + } timer().endCpuTimer(); } @@ -31,8 +37,17 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int numel = 0; + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + odata[numel] = idata[i]; + numel++; + } + } timer().endCpuTimer(); - return -1; + return numel; } /** @@ -43,8 +58,25 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int *oscan = (int*)malloc(n * sizeof(int)); + int *iscan = (int*)malloc(n * sizeof(int)); + for (int i = 0; i < n; i++) + if(idata[i]==0) + iscan[i] = 0; + else + iscan[i] = 1; + scan(n, oscan, iscan); + int numel = 0; + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + odata[oscan[i]] = idata[i]; + numel++; + } + } timer().endCpuTimer(); - return -1; + return numel; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..7d4cea2 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,13 +12,66 @@ namespace StreamCompaction { return timer; } + __global__ void upsweep(int n, int k, int* dev) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + if ((index % (2 * k) == 0) && (index + (2 * k) <= n)) + dev[index + (2 * k) - 1] += dev[index + k - 1]; + } + + __global__ void downsweep(int n, int k, int* dev) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + if ((index % (2 * k) == 0) && (index + (2 * k) <= n)) + { + int tmp = dev[index + k - 1]; + dev[index + k - 1] = dev[index + (2 * k) - 1]; + dev[index + (2 * k) - 1] += tmp; + } + } + /** * Performs prefix-sum (aka scan) 
on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + void scan(int n, int *odata, const int *idata) { + + int* dev; + int potn = 1 << ilog2ceil(n); + + cudaMalloc((void**)&dev, potn * sizeof(int)); + checkCUDAError("Malloc for input device failed\n"); + + cudaMemset(dev, 0, potn * sizeof(n)); + + cudaMemcpy(dev, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy for device failed\n"); + + dim3 fullBlocksPerGrid((potn + blockSize - 1) / blockSize); + + //timer().startGpuTimer(); + + for (int k = 1; k < potn; k*=2) + { + upsweep <<< fullBlocksPerGrid, blockSize >>> (potn, k, dev); + } + + cudaMemset(dev + potn - 1, 0, sizeof(int)); + + for (int k = potn/2; k>0; k/=2) + { + downsweep <<< fullBlocksPerGrid, blockSize >>> (potn, k, dev); + } + + //timer().endGpuTimer(); + + cudaMemcpy(odata, dev, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy for output data failed\n"); + + cudaFree(dev); } /** @@ -31,10 +84,57 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { + int* idev; + int* odev; + cudaMalloc((void**)&idev, n * sizeof(int)); + checkCUDAError("cudaMalloc idata failed!"); + + cudaMalloc((void**)&odev, n * sizeof(*odev)); + checkCUDAError("cudaMalloc odev failed!"); + + cudaMemcpy(idev, idata, n * sizeof(*idata), cudaMemcpyHostToDevice); + + int potn = 1 << ilog2ceil(n); + int* boolarr; + + cudaMalloc((void**)&boolarr, potn * sizeof(int)); + checkCUDAError("cudaMalloc bool failed!"); + + cudaMemset(boolarr, 0, potn * sizeof(int)); + + int* indices; + cudaMalloc((void**)&indices, potn * sizeof(int)); + checkCUDAError("cudaMalloc bool failed!"); + + cudaMemcpy(indices, boolarr, n * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy from to dev_bools to dev_indices failed!"); + + dim3 fullBlocksPerGrid((potn + blockSize - 1) / blockSize); + timer().startGpuTimer(); // TODO + StreamCompaction::Common::kernMapToBoolean <<<fullBlocksPerGrid, blockSize>>>(n, boolarr, idev); + scan(n, indices, boolarr); + StreamCompaction::Common::kernScatter <<<fullBlocksPerGrid, blockSize>>>(n, odev, idev, boolarr, indices); + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, odev, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy for odev failed"); + + int numbool = 0; + cudaMemcpy(&numbool, boolarr + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + + int numindices = 0; + cudaMemcpy(&numindices, indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + + int total = numbool + numindices; + cudaFree(indices); + cudaFree(idev); + cudaFree(odev); + cudaFree(boolarr); + + return total; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..1b4e48e 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,15 +11,67 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } + // TODO: __global__ + __global__ void naivescan(int n, int k, int* idev, int* odev) + { + auto index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= 
n) {return;} + + if (index >= k) + odev[index] = idev[index] + idev[index - k]; + else + odev[index] = idev[index]; + } + + __global__ void inc2exc(int n, int* idev, int* odev) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { return; } + + if (index > 0) + odev[index] = idev[index - 1]; + else + odev[index] = 0; + + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + // TODO + int* idev; + int* odev; + cudaMalloc((void**)&idev, n * sizeof(int)); + checkCUDAError("Malloc for input device failed\n"); + + cudaMalloc((void**)&odev, n * sizeof(int)); + checkCUDAError("Malloc for input device failed\n"); + + cudaMemcpy(idev, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy for input device failed\n"); + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + timer().startGpuTimer(); - // TODO + + for (int k = 1; k < n; k<<=1) + { + naivescan <<< fullBlocksPerGrid, blockSize >>> (n, k, idev, odev); + std::swap(idev, odev); + } + + inc2exc <<< fullBlocksPerGrid, blockSize >>> (n, idev, odev); + timer().endGpuTimer(); + + cudaMemcpy(odata, odev, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy for output device failed\n"); + + cudaFree(odev); + cudaFree(idev); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..0bcea1a 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,20 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + + thrust::device_vector<int> dv_in(idata, idata + n); + thrust::device_vector<int> dv_out(odata, odata + n); + timer().startGpuTimer(); + // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + timer().endGpuTimer(); + + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } }