diff --git a/README.md b/README.md index 0e38ddb..c35b973 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,82 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Sydney Miller + * [LinkedIn](https://www.linkedin.com/in/sydney-miller-upenn/) +* Tested on: GTX 222 222MB (CETS Virtual Lab) -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### README +### Performance Analysis + +#### Optimize Block Sizes for Each Implementation +![Scan Run Time vs Block Size Chart](img/ScanRunTimeVSBlockSizeChart.png) +![Scan Run Time vs Block Size Graph](img/ScanRunTimeVSBlockSize.png) + +![Stream Compaction Run Time vs Block Size Chart](img/StreamCompactionRuntimeVSBlockSizeChart.png) +![Stream Compaction Run Time vs Block Size Graph](img/StreamCompactionRuntimeVSBlockSize.png) + +For most of the implementations it seems like they perform the best when the black size is not too big or small. A smaller block size for stream compaction seemed to work better compared to scan. + +#### Compare GPU Scan implementations to Serial CPU For Varying Array Sizess +![Stream Compaction Run Time vs Block Size Chart](img/ScanRunTimeVSArraySizeChart.png) +![Stream Compaction Run Time vs Block Size Graph](img/ScanRunTimeVSArraySize.png) + +All of the implementations performed similarly in that the run time increased when the size of the array increased. The thrust implementation compared to the other implementation was much faster and a much shallower slope as seen in the chart above. This means that the thrust implementation handles exponential growth of array sizes a lot better than the other implementations. My guess is that the thrust implementation handles memory allocation much better than the other GPU implementations since we are able to see the greatest performance difference with larger amounts of memory being used. The CPU implantation was the next fastest implementation. I think the bottlenecks for the various GPU implementations are their use of global memory. In addition, some implementation use %, which can take longer on a GPU compared to comparison operators. + +#### Output of Test Program +``` +**************** +** SCAN TESTS ** +**************** + [ 5 39 0 1 23 32 46 30 49 44 40 18 31 ... 10 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0005ms (std::chrono Measured) + [ 0 5 44 44 45 68 100 146 176 225 269 309 327 ... 6474 6484 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0005ms (std::chrono Measured) + [ 0 5 44 44 45 68 100 146 176 225 269 309 327 ... 6380 6411 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.043008ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.03584ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.0856ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.04608ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 3 0 3 1 2 0 2 3 0 0 2 3 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0009ms (std::chrono Measured) + [ 1 3 3 1 2 2 3 2 3 3 2 2 2 ... 3 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0011ms (std::chrono Measured) + [ 1 3 3 1 2 2 3 2 3 3 2 2 2 ... 3 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.0006ms (std::chrono Measured) + [ 1 3 3 1 2 2 3 2 3 3 2 2 2 ... 3 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.033792ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.044032ms (CUDA Measured) + passed +``` diff --git a/img/ScanRunTimeVSArraySize.png b/img/ScanRunTimeVSArraySize.png new file mode 100644 index 0000000..fd66d9f Binary files /dev/null and b/img/ScanRunTimeVSArraySize.png differ diff --git a/img/ScanRunTimeVSArraySizeChart.png b/img/ScanRunTimeVSArraySizeChart.png new file mode 100644 index 0000000..eb761fd Binary files /dev/null and b/img/ScanRunTimeVSArraySizeChart.png differ diff --git a/img/ScanRunTimeVSBlockSize.png b/img/ScanRunTimeVSBlockSize.png new file mode 100644 index 0000000..9518b15 Binary files /dev/null and b/img/ScanRunTimeVSBlockSize.png differ diff --git a/img/ScanRunTimeVSBlockSizeChart.png b/img/ScanRunTimeVSBlockSizeChart.png new file mode 100644 index 0000000..006e173 Binary files /dev/null and b/img/ScanRunTimeVSBlockSizeChart.png differ diff --git a/img/StreamCompactionRuntimeVSBlockSize.png b/img/StreamCompactionRuntimeVSBlockSize.png new file mode 100644 index 0000000..619047b Binary files /dev/null and b/img/StreamCompactionRuntimeVSBlockSize.png differ diff --git a/img/StreamCompactionRuntimeVSBlockSizeChart.png b/img/StreamCompactionRuntimeVSBlockSizeChart.png new file mode 100644 index 0000000..039fd84 Binary files /dev/null and b/img/StreamCompactionRuntimeVSBlockSizeChart.png differ diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..e77300b 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,6 +1,6 @@ #include "common.h" -void checkCUDAErrorFn(const char *msg, const char *file, int line) { +void checkCUDAErrorFn(const char* msg, const char* file, int line) { cudaError_t err = cudaGetLastError(); if (cudaSuccess == err) { return; @@ -22,17 +22,29 @@ namespace StreamCompaction { * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. */ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + __global__ void kernMapToBoolean(int n, int* bools, const int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + bools[index] = idata[index] == 0 ? 0 : 1; } /** * Performs scatter on an array. That is, for each element in idata, * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. */ - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO + __global__ void kernScatter(int n, int* odata, + const int* idata, const int* bools, const int* indices) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + if (bools[index] == 1) { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..4f4bdaa 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,10 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = idata[i - 1] + odata[i - 1]; + } timer().endCpuTimer(); } @@ -30,9 +33,15 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int index = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[index] = idata[i]; + index++; + } + } timer().endCpuTimer(); - return -1; + return index; } /** @@ -41,10 +50,26 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; + //timer().startCpuTimer(); + // create a new array mapping the input array to zero's and one's + int* zerosAndOnes = new int[n]; + for (int i = 0; i < n; i++) { + idata[i] == 0 ? zerosAndOnes[i] = 0 : zerosAndOnes[i] = 1; + } + + // scan new array + int* scannedArray = new int[n]; + scan(n, scannedArray, zerosAndOnes); + + //scatter + for (int i = 0; i < n; i++) { + if (zerosAndOnes[i] == 1) { + odata[scannedArray[i]] = idata[i]; + } + } + + //timer().endCpuTimer(); + return scannedArray[n-1]; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..42105a3 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -1,8 +1,13 @@ +#include +#include #include #include #include "common.h" #include "efficient.h" +#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) + + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +17,131 @@ namespace StreamCompaction { return timer; } + // up sweep + __global__ void upSweep(int n, int d, int* data, int dist, int distHalf) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index >= n || index % dist != 0) { + return; + } + + int toUpdate = index + dist - 1; + int toGet = index + distHalf - 1; + + data[toUpdate] += data[toGet]; + } + + // up sweep efficient + __global__ void upSweepEfficient(int n, int d, int* data, int stride, int offset) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n || index >= n / stride) { + return; + } + + int toUpdate = ((index + 1) * stride) - 1; + int toGet = toUpdate - offset; + + data[toUpdate] += data[toGet]; + } + + // down sweep + __global__ void downSweep(int n, int d, int* data, int dist, int distHalf) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n || index % dist != 0) { + return; + } + + int t_index = index + distHalf - 1; + int replace_index = index + dist - 1; + + int t = data[t_index]; + data[t_index] = data[replace_index]; + data[replace_index] += t; + } + + // down sweep efficient + __global__ void downSweepEfficient(int n, int d, int* data, int stride, int offset) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n || index >= n / stride) { + return; + } + + int replace_index = n - 1 - (index * stride); + int t_index = replace_index - offset; + + + int t = data[t_index]; + data[t_index] = data[replace_index]; + data[replace_index] += t; + } + + // set n-1 to power of 2 values equal to 0 + __global__ void setZeros(int n, int power_of_2, int* data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index < power_of_2 && index >= n - 1) { + data[index] = 0; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int* odata, const int* idata) { + int power_of_2 = 1; + while (power_of_2 < n) { + power_of_2 *= 2; + } + + // create array of size power of 2 + int* data; + + cudaMalloc((void**)&data, power_of_2 * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc data failed!"); + + // fill array and pad end with 0's + std::unique_ptrpadded_array{ new int[power_of_2] }; + cudaMemcpy(padded_array.get(), idata, sizeof(int) * n, cudaMemcpyHostToHost); + for (int i = n; i < power_of_2; i++) { + padded_array[i] = 0; + } + + cudaMemcpy(data, padded_array.get(), sizeof(int) * power_of_2, cudaMemcpyHostToDevice); + + // kernel values + int blockSize = 128; + dim3 fullBlocksPerGrid((power_of_2 + blockSize - 1) / blockSize); + timer().startGpuTimer(); - // TODO + // up-sweep + for (int d = 0; d <= ilog2(power_of_2) - 1; d++) { + int dist = pow(2, d + 1); + int distHalf = pow(2, d); + upSweep << > > (power_of_2, d, data, dist, distHalf); + /*int stride = pow(2, d+1); + int offset = pow(2, d); + upSweepEfficient << > > (power_of_2, d, data, stride, offset);*/ + } + + + // set the last value to 0 + setZeros << > > (n, power_of_2, data); + + // down-sweep + for (int d = ilog2(power_of_2) - 1; d >= 0; d--) { + int dist = pow(2, d + 1); + int distHalf = pow(2, d); + downSweep << > > (power_of_2, d, data, dist, distHalf); + /*int stride = pow(2, d + 1); + int offset = pow(2, d); + downSweepEfficient << > > (power_of_2, d, data, stride, offset);*/ + } timer().endGpuTimer(); + + // set the out data to the scanned data + cudaMemcpy(odata, data, sizeof(int) * n, cudaMemcpyDeviceToHost); + + // free memory + cudaFree(data); } /** @@ -30,11 +153,52 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ - int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; + int compact(int n, int* odata, const int* idata) { + // malloc necessary space oon GPU + int* gpu_idata; + int* bools; + int* scanned_data; + int* scattered_data; + + cudaMalloc((void**)&gpu_idata, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc gpu_idata failed!"); + cudaMemcpy(gpu_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + cudaMalloc((void**)&bools, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc bools failed!"); + + cudaMalloc((void**)&scanned_data, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc scanned_data failed!"); + + cudaMalloc((void**)&scattered_data, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc scattered_data failed!"); + + int blockSize = 128; + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + //timer().startGpuTimer(); + // change to zeros and ones + Common::kernMapToBoolean << > > (n, bools, gpu_idata); + + // exclusive scan data + scan(n, scanned_data, bools); + + // scatter + Common::kernScatter << > > (n, scattered_data, gpu_idata, bools, scanned_data); + cudaMemcpy(odata, scattered_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + int num = n; + for (int i = 0; i < n; i++) { + if (odata[i] == 0) { + num = i; + break; + } + } + //timer().endGpuTimer(); + + // return last index in scanned_data + std::unique_ptrscanned_cpu{ new int[n] }; + cudaMemcpy(scanned_cpu.get(), scanned_data, sizeof(int) * num, cudaMemcpyDeviceToHost); + return num; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..65446fe 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,23 +3,90 @@ #include "common.h" #include "naive.h" +#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) + namespace StreamCompaction { namespace Naive { + + using StreamCompaction::Common::PerformanceTimer; PerformanceTimer& timer() { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + // one iteration of inclusive scan + __global__ void iteration(int n, int d, const int* idata, int* odata, int offset) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + if (index >= offset) { + odata[index] = idata[index - offset] + idata[index]; + } else { + odata[index] = idata[index]; + } + } + + // turns inclusive scan to exclusive scane + __global__ void inclusiveToExclusive(int n, const int* idata, int* odata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index + 1 >= n) { + return; + } + if (index == 0) { + odata[0] = 0; + } + odata[index + 1] = idata[index]; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + void scan(int n, int* odata, const int* idata) { + int power_of_2 = 1; + while (power_of_2 < n) { + power_of_2 *= 2; + } + + // create arrays of size power of 2 + int* data_1; + int* data_2; + + cudaMalloc((void**)&data_1, power_of_2 * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc data_1 failed!"); + cudaMalloc((void**)&data_2, power_of_2 * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc data_2 failed!"); + + // fill array and pad end with 0's + cudaMemcpy(data_1, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + // call kernel + int blockSize = 128; + dim3 fullBlocksPerGrid((power_of_2 + blockSize - 1) / blockSize); + + //timer().startGpuTimer(); + for (int d = 1; d <= ilog2ceil(n); d++) { + int offset = pow(2, d - 1); + iteration << > > (power_of_2, d, data_1, data_2, offset); + int* temp = data_1; + data_1 = data_2; + data_2 = temp; + } + + inclusiveToExclusive << > > (power_of_2, data_1, data_2); + int* temp = data_1; + data_1 = data_2; + data_2 = temp; + //timer().endGpuTimer(); + + // set the out data to the scanned data + cudaMemcpy(odata, data_1, sizeof(int) * n, cudaMemcpyDeviceToHost); + + // free memory + cudaFree(data_1); + cudaFree(data_2); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..3ceb705 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,21 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::host_vector hv_in(n); + thrust::device_vector dv_in = hv_in; + + for (int i = 0; i < n; i++) { + dv_in[i] = idata[i]; + } + thrust::device_vector dv_out(n); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + + for (int i = 0; i < n; i++) { + odata[i] = dv_out[i]; + } } } }