diff --git a/README.md b/README.md index b71c458..782f4eb 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,68 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Xincheng Zhang +* Tested on: Windows 10, i7-4702HQ @ 2.20GHz 8GB, GTX 870M 3072MB (Personal Laptop) -### (TODO: Your README) +### Output Screenshot +------------- +@blocksize = 128; Arraysize = 1<<9 +![](https://github.com/XinCastle/Project2-Stream-Compaction/blob/master/img/sc1.png) +![](https://github.com/XinCastle/Project2-Stream-Compaction/blob/master/img/sc2.png) +![](https://github.com/XinCastle/Project2-Stream-Compaction/blob/master/img/sc3.png) + +### Description & Features +------------- +``` +1: CPU Scan; Stream Compaction +2: Naive Scan using GPU +3: Efficient GPU Scan; Stream Compaction +4: Thrust Scan +``` + +### Blocksize Optimization +------------- +With the array size held constant at 1<<9, the performance of the different methods changes according to the block size. Therefore, I varied the block size to find the optimal value for each method. + +![](https://github.com/XinCastle/Project2-Stream-Compaction/blob/master/img/chart1.png) + +**The test data of the chart above is the following:** +------------- +Block Size | Naive Scan | Efficient Scan | Thrust Scan | CPU Scan +---|---|---|---|--- +32 | 0.3818 | 0.1598 |1.0674 |0.0019 +64 | 0.0389 | 0.1575 |1.0808 |0.0018 +128 | 0.0382 | 0.1373 |1.0888 |0.0019 +256 | 0.0387 | 0.1542 |1.0669 |0.0018 +512 | 0.0428 | 0.1398 |1.0899 |0.0019 +1024 | 0.043 | 0.1532 |1.0523 |0.0018 + +From the data and the chart above, we can tell that for the CPU scan, the block size doesn't change the performance. For the naive scan, the best block size is 128. For the efficient scan, the best block size is also 128. As for the thrust scan, the best block size is 1024. 
+ + +### Performance Comparison Based on Array Size +------------- +Array Size | Naive Scan | Efficient Scan | Thrust Scan | CPU Scan +---|---|---|---|--- +2^8 | 0.3546 | 0.127 |1.0821 |0.0014 +2^12 | 0.0531 | 0.1795 |2.3651 |0.1398 +2^16 | 0.2922 | 0.6992 |8.2656 |0.265 +2^20 | 3.3498 | 7.4632 |40.6058 |3.2167 +2^24 | 61.6843 | 130.091 |556.343 |53.3077 + +The chart is the following: +![](https://github.com/XinCastle/Project2-Stream-Compaction/blob/master/img/chart2.png) + + +### Questions +------------- +* Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU. +Answer: see the "Blocksize Optimization" section above. + +* Compare all of these GPU Scan implementations +Answer: see the "Performance Comparison Based on Array Size" section above. I suspect that the thrust scan uses shared memory. + +* Write a brief explanation of the phenomena you see here. +Answer: I think the reason the GPU methods are slower than the CPU method is that not all of the launched threads do useful work; many threads sit idle, so the GPU implementations are not efficient enough to beat the CPU scan. Moreover, I think memory I/O is another bottleneck, because there are many memory copy operations in my code. -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) 
diff --git a/img/chart1.png b/img/chart1.png new file mode 100644 index 0000000..2f7c561 Binary files /dev/null and b/img/chart1.png differ diff --git a/img/chart2.png b/img/chart2.png new file mode 100644 index 0000000..d59d129 Binary files /dev/null and b/img/chart2.png differ diff --git a/img/data1.png b/img/data1.png new file mode 100644 index 0000000..8b65b05 Binary files /dev/null and b/img/data1.png differ diff --git a/img/data2.png b/img/data2.png new file mode 100644 index 0000000..cbced7f Binary files /dev/null and b/img/data2.png differ diff --git a/img/sc1.png b/img/sc1.png new file mode 100644 index 0000000..a32f790 Binary files /dev/null and b/img/sc1.png differ diff --git a/img/sc2.png b/img/sc2.png new file mode 100644 index 0000000..17ad6d0 Binary files /dev/null and b/img/sc2.png differ diff --git a/img/sc3.png b/img/sc3.png new file mode 100644 index 0000000..184eb08 Binary files /dev/null and b/img/sc3.png differ diff --git a/src/main.cpp b/src/main.cpp index 53ef79e..89277d0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 24; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..bb8dae9 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,11 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index < n) + { + bools[index] = (idata[index] == 0) ? 
0 : 1; + } } /** @@ -33,6 +38,21 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= n) + { + return; + } + + if (index == n - 1) + { + odata[indices[index]] = idata[index]; + } + + else if (indices[index] != indices[index + 1]) + { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 693fd5e..ace62e2 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -10,123 +10,129 @@ #include #include + +#include + + #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define blockSize 1024 + /** - * Check for CUDA errors; print and exit if there was a problem. - */ +* Check for CUDA errors; print and exit if there was a problem. +*/ void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1); inline int ilog2(int x) { - int lg = 0; - while (x >>= 1) { - ++lg; - } - return lg; + int lg = 0; + while (x >>= 1) { + ++lg; + } + return lg; } inline int ilog2ceil(int x) { - return if x == 1 : 0 ? 
ilog2(x - 1) + 1; + return ilog2(x - 1) + 1; } namespace StreamCompaction { - namespace Common { - __global__ void kernMapToBoolean(int n, int *bools, const int *idata); - - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices); - - /** - * This class is used for timing the performance - * Uncopyable and unmovable - * - * Adapted from WindyDarian(https://github.com/WindyDarian) - */ - class PerformanceTimer - { - public: - PerformanceTimer() - { - cudaEventCreate(&event_start); - cudaEventCreate(&event_end); - } - - ~PerformanceTimer() - { - cudaEventDestroy(event_start); - cudaEventDestroy(event_end); - } - - void startCpuTimer() - { - if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); } - cpu_timer_started = true; - - time_start_cpu = std::chrono::high_resolution_clock::now(); - } - - void endCpuTimer() - { - time_end_cpu = std::chrono::high_resolution_clock::now(); - - if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); } - - std::chrono::duration duro = time_end_cpu - time_start_cpu; - prev_elapsed_time_cpu_milliseconds = - static_cast(duro.count()); - - cpu_timer_started = false; - } - - void startGpuTimer() - { - if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); } - gpu_timer_started = true; - - cudaEventRecord(event_start); - } - - void endGpuTimer() - { - cudaEventRecord(event_end); - cudaEventSynchronize(event_end); - - if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); } - - cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); - gpu_timer_started = false; - } - - float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015 - { - return prev_elapsed_time_cpu_milliseconds; - } - - float getGpuElapsedTimeForPreviousOperation() //noexcept - { - return prev_elapsed_time_gpu_milliseconds; - } - - // remove copy and move functions - PerformanceTimer(const 
PerformanceTimer&) = delete; - PerformanceTimer(PerformanceTimer&&) = delete; - PerformanceTimer& operator=(const PerformanceTimer&) = delete; - PerformanceTimer& operator=(PerformanceTimer&&) = delete; - - private: - cudaEvent_t event_start = nullptr; - cudaEvent_t event_end = nullptr; - - using time_point_t = std::chrono::high_resolution_clock::time_point; - time_point_t time_start_cpu; - time_point_t time_end_cpu; - - bool cpu_timer_started = false; - bool gpu_timer_started = false; - - float prev_elapsed_time_cpu_milliseconds = 0.f; - float prev_elapsed_time_gpu_milliseconds = 0.f; - }; - } -} + namespace Common { + __global__ void kernMapToBoolean(int n, int *bools, const int *idata); + + __global__ void kernScatter(int n, int *odata, + const int *idata, const int *bools, const int *indices); + + /** + * This class is used for timing the performance + * Uncopyable and unmovable + * + * Adapted from WindyDarian(https://github.com/WindyDarian) + */ + class PerformanceTimer + { + public: + PerformanceTimer() + { + cudaEventCreate(&event_start); + cudaEventCreate(&event_end); + } + + ~PerformanceTimer() + { + cudaEventDestroy(event_start); + cudaEventDestroy(event_end); + } + + void startCpuTimer() + { + if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); } + cpu_timer_started = true; + + time_start_cpu = std::chrono::high_resolution_clock::now(); + } + + void endCpuTimer() + { + time_end_cpu = std::chrono::high_resolution_clock::now(); + + if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); } + + std::chrono::duration duro = time_end_cpu - time_start_cpu; + prev_elapsed_time_cpu_milliseconds = + static_cast(duro.count()); + + cpu_timer_started = false; + } + + void startGpuTimer() + { + if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); } + gpu_timer_started = true; + + cudaEventRecord(event_start); + } + + void endGpuTimer() + { + cudaEventRecord(event_end); + 
cudaEventSynchronize(event_end); + + if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); } + + cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); + gpu_timer_started = false; + } + + float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015 + { + return prev_elapsed_time_cpu_milliseconds; + } + + float getGpuElapsedTimeForPreviousOperation() //noexcept + { + return prev_elapsed_time_gpu_milliseconds; + } + + // remove copy and move functions + PerformanceTimer(const PerformanceTimer&) = delete; + PerformanceTimer(PerformanceTimer&&) = delete; + PerformanceTimer& operator=(const PerformanceTimer&) = delete; + PerformanceTimer& operator=(PerformanceTimer&&) = delete; + + private: + cudaEvent_t event_start = nullptr; + cudaEvent_t event_end = nullptr; + + using time_point_t = std::chrono::high_resolution_clock::time_point; + time_point_t time_start_cpu; + time_point_t time_end_cpu; + + bool cpu_timer_started = false; + bool gpu_timer_started = false; + + float prev_elapsed_time_cpu_milliseconds = 0.f; + float prev_elapsed_time_gpu_milliseconds = 0.f; + }; + } +} \ No newline at end of file diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..ca9a5fa 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -17,9 +17,23 @@ namespace StreamCompaction { * For performance analysis, this is supposed to be a simple for loop. * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ + + void scanhelp(int n, int *odata, const int *idata) + { + odata[0] = 0; + for (int i = 1; i < n; i++) + { + odata[i] = odata[i - 1] + idata[i - 1]; + } + } + + + void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + scanhelp(n, odata, idata); + timer().endCpuTimer(); } @@ -29,22 +43,70 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. 
*/ int compactWithoutScan(int n, int *odata, const int *idata) { + + int counter = 0; timer().startCpuTimer(); // TODO + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + odata[counter++] = idata[i]; + } + } + timer().endCpuTimer(); - return -1; + return counter; + //return -1; } + + + int scatter(int n, int *odata, const int *idata, const int *ichange, const int *exSum) + { + int counter = 0; + for (int i = 0; i < n; i++) + { + if (ichange[i] == 1) + { + odata[exSum[i]] = idata[i]; + counter++; + } + } + return counter; + } + + + /** * CPU stream compaction using scan and scatter, like the parallel version. * * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + + int* iChange = new int[n]; + int* exSum = new int[n]; + timer().startCpuTimer(); // TODO - timer().endCpuTimer(); - return -1; + + for (int i = 0; i < n; i++) + { + iChange[i] = (idata[i] == 0) ? 0 : 1; + } + + //odataChanged is the exclusive prefix sum + scanhelp(n, exSum, iChange); + int counter = scatter(n, odata, idata, iChange, exSum); + + timer().endCpuTimer(); + + delete[] iChange; + delete[] exSum; + + + return counter; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..696fffa 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,15 +12,111 @@ namespace StreamCompaction { return timer; } + + __global__ void upSweep(int n, int d, int *idata) + { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= n) + { + return; + } + + //based on slides + int delta = 1 << d; + int doubleDelta = 1 << (d + 1); + + if (index % doubleDelta == 0) + { + idata[index + doubleDelta - 1] += idata[index + delta - 1]; + } + } + + + __global__ void downSweep(int n, int d, int *idata) + { + //based on slides + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= n) + { + return; + } + int delta = 1 << d; + int doubleDelta = 1 << (d + 
1); + + if (index % doubleDelta == 0) + { + int t = idata[index + delta - 1]; + idata[index + delta - 1] = idata[index + doubleDelta - 1]; + idata[index + doubleDelta - 1] += t; + } + } + + //helper function for scan + void helpscan(int n, int *devData) + { + int blockNum = (n + blockSize - 1) / blockSize; + + for (int d = 0; d < ilog2ceil(n) - 1; d++) + { + upSweep << > >(n, d, devData); + } + + int counter = 0; + cudaMemcpy(&devData[n - 1], &counter, sizeof(int), cudaMemcpyHostToDevice); + + for (int d = ilog2ceil(n) - 1; d >= 0; d--) + { + downSweep << > >(n, d, devData); + } + } + + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + // TODO + //handle both conditions of PO2 and NPO2 + int num; + int *t; + int depth = ilog2ceil(n); + + if (n & (n - 1) != 0) + { + num = 1 << depth; + t = (int*)malloc(num * sizeof(int)); + memcpy(t, idata, num * sizeof(int)); + + + for (int j = n; j < num; j++) + { + t[j] = 0; + } + + } + else + { + num = n; + t = (int*)malloc(num * sizeof(int)); + memcpy(t, idata, num * sizeof(int)); + } + + int size = num * sizeof(int); + int *devi; + + cudaMalloc((void**)&devi, size); + cudaMemcpy(devi, t, size, cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + helpscan(num, devi); + timer().endGpuTimer(); + + cudaMemcpy(odata, devi, size, cudaMemcpyDeviceToHost); + cudaFree(devi); } + + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -31,10 +127,75 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; + // TODO + //handle both conditions of PO2 and NPO2 + int num; + int *t; + int depth = ilog2ceil(n); + + if (n & (n - 1) != 0) + { + num = 1 << depth; + t = (int*)malloc(num * sizeof(int)); + memcpy(t, idata, num * sizeof(int)); + + + for (int j = n; j < num; j++) + { + t[j] = 0; + } + + } + else + { + num = n; + t = (int*)malloc(num * sizeof(int)); + memcpy(t, idata, num * sizeof(int)); + } + + int asize = num * sizeof(int); + int blockNum = (num + blockSize - 1) / blockSize; + int *devi; + int *devo; + int *devm; + + cudaMalloc((void**)&devi, asize); + checkCUDAError("cudaMalloc failed"); + cudaMalloc((void**)&devo, asize); + checkCUDAError("cudaMalloc failed"); + cudaMalloc((void**)&devm, asize); + checkCUDAError("cudaMalloc failed"); + + timer().startGpuTimer(); + + cudaMemcpy(devi, t, asize, cudaMemcpyHostToDevice); + StreamCompaction::Common::kernMapToBoolean << > >(num, devm, devi); + + int end; + cudaMemcpy(&end, devm + num - 1, sizeof(int), cudaMemcpyDeviceToHost); + + helpscan(num, devm); + + int size; + cudaMemcpy(&size, devm + num - 1, sizeof(int), cudaMemcpyDeviceToHost); + + StreamCompaction::Common::kernScatter << > >(num, devo, devi, devm, devm); + + timer().endGpuTimer(); + + cudaMemcpy(odata, devo, asize, cudaMemcpyDeviceToHost); + + + if (end == 1) + { + size++; + } + + cudaFree(devi); + cudaFree(devo); + cudaFree(devm); + + return size; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..e733d76 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,8 @@ #include "common.h" #include "naive.h" +int *devi, *devo; + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -13,13 +15,60 @@ namespace StreamCompaction { } // TODO: __global__ + __global__ void kernelNaive(int n, int delta, const int *idata, int 
*odata) + { + int index = (blockIdx.x *blockDim.x) + threadIdx.x; + if (index >= n) + { + return; + } + if (index - delta < 0) + { + odata[index] = idata[index]; + } + else + { + odata[index] = idata[index - delta] + idata[index]; + } + } + + + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO - timer().endGpuTimer(); + int depth = ilog2ceil(n); + + cudaMalloc((void**)&devi, n * sizeof(int)); + cudaMalloc((void**)&devo, n * sizeof(int)); + + checkCUDAError("cudaMalloc error"); + + cudaMemcpy(devi, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + int blockNum = (n + blockSize - 1) / blockSize; + int delta; + + timer().startGpuTimer(); + for (int i = 1; i <= depth; i++) + { + delta = (1 << (i - 1)); + kernelNaive << < blockNum, blockSize >> >(n, delta, devi, devo); + std::swap(devi, devo); + } + + timer().endGpuTimer(); + + std::swap(devi, devo); + + cudaMemcpy(odata + 1, devo, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + odata[0] = 0; + + cudaFree(devi); + cudaFree(devo); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..5b0df2f 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,17 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + thrust::device_vector dv_in(idata, idata + n); + thrust::device_vector dv_out(odata, odata + n); + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + timer().endGpuTimer(); + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } }