diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..12949a7 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,20 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "CUDA C++: Launch", + "type": "cuda-gdb", + "request": "launch", + "program": "" + }, + { + "name": "CUDA C++: Attach", + "type": "cuda-gdb", + "request": "attach", + "processId": "${command:cuda.pickProcess}" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..fdb8524 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,18 @@ +{ + "ros.distro": "humble", + "files.associations": { + "*.sdf": "xml", + "*.world": "xml", + "array": "cpp", + "deque": "cpp", + "forward_list": "cpp", + "list": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "string_view": "cpp", + "initializer_list": "cpp", + "numeric": "cpp" + } +} \ No newline at end of file diff --git a/README.md b/README.md index 0e38ddb..7bad2db 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,101 @@ CUDA Stream Compaction ====================== -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +**Jason Xie** -### (TODO: Your README) +[🤓 LinkedIn](https://linkedin.com/in/jia-chun-xie) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +[😇 my website](https://jchunx.dev) +[🥵 X (formerly 🐦)](https://x.com/codemonke_) + +Tested on: Ubuntu 22.04, i5-8400, RTX 3060Ti, personal machine + + +## About + +CUDA implementation of stream compaction & scan. + +## Performance Analysis + +### Block Size + +For all kernels, I found that 128 was the optimal block size. + +### Time vs. Array Size + +![comparison plot](assets/cuda-perf-compare-proj2.png) +| Array Size | CPU Scan | Naive Scan | Work-Efficient Scan | Thrust Scan | CPU Compact | Work-Efficient Compact | +| ---------- | -------- | ---------- | ------------------- | ----------- | ----------- | ---------------------- | +| 2^8 | 0.000333 | 0.0806 | 0.0963 | 0.0522 | 0.000834 | 0.125 | +| 2^12 | 0.00235 | 0.0840 | 0.109 | 0.0543 | 0.00924 | 0.108 | +| 2^16 | 0.0388 | 0.0820 | 0.120 | 0.0522 | 0.145 | 0.107 | +| 2^20 | 0.541 | 0.503 | 0.533 | 0.194 | 2.42 | 0.726 | +| 2^24 | 9.033 | 11.134 | 7.661 | 0.500 | 38.880 | 8.487 | + +## What is going on here? + +Unoptimized GPU scanning actually performs worse than CPU scanning. A bit of Nsight shows that the kernels have low warp occupancy (esp. up / down sweeps): +![low occupancy](assets/low-occupancy.png) + +The hypothesis here is that the kernels are being bottlenecked by global memory access and warp divergence. + +## Test Program Outputs + +``` +**************** +** SCAN TESTS ** +**************** + [ 38 28 32 41 11 4 49 31 48 8 42 48 22 ... 25 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 4.48317ms (std::chrono Measured) + [ 38 66 98 139 150 154 203 234 282 290 332 380 402 ... 205516747 205516747 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 4.44224ms (std::chrono Measured) + [ 38 66 98 139 150 154 203 234 282 290 332 380 402 ... 205516679 205516703 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 4.82586ms (CUDA Measured) + a[4194304] = 102760269, b[4194304] = 102760231 + FAIL VALUE +==== naive scan, non-power-of-two ==== + elapsed time: 4.18042ms (CUDA Measured) + a[4194304] = 102760269, b[4194304] = 102760231 + FAIL VALUE +==== work-efficient scan, power-of-two ==== + elapsed time: 3.68918ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 3.61574ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.346912ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.310976ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 0 2 1 1 0 3 1 2 0 2 2 0 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 19.3632ms (std::chrono Measured) + [ 2 1 1 3 1 2 2 2 2 2 2 1 3 ... 3 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 19.8116ms (std::chrono Measured) + [ 2 1 1 3 1 2 2 2 2 2 2 1 3 ... 3 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 71.7739ms (std::chrono Measured) + [ 2 1 1 3 1 2 2 2 2 2 2 1 3 ... 3 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 4.29862ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 4.53734ms (CUDA Measured) + passed +``` \ No newline at end of file diff --git a/assets/cuda-perf-compare-proj2.png b/assets/cuda-perf-compare-proj2.png new file mode 100644 index 0000000..aa4319a Binary files /dev/null and b/assets/cuda-perf-compare-proj2.png differ diff --git a/assets/low-occupancy.png b/assets/low-occupancy.png new file mode 100644 index 0000000..a8c787d Binary files /dev/null and b/assets/low-occupancy.png differ diff --git a/build.sh b/build.sh new file mode 100644 index 0000000..29b879b --- /dev/null +++ b/build.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# Create build directory if it doesn't exist +mkdir -p build + +# Change into the build directory +cd build + +# Default build type to Release +build_type="Release" + +# Check for argument "debug" to change build type +if [ "$1" == "debug" ]; then + echo "Building in debug mode" + build_type="Debug" +fi + +# Run cmake with the specified build type +cmake -DCMAKE_BUILD_TYPE=$build_type .. + +# Build the project with dbg if debug was specified +if [ "$1" == "debug" ]; then + make dbg=1 +else + make +fi \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..cb06e58 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,9 +11,10 @@ #include #include #include +#include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 24; // 8 feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -21,7 +22,9 @@ int *c = new int[SIZE]; int main(int argc, char* argv[]) { // Scan tests - + // std::cout << "sleeping..." << std::endl; + // sleep(5); + // std::cout << "awake!" << std::endl; printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..6599553 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,11 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + bools[index] = idata[index] == 0 ? 0 : 1; } /** @@ -32,7 +36,13 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + if (bools[index] == 1) { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..b0b2e2b 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,10 +19,24 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + for (int i = 0; i < n; i++) { + odata[i] = idata[i]; + if (i > 0) { + odata[i] += odata[i - 1]; + } + } timer().endCpuTimer(); } + void _scan_no_timer(int n, int *odata, const int *idata) { + for (int i = 0; i < n; i++) { + odata[i] = idata[i]; + if (i > 0) { + odata[i] += odata[i - 1]; + } + } + } + /** * CPU stream compaction without using the scan function. * @@ -30,9 +44,15 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int numElements = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[numElements] = idata[i]; + numElements++; + } + } timer().endCpuTimer(); - return -1; + return numElements; } /** @@ -42,9 +62,29 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int* bools = new int[n]; + for (int i = 0; i < n; i++) { + bools[i] = (idata[i] != 0) ? 1 : 0; + } + int* scanned = new int[n]; + _scan_no_timer(n, scanned, bools); + // convert to exclusive scan + for (int i = n - 1; i > 0; i--) { + scanned[i] = scanned[i - 1]; + } + scanned[0] = 0; + int numElements = 0; + for (int i = 0; i < n; i++) { + if (bools[i] != 0) { + odata[scanned[i]] = idata[i]; + numElements++; + } + } + + delete[] bools; + delete[] scanned; timer().endCpuTimer(); - return -1; + return numElements; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..12a2d6e 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,13 +12,74 @@ namespace StreamCompaction { return timer; } + __global__ void kernScanUpsweep(int n, int d, int* idata) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= n || k % (1 << (d + 1)) != 0) { + return; + } + int idx_right = k + (1 << (d + 1)) - 1; + int idx_left = k + (1 << d) - 1; + idata[idx_right] += idata[idx_left]; + } + + __global__ void kernScanDownsweep(int n, int d, int* idata) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= n || k % (1 << (d + 1)) != 0) { + return; + } + int idx_right = k + (1 << (d + 1)) - 1; + int idx_left = k + (1 << d) - 1; + + int tmp = idata[idx_left]; + idata[idx_left] = idata[idx_right]; + idata[idx_right] += tmp; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + + int n_padded = 1 << ilog2ceil(n); + + int *dev_idata; + + cudaMalloc((void**)&dev_idata, n_padded * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_idata failed!"); + cudaMemset(dev_idata + n, 0, (n_padded - n) * sizeof(int)); timer().startGpuTimer(); - // TODO + _scan_dev(n_padded, dev_idata); timer().endGpuTimer(); + + cudaMemcpy(odata, dev_idata+1, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + odata[n - 1] = odata[n - 2] + idata[n - 1]; + checkCUDAError("cudaMemcpy odata failed!"); + cudaFree(dev_idata); + checkCUDAError("cudaFree failed!"); + } + + void _scan_dev(int n_padded, int *dev_idata) { + int blockSize = 128; + int num_blocks = (n_padded + blockSize - 1) / blockSize; + + int ilog2ceil_n = ilog2ceil(n_padded)-1; + + for (int d = 0; d <= ilog2ceil_n; d++) { + kernScanUpsweep<<>>(n_padded, d, dev_idata); + checkCUDAError("kernScanUpsweep failed!"); + } + // 2. downsweep + cudaMemset(dev_idata + n_padded - 1, 0, sizeof(int)); + checkCUDAError("cudaMemset dev_idata failed!"); + + for (int d = ilog2ceil_n; d >= 0; d--) { + kernScanDownsweep<<>>(n_padded, d, dev_idata); + checkCUDAError("kernScanDownsweep failed!"); + } } /** @@ -31,10 +92,61 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + int blockSize = 128; + int n_padded = 1 << ilog2ceil(n); + int num_blocks = (n_padded + blockSize - 1) / blockSize; + + int *dev_bools; + int *dev_idata; + int *dev_indices; + int *dev_scattered; + + cudaMalloc((void**)&dev_bools, n_padded * sizeof(int)); + checkCUDAError("cudaMalloc dev_bools failed!"); + cudaMalloc((void**)&dev_idata, n_padded * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy dev_idata failed!"); + if (n < n_padded) { + cudaMemset(dev_idata + n, 0, (n_padded - n) * sizeof(int)); + checkCUDAError("cudaMemset dev_idata failed!"); + } + timer().startGpuTimer(); - // TODO + Common::kernMapToBoolean<<>>(n_padded, dev_bools, dev_idata); + + cudaMalloc((void**)&dev_indices, n_padded * sizeof(int)); + checkCUDAError("cudaMalloc dev_indices failed!"); + cudaMemcpy(dev_indices, dev_bools, n_padded * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy dev_indices failed!"); + + _scan_dev(n_padded, dev_indices); + checkCUDAError("_scan_dev failed!"); + + cudaMalloc((void**)&dev_scattered, n_padded * sizeof(int)); + checkCUDAError("cudaMalloc dev_scattered failed!"); + + Common::kernScatter<<>>(n_padded, dev_scattered, dev_idata, dev_bools, dev_indices); + checkCUDAError("kernScatter failed!"); timer().endGpuTimer(); - return -1; + + // since dev_indices is exclusive scan (prefix sum), we can grab num valid elements from the last element + int num_valid; + int last_bool; + cudaMemcpy(&num_valid, dev_indices + n_padded - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&last_bool, dev_bools + n_padded - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy num_valid failed!"); + num_valid += last_bool; + cudaMemcpy(odata, dev_scattered, num_valid * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy odata failed!"); + + cudaFree(dev_bools); + cudaFree(dev_idata); + cudaFree(dev_indices); + cudaFree(dev_scattered); + checkCUDAError("cudaFree failed!"); + + return num_valid; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..29ae0fd 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -8,6 +8,8 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata); + void _scan_dev(int n, int *dev_idata); + int compact(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..993e2b1 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,15 +11,64 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + __global__ void kernNaiveScan(int n, int d, int* odata, int* idata) { + int k = (blockIdx.x * blockDim.x) + threadIdx.x; + int left = 1 << (d-1); + if (k >= left) { + odata[k] = idata[k-left] + idata[k]; + } + else { + odata[k] = idata[k]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + int block_size = 128; + int n_padded = 1 << ilog2ceil(n); + int num_blocks = (n_padded + block_size - 1) / block_size; + int* dev_bufferA; + int* dev_bufferB; + + cudaMalloc((void**)&dev_bufferA, n_padded * sizeof(int)); + checkCUDAError("cudaMalloc dev_bufferA failed!"); + cudaMalloc((void**)&dev_bufferB, n_padded * sizeof(int)); + checkCUDAError("cudaMalloc dev_bufferB failed!"); + + cudaMemset(dev_bufferA, 0, n_padded * sizeof(int)); + checkCUDAError("cudaMemset dev_bufferA failed!"); + cudaMemset(dev_bufferB, 0, n_padded * sizeof(int)); + checkCUDAError("cudaMemset dev_bufferB failed!"); + + cudaMemcpy(dev_bufferA, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy idata to dev_bufferA failed!"); + timer().startGpuTimer(); - // TODO + int d_max = ilog2ceil(n_padded); + for (int d=1; d<=d_max; d++) { + kernNaiveScan<<>>(n_padded, d, dev_bufferB, dev_bufferA); + checkCUDAError("kernNaiveScan failed!"); + std::swap(dev_bufferA, dev_bufferB); + } timer().endGpuTimer(); + if (d_max % 2 == 0) { + cudaMemcpy(odata, dev_bufferA, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dev_bufferA to odata failed!"); + } else { + cudaMemcpy(odata, dev_bufferB, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy dev_bufferB to odata failed!"); + } + + cudaFree(dev_bufferA); + checkCUDAError("cudaFree dev_bufferA failed!"); + cudaFree(dev_bufferB); + checkCUDAError("cudaFree dev_bufferB failed!"); + + } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..a8f34fb 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,15 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + + thrust::device_vector dv_in(idata, idata + n); + thrust::device_vector dv_out(n); timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + thrust::copy(dv_out.begin()+1, dv_out.end(), odata); + odata[n-1] = odata[n-2] + idata[n-1]; } } }