diff --git a/README.md b/README.md
index 0e38ddb..6cf09f5 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,95 @@ CUDA Stream Compaction
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
- * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Kyle Bauer
+ * [LinkedIn](https://www.linkedin.com/in/kyle-bauer-75bb25171/), [twitter](https://x.com/KyleBauer414346)
+* Tested on: Windows 10, i7-12700 @ 2.1GHz, 32GB RAM, NVIDIA T1000 4GB (CETS Virtual Lab)
-### (TODO: Your README)
+Features
+---
+* CPU Scan and Stream Compaction
+* Naive Scan
+* Work-Efficient Scan and Stream Compaction
+* Thrust Scan Wrapper
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+Analysis
+---
+![Scan Implementation Comparison](img/Scan%20Implementation%20Comparison.svg)
+
+![Scan Implementation Comparison Pow2](img/Scan%20Implementation%20Comparison%20Pow2.svg)
+
+The CPU, Naive, and Work-Efficient implementations all scaled similarly with an increasing array size. Generally, doubling the array size would double the runtime of each algorithm.
+
+The CPU and Work-Efficient implementations compared very similarly, with the Work-Efficient runtimes never straying more than 3% away from the CPU runtimes.
+
+The Naive implementation's runtime diverged slightly from the CPU and Work-Efficient runtimes at around the 2^21 array size mark. In runs with fewer elements than this, Naive performed up to 6% faster (at 2^20 elements) than the CPU implementation; in runs with more elements, Naive performed at most 10% worse (at 2^24 elements) than the CPU implementation.
+
+The Thrust implementation is clearly the overall most performant option, pulling completely away from all other implementations as the array size increases.
+
+Potential Bottlenecks:
+1. Global Memory: Both the Naive and Work-Efficient algorithms were implemented using global memory with no shared memory, creating a massive amount of overhead anytime the implementations wish to read or write data.
+2. Memory Locality: Both the Naive and Work-Efficient algorithms read and write data across very large arrays. As the algorithms progress, these memory accesses become progressively more sparse — effectively random accesses that cause cache thrashing and decrease bus utilization.
+3. GPU Utilization: The Naive algorithm suffers from not saturating the GPU (Many threads are ended early leaving a couple of active threads in a warp). This inherently decreases parallelism and will increase the runtime as the array size grows.
+
+Sample Output
+---
+
+```
+****************
+** SCAN TESTS **
+****************
+ [ 33 40 46 12 48 15 5 37 39 42 27 41 35 ... 10 0 ]
+==== cpu scan, power-of-two ====
+ elapsed time: 27.4656ms (std::chrono Measured)
+ [ 0 33 73 119 131 179 194 199 236 275 317 344 385 ... 410928744 410928754 ]
+==== cpu scan, non-power-of-two ====
+ elapsed time: 26.8243ms (std::chrono Measured)
+ [ 0 33 73 119 131 179 194 199 236 275 317 344 385 ... 410928700 410928722 ]
+ passed
+==== naive scan, power-of-two ====
+ elapsed time: 31.6926ms (CUDA Measured)
+ passed
+==== naive scan, non-power-of-two ====
+ elapsed time: 30.7692ms (CUDA Measured)
+ passed
+==== work-efficient scan, power-of-two ====
+ elapsed time: 23.55ms (CUDA Measured)
+ passed
+==== work-efficient scan, non-power-of-two ====
+ elapsed time: 23.0375ms (CUDA Measured)
+ passed
+==== thrust scan, power-of-two ====
+ elapsed time: 1.71158ms (CUDA Measured)
+ passed
+==== thrust scan, non-power-of-two ====
+ elapsed time: 1.14893ms (CUDA Measured)
+ passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 0 3 2 0 2 0 0 2 0 0 3 3 2 ... 1 0 ]
+==== cpu compact without scan, power-of-two ====
+ elapsed time: 31.584ms (std::chrono Measured)
+ [ 3 2 2 2 3 3 2 2 1 1 2 3 1 ... 1 1 ]
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ elapsed time: 35.5074ms (std::chrono Measured)
+ [ 3 2 2 2 3 3 2 2 1 1 2 3 1 ... 2 2 ]
+ passed
+==== cpu compact with scan, power-of-two ====
+ elapsed time: 74.7157ms (std::chrono Measured)
+ [ 3 2 2 2 3 3 2 2 1 1 2 3 1 ... 1 1 ]
+ passed
+==== cpu compact with scan, non-power-of-two ====
+ elapsed time: 73.4743ms (std::chrono Measured)
+ [ 3 2 2 2 3 3 2 2 1 1 2 3 1 ... 2 2 ]
+ passed
+==== work-efficient compact, power-of-two ====
+ elapsed time: 33.6798ms (CUDA Measured)
+ passed
+==== work-efficient compact, non-power-of-two ====
+ elapsed time: 24.5682ms (CUDA Measured)
+ passed
+```
diff --git a/img/Scan Implementation Comparison Pow2.svg b/img/Scan Implementation Comparison Pow2.svg
new file mode 100644
index 0000000..99f118d
--- /dev/null
+++ b/img/Scan Implementation Comparison Pow2.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/img/Scan Implementation Comparison.svg b/img/Scan Implementation Comparison.svg
new file mode 100644
index 0000000..245ce24
--- /dev/null
+++ b/img/Scan Implementation Comparison.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..a11297d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
#include
#include "testing_helpers.hpp"
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 24; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
@@ -127,12 +127,20 @@ int main(int argc, char* argv[]) {
printCmpLenResult(count, expectedNPOT, b, c);
zeroArray(SIZE, c);
- printDesc("cpu compact with scan");
+ printDesc("cpu compact with scan, power-of-two");
count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);
+ zeroArray(SIZE, c);
+ printDesc("cpu compact with scan, non-power-of-two");
+ count = StreamCompaction::CPU::compactWithScan(NPOT, c, a);
+ printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+ // expectedNPOT was computed by the earlier compactWithoutScan NPOT test; do NOT overwrite it with `count` here, or this length check (and the later work-efficient NPOT check) becomes vacuous.
+ printArray(count, c, true);
+ printCmpLenResult(count, expectedNPOT, b, c);
+
zeroArray(SIZE, c);
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..ebe88c2 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -23,7 +23,11 @@ namespace StreamCompaction {
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
- // TODO
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x; // flat 1D global thread index
+
+ if (index >= n) return; // guard: grid may be larger than n
+
+ bools[index] = idata[index] != 0; // 1 = keep element, 0 = remove
}
/**
@@ -32,7 +36,14 @@ namespace StreamCompaction {
*/
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
- // TODO
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x; // flat 1D global thread index
+
+ if (index >= n) return; // guard: grid may be larger than n
+
+ if (bools[index]) // write only the kept elements
+ {
+ odata[indices[index]] = idata[index]; // indices[] = exclusive scan of bools[] = output slot
+ }
}
}
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..becb241 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -19,7 +19,13 @@ namespace StreamCompaction {
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
- // TODO
+
+ odata[0] = 0; // exclusive scan: identity in slot 0 (assumes n >= 1 — TODO confirm callers never pass n == 0)
+ for (int k = 1; k < n; ++k)
+ {
+ odata[k] = odata[k - 1] + idata[k - 1]; // running prefix sum of everything before k
+ }
+
timer().endCpuTimer();
}
@@ -29,10 +35,21 @@ namespace StreamCompaction {
* @returns the number of elements remaining after compaction.
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
+ int size = 0; // number of elements kept so far (also the next write slot)
+
timer().startCpuTimer();
- // TODO
+
+ for (int k = 0; k < n; ++k)
+ {
+ if (idata[k] != 0) // keep only non-zero elements, preserving order
+ {
+ odata[size] = idata[k];
+ ++size;
+ }
+ }
+
timer().endCpuTimer();
- return -1;
+ return size;
}
/**
@@ -41,10 +58,39 @@ namespace StreamCompaction {
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
+ int* indices = new int[n];
+ int size = 0;
+
timer().startCpuTimer();
- // TODO
+
+ // Compute the temporary array of pass/fail checks
+ for (int k = 0; k < n; ++k)
+ {
+ odata[k] = idata[k] != 0; // odata doubles as the bool scratch array before scatter
+ }
+
+ // Scan the temporary array (exclusive prefix sum -> output slot per kept element)
+ indices[0] = 0;
+ for (int k = 1; k < n; ++k)
+ {
+ indices[k] = indices[k - 1] + odata[k - 1];
+ }
+
+ // Scatter based on the found indices
+ for (int k = 0; k < n; ++k)
+ {
+ if (odata[k] != 0) // safe in place: indices[k] <= k, so slot k was already read
+ {
+ odata[indices[k]] = idata[k];
+ ++size;
+ }
+ }
+
timer().endCpuTimer();
- return -1;
+
+ delete[](indices);
+
+ return size;
}
}
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..0d8acc4 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,13 +12,85 @@ namespace StreamCompaction {
return timer;
}
+ // Finds the modulo of value with 2^power (power-of-two modulus via bit mask)
+ __device__ int modPowBase2(int value, int power)
+ {
+ // Mask the low `power` bits; the previous (value << shift) >> shift form left-shifts a signed int into the sign bit, which is undefined behavior for large indices.
+ return value & ((1 << power) - 1);
+ }
+
+ // Performs parallel reduction (up-sweep): one partial-sum tree level per launch
+ __global__ void kernUpSweep(int n, int d, int *data)
+ {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n) return;
+
+ if (modPowBase2(index, d + 1)) return; // only indices aligned to 2^(d+1) work at this level
+
+ data[index + (1 << (d + 1)) - 1] += data[index + (1 << d) - 1];
+ }
+
+ // Build a scan from an up-swept input (down-sweep): one tree level per launch
+ __global__ void kernDownSweep(int n, int d, int* data)
+ {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n - 1) return;
+
+ if (modPowBase2(index, d + 1)) return;
+
+ int t = data[index + (1 << d) - 1]; // save left child
+ data[index + (1 << d) - 1] = data[index + (1 << (d + 1)) - 1]; // left child <- parent
+ data[index + (1 << (d + 1)) - 1] += t; // right child <- parent + old left
+ }
+
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+
+ // Number of levels needed for the scan
+ int maxDepth = ilog2ceil(n);
+ // Padded size of the work device array
+ size_t dataSize = (1ull << maxDepth);
+ // Reset value for down-sweeping
+ int zero = 0;
+
+ // Allocate device arrays
+ int *dev_data;
+
+ cudaMalloc((void**)&dev_data, dataSize * sizeof(int));
+ checkCUDAError("cudaMalloc Efficient::scan::dev_data failed!");
+
+ cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+ const int blockSize = 256;
+ dim3 gridDim((dataSize + blockSize - 1) / blockSize);
+
timer().startGpuTimer();
- // TODO
+
+ // Up-Sweep
+ for (int d = 0; d < maxDepth; ++d)
+ {
+ kernUpSweep<<<gridDim, blockSize>>>(dataSize, d, dev_data);
+ }
+
+ // Replace the last element with 0
+ cudaMemcpy(dev_data + dataSize - 1, &zero, sizeof(int), cudaMemcpyHostToDevice);
+
+ // Down-Sweep
+ for (int d = maxDepth - 1; d >= 0; --d)
+ {
+ kernDownSweep<<<gridDim, blockSize>>>(dataSize, d, dev_data);
+ }
+
timer().endGpuTimer();
+
+ // Copy the output data
+ cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+ // Free device arrays
+ cudaFree(dev_data);
+ checkCUDAError("cudaFree Efficient::scan failed!");
}
/**
@@ -31,10 +103,85 @@ namespace StreamCompaction {
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
+ // Number of levels needed for the scan
+ int maxDepth = ilog2ceil(n);
+ // Padded size of the work device array
+ size_t dataSize = (1ull << maxDepth);
+ // Reset value for down-sweeping
+ int zero = 0;
+
+ // Allocate device arrays
+ int *dev_idata, *dev_bools, *dev_indices, *dev_odata;
+
+ cudaMalloc((void**)&dev_idata, dataSize * sizeof(int));
+ checkCUDAError("cudaMalloc Efficient::compact::dev_idata failed!");
+
+ cudaMalloc((void**)&dev_bools, dataSize * sizeof(int));
+ checkCUDAError("cudaMalloc Efficient::compact::dev_bools failed!");
+
+ cudaMalloc((void**)&dev_indices, dataSize * sizeof(int));
+ checkCUDAError("cudaMalloc Efficient::compact::dev_indices failed!");
+
+ cudaMalloc((void**)&dev_odata, dataSize * sizeof(int));
+ checkCUDAError("cudaMalloc Efficient::compact::dev_odata failed!");
+
+ cudaMemset(dev_idata, 0, dataSize * sizeof(int)); // zero the padded tail [n, dataSize) so uninitialized garbage is never "kept"
+ cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+ int blockSize = 128;
+ dim3 gridDim((dataSize + blockSize - 1) / blockSize);
+
timer().startGpuTimer();
- // TODO
+
+ // Map to boolean
+ Common::kernMapToBoolean<<<gridDim, blockSize>>>(dataSize, dev_bools, dev_idata);
+
+
+ // Scan the indices (device-to-device copy: both pointers live on the GPU)
+ cudaMemcpy(dev_indices, dev_bools, dataSize * sizeof(int), cudaMemcpyDeviceToDevice);
+
+ // Up-Sweep
+ for (int d = 0; d < maxDepth; ++d)
+ {
+ kernUpSweep<<<gridDim, blockSize>>>(dataSize, d, dev_indices);
+ }
+
+ // Replace the last element with 0
+ cudaMemcpy(dev_indices + dataSize - 1, &zero, sizeof(int), cudaMemcpyHostToDevice);
+
+ // Down-Sweep
+ for (int d = maxDepth - 1; d >= 0; --d)
+ {
+ kernDownSweep<<<gridDim, blockSize>>>(dataSize, d, dev_indices);
+ }
+
+
+ // Scatter
+ Common::kernScatter<<<gridDim, blockSize>>>(dataSize, dev_odata, dev_idata, dev_bools, dev_indices);
+
timer().endGpuTimer();
- return -1;
+
+ // Retrieve the output size from the device:
+ // scanned index of the last slot, plus one if the last element itself is kept
+ int size = 0;
+ int doLast = 0;
+ cudaMemcpy(&size, dev_indices + dataSize - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ cudaMemcpy(&doLast, dev_bools + dataSize - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+ if (doLast)
+ {
+ ++size;
+ }
+
+ // Copy the output data
+ cudaMemcpy(odata, dev_odata, size * sizeof(int), cudaMemcpyDeviceToHost);
+
+ // Free device arrays
+ cudaFree(dev_odata);
+ cudaFree(dev_indices);
+ cudaFree(dev_bools);
+ cudaFree(dev_idata);
+ checkCUDAError("cudaFree Efficient::compact failed!");
+
+ return size;
}
}
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..6bff991 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -11,15 +11,80 @@ namespace StreamCompaction {
static PerformanceTimer timer;
return timer;
}
- // TODO: __global__
+
+ __global__ void kernNaiveScan(int n, int d, int* odata, const int* idata) // one ping-pong level: odata[i] = idata[i - 2^(d-1)] + idata[i]
+ {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x; // flat 1D global thread index
+ if (index >= n) return; // guard: grid may be larger than n
+
+ int pow2 = (1 << (d - 1)); // partner offset for this depth (d starts at 1)
+ if (index >= pow2)
+ {
+ // Combine
+ odata[index] = idata[index - pow2] + idata[index];
+ }
+ else
+ {
+ // Unused this iteration - Copy over to the other array
+ odata[index] = idata[index];
+ }
+ }
+
+ __global__ void kernShiftRight(int n, int s, int* odata, const int* idata) // inclusive -> exclusive conversion helper
+ {
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+ if (index >= n) return;
+
+ // Shift to the right
+ // Fill newly empty slots with 0s
+ odata[index] = index >= s ? idata[index - s] : 0;
+ }
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+ // Allocate device arrays
+ int *dev_odata, *dev_idata;
+
+ cudaMalloc((void**)&dev_odata, n * sizeof(int));
+ checkCUDAError("cudaMalloc Naive::scan::dev_odata failed!");
+
+ cudaMalloc((void**)&dev_idata, n * sizeof(int));
+ checkCUDAError("cudaMalloc Naive::scan::dev_idata failed!");
+
+ cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+ const int blockSize = 128;
+ dim3 gridDim((n + blockSize - 1) / blockSize);
+
+ // Number of levels needed for the scan
+ int maxDepth = ilog2ceil(n);
+
timer().startGpuTimer();
- // TODO
+
+ // Perform an Inclusive Scan (ping-pong the two buffers each level; result lands in dev_idata after the final swap)
+ for (int d = 1; d <= maxDepth; ++d)
+ {
+ kernNaiveScan<<<gridDim, blockSize>>>(n, d, dev_odata, dev_idata);
+
+ int *tmp = dev_idata;
+ dev_idata = dev_odata;
+ dev_odata = tmp;
+ }
+
+ // Inclusive Scan -> Exclusive Scan conversion
+ kernShiftRight<<<gridDim, blockSize>>>(n, 1, dev_odata, dev_idata);
+
timer().endGpuTimer();
+
+ // Copy the output data
+ cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+ // Free device arrays
+ cudaFree(dev_odata);
+ cudaFree(dev_idata);
+ checkCUDAError("cudaFree Naive::scan failed!");
}
}
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..46ebf77 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,17 @@ namespace StreamCompaction {
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+ thrust::host_vector<int> thrust_odata(n);
+ thrust::host_vector<int> thrust_idata(idata, idata + n);
+
+ thrust::device_vector<int> dev_thrust_odata(thrust_odata);
+ thrust::device_vector<int> dev_thrust_idata(thrust_idata);
+
timer().startGpuTimer();
- // TODO use `thrust::exclusive_scan`
- // example: for device_vectors dv_in and dv_out:
- // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+ thrust::exclusive_scan(dev_thrust_idata.begin(), dev_thrust_idata.end(), dev_thrust_odata.begin());
timer().endGpuTimer();
+
+ thrust::copy(dev_thrust_odata.begin(), dev_thrust_odata.end(), odata);
}
}
}