73 changes: 67 additions & 6 deletions README.md
@@ -3,12 +3,73 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Li Zheng
* [LinkedIn](https://www.linkedin.com/in/li-zheng-1955ba169)
* Tested on: Windows 10, CUDA 10, i5-3600 @ 3.59GHz 16GB, RTX 2060 6GB (personal computer)

### (TODO: Your README)
This project implements several versions of scan (prefix sum): a CPU scan, a naive GPU scan, a work-efficient GPU scan, and a scan using Thrust. Some of these methods are then used to implement stream compaction. A timer measures the elapsed time of each implementation so their performance can be compared.
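
As a quick, illustrative example (not taken from the project code), an exclusive scan and a stream compaction of a small array behave as follows; the array contents here are arbitrary:

```cpp
#include <cstdio>

// Illustration only: what exclusive scan and stream compaction compute
// on a tiny input. The project performs the same operations on the GPU.
int main() {
    const int n = 8;
    int in[n] = { 3, 1, 7, 0, 4, 1, 6, 3 };

    // Exclusive prefix sum: scan[k] = in[0] + ... + in[k - 1], scan[0] = 0.
    int scan[n];
    scan[0] = 0;
    for (int k = 1; k < n; ++k) scan[k] = scan[k - 1] + in[k - 1];
    // scan -> 0 3 4 11 11 15 16 22

    // Stream compaction: keep only the non-zero elements, preserving order.
    int out[n], count = 0;
    for (int i = 0; i < n; ++i)
        if (in[i] != 0) out[count++] = in[i];
    // out -> 3 1 7 4 1 6 3   (count = 7)

    for (int k = 0; k < n; ++k) printf("%d ", scan[k]);
    printf("\n");
    for (int i = 0; i < count; ++i) printf("%d ", out[i]);
    printf("\n");
    return 0;
}
```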

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
### Performance Analysis
![blockSize](img/blockSize.PNG)
This diagram shows how the measured time changes with respect to block size. Block sizes of 128 and 256 give relatively good performance.
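
For reference, the block size only enters through the kernel launch configuration. A minimal sketch of the pattern used throughout this change (the `blockSize` constant itself is defined in `common.h`; the kernel here is a placeholder, not one of the project's kernels):

```cuda
#include <cuda_runtime.h>

#define blockSize 256  // compile-time constant, as in common.h

// Placeholder kernel: the guard handles the extra threads in the last block.
__global__ void kernExample(int n, int *data) {
    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (index >= n) {
        return;
    }
    data[index] += 1;
}

void launchExample(int n, int *dev_data) {
    // Round up so every element gets a thread; changing blockSize only
    // changes how those threads are grouped into blocks.
    dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
    kernExample<<<fullBlocksPerGrid, blockSize>>>(n, dev_data);
}
```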

![powerOfTwo](img/powerOfTwo.PNG)
![nonePowerOfTwo](img/nonPowerOfTwo.PNG)
The diagrams show how the measured time changes as the array size increases; the first is for power-of-two array sizes and the second for non-power-of-two sizes. Their performance is almost the same. When the array size is small, the CPU implementation performs better. I think this is because, at the deeper levels of the GPU scans, most threads do no actual work and the kernels mostly shuffle values in device memory. Additionally, the GPU implementations use bit shifts to compute the offsets and intervals of each level, which takes extra time. As the array size increases, the GPU implementations perform better, especially the work-efficient and Thrust versions.
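
To make the first point concrete: in the work-efficient kernels below, a thread at depth `d` only does work when `(index + 1)` is a multiple of `2^(d+1)`, so the number of active threads halves at every level while a full grid is still launched. A small illustration (the array size here is just an example):

```cpp
#include <cstdio>

// Illustration only: how many threads do useful work at each up-sweep depth
// for a power-of-two array of N elements, even though N threads are launched.
int main() {
    const int N = 1 << 16;
    for (int d = 0; (1 << (d + 1)) <= N; ++d) {
        printf("depth %2d: %6d of %d threads do work\n", d, N >> (d + 1), N);
    }
    return 0;
}
```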

### Output of The Test Program
Here is the test output for an array of 2^16 elements with a block size of 128. More results are in img/performance analysis.xlsx.
```
****************
** SCAN TESTS **
****************
[ 49 29 24 15 46 49 46 8 35 40 38 18 44 ... 3 0 ]
==== cpu scan, power-of-two ====
elapsed time: 0.1305ms (std::chrono Measured)
[ 0 49 78 102 117 163 212 258 266 301 341 379 397 ... 1603889 1603892 ]
==== cpu scan, non-power-of-two ====
elapsed time: 0.129ms (std::chrono Measured)
[ 0 49 78 102 117 163 212 258 266 301 341 379 397 ... 1603839 1603856 ]
passed
==== naive scan, power-of-two ====
elapsed time: 0.052416ms (CUDA Measured)
passed
==== naive scan, non-power-of-two ====
elapsed time: 0.050752ms (CUDA Measured)
passed
==== work-efficient scan, power-of-two ====
elapsed time: 0.11264ms (CUDA Measured)
passed
==== work-efficient scan, non-power-of-two ====
elapsed time: 0.113344ms (CUDA Measured)
passed
==== thrust scan, power-of-two ====
elapsed time: 0.073344ms (CUDA Measured)
passed
==== thrust scan, non-power-of-two ====
elapsed time: 0.055296ms (CUDA Measured)
passed

*****************************
** STREAM COMPACTION TESTS **
*****************************
[ 3 3 2 1 0 1 2 2 3 0 2 2 0 ... 3 0 ]
==== cpu compact without scan, power-of-two ====
elapsed time: 0.1239ms (std::chrono Measured)
[ 3 3 2 1 1 2 2 3 2 2 1 2 2 ... 3 3 ]
passed
==== cpu compact without scan, non-power-of-two ====
elapsed time: 0.1241ms (std::chrono Measured)
[ 3 3 2 1 1 2 2 3 2 2 1 2 2 ... 3 2 ]
passed
==== cpu compact with scan ====
elapsed time: 0.3149ms (std::chrono Measured)
[ 3 3 2 1 1 2 2 3 2 2 1 2 2 ... 3 3 ]
passed
==== work-efficient compact, power-of-two ====
elapsed time: 0.124928ms (CUDA Measured)
passed
==== work-efficient compact, non-power-of-two ====
elapsed time: 0.198656ms (CUDA Measured)
passed
```
Binary file added img/blockSize.PNG
Binary file added img/nonPowerOfTwo.PNG
Binary file added img/performance analysis.xlsx
Binary file added img/powerOfTwo.PNG
2 changes: 1 addition & 1 deletion src/main.cpp
@@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 16; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
17 changes: 17 additions & 0 deletions stream_compaction/common.cu
@@ -24,6 +24,16 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// Map each element to a boolean: 1 if it is non-zero (kept by compaction), 0 otherwise.
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index >= n) {
return;
}
if (idata[index] == 0) {
bools[index] = 0;
}
else {
bools[index] = 1;
}
}

/**
@@ -33,6 +43,13 @@
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// Scatter: if an element is kept, write it to its scanned index in odata.
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index >= n) {
return;
}
if (bools[index] == 1) {
odata[indices[index]] = idata[index];
}
}

}
1 change: 1 addition & 0 deletions stream_compaction/common.h
@@ -12,6 +12,7 @@

#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
#define blockSize 256

/**
* Check for CUDA errors; print and exit if there was a problem.
112 changes: 72 additions & 40 deletions stream_compaction/cpu.cu
@@ -4,47 +4,79 @@
#include "common.h"

namespace StreamCompaction {
namespace CPU {
using StreamCompaction::Common::PerformanceTimer;
PerformanceTimer& timer()
{
static PerformanceTimer timer;
return timer;
}
namespace CPU {
using StreamCompaction::Common::PerformanceTimer;
PerformanceTimer& timer()
{
static PerformanceTimer timer;
return timer;
}

/**
* CPU scan (prefix sum).
* For performance analysis, this is supposed to be a simple for loop.
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
timer().endCpuTimer();
}
/**
* CPU scan (prefix sum).
* For performance analysis, this is supposed to be a simple for loop.
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// Exclusive prefix sum: odata[k] holds the sum of idata[0..k-1].
odata[0] = 0;
for (int k = 1; k < n; ++k) {
odata[k] = odata[k - 1] + idata[k - 1];
}
timer().endCpuTimer();
}

/**
* CPU stream compaction without using the scan function.
*
* @returns the number of elements remaining after compaction.
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
timer().endCpuTimer();
return -1;
}
/**
* CPU stream compaction without using the scan function.
*
* @returns the number of elements remaining after compaction.
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// Compact on the CPU: copy each non-zero element to the next free slot.
int ptr = 0;
for (int i = 0; i < n; i++) {
if (idata[i] != 0) {
odata[ptr] = idata[i];
ptr++;
}
}
timer().endCpuTimer();
return ptr;
}

/**
* CPU stream compaction using scan and scatter, like the parallel version.
*
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
timer().endCpuTimer();
return -1;
}
}
/**
* CPU stream compaction using scan and scatter, like the parallel version.
*
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// CPU emulation of the GPU pipeline: map to booleans, exclusive scan, then scatter.
int count = 0;
int *tmp = new int[n];
for (int i = 0; i < n; i++) {
if (idata[i] == 0) {
tmp[i] = 0;
}
else {
tmp[i] = 1;
count++;
}
}
int *tmpScan = new int[n];
tmpScan[0] = 0;
for (int k = 1; k < n; ++k) {
tmpScan[k] = tmpScan[k - 1] + tmp[k - 1];
}
// Scatter: only kept (non-zero) elements are written to their scanned index.
for (int i = 0; i < n; i++) {
if (tmp[i] == 1) {
odata[tmpScan[i]] = idata[i];
}
}
timer().endCpuTimer();
delete[] tmp;
delete[] tmpScan;
return count;
}
}
}
104 changes: 99 additions & 5 deletions stream_compaction/efficient.cu
@@ -12,15 +12,67 @@ namespace StreamCompaction {
return timer;
}

// Up-sweep (reduce) phase of the work-efficient scan: at depth d, every
// element whose (index + 1) is a multiple of 2^(d+1) adds in the partial
// sum located 2^d positions to its left.
__global__ void kernEfficientScanUpSweep(int n, int *odata, int d) {
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) {
return;
}
int interval = 1 << (d + 1);
int halfInterval = 1 << d;
if ((index + 1) % interval == 0) {
odata[index] += odata[index - halfInterval];
}
}

// Down-sweep phase: the root is zeroed at the top level, then each active
// element passes its value down to the position 2^d to its left and adds
// that position's old value to itself, yielding an exclusive scan.
__global__ void kernEfficientScanDownSweep(int n, int *odata, int d, int topLayer) {
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) {
return;
}
if (d == topLayer && index == n - 1) {
odata[index] = 0;
}
int interval = 1 << (d + 1);
int halfInterval = 1 << d;
if ((index + 1) % interval == 0) {
int tmp = odata[index - halfInterval];
odata[index - halfInterval] = odata[index];
odata[index] += tmp;
}
}
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
timer().startGpuTimer();
// TODO
timer().endGpuTimer();
int N = 1 << ilog2ceil(n); // pad to the next power of two
int *dev_odata;
cudaMalloc((void**)&dev_odata, N * sizeof(int));
cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
if (N > n) {
// Zero-pad the tail so the scan over the padded power-of-two array stays correct.
cudaMemset(dev_odata + n, 0, (N - n) * sizeof(int));
}
timer().startGpuTimer();
dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); // launch enough threads to cover all N padded elements
int topLayer = ilog2ceil(n) - 1;
for (int d = 0; d <= topLayer; d++) {
kernEfficientScanUpSweep << <fullBlocksPerGrid, blockSize >> > (N, dev_odata, d);
}

for (int d = topLayer; d >= 0; d--) {
kernEfficientScanDownSweep << <fullBlocksPerGrid, blockSize >> > (N, dev_odata, d, topLayer);
}

timer().endGpuTimer();
cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dev_odata);
}


/**
* Performs stream compaction on idata, storing the result into odata.
* All zeroes are discarded.
@@ -31,10 +83,52 @@
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
timer().startGpuTimer();
// TODO
timer().endGpuTimer();
return -1;
int N = 1 << ilog2ceil(n); // pad to the next power of two
dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); // cover all N padded elements
int topLayer = ilog2ceil(n) - 1;
int *dev_idata, *dev_odata, *dev_bools, *dev_indices;
cudaMalloc((void**)&dev_idata, n * sizeof(int));
cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc((void**)&dev_odata, n * sizeof(int));
cudaMalloc((void**)&dev_bools, N * sizeof(int));
cudaMalloc((void**)&dev_indices, N * sizeof(int));

timer().startGpuTimer();
Common::kernMapToBoolean << <fullBlocksPerGrid, blockSize >> > (n, dev_bools, dev_idata);
if (N > n) {
// Zero-pad the boolean array so the padded scan stays correct.
cudaMemset(dev_bools + n, 0, (N - n) * sizeof(int));
}
cudaMemcpy(dev_indices, dev_bools, N * sizeof(int), cudaMemcpyDeviceToDevice);

// Count the kept elements on the host; this is also the value returned to the caller.
int countScatter = 0;
for (int i = 0; i < n; i++) {
if (idata[i] != 0) {
countScatter++;
}
}
for (int d = 0; d <= topLayer; d++) {
kernEfficientScanUpSweep << <fullBlocksPerGrid, blockSize >> > (N, dev_indices, d);
}

for (int d = topLayer; d >= 0; d--) {
kernEfficientScanDownSweep << <fullBlocksPerGrid, blockSize >> > (N, dev_indices, d, topLayer);
}

Common::kernScatter << <fullBlocksPerGrid, blockSize >> > (n, dev_odata,
dev_idata, dev_bools, dev_indices);
timer().endGpuTimer();

cudaMemcpy(odata, dev_odata, countScatter * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dev_idata);
cudaFree(dev_odata);
cudaFree(dev_bools);
cudaFree(dev_indices);
return countScatter;
}
}
}