Skip to content
75 changes: 69 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,75 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Logan Cho
* [LinkedIn](https://www.linkedin.com/in/logan-cho/)
* [Personal Website](https://www.logancho.com/)
* Tested on: Windows 11, 13th Gen Intel(R) Core(TM) i7-13700H, 2.40 GHz, RTX 4060 Laptop GPU

### (TODO: Your README)
# Sample Output
```
****************
** SCAN TESTS **
****************
[ 48 27 47 0 26 20 5 27 18 47 10 9 22 ... 35 0 ]
==== cpu scan, power-of-two ====
elapsed time: 0.0004ms (std::chrono Measured)
[ 0 48 75 122 122 148 168 173 200 218 265 275 284 ... 6629 6664 ]
==== cpu scan, non-power-of-two ====
elapsed time: 0.0004ms (std::chrono Measured)
[ 0 48 75 122 122 148 168 173 200 218 265 275 284 ... 6562 6595 ]
passed
==== naive scan, power-of-two ====
elapsed time: 0.08192ms (CUDA Measured)
passed
==== naive scan, non-power-of-two ====
elapsed time: 0.02048ms (CUDA Measured)
passed
==== work-efficient scan, power-of-two ====
elapsed time: 0.191232ms (CUDA Measured)
passed
==== work-efficient scan, non-power-of-two ====
elapsed time: 0.137216ms (CUDA Measured)
passed
==== thrust scan, power-of-two ====
elapsed time: 0.071904ms (CUDA Measured)
passed
==== thrust scan, non-power-of-two ====
elapsed time: 0.04608ms (CUDA Measured)
passed

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
*****************************
** STREAM COMPACTION TESTS **
*****************************
[ 0 3 3 0 0 2 1 1 0 3 2 1 2 ... 1 0 ]
==== cpu compact without scan, power-of-two ====
elapsed time: 0.0006ms (std::chrono Measured)
passed
==== cpu compact without scan, non-power-of-two ====
elapsed time: 0.0003ms (std::chrono Measured)
passed
==== cpu compact with scan ====
elapsed time: 0.001ms (std::chrono Measured)
passed
==== work-efficient compact, power-of-two ====
elapsed time: 0.188416ms (CUDA Measured)
passed
==== work-efficient compact, non-power-of-two ====
elapsed time: 0.084992ms (CUDA Measured)
passed
```

# Charts
![](images/Chart1.png)
![](images/Chart2.png)

# In-Depth Performance Analysis
* CUDA Block Size Optimization
* Through trial and error, I narrowed down the optimal block sizes for each of my implementations of scan/compaction.
* Naive: (256, 1, 1)
* Efficient: (256, 1, 1)
* Comparison of all Scan Implementations (CPU, GPU (Naive, Efficient, Thrust)):
* ![](images/Chart1.png)
* This chart plots the runtime of the different scan implementations across input array sizes. The GPU methods show a significant performance advantage over the serial CPU method once the array size exceeds 2^14 elements.
* ![](images/Chart2.png)
* This second chart shows that, among the GPU methods, Thrust is by far the most performant. It also shows that Efficient outpaces Naive once the array size exceeds 2^20 elements.
Binary file added images/Chart1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/Chart2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
20 changes: 11 additions & 9 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

// Test-array dimensions. SIZE must be defined exactly once; a second
// definition of the same name is a redefinition error.
const int SIZE = 1 << 22; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
// Heap-allocated test buffers, each SIZE ints long.
int *a = new int[SIZE];
int *b = new int[SIZE];
Expand Down Expand Up @@ -54,11 +54,12 @@ int main(int argc, char* argv[]) {
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
onesArray(SIZE, c);
printDesc("1s array for finding bugs");
StreamCompaction::Naive::scan(SIZE, c, a);
printArray(SIZE, c, true); */

//For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
//onesArray(SIZE, c);
//printDesc("1s array for finding bugs");
//StreamCompaction::Naive::scan(SIZE, c, a);
//printArray(SIZE, c, true);

zeroArray(SIZE, c);
printDesc("naive scan, non-power-of-two");
Expand All @@ -67,6 +68,7 @@ int main(int argc, char* argv[]) {
//printArray(SIZE, c, true);
printCmpResult(NPOT, b, c);


zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
Expand Down Expand Up @@ -115,22 +117,22 @@ int main(int argc, char* argv[]) {
count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
expectedCount = count;
printArray(count, b, true);
//printArray(count, b, true);
printCmpLenResult(count, expectedCount, b, b);

zeroArray(SIZE, c);
printDesc("cpu compact without scan, non-power-of-two");
count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
expectedNPOT = count;
printArray(count, c, true);
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

zeroArray(SIZE, c);
printDesc("cpu compact with scan");
count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printArray(count, c, true);
//printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
Expand Down
1 change: 1 addition & 0 deletions stream_compaction/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

// Portion of __FILE__ that follows its last '/' character; when __FILE__
// contains no '/', the whole of __FILE__ is used unchanged.
#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
// Forwards msg to checkCUDAErrorFn together with the calling file's name
// (via FILENAME) and the line number of the call site.
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
// NOTE(review): presumably the threads-per-block count used for kernel
// launches -- confirm at the launch sites. The README reports 256 as the
// tuned block size for both the naive and efficient scans, which differs
// from this value; verify which one is intended.
#define blockSize 128

/**
* Check for CUDA errors; print and exit if there was a problem.
Expand Down
53 changes: 50 additions & 3 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ namespace StreamCompaction {
// CPU exclusive prefix sum: odata[i] receives idata[0] + ... + idata[i - 1],
// so odata[0] is 0.
//   n     - number of elements to process; nothing is written when n <= 0.
//   odata - output buffer with room for at least n ints.
//   idata - input buffer holding at least n ints.
// The pass is bracketed by timer().startCpuTimer() / timer().endCpuTimer().
void scan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();
    // Running-sum form: each input is read before its output slot is
    // written. An empty input therefore performs no writes, and an in-place
    // call (odata == idata) still yields the correct result.
    int runningSum = 0;
    for (int i = 0; i < n; i++) {
        const int current = idata[i];
        odata[i] = runningSum;
        runningSum += current;
    }
    timer().endCpuTimer();
}

Expand All @@ -31,8 +35,16 @@ namespace StreamCompaction {
// CPU stream compaction without a scan: copies every non-zero element of
// idata to the front of odata, preserving the input order.
//   n     - number of input elements.
//   odata - output buffer; needs room for up to n ints.
//   idata - input buffer holding at least n ints.
// Returns the number of elements written to odata.
int compactWithoutScan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();
    int kept = 0;
    for (int i = 0; i < n; i++) {
        if (idata[i] != 0) {
            odata[kept] = idata[i];
            kept++;
        }
    }
    timer().endCpuTimer();
    // Report the real element count rather than a -1 placeholder.
    return kept;
}

/**
Expand All @@ -41,10 +53,45 @@ namespace StreamCompaction {
* @returns the number of elements remaining after compaction.
*/
// CPU stream compaction via map -> exclusive scan -> scatter: copies every
// non-zero element of idata to the front of odata, preserving order.
//   n     - number of input elements.
//   odata - output buffer; needs room for up to n ints.
//   idata - input buffer holding at least n ints.
// Returns the number of elements written to odata (0 when n <= 0).
int compactWithScan(int n, int *odata, const int *idata) {
    if (n <= 0) {
        // Nothing to compact. Returning here keeps the [0] and [n - 1]
        // accesses below in bounds; the timer is still cycled once so this
        // call produces its own timer reading.
        timer().startCpuTimer();
        timer().endCpuTimer();
        return 0;
    }

    int *bitArray = new int[n];
    int *scanBitArray = new int[n];

    // 1. Map: flag each element that survives compaction (1 = keep, 0 = drop).
    for (int i = 0; i < n; i++) {
        bitArray[i] = (idata[i] != 0) ? 1 : 0;
    }

    // The timer is started exactly once. NOTE(review): it starts after the
    // allocation and flag pass, so those are excluded from the measured
    // time -- confirm that this is the intended measurement window.
    timer().startCpuTimer();

    // 2. Exclusive scan of the flags: scanBitArray[i] is the output index of
    //    element i if it is kept. The loop is written inline; scan() starts
    //    and stops the same timer itself (presumably why it is not called
    //    here -- confirm).
    scanBitArray[0] = 0;
    for (int i = 1; i < n; i++) {
        scanBitArray[i] = scanBitArray[i - 1] + bitArray[i - 1];
    }

    // An exclusive scan never adds the last flag, so include it explicitly;
    // otherwise the count is one short whenever the last input is non-zero.
    const int numElem = scanBitArray[n - 1] + bitArray[n - 1];

    // 3. Scatter: write each kept element to its scanned position.
    for (int i = 0; i < n; i++) {
        if (bitArray[i] != 0) {
            odata[scanBitArray[i]] = idata[i];
        }
    }
    timer().endCpuTimer();

    // Release the scratch buffers before returning the count.
    delete[] bitArray;
    delete[] scanBitArray;
    return numElem;
}
}
}
Loading