Skip to content
68 changes: 62 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,69 @@
CUDA Stream Compaction
======================

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
### University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2

* (TODO) YOUR NAME HERE
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
Ju Yang

### (TODO: Your README)
### Tested on: Windows 7, i7-4710MQ @ 2.50GHz 8GB, GTX 870M 6870MB (Hasee Notebook K770E-i7)
![result](doc/1024.png)

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
## TODOs finished:
### 1. naive.cu

`__global__ void naive_sum(int n, int* odata, int* idata);`

void scan(int n, int *odata, const int *idata);

### 2. efficient.cu

`__global__ void prescan(int *g_odata, int *g_idata, int n, int *temp);`

void scan(int n, int *odata, const int *idata);

int compact(int n, int *odata, const int *idata);


### 3 thrust.cu

void scan(int n, int *odata, const int *idata);


### 4 cpu.cu

void scan(int n, int *odata, const int *idata);

int compactWithoutScan(int n, int *odata, const int *idata) ;

int compactWithScan(int n, int *odata, const int *idata);

### 5 common.cu

`__global__ void kernMapToBoolean(int n, int *bools, const int *idata);`

`__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices);`

### Modified the main.cpp a little bit for display.

## Performance Graph

### Scanning
![result](doc/image001.gif)
### Thrust Scanning
![result](doc/data_29123_image001.gif)
### Compact
![result](doc/data_6317_image001.gif)

## Analysis
### Thrust
As we can see, `thrust::exclusive_scan` is rather time-consuming compared with the other methods. Even when I used `device_vector` to store the data, it was still the slowest.
However, because I did not free the `device_vector`s, the second run (the non-power-of-two case) is much faster.
I think the reason is that a call to a Thrust function first has to acquire some resources (blocks/threads) on the GPU and releases them afterwards.
Although I tried my best to avoid any reads/writes between the CPU and the GPU, the scan function still spends some time on this setup.

### Unfixed Known Bugs
#### 1. When using multiple blocks, the result is sometimes wrong. I think this is because `__syncthreads()` only synchronizes the threads within one block, not across blocks.
#### 2. Since I used only 1 block, the result is wrong when SIZE is larger than 1024 (the maximum number of threads per block).
#### 3. CPU performance is much better, and the computation time does not always increase with SIZE.
I think this is because SIZE is still not large enough.
Binary file added doc/1024.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/128.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/16.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/256.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/32.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/512.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/64.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/8.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/data.xls
Binary file not shown.
Binary file added doc/data_29123_image001.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/data_6317_image001.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/image001.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
13 changes: 8 additions & 5 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,17 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array


const int SIZE = 1 << 10; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int a[SIZE], b[SIZE], c[SIZE];

int main(int argc, char* argv[]) {

// Scan tests
printf("Size= %d, Non-Pow2 Size= %d. \n", SIZE, NPOT);

printf("\n");
printf("****************\n");
printf("** SCAN TESTS **\n");
printf("****************\n");
Expand All @@ -42,7 +45,7 @@ int main(int argc, char* argv[]) {
printDesc("cpu scan, non-power-of-two");
StreamCompaction::CPU::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printArray(NPOT, b, true);
//printArray(NPOT, b, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
Expand Down Expand Up @@ -115,14 +118,14 @@ int main(int argc, char* argv[]) {
count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
expectedNPOT = count;
printArray(count, c, true);
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

zeroArray(SIZE, c);
printDesc("cpu compact with scan");
count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printArray(count, c, true);
//printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
Expand Down
16 changes: 16 additions & 0 deletions stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@ namespace StreamCompaction {
*/
// Writes bools[i] = 1 when idata[i] is non-zero and bools[i] = 0 otherwise,
// for every i in [0, n).
//
// n     - number of elements in idata and bools
// bools - output flags (device pointer, at least n ints)
// idata - input values (device pointer, at least n ints)
//
// Expects a 1D launch with one thread per element; the grid may be larger
// than n, because threads whose global index is >= n do nothing.
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
    // Global index: threadIdx.x alone would only address the first block's
    // elements when more than one block is launched.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= n) {
        return;  // tail threads of the last block have no element
    }
    bools[index] = (idata[index] != 0) ? 1 : 0;
}

/**
Expand All @@ -33,6 +42,13 @@ namespace StreamCompaction {
// For every i in [0, n) with bools[i] != 0, copies idata[i] to
// odata[indices[i]]. Elements whose flag is 0 are not written anywhere.
//
// n       - number of elements in idata, bools and indices
// odata   - output array (device pointer); must be large enough to hold
//           every index stored in `indices` for a flagged element
// idata   - input values (device pointer)
// bools   - per-element keep flags (device pointer)
// indices - per-element destination index in odata (device pointer)
//
// Expects a 1D launch with one thread per element; the grid may be larger
// than n, because threads whose global index is >= n do nothing.
__global__ void kernScatter(int n, int *odata,
        const int *idata, const int *bools, const int *indices) {
    // Global index: threadIdx.x alone would only address the first block's
    // elements when more than one block is launched.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= n) {
        return;  // tail threads of the last block have no element
    }

    if (bools[index] != 0) {
        odata[indices[index]] = idata[index];
    }
}

}
Expand Down
52 changes: 44 additions & 8 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
#include <cstdio>
#include "cpu.h"

#include "common.h"
#include "common.h"

namespace StreamCompaction {
namespace CPU {
using StreamCompaction::Common::PerformanceTimer;

// Returns the PerformanceTimer used by the CPU routines in this namespace.
// The timer is a function-local static, so every call hands back a
// reference to the same instance.
PerformanceTimer& timer()
{
    static PerformanceTimer timer;
    return timer;
}

/**
Expand All @@ -20,6 +20,14 @@ namespace StreamCompaction {
// CPU prefix sum over the first n elements of idata, timed with the CPU timer.
//
// n     - number of elements to process
// odata - output array; odata[k] receives idata[0] + ... + idata[k]
// idata - input array
//
// NOTE(review): this is an inclusive running sum (element k includes
// idata[k]). If callers compare against an exclusive scan, confirm which
// convention is intended.
void scan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();

    int runningTotal = 0;
    int k = 0;
    while (k < n) {
        runningTotal += idata[k];
        odata[k] = runningTotal;
        ++k;
    }

    timer().endCpuTimer();
}

Expand All @@ -31,8 +39,21 @@ namespace StreamCompaction {
// CPU stream compaction in a single pass, timed with the CPU timer.
// Copies every non-zero element of idata to odata, preserving order.
//
// n     - number of elements in idata
// odata - output array; receives the non-zero elements packed at the front
// idata - input array
//
// Returns the number of elements written to odata (0 when n <= 0).
int compactWithoutScan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();

    // `kept` is both the count of elements kept so far and the next free
    // slot in odata.
    int kept = 0;
    for (int i = 0; i < n; ++i) {
        if (idata[i] != 0) {
            odata[kept++] = idata[i];
        }
    }

    timer().endCpuTimer();
    return kept;
}

/**
Expand All @@ -43,8 +64,23 @@ namespace StreamCompaction {
// CPU stream compaction built from a scan followed by a scatter, timed with
// the CPU timer. Copies every non-zero element of idata to odata,
// preserving order.
//
// n     - number of elements in idata
// odata - output array; receives the non-zero elements packed at the front
// idata - input array
//
// Returns the number of elements written to odata (0 when n <= 0).
int compactWithScan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();

    int kept = 0;
    if (n > 0) {
        // Temporary buffer for the scan result; its allocation is part of
        // the timed region.
        int *indices = new int[n];

        // Exclusive scan of the keep-flags (1 for non-zero, 0 for zero):
        // indices[i] is the number of kept elements before position i,
        // i.e. the output slot for element i if it is kept.
        int running = 0;
        for (int i = 0; i < n; ++i) {
            indices[i] = running;
            running += (idata[i] != 0) ? 1 : 0;
        }

        // Scatter: move each kept element to its scanned position.
        for (int i = 0; i < n; ++i) {
            if (idata[i] != 0) {
                odata[indices[i]] = idata[i];
            }
        }

        kept = running;
        delete[] indices;
    }

    timer().endCpuTimer();
    return kept;
}
}
}
Loading