41 changes: 36 additions & 5 deletions README.md
@@ -3,11 +3,42 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Yash Vardhan
* Tested on: Windows 10 Pro, Intel i5-4200U (4) @ 2.600GHz 4GB, NVIDIA GeForce 840M 2048MB

### (TODO: Your README)
In this project, I implemented stream compaction and scan (prefix sum) in CUDA on both the CPU and the GPU, and compared their performance side by side. Stream compaction run in parallel on the GPU is useful in many applications such as deferred shading and path tracing, where, for example, it can cull terminated rays from the ray pool after every bounce. Scan (a.k.a. prefix sum) is itself a building block for many parallel algorithms, and the implementations here are designed specifically for the GPU architecture.

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
Scan was implemented in four variants:

- CPU
- Naive Scan on GPU
- Work-efficient Scan on GPU
- Scan using Thrust on GPU (a minimal invocation sketch follows this list).
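
The Thrust-based variant does not appear elsewhere in this diff, so the snippet below is only a sketch of how it could be invoked; it assumes `dev_in` and `dev_out` are device pointers holding `n` ints, and the wrapper name `thrustScan` is made up for illustration.

```cuda
#include <thrust/device_ptr.h>
#include <thrust/scan.h>

// Exclusive scan via Thrust, matching the semantics of the other variants:
// out[0] = 0, out[i] = in[0] + ... + in[i-1].
void thrustScan(int n, int *dev_out, const int *dev_in) {
    thrust::device_ptr<const int> in(dev_in);
    thrust::device_ptr<int> out(dev_out);
    thrust::exclusive_scan(in, in + n, out);
}
```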

Performance Analysis
-------------------------

Block sizes were varied from 16 to 1024. Performance peaked around 64, 128, and 256, so the benchmarks below use a block size of 128.
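
The project reports timings through its own `PerformanceTimer`, but the block-size sweep can also be reproduced with a small standalone harness such as the sketch below; `kernTouch` is a stand-in kernel invented for illustration and is not part of this PR.

```cuda
#include <cstdio>

// Stand-in kernel; the real work is done by the scan kernels in naive.cu / efficient.cu.
__global__ void kernTouch(int n, int *data) {
    int index = threadIdx.x + (blockIdx.x * blockDim.x);
    if (index >= n) return;
    data[index] += 1;
}

int main() {
    const int n = 1 << 20;
    int *dev;
    cudaMalloc(&dev, n * sizeof(int));
    cudaMemset(dev, 0, n * sizeof(int));

    for (int block = 16; block <= 1024; block *= 2) {
        dim3 grid((n + block - 1) / block);  // ceiling division, as in the project kernels
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start);
        kernTouch<<<grid, block>>>(n, dev);
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        printf("block size %4d: %.4f ms\n", block, ms);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    cudaFree(dev);
    return 0;
}
```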

### Time (in ms) vs. number of elements in array
----------------------------------------------

![](img/scan.jpg)

![](img/Data.png)

The array size was increased in steps of 2^4. The transition from 2^16 to 2^20 elements showed a significant divergence between the Thrust and naive GPU implementations, and a gap between the work-efficient and naive GPU implementations also became visible. The work-efficient GPU scan could be optimized considerably further by using shared memory, which a future benchmark will reflect; one possible shape of that change is sketched below.
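
The following sketch is an assumption about that future direction, not code from this PR: the kernel name `kernSharedScanTile` is invented, `blockSize` is the power-of-two constant from `common.h`, and the per-tile results would still need a second pass that scans the tile totals and adds them back across blocks.

```cuda
// Work-efficient (Blelloch) exclusive scan of one blockSize-wide tile in shared memory.
__global__ void kernSharedScanTile(int n, int *odata, const int *idata) {
    __shared__ int tile[blockSize];
    int tid = threadIdx.x;
    int index = threadIdx.x + (blockIdx.x * blockDim.x);

    // Load the tile, padding past the end of the input with zeros.
    tile[tid] = (index < n) ? idata[index] : 0;
    __syncthreads();

    // Up-sweep: build partial sums in place.
    for (int stride = 1; stride < blockSize; stride *= 2) {
        int i = (tid + 1) * stride * 2 - 1;
        if (i < blockSize) {
            tile[i] += tile[i - stride];
        }
        __syncthreads();
    }

    // Down-sweep: zero the root, then push prefix sums back down.
    if (tid == 0) {
        tile[blockSize - 1] = 0;
    }
    __syncthreads();
    for (int stride = blockSize / 2; stride > 0; stride /= 2) {
        int i = (tid + 1) * stride * 2 - 1;
        if (i < blockSize) {
            int t = tile[i - stride];
            tile[i - stride] = tile[i];
            tile[i] += t;
        }
        __syncthreads();
    }

    if (index < n) {
        odata[index] = tile[tid];
    }
}
```

All of the global-memory traffic in the current up-sweep/down-sweep passes becomes shared-memory traffic inside a tile, which is where most of the expected speedup would come from.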

### Scan Results
----------------

Tested on an array of size 512.

![](img/resScan.png)

### Stream Compaction Results
-----------------------------

Tested on an array of size 512.

![](img/resCompact.png)
Binary file added img/Data.png
Binary file added img/resCompact.png
Binary file added img/resScan.png
Binary file added img/scan.jpg
2 changes: 1 addition & 1 deletion stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES

cuda_add_library(stream_compaction
${SOURCE_FILES}
OPTIONS -arch=sm_20
OPTIONS -arch=sm_50
)
11 changes: 10 additions & 1 deletion stream_compaction/common.cu
@@ -24,6 +24,10 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) return;

bools[index] = (idata[index] != 0) ? 1 : 0;
}

/**
@@ -33,7 +37,12 @@
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) return;

int i = indices[index];
if (bools[index] == 1)
odata[i] = idata[index];
}

}
}
4 changes: 3 additions & 1 deletion stream_compaction/common.h
@@ -9,6 +9,8 @@
#include <algorithm>
#include <chrono>
#include <stdexcept>

#define blockSize 128

#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
@@ -27,7 +29,7 @@ inline int ilog2(int x) {
}

inline int ilog2ceil(int x) {
return ilog2(x - 1) + 1;
return ilog2(x - 1) + 1;
}

namespace StreamCompaction {
54 changes: 43 additions & 11 deletions stream_compaction/cpu.cu
@@ -1,25 +1,31 @@
#include <cstdio>
#include "cpu.h"

#include "common.h"
#include "common.h"

namespace StreamCompaction {
namespace CPU {
using StreamCompaction::Common::PerformanceTimer;
PerformanceTimer& timer()
{
static PerformanceTimer timer;
return timer;
using StreamCompaction::Common::PerformanceTimer;
PerformanceTimer& timer()
{
static PerformanceTimer timer;
return timer;
}

/**
* CPU scan (prefix sum).
* For performance analysis, this is supposed to be a simple for loop.
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
//TODO
int sum = 0;
for (int i = 0; i < n; i++)
{
odata[i] = sum;
sum += idata[i];
}
timer().endCpuTimer();
}

@@ -31,8 +37,17 @@ namespace StreamCompaction {
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
int numel = 0;
for (int i = 0; i < n; i++)
{
if (idata[i] != 0)
{
odata[numel] = idata[i];
numel++;
}
}
timer().endCpuTimer();
return -1;
return numel;
}

/**
@@ -43,8 +58,25 @@
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
int *oscan = (int*)malloc(n * sizeof(int));
int *iscan = (int*)malloc(n * sizeof(int));
for (int i = 0; i < n; i++)
if(idata[i]==0)
iscan[i] = 0;
else
iscan[i] = 1;
scan(n, oscan, iscan);
int numel = 0;
for (int i = 0; i < n; i++)
{
if (idata[i] != 0)
{
odata[oscan[i]] = idata[i];
numel++;
}
}
timer().endCpuTimer();
return -1;
return numel;
}
}
}
110 changes: 105 additions & 5 deletions stream_compaction/efficient.cu
@@ -12,13 +12,66 @@ namespace StreamCompaction {
return timer;
}

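// Up-sweep (reduce) phase: at stride k, the last element of each 2k-wide segment accumulates the partial sum of its left half.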
__global__ void upsweep(int n, int k, int* dev)
{
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) return;

if ((index % (2 * k) == 0) && (index + (2 * k) <= n))
dev[index + (2 * k) - 1] += dev[index + k - 1];
}

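// Down-sweep phase: after the root is zeroed, each pass swaps the left child's partial sum up and pushes prefix sums down the implicit tree.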
__global__ void downsweep(int n, int k, int* dev)
{
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) return;

if ((index % (2 * k) == 0) && (index + (2 * k) <= n))
{
int tmp = dev[index + k - 1];
dev[index + k - 1] = dev[index + (2 * k) - 1];
dev[index + (2 * k) - 1] += tmp;
}
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
timer().startGpuTimer();
// TODO
timer().endGpuTimer();
void scan(int n, int *odata, const int *idata) {

int* dev;
int potn = 1 << ilog2ceil(n);

cudaMalloc((void**)&dev, potn * sizeof(int));
checkCUDAError("Malloc for input device failed\n");

cudaMemset(dev, 0, potn * sizeof(int));

cudaMemcpy(dev, idata, n * sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy for device failed\n");

dim3 fullBlocksPerGrid((potn + blockSize - 1) / blockSize);

//timer().startGpuTimer();

for (int k = 1; k < potn; k*=2)
{
upsweep <<< fullBlocksPerGrid, blockSize >>> (potn, k, dev);
}

cudaMemset(dev + potn - 1, 0, sizeof(int));

for (int k = potn/2; k>0; k/=2)
{
downsweep <<< fullBlocksPerGrid, blockSize >>> (potn, k, dev);
}

//timer().endGpuTimer();

cudaMemcpy(odata, dev, n * sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy for output data failed\n");

cudaFree(dev);
}

/**
@@ -31,10 +84,57 @@
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
int* idev;
int* odev;
cudaMalloc((void**)&idev, n * sizeof(int));
checkCUDAError("cudaMalloc idata failed!");

cudaMalloc((void**)&odev, n * sizeof(*odev));
checkCUDAError("cudaMalloc odev failed!");

cudaMemcpy(idev, idata, n * sizeof(*idata), cudaMemcpyHostToDevice);

int potn = 1 << ilog2ceil(n);
int* boolarr;

cudaMalloc((void**)&boolarr, potn * sizeof(int));
checkCUDAError("cudaMalloc bool failed!");

cudaMemset(boolarr, 0, potn * sizeof(int));

int* indices;
cudaMalloc((void**)&indices, potn * sizeof(int));
checkCUDAError("cudaMalloc indices failed!");

cudaMemcpy(indices, boolarr, n * sizeof(int), cudaMemcpyDeviceToDevice);
checkCUDAError("cudaMemcpy from to dev_bools to dev_indices failed!");

dim3 fullBlocksPerGrid((potn + blockSize - 1) / blockSize);

timer().startGpuTimer();
// TODO
StreamCompaction::Common::kernMapToBoolean <<<fullBlocksPerGrid, blockSize >>>(n, boolarr, idev);
scan(n, indices, boolarr);
StreamCompaction::Common::kernScatter <<<fullBlocksPerGrid, blockSize >>>(n, odev, idev, boolarr, indices);

timer().endGpuTimer();
return -1;

cudaMemcpy(odata, odev, n * sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy for odev failed");

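// Compacted count = last exclusive-scan value + last boolean flag (1 if the final input element was kept).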
int numbool = 0;
cudaMemcpy(&numbool, boolarr + n - 1, sizeof(int), cudaMemcpyDeviceToHost);

int numindices = 0;
cudaMemcpy(&numindices, indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);

int total = numbool + numindices;
cudaFree(indices);
cudaFree(idev);
cudaFree(odev);
cudaFree(boolarr);

return total;
}
}
}
54 changes: 53 additions & 1 deletion stream_compaction/naive.cu
@@ -11,15 +11,67 @@ namespace StreamCompaction {
static PerformanceTimer timer;
return timer;
}

// TODO: __global__
__global__ void naivescan(int n, int k, int* idev, int* odev)
{
auto index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) {return;}

if (index >= k)
odev[index] = idev[index] + idev[index - k];
else
odev[index] = idev[index];
}


__global__ void inc2exc(int n, int* idev, int* odev)
{
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if (index >= n) { return; }

if (index > 0)
odev[index] = idev[index - 1];
else
odev[index] = 0;

}
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {

// TODO
int* idev;
int* odev;
cudaMalloc((void**)&idev, n * sizeof(int));
checkCUDAError("Malloc for input device failed\n");

cudaMalloc((void**)&odev, n * sizeof(int));
checkCUDAError("Malloc for output device failed\n");

cudaMemcpy(idev, idata, n * sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy for input device failed\n");

dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);

timer().startGpuTimer();
// TODO

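// Ping-pong idev/odev each pass; after the loop the inclusive scan result is in idev, and inc2exc then shifts it to an exclusive scan.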
for (int k = 1; k < n; k<<=1)
{
naivescan <<< fullBlocksPerGrid, blockSize >>> (n, k, idev, odev);
std::swap(idev, odev);
}

inc2exc <<< fullBlocksPerGrid, blockSize >>> (n, idev, odev);

timer().endGpuTimer();

cudaMemcpy(odata, odev, n * sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy for output device failed\n");

cudaFree(odev);
cudaFree(idev);
}
}
}