diff --git a/README.md b/README.md
index 0e38ddb..d2ee079 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,115 @@
-CUDA Stream Compaction
-======================
+Project 2 Stream Compaction
+===========================
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
+**University of Pennsylvania, CIS 5650: GPU Programming and Architecture**
-* (TODO) YOUR NAME HERE
- * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Dominik Kau ([LinkedIn](https://www.linkedin.com/in/dominikkau/))
+* Tested on: Windows 10, i7-12700 @ 2.10 GHz, 32 GB, T1000 4096 MB (CETS machine)
-### (TODO: Your README)
+## Scan and Stream Compaction
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+Scan is an algorithm that outputs an array in which all previous elements of the input array have been summed up (see [prefix sum](https://en.wikipedia.org/wiki/Prefix_sum)).
+Stream compaction outputs an array that contains only those elements of the input that fulfill some predicate (in this project being non-zero).
+The parallel implementation of stream compaction in this project will make use of the scan algorithm.
+This project contains 4 implementations of the scan algorithm:
+* straight-forward implementation on the CPU (CPU)
+* naive, parallel implementation on the GPU (GPU naive)
+* work-efficient implementation on the GPU (GPU efficient)
+* implementation using the thrust library (GPU thrust)
+
+The stream compaction algorithm is implemented in 3 variants:
+* straight-forward implementation on the CPU (CPU)
+* scan based implementation on the CPU
+* work-efficient scan based implementation on the GPU (GPU)
+
+These cases are labeled in the following graphs by the identifiers given in parentheses.
+All implementations were tested on array lengths that are powers of 2 and on array lengths that are not powers of 2.
+For Part 5 (extra credit), I implemented the optimization as I understand it: in every round of the algorithm, only as many threads as are needed (rounded up to the block size) are launched.
+
+## Performance Analysis
+
+### Scan timings on arrays with sizes that are powers of 2
+
+![Scan timings for arrays with power-of-2 sizes](img/performance_pow2.svg)
+
+Surprisingly, the CPU implementation is quicker than both of my GPU implementations.
+This is despite the fact that, for both GPU implementations, I am launching kernels only with as many threads as are needed (up to the block size).
+This prevents starting many threads that will not be active after a first if-condition.
+This result might be due to the usage of a CETS machine.
+
+Unfortunately, I haven't been able to make a detailed performance analysis using the Nsight suite, but obviously the memory access pattern is suboptimal.
+Firstly, using global memory will lead to high latencies in each kernel call.
+Secondly, the strided indexing pattern in both GPU implementations has the same performance as random memory access, which creates a big bottleneck.
+
+Those bottlenecks are not present in the thrust implementation which is by far the fastest.
+
+### Scan timings on arrays with sizes that are not powers of 2
+
+![Scan timings for arrays with non-power-of-2 sizes](img/performance_nonpow2.svg)
+
+The performance on arrays with non-powers of 2 lengths reflects the same trends as the results above.
+Interestingly, for smaller arrays the thrust implementation is quite a bit quicker in this scenario than in the above case of arrays with power-of-2 lengths.
+
+### Timing results of compacting algorithm
+
+![Timings of the compaction algorithm](img/performance_compact.svg)
+
+For the compacting algorithm, the GPU implementation is actually faster for large array sizes.
+Here, only the straight-forward CPU implementation is used, as it is faster than the scan based algorithm.
+I would have expected bigger differences between the GPU and the CPU implementations, but again, the surprisingly short CPU runtime could stem from the fact that I am using a CETS machine.
+There is not a big difference between the arrays with power of 2 lengths and those with non-power of 2 lengths.
+
+
+## Console Output
+
+This is the console output after running the project with an array size of $2^{20} = 1048576$.
+I removed the numeric outputs for clarity.
+
+```
+****************
+** SCAN TESTS **
+****************
+==== cpu scan, power-of-two ====
+ elapsed time: 0.4223ms (std::chrono Measured)
+==== cpu scan, non-power-of-two ====
+ elapsed time: 0.3906ms (std::chrono Measured)
+ passed
+==== naive scan, power-of-two ====
+ elapsed time: 2.48595ms (CUDA Measured)
+ passed
+==== naive scan, non-power-of-two ====
+ elapsed time: 2.34307ms (CUDA Measured)
+ passed
+==== work-efficient scan, power-of-two ====
+ elapsed time: 1.90022ms (CUDA Measured)
+ passed
+==== work-efficient scan, non-power-of-two ====
+ elapsed time: 1.82384ms (CUDA Measured)
+ passed
+==== thrust scan, power-of-two ====
+ elapsed time: 0.503264ms (CUDA Measured)
+ passed
+==== thrust scan, non-power-of-two ====
+ elapsed time: 0.183776ms (CUDA Measured)
+ passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+==== cpu compact without scan, power-of-two ====
+ elapsed time: 2.1741ms (std::chrono Measured)
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ elapsed time: 2.1809ms (std::chrono Measured)
+ passed
+==== cpu compact with scan ====
+ elapsed time: 3.3442ms (std::chrono Measured)
+ passed
+==== work-efficient compact, power-of-two ====
+ elapsed time: 3.5247ms (CUDA Measured)
+ passed
+==== work-efficient compact, non-power-of-two ====
+ elapsed time: 3.73046ms (CUDA Measured)
+ passed
+```
diff --git a/img/performance_compact.svg b/img/performance_compact.svg
new file mode 100644
index 0000000..28efc4a
--- /dev/null
+++ b/img/performance_compact.svg
@@ -0,0 +1,1379 @@
+
+
+
diff --git a/img/performance_nonpow2.svg b/img/performance_nonpow2.svg
new file mode 100644
index 0000000..26a93fc
--- /dev/null
+++ b/img/performance_nonpow2.svg
@@ -0,0 +1,1342 @@
+
+
+
diff --git a/img/performance_pow2.svg b/img/performance_pow2.svg
new file mode 100644
index 0000000..35c2c07
--- /dev/null
+++ b/img/performance_pow2.svg
@@ -0,0 +1,1328 @@
+
+
+
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..ca51934 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
#include
#include "testing_helpers.hpp"
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 20; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
@@ -51,48 +51,48 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
+ printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);
- /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
+ /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
onesArray(SIZE, c);
printDesc("1s array for finding bugs");
StreamCompaction::Naive::scan(SIZE, c, a);
- printArray(SIZE, c, true); */
+ printArray(SIZE, c, true);*/
zeroArray(SIZE, c);
printDesc("naive scan, non-power-of-two");
StreamCompaction::Naive::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
+ printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
+ printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);
zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(NPOT, c, true);
+ printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
zeroArray(SIZE, c);
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(SIZE, c, true);
+ printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);
zeroArray(SIZE, c);
printDesc("thrust scan, non-power-of-two");
StreamCompaction::Thrust::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(NPOT, c, true);
+ printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
printf("\n");
@@ -137,14 +137,14 @@ int main(int argc, char* argv[]) {
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(count, c, true);
+ printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);
zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
- //printArray(count, c, true);
+ printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);
system("pause"); // stop Win32 console from closing on exit
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..12c8acf 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -1,4 +1,5 @@
#include "common.h"
+#include "device_launch_parameters.h"
void checkCUDAErrorFn(const char *msg, const char *file, int line) {
cudaError_t err = cudaGetLastError();
@@ -17,22 +18,41 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) {
namespace StreamCompaction {
namespace Common {
+ __global__ void shiftArrayElements(int n, int shift, const int* readBuffer, int* writeBuffer) {
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index >= n) return;
+ if (index < shift) {
+ writeBuffer[index] = 0;
+ return;
+ }
+
+ writeBuffer[index] = readBuffer[index - shift];
+ }
/**
* Maps an array to an array of 0s and 1s for stream compaction. Elements
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
- __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+ __global__ void kernMapToBoolean(int n, int* bools, const int* idata) {
// TODO
- }
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index >= n) return;
+ bools[index] = (idata[index] != 0);
+
+ }
+
/**
* Performs scatter on an array. That is, for each element in idata,
* if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
*/
- __global__ void kernScatter(int n, int *odata,
- const int *idata, const int *bools, const int *indices) {
+ __global__ void kernScatter(int n, int* odata,
+ const int* idata, const int* bools, const int* indices) {
// TODO
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index >= n) return;
+
+ if (bools[index]) odata[indices[index]] = idata[index];
}
}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index d2c1fed..cfdc45b 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -37,6 +37,9 @@ namespace StreamCompaction {
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices);
+ __global__ void shiftArrayElements(int n, int shift,
+ const int* readBuffer, int* writeBuffer);
+
/**
* This class is used for timing the performance
* Uncopyable and unmovable
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..044d37b 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -17,10 +17,15 @@ namespace StreamCompaction {
* For performance analysis, this is supposed to be a simple for loop.
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
- void scan(int n, int *odata, const int *idata) {
- timer().startCpuTimer();
- // TODO
- timer().endCpuTimer();
+ void scan(int n, int *odata, const int *idata, bool timed) {
+ if (timed) timer().startCpuTimer();
+ // TODO
+ int partialSum = 0;
+ for (int i = 0; i < n; ++i) {
+ odata[i] = partialSum;
+ partialSum += idata[i];
+ }
+ if (timed) timer().endCpuTimer();
}
/**
@@ -30,9 +35,20 @@ namespace StreamCompaction {
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
- // TODO
+ int numElements = 0;
+ for (int i = 0; i < n; ++i) {
+ if (idata[i]) odata[numElements++] = idata[i];
+ }
timer().endCpuTimer();
- return -1;
+ return numElements;
+ }
+
+ int scatter(int n, int* odata, const int* bdata, const int* idata) {
+ int numElements = 0;
+ for (int i = 0; i < n; ++i) {
+ if (bdata[i]) odata[numElements++] = idata[i];
+ }
+ return numElements;
}
/**
@@ -41,10 +57,20 @@ namespace StreamCompaction {
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
+ int* buffer = new int[n];
timer().startCpuTimer();
- // TODO
+ // Create boolean mask
+ for (int i = 0; i < n; ++i) {
+ buffer[i] = (idata[i] != 0);
+ }
+
+ scan(n, odata, idata, 0);
+
+ int numElements = scatter(n, odata, buffer, idata);
+
timer().endCpuTimer();
- return -1;
+ delete[] buffer;
+ return numElements;
}
}
}
diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h
index 873c047..f2f8c14 100644
--- a/stream_compaction/cpu.h
+++ b/stream_compaction/cpu.h
@@ -6,7 +6,7 @@ namespace StreamCompaction {
namespace CPU {
StreamCompaction::Common::PerformanceTimer& timer();
- void scan(int n, int *odata, const int *idata);
+ void scan(int n, int *odata, const int *idata, bool timed = 1);
int compactWithoutScan(int n, int *odata, const int *idata);
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..89427ea 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,6 +2,7 @@
#include
#include "common.h"
#include "efficient.h"
+#include "device_launch_parameters.h"
namespace StreamCompaction {
namespace Efficient {
@@ -12,13 +13,76 @@ namespace StreamCompaction {
return timer;
}
+ __global__ void kernUpSweep(int n, int pow2tod, int* buffer) {
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+
+ int pow2todp1 = 2 * pow2tod;
+
+ if (index > n / pow2todp1 - 1) return;
+ index *= pow2todp1;
+
+ buffer[index + pow2todp1 - 1] += buffer[index + pow2tod - 1];
+ }
+
+ __global__ void kernDownSweep(int n, int pow2tod, int* buffer) {
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+
+ int pow2todp1 = 2 * pow2tod;
+
+ if (index > n / pow2todp1 - 1) return;
+ index *= pow2todp1;
+
+ int tmp = buffer[index + pow2tod - 1];
+ buffer[index + pow2tod - 1] = buffer[index + pow2todp1 - 1];
+ buffer[index + pow2todp1 - 1] += tmp;
+ }
+
+ dim3 computeBlocksPerGrid(int threads, int blockSize) {
+ return dim3{ (unsigned int)(threads + blockSize - 1) / blockSize };
+ }
+
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
- void scan(int n, int *odata, const int *idata) {
- timer().startGpuTimer();
+ void scan(int n, int *odata, const int *idata, bool timed) {
+ int blockSize = 128;
+
+ bool isPower2Length = (n == (1 << ilog2(n)));
+
+ int bufferLength = (isPower2Length) ? n : 1 << ilog2ceil(n);
+
+ int* dev_tmpArray;
+ cudaMalloc((void**)&dev_tmpArray, bufferLength * sizeof(int));
+ checkCUDAError("cudaMalloc tmpArray failed!");
+
+ if (!isPower2Length) {
+ cudaMemset(dev_tmpArray + n, 0, (bufferLength - n) * sizeof(int));
+ }
+
+ cudaMemcpy(dev_tmpArray, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+ if (timed) timer().startGpuTimer();
// TODO
- timer().endGpuTimer();
+ for (int d = 0; d < ilog2ceil(n); ++d) {
+ dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize);
+ kernUpSweep<<>>(bufferLength, 1 << d, dev_tmpArray);
+ cudaDeviceSynchronize();
+ checkCUDAError("kernUpSweep failed!");
+ }
+
+ cudaMemset(dev_tmpArray + bufferLength - 1, 0, sizeof(int));
+
+ for (int d = ilog2ceil(n) - 1; d >= 0; --d) {
+ dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize);
+ kernDownSweep<<>>(bufferLength, 1 << d, dev_tmpArray);
+ cudaDeviceSynchronize();
+ checkCUDAError("kernDownSweep failed!");
+ }
+ if (timed) timer().endGpuTimer();
+
+ cudaMemcpy(odata, dev_tmpArray, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+ cudaFree(dev_tmpArray);
}
/**
@@ -30,11 +94,55 @@ namespace StreamCompaction {
* @param idata The array of elements to compact.
* @returns The number of elements remaining after compaction.
*/
- int compact(int n, int *odata, const int *idata) {
+ int compact(int n, int* odata, const int* idata) {
+ int blockSize = 128;
+ dim3 blocks{ (unsigned int)(n + blockSize - 1) / blockSize };
+
+ int* dev_buffer1;
+ int* dev_buffer2;
+ int* dev_boolArray;
+ int* dev_indices;
+ cudaMalloc((void**)&dev_boolArray, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_boolArray failed!");
+ cudaMalloc((void**)&dev_indices, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_indices failed!");
+ cudaMalloc((void**)&dev_buffer1, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_buffer1 failed!");
+ cudaMalloc((void**)&dev_buffer2, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_buffer2 failed!");
+
+ cudaMemcpy(dev_buffer1, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+ checkCUDAError("cudaMemcpy idata->dev_buffer1 failed!");
+
timer().startGpuTimer();
- // TODO
+
+ StreamCompaction::Common::kernMapToBoolean<<>>(n, dev_boolArray, dev_buffer1);
+ cudaDeviceSynchronize();
+ checkCUDAError("kernMapToBoolean failed!");
+
+ scan(n, dev_indices, dev_boolArray, 0);
+
+ StreamCompaction::Common::kernScatter<<>>(n, dev_buffer2, dev_buffer1, dev_boolArray, dev_indices);
+ cudaDeviceSynchronize();
+ checkCUDAError("kernScatter failed!");
+
+ cudaMemcpy(odata, dev_buffer2, n * sizeof(int), cudaMemcpyDeviceToHost);
+ checkCUDAError("cudaMemcpy dev_buffer2->odata failed!");
+
+ // Index that last element in idata would have, if it was valid
+ int lastIndex;
+ cudaMemcpy(&lastIndex, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+ // Check if last element is valid
+ int lastBool;
+ cudaMemcpy(&lastBool, dev_boolArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
timer().endGpuTimer();
- return -1;
+
+ cudaFree(dev_boolArray);
+ cudaFree(dev_indices);
+ cudaFree(dev_buffer1);
+ cudaFree(dev_buffer2);
+ return (lastBool) ? lastIndex + 1 : lastIndex;
}
}
-}
+}
\ No newline at end of file
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4f..a4455cc 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -6,7 +6,7 @@ namespace StreamCompaction {
namespace Efficient {
StreamCompaction::Common::PerformanceTimer& timer();
- void scan(int n, int *odata, const int *idata);
+ void scan(int n, int *odata, const int *idata, bool timed = 1);
int compact(int n, int *odata, const int *idata);
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..b7c18cd 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,6 +2,7 @@
#include
#include "common.h"
#include "naive.h"
+#include "device_launch_parameters.h"
namespace StreamCompaction {
namespace Naive {
@@ -12,14 +13,72 @@ namespace StreamCompaction {
return timer;
}
// TODO: __global__
+ __global__ void kernHandleNonPower(int n, int d, int* buffer) {
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ int pow2tod = 1 << d;
+
+ if (index >= n - pow2tod) return;
+
+ buffer[pow2tod + index] += buffer[index];
+ }
+
+ __global__ void kernNaiveScanStep(int n, int d, const int* readBuffer, int* writeBuffer) {
+ // compute thread index
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index >= n) return;
+
+ int pow2tod = 1 << d;
+
+ if (index > pow2tod) {
+ writeBuffer[index] = readBuffer[index] + readBuffer[index - pow2tod];
+ }
+ else {
+ writeBuffer[index] = readBuffer[index];
+ }
+ }
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+ int blockSize = 128;
+ dim3 fullBlocksPerGrid{ (unsigned int) (n + blockSize - 1) / blockSize };
+
+ int* dev_buffer1;
+ int* dev_buffer2;
+
+ cudaMalloc((void**)&dev_buffer1, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_buffer1 failed!");
+ cudaMalloc((void**)&dev_buffer2, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_buffer2 failed!");
+
+ cudaMemcpy(dev_buffer2, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
timer().startGpuTimer();
- // TODO
+ StreamCompaction::Common::shiftArrayElements<<>>(n, 1, dev_buffer2, dev_buffer1);
+ checkCUDAError("shiftArrayElements failed!");
+ //cudaDeviceSynchronize();
+
+ for (int d = 0; d < ilog2(n); ++d) {
+ kernNaiveScanStep <<>>(n, d, dev_buffer1, dev_buffer2);
+ checkCUDAError("naiveScanStep failed!");
+ //cudaDeviceSynchronize();
+
+ std::swap(dev_buffer1, dev_buffer2);
+ }
+ // perform last step
+ if ((1 << ilog2(n)) != n) {
+ fullBlocksPerGrid.x = (n - (1 << ilog2(n)) + blockSize - 1) / blockSize;
+ kernHandleNonPower<<>>(n, ilog2(n), dev_buffer1);
+ checkCUDAError("handleNonPower failed!");
+ cudaDeviceSynchronize();
+ }
timer().endGpuTimer();
+
+ cudaMemcpy(odata, dev_buffer1, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+ cudaFree(dev_buffer1);
+ cudaFree(dev_buffer2);
}
}
-}
+}
\ No newline at end of file
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..32a3a73 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,21 @@ namespace StreamCompaction {
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+ int* dev_buffer;
+ thrust::device_ptr dev_thrustBuffer;
+ cudaMalloc((void**)&dev_buffer, n * sizeof(int));
+ checkCUDAError("cudaMalloc dev_buffer failed!");
+ dev_thrustBuffer = thrust::device_ptr(dev_buffer);
+
+ cudaMemcpy(dev_buffer, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
timer().startGpuTimer();
- // TODO use `thrust::exclusive_scan`
- // example: for device_vectors dv_in and dv_out:
- // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+ thrust::exclusive_scan(dev_thrustBuffer, dev_thrustBuffer + n, dev_thrustBuffer);
timer().endGpuTimer();
+
+ cudaMemcpy(odata, dev_buffer, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+ cudaFree(dev_buffer);
}
}
}