diff --git a/README.md b/README.md
index b71c458..fa99094 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,94 @@ CUDA Stream Compaction
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Daniel Krupka
+* Tested on: Debian testing (stretch), Intel(R) Core(TM) i7-4710HQ CPU @ 2.50GHz 8GB, GTX 850M
-### (TODO: Your README)
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+# Project 2 - Stream Compaction
+This project's goal was to compare various methods for achieving [stream compaction](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html).
+The test program was modified to take block size and array size as arguments, and ran tests for each algorithm on both
+power-of-two and non-power-of-two data. Test output was the following:
+```
+****************
+** SCAN TESTS **
+****************
+ [ 33 36 27 15 43 35 36 42 49 21 12 27 40 ... 6 0 ]
+==== cpu scan, power-of-two ====
+ [ 0 33 69 96 111 154 189 225 267 316 337 349 376 ... 12852633 12852639 ]
+==== cpu scan, non-power-of-two ====
+ [ 0 33 69 96 111 154 189 225 267 316 337 349 376 ... 12852608 12852617 ]
+ passed
+==== naive scan, power-of-two ====
+ passed
+==== naive scan, non-power-of-two ====
+ passed
+==== work-efficient scan, power-of-two ====
+ passed
+==== work-efficient scan, non-power-of-two ====
+ passed
+==== real work-efficient scan, power-of-two ====
+ passed
+==== real work-efficient scan, non-power-of-two ====
+ passed
+==== thrust scan, power-of-two ====
+ passed
+==== thrust scan, non-power-of-two ====
+ passed
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 3 2 1 3 1 3 2 0 1 1 2 3 2 ... 0 0 ]
+==== cpu compact without scan, power-of-two ====
+ [ 3 2 1 3 1 3 2 1 1 2 3 2 3 ... 1 3 ]
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ [ 3 2 1 3 1 3 2 1 1 2 3 2 3 ... 3 1 ]
+ passed
+==== cpu compact with scan ====
+ passed
+==== work-efficient compact, power-of-two ====
+ passed
+==== work-efficient compact, non-power-of-two ====
+ passed
+==== real work-efficient compact, power-of-two ====
+ passed
+==== real work-efficient compact, non-power-of-two ====
+ passed
+```
+
+# Analysis - Scanning
+A major step of compaction is scanning. I tested a CPU implementation, a naive CUDA
+implementation, two efficient CUDA implementations, and the Thrust library's implementation.
+
+
+
+Interestingly, the Thrust implementation fared the worst, though Nvidia's NSight profiler
+showed that Thrust was not actually using much GPU time. A likely explanation is that Thrust
+may be shuffling data or partitioning the work between CPU and GPU.
+
+
+
+
+Looking at the other implementations on their own shows that the GPU implementations are substantially
+faster than the CPU for large workloads, but somewhat slower on small ones. This makes sense, as for small loads,
+the GPU is nowhere near fully saturated.
+
+# Analysis - Compaction
+Moving on to compaction, the CPU fares even worse.
+
+
+Focusing on only the GPU implementations, we see that the more optimized version
+begins to perform noticeably better, where the two were mostly indistinguishable
+for simple scanning.
+
+
+For the non-optimized GPU reduction, block size had a substantial effect, with
+a 1024 thread block performing 1.3x faster than with 128 threads.
+
+
+The optimized GPU reduction showed much less variance with block size. This likely due to
+the active threads no longer being scattered between different warps, allowing for more early
+termination and lowering the number of active warps and thus blocks.
+
diff --git a/cis565_stream_compaction_test.launch b/cis565_stream_compaction_test.launch
index 4267429..07b70cd 100644
--- a/cis565_stream_compaction_test.launch
+++ b/cis565_stream_compaction_test.launch
@@ -8,8 +8,8 @@
-
-
+
+
@@ -18,8 +18,8 @@
-
+
diff --git a/images/times_all_comp.png b/images/times_all_comp.png
new file mode 100644
index 0000000..afe4110
Binary files /dev/null and b/images/times_all_comp.png differ
diff --git a/images/times_all_comp_zoom.png b/images/times_all_comp_zoom.png
new file mode 100644
index 0000000..3939439
Binary files /dev/null and b/images/times_all_comp_zoom.png differ
diff --git a/images/times_blk256.png b/images/times_blk256.png
new file mode 100644
index 0000000..fbe3097
Binary files /dev/null and b/images/times_blk256.png differ
diff --git a/images/times_blk256_nothrust.png b/images/times_blk256_nothrust.png
new file mode 100644
index 0000000..81d947c
Binary files /dev/null and b/images/times_blk256_nothrust.png differ
diff --git a/images/times_blk256_nothrust_zoom.png b/images/times_blk256_nothrust_zoom.png
new file mode 100644
index 0000000..2fa02b1
Binary files /dev/null and b/images/times_blk256_nothrust_zoom.png differ
diff --git a/images/times_blk_eff.png b/images/times_blk_eff.png
new file mode 100644
index 0000000..2eee8e3
Binary files /dev/null and b/images/times_blk_eff.png differ
diff --git a/images/times_blk_naive.png b/images/times_blk_naive.png
new file mode 100644
index 0000000..642d376
Binary files /dev/null and b/images/times_blk_naive.png differ
diff --git a/images/times_blk_realeff.png b/images/times_blk_realeff.png
new file mode 100644
index 0000000..dd91f8f
Binary files /dev/null and b/images/times_blk_realeff.png differ
diff --git a/images/times_cpu_comp.png b/images/times_cpu_comp.png
new file mode 100644
index 0000000..ae073ee
Binary files /dev/null and b/images/times_cpu_comp.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 675da35..7f515f8 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -10,13 +10,30 @@
#include
#include
#include
+#include
#include
#include "testing_helpers.hpp"
+#include
+#include
+
+
int main(int argc, char* argv[]) {
- const int SIZE = 1 << 8;
- const int NPOT = SIZE - 3;
- int a[SIZE], b[SIZE], c[SIZE];
+ double t1,t2;
+
+ int sizeExp = 19;
+ int blkSize = 256;
+ if (argc >= 3) {
+ sizeExp = atoi(argv[1]);
+ blkSize = atoi(argv[2]);
+ }
+ int SIZE = 1 << sizeExp;
+ int NPOT = SIZE - 3;
+ int *a = new int[SIZE], *b = new int[SIZE], *c = new int[SIZE];
+
+ StreamCompaction::Naive::blkSize = blkSize;
+ StreamCompaction::Efficient::blkSize = blkSize;
+ StreamCompaction::RealEfficient::blkSize = blkSize;
// Scan tests
@@ -33,48 +50,72 @@ int main(int argc, char* argv[]) {
printDesc("cpu scan, power-of-two");
StreamCompaction::CPU::scan(SIZE, b, a);
printArray(SIZE, b, true);
+ double tCpuScanPot = StreamCompaction::CPU::last_runtime;
zeroArray(SIZE, c);
printDesc("cpu scan, non-power-of-two");
StreamCompaction::CPU::scan(NPOT, c, a);
printArray(NPOT, b, true);
printCmpResult(NPOT, b, c);
+ double tCpuScanNpot = StreamCompaction::CPU::last_runtime;
zeroArray(SIZE, c);
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);
+ double tNaiveScanPot = StreamCompaction::Naive::last_runtime;
zeroArray(SIZE, c);
printDesc("naive scan, non-power-of-two");
StreamCompaction::Naive::scan(NPOT, c, a);
//printArray(SIZE, c, true);
printCmpResult(NPOT, b, c);
+ double tNaiveScanNpot = StreamCompaction::Naive::last_runtime;
zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);
+ double tEffScanPot = StreamCompaction::Efficient::last_runtime;
zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
+ double tEffScanNpot = StreamCompaction::Efficient::last_runtime;
+
+
+ zeroArray(SIZE, c);
+ printDesc("real work-efficient scan, power-of-two");
+ StreamCompaction::RealEfficient::scan(SIZE, c, a);
+ //printArray(SIZE, c, true);
+ printCmpResult(SIZE, b, c);
+ double tRealEffScanPot = StreamCompaction::RealEfficient::last_runtime;
+
+ zeroArray(SIZE, c);
+ printDesc("real work-efficient scan, non-power-of-two");
+ StreamCompaction::RealEfficient::scan(NPOT, c, a);
+ //printArray(NPOT, c, true);
+ printCmpResult(NPOT, b, c);
+ double tRealEffScanNpot = StreamCompaction::RealEfficient::last_runtime;
+
zeroArray(SIZE, c);
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);
+ double tThrustScanPot = StreamCompaction::Thrust::last_runtime;
zeroArray(SIZE, c);
printDesc("thrust scan, non-power-of-two");
StreamCompaction::Thrust::scan(NPOT, c, a);
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
+ double tThrustScanNpot = StreamCompaction::Thrust::last_runtime;
printf("\n");
printf("*****************************\n");
@@ -95,6 +136,7 @@ int main(int argc, char* argv[]) {
expectedCount = count;
printArray(count, b, true);
printCmpLenResult(count, expectedCount, b, b);
+ double tCpuCompNoscanPot = StreamCompaction::CPU::last_runtime;
zeroArray(SIZE, c);
printDesc("cpu compact without scan, non-power-of-two");
@@ -102,22 +144,51 @@ int main(int argc, char* argv[]) {
expectedNPOT = count;
printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);
+ double tCpuCompNoscanNpot = StreamCompaction::CPU::last_runtime;
zeroArray(SIZE, c);
printDesc("cpu compact with scan");
count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
- printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);
+ double tCpuCompScanPot = StreamCompaction::CPU::last_runtime;
zeroArray(SIZE, c);
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
//printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);
+ double tEffCompScanPot = StreamCompaction::Efficient::last_runtime;
zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);
+ double tEffCompScanNpot = StreamCompaction::Efficient::last_runtime;
+
+ zeroArray(SIZE, c);
+ printDesc("real work-efficient compact, power-of-two");
+ count = StreamCompaction::RealEfficient::compact(SIZE, c, a);
+ //printArray(count, c, true);
+ printCmpLenResult(count, expectedCount, b, c);
+ double tRealEffCompScanPot = StreamCompaction::RealEfficient::last_runtime;
+
+ zeroArray(SIZE, c);
+ printDesc("real work-efficient compact, non-power-of-two");
+ count = StreamCompaction::RealEfficient::compact(NPOT, c, a);
+ //printArray(count, c, true);
+ printCmpLenResult(count, expectedNPOT, b, c);
+ double tRealEffCompScanNpot = StreamCompaction::RealEfficient::last_runtime;
+
+ fprintf(stderr, "[%d, %d, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f, %.3f]\n",
+ SIZE, blkSize,
+ tCpuScanPot, tNaiveScanPot, tEffScanPot, tRealEffScanPot, tThrustScanPot,
+ tCpuCompNoscanPot, tCpuCompScanPot, tEffCompScanPot, tRealEffCompScanPot
+ );
+
+ delete a;
+ delete b;
+ delete c;
+
+ return 0;
}
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..de726ec 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -7,11 +7,13 @@ set(SOURCE_FILES
"naive.cu"
"efficient.h"
"efficient.cu"
+ "real_efficient.h"
+ "real_efficient.cu"
"thrust.h"
"thrust.cu"
)
cuda_add_library(stream_compaction
${SOURCE_FILES}
- OPTIONS -arch=sm_20
+ OPTIONS -arch=sm_50
)
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e600c29..6f952b6 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -4,12 +4,22 @@
namespace StreamCompaction {
namespace CPU {
+double last_runtime;
+
/**
* CPU scan (prefix sum).
*/
void scan(int n, int *odata, const int *idata) {
- // TODO
- printf("TODO\n");
+ double t1 = clock();
+
+ int t = 0;
+ for (int i = 0; i < n; i++) {
+ odata[i] = t;
+ t += idata[i];
+ }
+
+ double t2 = clock();
+ last_runtime = 1.0E6*(t2-t1)/CLOCKS_PER_SEC;
}
/**
@@ -18,8 +28,19 @@ void scan(int n, int *odata, const int *idata) {
* @returns the number of elements remaining after compaction.
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
- // TODO
- return -1;
+ double t1 = clock();
+
+ int oIdx = 0;
+ for (int i = 0; i < n; i++) {
+ if (idata[i] != 0) {
+ odata[oIdx] = idata[i];
+ oIdx++;
+ }
+ }
+
+ double t2 = clock();
+ last_runtime = 1.0E6*(t2-t1)/CLOCKS_PER_SEC;
+ return oIdx;
}
/**
@@ -28,8 +49,30 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
- // TODO
- return -1;
+ double t1 = clock();
+
+ int *keep = new int[n];
+ for (int i = 0; i < n; i++) {
+ keep[i] = (idata[i] != 0) ? 1 : 0;
+ }
+
+ int *keepScan = new int[n];
+ int nKeep = 0;
+ scan(n, keepScan, keep);
+ for (int i = 0; i < n; i++) {
+ if (!keep[i])
+ continue;
+
+ nKeep++;
+ odata[keepScan[i]] = idata[i];
+ }
+
+ double t2 = clock();
+ last_runtime = 1.0E6*(t2-t1)/CLOCKS_PER_SEC;
+
+ delete keepScan;
+ delete keep;
+ return nKeep;
}
}
diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h
index 6348bf3..7e87420 100644
--- a/stream_compaction/cpu.h
+++ b/stream_compaction/cpu.h
@@ -2,6 +2,8 @@
namespace StreamCompaction {
namespace CPU {
+ extern double last_runtime;
+
void scan(int n, int *odata, const int *idata);
int compactWithoutScan(int n, int *odata, const int *idata);
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index b2f739b..31ad328 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -6,14 +6,99 @@
namespace StreamCompaction {
namespace Efficient {
-// TODO: __global__
+double last_runtime;
+int blkSize = 256;
+
+// perform reduction
+__global__ void kernScanUp(int n, int dPow, int *data) {
+ int k = blockIdx.x * blockDim.x + threadIdx.x;
+ if (k % dPow != 0 || k + dPow - 1 >= n)
+ return;
+
+ data[k + dPow - 1] += data[k + dPow/2 - 1];
+}
+
+// perform reduction
+__global__ void kernScanDown(int n, int dPow, int *data) {
+ int k = blockIdx.x * blockDim.x + threadIdx.x;
+ if (k % dPow != 0 || k + dPow - 1 >= n)
+ return;
+
+ int t = data[k + dPow/2 - 1];
+ data[k + dPow/2 - 1] = data[k + dPow - 1];
+ data[k + dPow - 1] += t;
+}
+
+// mark nonzeroes
+__global__ void kernMark(int n, int *keep, const int *data) {
+ int k = blockIdx.x * blockDim.x + threadIdx.x;
+ if (k >= n)
+ return;
+
+ keep[k] = (data[k] != 0) ? 1 : 0;
+}
+
+__global__ void kernScatter(int n, int *out, const int *keep, const int *scan, const int *data) {
+ int k = blockIdx.x * blockDim.x + threadIdx.x;
+ if (k >= n)
+ return;
+
+ if (keep[k]) {
+ out[scan[k]] = data[k];
+ }
+}
+
+static int getPot(int n) {
+ unsigned int pot = n;
+ pot--;
+ pot |= pot >> 1;
+ pot |= pot >> 2;
+ pot |= pot >> 4;
+ pot |= pot >> 8;
+ pot |= pot >> 16;
+ pot++;
+
+ return pot;
+}
+
+static void devScanUtil(int n, int *devData) {
+ int pot = getPot(n);
+
+ dim3 blkDim(blkSize);
+
+ int dPow = 2;
+ while (dPow/2 < n) {
+ dim3 blkCnt((pot + blkDim.x - 1)/blkDim.x);
+ kernScanUp<<>>(pot, dPow, devData);
+ dPow *= 2;
+ }
+ cudaMemset(&devData[pot-1], 0, sizeof(int));
+
+ while (dPow > 1) {
+ dim3 blkCnt((pot + blkDim.x - 1)/blkDim.x);
+ kernScanDown<<>>(pot, dPow, devData);
+ dPow /= 2;
+ }
+}
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
- // TODO
- printf("TODO\n");
+ int pot = getPot(n);
+
+ int *devData;
+ cudaMalloc((void**)&devData, pot*sizeof(int));
+ cudaMemset(devData, 0, pot*sizeof(int));
+ cudaMemcpy(devData, idata, n*sizeof(int), cudaMemcpyHostToDevice);
+
+ double t1 = clock();
+ devScanUtil(n, devData);
+ double t2 = clock();
+ last_runtime = 1.0E6 * (t2-t1) / CLOCKS_PER_SEC;
+
+ cudaMemcpy(odata, devData, n*sizeof(int), cudaMemcpyDeviceToHost);
+ cudaFree(devData);
}
/**
@@ -26,8 +111,45 @@ void scan(int n, int *odata, const int *idata) {
* @returns The number of elements remaining after compaction.
*/
int compact(int n, int *odata, const int *idata) {
- // TODO
- return -1;
+ int pot = getPot(n);
+
+ // upload data
+ int *devData;
+ cudaMalloc((void**)&devData, n*sizeof(int));
+ cudaMemcpy(devData, idata, n*sizeof(int), cudaMemcpyHostToDevice);
+
+ dim3 blkDim(blkSize);
+ dim3 blkCnt((n + blkDim.x - 1)/blkDim.x);
+
+ // mark values to keep
+ int *devKeep, *devScan;
+ cudaMalloc((void**)&devKeep, pot*sizeof(int));
+ cudaMalloc((void**)&devScan, pot*sizeof(int));
+ cudaMemset(devKeep, 0, pot*sizeof(int));
+
+ double t1 = clock();
+ kernMark<<>>(n, devKeep, devData);
+ cudaMemcpy(devScan, devKeep, pot*sizeof(int), cudaMemcpyDeviceToDevice);
+
+ // scan boolean array
+ devScanUtil(n, devScan);
+ int nKeep;
+ cudaMemcpy(&nKeep, &devScan[pot-1], sizeof(int), cudaMemcpyDeviceToHost);
+
+ // scatter to output
+ int *devOut;
+ cudaMalloc((void**)&devOut, n*sizeof(int));
+ kernScatter<<>>(n, devOut, devKeep, devScan, devData);
+ double t2 = clock();
+ last_runtime = 1.0E6 * (t2-t1) / CLOCKS_PER_SEC;
+ cudaMemcpy(odata, devOut, nKeep*sizeof(int), cudaMemcpyDeviceToHost);
+
+ cudaFree(devOut);
+ cudaFree(devData);
+ cudaFree(devKeep);
+ cudaFree(devScan);
+
+ return nKeep;
}
}
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 395ba10..fb54bde 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -2,6 +2,9 @@
namespace StreamCompaction {
namespace Efficient {
+ extern double last_runtime;
+ extern int blkSize;
+
void scan(int n, int *odata, const int *idata);
int compact(int n, int *odata, const int *idata);
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 3d86b60..e89b742 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -6,14 +6,67 @@
namespace StreamCompaction {
namespace Naive {
-// TODO: __global__
+double last_runtime;
+int blkSize = 256;
+
+__global__ void kernScan(int n, int dPow, int *odata, const int *idata) {
+ int k = blockIdx.x * blockDim.x + threadIdx.x;
+ if (k >= n)
+ return;
+
+ if (k >= dPow)
+ odata[k] = idata[k - dPow] + idata[k];
+ else
+ odata[k] = idata[k];
+}
+
+__global__ void kernInclToExcl(int n, int *odata, const int *idata) {
+ int k = blockIdx.x * blockDim.x + threadIdx.x;
+ if (k >= n-1)
+ return;
+
+ odata[k+1] = idata[k];
+};
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
- // TODO
- printf("TODO\n");
+ int dPow = 1, dLogPow = 0;
+ int *devData[2];
+ cudaMalloc((void**)&devData[0], n*sizeof(int));
+ cudaMalloc((void**)&devData[1], n*sizeof(int));
+ cudaMemcpy(devData[0], idata, n*sizeof(int), cudaMemcpyHostToDevice);
+ cudaMemset(devData[1], 0, n*sizeof(int));
+
+ double t1 = clock();
+
+ dim3 blkDim(blkSize);
+ dim3 blkCnt((n + blkDim.x - 1)/blkDim.x);
+
+ int dst, src;
+ while (dPow/2 < n) {
+ src = dLogPow % 2;
+ dst = 1 - src;
+ kernScan<<>>(n, dPow, devData[dst], devData[src]);
+ dPow *= 2;
+ dLogPow++;
+ cudaDeviceSynchronize();
+ }
+
+ src = dLogPow % 2;
+ dst = 1 - src;
+ kernInclToExcl<<>>(n, devData[dst], devData[src]);
+ cudaDeviceSynchronize();
+
+ double t2=clock();
+ last_runtime = 1.0E6 * (t2-t1) / CLOCKS_PER_SEC;
+
+ cudaMemcpy(odata, devData[dst], n*sizeof(int), cudaMemcpyDeviceToHost);
+ odata[0] = 0;
+
+ cudaFree(devData[0]);
+ cudaFree(devData[1]);
}
}
diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h
index 21152d6..060df1d 100644
--- a/stream_compaction/naive.h
+++ b/stream_compaction/naive.h
@@ -2,6 +2,9 @@
namespace StreamCompaction {
namespace Naive {
+ extern double last_runtime;
+ extern int blkSize;
+
void scan(int n, int *odata, const int *idata);
}
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..b74f882 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -9,6 +9,8 @@
namespace StreamCompaction {
namespace Thrust {
+double last_runtime;
+
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
@@ -16,6 +18,14 @@ void scan(int n, int *odata, const int *idata) {
// TODO use `thrust::exclusive_scan`
// example: for device_vectors dv_in and dv_out:
// thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+ thrust::device_vector devIn(idata, idata+n), devOut(n);
+ double t1 = clock();
+ thrust::exclusive_scan(devIn.begin(), devIn.end(), devOut.begin());
+ double t2 = clock();
+ last_runtime = 1.0E6 * (t2-t1) / CLOCKS_PER_SEC;
+
+ thrust::copy(devOut.begin(), devOut.end(), odata);
}
}
diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h
index 06707f3..e59174f 100644
--- a/stream_compaction/thrust.h
+++ b/stream_compaction/thrust.h
@@ -2,6 +2,8 @@
namespace StreamCompaction {
namespace Thrust {
+ extern double last_runtime;
+
void scan(int n, int *odata, const int *idata);
}
}