diff --git a/README.md b/README.md
index b71c458..7318caf 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,99 @@ CUDA Stream Compaction
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Xiaomao Ding
+* Tested on: Windows 8.1, i7-4700MQ @ 2.40GHz 8.00GB, GT 750M 2047MB (Personal Computer)
-### (TODO: Your README)
+# Intro
+The code in this repo implements stream compaction and scan algorithms on the GPU in CUDA, as well as on the CPU in C++ for performance comparisons. The scan algorithm performs a parallel prefix sum on the GPU. For more information, see this [NVIDIA GPU Gems 3 chapter](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html).
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+![Image of Prefix Sum](http://http.developer.nvidia.com/GPUGems3/elementLinks/39fig02.jpg)
+*Image from NVIDIA*
+
+# Performance Analysis
+The sections below discuss the performance of the algorithms in this repository.
+
+### Optimal Block Size
+Each GPU algorithm was tested using an array of 2^14 integers. The optimal block size was found to be 128-256, as shown below. All calculations following this section are done with block size 128. Performance was timed using CUDA events.
+
+| Block Size | Naive GPU scan (ms) | Efficient GPU scan (ms) | Efficient GPU Compaction (ms)|
+| :------------- |-------------:| -----:|-----:|
+| 64 | 0.124 | 0.527 |0.531 |
+| 128 | 0.094 | 0.484 |0.412 |
+| 256 | 0.095 | 0.473 |0.423 |
+| 512 | 0.102 | 0.471 |0.454 |
+| 1024 | 0.109 | 0.495 |0.487 |
+
+![Plot of block size](https://github.com/xnieamo/Project2-Stream-Compaction/blob/master/images/blockSizePlot.png)
+
+### Performance comparisons
+This section compares the various implementations of scan and stream compaction in this repository. All times are in milliseconds. I get a stack overflow error when trying to run the algorithms with more than 2^16 array entries (likely because `main` allocates the test arrays `a`, `b`, and `c` directly on the stack), so 2^16 is the maximum array size presented here.
+
+| Array Size | CPU scan | Naive GPU scan | Efficient GPU scan | CPU Compact w/o scan | CPU compact w/ scan | Efficient GPU compact | Thrust |
+|:------|-----------:|----------------:|---------------------:|----------------------:|---------------------:|-----------------------:|--------:|
+| 2^12 | 0.015623 | 0.051032 | 0.298 | 0.0090072 | 0.0312529 | 0.263 | 0.352 |
+| 2^14 | 0.062499 | 0.0928 | 0.422 | 0.0468755 | 0.1716863 | 0.425 | 0.502 |
+| 2^16 | 0.2343767 | 0.342 | 1.15 | 0.250018 | 0.6718685 | 1.127 | 1.325 |
+
+![Plot of various runtimes](https://github.com/xnieamo/Project2-Stream-Compaction/blob/master/images/performanceChart.png)
+
+Because we are implementing the work-efficient algorithm described in GPU Gems without any optimizations, it actually runs SLOWER than both the CPU and naive GPU scans! Looking at the NVIDIA Nsight runtime analysis, it appears that the thrust implementation uses asynchronous memory transfer, which seems to allow the CPU to call functions while a kernel is running. Surprisingly, the thrust implementation is still slower than the efficient GPU implementation (runtime was taken from the Nsight analysis, discounting the initial and final memcpy operations).
+
+In the case of the work-efficient algorithm, one of the issues that affects runtime is that many threads idle as the upsweep and downsweep progress: a full grid of threads is launched at every depth, but the `(index % d)` check leaves only a shrinking fraction of them doing useful work.
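+
+As a concrete illustration, here is a hypothetical, untested sketch of an up-sweep kernel that launches only as many threads as there is work at each depth. The name `kernUpSweepCompact` and the `numActive` parameter are made up for this example; the kernel this repo actually uses appears in `efficient.cu` below.
+
+```
+// One thread per active tree node instead of one thread per array element.
+// stride is the segment width at this depth (2 * d in the repo's notation),
+// and only numActive = n / stride threads are launched in total.
+__global__ void kernUpSweepCompact(int numActive, int *data, int stride) {
+    int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (tid >= numActive) return;
+    int right = (tid + 1) * stride - 1;      // last element of this thread's segment
+    data[right] += data[right - stride / 2]; // add the left half's partial sum
+}
+// Host side: for (int stride = 2; stride <= n; stride *= 2)
+//     kernUpSweepCompact<<<(n / stride + blockSize - 1) / blockSize, blockSize>>>(n / stride, data, stride);
+```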
+
+Aside from that, a main bottleneck in my implementation is memory transfer between host and device. In the stream compaction algorithm, there is a need to set the last index to 0. Instead of doing this via a kernel, I transfer the data back to the host. This results in an expensive memory transfer and adds roughly 0.100 ms to the runtime. Another bottleneck, which seems to take about as long as the calculation itself, is the cudaLaunch function, the CUDA runtime call that submits each kernel launch to the GPU. Setting the last index to 0 on the GPU instead would remove both the extra transfer and its launch overhead, and might save me 25% of my runtime!
+
+With the naive GPU scan, there aren't really many addressable bottlenecks. The calculation just takes that long.
+
+For the CPU implementations, the compaction without scan runs faster for this particular project, as it only needs to perform a single comparison operation per element. The version with scan adds a large amount of unnecessary calculation (on the CPU), which makes it run much slower. This shows that GPU and CPU algorithms, and the way we should think about implementing code on these machines, differ by quite a lot!
+
+### Program output
+Finally, here is the output of the various tests to validate my implementations, using an array of 2^16 elements. They all pass, woohoo!
+
+```
+
+****************
+** SCAN TESTS **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 35 0 ]
+==== cpu scan, power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ]
+==== cpu scan, non-power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ]
+    passed
+==== naive scan, power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ]
+    passed
+==== naive scan, non-power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 0 0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ]
+    passed
+==== thrust scan, power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ]
+    passed
+==== thrust scan, non-power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ]
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 1 0 ]
+==== cpu compact without scan, power-of-two ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 1 ]
+    passed
+==== cpu compact with scan ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+    passed
+==== work-efficient compact, non-power-of-two ====
+    passed
+```
diff --git a/images/blockSizePlot.png b/images/blockSizePlot.png
new file mode 100644
index 0000000..2345b9e
Binary files /dev/null and b/images/blockSizePlot.png differ
diff --git a/images/performanceChart.png b/images/performanceChart.png
new file mode 100644
index 0000000..a0da332
Binary files /dev/null and b/images/performanceChart.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 675da35..04ce6bb 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -7,6 +7,8 @@
  */
 #include
+#include <chrono>
+#include <iostream>
 #include
 #include
 #include
@@ -14,12 +16,11 @@
 #include "testing_helpers.hpp"

 int main(int argc, char* argv[]) {
-    const int SIZE = 1 << 8;
+    const int SIZE = 1 << 16;
     const int NPOT = SIZE - 3;
     int a[SIZE], b[SIZE], c[SIZE];

     // Scan tests
-
     printf("\n");
     printf("****************\n");
     printf("** SCAN TESTS **\n");
@@ -31,7 +32,13 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, b);
     printDesc("cpu scan, power-of-two");
-    StreamCompaction::CPU::scan(SIZE, b, a);
+    //auto begin = std::chrono::high_resolution_clock::now();
+    //for (int i = 0; i < 1000; i++){
+        StreamCompaction::CPU::scan(SIZE, b, a);
+    //}
+    //auto end = std::chrono::high_resolution_clock::now();
+    //std::cout << std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count() << "ns" << std::endl;
+
     printArray(SIZE, b, true);

     zeroArray(SIZE, c);
@@ -43,37 +50,37 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);

     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);

     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);

     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);

     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);

     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);

     printf("\n");
@@ -91,7 +98,14 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, b);
     printDesc("cpu compact without scan, power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+
+    //begin = std::chrono::high_resolution_clock::now();
+    //for (int i = 0; i < 1000; i++){
+        count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+    //}
+    //end = std::chrono::high_resolution_clock::now();
+    //std::cout << std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count() << "ns" << std::endl;
+
     expectedCount = count;
     printArray(count, b, true);
     printCmpLenResult(count, expectedCount, b, b);
@@ -105,7 +119,15 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, c);
     printDesc("cpu compact with scan");
-    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+
+
+    //begin = std::chrono::high_resolution_clock::now();
+    //for (int i = 0; i < 1000; i++){
+        count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+    //}
+    //end = std::chrono::high_resolution_clock::now();
+    //std::cout << std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count() << "ns" << std::endl;
+
     printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..e31ca3c 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_30
     )
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index fe872d4..5f9bef4 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -1,39 +1,48 @@
 #include "common.h"

 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
-    cudaError_t err = cudaGetLastError();
-    if (cudaSuccess == err) {
-        return;
-    }
-
-    fprintf(stderr, "CUDA error");
-    if (file) {
-        fprintf(stderr, " (%s:%d)", file, line);
-    }
-    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
+	cudaError_t err = cudaGetLastError();
+	if (cudaSuccess == err) {
+		return;
+	}
+
+	fprintf(stderr, "CUDA error");
+	if (file) {
+		fprintf(stderr, " (%s:%d)", file, line);
+	}
+	fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+	exit(EXIT_FAILURE);
 }

 namespace StreamCompaction {
-namespace Common {
-
-/**
- * Maps an array to an array of 0s and 1s for stream compaction. Elements
- * which map to 0 will be removed, and elements which map to 1 will be kept.
- */
-__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-    // TODO
-}
+    namespace Common {

-/**
- * Performs scatter on an array. That is, for each element in idata,
- * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
- */
-__global__ void kernScatter(int n, int *odata,
-        const int *idata, const int *bools, const int *indices) {
-    // TODO
-}
+        /**
+         * Maps an array to an array of 0s and 1s for stream compaction. Elements
+         * which map to 0 will be removed, and elements which map to 1 will be kept.
+         */

-}
+        __global__ void kernMapToBoolean(int n, int *bools, const int *idata){
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index >= n) return;
+
+            // Keep nonzero elements; zeros are marked for removal
+            bools[index] = 1;
+            if (idata[index] == 0) bools[index] = 0;
+        }
+
+
+        /**
+         * Performs scatter on an array. That is, for each element in idata,
+         * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
+         */
+        __global__ void kernScatter(int n, int *odata,
+                const int *idata, const int *bools, const int *indices) {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index >= n) return;
+
+            if (bools[index] == 1) odata[indices[index]] = idata[index];
+        }
+
+    }
}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 4f52663..aec8363 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -3,6 +3,8 @@
 #include
 #include
 #include
+#include
+#include

 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e600c29..fb52d56 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -2,35 +2,79 @@
 #include "cpu.h"

 namespace StreamCompaction {
-namespace CPU {
-
-/**
- * CPU scan (prefix sum).
- */
-void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
-}
+    namespace CPU {

-/**
- * CPU stream compaction without using the scan function.
- *
- * @returns the number of elements remaining after compaction.
- */
-int compactWithoutScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
-}
+        /**
+         * CPU scan (prefix sum).
+         */
+        void scan(int n, int *odata, const int *idata) {
+            // An exclusive scan always starts at 0
+            odata[0] = 0;

-/**
- * CPU stream compaction using scan and scatter, like the parallel version.
- *
- * @returns the number of elements remaining after compaction.
- */
-int compactWithScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
-}
+            // Start the loop at the second element. The exclusive prefix sum at x is
+            // the previous output (the running total) plus the previous input element.
+            for (int x = 1; x < n; x++){
+                odata[x] = idata[x - 1] + odata[x - 1];
+            }

-}
+        }
+
+        /**
+         * CPU stream compaction without using the scan function.
+         *
+         * @returns the number of elements remaining after compaction.
+         */
+        int compactWithoutScan(int n, int *odata, const int *idata) {
+
+            // Set the initial count of elements to 0. Also start an index tracker for the output array.
+            int numberOfNonZeroElements = 0;
+            int outIdx = 0;
+
+            // Loop over each element in the input array
+            for (int x = 0; x < n; x++){
+                if (idata[x] != 0){
+                    // If the value is nonzero, put it into the output array. Increment the trackers as necessary.
+                    odata[outIdx] = idata[x];
+                    outIdx++;
+                    numberOfNonZeroElements++;
+                }
+            }
+
+            return numberOfNonZeroElements;
+        }
+
+        /**
+         * CPU stream compaction using scan and scatter, like the parallel version.
+         *
+         * @returns the number of elements remaining after compaction.
+         */
+        int compactWithScan(int n, int *odata, const int *idata) {
+            // Allocate a temporary array and set each entry to 1 if the corresponding entry
+            // in idata should be kept, 0 otherwise.
+            int *tempArray = new int[n];
+            for (int x = 0; x < n; x++){
+                if (idata[x] == 0) tempArray[x] = 0;
+                else tempArray[x] = 1;
+            }
+
+            // Run scan on tempArray
+            int *scanResults = new int[n];
+            scan(n, scanResults, tempArray);
+
+            // Scatter results into odata. Also keep track of the number of elements added.
+            int numberOfNonZeroElements = 0;
+            for (int x = 0; x < n; x++){
+                if (tempArray[x] == 1) {
+                    odata[scanResults[x]] = idata[x];
+                    numberOfNonZeroElements++;
+                }
+            }
+
+            // Free the temporary arrays. Note: `delete[] tempArray, scanResults;` would
+            // only free tempArray because of the comma operator, so delete each separately.
+            delete[] tempArray;
+            delete[] scanResults;
+
+            return numberOfNonZeroElements;
+        }
+
+    }
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index b2f739b..a989c0e 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,33 +2,166 @@
 #include
 #include "common.h"
 #include "efficient.h"
+#include <cmath>
+
+#define blockSize 128

 namespace StreamCompaction {
-namespace Efficient {
+    namespace Efficient {

-// TODO: __global__
+        __global__ void kernUpSweep(int n, int *data, int d){
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index >= n) return;

-/**
- * Performs prefix-sum (aka scan) on idata, storing the result into odata.
- */
-void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
-}
+            // Only the first thread of each d-wide segment does work: it adds the
+            // left half's partial sum into the last element of the segment.
+            if ((index % d) > 0) return;
+            data[index + d - 1] += data[index + d / 2 - 1];
+        }

-/**
- * Performs stream compaction on idata, storing the result into odata.
- * All zeroes are discarded.
- *
- * @param n The number of elements in idata.
- * @param odata The array into which to store elements.
- * @param idata The array of elements to compact.
- * @returns The number of elements remaining after compaction.
- */
-int compact(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
-}
+        __global__ void kernDownSweep(int n, int *data, int d){
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index >= n) return;

-}
+            // Down-sweep step: the left child takes the parent's value, and the
+            // right child takes parent + old left child (d is the segment width).
+            if ((index % d) > 0) return;
+            int t = data[index + d / 2 - 1];
+            data[index + d / 2 - 1] = data[index + d - 1];
+            data[index + d - 1] += t;
+        }
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int *odata, const int *idata) {
+
+            // Pad and resize idata into a temporary array if n is not a power of 2
+            int powerOf2Size = (int)std::ceil(std::log2(n));
+            int newN = (int)std::pow(2, powerOf2Size);
+            int *temp = new int[newN];
+            for (int x = 0; x < n; x++){
+                temp[x] = idata[x];
+            }
+            // Zero the padding so the scan reads defined values
+            for (int x = n; x < newN; x++){
+                temp[x] = 0;
+            }
+
+            dim3 fullBlocksPerGrid((newN + blockSize - 1) / blockSize);
+
+            // Create GPU array pointers
+            int *dev_data;
+
+            // Allocate GPU space
+            cudaMalloc((void**)&dev_data, newN * sizeof(int));
+            checkCUDAErrorFn("Failed to allocate dev_data");
+
+            cudaMemcpy(dev_data, temp, sizeof(int)*newN, cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("Failed to copy dev_data");
+
+            //cudaEvent_t start, stop;
+            //cudaEventCreate(&start);
+            //cudaEventCreate(&stop);
+            //cudaEventRecord(start);
+            // Perform scan
+            for (int x = 1; x < newN; x *= 2) {
+                kernUpSweep<<<fullBlocksPerGrid, blockSize>>>(newN, dev_data, 2 * x);
+            }
+            cudaMemcpy(temp, dev_data, sizeof(int)*newN, cudaMemcpyDeviceToHost);
+            temp[newN - 1] = 0;
+            cudaMemcpy(dev_data, temp, sizeof(int)*newN, cudaMemcpyHostToDevice);
+            for (int x = newN / 2; x > 0; x /= 2) {
+                kernDownSweep<<<fullBlocksPerGrid, blockSize>>>(newN, dev_data, 2 * x);
+            }
+
+            //cudaEventRecord(stop);
+
+            //cudaEventSynchronize(stop);
+            //float milliseconds = 0;
+            //cudaEventElapsedTime(&milliseconds, start, stop);
+            //std::cout << milliseconds << std::endl;
+
+            cudaMemcpy(temp, dev_data, sizeof(int)*newN, cudaMemcpyDeviceToHost);
+            for (int x = 0; x < n; x++){
+                odata[x] = temp[x];
+            }
+
+            cudaFree(dev_data);
+            delete[] temp;
+        }
+
+        /**
+         * Performs stream compaction on idata, storing the result into odata.
+         * All zeroes are discarded.
+         *
+         * @param n The number of elements in idata.
+         * @param odata The array into which to store elements.
+         * @param idata The array of elements to compact.
+         * @returns The number of elements remaining after compaction.
+         */
+        int compact(int n, int *odata, const int *idata) {
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            int *dev_bools;
+            int *dev_idata;
+            int *dev_odata;
+            int *dev_indices;
+
+            // Allocate GPU space
+            cudaMalloc((void**)&dev_bools, n * sizeof(int));
+            checkCUDAErrorFn("Failed to allocate dev_bools");
+
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAErrorFn("Failed to allocate dev_idata");
+
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAErrorFn("Failed to allocate dev_odata");
+
+            cudaMalloc((void**)&dev_indices, n * sizeof(int));
+            checkCUDAErrorFn("Failed to allocate dev_indices");
+
+            cudaMemcpy(dev_idata, idata, sizeof(int)*n, cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("Failed to copy dev_idata");
+
+            //cudaEvent_t start, stop;
+            //cudaEventCreate(&start);
+            //cudaEventCreate(&stop);
+            //cudaEventRecord(start);
+
+            Common::kernMapToBoolean<<<fullBlocksPerGrid, blockSize>>>(n, dev_bools, dev_idata);
+
+            int *bools = new int[n];
+
+            cudaMemcpy(bools, dev_bools, sizeof(int)*n, cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("Failed to copy bools");
+
+            scan(n, odata, bools);
+
+            // Find the number of elements. It is the last value in the indices array.
+            // If the last entry of bools is 1, we need to add 1: the exclusive scan
+            // only counts the elements before the last one, so its last value would
+            // otherwise be an index rather than the element count.
+            int numberOfElements = odata[n - 1];
+            if (bools[n - 1] == 1) numberOfElements++;
+
+            // Copy indices over
+            cudaMemcpy(dev_indices, odata, sizeof(int)*n, cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("Failed to copy dev_indices");
+
+            Common::kernScatter<<<fullBlocksPerGrid, blockSize>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
+
+            //cudaEventRecord(stop);
+
+            //cudaEventSynchronize(stop);
+            //float milliseconds = 0;
+            //cudaEventElapsedTime(&milliseconds, start, stop);
+            //std::cout << milliseconds << std::endl;
+
+            // Bring odata back
+            cudaMemcpy(odata, dev_odata, sizeof(int)*n, cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("Failed to copy dev_odata");
+
+            // Free memory
+            cudaFree(dev_bools);
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+            cudaFree(dev_indices);
+            delete[] bools;
+
+            return numberOfElements;
+        }
+
+    }
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 3d86b60..d84c608 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,19 +2,77 @@
 #include
 #include "common.h"
 #include "naive.h"
+#include <utility>
+#include <iostream>
+
+#define blockSize 128

 namespace StreamCompaction {
-namespace Naive {
+    namespace Naive {

-// TODO: __global__
+        __global__ void kernScanInnerLoop(int n, int *odata, int *idata, int d){
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) return;

-/**
- * Performs prefix-sum (aka scan) on idata, storing the result into odata.
- */
-void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
-}
+            // Each element at index >= d adds in the element d positions to its
+            // left; elements below d are already final and are copied through.
+            if (index >= d)
+                odata[index] = idata[index - d] + idata[index];
+            else
+                odata[index] = idata[index];

-}
+        }
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
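+         *
+         * This is a naive Hillis-Steele style scan: on pass d (d = 1, 2, 4, ...)
+         * each element adds the element d positions to its left, so the scan
+         * finishes after ceil(log2(n)) passes. The kernel produces an inclusive
+         * scan; the host then shifts the result right by one and inserts a
+         * leading 0 to make it exclusive.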
+         */
+        void scan(int n, int *odata, const int *idata) {
+
+            //cudaEvent_t start, stop;
+            //cudaEventCreate(&start);
+            //cudaEventCreate(&stop);
+
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            // Create GPU array pointers
+            int *dev_oData;
+            int *dev_iData;
+
+            // Allocate GPU space
+            cudaMalloc((void**)&dev_oData, n * sizeof(int));
+            cudaMalloc((void**)&dev_iData, n * sizeof(int));
+
+            // Copy data to GPU
+            cudaMemcpy(dev_iData, idata, sizeof(int)*n, cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("Failed to copy dev_iData");
+            cudaMemcpy(dev_oData, odata, sizeof(int)*n, cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("Failed to copy dev_oData");
+
+            //cudaEventRecord(start);
+            // Perform scan
+            for (int x = 1; x < n; x *= 2) {
+                kernScanInnerLoop<<<fullBlocksPerGrid, blockSize>>>(n, dev_oData, dev_iData, x);
+                std::swap(dev_oData, dev_iData);
+            }
+            //cudaEventRecord(stop);
+
+            //cudaEventSynchronize(stop);
+            //float milliseconds = 0;
+            //cudaEventElapsedTime(&milliseconds, start, stop);
+            //std::cout << milliseconds << std::endl;
+
+            // Swap back so dev_oData holds the final result
+            std::swap(dev_oData, dev_iData);
+
+            // Copy data back to CPU
+            cudaMemcpy(odata, dev_oData, sizeof(int)*n, cudaMemcpyDeviceToHost);
+
+            // Shift right to convert the inclusive scan into an exclusive one
+            for (int x = n - 1; x > 0; x--) odata[x] = odata[x - 1];
+            odata[0] = 0;
+
+            // Free memory on the GPU
+            cudaFree(dev_iData);
+            cudaFree(dev_oData);
+        }
+
+    }
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..278291f 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -16,6 +16,24 @@ void scan(int n, int *odata, const int *idata) {
     // TODO use `thrust::exclusive_scan`
     // example: for device_vectors dv_in and dv_out:
     // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+    thrust::host_vector<int> host_odata(n);
+    thrust::device_vector<int> dev_thrust_odata = host_odata;
+
+    int *dev_idata;
+    cudaMalloc((void**)&dev_idata, sizeof(int)*n);
+    checkCUDAErrorFn("Failed to allocate dev_idata");
+    cudaMemcpy(dev_idata, idata, sizeof(int)*n, cudaMemcpyHostToDevice);
+    checkCUDAErrorFn("Failed to copy dev_idata");
+
+    // Use a thrust pointer because the vector wasn't working for me
+    thrust::device_ptr<int> dev_thrust_idata(dev_idata);
+
+    // Scan and copy back
+    thrust::exclusive_scan(dev_thrust_idata, dev_thrust_idata + n, dev_thrust_odata.begin());
+    thrust::copy(dev_thrust_odata.begin(), dev_thrust_odata.end(), odata);
+
+    // Free mem
+    cudaFree(dev_idata);
 }
 }
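+
+// For reference, a minimal all-device_vector sketch of the same scan (untested
+// here; the device_ptr approach above is what this file actually uses):
+//     thrust::device_vector<int> dv_in(idata, idata + n);
+//     thrust::device_vector<int> dv_out(n);
+//     thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+//     thrust::copy(dv_out.begin(), dv_out.end(), odata);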