diff --git a/README.md b/README.md index a82ea0f..b6752a3 100644 --- a/README.md +++ b/README.md @@ -3,211 +3,86 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Xinyue Zhu +* Tested on: Windows 10, i5-5200U @ 2.20GHz 8GB, GTX 960M -### (TODO: Your README) - -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) - -Instructions (delete me) -======================== - -This is due Sunday, September 13 at midnight. - -**Summary:** In this project, you'll implement GPU stream compaction in CUDA, -from scratch. This algorithm is widely used, and will be important for -accelerating your path tracer project. - -Your stream compaction implementations in this project will simply remove `0`s -from an array of `int`s. In the path tracer, you will remove terminated paths -from an array of rays. - -In addition to being useful for your path tracer, this project is meant to -reorient your algorithmic thinking to the way of the GPU. On GPUs, many -algorithms can benefit from massive parallelism and, in particular, data -parallelism: executing the same code many times simultaneously with different -data. - -You'll implement a few different versions of the *Scan* (*Prefix Sum*) -algorithm. First, you'll implement a CPU version of the algorithm to reinforce -your understanding. Then, you'll write a few GPU implementations: "naive" and -"work-efficient." Finally, you'll use some of these to implement GPU stream -compaction. - -**Algorithm overview & details:** There are two primary references for details -on the implementation of scan and stream compaction. - -* The [slides on Parallel Algorithms](https://github.com/CIS565-Fall-2015/cis565-fall-2015.github.io/raw/master/lectures/2-Parallel-Algorithms.pptx) - for Scan, Stream Compaction, and Work-Efficient Parallel Scan. -* GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html). - -Your GPU stream compaction implementation will live inside of the -`stream_compaction` subproject. This way, you will be able to easily copy it -over for use in your GPU path tracer. - - -## Part 0: The Usual - -This project (and all other CUDA projects in this course) requires an NVIDIA -graphics card with CUDA capability. Any card with Compute Capability 2.0 -(`sm_20`) or greater will work. Check your GPU on this -[compatibility table](https://developer.nvidia.com/cuda-gpus). -If you do not have a personal machine with these specs, you may use those -computers in the Moore 100B/C which have supported GPUs. - -**HOWEVER**: If you need to use the lab computer for your development, you will -not presently be able to do GPU performance profiling. This will be very -important for debugging performance bottlenecks in your program. - -### Useful existing code - -* `stream_compaction/common.h` - * `checkCUDAError` macro: checks for CUDA errors and exits if there were any. - * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer. -* `main.cpp` - * Some testing code for your implementations. - - -## Part 1: CPU Scan & Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/cpu.cu`, implement: - -* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum. 
-* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using - the `scan` function. -* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan` - function. Map the input array to an array of 0s and 1s, scan it, and use - scatter to produce the output. You will need a **CPU** scatter implementation - for this (see slides or GPU Gems chapter for an explanation). - -These implementations should only be a few lines long. - - -## Part 2: Naive GPU Scan Algorithm - -In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan` - -This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. We haven't yet -taught shared memory, and you **shouldn't use it yet**. Example 39-1 uses -shared memory, but is limited to operating on very small arrays! Instead, write -this using global memory only. As a result of this, you will have to do -`ilog2ceil(n)` separate kernel invocations. - -Beware of errors in Example 39-1 in the book; both the pseudocode and the CUDA -code in the online version of Chapter 39 are known to have a few small errors -(in superscripting, missing braces, bad indentation, etc.) - -Since the parallel scan algorithm operates on a binary tree structure, it works -best with arrays with power-of-two length. Make sure your implementation works -on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory -- your intermediate array sizes will need to be rounded to the next power of -two. - - -## Part 3: Work-Efficient GPU Scan & Stream Compaction - -### 3.1. Scan - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::scan` - -All of the text in Part 2 applies. - -* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2. -* Beware of errors in Example 39-2. -* Test non-power-of-two sized arrays. - -### 3.2. Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::compact` - -For compaction, you will also need to implement the scatter algorithm presented -in the slides and the GPU Gems chapter. - -In `stream_compaction/common.cu`, implement these for use in `compact`: - -* `StreamCompaction::Common::kernMapToBoolean` -* `StreamCompaction::Common::kernScatter` - - -## Part 4: Using Thrust's Implementation - -In `stream_compaction/thrust.cu`, implement: - -* `StreamCompaction::Thrust::scan` - -This should be a very short function which wraps a call to the Thrust library -function `thrust::exclusive_scan(first, last, result)`. - -To measure timing, be sure to exclude memory operations by passing -`exclusive_scan` a `thrust::device_vector` (which is already allocated on the -GPU). You can create a `thrust::device_vector` by creating a -`thrust::host_vector` from the given pointer, then casting it. - - -## Part 5: Radix Sort (Extra Credit) (+10) - -Add an additional module to the `stream_compaction` subproject. Implement radix -sort using one of your scan implementations. Add tests to check its correctness. - - -## Write-up - -1. Update all of the TODOs at the top of this README. -2. Add a description of this project including a list of its features. -3. Add your performance analysis (see below). - -All extra credit features must be documented in your README, explaining its -value (with performance comparison, if applicable!) and showing an example how -it works. For radix sort, show how it is called and an example of its output. 
-
-Always profile with Release mode builds and run without debugging.
+### README
+The test output below is labeled with the number of the requirement each
+result corresponds to:
+
+* 1: CPU Scan & Compaction (cpu.cu)
+* 2: Naive Scan (naive.cu)
+* 3.1: Work-Efficient Scan (efficient.cu)
+* 3.2: Work-Efficient Compaction (efficient.cu)
+* 4: Thrust Scan (thrust.cu)
+
 ### Questions
+* Block size: changing the block size made little measurable difference in run
+  time for these array sizes.
+
+* Comparison of the GPU Scan implementations (Naive, Work-Efficient, and
+  Thrust) against the serial CPU version of Scan (the comparison harness is
+  not included in this repository):
+
+  ![](graph.png)
+
+* The Thrust implementation shows up separately on the Nsight timeline; its
+  kernels appear every few seconds.
+
+* `cudaMalloc` takes over 167439 us, more than half of the total run time, so
+  memory I/O is the bottleneck. The I/O time differs between implementations
+  because some of them need to allocate more device memory. (A CUDA-event
+  timing sketch follows the test output below.)
+
+* Output:
+
+```
+****************
+** SCAN TESTS **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 26 0 ]
+==== cpu scan, power-of-two ====
+ 1.1
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ]
+==== cpu scan, non-power-of-two ====
+ 1.1
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6146 6190 ]
+    passed
+==== naive scan, power-of-two ====
+ 2.1
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ]
+    passed
+==== naive scan, non-power-of-two ====
+ 2.1
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ]
+    passed
+==== work-efficient scan, power-of-two ====
+ 3.1
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+ 3.1
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6146 6190 ]
+    passed
+==== thrust scan, power-of-two ====
+ 4.1 passed
+==== thrust scan, non-power-of-two ====
+ 4.1 passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ]
+==== cpu compact without scan, power-of-two ====
+ 1.2
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+ 1.2
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 2 ]
+    passed
+==== cpu compact with scan ====
+ 1.3
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+ 3.2
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+ 3.2
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 2 ]
+    passed
+```
-* Roughly optimize the block sizes of each of your implementations for minimal
-  run time on your GPU.
-  * (You shouldn't compare unoptimized implementations to each other!)
-
-* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and
-  Thrust) to the serial CPU version of Scan. Plot a graph of the comparison
-  (with array size on the independent axis).
-  * You should use CUDA events for timing. Be sure **not** to include any
-    explicit memory operations in your performance measurements, for
-    comparability.
-  * To guess at what might be happening inside the Thrust implementation, take
-    a look at the Nsight timeline for its execution.
-
-* Write a brief explanation of the phenomena you see here.
-  * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is
-    it different for each implementation?
-
-* Paste the output of the test program into a triple-backtick block in your
-  README.
-  * If you add your own tests (e.g. for radix sort or to test additional corner
-    cases), be sure to mention it explicitly.
-
-These questions should help guide you in performance analysis on future
-assignments, as well.
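+The work-efficient scan in `efficient.cu` brackets its kernel launches with CUDA
+events. The helper below is a minimal sketch of that pattern, with allocation
+and host/device copies kept outside the timed region; `timeKernelsOnly` and
+`runKernels` are illustrative names, not code from this repository:
+
+```cpp
+#include <cuda_runtime.h>
+
+// Returns the elapsed GPU time, in ms, of the kernel launches issued by
+// runKernels(). cudaMalloc/cudaMemcpy done outside of it are not counted.
+float timeKernelsOnly(void (*runKernels)()) {
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    cudaEventRecord(start);
+    runKernels();                   // kernel launches only
+    cudaEventRecord(stop);
+
+    cudaEventSynchronize(stop);     // wait until the recorded work finishes
+    float ms = 0.0f;
+    cudaEventElapsedTime(&ms, start, stop);
+
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+    return ms;
+}
+```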
-
-## Submit
-
-If you have modified any of the `CMakeLists.txt` files at all (aside from the
-list of `SOURCE_FILES`), you must test that your project can build in Moore
-100B/C. Beware of any build issues discussed on the Google Group.
-
-1. Open a GitHub pull request so that we can see that you have finished.
-   The title should be "Submission: YOUR NAME".
-2. Send an email to the TA (gmail: kainino1+cis565@) with:
-   * **Subject**: in the form of `[CIS565] Project 2: PENNKEY`
-   * Direct link to your pull request on GitHub
-   * In the form of a grade (0-100+) with comments, evaluate your own
-     performance on the project.
-   * Feedback on the project itself, if any.
diff --git a/graph.png b/graph.png
new file mode 100644
index 0000000..acf2667
Binary files /dev/null and b/graph.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 675da35..90ec7b6 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -26,42 +26,43 @@ int main(int argc, char* argv[]) {
     printf("****************\n");
     genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
+    a[SIZE - 1] = 0;  // indices 0 .. SIZE-1, so the array holds SIZE elements
     printArray(SIZE, a, true);
 
     zeroArray(SIZE, b);
     printDesc("cpu scan, power-of-two");
     StreamCompaction::CPU::scan(SIZE, b, a);
     printArray(SIZE, b, true);
+
     zeroArray(SIZE, c);
     printDesc("cpu scan, non-power-of-two");
-    StreamCompaction::CPU::scan(NPOT, c, a);
+    StreamCompaction::CPU::scan(NPOT, c, a);  // NPOT == 253
     printArray(NPOT, b, true);
     printCmpResult(NPOT, b, c);
-
+    /////////////////////////////////////////////////////
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
@@ -112,12 +113,13 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
+    getchar();  // keep the console window open
 }
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index fe872d4..5893e1b 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -22,17 +22,41 @@ namespace Common {
  * Maps an array to an array of 0s and 1s for stream compaction. Elements
  * which map to 0 will be removed, and elements which map to 1 will be kept.
  */
-__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
     // TODO
+    int k = threadIdx.x + blockIdx.x * blockDim.x;
+    if (k >= n) {
+        return;
+    }
+    bools[k] = (idata[k] == 0) ? 0 : 1;
 }
 
 /**
  * Performs scatter on an array. That is, for each element in idata,
  * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
  */
-__global__ void kernScatter(int n, int *odata,
-        const int *idata, const int *bools, const int *indices) {
+__global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) {
     // TODO
+    // indices holds the exclusive scan of bools; n is the number of input elements.
+    int k = threadIdx.x + blockIdx.x * blockDim.x;
+    if (k >= n) {
+        return;
+    }
+    if (bools[k] == 1) {
+        odata[indices[k]] = idata[k];
+    }
 }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e600c29..2c9454f 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,36 +1,101 @@
 #include <cstdio>
 #include "cpu.h"
-
+#include <iostream>
 namespace StreamCompaction {
-namespace CPU {
+namespace CPU {
 
-/**
- * CPU scan (prefix sum).
- */
-void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
-}
+/**
+ * CPU scan (prefix sum).
+ */
+void scan(int n, int *odata, const int *idata) {
+    // Exclusive prefix sum: odata[i] = sum of idata[0 .. i-1].
+    odata[0] = 0;
+    for (int i = 1; i < n; i++) {
+        odata[i] = odata[i - 1] + idata[i - 1];
+    }
+    std::cout << "1.1";
+}
 
-/**
- * CPU stream compaction without using the scan function.
- *
- * @returns the number of elements remaining after compaction.
- */
-int compactWithoutScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
-}
+/**
+ * CPU stream compaction without using the scan function.
+ *
+ * @returns the number of elements remaining after compaction.
+ */
+int compactWithoutScan(int n, int *odata, const int *idata) {
+    // Copy every nonzero element forward to the next free slot.
+    int count = 0;
+    for (int i = 0; i < n; i++) {
+        if (idata[i] != 0) {
+            odata[count] = idata[i];
+            count++;
+        }
+    }
+    std::cout << "1.2" << std::endl;
+    return count;
+}
 
-/**
- * CPU stream compaction using scan and scatter, like the parallel version.
- *
- * @returns the number of elements remaining after compaction.
- */
-int compactWithScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
-}
+/**
+ * CPU stream compaction using scan and scatter, like the parallel version.
+ *
+ * @returns the number of elements remaining after compaction.
+ */
+int compactWithScan(int n, int *odata, const int *idata) {
+    // Map to 0/1 flags, exclusive-scan the flags, then scatter.
+    int *bools = new int[n];
+    int *indices = new int[n];
+    int count = 0;
+    for (int i = 0; i < n; i++) {
+        bools[i] = (idata[i] == 0) ? 0 : 1;
+    }
+    indices[0] = 0;
+    for (int i = 1; i < n; i++) {
+        indices[i] = indices[i - 1] + bools[i - 1];
+    }
+    for (int i = 0; i < n; i++) {
+        if (bools[i] == 1) {
+            odata[indices[i]] = idata[i];
+            count = indices[i] + 1;
+        }
+    }
+    delete[] bools;
+    delete[] indices;
+    std::cout << "1.3" << std::endl;
+    return count;
+}
+
+}
+}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -1,7 +1,10 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
+#include <iostream>
+int *dev_A1;
+int *dev_B1;
 
 namespace StreamCompaction {
 namespace Efficient {
@@ -11,9 +14,86 @@ namespace Efficient {
 
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
+__global__ void Uscan(int n, int p1, int p2, int *od){
+    // Up-sweep (reduce): each active thread adds its left child into the parent.
+    int thid = (threadIdx.x + blockIdx.x * blockDim.x) * 2 * p1;
+    if (thid + p2 - 1 < n){
+        od[thid + p2 - 1] += od[thid + p1 - 1];
+    }
+}
+
+__global__ void put0(int *odata, int n)
+{
+    // Zero the root before the down-sweep so the result is an exclusive scan.
+    odata[n - 1] = 0;
+}
+
+__global__ void Dscan(int n, int p1, int p2, int *od){
+    // Down-sweep: swap the left child with its parent, then add.
+    int thid = (threadIdx.x + blockIdx.x * blockDim.x) * 2 * p1;
+    if (thid + p2 - 1 < n){
+        int t = od[thid + p1 - 1];
+        od[thid + p1 - 1] = od[thid + p2 - 1];
+        od[thid + p2 - 1] += t;
+    }
+}
+
+void init(int n, int num, const int *hst_A){
+    // Allocate a power-of-two sized buffer, zero it, and copy in the n inputs;
+    // the zero padding does not change the prefix sums of the first n elements.
+    int _size = num * sizeof(int);
+    cudaMalloc((void**)&dev_A1, _size);
+    cudaMemset(dev_A1, 0, _size);
+    cudaMemcpy(dev_A1, hst_A, n * sizeof(int), cudaMemcpyHostToDevice);
+}
+
 void scan(int n, int *odata, const int *idata) {
     // TODO
-    printf("TODO\n");
+    // Round the problem size up to the next power of two.
+    int num = 1 << ilog2ceil(n);
+    int p1, p2;
+    init(n, num, idata);
+
+    float ms = 0;
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start);
+
+    for (int d = 0; d <= ilog2ceil(num) - 1; d++){
+        p1 = 1 << d;
+        p2 = 1 << (d + 1);
+        Uscan<<<1, 512>>>(num, p1, p2, dev_A1);  // single block: assumes num <= 1024
+    }
+    put0<<<1, 1>>>(dev_A1, num);
+    for (int d = ilog2ceil(num) - 1; d >= 0; d--){
+        p1 = 1 << d;
+        p2 = 1 << (d + 1);
+        Dscan<<<1, 512>>>(num, p1, p2, dev_A1);
+    }
+
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&ms, start, stop);
+    printf("\t time of 3.1 efficient function1: %f ms\n", ms);
+
+    cudaMemcpy(odata, dev_A1, n * sizeof(int), cudaMemcpyDeviceToHost);
+    cudaFree(dev_A1);
+
+    printf("3.1\n");
 }
 
 /**
@@ -24,10 +104,74 @@ void scan(int n, int *odata, const int *idata) {
  * @param odata The array into which to store elements.
  * @param idata The array of elements to compact.
  * @returns The number of elements remaining after compaction.
- */
+ */
+int *dev_idata;
+int *dev_odata;
+int *dev_bool;
+int *dev_boolb;
+
 int compact(int n, int *odata, const int *idata) {
     // TODO
-    return -1;
+    // Pad to the next power of two, as in scan().
+    int num = 1 << ilog2ceil(n);
+    int _size = num * sizeof(int);
+
+    cudaMalloc((void**)&dev_bool, _size);
+    cudaMalloc((void**)&dev_boolb, _size);
+    cudaMalloc((void**)&dev_odata, _size);
+    cudaMalloc((void**)&dev_idata, _size);
+    cudaMemset(dev_bool, 0, _size);
+    cudaMemset(dev_boolb, 0, _size);
+    cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+    int p1, p2;
+    int last;
+    // Step 1: map the input to 0/1 flags. dev_boolb is scanned in place below;
+    // dev_bool keeps the unscanned flags for the scatter step.
+    Common::kernMapToBoolean<<<1, n>>>(n, dev_bool, dev_idata);
+    Common::kernMapToBoolean<<<1, n>>>(n, dev_boolb, dev_idata);
+
+    // Step 2: exclusive scan of the flags (up-sweep, zero the root, down-sweep).
+    for (int d = 0; d <= ilog2ceil(num) - 1; d++){
+        p1 = 1 << d;
+        p2 = 1 << (d + 1);
+        Uscan<<<1, num>>>(num, p1, p2, dev_boolb);
+    }
+    put0<<<1, 1>>>(dev_boolb, num);
+    for (int d = ilog2ceil(num) - 1; d >= 0; d--){
+        p1 = 1 << d;
+        p2 = 1 << (d + 1);
+        Dscan<<<1, num>>>(num, p1, p2, dev_boolb);
+    }
+
+    // Step 3: the compacted length is the last scan value plus the last flag.
+    int lastIndex = 0;
+    int lastBool = 0;
+    cudaMemcpy(&lastIndex, &dev_boolb[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&lastBool, &dev_bool[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+    last = lastIndex + lastBool;
+
+    // Step 4: scatter the kept elements to their scanned positions.
+    Common::kernScatter<<<1, n>>>(n, dev_odata, dev_idata, dev_bool, dev_boolb);
+
+    cudaMemcpy(odata, dev_odata, last * sizeof(int), cudaMemcpyDeviceToHost);
+
+    cudaFree(dev_bool);
+    cudaFree(dev_boolb);
+    cudaFree(dev_idata);
+    cudaFree(dev_odata);
+
+    printf("3.2\n");
+    return last;
 }
 
 }
 }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 395ba10..4769a7d 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -3,7 +3,7 @@ namespace StreamCompaction {
 namespace Efficient {
     void scan(int n, int *odata, const int *idata);
-
+    void init(int n, int num, const int *idata);
     int compact(int n, int *odata, const int *idata);
 }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 3d86b60..83f3742 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,7 +2,10 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
-
+#include <iostream>
+int *dev_A;
+int *dev_B;
+int *dev_C;
 namespace StreamCompaction {
 namespace Naive {
 
@@ -11,9 +14,69 @@ namespace Naive {
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
+__global__ void Nscan(int n, int logn, const int *Ain, int *Bout, int *Ctemp){
+    // Naive scan in global memory, ping-ponging between Ctemp and Bout.
+    // Assumes a single block whose thread count equals n, which is how
+    // scan() launches it.
+    int thid = threadIdx.x;
+    // Shift the input right by one so the final result is an exclusive scan.
+    Ctemp[thid] = (thid > 0) ? Ain[thid - 1] : 0;
+    __syncthreads();
+    int *in = Ctemp;
+    int *out = Bout;
+    for (int d = 1; d <= logn; d++){
+        int offset = 1 << (d - 1);
+        out[thid] = (thid >= offset) ? in[thid] + in[thid - offset] : in[thid];
+        __syncthreads();
+        int *tmp = in; in = out; out = tmp;  // swap read/write buffers
+    }
+    // After the last swap, "in" holds the finished scan.
+    Bout[thid] = in[thid];
+}
+
+void init(int *hst_A, int *hst_B, int n){
+    int _size = n * sizeof(int);
+    cudaMalloc((void**)&dev_A, _size);
+    cudaMemcpy(dev_A, hst_A, _size, cudaMemcpyHostToDevice);
+
+    cudaMalloc((void**)&dev_B, _size);
+    cudaMemset(dev_B, 0, _size);
+
+    cudaMalloc((void**)&dev_C, _size);
+}
+
 void scan(int n, int *odata, const int *idata) {
     // TODO
-    printf("TODO\n");
+    // Pad to the next power of two and zero-fill the tail.
+    int num = 1 << ilog2ceil(n);
+    int *_idata = new int[num];
+    for (int i = 0; i < num; i++){
+        _idata[i] = (i < n) ? idata[i] : 0;
+    }
+    init(_idata, odata, num);
+
+    int logn = ilog2ceil(num);
+    Nscan<<<1, num>>>(num, logn, dev_A, dev_B, dev_C);  // single block: assumes num <= 1024
+
+    cudaMemcpy(odata, dev_B, n * sizeof(int), cudaMemcpyDeviceToHost);
+    cudaFree(dev_A);
+    cudaFree(dev_B);
+    cudaFree(dev_C);
+    delete[] _idata;
+
+    printf("2.1");
 }
 
 }
 }
diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h
index 21152d6..0cdba88 100644
--- a/stream_compaction/naive.h
+++ b/stream_compaction/naive.h
@@ -3,5 +3,8 @@ namespace StreamCompaction {
 namespace Naive {
     void scan(int n, int *odata, const int *idata);
+    void init(int *hst_idata, int *hst_odata, int n);
 }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..ee4c69c 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -16,7 +16,24 @@ void scan(int n, int *odata, const int *idata) {
     // TODO use `thrust::exclusive_scan`
     // example: for device_vectors dv_in and dv_out:
     // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+    // Copy the input into device vectors, scan on the GPU, then copy back.
+    thrust::device_vector<int> dv_in, dv_out;
+    for (int i = 0; i < n; i++){
+        dv_in.push_back(idata[i]);
+        dv_out.push_back(0);
+    }
+    thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+    thrust::copy(dv_out.begin(), dv_out.end(), odata);
+
+    printf("4.1");
+}
+
+}
+}
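+
+// Note: building the device_vectors with push_back issues one host-to-device
+// copy per element. A rough sketch of a cheaper setup (same Thrust API,
+// illustrative only -- dv_in/dv_out as above) constructs them in one shot:
+//
+//     thrust::device_vector<int> dv_in(idata, idata + n);   // one bulk H2D copy
+//     thrust::device_vector<int> dv_out(n);                 // allocated on the GPU
+//     thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+//     thrust::copy(dv_out.begin(), dv_out.end(), odata);    // bulk D2H copy back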