From a629849227a75f32e41640c4ad05c9bdfbf7336c Mon Sep 17 00:00:00 2001 From: licheng Date: Fri, 15 Sep 2023 22:40:38 -0400 Subject: [PATCH 1/8] 3.2 --- src/main.cpp | 2 +- stream_compaction/common.cu | 9 ++++ stream_compaction/common.h | 1 + stream_compaction/cpu.cu | 33 ++++++++++++- stream_compaction/efficient.cu | 90 ++++++++++++++++++++++++++++++++-- stream_compaction/naive.cu | 45 +++++++++++++++++ stream_compaction/thrust.cu | 9 ++++ 7 files changed, 183 insertions(+), 6 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..eb7adb1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..8f4009b 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,9 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int id = blockDim.x * blockIdx.x + threadIdx.x; + if (id >= n)return; + bools[id] = idata[id] == 0 ? 0 : 1; } /** @@ -33,6 +36,12 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int id = blockDim.x * blockIdx.x + threadIdx.x; + if (id >= n)return; + if (bools[id] == 1) { + int idx = indices[id]; + odata[idx] = idata[id]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..57f97ec 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -9,6 +9,7 @@ #include #include #include +#include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..a4e1903 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,10 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + if (n > 0)odata[0] = 0; + for (int i = 1;i < n;++i) { + odata[i] = odata[i-1] + idata[i - 1]; + } timer().endCpuTimer(); } @@ -31,8 +35,14 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int ans = 0; + for (int i = 0;i < n;++i) { + if (idata[i] != 0) { + odata[ans++] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + return ans; } /** @@ -43,8 +53,27 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int* cnt = new int[n]; + for (int i = 0;i < n;++i) { + cnt[i] = idata[i] == 0 ? 
0 : 1; + } + + //scan(n, odata, cnt); + if (n > 0)odata[0] = 0; + for (int i = 1;i < n;++i) { + odata[i] = odata[i - 1] + cnt[i - 1]; + } + + int numIdx = 0; + for (int i = 0;i < n;++i) { + if (idata[i] != 0) { + numIdx = odata[i]; + odata[numIdx] = idata[i]; + } + } + delete[] cnt; timer().endCpuTimer(); - return -1; + return (numIdx+1); } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..29577d6 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -15,9 +15,55 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ + + __global__ void kernelUpSweep(int n, int offset, int* i_odata) { + int id = threadIdx.x + blockDim.x * blockIdx.x; + if (id >= n || (id + 1) % (offset * 2) != 0)return; + i_odata[id] = i_odata[id] + i_odata[id - offset]; + } + + __global__ void kernelDownSweep(int n, int offset, int* i_odata) { + int id = threadIdx.x + blockDim.x * blockIdx.x; + if (id >= n || (id+1)%(offset*2)!=0)return; + //change 2 + int prevIdx = id - offset; + int prevNum = i_odata[prevIdx]; + i_odata[prevIdx] = i_odata[id]; + i_odata[id] += prevNum; + } + + void devScan(int* dev_data, int layerCnt, const dim3& fullBlocksPerGrid, int blockSize) { + int N = 1 << layerCnt; + int offset = 1; + for (int i = 0;i < layerCnt;++i) { + kernelUpSweep << > > (N, offset, dev_data); + offset *= 2; + } + cudaMemset(dev_data + offset - 1,0,sizeof(int)); + for (int i = 0;i < layerCnt;++i) { + offset /= 2; + kernelDownSweep << > > (N, offset, dev_data); + } + } + void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); // TODO + int* dev_data; + int layerCnt = ilog2ceil(n); + int N = 1 << layerCnt; + int blockSize = 32; + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + cudaMalloc((void**)&dev_data, N * sizeof(int)); + checkCUDAError("cudaMalloc dev_data failed!"); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + devScan(dev_data, layerCnt,fullBlocksPerGrid, blockSize); + + //exclusive scan + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + + cudaFree(dev_data); timer().endGpuTimer(); } @@ -31,10 +77,48 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + timer().startCpuTimer(); // TODO - timer().endGpuTimer(); - return -1; + int layerCnt = ilog2ceil(n); + int N = 1 << layerCnt; + int* dev_idata; + int* dev_odata; + int* dev_bools; + int* dev_indices; + cudaMalloc((void**)&dev_idata, N * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_bools, N * sizeof(int)); + checkCUDAError("cudaMalloc dev_bools failed!"); + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + cudaMalloc((void**)&dev_indices, N * sizeof(int)); + checkCUDAError("cudaMalloc dev_indices failed!"); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + int blockSize = 32; + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + StreamCompaction::Common::kernMapToBoolean <<>>(n, dev_bools, dev_idata); + + cudaMemcpy(dev_indices, dev_bools, N * sizeof(int), cudaMemcpyDeviceToDevice); + devScan(dev_indices, layerCnt, fullBlocksPerGrid, blockSize); + + StreamCompaction::Common::kernScatter << > > (n, dev_odata, dev_idata, dev_bools, dev_indices); + + //read GPU + int ans = 0; + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaMemcpy(&ans, dev_indices + (N - 1), sizeof(int), cudaMemcpyDeviceToHost); + int lastBool = 0; + cudaMemcpy(&lastBool, dev_bools + (N - 1), sizeof(int), cudaMemcpyDeviceToHost); + ans += lastBool; + + //free GPU + cudaFree(dev_idata); + cudaFree(dev_bools); + cudaFree(dev_odata); + cudaFree(dev_indices); + timer().endCpuTimer(); + return ans; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..d3c0488 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -12,6 +12,17 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void kernelNaiveScan(int n, int offset,int *odata, const int* idata) { + int id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= n)return; + if (id >= offset) { + odata[id] = idata[id] + idata[id - offset]; + } + else { + odata[id] = idata[id]; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
@@ -19,6 +30,40 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); // TODO + //initialize + int* dev_ping; + int* dev_pong; + cudaMalloc((void**)&dev_ping, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_ping failed!"); + cudaMalloc((void**)&dev_pong, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_pong failed!"); + cudaMemcpy(dev_ping, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + int* dev_in; + int* dev_out; + int blockSize = 32; + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + int layerCnt = ilog2ceil(n); + int offset = 1; + for (int i = 0;i < layerCnt;++i) { + if (i % 2 == 0) { + dev_in = dev_ping; + dev_out = dev_pong; + }else { + dev_in = dev_pong; + dev_out = dev_ping; + } + kernelNaiveScan << > > (n,offset,dev_out, dev_in); + offset *= 2; + } + + //exclusive scan + cudaMemcpy(odata + 1, dev_out, sizeof(int) * (n-1), cudaMemcpyDeviceToHost); + if(n>0)odata[0] = 0; + //free mem + cudaFree(dev_ping); + cudaFree(dev_pong); + timer().endGpuTimer(); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..c5d0310 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -22,6 +22,15 @@ namespace StreamCompaction { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::host_vector thrust_host_data(idata, idata + n); + thrust::device_vector thrust_dev_idata = thrust_host_data; + thrust::device_vector thrust_dev_odata(n * sizeof(int)); + + thrust::exclusive_scan(thrust_dev_idata.begin(), thrust_dev_idata.end(), thrust_dev_odata.begin()); + + thrust::copy(thrust_dev_odata.begin(), thrust_dev_odata.end(), odata); + + timer().endGpuTimer(); } } From 18923a01f2024b94d235ee9d3eeaa9158f8ff236 Mon Sep 17 00:00:00 2001 From: licheng Date: Fri, 15 Sep 2023 23:31:47 -0400 Subject: [PATCH 2/8] fix bug in thrust --- src/main.cpp | 18 +++++++++--------- stream_compaction/common.h | 2 ++ stream_compaction/efficient.cu | 8 ++++---- stream_compaction/naive.cu | 2 +- stream_compaction/thrust.cu | 2 +- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index eb7adb1..1ce62e3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 20; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -29,7 +29,7 @@ int main(int argc, char* argv[]) { genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; - printArray(SIZE, a, true); + //printArray(SIZE, a, true); // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. 
@@ -38,20 +38,20 @@ int main(int argc, char* argv[]) { printDesc("cpu scan, power-of-two"); StreamCompaction::CPU::scan(SIZE, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); + // printArray(SIZE, b, true); zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); StreamCompaction::CPU::scan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); + //printArray(NPOT, b, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - printArray(SIZE, c, true); + //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -104,7 +104,7 @@ int main(int argc, char* argv[]) { genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; - printArray(SIZE, a, true); + //printArray(SIZE, a, true); int count, expectedCount, expectedNPOT; @@ -115,7 +115,7 @@ int main(int argc, char* argv[]) { count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); expectedCount = count; - printArray(count, b, true); + //printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); zeroArray(SIZE, c); @@ -123,14 +123,14 @@ int main(int argc, char* argv[]) { count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); expectedNPOT = count; - printArray(count, c, true); + //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); zeroArray(SIZE, c); printDesc("cpu compact with scan"); count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); + //printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 57f97ec..266cee4 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -19,6 +19,8 @@ */ void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1); +static int blockSize = 256; + inline int ilog2(int x) { int lg = 0; while (x >>= 1) { diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 29577d6..f123376 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,7 @@ #include "common.h" #include "efficient.h" + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -52,7 +53,6 @@ namespace StreamCompaction { int* dev_data; int layerCnt = ilog2ceil(n); int N = 1 << layerCnt; - int blockSize = 32; dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); cudaMalloc((void**)&dev_data, N * sizeof(int)); checkCUDAError("cudaMalloc dev_data failed!"); @@ -95,14 +95,14 @@ namespace StreamCompaction { checkCUDAError("cudaMalloc dev_indices failed!"); cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); - int blockSize = 32; + dim3 
fullBlocksPerGrid((N + blockSize - 1) / blockSize); - StreamCompaction::Common::kernMapToBoolean <<>>(n, dev_bools, dev_idata); + StreamCompaction::Common::kernMapToBoolean <<>>(N, dev_bools, dev_idata); cudaMemcpy(dev_indices, dev_bools, N * sizeof(int), cudaMemcpyDeviceToDevice); devScan(dev_indices, layerCnt, fullBlocksPerGrid, blockSize); - StreamCompaction::Common::kernScatter << > > (n, dev_odata, dev_idata, dev_bools, dev_indices); + StreamCompaction::Common::kernScatter << > > (N, dev_odata, dev_idata, dev_bools, dev_indices); //read GPU int ans = 0; diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index d3c0488..de10be7 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -41,7 +41,7 @@ namespace StreamCompaction { int* dev_in; int* dev_out; - int blockSize = 32; + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); int layerCnt = ilog2ceil(n); int offset = 1; diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index c5d0310..fb52617 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -24,7 +24,7 @@ namespace StreamCompaction { // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); thrust::host_vector thrust_host_data(idata, idata + n); thrust::device_vector thrust_dev_idata = thrust_host_data; - thrust::device_vector thrust_dev_odata(n * sizeof(int)); + thrust::device_vector thrust_dev_odata(n); thrust::exclusive_scan(thrust_dev_idata.begin(), thrust_dev_idata.end(), thrust_dev_odata.begin()); From 9fa2b460e1907c995d2e6a13607baf916406d8d2 Mon Sep 17 00:00:00 2001 From: licheng Date: Sun, 17 Sep 2023 00:09:18 -0400 Subject: [PATCH 3/8] part5 --- src/main.cpp | 2 +- stream_compaction/efficient.cu | 30 ++++++++++++++++++++---------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 1ce62e3..dc164b3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 20; // feel free to change the size of array +const int SIZE = 1 << 8; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index f123376..d3aea4f 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -18,14 +18,20 @@ namespace StreamCompaction { */ __global__ void kernelUpSweep(int n, int offset, int* i_odata) { - int id = threadIdx.x + blockDim.x * blockIdx.x; - if (id >= n || (id + 1) % (offset * 2) != 0)return; + //int id = threadIdx.x + blockDim.x * blockIdx.x; + //if (id >= n || (id + 1) % (offset * 2) != 0)return; + int threadId = threadIdx.x + blockDim.x * blockIdx.x + 1; + int id = offset * 2 * threadId - 1; + if (id >= n)return; i_odata[id] = i_odata[id] + i_odata[id - offset]; } __global__ void kernelDownSweep(int n, int offset, int* i_odata) { - int id = threadIdx.x + blockDim.x * blockIdx.x; - if (id >= n || (id+1)%(offset*2)!=0)return; + //int id = threadIdx.x + blockDim.x * blockIdx.x; + //if (id >= n || (id + 1) % (offset * 2)!=0)return; + int threadId = threadIdx.x + blockDim.x * blockIdx.x + 1; + int id = offset * 2 * threadId - 1; + if (id >= n)return; //change 2 int prevIdx = id - offset; int prevNum = i_odata[prevIdx]; @@ -33,17 +39,22 @@ namespace StreamCompaction { i_odata[id] += prevNum; } - void devScan(int* dev_data, int layerCnt, const dim3& fullBlocksPerGrid, int blockSize) { + void devScan(int* 
dev_data, int layerCnt, int blockSize) { int N = 1 << layerCnt; int offset = 1; + int needN = N; for (int i = 0;i < layerCnt;++i) { - kernelUpSweep << > > (N, offset, dev_data); + dim3 blockPerGrid((needN + blockSize - 1) / blockSize); + kernelUpSweep << > > (N, offset, dev_data); offset *= 2; + needN /= 2; } cudaMemset(dev_data + offset - 1,0,sizeof(int)); for (int i = 0;i < layerCnt;++i) { offset /= 2; - kernelDownSweep << > > (N, offset, dev_data); + dim3 blockPerGrid((needN + blockSize - 1) / blockSize); + kernelDownSweep << > > (N, offset, dev_data); + needN *= 2; } } @@ -53,12 +64,11 @@ namespace StreamCompaction { int* dev_data; int layerCnt = ilog2ceil(n); int N = 1 << layerCnt; - dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); cudaMalloc((void**)&dev_data, N * sizeof(int)); checkCUDAError("cudaMalloc dev_data failed!"); cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); - devScan(dev_data, layerCnt,fullBlocksPerGrid, blockSize); + devScan(dev_data, layerCnt, blockSize); //exclusive scan cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); @@ -100,7 +110,7 @@ namespace StreamCompaction { StreamCompaction::Common::kernMapToBoolean <<>>(N, dev_bools, dev_idata); cudaMemcpy(dev_indices, dev_bools, N * sizeof(int), cudaMemcpyDeviceToDevice); - devScan(dev_indices, layerCnt, fullBlocksPerGrid, blockSize); + devScan(dev_indices, layerCnt,blockSize); StreamCompaction::Common::kernScatter << > > (N, dev_odata, dev_idata, dev_bools, dev_indices); From 7a92abd844b58173870b2925e0dfae15c54c1acd Mon Sep 17 00:00:00 2001 From: licheng Date: Sun, 17 Sep 2023 15:53:25 -0400 Subject: [PATCH 4/8] sort --- src/main.cpp | 57 ++++++++++++++++++++++- stream_compaction/common.h | 2 - stream_compaction/cpu.cu | 7 +++ stream_compaction/cpu.h | 2 + stream_compaction/efficient.cu | 82 ++++++++++++++++++++++++++++++++++ stream_compaction/efficient.h | 2 + stream_compaction/naive.cu | 2 + stream_compaction/thrust.cu | 12 +++++ stream_compaction/thrust.h | 2 + 9 files changed, 165 insertions(+), 3 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index dc164b3..0f155ed 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 20; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -147,6 +147,61 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + printf("\n"); + printf("****************\n"); + printf("** SORT TESTS **\n"); + printf("****************\n"); + + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + //printArray(SIZE, a, true); + + // initialize b using StreamCompaction::CPU::scan you implement + // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. + // At first all cases passed because b && c are all zeroes. 
+ zeroArray(SIZE, b); + printDesc("cpu sort, power-of-two"); + StreamCompaction::CPU::sort(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + //printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("work-efficient sort, power-of-two"); + StreamCompaction::Efficient::sort(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("thrust sort, power-of-two"); + StreamCompaction::Thrust::sort(SIZE, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + + zeroArray(SIZE, b); + printDesc("cpu sort, non-power-of-two"); + StreamCompaction::CPU::sort(NPOT, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + //printArray(NPOT, b, true); + + zeroArray(SIZE, c); + printDesc("work-efficient sort, non-power-of-two"); + StreamCompaction::Efficient::sort(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("thrust sort, non-power-of-two"); + StreamCompaction::Thrust::sort(NPOT, c, a); + printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 266cee4..57f97ec 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -19,8 +19,6 @@ */ void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1); -static int blockSize = 256; - inline int ilog2(int x) { int lg = 0; while (x >>= 1) { diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index a4e1903..d03a7f5 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -27,6 +27,13 @@ namespace StreamCompaction { timer().endCpuTimer(); } + void sort(int n, int* odata, const int* idata) { + timer().startCpuTimer(); + memcpy(odata, idata, sizeof(int) * n); + std::sort(odata, odata + n); + timer().endCpuTimer(); + } + /** * CPU stream compaction without using the scan function. 
* diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c047..8a6efa9 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -8,6 +8,8 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata); + void sort(int n, int* odata, const int* idata); + int compactWithoutScan(int n, int *odata, const int *idata); int compactWithScan(int n, int *odata, const int *idata); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index d3aea4f..7b300a0 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,7 @@ #include "common.h" #include "efficient.h" +static int blockSize = 256; namespace StreamCompaction { namespace Efficient { @@ -130,5 +131,86 @@ namespace StreamCompaction { timer().endCpuTimer(); return ans; } + + __global__ void kernMapToBoolean(int n, int* bools, const int* idata, int mask, bool recordZero) { + // TODO + int id = blockDim.x * blockIdx.x + threadIdx.x; + if (id >= n)return; + bools[id] = (idata[id] & mask) == 0 ? recordZero : (!recordZero); + } + + __global__ void kernSortScatter(int n, int* odata, + const int* idata, const int* isOneBools, + const int* indices_0,const int* indices_1,int zeroCnt) { + // TODO + int id = blockDim.x * blockIdx.x + threadIdx.x; + if (id >= n)return; + if (isOneBools[id] == 1) { + int idx = indices_1[id] + zeroCnt; + odata[idx] = idata[id]; + } + else { + int idx = indices_0[id]; + odata[idx] = idata[id]; + } + } + + void sort(int n, int* odata, const int* idata) { + timer().startCpuTimer(); + // TODO + int layerCnt = ilog2ceil(n); + int N = 1 << layerCnt; + int* dev_idata; + int* dev_odata; + int* dev_bools; + int* dev_indices_1; + int* dev_indices_0; + cudaMalloc((void**)&dev_idata, N * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMalloc((void**)&dev_bools, N * sizeof(int)); + checkCUDAError("cudaMalloc dev_bools failed!"); + cudaMalloc((void**)&dev_odata, N * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + cudaMalloc((void**)&dev_indices_1, N * sizeof(int)); + checkCUDAError("cudaMalloc dev_indices failed!"); + cudaMalloc((void**)&dev_indices_0, N * sizeof(int)); + checkCUDAError("cudaMalloc dev_indices failed!"); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemset(dev_idata + n, INT_MAX, (N - n)*sizeof(int));//to make non-power-of-two right + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + int mask = 1; + for (int i = 0;i < 32;++i) { + //map 0 + kernMapToBoolean << > > (N, dev_bools, dev_idata,mask,true); + cudaMemcpy(dev_indices_0, dev_bools, N * sizeof(int), cudaMemcpyDeviceToDevice); + devScan(dev_indices_0, layerCnt, blockSize); + int zeroCnt = 0; + int lastBool = 0; + cudaMemcpy(&zeroCnt, dev_indices_0 + (N - 1), sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastBool, dev_bools + (N - 1), sizeof(int), cudaMemcpyDeviceToHost); + zeroCnt += lastBool; + + //map1 + kernMapToBoolean << > > (N, dev_bools, dev_idata, mask, false); + cudaMemcpy(dev_indices_1, dev_bools, N * sizeof(int), cudaMemcpyDeviceToDevice); + devScan(dev_indices_1, layerCnt, blockSize); + kernSortScatter << > > (N, + dev_odata, dev_idata, dev_bools, dev_indices_0, dev_indices_1,zeroCnt); + mask <<= 1; + std::swap(dev_odata, dev_idata); + } + + //read GPU + cudaMemcpy(odata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost); + + //free GPU + cudaFree(dev_idata); + cudaFree(dev_bools); + cudaFree(dev_odata); + cudaFree(dev_indices_0); + cudaFree(dev_indices_1); + 
timer().endCpuTimer(); + } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..90ab20a 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -9,5 +9,7 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); + + void sort(int n, int* odata, const int* idata); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index de10be7..8ece8a8 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,8 @@ #include "common.h" #include "naive.h" +static int blockSize = 256; + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index fb52617..82f437b 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -3,6 +3,7 @@ #include #include #include +#include #include "common.h" #include "thrust.h" @@ -31,6 +32,17 @@ namespace StreamCompaction { thrust::copy(thrust_dev_odata.begin(), thrust_dev_odata.end(), odata); + timer().endGpuTimer(); + } + + void sort(int n, int* odata, const int* idata) { + timer().startGpuTimer(); + thrust::host_vector thrust_host_data(idata, idata + n); + thrust::device_vector thrust_dev_data = thrust_host_data; + + thrust::sort(thrust_dev_data.begin(), thrust_dev_data.end()); + thrust::copy(thrust_dev_data.begin(), thrust_dev_data.end(), odata); + timer().endGpuTimer(); } } diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h index fe98206..38bb58a 100644 --- a/stream_compaction/thrust.h +++ b/stream_compaction/thrust.h @@ -7,5 +7,7 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + + void sort(int n, int* odata, const int* idata); } } From a3d7988903ee3119fcd3fbab764c5c4b91e18070 Mon Sep 17 00:00:00 2001 From: licheng Date: Mon, 18 Sep 2023 11:29:29 -0400 Subject: [PATCH 5/8] improve sort --- src/main.cpp | 54 ++++++++++++++++++++++-- stream_compaction/cpu.cu | 4 +- stream_compaction/efficient.cu | 77 +++++++++++++++++++--------------- stream_compaction/efficient.h | 1 + stream_compaction/naive.cu | 12 ++++-- stream_compaction/thrust.cu | 18 ++++---- 6 files changed, 117 insertions(+), 49 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 0f155ed..6f3236c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,15 +13,63 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 20; // feel free to change the size of array +const int SIZE = (1 << 28); // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; int main(int argc, char* argv[]) { - // Scan tests +#if 0 + float avgCPU = 0.f; + float avgEff = 0.f; + float avgNav = 0.f; + float avgThrust = 0.f; + for (int i = 0;i < 100;++i) { + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + + zeroArray(SIZE, c); + StreamCompaction::CPU::scan(SIZE, c, a); + avgCPU += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + + zeroArray(SIZE, c); + StreamCompaction::Naive::scan(SIZE, c, a); + avgNav += StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(); + + zeroArray(SIZE, c); + StreamCompaction::Efficient::scan(SIZE, c, a); + avgEff += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + + 
zeroArray(SIZE, c); + StreamCompaction::Thrust::scan(SIZE, c, a); + avgThrust += StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(); + + //zeroArray(SIZE, c); + //StreamCompaction::CPU::sort(SIZE, c, a); + //avgCPU += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + + + //zeroArray(SIZE, c); + //StreamCompaction::Efficient::sort(SIZE, c, a); + //avgEff += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + + //zeroArray(SIZE, c); + //StreamCompaction::Thrust::sort(SIZE, c, a); + //avgThrust += StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(); + + } + + std::cout << "avg CPU: " << avgCPU / 100. << std::endl; + std::cout << "avg naive: " << avgNav / 100. << std::endl; + std::cout << "avg efficient: " << avgEff / 100. << std::endl; + std::cout << "avg thrust: " << avgThrust / 100. << std::endl; + std::cout << "......." << std::endl; +#endif + +#if 1 + // Scan tests printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); @@ -200,7 +248,7 @@ int main(int argc, char* argv[]) { //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - +#endif // 1 system("pause"); // stop Win32 console from closing on exit delete[] a; diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index d03a7f5..f938f4f 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -28,8 +28,10 @@ namespace StreamCompaction { } void sort(int n, int* odata, const int* idata) { - timer().startCpuTimer(); + memcpy(odata, idata, sizeof(int) * n); + + timer().startCpuTimer(); std::sort(odata, odata + n); timer().endCpuTimer(); } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 7b300a0..18d7c75 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,10 +3,11 @@ #include "common.h" #include "efficient.h" -static int blockSize = 256; +static int blockSize = 64; namespace StreamCompaction { namespace Efficient { + using StreamCompaction::Common::PerformanceTimer; PerformanceTimer& timer() { @@ -19,20 +20,16 @@ namespace StreamCompaction { */ __global__ void kernelUpSweep(int n, int offset, int* i_odata) { - //int id = threadIdx.x + blockDim.x * blockIdx.x; - //if (id >= n || (id + 1) % (offset * 2) != 0)return; int threadId = threadIdx.x + blockDim.x * blockIdx.x + 1; + if (threadId > n)return; int id = offset * 2 * threadId - 1; - if (id >= n)return; i_odata[id] = i_odata[id] + i_odata[id - offset]; } __global__ void kernelDownSweep(int n, int offset, int* i_odata) { - //int id = threadIdx.x + blockDim.x * blockIdx.x; - //if (id >= n || (id + 1) % (offset * 2)!=0)return; int threadId = threadIdx.x + blockDim.x * blockIdx.x + 1; + if (threadId > n)return; int id = offset * 2 * threadId - 1; - if (id >= n)return; //change 2 int prevIdx = id - offset; int prevNum = i_odata[prevIdx]; @@ -45,22 +42,22 @@ namespace StreamCompaction { int offset = 1; int needN = N; for (int i = 0;i < layerCnt;++i) { + needN /= 2; dim3 blockPerGrid((needN + blockSize - 1) / blockSize); - kernelUpSweep << > > (N, offset, dev_data); + kernelUpSweep << > > (needN, offset, dev_data); offset *= 2; - needN /= 2; } cudaMemset(dev_data + offset - 1,0,sizeof(int)); for (int i = 0;i < layerCnt;++i) { offset /= 2; dim3 blockPerGrid((needN + blockSize - 1) / blockSize); - kernelDownSweep << > > (N, offset, dev_data); + kernelDownSweep << > > (needN, offset, dev_data); needN *= 2; } } void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // 
TODO int* dev_data; int layerCnt = ilog2ceil(n); @@ -69,13 +66,17 @@ namespace StreamCompaction { checkCUDAError("cudaMalloc dev_data failed!"); cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); + devScan(dev_data, layerCnt, blockSize); + timer().endGpuTimer(); + //exclusive scan cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); cudaFree(dev_data); - timer().endGpuTimer(); + } /** @@ -88,7 +89,7 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - timer().startCpuTimer(); + // TODO int layerCnt = ilog2ceil(n); int N = 1 << layerCnt; @@ -108,13 +109,18 @@ namespace StreamCompaction { dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); - StreamCompaction::Common::kernMapToBoolean <<>>(N, dev_bools, dev_idata); + + timer().startCpuTimer(); + + StreamCompaction::Common::kernMapToBoolean <<>>(N, dev_bools, dev_idata); cudaMemcpy(dev_indices, dev_bools, N * sizeof(int), cudaMemcpyDeviceToDevice); - devScan(dev_indices, layerCnt,blockSize); + devScan(dev_indices, layerCnt, blockSize); - StreamCompaction::Common::kernScatter << > > (N, dev_odata, dev_idata, dev_bools, dev_indices); + StreamCompaction::Common::kernScatter << > > (n, dev_odata, dev_idata, dev_bools, dev_indices); + timer().endCpuTimer(); + //read GPU int ans = 0; cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); @@ -128,7 +134,7 @@ namespace StreamCompaction { cudaFree(dev_bools); cudaFree(dev_odata); cudaFree(dev_indices); - timer().endCpuTimer(); + return ans; } @@ -140,30 +146,31 @@ namespace StreamCompaction { } __global__ void kernSortScatter(int n, int* odata, - const int* idata, const int* isOneBools, - const int* indices_0,const int* indices_1,int zeroCnt) { + const int* idata, const int* isZeroBools, + const int* indices_0,int zeroCnt) { // TODO int id = blockDim.x * blockIdx.x + threadIdx.x; if (id >= n)return; - if (isOneBools[id] == 1) { - int idx = indices_1[id] + zeroCnt; + if (isZeroBools[id] == 1) { + int idx = indices_0[id]; odata[idx] = idata[id]; } else { - int idx = indices_0[id]; + //ones before current id: id - indices_0[id], therefore we remove a scan + int idx = id - indices_0[id] + zeroCnt; odata[idx] = idata[id]; } } void sort(int n, int* odata, const int* idata) { - timer().startCpuTimer(); + // TODO int layerCnt = ilog2ceil(n); int N = 1 << layerCnt; int* dev_idata; int* dev_odata; int* dev_bools; - int* dev_indices_1; + //int* dev_indices_1; int* dev_indices_0; cudaMalloc((void**)&dev_idata, N * sizeof(int)); checkCUDAError("cudaMalloc dev_idata failed!"); @@ -171,8 +178,8 @@ namespace StreamCompaction { checkCUDAError("cudaMalloc dev_bools failed!"); cudaMalloc((void**)&dev_odata, N * sizeof(int)); checkCUDAError("cudaMalloc dev_odata failed!"); - cudaMalloc((void**)&dev_indices_1, N * sizeof(int)); - checkCUDAError("cudaMalloc dev_indices failed!"); + //cudaMalloc((void**)&dev_indices_1, N * sizeof(int)); + //checkCUDAError("cudaMalloc dev_indices failed!"); cudaMalloc((void**)&dev_indices_0, N * sizeof(int)); checkCUDAError("cudaMalloc dev_indices failed!"); cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); @@ -180,6 +187,9 @@ namespace StreamCompaction { dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); int mask = 1; + + timer().startGpuTimer(); + for (int i = 0;i < 32;++i) { //map 0 kernMapToBoolean << > > (N, dev_bools, dev_idata,mask,true); @@ -192,15 +202,17 @@ namespace 
StreamCompaction { zeroCnt += lastBool; //map1 - kernMapToBoolean << > > (N, dev_bools, dev_idata, mask, false); - cudaMemcpy(dev_indices_1, dev_bools, N * sizeof(int), cudaMemcpyDeviceToDevice); - devScan(dev_indices_1, layerCnt, blockSize); - kernSortScatter << > > (N, - dev_odata, dev_idata, dev_bools, dev_indices_0, dev_indices_1,zeroCnt); + //kernMapToBoolean << > > (N, dev_bools, dev_idata, mask, false); + //cudaMemcpy(dev_indices_1, dev_bools, N * sizeof(int), cudaMemcpyDeviceToDevice); + //devScan(dev_indices_1, layerCnt, blockSize); + kernSortScatter << > > (N, + dev_odata, dev_idata, dev_bools, dev_indices_0,zeroCnt); mask <<= 1; std::swap(dev_odata, dev_idata); } + timer().endGpuTimer(); + //read GPU cudaMemcpy(odata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost); @@ -209,8 +221,7 @@ namespace StreamCompaction { cudaFree(dev_bools); cudaFree(dev_odata); cudaFree(dev_indices_0); - cudaFree(dev_indices_1); - timer().endCpuTimer(); + } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 90ab20a..3cea52b 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -4,6 +4,7 @@ namespace StreamCompaction { namespace Efficient { + StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 8ece8a8..cac5422 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,10 +3,11 @@ #include "common.h" #include "naive.h" -static int blockSize = 256; +static int blockSize = 128; namespace StreamCompaction { namespace Naive { + using StreamCompaction::Common::PerformanceTimer; PerformanceTimer& timer() { @@ -30,7 +31,7 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO //initialize int* dev_ping; @@ -47,6 +48,9 @@ namespace StreamCompaction { dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); int layerCnt = ilog2ceil(n); int offset = 1; + + timer().startGpuTimer(); + for (int i = 0;i < layerCnt;++i) { if (i % 2 == 0) { dev_in = dev_ping; @@ -59,6 +63,8 @@ namespace StreamCompaction { offset *= 2; } + timer().endGpuTimer(); + //exclusive scan cudaMemcpy(odata + 1, dev_out, sizeof(int) * (n-1), cudaMemcpyDeviceToHost); if(n>0)odata[0] = 0; @@ -66,7 +72,7 @@ namespace StreamCompaction { cudaFree(dev_ping); cudaFree(dev_pong); - timer().endGpuTimer(); + } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 82f437b..22dd573 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -19,31 +19,31 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); thrust::host_vector thrust_host_data(idata, idata + n); thrust::device_vector thrust_dev_idata = thrust_host_data; thrust::device_vector thrust_dev_odata(n); - + + timer().startGpuTimer(); thrust::exclusive_scan(thrust_dev_idata.begin(), thrust_dev_idata.end(), thrust_dev_odata.begin()); - - thrust::copy(thrust_dev_odata.begin(), thrust_dev_odata.end(), odata); - - timer().endGpuTimer(); + + + thrust::copy(thrust_dev_odata.begin(), thrust_dev_odata.end(), odata); } void sort(int n, int* odata, const int* idata) { - timer().startGpuTimer(); thrust::host_vector thrust_host_data(idata, idata + n); thrust::device_vector thrust_dev_data = thrust_host_data; + timer().startGpuTimer(); thrust::sort(thrust_dev_data.begin(), thrust_dev_data.end()); - thrust::copy(thrust_dev_data.begin(), thrust_dev_data.end(), odata); - timer().endGpuTimer(); + + thrust::copy(thrust_dev_data.begin(), thrust_dev_data.end(), odata); } } } From 15c8aa4ee9ea2f39aaf11c22a94159fc97317e1c Mon Sep 17 00:00:00 2001 From: LichengCAO <81556019+LichengCAO@users.noreply.github.com> Date: Mon, 18 Sep 2023 16:41:19 -0400 Subject: [PATCH 6/8] Update README.md --- README.md | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0e38ddb..e142e42 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,29 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Licheng CAO + * [LinkedIn](https://www.linkedin.com/in/licheng-cao-6a523524b/) +* Tested on: Windows 10, i7-10870H @ 2.20GHz 32GB, GTX 3060 6009MB -### (TODO: Your README) +Implemented Features +====================== + * naive GPU scan + * efficient GPU scan (with reduced number of threads) + * GPU stream compaction + * simple radix sort + +Analysis +====================== +### blocksize selection +* ![blocksize_select](https://github.com/LichengCAO/Project2-Stream-Compaction/assets/81556019/7402b71e-c8dd-4949-9be1-cc63e8a7cec9) +* Figure 1 +* Figure 1 shows the running time of my GPU program under different blocksizes. As the result, I pick 128 blocksize for my naive method and 64 blocksize for my efficient method. + +### scan +* ![scan_largenum](https://github.com/LichengCAO/Project2-Stream-Compaction/assets/81556019/1dfd0dfe-80d0-440d-87a1-aab498bf6f9e) + * Figure 2 average runtime with large array size +* ![scan_smallnum](https://github.com/LichengCAO/Project2-Stream-Compaction/assets/81556019/eca72f02-124d-4dbf-961e-b0e8864e4550) + * Figure 3 average runtime with small array size + * Figure 2, 3 show the runtime of scan with different method. From figures, we can find that when the array size is under 24,576, the CPU method is faster than other methods. However, when the array size increase to around 100,000, the GPU methods perform better. -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) 
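The "efficient GPU scan (with reduced number of threads)" feature above launches only as many threads as there are active tree nodes at each level, instead of one thread per element with most threads returning early. The sketch below condenses that indexing (mirroring the `kernelUpSweep` / `kernelDownSweep` / `devScan` changes in this series) into a self-contained program; the block size of 64, the zero padding, and the check in `main()` are illustrative assumptions rather than part of the patches.

```cpp
// scan_sketch.cu -- standalone sketch of the reduced-thread up-sweep/down-sweep.
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

__global__ void upSweep(int nActive, int offset, int* data) {
    int t = blockIdx.x * blockDim.x + threadIdx.x;
    if (t >= nActive) return;                    // one thread per active tree node
    int id = offset * 2 * (t + 1) - 1;           // right child handled by this thread
    data[id] += data[id - offset];
}

__global__ void downSweep(int nActive, int offset, int* data) {
    int t = blockIdx.x * blockDim.x + threadIdx.x;
    if (t >= nActive) return;
    int id = offset * 2 * (t + 1) - 1;
    int left = data[id - offset];
    data[id - offset] = data[id];                // pass the partial sum to the left child
    data[id] += left;                            // right child adds the left subtree's sum
}

// In-place exclusive (Blelloch) scan of 2^log2n ints already resident on the GPU.
void exclusiveScanDevice(int* d_data, int log2n, int blockSize) {
    const int N = 1 << log2n;
    int offset = 1, nActive = N;
    for (int level = 0; level < log2n; ++level) {
        nActive /= 2;                            // half as many threads each level
        int grid = (nActive + blockSize - 1) / blockSize;
        upSweep<<<grid, blockSize>>>(nActive, offset, d_data);
        offset *= 2;
    }
    cudaMemset(d_data + N - 1, 0, sizeof(int));  // root becomes the identity
    for (int level = 0; level < log2n; ++level) {
        offset /= 2;
        int grid = (nActive + blockSize - 1) / blockSize;
        downSweep<<<grid, blockSize>>>(nActive, offset, d_data);
        nActive *= 2;                            // twice as many threads each level
    }
}

int main() {
    const int n = 1000;                          // any size; padded to a power of two
    int log2n = 0;
    while ((1 << log2n) < n) ++log2n;
    const int N = 1 << log2n;

    std::vector<int> in(n, 1), out(n);
    int* d_data = nullptr;
    cudaMalloc((void**)&d_data, N * sizeof(int));
    cudaMemset(d_data, 0, N * sizeof(int));      // zero the padded tail
    cudaMemcpy(d_data, in.data(), n * sizeof(int), cudaMemcpyHostToDevice);

    exclusiveScanDevice(d_data, log2n, 64);      // 64: the block size chosen in Figure 1

    cudaMemcpy(out.data(), d_data, n * sizeof(int), cudaMemcpyDeviceToHost);
    printf("out[n-1] = %d (expected %d)\n", out[n - 1], n - 1);
    cudaFree(d_data);
    return 0;
}
```

Padding the buffer up to the next power of two keeps both sweeps balanced, and because an exclusive prefix at index i only depends on elements before i, the zeroed tail never affects the first n outputs that are copied back.
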
From a5764bede65373eaba38cff97a9b05cd6af64abd Mon Sep 17 00:00:00 2001 From: LichengCAO <81556019+LichengCAO@users.noreply.github.com> Date: Tue, 19 Sep 2023 14:58:52 -0400 Subject: [PATCH 7/8] Update README.md --- README.md | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e142e42..d6bd148 100644 --- a/README.md +++ b/README.md @@ -12,20 +12,102 @@ Implemented Features * naive GPU scan * efficient GPU scan (with reduced number of threads) * GPU stream compaction - * simple radix sort + * naive GPU radix sort Analysis ====================== -### blocksize selection +### Blocksize selection * ![blocksize_select](https://github.com/LichengCAO/Project2-Stream-Compaction/assets/81556019/7402b71e-c8dd-4949-9be1-cc63e8a7cec9) * Figure 1 -* Figure 1 shows the running time of my GPU program under different blocksizes. As the result, I pick 128 blocksize for my naive method and 64 blocksize for my efficient method. +* Figure 1 shows the running time of my GPU program under different blocksizes. Consequently, I have chosen a block size of 128 for my naive method and 64 for my efficient method based on the results. -### scan +### Scan * ![scan_largenum](https://github.com/LichengCAO/Project2-Stream-Compaction/assets/81556019/1dfd0dfe-80d0-440d-87a1-aab498bf6f9e) * Figure 2 average runtime with large array size * ![scan_smallnum](https://github.com/LichengCAO/Project2-Stream-Compaction/assets/81556019/eca72f02-124d-4dbf-961e-b0e8864e4550) * Figure 3 average runtime with small array size - * Figure 2, 3 show the runtime of scan with different method. From figures, we can find that when the array size is under 24,576, the CPU method is faster than other methods. However, when the array size increase to around 100,000, the GPU methods perform better. + * Figure 2 and 3 display the runtime performance of different methods for the scan operation. Upon analyzing these figures, it becomes evident that for array sizes below 24,576, the CPU method outperforms the other approaches in terms of speed. However, as the array size increases to approximately 100,000, the GPU methods exhibit superior performance. I think the bottlenecks in both GPU methods are related to memory input/output (I/O), as the computational tasks within these methods are not particularly complex. The bottleneck of CPU method may stem from its inability to execute operations in parallel, as its runtime is roughly proportional to the array size. + * ![cuda_compute](https://github.com/LichengCAO/Project2-Stream-Compaction/assets/81556019/88dd91d1-cec9-45dc-873f-093b51e57935) + * With Nsight Compute, we can see that thrust_scan uses 3 kernel functions to scan the array. I suspect that this method may closely resemble the scan method mentioned at the end of the slide, which involves dividing arrays into several blocks for scanning and subsequently adding offsets within each block to obtain the final result. +### Sort +* ![sort_largenum](https://github.com/LichengCAO/Project2-Stream-Compaction/assets/81556019/4e27cfed-501d-42f0-8029-8729402aff04) + * Figure 4 average sort time with large array size +* ![sort_smallnum](https://github.com/LichengCAO/Project2-Stream-Compaction/assets/81556019/a9a22753-c58a-4664-9200-68ba61eb2641) + * Figure 5 average sort time with small array size +* Figure 4, 5 show the runtime of sort with different method. With small arrays, my implementation of radix sort runs slower than the other two methods. 
With large large arrays, the 2 GPU methods run much faster than the CPU method. +* The primary computational cost in my implementation arises from the scan procedure used to rearrange numbers based on their bits. Initially, I employed two separate scans to determine the correct indices for numbers with '0' and '1' bits at a specific position. Surprisingly, this approach made my radix implementation even slower than the CPU method. + * After reviewing others' implementations, I came to realize that I can calculate the index for numbers with '1' based on the scan result for numbers with '0' (i.e., index1 = total_number_of_0 + (cur_id_of_num - number_of_0_before_cur_id)). This modification boosted the performance of my implementation significantly, resulting in it running approximately 40% faster than the CPU method. + * Furthermore, it became evident that scanning all 32 bits in each iteration was unnecessary for sorting numbers. By checking if the array is already sorted at the beginning of each loop, I could avoid unnecessary scans. As a result, the runtime for the sorting process reduced to just 1/8 of its original duration. + + +Tests +====================== +```result +**************** +** SCAN TESTS ** +**************** +==== cpu scan, power-of-two ==== + elapsed time: 96.4101ms (std::chrono Measured) +==== cpu scan, non-power-of-two ==== + elapsed time: 92.3129ms (std::chrono Measured) + passed +==== naive scan, power-of-two ==== + elapsed time: 50.0737ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 50.0838ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 18.0347ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 18.0009ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 1.9712ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 1.95203ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** +==== cpu compact without scan, power-of-two ==== + elapsed time: 138.717ms (std::chrono Measured) + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 137.777ms (std::chrono Measured) + passed +==== cpu compact with scan ==== + elapsed time: 233.015ms (std::chrono Measured) + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 18.0009ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 18.0009ms (CUDA Measured) + passed + +**************** +** SORT TESTS ** +**************** +==== cpu sort, power-of-two ==== + elapsed time: 1192.6ms (std::chrono Measured) +==== work-efficient sort, power-of-two ==== + elapsed time: 161.361ms (CUDA Measured) + passed +==== thrust sort, power-of-two ==== + elapsed time: 10.9355ms (CUDA Measured) + passed +==== cpu sort, non-power-of-two ==== + elapsed time: 1200.21ms (std::chrono Measured) +==== work-efficient sort, non-power-of-two ==== + elapsed time: 159.406ms (CUDA Measured) + passed +==== thrust sort, non-power-of-two ==== + elapsed time: 10.8496ms (CUDA Measured) + passed +``` From 44ede6b2b60713311c79d5393eab3d1cbcddf856 Mon Sep 17 00:00:00 2001 From: licheng Date: Tue, 19 Sep 2023 15:00:16 -0400 Subject: [PATCH 8/8] radix sort opt --- src/main.cpp | 32 ++++++++++++++++++++++++- stream_compaction/efficient.cu | 43 +++++++++++++++++++++++++++------- 2 files changed, 66 insertions(+), 9 deletions(-) diff --git a/src/main.cpp 
b/src/main.cpp index 6f3236c..da7269d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = (1 << 28); // feel free to change the size of array +const int SIZE = (1 << 20); // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -68,6 +68,36 @@ int main(int argc, char* argv[]) { std::cout << "......." << std::endl; #endif +#if 0 + float avgCPU = 0.f; + float avgEff = 0.f; + float avgThrust = 0.f; + int testCnt = 5; + for (int i = 0;i < testCnt;++i) { + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + + zeroArray(SIZE, c); + StreamCompaction::CPU::sort(SIZE, c, a); + avgCPU += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + + zeroArray(SIZE, c); + StreamCompaction::Efficient::sort(SIZE, c, a); + avgEff += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + + zeroArray(SIZE, c); + StreamCompaction::Thrust::sort(SIZE, c, a); + avgThrust += StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(); + + } + float denom = testCnt; + std::cout << "avg CPU: " << avgCPU / denom << std::endl; + std::cout << "avg efficient: " << avgEff / denom << std::endl; + std::cout << "avg thrust: " << avgThrust / denom << std::endl; + std::cout << "......." << std::endl; +#endif + + #if 1 // Scan tests printf("\n"); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 18d7c75..58cc307 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,6 +2,8 @@ #include #include "common.h" #include "efficient.h" +#include +#include static int blockSize = 64; @@ -128,7 +130,7 @@ namespace StreamCompaction { int lastBool = 0; cudaMemcpy(&lastBool, dev_bools + (N - 1), sizeof(int), cudaMemcpyDeviceToHost); ans += lastBool; - + //free GPU cudaFree(dev_idata); cudaFree(dev_bools); @@ -162,6 +164,12 @@ namespace StreamCompaction { } } + __global__ void kernCheckSorted(int n, bool* odata, const int* idata) { + int id = blockDim.x * blockIdx.x + threadIdx.x; + if (id >= (n-1))return; + if(idata[id] > idata[id + 1])*odata=false; + } + void sort(int n, int* odata, const int* idata) { // TODO @@ -170,20 +178,27 @@ namespace StreamCompaction { int* dev_idata; int* dev_odata; int* dev_bools; - //int* dev_indices_1; + bool* dev_sorted; int* dev_indices_0; + cudaMalloc((void**)&dev_idata, N * sizeof(int)); checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemset(dev_idata + n, INT_MAX, (N - n) * sizeof(int));//to make non-power-of-two right + cudaMalloc((void**)&dev_bools, N * sizeof(int)); checkCUDAError("cudaMalloc dev_bools failed!"); + cudaMalloc((void**)&dev_odata, N * sizeof(int)); checkCUDAError("cudaMalloc dev_odata failed!"); - //cudaMalloc((void**)&dev_indices_1, N * sizeof(int)); - //checkCUDAError("cudaMalloc dev_indices failed!"); + + cudaMalloc((void**)&dev_sorted, sizeof(bool)); + checkCUDAError("cudaMalloc dev_sorted failed!"); + cudaMalloc((void**)&dev_indices_0, N * sizeof(int)); checkCUDAError("cudaMalloc dev_indices failed!"); - cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); - cudaMemset(dev_idata + n, INT_MAX, (N - n)*sizeof(int));//to make non-power-of-two right + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); int mask = 1; @@ -191,22 +206,33 @@ namespace StreamCompaction { 
timer().startGpuTimer(); for (int i = 0;i < 32;++i) { + //improvement2 avoid too much scan: makes the algorithm 8x faster + bool sorted = false; + cudaMemset(dev_sorted, true, sizeof(bool)); + kernCheckSorted << > > (n, dev_sorted, dev_idata); + cudaMemcpy(&sorted, dev_sorted, sizeof(bool), cudaMemcpyDeviceToHost); + if (sorted)break; + //map 0 kernMapToBoolean << > > (N, dev_bools, dev_idata,mask,true); cudaMemcpy(dev_indices_0, dev_bools, N * sizeof(int), cudaMemcpyDeviceToDevice); devScan(dev_indices_0, layerCnt, blockSize); + //thrust::device_ptr thrust_ptr(dev_indices_0); + //thrust::device_ptr thrust_ptr_1(dev_bools); + //thrust::exclusive_scan(thrust_ptr_1, thrust_ptr_1 + N, thrust_ptr); + int zeroCnt = 0; int lastBool = 0; cudaMemcpy(&zeroCnt, dev_indices_0 + (N - 1), sizeof(int), cudaMemcpyDeviceToHost); cudaMemcpy(&lastBool, dev_bools + (N - 1), sizeof(int), cudaMemcpyDeviceToHost); zeroCnt += lastBool; + //improvement1 skip map1: only need to scan once to get the index of 0 and 1, makes the algorithm faster than CPU //map1 //kernMapToBoolean << > > (N, dev_bools, dev_idata, mask, false); //cudaMemcpy(dev_indices_1, dev_bools, N * sizeof(int), cudaMemcpyDeviceToDevice); //devScan(dev_indices_1, layerCnt, blockSize); - kernSortScatter << > > (N, - dev_odata, dev_idata, dev_bools, dev_indices_0,zeroCnt); + kernSortScatter << > > (N, dev_odata, dev_idata, dev_bools, dev_indices_0,zeroCnt); mask <<= 1; std::swap(dev_odata, dev_idata); } @@ -221,6 +247,7 @@ namespace StreamCompaction { cudaFree(dev_bools); cudaFree(dev_odata); cudaFree(dev_indices_0); + cudaFree(dev_sorted); } }
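
To make the single-scan scatter in `kernSortScatter` and the `index1 = total_number_of_0 + (cur_id_of_num - number_of_0_before_cur_id)` arithmetic from the README easier to follow, here is the same radix pass written on the host. The function names and the tiny 3-bit example are illustrative assumptions; the patch performs the equivalent steps on the GPU with `kernMapToBoolean`, `devScan`, and `kernSortScatter`.

```cpp
// radix_pass_sketch.cpp -- host-side sketch of the single-scan radix pass.
#include <cstdio>
#include <utility>
#include <vector>

// One least-significant-digit pass over `bit`, using a single exclusive scan
// of the "bit is zero" flags.
static void radixPass(const std::vector<int>& in, std::vector<int>& out, int bit) {
    const int n = static_cast<int>(in.size());
    std::vector<int> zerosBefore(n);
    int totalZeros = 0;
    for (int i = 0; i < n; ++i) {                // exclusive scan of the zero flags
        zerosBefore[i] = totalZeros;
        if (((in[i] >> bit) & 1) == 0) ++totalZeros;
    }
    for (int i = 0; i < n; ++i) {
        if (((in[i] >> bit) & 1) == 0) {
            out[zerosBefore[i]] = in[i];         // zeros keep their relative order
        } else {
            // ones before i == i - zerosBefore[i], so no second scan is needed
            out[totalZeros + (i - zerosBefore[i])] = in[i];
        }
    }
}

int main() {
    std::vector<int> a = {7, 3, 5, 0, 2, 6, 1, 4}, b(a.size());
    for (int bit = 0; bit < 3; ++bit) {          // 3 bits cover the values 0..7
        radixPass(a, b, bit);
        std::swap(a, b);
    }
    for (int v : a) printf("%d ", v);            // prints: 0 1 2 3 4 5 6 7
    printf("\n");
    return 0;
}
```

Because each pass is stable, sweeping the bits from least to most significant sorts the array; this is the same per-bit loop that `Efficient::sort` runs on the GPU, with the `kernCheckSorted` early exit skipping the remaining passes once the data is already ordered.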