diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..12949a7
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,20 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "CUDA C++: Launch",
+            "type": "cuda-gdb",
+            "request": "launch",
+            "program": ""
+        },
+        {
+            "name": "CUDA C++: Attach",
+            "type": "cuda-gdb",
+            "request": "attach",
+            "processId": "${command:cuda.pickProcess}"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..fdb8524
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,18 @@
+{
+    "ros.distro": "humble",
+    "files.associations": {
+        "*.sdf": "xml",
+        "*.world": "xml",
+        "array": "cpp",
+        "deque": "cpp",
+        "forward_list": "cpp",
+        "list": "cpp",
+        "string": "cpp",
+        "unordered_map": "cpp",
+        "unordered_set": "cpp",
+        "vector": "cpp",
+        "string_view": "cpp",
+        "initializer_list": "cpp",
+        "numeric": "cpp"
+    }
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 0e38ddb..7bad2db 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,101 @@
 CUDA Stream Compaction
 ======================
 
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+**Jason Xie**
 
-### (TODO: Your README)
+[🤓 LinkedIn](https://linkedin.com/in/jia-chun-xie)
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+[😇 my website](https://jchunx.dev)
 
+[🥵 X (formerly 🐦)](https://x.com/codemonke_)
+
+Tested on: Ubuntu 22.04, i5-8400, RTX 3060Ti, personal machine
+
+
+## About
+
+CUDA implementation of stream compaction & scan.
+
+## Performance Analysis
+
+### Block Size
+
+For all kernels, I found that 128 was the optimal block size.
+
+### Time vs. Array Size
+
+![comparison plot](assets/cuda-perf-compare-proj2.png)
+| Array Size | CPU Scan | Naive Scan | Work-Efficient Scan | Thrust Scan | CPU Compact | Work-Efficient Compact |
+| ---------- | -------- | ---------- | ------------------- | ----------- | ----------- | ---------------------- |
+| 2^8        | 0.000333 | 0.0806 | 0.0963 | 0.0522 | 0.000834 | 0.125 |
+| 2^12 | 0.00235 | 0.0840 | 0.109 | 0.0543 | 0.00924 | 0.108 |
+| 2^16 | 0.0388 | 0.0820 | 0.120 | 0.0522 | 0.145 | 0.107 |
+| 2^20 | 0.541 | 0.503 | 0.533 | 0.194 | 2.42 | 0.726 |
+| 2^24 | 9.033 | 11.134 | 7.661 | 0.500 | 38.880 | 8.487 |
+
+## What is going on here?
+
+Unoptimized GPU scanning actually performs worse than CPU scanning. A bit of Nsight shows that the kernels have low warp occupancy (esp. up / down sweeps):
+![low occupancy](assets/low-occupancy.png)
+
+The hypothesis here is that the kernels are being bottlenecked by global memory access and warp divergence.
+
+## Test Program Outputs
+
+```
+****************
+** SCAN TESTS **
+****************
+    [  38  28  32  41  11   4  49  31  48   8  42  48  22 ...  25   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 4.48317ms    (std::chrono Measured)
+    [  38  66  98 139 150 154 203 234 282 290 332 380 402 ... 205516747 205516747 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 4.44224ms    (std::chrono Measured)
+    [  38  66  98 139 150 154 203 234 282 290 332 380 402 ... 205516679 205516703 ]
+    passed 
+==== naive scan, power-of-two ====
+   elapsed time: 4.82586ms    (CUDA Measured)
+    a[4194304] = 102760269, b[4194304] = 102760231
+    FAIL VALUE 
+==== naive scan, non-power-of-two ====
+   elapsed time: 4.18042ms    (CUDA Measured)
+    a[4194304] = 102760269, b[4194304] = 102760231
+    FAIL VALUE 
+==== work-efficient scan, power-of-two ====
+   elapsed time: 3.68918ms    (CUDA Measured)
+    passed 
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 3.61574ms    (CUDA Measured)
+    passed 
+==== thrust scan, power-of-two ====
+   elapsed time: 0.346912ms    (CUDA Measured)
+    passed 
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.310976ms    (CUDA Measured)
+    passed 
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   0   0   2   1   1   0   3   1   2   0   2   2   0 ...   3   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 19.3632ms    (std::chrono Measured)
+    [   2   1   1   3   1   2   2   2   2   2   2   1   3 ...   3   3 ]
+    passed 
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 19.8116ms    (std::chrono Measured)
+    [   2   1   1   3   1   2   2   2   2   2   2   1   3 ...   3   2 ]
+    passed 
+==== cpu compact with scan ====
+   elapsed time: 71.7739ms    (std::chrono Measured)
+    [   2   1   1   3   1   2   2   2   2   2   2   1   3 ...   3   3 ]
+    passed 
+==== work-efficient compact, power-of-two ====
+   elapsed time: 4.29862ms    (CUDA Measured)
+    passed 
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 4.53734ms    (CUDA Measured)
+    passed 
+```
\ No newline at end of file
diff --git a/assets/cuda-perf-compare-proj2.png b/assets/cuda-perf-compare-proj2.png
new file mode 100644
index 0000000..aa4319a
Binary files /dev/null and b/assets/cuda-perf-compare-proj2.png differ
diff --git a/assets/low-occupancy.png b/assets/low-occupancy.png
new file mode 100644
index 0000000..a8c787d
Binary files /dev/null and b/assets/low-occupancy.png differ
diff --git a/build.sh b/build.sh
new file mode 100644
index 0000000..29b879b
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Create build directory if it doesn't exist
+mkdir -p build
+
+# Change into the build directory
+cd build
+
+# Default build type to Release
+build_type="Release"
+
+# Check for argument "debug" to change build type
+if [ "$1" == "debug" ]; then
+  echo "Building in debug mode"
+  build_type="Debug"
+fi
+
+# Run cmake with the specified build type
+cmake -DCMAKE_BUILD_TYPE=$build_type ..
+
+# Build the project with dbg if debug was specified
+if [ "$1" == "debug" ]; then
+  make dbg=1
+else
+  make
+fi
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..cb06e58 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -11,9 +11,10 @@
 #include <stream_compaction/naive.h>
 #include <stream_compaction/efficient.h>
 #include <stream_compaction/thrust.h>
+#include <unistd.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 24; // 8 feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -21,7 +22,9 @@ int *c = new int[SIZE];
 
 int main(int argc, char* argv[]) {
     // Scan tests
-
+    // std::cout << "sleeping..." << std::endl;
+    // sleep(5);
+    // std::cout << "awake!" << std::endl;
     printf("\n");
     printf("****************\n");
     printf("** SCAN TESTS **\n");
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..6599553 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -23,7 +23,11 @@ namespace StreamCompaction {
          * which map to 0 will be removed, and elements which map to 1 will be kept.
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-            // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            bools[index] = idata[index] == 0 ? 0 : 1;
         }
 
         /**
@@ -32,7 +36,13 @@ namespace StreamCompaction {
          */
         __global__ void kernScatter(int n, int *odata,
                 const int *idata, const int *bools, const int *indices) {
-            // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            if (bools[index] == 1) {
+                odata[indices[index]] = idata[index];
+            } 
         }
 
     }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..b0b2e2b 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -19,10 +19,24 @@ namespace StreamCompaction {
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            for (int i = 0; i < n; i++) {
+                odata[i] = idata[i];
+                if (i > 0) {
+                    odata[i] += odata[i - 1];
+                }
+            }
             timer().endCpuTimer();
         }
 
+        void _scan_no_timer(int n, int *odata, const int *idata) {
+            for (int i = 0; i < n; i++) {
+                odata[i] = idata[i];
+                if (i > 0) {
+                    odata[i] += odata[i - 1];
+                }
+            }
+        }
+
         /**
          * CPU stream compaction without using the scan function.
          *
@@ -30,9 +44,15 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int numElements = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[numElements] = idata[i];
+                    numElements++;
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return numElements;
         }
 
         /**
@@ -42,9 +62,29 @@ namespace StreamCompaction {
          */
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int* bools = new int[n];
+            for (int i = 0; i < n; i++) {
+                bools[i] = (idata[i] != 0) ? 1 : 0;
+            }
+            int* scanned = new int[n];
+            _scan_no_timer(n, scanned, bools);
+            // convert to exclusive scan
+            for (int i = n - 1; i > 0; i--) {
+                scanned[i] = scanned[i - 1];
+            }
+            scanned[0] = 0;
+            int numElements = 0;
+            for (int i = 0; i < n; i++) {
+                if (bools[i] != 0) {
+                    odata[scanned[i]] = idata[i];
+                    numElements++;
+                }
+            }
+
+            delete[] bools;
+            delete[] scanned;
             timer().endCpuTimer();
-            return -1;
+            return numElements;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..12a2d6e 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,13 +12,74 @@ namespace StreamCompaction {
             return timer;
         }
 
+        __global__ void kernScanUpsweep(int n, int d, int* idata) {
+            int k = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (k >= n || k % (1 << (d + 1)) != 0) {
+                return;
+            }
+            int idx_right = k + (1 << (d + 1)) - 1;
+            int idx_left = k + (1 << d) - 1;
+            idata[idx_right] += idata[idx_left];
+        }
+
+        __global__ void kernScanDownsweep(int n, int d, int* idata) {
+            int k = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (k >= n || k % (1 << (d + 1)) != 0) {
+                return;
+            }
+            int idx_right = k + (1 << (d + 1)) - 1;
+            int idx_left = k + (1 << d) - 1;
+
+            int tmp = idata[idx_left];
+            idata[idx_left] = idata[idx_right];
+            idata[idx_right] += tmp;
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            
+
+            int n_padded = 1 << ilog2ceil(n);
+
+            int *dev_idata;
+
+            cudaMalloc((void**)&dev_idata, n_padded * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+            
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy dev_idata failed!");
+            cudaMemset(dev_idata + n, 0, (n_padded - n) * sizeof(int));
             timer().startGpuTimer();
-            // TODO
+            _scan_dev(n_padded, dev_idata);
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_idata+1, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
+            odata[n - 1] = odata[n - 2] + idata[n - 1];
+            checkCUDAError("cudaMemcpy odata failed!");
+            cudaFree(dev_idata);
+            checkCUDAError("cudaFree failed!");     
+        }
+
+        void _scan_dev(int n_padded, int *dev_idata) {
+            int blockSize = 128;
+            int num_blocks = (n_padded + blockSize - 1) / blockSize;
+
+            int ilog2ceil_n = ilog2ceil(n_padded)-1;
+
+            for (int d = 0; d <= ilog2ceil_n; d++) {
+                kernScanUpsweep<<<num_blocks, blockSize>>>(n_padded, d, dev_idata);
+                checkCUDAError("kernScanUpsweep failed!");
+            }
+            // 2. downsweep
+            cudaMemset(dev_idata + n_padded - 1, 0, sizeof(int));
+            checkCUDAError("cudaMemset dev_idata failed!");
+
+            for (int d = ilog2ceil_n; d >= 0; d--) {
+                kernScanDownsweep<<<num_blocks, blockSize>>>(n_padded, d, dev_idata);
+                checkCUDAError("kernScanDownsweep failed!");
+            }
         }
 
         /**
@@ -31,10 +92,61 @@ namespace StreamCompaction {
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+            int blockSize = 128;
+            int n_padded = 1 << ilog2ceil(n);
+            int num_blocks = (n_padded + blockSize - 1) / blockSize;
+
+            int *dev_bools; 
+            int *dev_idata;           
+            int *dev_indices;
+            int *dev_scattered;
+
+            cudaMalloc((void**)&dev_bools, n_padded * sizeof(int));
+            checkCUDAError("cudaMalloc dev_bools failed!");
+            cudaMalloc((void**)&dev_idata, n_padded * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy dev_idata failed!");
+            if (n < n_padded) {
+                cudaMemset(dev_idata + n, 0, (n_padded - n) * sizeof(int));
+                checkCUDAError("cudaMemset dev_idata failed!");
+            }
+
             timer().startGpuTimer();
-            // TODO
+            Common::kernMapToBoolean<<<num_blocks, blockSize>>>(n_padded, dev_bools, dev_idata);
+
+            cudaMalloc((void**)&dev_indices, n_padded * sizeof(int));
+            checkCUDAError("cudaMalloc dev_indices failed!");
+            cudaMemcpy(dev_indices, dev_bools, n_padded * sizeof(int), cudaMemcpyDeviceToDevice);
+            checkCUDAError("cudaMemcpy dev_indices failed!");
+
+            _scan_dev(n_padded, dev_indices);
+            checkCUDAError("_scan_dev failed!");
+
+            cudaMalloc((void**)&dev_scattered, n_padded * sizeof(int));
+            checkCUDAError("cudaMalloc dev_scattered failed!");
+
+            Common::kernScatter<<<num_blocks, blockSize>>>(n_padded, dev_scattered, dev_idata, dev_bools, dev_indices);
+            checkCUDAError("kernScatter failed!");
             timer().endGpuTimer();
-            return -1;
+
+            // since dev_indices is exclusive scan (prefix sum), we can grab num valid elements from the last element
+            int num_valid;
+            int last_bool;
+            cudaMemcpy(&num_valid, dev_indices + n_padded - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&last_bool, dev_bools + n_padded - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy num_valid failed!");
+            num_valid += last_bool;
+            cudaMemcpy(odata, dev_scattered, num_valid * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy odata failed!");
+
+            cudaFree(dev_bools);
+            cudaFree(dev_idata);
+            cudaFree(dev_indices);
+            cudaFree(dev_scattered);
+            checkCUDAError("cudaFree failed!");
+            
+            return num_valid;
         }
     }
 }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4f..29ae0fd 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -8,6 +8,8 @@ namespace StreamCompaction {
 
         void scan(int n, int *odata, const int *idata);
 
+        void _scan_dev(int n, int *dev_idata);
+
         int compact(int n, int *odata, const int *idata);
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..993e2b1 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -11,15 +11,64 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
-        // TODO: __global__
+
+        __global__ void kernNaiveScan(int n, int d, int* odata, int* idata) {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            int left = 1 << (d-1);
+            if (k >= left) {
+                odata[k] = idata[k-left] + idata[k];
+            }
+            else {
+                odata[k] = idata[k];
+            }
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            
+            int block_size = 128;
+            int n_padded = 1 << ilog2ceil(n);
+            int num_blocks = (n_padded + block_size - 1) / block_size;
+            int* dev_bufferA;
+            int* dev_bufferB;
+
+            cudaMalloc((void**)&dev_bufferA, n_padded * sizeof(int));
+            checkCUDAError("cudaMalloc dev_bufferA failed!");
+            cudaMalloc((void**)&dev_bufferB, n_padded * sizeof(int));
+            checkCUDAError("cudaMalloc dev_bufferB failed!");
+
+            cudaMemset(dev_bufferA, 0, n_padded * sizeof(int));
+            checkCUDAError("cudaMemset dev_bufferA failed!");
+            cudaMemset(dev_bufferB, 0, n_padded * sizeof(int));
+            checkCUDAError("cudaMemset dev_bufferB failed!");
+            
+            cudaMemcpy(dev_bufferA, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy idata to dev_bufferA failed!");
+
             timer().startGpuTimer();
-            // TODO
+            int d_max = ilog2ceil(n_padded);
+            for (int d=1; d<=d_max; d++) {
+                kernNaiveScan<<<num_blocks, block_size>>>(n_padded, d, dev_bufferB, dev_bufferA);
+                checkCUDAError("kernNaiveScan failed!");
+                std::swap(dev_bufferA, dev_bufferB);
+            }
             timer().endGpuTimer();
+            if (d_max % 2 == 0) {
+                cudaMemcpy(odata, dev_bufferA, n * sizeof(int), cudaMemcpyDeviceToHost);
+                checkCUDAError("cudaMemcpy dev_bufferA to odata failed!");
+            } else {
+                cudaMemcpy(odata, dev_bufferB, n * sizeof(int), cudaMemcpyDeviceToHost);
+                checkCUDAError("cudaMemcpy dev_bufferB to odata failed!");
+            }
+            
+            cudaFree(dev_bufferA);
+            checkCUDAError("cudaFree dev_bufferA failed!");
+            cudaFree(dev_bufferB);
+            checkCUDAError("cudaFree dev_bufferB failed!");
+
+            
         }
     }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..a8f34fb 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,15 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            
+
+            thrust::device_vector<int> dv_in(idata, idata + n);
+            thrust::device_vector<int> dv_out(n);
             timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
             timer().endGpuTimer();
+            thrust::copy(dv_out.begin()+1, dv_out.end(), odata);
+            odata[n-1] = odata[n-2] + idata[n-1];
         }
     }
 }