From 465f824b5ce05c1d6be9cdc0e92c9fe03531a19e Mon Sep 17 00:00:00 2001
From: Kau <dominikk@kite.upenn.edu>
Date: Tue, 10 Sep 2024 16:16:36 -0400
Subject: [PATCH 01/10] Added Part 1

---
 stream_compaction/cpu.cu | 39 +++++++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..d9896f2 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -20,6 +20,11 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int partialSum = 0;
+            for (int i = 0; i < n; ++i) {
+                odata[i] = partialSum;
+                partialSum += idata[i];
+            }
             timer().endCpuTimer();
         }
 
@@ -30,9 +35,20 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int numElements = 0;
+            for (int i = 0; i < n; ++i) {
+                if (idata[i]) odata[numElements++] = idata[i];
+            }
             timer().endCpuTimer();
-            return -1;
+            return numElements;
+        }
+
+        int scatter(int n, int* odata, const int* bdata, const int* idata) {
+            int numElements = 0;
+            for (int i = 0; i < n; ++i) {
+                if (bdata[i])  odata[numElements++] = idata[i];
+            }
+            return numElements;
         }
 
         /**
@@ -42,9 +58,24 @@ namespace StreamCompaction {
          */
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            // Create boolean mask
+            int* buffer = new int[n];
+            for (int i = 0; i < n; ++i) {
+                buffer[i] = (idata[i] != 0);
+            }
+
+            // Scan (calling timed function leads to error)
+            int partialSum = 0;
+            for (int i = 0; i < n; ++i) {
+                odata[i] = partialSum;
+                partialSum += buffer[i];
+            }
+
+            int numElements = scatter(n, odata, buffer, idata);
+
+            delete[] buffer;
             timer().endCpuTimer();
-            return -1;
+            return numElements;
         }
     }
 }

From 1d673ff6f0fbf0eccf2da1f8133d789524439638 Mon Sep 17 00:00:00 2001
From: Kau <dominikk@kite.upenn.edu>
Date: Mon, 16 Sep 2024 10:04:38 -0400
Subject: [PATCH 02/10] Added Part 2

---
 src/main.cpp               |  6 ++--
 stream_compaction/cpu.cu   |  4 +--
 stream_compaction/naive.cu | 73 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 77 insertions(+), 6 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..08fcfa6 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 4; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -51,7 +51,7 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -64,7 +64,7 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index d9896f2..1cbd725 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -57,9 +57,9 @@ namespace StreamCompaction {
          * @returns the number of elements remaining after compaction.
          */
         int compactWithScan(int n, int *odata, const int *idata) {
+            int* buffer = new int[n];
             timer().startCpuTimer();
             // Create boolean mask
-            int* buffer = new int[n];
             for (int i = 0; i < n; ++i) {
                 buffer[i] = (idata[i] != 0);
             }
@@ -73,8 +73,8 @@ namespace StreamCompaction {
 
             int numElements = scatter(n, odata, buffer, idata);
 
-            delete[] buffer;
             timer().endCpuTimer();
+            delete[] buffer;
             return numElements;
         }
     }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..6e78567 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,6 +2,7 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
+#include "device_launch_parameters.h"
 
 namespace StreamCompaction {
     namespace Naive {
@@ -13,13 +14,83 @@ namespace StreamCompaction {
         }
         // TODO: __global__
 
+        __global__ void shiftArrayElements(int n, const int* readBuffer, int* writeBuffer) {
+            int index = threadIdx.x + blockIdx.x * blockDim.x;
+            if (index >= n) return;
+
+            if (index == 0) {
+                writeBuffer[0] = 0;
+                return;
+            }
+
+            writeBuffer[index] = readBuffer[index - 1];
+        }
+
+        __global__ void handleNonPower(int n, int d, int* buffer) {
+            int index = threadIdx.x + blockIdx.x * blockDim.x;
+            int pow2tod = 1 << d;
+
+            if (index >= n - pow2tod) return;
+
+            buffer[pow2tod + index] += buffer[index];
+        }
+
+        __global__ void naiveScanStep(int n, int d, const int* readBuffer, int* writeBuffer) {
+            // One thread per element: step d adds the value 2^d slots to the left, else copies.
+            int index = threadIdx.x + blockIdx.x * blockDim.x;
+            if (index >= n) return;
+
+            int pow2tod = 1 << d;
+            // index == 2^d also has a left partner (element 0), so the guard is inclusive.
+            if (index >= pow2tod) {
+                writeBuffer[index] = readBuffer[index] + readBuffer[index - pow2tod];
+            }
+            else {
+                writeBuffer[index] = readBuffer[index];
+            }
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int blockSize = 128;
+            dim3 fullBlocksPerGrid{ (unsigned int) (n + blockSize - 1) / blockSize };
+
+            int* dev_buffer1;
+            int* dev_buffer2;
+
+            cudaMalloc((void**)&dev_buffer1, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_buffer1 failed!");
+            cudaMalloc((void**)&dev_buffer2, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_buffer2 failed!");
+
+            cudaMemcpy(dev_buffer2, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
             timer().startGpuTimer();
+            shiftArrayElements<<<fullBlocksPerGrid, blockSize>>>(n, dev_buffer2, dev_buffer1);
+            checkCUDAError("shiftArrayElements failed!");
+            cudaDeviceSynchronize();
             // TODO
+            for (int d = 0; d < ilog2(n); ++d) {
+                naiveScanStep<<<fullBlocksPerGrid, blockSize>>>(n, d, dev_buffer1, dev_buffer2);
+                checkCUDAError("naiveScanStep failed!");
+                cudaDeviceSynchronize();
+
+                std::swap(dev_buffer1, dev_buffer2);
+            }
+            if ((1 << ilog2(n)) != n) {
+                fullBlocksPerGrid.x = (n - (1 << ilog2(n)) + blockSize - 1) / blockSize;
+                handleNonPower<<<fullBlocksPerGrid, blockSize>>>(n, ilog2(n), dev_buffer1);
+                checkCUDAError("handleNonPower failed!");
+                cudaDeviceSynchronize();
+            }
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_buffer1, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_buffer1);
+            cudaFree(dev_buffer2);
         }
     }
-}
+}
\ No newline at end of file

From fc07224a46286cdda8638d5dd65141a0f2513607 Mon Sep 17 00:00:00 2001
From: Kau <dominikk@kite.upenn.edu>
Date: Mon, 16 Sep 2024 16:58:28 -0400
Subject: [PATCH 03/10] Added Part 3

---
 src/main.cpp                   |  6 +--
 stream_compaction/common.cu    | 11 +++++
 stream_compaction/common.h     |  3 ++
 stream_compaction/efficient.cu | 83 +++++++++++++++++++++++++++++++++-
 stream_compaction/naive.cu     | 16 +------
 5 files changed, 101 insertions(+), 18 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index 08fcfa6..e19a26c 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 4; // feel free to change the size of array
+const int SIZE = 1 << 8; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -71,14 +71,14 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..c949f8c 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -1,4 +1,5 @@
 #include "common.h"
+#include "device_launch_parameters.h"
 
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
     cudaError_t err = cudaGetLastError();
@@ -17,7 +18,17 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) {
 
 namespace StreamCompaction {
     namespace Common {
+        __global__ void shiftArrayElements(int n, int shift, const int* readBuffer, int* writeBuffer) {
+            int index = threadIdx.x + blockIdx.x * blockDim.x;
+            if (index >= n) return;
 
+            if (index < shift) {
+                writeBuffer[index] = 0;
+                return;
+            }
+
+            writeBuffer[index] = readBuffer[index - shift];
+        }
         /**
          * Maps an array to an array of 0s and 1s for stream compaction. Elements
          * which map to 0 will be removed, and elements which map to 1 will be kept.
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index d2c1fed..cfdc45b 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -37,6 +37,9 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata,
                 const int *idata, const int *bools, const int *indices);
 
+        __global__ void shiftArrayElements(int n, int shift,
+            const int* readBuffer, int* writeBuffer);
+
         /**
         * This class is used for timing the performance
         * Uncopyable and unmovable
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..2c73fc3 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,6 +2,7 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
+#include "device_launch_parameters.h"
 
 namespace StreamCompaction {
     namespace Efficient {
@@ -12,13 +13,93 @@ namespace StreamCompaction {
             return timer;
         }
 
+        __global__ void kernUpSweep(int n, int d, int* buffer) {
+            int index = threadIdx.x + blockIdx.x * blockDim.x;
+   
+            int pow2tod = 1 << d;
+            int pow2todp1 = 2 * pow2tod;
+
+            if (index >= n / pow2todp1) return;
+            index *= pow2todp1;
+
+            buffer[index + pow2todp1 - 1] += buffer[index + pow2tod - 1];
+        }
+
+        __global__ void kernDownSweep(int n, int d, int s, int* buffer) {
+            int index = threadIdx.x + blockIdx.x * blockDim.x;
+
+            int pow2tod = 1 << d;
+            int pow2todp1 = 2 * pow2tod;
+
+            if (s) {
+                buffer[pow2todp1 - 1] = 0;
+            }
+
+            if (index > n / pow2todp1) return;
+            index *= pow2todp1;
+
+            int tmp = buffer[index + pow2tod - 1];
+            buffer[index + pow2tod - 1] = buffer[index + pow2todp1 - 1];
+            buffer[index + pow2todp1 - 1] += tmp;
+        }
+
+        __global__ void kernZeroPadding(int n, int d, int* buffer) {
+            int index = threadIdx.x + blockIdx.x * blockDim.x;
+            // Zeroes the tail buffer[n .. 2^(d+1) - 1], one element per thread.
+            if (index >= (1 << (d + 1)) - n) return;
+            // ('-' binds tighter than '<<', hence the explicit parentheses in the guard.)
+            buffer[n + index] = 0;
+        }
+
+        dim3 computeBlocksPerGrid(int threads, int blockSize) {
+            return dim3{ (unsigned int)(threads + blockSize - 1) / blockSize };
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int blockSize = 128;
+
+            bool isPower2Length = (n == (1 << ilog2(n)));
+
+            int bufferLength = (isPower2Length) ? n : 1 << ilog2ceil(n);
+
+            int* tmpArray;
+            cudaMalloc((void**)&tmpArray, bufferLength * sizeof(int));
+            checkCUDAError("cudaMalloc tmpArray failed!");
+
+            if (!isPower2Length) {
+                dim3 blocks = computeBlocksPerGrid(n - (1 << ilog2(n)), blockSize);
+                kernZeroPadding<<<blocks, blockSize>>>(n, ilog2(n), tmpArray);
+                checkCUDAError("kernZeroPadding failed!");
+                cudaDeviceSynchronize();
+            }
+
+            cudaMemcpy(tmpArray, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
             timer().startGpuTimer();
             // TODO
+            for (int d = 0; d < ilog2ceil(n); ++d) {
+                dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize);
+                kernUpSweep<<<blocks, blockSize>>>(bufferLength, d, tmpArray);
+                checkCUDAError("kernUpSweep failed!");
+                cudaDeviceSynchronize();
+            }
+            
+            bool flag = 1;
+            for (int d = ilog2ceil(n) - 1; d >= 0; --d) {
+                dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize);
+                kernDownSweep<<<blocks, blockSize>>>(bufferLength, d, flag, tmpArray);
+                flag = 0;
+                checkCUDAError("kernDownSweep failed!");
+                cudaDeviceSynchronize();
+            }
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, tmpArray, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(tmpArray);
         }
 
         /**
@@ -37,4 +118,4 @@ namespace StreamCompaction {
             return -1;
         }
     }
-}
+}
\ No newline at end of file
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 6e78567..eb2ba1b 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -13,19 +13,6 @@ namespace StreamCompaction {
             return timer;
         }
         // TODO: __global__
-
-        __global__ void shiftArrayElements(int n, const int* readBuffer, int* writeBuffer) {
-            int index = threadIdx.x + blockIdx.x * blockDim.x;
-            if (index >= n) return;
-
-            if (index == 0) {
-                writeBuffer[0] = 0;
-                return;
-            }
-
-            writeBuffer[index] = readBuffer[index - 1];
-        }
-
         __global__ void handleNonPower(int n, int d, int* buffer) {
             int index = threadIdx.x + blockIdx.x * blockDim.x;
             int pow2tod = 1 << d;
@@ -68,7 +55,7 @@ namespace StreamCompaction {
             cudaMemcpy(dev_buffer2, idata, n * sizeof(int), cudaMemcpyHostToDevice);
 
             timer().startGpuTimer();
-            shiftArrayElements<<<fullBlocksPerGrid, blockSize>>>(n, dev_buffer2, dev_buffer1);
+            StreamCompaction::Common::shiftArrayElements<<<fullBlocksPerGrid, blockSize>>>(n, 1, dev_buffer2, dev_buffer1);
             checkCUDAError("shiftArrayElements failed!");
             cudaDeviceSynchronize();
             // TODO
@@ -79,6 +66,7 @@ namespace StreamCompaction {
 
                 std::swap(dev_buffer1, dev_buffer2);
             }
+            // perform last step 
             if ((1 << ilog2(n)) != n) {
                 fullBlocksPerGrid.x = (n - (1 << ilog2(n)) + blockSize - 1) / blockSize;
                 handleNonPower<<<fullBlocksPerGrid, blockSize>>>(n, ilog2(n), dev_buffer1);

From 73a3ed81d93710556bd48c6ed723697fcd9c19a9 Mon Sep 17 00:00:00 2001
From: Kau <dominikk@kite.upenn.edu>
Date: Tue, 17 Sep 2024 10:38:27 -0400
Subject: [PATCH 04/10] Added Part 4

---
 src/main.cpp                   | 14 ++---
 stream_compaction/common.cu    | 17 ++++--
 stream_compaction/efficient.cu | 94 ++++++++++++++++++++++++++++++++--
 stream_compaction/thrust.cu    | 13 +++++
 4 files changed, 123 insertions(+), 15 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index e19a26c..ca51934 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 20; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -54,11 +54,11 @@ int main(int argc, char* argv[]) {
     printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
-    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
+    /*For bug - finding only : Array of 1s to help find bugs in stream compaction or scan
     onesArray(SIZE, c);
     printDesc("1s array for finding bugs");
     StreamCompaction::Naive::scan(SIZE, c, a);
-    printArray(SIZE, c, true); */
+    printArray(SIZE, c, true);*/
 
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
@@ -85,14 +85,14 @@ int main(int argc, char* argv[]) {
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     printf("\n");
@@ -137,14 +137,14 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
     system("pause"); // stop Win32 console from closing on exit
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index c949f8c..12c8acf 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -33,17 +33,26 @@ namespace StreamCompaction {
          * Maps an array to an array of 0s and 1s for stream compaction. Elements
          * which map to 0 will be removed, and elements which map to 1 will be kept.
          */
-        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+        __global__ void kernMapToBoolean(int n, int* bools, const int* idata) {
             // TODO
-        }
+            int index = threadIdx.x + blockIdx.x * blockDim.x;
+            if (index >= n) return;
 
+            bools[index] = (idata[index] != 0);
+            
+        }
+        
         /**
          * Performs scatter on an array. That is, for each element in idata,
          * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
          */
-        __global__ void kernScatter(int n, int *odata,
-                const int *idata, const int *bools, const int *indices) {
+        __global__ void kernScatter(int n, int* odata,
+                const int* idata, const int* bools, const int* indices) {
             // TODO
+            int index = threadIdx.x + blockIdx.x * blockDim.x;
+            if (index >= n) return;
+
+            if (bools[index]) odata[indices[index]] = idata[index];
         }
 
     }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2c73fc3..2760ba8 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -19,7 +19,7 @@ namespace StreamCompaction {
             int pow2tod = 1 << d;
             int pow2todp1 = 2 * pow2tod;
 
-            if (index >= n / pow2todp1) return;
+            if (index > n / pow2todp1 - 1) return;
             index *= pow2todp1;
 
             buffer[index + pow2todp1 - 1] += buffer[index + pow2tod - 1];
@@ -35,7 +35,7 @@ namespace StreamCompaction {
                 buffer[pow2todp1 - 1] = 0;
             }
 
-            if (index > n / pow2todp1) return;
+            if (index > n / pow2todp1 - 1) return;
             index *= pow2todp1;
 
             int tmp = buffer[index + pow2tod - 1];
@@ -102,6 +102,49 @@ namespace StreamCompaction {
             cudaFree(tmpArray);
         }
 
+        void scanUntimed(int n, int* odata, const int* idata) {
+            int blockSize = 128;
+
+            bool isPower2Length = (n == (1 << ilog2(n)));
+
+            int bufferLength = (isPower2Length) ? n : 1 << ilog2ceil(n);
+
+            int* tmpArray;
+            cudaMalloc((void**)&tmpArray, bufferLength * sizeof(int));
+            checkCUDAError("cudaMalloc tmpArray failed!");
+
+            if (!isPower2Length) {
+                dim3 blocks = computeBlocksPerGrid(n - (1 << ilog2(n)), blockSize);
+                kernZeroPadding << <blocks, blockSize >> > (n, ilog2(n), tmpArray);
+                checkCUDAError("kernZeroPadding failed!");
+                cudaDeviceSynchronize();
+            }
+
+            cudaMemcpy(tmpArray, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            // TODO
+            for (int d = 0; d < ilog2ceil(n); ++d) {
+                dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize);
+                kernUpSweep<<<blocks, blockSize>>>(bufferLength, d, tmpArray);
+                checkCUDAError("kernUpSweep failed!");
+                cudaDeviceSynchronize();
+            }
+
+            bool flag = 1;
+            for (int d = ilog2ceil(n) - 1; d >= 0; --d) {
+                dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize);
+                kernDownSweep<<<blocks, blockSize>>>(bufferLength, d, flag, tmpArray);
+                flag = 0;
+                checkCUDAError("kernDownSweep failed!");
+                cudaDeviceSynchronize();
+            }
+
+            cudaMemcpy(odata, tmpArray, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(tmpArray);
+        }
+
+
         /**
          * Performs stream compaction on idata, storing the result into odata.
          * All zeroes are discarded.
@@ -111,11 +154,54 @@ namespace StreamCompaction {
          * @param idata  The array of elements to compact.
          * @returns      The number of elements remaining after compaction.
          */
-        int compact(int n, int *odata, const int *idata) {
+        int compact(int n, int* odata, const int* idata) {
+            int blockSize = 128;
+            dim3 blocks{ (unsigned int)(n + blockSize - 1) / blockSize };
+
+            int* dev_buffer1;
+            int* dev_buffer2;
+            int* dev_boolArray;
+            int* dev_indices;
+            cudaMalloc((void**)&dev_boolArray, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_boolArray failed!");
+            cudaMalloc((void**)&dev_indices,   n * sizeof(int));
+            cudaDeviceSynchronize();
+            checkCUDAError("cudaMalloc dev_indices failed!");
+            cudaMalloc((void**)&dev_buffer1,   n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_buffer1 failed!");
+            cudaMalloc((void**)&dev_buffer2,   n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_buffer2 failed!");
+
+            cudaMemcpy(dev_buffer1, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
             timer().startGpuTimer();
+            cudaDeviceSynchronize();
+            checkCUDAError("timer failed!");
             // TODO
+            
+
+            StreamCompaction::Common::kernMapToBoolean<<<blocks, blockSize>>>(n, dev_boolArray, dev_buffer1);
+            cudaDeviceSynchronize();
+            checkCUDAError("kernMapToBoolean failed!");
+            
+            scanUntimed(n, dev_indices, dev_boolArray);
+
+            StreamCompaction::Common::kernScatter<<<blocks, blockSize>>>(n, dev_buffer2, dev_buffer1, dev_boolArray, dev_indices);
+            checkCUDAError("kernScatter failed!");
+            cudaDeviceSynchronize();
+            
+            cudaMemcpy(odata, dev_buffer2, n * sizeof(int), cudaMemcpyDeviceToHost);
+            
+            int numElem;
+            cudaMemcpy(&numElem, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+            
             timer().endGpuTimer();
-            return -1;
+
+            cudaFree(dev_boolArray);
+            cudaFree(dev_indices);
+            cudaFree(dev_buffer1);
+            cudaFree(dev_buffer2);
+            return numElem;
         }
     }
 }
\ No newline at end of file
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..463ca2e 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,24 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int* dev_buffer;
+            thrust::device_ptr<int> dev_thrustBuffer;
+            cudaMalloc((void**)&dev_buffer, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_buffer failed!");
+            dev_thrustBuffer = thrust::device_ptr<int>(dev_buffer);
+
+            cudaMemcpy(dev_buffer, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
             timer().startGpuTimer();
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dev_thrustBuffer, dev_thrustBuffer + n, dev_thrustBuffer);
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_buffer, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_buffer);
         }
     }
 }

From 4a13a709603152dfad968456b3c16fa5e2308e76 Mon Sep 17 00:00:00 2001
From: Kau <dominikk@kite.upenn.edu>
Date: Tue, 17 Sep 2024 11:03:58 -0400
Subject: [PATCH 05/10] Bug Fixes and Code Cleanup

---
 stream_compaction/cpu.cu       | 13 ++---
 stream_compaction/cpu.h        |  2 +-
 stream_compaction/efficient.cu | 90 +++++++++-------------------------
 stream_compaction/efficient.h  |  2 +-
 4 files changed, 30 insertions(+), 77 deletions(-)

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 1cbd725..956cced 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -17,15 +17,15 @@ namespace StreamCompaction {
          * For performance analysis, this is supposed to be a simple for loop.
          * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
          */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startCpuTimer();
+        void scan(int n, int *odata, const int *idata, bool timed) {
+            if (timed) timer().startCpuTimer();
             // TODO
             int partialSum = 0;
             for (int i = 0; i < n; ++i) {
                 odata[i] = partialSum;
                 partialSum += idata[i];
             }
-            timer().endCpuTimer();
+            if (timed) timer().endCpuTimer();
         }
 
         /**
@@ -64,12 +64,7 @@ namespace StreamCompaction {
                 buffer[i] = (idata[i] != 0);
             }
 
-            // Scan (calling timed function leads to error)
-            int partialSum = 0;
-            for (int i = 0; i < n; ++i) {
-                odata[i] = partialSum;
-                partialSum += buffer[i];
-            }
+            scan(n, odata, buffer, false); // untimed prefix sum of the 0/1 mask, not the raw input
 
             int numElements = scatter(n, odata, buffer, idata);
 
diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h
index 873c047..f2f8c14 100644
--- a/stream_compaction/cpu.h
+++ b/stream_compaction/cpu.h
@@ -6,7 +6,7 @@ namespace StreamCompaction {
     namespace CPU {
         StreamCompaction::Common::PerformanceTimer& timer();
 
-        void scan(int n, int *odata, const int *idata);
+        void scan(int n, int *odata, const int *idata, bool timed = 1);
 
         int compactWithoutScan(int n, int *odata, const int *idata);
 
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2760ba8..cbd3eb3 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -25,16 +25,12 @@ namespace StreamCompaction {
             buffer[index + pow2todp1 - 1] += buffer[index + pow2tod - 1];
         }
 
-        __global__ void kernDownSweep(int n, int d, int s, int* buffer) {
+        __global__ void kernDownSweep(int n, int d, int* buffer) {
             int index = threadIdx.x + blockIdx.x * blockDim.x;
 
             int pow2tod = 1 << d;
             int pow2todp1 = 2 * pow2tod;
 
-            if (s) {
-                buffer[pow2todp1 - 1] = 0;
-            }
-
             if (index > n / pow2todp1 - 1) return;
             index *= pow2todp1;
 
@@ -58,93 +54,52 @@ namespace StreamCompaction {
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
-        void scan(int n, int *odata, const int *idata) {
-            int blockSize = 128;
-
-            bool isPower2Length = (n == (1 << ilog2(n)));
-
-            int bufferLength = (isPower2Length) ? n : 1 << ilog2ceil(n);
-
-            int* tmpArray;
-            cudaMalloc((void**)&tmpArray, bufferLength * sizeof(int));
-            checkCUDAError("cudaMalloc tmpArray failed!");
-
-            if (!isPower2Length) {
-                dim3 blocks = computeBlocksPerGrid(n - (1 << ilog2(n)), blockSize);
-                kernZeroPadding<<<blocks, blockSize>>>(n, ilog2(n), tmpArray);
-                checkCUDAError("kernZeroPadding failed!");
-                cudaDeviceSynchronize();
-            }
-
-            cudaMemcpy(tmpArray, idata, n * sizeof(int), cudaMemcpyHostToDevice);
-
-            timer().startGpuTimer();
-            // TODO
-            for (int d = 0; d < ilog2ceil(n); ++d) {
-                dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize);
-                kernUpSweep<<<blocks, blockSize>>>(bufferLength, d, tmpArray);
-                checkCUDAError("kernUpSweep failed!");
-                cudaDeviceSynchronize();
-            }
-            
-            bool flag = 1;
-            for (int d = ilog2ceil(n) - 1; d >= 0; --d) {
-                dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize);
-                kernDownSweep<<<blocks, blockSize>>>(bufferLength, d, flag, tmpArray);
-                flag = 0;
-                checkCUDAError("kernDownSweep failed!");
-                cudaDeviceSynchronize();
-            }
-            timer().endGpuTimer();
-
-            cudaMemcpy(odata, tmpArray, n * sizeof(int), cudaMemcpyDeviceToHost);
-
-            cudaFree(tmpArray);
-        }
-
-        void scanUntimed(int n, int* odata, const int* idata) {
+        void scan(int n, int *odata, const int *idata, bool timed) {
             int blockSize = 128;
 
             bool isPower2Length = (n == (1 << ilog2(n)));
 
             int bufferLength = (isPower2Length) ? n : 1 << ilog2ceil(n);
 
-            int* tmpArray;
-            cudaMalloc((void**)&tmpArray, bufferLength * sizeof(int));
+            int* dev_tmpArray;
+            cudaMalloc((void**)&dev_tmpArray, bufferLength * sizeof(int));
             checkCUDAError("cudaMalloc tmpArray failed!");
 
             if (!isPower2Length) {
                 dim3 blocks = computeBlocksPerGrid(n - (1 << ilog2(n)), blockSize);
-                kernZeroPadding << <blocks, blockSize >> > (n, ilog2(n), tmpArray);
+                kernZeroPadding<<<blocks, blockSize>>>(n, ilog2(n), dev_tmpArray);
                 checkCUDAError("kernZeroPadding failed!");
                 cudaDeviceSynchronize();
             }
 
-            cudaMemcpy(tmpArray, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            cudaMemcpy(dev_tmpArray, idata, n * sizeof(int), cudaMemcpyHostToDevice);
 
+            if (timed) timer().startGpuTimer();
             // TODO
             for (int d = 0; d < ilog2ceil(n); ++d) {
                 dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize);
-                kernUpSweep<<<blocks, blockSize>>>(bufferLength, d, tmpArray);
+                kernUpSweep<<<blocks, blockSize>>>(bufferLength, d, dev_tmpArray);
                 checkCUDAError("kernUpSweep failed!");
                 cudaDeviceSynchronize();
             }
 
-            bool flag = 1;
+            int zero = 0;
+            cudaMemcpy(&dev_tmpArray[bufferLength - 1], &zero, sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy zero failed!");
+            
             for (int d = ilog2ceil(n) - 1; d >= 0; --d) {
                 dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize);
-                kernDownSweep<<<blocks, blockSize>>>(bufferLength, d, flag, tmpArray);
-                flag = 0;
+                kernDownSweep<<<blocks, blockSize>>>(bufferLength, d, dev_tmpArray);
                 checkCUDAError("kernDownSweep failed!");
                 cudaDeviceSynchronize();
             }
+            if (timed) timer().endGpuTimer();
 
-            cudaMemcpy(odata, tmpArray, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(odata, dev_tmpArray, n * sizeof(int), cudaMemcpyDeviceToHost);
 
-            cudaFree(tmpArray);
+            cudaFree(dev_tmpArray);
         }
 
-
         /**
          * Performs stream compaction on idata, storing the result into odata.
          * All zeroes are discarded.
@@ -178,13 +133,12 @@ namespace StreamCompaction {
             cudaDeviceSynchronize();
             checkCUDAError("timer failed!");
             // TODO
-            
 
             StreamCompaction::Common::kernMapToBoolean<<<blocks, blockSize>>>(n, dev_boolArray, dev_buffer1);
             cudaDeviceSynchronize();
             checkCUDAError("kernMapToBoolean failed!");
             
-            scanUntimed(n, dev_indices, dev_boolArray);
+            scan(n, dev_indices, dev_boolArray, 0);
 
             StreamCompaction::Common::kernScatter<<<blocks, blockSize>>>(n, dev_buffer2, dev_buffer1, dev_boolArray, dev_indices);
             checkCUDAError("kernScatter failed!");
@@ -192,8 +146,12 @@ namespace StreamCompaction {
             
             cudaMemcpy(odata, dev_buffer2, n * sizeof(int), cudaMemcpyDeviceToHost);
             
-            int numElem;
-            cudaMemcpy(&numElem, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+            // Index that last element in idata would have, if it was valid
+            int lastIndex;
+            cudaMemcpy(&lastIndex, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+            // Check if last element is valid
+            int lastBool;
+            cudaMemcpy(&lastBool, &dev_boolArray[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
             
             timer().endGpuTimer();
 
@@ -201,7 +159,7 @@ namespace StreamCompaction {
             cudaFree(dev_indices);
             cudaFree(dev_buffer1);
             cudaFree(dev_buffer2);
-            return numElem;
+            return (lastBool) ? lastIndex + 1 : lastIndex;
         }
     }
 }
\ No newline at end of file
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4f..a4455cc 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -6,7 +6,7 @@ namespace StreamCompaction {
     namespace Efficient {
         StreamCompaction::Common::PerformanceTimer& timer();
 
-        void scan(int n, int *odata, const int *idata);
+        void scan(int n, int *odata, const int *idata, bool timed = 1);
 
         int compact(int n, int *odata, const int *idata);
     }

From ce79167c04a539c4146f87b0045829e69192e64c Mon Sep 17 00:00:00 2001
From: Kau <dominikk@kite.upenn.edu>
Date: Wed, 18 Sep 2024 17:35:05 -0400
Subject: [PATCH 06/10] Using CudaMemset

---
 stream_compaction/efficient.cu | 29 +++++++++--------------------
 stream_compaction/naive.cu     | 10 +++++-----
 2 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index cbd3eb3..9c84ae3 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -13,10 +13,9 @@ namespace StreamCompaction {
             return timer;
         }
 
-        __global__ void kernUpSweep(int n, int d, int* buffer) {
+        __global__ void kernUpSweep(int n, int pow2tod, int* buffer) {
             int index = threadIdx.x + blockIdx.x * blockDim.x;
    
-            int pow2tod = 1 << d;
             int pow2todp1 = 2 * pow2tod;
 
             if (index > n / pow2todp1 - 1) return;
@@ -25,10 +24,9 @@ namespace StreamCompaction {
             buffer[index + pow2todp1 - 1] += buffer[index + pow2tod - 1];
         }
 
-        __global__ void kernDownSweep(int n, int d, int* buffer) {
+        __global__ void kernDownSweep(int n, int pow2tod, int* buffer) {
             int index = threadIdx.x + blockIdx.x * blockDim.x;
 
-            int pow2tod = 1 << d;
             int pow2todp1 = 2 * pow2tod;
 
             if (index > n / pow2todp1 - 1) return;
@@ -55,7 +53,7 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata, bool timed) {
-            int blockSize = 128;
+            int blockSize = 64;
 
             bool isPower2Length = (n == (1 << ilog2(n)));
 
@@ -66,10 +64,7 @@ namespace StreamCompaction {
             checkCUDAError("cudaMalloc tmpArray failed!");
 
             if (!isPower2Length) {
-                dim3 blocks = computeBlocksPerGrid(n - (1 << ilog2(n)), blockSize);
-                kernZeroPadding<<<blocks, blockSize>>>(n, ilog2(n), dev_tmpArray);
-                checkCUDAError("kernZeroPadding failed!");
-                cudaDeviceSynchronize();
+                cudaMemset(dev_tmpArray + n, 0, (bufferLength - n) * sizeof(int));
             }
 
             cudaMemcpy(dev_tmpArray, idata, n * sizeof(int), cudaMemcpyHostToDevice);
@@ -78,18 +73,16 @@ namespace StreamCompaction {
             // TODO
             for (int d = 0; d < ilog2ceil(n); ++d) {
                 dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize);
-                kernUpSweep<<<blocks, blockSize>>>(bufferLength, d, dev_tmpArray);
+                kernUpSweep<<<blocks, blockSize>>>(bufferLength, 1 << d, dev_tmpArray);
                 checkCUDAError("kernUpSweep failed!");
                 cudaDeviceSynchronize();
             }
 
-            int zero = 0;
-            cudaMemcpy(&dev_tmpArray[bufferLength - 1], &zero, sizeof(int), cudaMemcpyHostToDevice);
-            checkCUDAError("cudaMemcpy zero failed!");
+            cudaMemset(dev_tmpArray + bufferLength - 1, 0, sizeof(int));
             
             for (int d = ilog2ceil(n) - 1; d >= 0; --d) {
                 dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize);
-                kernDownSweep<<<blocks, blockSize>>>(bufferLength, d, dev_tmpArray);
+                kernDownSweep<<<blocks, blockSize>>>(bufferLength, 1 << d, dev_tmpArray);
                 checkCUDAError("kernDownSweep failed!");
                 cudaDeviceSynchronize();
             }
@@ -120,7 +113,6 @@ namespace StreamCompaction {
             cudaMalloc((void**)&dev_boolArray, n * sizeof(int));
             checkCUDAError("cudaMalloc dev_boolArray failed!");
             cudaMalloc((void**)&dev_indices,   n * sizeof(int));
-            cudaDeviceSynchronize();
             checkCUDAError("cudaMalloc dev_indices failed!");
             cudaMalloc((void**)&dev_buffer1,   n * sizeof(int));
             checkCUDAError("cudaMalloc dev_buffer1 failed!");
@@ -130,9 +122,6 @@ namespace StreamCompaction {
             cudaMemcpy(dev_buffer1, idata, n * sizeof(int), cudaMemcpyHostToDevice);
 
             timer().startGpuTimer();
-            cudaDeviceSynchronize();
-            checkCUDAError("timer failed!");
-            // TODO
 
             StreamCompaction::Common::kernMapToBoolean<<<blocks, blockSize>>>(n, dev_boolArray, dev_buffer1);
             cudaDeviceSynchronize();
@@ -148,10 +137,10 @@ namespace StreamCompaction {
             
             // Index that last element in idata would have, if it was valid
             int lastIndex;
-            cudaMemcpy(&lastIndex, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&lastIndex, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
             // Check if last element is valid
             int lastBool;
-            cudaMemcpy(&lastBool, &dev_boolArray[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&lastBool, dev_boolArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
             
             timer().endGpuTimer();
 
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index eb2ba1b..f811076 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -13,7 +13,7 @@ namespace StreamCompaction {
             return timer;
         }
         // TODO: __global__
-        __global__ void handleNonPower(int n, int d, int* buffer) {
+        __global__ void kernHandleNonPower(int n, int d, int* buffer) {
             int index = threadIdx.x + blockIdx.x * blockDim.x;
             int pow2tod = 1 << d;
 
@@ -22,7 +22,7 @@ namespace StreamCompaction {
             buffer[pow2tod + index] += buffer[index];
         }
 
-        __global__ void naiveScanStep(int n, int d, const int* readBuffer, int* writeBuffer) {
+        __global__ void kernNaiveScanStep(int n, int d, const int* readBuffer, int* writeBuffer) {
             // compute thread index
             int index = threadIdx.x + blockIdx.x * blockDim.x;
             if (index >= n) return;
@@ -58,9 +58,9 @@ namespace StreamCompaction {
             StreamCompaction::Common::shiftArrayElements<<<fullBlocksPerGrid, blockSize>>>(n, 1, dev_buffer2, dev_buffer1);
             checkCUDAError("shiftArrayElements failed!");
             cudaDeviceSynchronize();
-            // TODO
+
             for (int d = 0; d < ilog2(n); ++d) {
-                naiveScanStep<<<fullBlocksPerGrid, blockSize>>>(n, d, dev_buffer1, dev_buffer2);
+                kernNaiveScanStep <<<fullBlocksPerGrid, blockSize>>>(n, d, dev_buffer1, dev_buffer2);
                 checkCUDAError("naiveScanStep failed!");
                 cudaDeviceSynchronize();
 
@@ -69,7 +69,7 @@ namespace StreamCompaction {
             // perform last step 
             if ((1 << ilog2(n)) != n) {
                 fullBlocksPerGrid.x = (n - (1 << ilog2(n)) + blockSize - 1) / blockSize;
-                handleNonPower<<<fullBlocksPerGrid, blockSize>>>(n, ilog2(n), dev_buffer1);
+                kernHandleNonPower<<<fullBlocksPerGrid, blockSize>>>(n, ilog2(n), dev_buffer1);
                 checkCUDAError("handleNonPower failed!");
                 cudaDeviceSynchronize();
             }

From 8109858318780483251230b4a1b95f87b010fb00 Mon Sep 17 00:00:00 2001
From: DomIno0o <51756125+DomIno0o@users.noreply.github.com>
Date: Wed, 18 Sep 2024 23:09:31 -0400
Subject: [PATCH 07/10] Add files via upload

---
 img/performance_compact.svg | 1379 +++++++++++++++++++++++++++++++++++
 img/performance_nonpow2.svg | 1342 ++++++++++++++++++++++++++++++++++
 img/performance_pow2.svg    | 1328 +++++++++++++++++++++++++++++++++
 3 files changed, 4049 insertions(+)
 create mode 100644 img/performance_compact.svg
 create mode 100644 img/performance_nonpow2.svg
 create mode 100644 img/performance_pow2.svg

diff --git a/img/performance_compact.svg b/img/performance_compact.svg
new file mode 100644
index 0000000..28efc4a
--- /dev/null
+++ b/img/performance_compact.svg
@@ -0,0 +1,1379 @@
+<?xml version="1.0" encoding="utf-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+  "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns:xlink="http://www.w3.org/1999/xlink" width="452.764166pt" height="334.28425pt" viewBox="0 0 452.764166 334.28425" xmlns="http://www.w3.org/2000/svg" version="1.1">
+ <metadata>
+  <rdf:RDF xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+   <cc:Work>
+    <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage"/>
+    <dc:date>2024-09-18T22:27:00.267128</dc:date>
+    <dc:format>image/svg+xml</dc:format>
+    <dc:creator>
+     <cc:Agent>
+      <dc:title>Matplotlib v3.9.2, https://matplotlib.org/</dc:title>
+     </cc:Agent>
+    </dc:creator>
+   </cc:Work>
+  </rdf:RDF>
+ </metadata>
+ <defs>
+  <style type="text/css">*{stroke-linejoin: round; stroke-linecap: butt}</style>
+ </defs>
+ <g id="figure_1">
+  <g id="patch_1">
+   <path d="M 0 334.28425 
+L 452.764166 334.28425 
+L 452.764166 0 
+L 0 0 
+z
+" style="fill: #ffffff"/>
+  </g>
+  <g id="axes_1">
+   <g id="patch_2">
+    <path d="M 45.478125 296.728 
+L 445.564166 296.728 
+L 445.564166 7.2 
+L 45.478125 7.2 
+z
+" style="fill: #ffffff"/>
+   </g>
+   <g id="matplotlib.axis_1">
+    <g id="xtick_1">
+     <g id="line2d_1">
+      <path d="M 60.10821 296.728 
+L 60.10821 7.2 
+" clip-path="url(#p604d34fa50)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_2">
+      <defs>
+       <path id="m11c805b433" d="M 0 0 
+L 0 3.5 
+" style="stroke: #000000; stroke-width: 0.8"/>
+      </defs>
+      <g>
+       <use xlink:href="#m11c805b433" x="60.10821" y="296.728" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_1">
+      <!-- $\mathdefault{10^{6}}$ -->
+      <g transform="translate(51.30821 311.326437) scale(0.1 -0.1)">
+       <defs>
+        <path id="DejaVuSans-31" d="M 794 531 
+L 1825 531 
+L 1825 4091 
+L 703 3866 
+L 703 4441 
+L 1819 4666 
+L 2450 4666 
+L 2450 531 
+L 3481 531 
+L 3481 0 
+L 794 0 
+L 794 531 
+z
+" transform="scale(0.015625)"/>
+        <path id="DejaVuSans-30" d="M 2034 4250 
+Q 1547 4250 1301 3770 
+Q 1056 3291 1056 2328 
+Q 1056 1369 1301 889 
+Q 1547 409 2034 409 
+Q 2525 409 2770 889 
+Q 3016 1369 3016 2328 
+Q 3016 3291 2770 3770 
+Q 2525 4250 2034 4250 
+z
+M 2034 4750 
+Q 2819 4750 3233 4129 
+Q 3647 3509 3647 2328 
+Q 3647 1150 3233 529 
+Q 2819 -91 2034 -91 
+Q 1250 -91 836 529 
+Q 422 1150 422 2328 
+Q 422 3509 836 4129 
+Q 1250 4750 2034 4750 
+z
+" transform="scale(0.015625)"/>
+        <path id="DejaVuSans-36" d="M 2113 2584 
+Q 1688 2584 1439 2293 
+Q 1191 2003 1191 1497 
+Q 1191 994 1439 701 
+Q 1688 409 2113 409 
+Q 2538 409 2786 701 
+Q 3034 994 3034 1497 
+Q 3034 2003 2786 2293 
+Q 2538 2584 2113 2584 
+z
+M 3366 4563 
+L 3366 3988 
+Q 3128 4100 2886 4159 
+Q 2644 4219 2406 4219 
+Q 1781 4219 1451 3797 
+Q 1122 3375 1075 2522 
+Q 1259 2794 1537 2939 
+Q 1816 3084 2150 3084 
+Q 2853 3084 3261 2657 
+Q 3669 2231 3669 1497 
+Q 3669 778 3244 343 
+Q 2819 -91 2113 -91 
+Q 1303 -91 875 529 
+Q 447 1150 447 2328 
+Q 447 3434 972 4092 
+Q 1497 4750 2381 4750 
+Q 2619 4750 2861 4703 
+Q 3103 4656 3366 4563 
+z
+" transform="scale(0.015625)"/>
+       </defs>
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.765625)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.765625)"/>
+       <use xlink:href="#DejaVuSans-36" transform="translate(128.203125 39.046875) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_2">
+     <g id="line2d_3">
+      <path d="M 232.713023 296.728 
+L 232.713023 7.2 
+" clip-path="url(#p604d34fa50)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_4">
+      <g>
+       <use xlink:href="#m11c805b433" x="232.713023" y="296.728" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_2">
+      <!-- $\mathdefault{10^{7}}$ -->
+      <g transform="translate(223.913023 311.326437) scale(0.1 -0.1)">
+       <defs>
+        <path id="DejaVuSans-37" d="M 525 4666 
+L 3525 4666 
+L 3525 4397 
+L 1831 0 
+L 1172 0 
+L 2766 4134 
+L 525 4134 
+L 525 4666 
+z
+" transform="scale(0.015625)"/>
+       </defs>
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.684375)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.684375)"/>
+       <use xlink:href="#DejaVuSans-37" transform="translate(128.203125 38.965625) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_3">
+     <g id="line2d_5">
+      <path d="M 405.317836 296.728 
+L 405.317836 7.2 
+" clip-path="url(#p604d34fa50)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_6">
+      <g>
+       <use xlink:href="#m11c805b433" x="405.317836" y="296.728" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_3">
+      <!-- $\mathdefault{10^{8}}$ -->
+      <g transform="translate(396.517836 311.326437) scale(0.1 -0.1)">
+       <defs>
+        <path id="DejaVuSans-38" d="M 2034 2216 
+Q 1584 2216 1326 1975 
+Q 1069 1734 1069 1313 
+Q 1069 891 1326 650 
+Q 1584 409 2034 409 
+Q 2484 409 2743 651 
+Q 3003 894 3003 1313 
+Q 3003 1734 2745 1975 
+Q 2488 2216 2034 2216 
+z
+M 1403 2484 
+Q 997 2584 770 2862 
+Q 544 3141 544 3541 
+Q 544 4100 942 4425 
+Q 1341 4750 2034 4750 
+Q 2731 4750 3128 4425 
+Q 3525 4100 3525 3541 
+Q 3525 3141 3298 2862 
+Q 3072 2584 2669 2484 
+Q 3125 2378 3379 2068 
+Q 3634 1759 3634 1313 
+Q 3634 634 3220 271 
+Q 2806 -91 2034 -91 
+Q 1263 -91 848 271 
+Q 434 634 434 1313 
+Q 434 1759 690 2068 
+Q 947 2378 1403 2484 
+z
+M 1172 3481 
+Q 1172 3119 1398 2916 
+Q 1625 2713 2034 2713 
+Q 2441 2713 2670 2916 
+Q 2900 3119 2900 3481 
+Q 2900 3844 2670 4047 
+Q 2441 4250 2034 4250 
+Q 1625 4250 1398 4047 
+Q 1172 3844 1172 3481 
+z
+" transform="scale(0.015625)"/>
+       </defs>
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.765625)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.765625)"/>
+       <use xlink:href="#DejaVuSans-38" transform="translate(128.203125 39.046875) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_4">
+     <g id="line2d_7">
+      <defs>
+       <path id="mfcd7f37bd2" d="M 0 0 
+L 0 2 
+" style="stroke: #000000; stroke-width: 0.6"/>
+      </defs>
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="52.210247" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_5">
+     <g id="line2d_8">
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="112.067436" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_6">
+     <g id="line2d_9">
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="142.461635" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_7">
+     <g id="line2d_10">
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="164.026662" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_8">
+     <g id="line2d_11">
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="180.753797" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_9">
+     <g id="line2d_12">
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="194.420861" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_10">
+     <g id="line2d_13">
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="205.976199" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_11">
+     <g id="line2d_14">
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="215.985888" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_12">
+     <g id="line2d_15">
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="224.81506" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_13">
+     <g id="line2d_16">
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="284.672249" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_14">
+     <g id="line2d_17">
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="315.066448" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_15">
+     <g id="line2d_18">
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="336.631475" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_16">
+     <g id="line2d_19">
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="353.35861" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_17">
+     <g id="line2d_20">
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="367.025674" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_18">
+     <g id="line2d_21">
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="378.581012" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_19">
+     <g id="line2d_22">
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="388.590701" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_20">
+     <g id="line2d_23">
+      <g>
+       <use xlink:href="#mfcd7f37bd2" x="397.419873" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="text_4">
+     <!-- Array length [-] -->
+     <g transform="translate(207.232864 325.004562) scale(0.1 -0.1)">
+      <defs>
+       <path id="DejaVuSans-41" d="M 2188 4044 
+L 1331 1722 
+L 3047 1722 
+L 2188 4044 
+z
+M 1831 4666 
+L 2547 4666 
+L 4325 0 
+L 3669 0 
+L 3244 1197 
+L 1141 1197 
+L 716 0 
+L 50 0 
+L 1831 4666 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-72" d="M 2631 2963 
+Q 2534 3019 2420 3045 
+Q 2306 3072 2169 3072 
+Q 1681 3072 1420 2755 
+Q 1159 2438 1159 1844 
+L 1159 0 
+L 581 0 
+L 581 3500 
+L 1159 3500 
+L 1159 2956 
+Q 1341 3275 1631 3429 
+Q 1922 3584 2338 3584 
+Q 2397 3584 2469 3576 
+Q 2541 3569 2628 3553 
+L 2631 2963 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-61" d="M 2194 1759 
+Q 1497 1759 1228 1600 
+Q 959 1441 959 1056 
+Q 959 750 1161 570 
+Q 1363 391 1709 391 
+Q 2188 391 2477 730 
+Q 2766 1069 2766 1631 
+L 2766 1759 
+L 2194 1759 
+z
+M 3341 1997 
+L 3341 0 
+L 2766 0 
+L 2766 531 
+Q 2569 213 2275 61 
+Q 1981 -91 1556 -91 
+Q 1019 -91 701 211 
+Q 384 513 384 1019 
+Q 384 1609 779 1909 
+Q 1175 2209 1959 2209 
+L 2766 2209 
+L 2766 2266 
+Q 2766 2663 2505 2880 
+Q 2244 3097 1772 3097 
+Q 1472 3097 1187 3025 
+Q 903 2953 641 2809 
+L 641 3341 
+Q 956 3463 1253 3523 
+Q 1550 3584 1831 3584 
+Q 2591 3584 2966 3190 
+Q 3341 2797 3341 1997 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-79" d="M 2059 -325 
+Q 1816 -950 1584 -1140 
+Q 1353 -1331 966 -1331 
+L 506 -1331 
+L 506 -850 
+L 844 -850 
+Q 1081 -850 1212 -737 
+Q 1344 -625 1503 -206 
+L 1606 56 
+L 191 3500 
+L 800 3500 
+L 1894 763 
+L 2988 3500 
+L 3597 3500 
+L 2059 -325 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-20" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-6c" d="M 603 4863 
+L 1178 4863 
+L 1178 0 
+L 603 0 
+L 603 4863 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-65" d="M 3597 1894 
+L 3597 1613 
+L 953 1613 
+Q 991 1019 1311 708 
+Q 1631 397 2203 397 
+Q 2534 397 2845 478 
+Q 3156 559 3463 722 
+L 3463 178 
+Q 3153 47 2828 -22 
+Q 2503 -91 2169 -91 
+Q 1331 -91 842 396 
+Q 353 884 353 1716 
+Q 353 2575 817 3079 
+Q 1281 3584 2069 3584 
+Q 2775 3584 3186 3129 
+Q 3597 2675 3597 1894 
+z
+M 3022 2063 
+Q 3016 2534 2758 2815 
+Q 2500 3097 2075 3097 
+Q 1594 3097 1305 2825 
+Q 1016 2553 972 2059 
+L 3022 2063 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-6e" d="M 3513 2113 
+L 3513 0 
+L 2938 0 
+L 2938 2094 
+Q 2938 2591 2744 2837 
+Q 2550 3084 2163 3084 
+Q 1697 3084 1428 2787 
+Q 1159 2491 1159 1978 
+L 1159 0 
+L 581 0 
+L 581 3500 
+L 1159 3500 
+L 1159 2956 
+Q 1366 3272 1645 3428 
+Q 1925 3584 2291 3584 
+Q 2894 3584 3203 3211 
+Q 3513 2838 3513 2113 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-67" d="M 2906 1791 
+Q 2906 2416 2648 2759 
+Q 2391 3103 1925 3103 
+Q 1463 3103 1205 2759 
+Q 947 2416 947 1791 
+Q 947 1169 1205 825 
+Q 1463 481 1925 481 
+Q 2391 481 2648 825 
+Q 2906 1169 2906 1791 
+z
+M 3481 434 
+Q 3481 -459 3084 -895 
+Q 2688 -1331 1869 -1331 
+Q 1566 -1331 1297 -1286 
+Q 1028 -1241 775 -1147 
+L 775 -588 
+Q 1028 -725 1275 -790 
+Q 1522 -856 1778 -856 
+Q 2344 -856 2625 -561 
+Q 2906 -266 2906 331 
+L 2906 616 
+Q 2728 306 2450 153 
+Q 2172 0 1784 0 
+Q 1141 0 747 490 
+Q 353 981 353 1791 
+Q 353 2603 747 3093 
+Q 1141 3584 1784 3584 
+Q 2172 3584 2450 3431 
+Q 2728 3278 2906 2969 
+L 2906 3500 
+L 3481 3500 
+L 3481 434 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-74" d="M 1172 4494 
+L 1172 3500 
+L 2356 3500 
+L 2356 3053 
+L 1172 3053 
+L 1172 1153 
+Q 1172 725 1289 603 
+Q 1406 481 1766 481 
+L 2356 481 
+L 2356 0 
+L 1766 0 
+Q 1100 0 847 248 
+Q 594 497 594 1153 
+L 594 3053 
+L 172 3053 
+L 172 3500 
+L 594 3500 
+L 594 4494 
+L 1172 4494 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-68" d="M 3513 2113 
+L 3513 0 
+L 2938 0 
+L 2938 2094 
+Q 2938 2591 2744 2837 
+Q 2550 3084 2163 3084 
+Q 1697 3084 1428 2787 
+Q 1159 2491 1159 1978 
+L 1159 0 
+L 581 0 
+L 581 4863 
+L 1159 4863 
+L 1159 2956 
+Q 1366 3272 1645 3428 
+Q 1925 3584 2291 3584 
+Q 2894 3584 3203 3211 
+Q 3513 2838 3513 2113 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-5b" d="M 550 4863 
+L 1875 4863 
+L 1875 4416 
+L 1125 4416 
+L 1125 -397 
+L 1875 -397 
+L 1875 -844 
+L 550 -844 
+L 550 4863 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-2d" d="M 313 2009 
+L 1997 2009 
+L 1997 1497 
+L 313 1497 
+L 313 2009 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-5d" d="M 1947 4863 
+L 1947 -844 
+L 622 -844 
+L 622 -397 
+L 1369 -397 
+L 1369 4416 
+L 622 4416 
+L 622 4863 
+L 1947 4863 
+z
+" transform="scale(0.015625)"/>
+      </defs>
+      <use xlink:href="#DejaVuSans-41"/>
+      <use xlink:href="#DejaVuSans-72" x="68.408203"/>
+      <use xlink:href="#DejaVuSans-72" x="107.771484"/>
+      <use xlink:href="#DejaVuSans-61" x="148.884766"/>
+      <use xlink:href="#DejaVuSans-79" x="210.164062"/>
+      <use xlink:href="#DejaVuSans-20" x="269.34375"/>
+      <use xlink:href="#DejaVuSans-6c" x="301.130859"/>
+      <use xlink:href="#DejaVuSans-65" x="328.914062"/>
+      <use xlink:href="#DejaVuSans-6e" x="390.4375"/>
+      <use xlink:href="#DejaVuSans-67" x="453.816406"/>
+      <use xlink:href="#DejaVuSans-74" x="517.292969"/>
+      <use xlink:href="#DejaVuSans-68" x="556.501953"/>
+      <use xlink:href="#DejaVuSans-20" x="619.880859"/>
+      <use xlink:href="#DejaVuSans-5b" x="651.667969"/>
+      <use xlink:href="#DejaVuSans-2d" x="690.681641"/>
+      <use xlink:href="#DejaVuSans-5d" x="726.765625"/>
+     </g>
+    </g>
+   </g>
+   <g id="matplotlib.axis_2">
+    <g id="ytick_1">
+     <g id="line2d_24">
+      <path d="M 45.478125 200.80244 
+L 445.564166 200.80244 
+" clip-path="url(#p604d34fa50)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_25">
+      <defs>
+       <path id="m3d100fb314" d="M 0 0 
+L -3.5 0 
+" style="stroke: #000000; stroke-width: 0.8"/>
+      </defs>
+      <g>
+       <use xlink:href="#m3d100fb314" x="45.478125" y="200.80244" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_5">
+      <!-- $\mathdefault{10^{1}}$ -->
+      <g transform="translate(20.878125 204.601659) scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.684375)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.684375)"/>
+       <use xlink:href="#DejaVuSans-31" transform="translate(128.203125 38.965625) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_2">
+     <g id="line2d_26">
+      <path d="M 45.478125 75.915406 
+L 445.564166 75.915406 
+" clip-path="url(#p604d34fa50)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_27">
+      <g>
+       <use xlink:href="#m3d100fb314" x="45.478125" y="75.915406" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_6">
+      <!-- $\mathdefault{10^{2}}$ -->
+      <g transform="translate(20.878125 79.714625) scale(0.1 -0.1)">
+       <defs>
+        <path id="DejaVuSans-32" d="M 1228 531 
+L 3431 531 
+L 3431 0 
+L 469 0 
+L 469 531 
+Q 828 903 1448 1529 
+Q 2069 2156 2228 2338 
+Q 2531 2678 2651 2914 
+Q 2772 3150 2772 3378 
+Q 2772 3750 2511 3984 
+Q 2250 4219 1831 4219 
+Q 1534 4219 1204 4116 
+Q 875 4013 500 3803 
+L 500 4441 
+Q 881 4594 1212 4672 
+Q 1544 4750 1819 4750 
+Q 2544 4750 2975 4387 
+Q 3406 4025 3406 3419 
+Q 3406 3131 3298 2873 
+Q 3191 2616 2906 2266 
+Q 2828 2175 2409 1742 
+Q 1991 1309 1228 531 
+z
+" transform="scale(0.015625)"/>
+       </defs>
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.765625)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.765625)"/>
+       <use xlink:href="#DejaVuSans-32" transform="translate(128.203125 39.046875) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_3">
+     <g id="line2d_28">
+      <defs>
+       <path id="m2597e600ff" d="M 0 0 
+L -2 0 
+" style="stroke: #000000; stroke-width: 0.6"/>
+      </defs>
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="288.094731" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_4">
+     <g id="line2d_29">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="266.103216" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_5">
+     <g id="line2d_30">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="250.499988" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_6">
+     <g id="line2d_31">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="238.397184" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_7">
+     <g id="line2d_32">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="228.508473" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_8">
+     <g id="line2d_33">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="220.147687" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_9">
+     <g id="line2d_34">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="212.905245" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_10">
+     <g id="line2d_35">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="206.516958" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_11">
+     <g id="line2d_36">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="163.207697" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_12">
+     <g id="line2d_37">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="141.216182" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_13">
+     <g id="line2d_38">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="125.612953" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_14">
+     <g id="line2d_39">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="113.510149" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_15">
+     <g id="line2d_40">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="103.621438" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_16">
+     <g id="line2d_41">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="95.260652" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_17">
+     <g id="line2d_42">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="88.01821" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_18">
+     <g id="line2d_43">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="81.629923" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_19">
+     <g id="line2d_44">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="38.320662" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_20">
+     <g id="line2d_45">
+      <g>
+       <use xlink:href="#m2597e600ff" x="45.478125" y="16.329147" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="text_7">
+     <!-- Runtime [ms] -->
+     <g transform="translate(14.798438 185.812437) rotate(-90) scale(0.1 -0.1)">
+      <defs>
+       <path id="DejaVuSans-52" d="M 2841 2188 
+Q 3044 2119 3236 1894 
+Q 3428 1669 3622 1275 
+L 4263 0 
+L 3584 0 
+L 2988 1197 
+Q 2756 1666 2539 1819 
+Q 2322 1972 1947 1972 
+L 1259 1972 
+L 1259 0 
+L 628 0 
+L 628 4666 
+L 2053 4666 
+Q 2853 4666 3247 4331 
+Q 3641 3997 3641 3322 
+Q 3641 2881 3436 2590 
+Q 3231 2300 2841 2188 
+z
+M 1259 4147 
+L 1259 2491 
+L 2053 2491 
+Q 2509 2491 2742 2702 
+Q 2975 2913 2975 3322 
+Q 2975 3731 2742 3939 
+Q 2509 4147 2053 4147 
+L 1259 4147 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-75" d="M 544 1381 
+L 544 3500 
+L 1119 3500 
+L 1119 1403 
+Q 1119 906 1312 657 
+Q 1506 409 1894 409 
+Q 2359 409 2629 706 
+Q 2900 1003 2900 1516 
+L 2900 3500 
+L 3475 3500 
+L 3475 0 
+L 2900 0 
+L 2900 538 
+Q 2691 219 2414 64 
+Q 2138 -91 1772 -91 
+Q 1169 -91 856 284 
+Q 544 659 544 1381 
+z
+M 1991 3584 
+L 1991 3584 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-69" d="M 603 3500 
+L 1178 3500 
+L 1178 0 
+L 603 0 
+L 603 3500 
+z
+M 603 4863 
+L 1178 4863 
+L 1178 4134 
+L 603 4134 
+L 603 4863 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-6d" d="M 3328 2828 
+Q 3544 3216 3844 3400 
+Q 4144 3584 4550 3584 
+Q 5097 3584 5394 3201 
+Q 5691 2819 5691 2113 
+L 5691 0 
+L 5113 0 
+L 5113 2094 
+Q 5113 2597 4934 2840 
+Q 4756 3084 4391 3084 
+Q 3944 3084 3684 2787 
+Q 3425 2491 3425 1978 
+L 3425 0 
+L 2847 0 
+L 2847 2094 
+Q 2847 2600 2669 2842 
+Q 2491 3084 2119 3084 
+Q 1678 3084 1418 2786 
+Q 1159 2488 1159 1978 
+L 1159 0 
+L 581 0 
+L 581 3500 
+L 1159 3500 
+L 1159 2956 
+Q 1356 3278 1631 3431 
+Q 1906 3584 2284 3584 
+Q 2666 3584 2933 3390 
+Q 3200 3197 3328 2828 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-73" d="M 2834 3397 
+L 2834 2853 
+Q 2591 2978 2328 3040 
+Q 2066 3103 1784 3103 
+Q 1356 3103 1142 2972 
+Q 928 2841 928 2578 
+Q 928 2378 1081 2264 
+Q 1234 2150 1697 2047 
+L 1894 2003 
+Q 2506 1872 2764 1633 
+Q 3022 1394 3022 966 
+Q 3022 478 2636 193 
+Q 2250 -91 1575 -91 
+Q 1294 -91 989 -36 
+Q 684 19 347 128 
+L 347 722 
+Q 666 556 975 473 
+Q 1284 391 1588 391 
+Q 1994 391 2212 530 
+Q 2431 669 2431 922 
+Q 2431 1156 2273 1281 
+Q 2116 1406 1581 1522 
+L 1381 1569 
+Q 847 1681 609 1914 
+Q 372 2147 372 2553 
+Q 372 3047 722 3315 
+Q 1072 3584 1716 3584 
+Q 2034 3584 2315 3537 
+Q 2597 3491 2834 3397 
+z
+" transform="scale(0.015625)"/>
+      </defs>
+      <use xlink:href="#DejaVuSans-52"/>
+      <use xlink:href="#DejaVuSans-75" x="64.982422"/>
+      <use xlink:href="#DejaVuSans-6e" x="128.361328"/>
+      <use xlink:href="#DejaVuSans-74" x="191.740234"/>
+      <use xlink:href="#DejaVuSans-69" x="230.949219"/>
+      <use xlink:href="#DejaVuSans-6d" x="258.732422"/>
+      <use xlink:href="#DejaVuSans-65" x="356.144531"/>
+      <use xlink:href="#DejaVuSans-20" x="417.667969"/>
+      <use xlink:href="#DejaVuSans-5b" x="449.455078"/>
+      <use xlink:href="#DejaVuSans-6d" x="488.46875"/>
+      <use xlink:href="#DejaVuSans-73" x="585.880859"/>
+      <use xlink:href="#DejaVuSans-5d" x="637.980469"/>
+     </g>
+    </g>
+   </g>
+   <g id="line2d_46">
+    <path d="M 63.663854 283.567636 
+L 115.62308 240.704012 
+L 167.582306 205.633488 
+L 219.541532 170.828642 
+L 271.500758 132.49029 
+L 323.459985 93.023034 
+L 375.419211 57.606667 
+L 427.378437 20.360364 
+" clip-path="url(#p604d34fa50)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square"/>
+   </g>
+   <g id="line2d_47">
+    <path d="M 63.663854 283.39826 
+L 115.62308 241.190636 
+L 167.582306 205.103069 
+L 219.541532 170.47804 
+L 271.500758 131.430922 
+L 323.459985 95.800282 
+L 375.419211 57.651189 
+L 427.378437 20.37575 
+" clip-path="url(#p604d34fa50)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square"/>
+   </g>
+   <g id="line2d_48">
+    <path d="M 63.663854 257.361011 
+L 115.62308 224.765876 
+L 167.582306 197.69838 
+L 219.541532 168.305018 
+L 271.500758 141.140483 
+L 323.459985 106.587812 
+L 375.419211 70.300615 
+L 427.378437 32.245688 
+" clip-path="url(#p604d34fa50)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square"/>
+   </g>
+   <g id="line2d_49">
+    <path d="M 63.663854 254.283766 
+L 115.62308 239.369434 
+L 167.582306 198.580364 
+L 219.541532 164.018617 
+L 271.500758 141.874997 
+L 323.459985 106.450688 
+L 375.419211 69.678714 
+L 427.378437 33.506438 
+" clip-path="url(#p604d34fa50)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square"/>
+   </g>
+   <g id="patch_3">
+    <path d="M 45.478125 296.728 
+L 45.478125 7.2 
+" style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/>
+   </g>
+   <g id="patch_4">
+    <path d="M 445.564166 296.728 
+L 445.564166 7.2 
+" style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/>
+   </g>
+   <g id="patch_5">
+    <path d="M 45.478125 296.728 
+L 445.564166 296.728 
+" style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/>
+   </g>
+   <g id="patch_6">
+    <path d="M 45.478125 7.2 
+L 445.564166 7.2 
+" style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/>
+   </g>
+   <g id="legend_1">
+    <g id="patch_7">
+     <path d="M 52.478125 73.9125 
+L 231.4375 73.9125 
+Q 233.4375 73.9125 233.4375 71.9125 
+L 233.4375 14.2 
+Q 233.4375 12.2 231.4375 12.2 
+L 52.478125 12.2 
+Q 50.478125 12.2 50.478125 14.2 
+L 50.478125 71.9125 
+Q 50.478125 73.9125 52.478125 73.9125 
+z
+" style="fill: #ffffff; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter"/>
+    </g>
+    <g id="line2d_50">
+     <path d="M 54.478125 20.298437 
+L 64.478125 20.298437 
+L 74.478125 20.298437 
+" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square"/>
+    </g>
+    <g id="text_8">
+     <!-- CPU, power of 2 -->
+     <g transform="translate(82.478125 23.798437) scale(0.1 -0.1)">
+      <defs>
+       <path id="DejaVuSans-43" d="M 4122 4306 
+L 4122 3641 
+Q 3803 3938 3442 4084 
+Q 3081 4231 2675 4231 
+Q 1875 4231 1450 3742 
+Q 1025 3253 1025 2328 
+Q 1025 1406 1450 917 
+Q 1875 428 2675 428 
+Q 3081 428 3442 575 
+Q 3803 722 4122 1019 
+L 4122 359 
+Q 3791 134 3420 21 
+Q 3050 -91 2638 -91 
+Q 1578 -91 968 557 
+Q 359 1206 359 2328 
+Q 359 3453 968 4101 
+Q 1578 4750 2638 4750 
+Q 3056 4750 3426 4639 
+Q 3797 4528 4122 4306 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-50" d="M 1259 4147 
+L 1259 2394 
+L 2053 2394 
+Q 2494 2394 2734 2622 
+Q 2975 2850 2975 3272 
+Q 2975 3691 2734 3919 
+Q 2494 4147 2053 4147 
+L 1259 4147 
+z
+M 628 4666 
+L 2053 4666 
+Q 2838 4666 3239 4311 
+Q 3641 3956 3641 3272 
+Q 3641 2581 3239 2228 
+Q 2838 1875 2053 1875 
+L 1259 1875 
+L 1259 0 
+L 628 0 
+L 628 4666 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-55" d="M 556 4666 
+L 1191 4666 
+L 1191 1831 
+Q 1191 1081 1462 751 
+Q 1734 422 2344 422 
+Q 2950 422 3222 751 
+Q 3494 1081 3494 1831 
+L 3494 4666 
+L 4128 4666 
+L 4128 1753 
+Q 4128 841 3676 375 
+Q 3225 -91 2344 -91 
+Q 1459 -91 1007 375 
+Q 556 841 556 1753 
+L 556 4666 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-2c" d="M 750 794 
+L 1409 794 
+L 1409 256 
+L 897 -744 
+L 494 -744 
+L 750 256 
+L 750 794 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-70" d="M 1159 525 
+L 1159 -1331 
+L 581 -1331 
+L 581 3500 
+L 1159 3500 
+L 1159 2969 
+Q 1341 3281 1617 3432 
+Q 1894 3584 2278 3584 
+Q 2916 3584 3314 3078 
+Q 3713 2572 3713 1747 
+Q 3713 922 3314 415 
+Q 2916 -91 2278 -91 
+Q 1894 -91 1617 61 
+Q 1341 213 1159 525 
+z
+M 3116 1747 
+Q 3116 2381 2855 2742 
+Q 2594 3103 2138 3103 
+Q 1681 3103 1420 2742 
+Q 1159 2381 1159 1747 
+Q 1159 1113 1420 752 
+Q 1681 391 2138 391 
+Q 2594 391 2855 752 
+Q 3116 1113 3116 1747 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-6f" d="M 1959 3097 
+Q 1497 3097 1228 2736 
+Q 959 2375 959 1747 
+Q 959 1119 1226 758 
+Q 1494 397 1959 397 
+Q 2419 397 2687 759 
+Q 2956 1122 2956 1747 
+Q 2956 2369 2687 2733 
+Q 2419 3097 1959 3097 
+z
+M 1959 3584 
+Q 2709 3584 3137 3096 
+Q 3566 2609 3566 1747 
+Q 3566 888 3137 398 
+Q 2709 -91 1959 -91 
+Q 1206 -91 779 398 
+Q 353 888 353 1747 
+Q 353 2609 779 3096 
+Q 1206 3584 1959 3584 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-77" d="M 269 3500 
+L 844 3500 
+L 1563 769 
+L 2278 3500 
+L 2956 3500 
+L 3675 769 
+L 4391 3500 
+L 4966 3500 
+L 4050 0 
+L 3372 0 
+L 2619 2869 
+L 1863 0 
+L 1184 0 
+L 269 3500 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-66" d="M 2375 4863 
+L 2375 4384 
+L 1825 4384 
+Q 1516 4384 1395 4259 
+Q 1275 4134 1275 3809 
+L 1275 3500 
+L 2222 3500 
+L 2222 3053 
+L 1275 3053 
+L 1275 0 
+L 697 0 
+L 697 3053 
+L 147 3053 
+L 147 3500 
+L 697 3500 
+L 697 3744 
+Q 697 4328 969 4595 
+Q 1241 4863 1831 4863 
+L 2375 4863 
+z
+" transform="scale(0.015625)"/>
+      </defs>
+      <use xlink:href="#DejaVuSans-43"/>
+      <use xlink:href="#DejaVuSans-50" x="69.824219"/>
+      <use xlink:href="#DejaVuSans-55" x="130.126953"/>
+      <use xlink:href="#DejaVuSans-2c" x="203.320312"/>
+      <use xlink:href="#DejaVuSans-20" x="235.107422"/>
+      <use xlink:href="#DejaVuSans-70" x="266.894531"/>
+      <use xlink:href="#DejaVuSans-6f" x="330.371094"/>
+      <use xlink:href="#DejaVuSans-77" x="391.552734"/>
+      <use xlink:href="#DejaVuSans-65" x="473.339844"/>
+      <use xlink:href="#DejaVuSans-72" x="534.863281"/>
+      <use xlink:href="#DejaVuSans-20" x="575.976562"/>
+      <use xlink:href="#DejaVuSans-6f" x="607.763672"/>
+      <use xlink:href="#DejaVuSans-66" x="668.945312"/>
+      <use xlink:href="#DejaVuSans-20" x="704.150391"/>
+      <use xlink:href="#DejaVuSans-32" x="735.9375"/>
+     </g>
+    </g>
+    <g id="line2d_51">
+     <path d="M 54.478125 34.976562 
+L 64.478125 34.976562 
+L 74.478125 34.976562 
+" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square"/>
+    </g>
+    <g id="text_9">
+     <!-- CPU, non-power of 2 -->
+     <g transform="translate(82.478125 38.476562) scale(0.1 -0.1)">
+      <use xlink:href="#DejaVuSans-43"/>
+      <use xlink:href="#DejaVuSans-50" x="69.824219"/>
+      <use xlink:href="#DejaVuSans-55" x="130.126953"/>
+      <use xlink:href="#DejaVuSans-2c" x="203.320312"/>
+      <use xlink:href="#DejaVuSans-20" x="235.107422"/>
+      <use xlink:href="#DejaVuSans-6e" x="266.894531"/>
+      <use xlink:href="#DejaVuSans-6f" x="330.273438"/>
+      <use xlink:href="#DejaVuSans-6e" x="391.455078"/>
+      <use xlink:href="#DejaVuSans-2d" x="454.833984"/>
+      <use xlink:href="#DejaVuSans-70" x="490.917969"/>
+      <use xlink:href="#DejaVuSans-6f" x="554.394531"/>
+      <use xlink:href="#DejaVuSans-77" x="615.576172"/>
+      <use xlink:href="#DejaVuSans-65" x="697.363281"/>
+      <use xlink:href="#DejaVuSans-72" x="758.886719"/>
+      <use xlink:href="#DejaVuSans-20" x="800"/>
+      <use xlink:href="#DejaVuSans-6f" x="831.787109"/>
+      <use xlink:href="#DejaVuSans-66" x="892.96875"/>
+      <use xlink:href="#DejaVuSans-20" x="928.173828"/>
+      <use xlink:href="#DejaVuSans-32" x="959.960938"/>
+     </g>
+    </g>
+    <g id="line2d_52">
+     <path d="M 54.478125 49.654687 
+L 64.478125 49.654687 
+L 74.478125 49.654687 
+" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square"/>
+    </g>
+    <g id="text_10">
+     <!-- GPU efficient, power of 2 -->
+     <g transform="translate(82.478125 53.154687) scale(0.1 -0.1)">
+      <defs>
+       <path id="DejaVuSans-47" d="M 3809 666 
+L 3809 1919 
+L 2778 1919 
+L 2778 2438 
+L 4434 2438 
+L 4434 434 
+Q 4069 175 3628 42 
+Q 3188 -91 2688 -91 
+Q 1594 -91 976 548 
+Q 359 1188 359 2328 
+Q 359 3472 976 4111 
+Q 1594 4750 2688 4750 
+Q 3144 4750 3555 4637 
+Q 3966 4525 4313 4306 
+L 4313 3634 
+Q 3963 3931 3569 4081 
+Q 3175 4231 2741 4231 
+Q 1884 4231 1454 3753 
+Q 1025 3275 1025 2328 
+Q 1025 1384 1454 906 
+Q 1884 428 2741 428 
+Q 3075 428 3337 486 
+Q 3600 544 3809 666 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-63" d="M 3122 3366 
+L 3122 2828 
+Q 2878 2963 2633 3030 
+Q 2388 3097 2138 3097 
+Q 1578 3097 1268 2742 
+Q 959 2388 959 1747 
+Q 959 1106 1268 751 
+Q 1578 397 2138 397 
+Q 2388 397 2633 464 
+Q 2878 531 3122 666 
+L 3122 134 
+Q 2881 22 2623 -34 
+Q 2366 -91 2075 -91 
+Q 1284 -91 818 406 
+Q 353 903 353 1747 
+Q 353 2603 823 3093 
+Q 1294 3584 2113 3584 
+Q 2378 3584 2631 3529 
+Q 2884 3475 3122 3366 
+z
+" transform="scale(0.015625)"/>
+      </defs>
+      <use xlink:href="#DejaVuSans-47"/>
+      <use xlink:href="#DejaVuSans-50" x="77.490234"/>
+      <use xlink:href="#DejaVuSans-55" x="137.792969"/>
+      <use xlink:href="#DejaVuSans-20" x="210.986328"/>
+      <use xlink:href="#DejaVuSans-65" x="242.773438"/>
+      <use xlink:href="#DejaVuSans-66" x="304.296875"/>
+      <use xlink:href="#DejaVuSans-66" x="339.501953"/>
+      <use xlink:href="#DejaVuSans-69" x="374.707031"/>
+      <use xlink:href="#DejaVuSans-63" x="402.490234"/>
+      <use xlink:href="#DejaVuSans-69" x="457.470703"/>
+      <use xlink:href="#DejaVuSans-65" x="485.253906"/>
+      <use xlink:href="#DejaVuSans-6e" x="546.777344"/>
+      <use xlink:href="#DejaVuSans-74" x="610.15625"/>
+      <use xlink:href="#DejaVuSans-2c" x="649.365234"/>
+      <use xlink:href="#DejaVuSans-20" x="681.152344"/>
+      <use xlink:href="#DejaVuSans-70" x="712.939453"/>
+      <use xlink:href="#DejaVuSans-6f" x="776.416016"/>
+      <use xlink:href="#DejaVuSans-77" x="837.597656"/>
+      <use xlink:href="#DejaVuSans-65" x="919.384766"/>
+      <use xlink:href="#DejaVuSans-72" x="980.908203"/>
+      <use xlink:href="#DejaVuSans-20" x="1022.021484"/>
+      <use xlink:href="#DejaVuSans-6f" x="1053.808594"/>
+      <use xlink:href="#DejaVuSans-66" x="1114.990234"/>
+      <use xlink:href="#DejaVuSans-20" x="1150.195312"/>
+      <use xlink:href="#DejaVuSans-32" x="1181.982422"/>
+     </g>
+    </g>
+    <g id="line2d_53">
+     <path d="M 54.478125 64.332812 
+L 64.478125 64.332812 
+L 74.478125 64.332812 
+" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square"/>
+    </g>
+    <g id="text_11">
+     <!-- GPU efficient, non-power of 2 -->
+     <g transform="translate(82.478125 67.832812) scale(0.1 -0.1)">
+      <use xlink:href="#DejaVuSans-47"/>
+      <use xlink:href="#DejaVuSans-50" x="77.490234"/>
+      <use xlink:href="#DejaVuSans-55" x="137.792969"/>
+      <use xlink:href="#DejaVuSans-20" x="210.986328"/>
+      <use xlink:href="#DejaVuSans-65" x="242.773438"/>
+      <use xlink:href="#DejaVuSans-66" x="304.296875"/>
+      <use xlink:href="#DejaVuSans-66" x="339.501953"/>
+      <use xlink:href="#DejaVuSans-69" x="374.707031"/>
+      <use xlink:href="#DejaVuSans-63" x="402.490234"/>
+      <use xlink:href="#DejaVuSans-69" x="457.470703"/>
+      <use xlink:href="#DejaVuSans-65" x="485.253906"/>
+      <use xlink:href="#DejaVuSans-6e" x="546.777344"/>
+      <use xlink:href="#DejaVuSans-74" x="610.15625"/>
+      <use xlink:href="#DejaVuSans-2c" x="649.365234"/>
+      <use xlink:href="#DejaVuSans-20" x="681.152344"/>
+      <use xlink:href="#DejaVuSans-6e" x="712.939453"/>
+      <use xlink:href="#DejaVuSans-6f" x="776.318359"/>
+      <use xlink:href="#DejaVuSans-6e" x="837.5"/>
+      <use xlink:href="#DejaVuSans-2d" x="900.878906"/>
+      <use xlink:href="#DejaVuSans-70" x="936.962891"/>
+      <use xlink:href="#DejaVuSans-6f" x="1000.439453"/>
+      <use xlink:href="#DejaVuSans-77" x="1061.621094"/>
+      <use xlink:href="#DejaVuSans-65" x="1143.408203"/>
+      <use xlink:href="#DejaVuSans-72" x="1204.931641"/>
+      <use xlink:href="#DejaVuSans-20" x="1246.044922"/>
+      <use xlink:href="#DejaVuSans-6f" x="1277.832031"/>
+      <use xlink:href="#DejaVuSans-66" x="1339.013672"/>
+      <use xlink:href="#DejaVuSans-20" x="1374.21875"/>
+      <use xlink:href="#DejaVuSans-32" x="1406.005859"/>
+     </g>
+    </g>
+   </g>
+  </g>
+ </g>
+ <defs>
+  <clipPath id="p604d34fa50">
+   <rect x="45.478125" y="7.2" width="400.086041" height="289.528"/>
+  </clipPath>
+ </defs>
+</svg>
diff --git a/img/performance_nonpow2.svg b/img/performance_nonpow2.svg
new file mode 100644
index 0000000..26a93fc
--- /dev/null
+++ b/img/performance_nonpow2.svg
@@ -0,0 +1,1342 @@
+<?xml version="1.0" encoding="utf-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+  "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns:xlink="http://www.w3.org/1999/xlink" width="452.572916pt" height="334.28425pt" viewBox="0 0 452.572916 334.28425" xmlns="http://www.w3.org/2000/svg" version="1.1">
+ <metadata>
+  <rdf:RDF xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+   <cc:Work>
+    <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage"/>
+    <dc:date>2024-09-18T22:27:00.060256</dc:date>
+    <dc:format>image/svg+xml</dc:format>
+    <dc:creator>
+     <cc:Agent>
+      <dc:title>Matplotlib v3.9.2, https://matplotlib.org/</dc:title>
+     </cc:Agent>
+    </dc:creator>
+   </cc:Work>
+  </rdf:RDF>
+ </metadata>
+ <defs>
+  <style type="text/css">*{stroke-linejoin: round; stroke-linecap: butt}</style>
+ </defs>
+ <g id="figure_1">
+  <g id="patch_1">
+   <path d="M 0 334.28425 
+L 452.572916 334.28425 
+L 452.572916 0 
+L 0 0 
+z
+" style="fill: #ffffff"/>
+  </g>
+  <g id="axes_1">
+   <g id="patch_2">
+    <path d="M 45.478125 296.728 
+L 445.372916 296.728 
+L 445.372916 7.2 
+L 45.478125 7.2 
+z
+" style="fill: #ffffff"/>
+   </g>
+   <g id="matplotlib.axis_1">
+    <g id="xtick_1">
+     <g id="line2d_1">
+      <path d="M 60.101216 296.728 
+L 60.101216 7.2 
+" clip-path="url(#p314b7e48b1)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_2">
+      <defs>
+       <path id="m9c89aa5517" d="M 0 0 
+L 0 3.5 
+" style="stroke: #000000; stroke-width: 0.8"/>
+      </defs>
+      <g>
+       <use xlink:href="#m9c89aa5517" x="60.101216" y="296.728" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_1">
+      <!-- $\mathdefault{10^{6}}$ -->
+      <g transform="translate(51.301216 311.326437) scale(0.1 -0.1)">
+       <defs>
+        <path id="DejaVuSans-31" d="M 794 531 
+L 1825 531 
+L 1825 4091 
+L 703 3866 
+L 703 4441 
+L 1819 4666 
+L 2450 4666 
+L 2450 531 
+L 3481 531 
+L 3481 0 
+L 794 0 
+L 794 531 
+z
+" transform="scale(0.015625)"/>
+        <path id="DejaVuSans-30" d="M 2034 4250 
+Q 1547 4250 1301 3770 
+Q 1056 3291 1056 2328 
+Q 1056 1369 1301 889 
+Q 1547 409 2034 409 
+Q 2525 409 2770 889 
+Q 3016 1369 3016 2328 
+Q 3016 3291 2770 3770 
+Q 2525 4250 2034 4250 
+z
+M 2034 4750 
+Q 2819 4750 3233 4129 
+Q 3647 3509 3647 2328 
+Q 3647 1150 3233 529 
+Q 2819 -91 2034 -91 
+Q 1250 -91 836 529 
+Q 422 1150 422 2328 
+Q 422 3509 836 4129 
+Q 1250 4750 2034 4750 
+z
+" transform="scale(0.015625)"/>
+        <path id="DejaVuSans-36" d="M 2113 2584 
+Q 1688 2584 1439 2293 
+Q 1191 2003 1191 1497 
+Q 1191 994 1439 701 
+Q 1688 409 2113 409 
+Q 2538 409 2786 701 
+Q 3034 994 3034 1497 
+Q 3034 2003 2786 2293 
+Q 2538 2584 2113 2584 
+z
+M 3366 4563 
+L 3366 3988 
+Q 3128 4100 2886 4159 
+Q 2644 4219 2406 4219 
+Q 1781 4219 1451 3797 
+Q 1122 3375 1075 2522 
+Q 1259 2794 1537 2939 
+Q 1816 3084 2150 3084 
+Q 2853 3084 3261 2657 
+Q 3669 2231 3669 1497 
+Q 3669 778 3244 343 
+Q 2819 -91 2113 -91 
+Q 1303 -91 875 529 
+Q 447 1150 447 2328 
+Q 447 3434 972 4092 
+Q 1497 4750 2381 4750 
+Q 2619 4750 2861 4703 
+Q 3103 4656 3366 4563 
+z
+" transform="scale(0.015625)"/>
+       </defs>
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.765625)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.765625)"/>
+       <use xlink:href="#DejaVuSans-36" transform="translate(128.203125 39.046875) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_2">
+     <g id="line2d_3">
+      <path d="M 232.62352 296.728 
+L 232.62352 7.2 
+" clip-path="url(#p314b7e48b1)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_4">
+      <g>
+       <use xlink:href="#m9c89aa5517" x="232.62352" y="296.728" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_2">
+      <!-- $\mathdefault{10^{7}}$ -->
+      <g transform="translate(223.82352 311.326437) scale(0.1 -0.1)">
+       <defs>
+        <path id="DejaVuSans-37" d="M 525 4666 
+L 3525 4666 
+L 3525 4397 
+L 1831 0 
+L 1172 0 
+L 2766 4134 
+L 525 4134 
+L 525 4666 
+z
+" transform="scale(0.015625)"/>
+       </defs>
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.684375)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.684375)"/>
+       <use xlink:href="#DejaVuSans-37" transform="translate(128.203125 38.965625) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_3">
+     <g id="line2d_5">
+      <path d="M 405.145824 296.728 
+L 405.145824 7.2 
+" clip-path="url(#p314b7e48b1)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_6">
+      <g>
+       <use xlink:href="#m9c89aa5517" x="405.145824" y="296.728" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_3">
+      <!-- $\mathdefault{10^{8}}$ -->
+      <g transform="translate(396.345824 311.326437) scale(0.1 -0.1)">
+       <defs>
+        <path id="DejaVuSans-38" d="M 2034 2216 
+Q 1584 2216 1326 1975 
+Q 1069 1734 1069 1313 
+Q 1069 891 1326 650 
+Q 1584 409 2034 409 
+Q 2484 409 2743 651 
+Q 3003 894 3003 1313 
+Q 3003 1734 2745 1975 
+Q 2488 2216 2034 2216 
+z
+M 1403 2484 
+Q 997 2584 770 2862 
+Q 544 3141 544 3541 
+Q 544 4100 942 4425 
+Q 1341 4750 2034 4750 
+Q 2731 4750 3128 4425 
+Q 3525 4100 3525 3541 
+Q 3525 3141 3298 2862 
+Q 3072 2584 2669 2484 
+Q 3125 2378 3379 2068 
+Q 3634 1759 3634 1313 
+Q 3634 634 3220 271 
+Q 2806 -91 2034 -91 
+Q 1263 -91 848 271 
+Q 434 634 434 1313 
+Q 434 1759 690 2068 
+Q 947 2378 1403 2484 
+z
+M 1172 3481 
+Q 1172 3119 1398 2916 
+Q 1625 2713 2034 2713 
+Q 2441 2713 2670 2916 
+Q 2900 3119 2900 3481 
+Q 2900 3844 2670 4047 
+Q 2441 4250 2034 4250 
+Q 1625 4250 1398 4047 
+Q 1172 3844 1172 3481 
+z
+" transform="scale(0.015625)"/>
+       </defs>
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.765625)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.765625)"/>
+       <use xlink:href="#DejaVuSans-38" transform="translate(128.203125 39.046875) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_4">
+     <g id="line2d_7">
+      <defs>
+       <path id="mf8f7f13f23" d="M 0 0 
+L 0 2 
+" style="stroke: #000000; stroke-width: 0.6"/>
+      </defs>
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="52.207029" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_5">
+     <g id="line2d_8">
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="112.035605" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_6">
+     <g id="line2d_9">
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="142.415275" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_7">
+     <g id="line2d_10">
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="163.969993" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_8">
+     <g id="line2d_11">
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="180.689132" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_9">
+     <g id="line2d_12">
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="194.349663" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_10">
+     <g id="line2d_13">
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="205.899477" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_11">
+     <g id="line2d_14">
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="215.904382" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_12">
+     <g id="line2d_15">
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="224.729333" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_13">
+     <g id="line2d_16">
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="284.557909" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_14">
+     <g id="line2d_17">
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="314.937579" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_15">
+     <g id="line2d_18">
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="336.492297" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_16">
+     <g id="line2d_19">
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="353.211436" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_17">
+     <g id="line2d_20">
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="366.871967" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_18">
+     <g id="line2d_21">
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="378.421781" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_19">
+     <g id="line2d_22">
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="388.426686" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_20">
+     <g id="line2d_23">
+      <g>
+       <use xlink:href="#mf8f7f13f23" x="397.251637" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="text_4">
+     <!-- Array length [-] -->
+     <g transform="translate(207.137239 325.004562) scale(0.1 -0.1)">
+      <defs>
+       <path id="DejaVuSans-41" d="M 2188 4044 
+L 1331 1722 
+L 3047 1722 
+L 2188 4044 
+z
+M 1831 4666 
+L 2547 4666 
+L 4325 0 
+L 3669 0 
+L 3244 1197 
+L 1141 1197 
+L 716 0 
+L 50 0 
+L 1831 4666 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-72" d="M 2631 2963 
+Q 2534 3019 2420 3045 
+Q 2306 3072 2169 3072 
+Q 1681 3072 1420 2755 
+Q 1159 2438 1159 1844 
+L 1159 0 
+L 581 0 
+L 581 3500 
+L 1159 3500 
+L 1159 2956 
+Q 1341 3275 1631 3429 
+Q 1922 3584 2338 3584 
+Q 2397 3584 2469 3576 
+Q 2541 3569 2628 3553 
+L 2631 2963 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-61" d="M 2194 1759 
+Q 1497 1759 1228 1600 
+Q 959 1441 959 1056 
+Q 959 750 1161 570 
+Q 1363 391 1709 391 
+Q 2188 391 2477 730 
+Q 2766 1069 2766 1631 
+L 2766 1759 
+L 2194 1759 
+z
+M 3341 1997 
+L 3341 0 
+L 2766 0 
+L 2766 531 
+Q 2569 213 2275 61 
+Q 1981 -91 1556 -91 
+Q 1019 -91 701 211 
+Q 384 513 384 1019 
+Q 384 1609 779 1909 
+Q 1175 2209 1959 2209 
+L 2766 2209 
+L 2766 2266 
+Q 2766 2663 2505 2880 
+Q 2244 3097 1772 3097 
+Q 1472 3097 1187 3025 
+Q 903 2953 641 2809 
+L 641 3341 
+Q 956 3463 1253 3523 
+Q 1550 3584 1831 3584 
+Q 2591 3584 2966 3190 
+Q 3341 2797 3341 1997 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-79" d="M 2059 -325 
+Q 1816 -950 1584 -1140 
+Q 1353 -1331 966 -1331 
+L 506 -1331 
+L 506 -850 
+L 844 -850 
+Q 1081 -850 1212 -737 
+Q 1344 -625 1503 -206 
+L 1606 56 
+L 191 3500 
+L 800 3500 
+L 1894 763 
+L 2988 3500 
+L 3597 3500 
+L 2059 -325 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-20" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-6c" d="M 603 4863 
+L 1178 4863 
+L 1178 0 
+L 603 0 
+L 603 4863 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-65" d="M 3597 1894 
+L 3597 1613 
+L 953 1613 
+Q 991 1019 1311 708 
+Q 1631 397 2203 397 
+Q 2534 397 2845 478 
+Q 3156 559 3463 722 
+L 3463 178 
+Q 3153 47 2828 -22 
+Q 2503 -91 2169 -91 
+Q 1331 -91 842 396 
+Q 353 884 353 1716 
+Q 353 2575 817 3079 
+Q 1281 3584 2069 3584 
+Q 2775 3584 3186 3129 
+Q 3597 2675 3597 1894 
+z
+M 3022 2063 
+Q 3016 2534 2758 2815 
+Q 2500 3097 2075 3097 
+Q 1594 3097 1305 2825 
+Q 1016 2553 972 2059 
+L 3022 2063 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-6e" d="M 3513 2113 
+L 3513 0 
+L 2938 0 
+L 2938 2094 
+Q 2938 2591 2744 2837 
+Q 2550 3084 2163 3084 
+Q 1697 3084 1428 2787 
+Q 1159 2491 1159 1978 
+L 1159 0 
+L 581 0 
+L 581 3500 
+L 1159 3500 
+L 1159 2956 
+Q 1366 3272 1645 3428 
+Q 1925 3584 2291 3584 
+Q 2894 3584 3203 3211 
+Q 3513 2838 3513 2113 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-67" d="M 2906 1791 
+Q 2906 2416 2648 2759 
+Q 2391 3103 1925 3103 
+Q 1463 3103 1205 2759 
+Q 947 2416 947 1791 
+Q 947 1169 1205 825 
+Q 1463 481 1925 481 
+Q 2391 481 2648 825 
+Q 2906 1169 2906 1791 
+z
+M 3481 434 
+Q 3481 -459 3084 -895 
+Q 2688 -1331 1869 -1331 
+Q 1566 -1331 1297 -1286 
+Q 1028 -1241 775 -1147 
+L 775 -588 
+Q 1028 -725 1275 -790 
+Q 1522 -856 1778 -856 
+Q 2344 -856 2625 -561 
+Q 2906 -266 2906 331 
+L 2906 616 
+Q 2728 306 2450 153 
+Q 2172 0 1784 0 
+Q 1141 0 747 490 
+Q 353 981 353 1791 
+Q 353 2603 747 3093 
+Q 1141 3584 1784 3584 
+Q 2172 3584 2450 3431 
+Q 2728 3278 2906 2969 
+L 2906 3500 
+L 3481 3500 
+L 3481 434 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-74" d="M 1172 4494 
+L 1172 3500 
+L 2356 3500 
+L 2356 3053 
+L 1172 3053 
+L 1172 1153 
+Q 1172 725 1289 603 
+Q 1406 481 1766 481 
+L 2356 481 
+L 2356 0 
+L 1766 0 
+Q 1100 0 847 248 
+Q 594 497 594 1153 
+L 594 3053 
+L 172 3053 
+L 172 3500 
+L 594 3500 
+L 594 4494 
+L 1172 4494 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-68" d="M 3513 2113 
+L 3513 0 
+L 2938 0 
+L 2938 2094 
+Q 2938 2591 2744 2837 
+Q 2550 3084 2163 3084 
+Q 1697 3084 1428 2787 
+Q 1159 2491 1159 1978 
+L 1159 0 
+L 581 0 
+L 581 4863 
+L 1159 4863 
+L 1159 2956 
+Q 1366 3272 1645 3428 
+Q 1925 3584 2291 3584 
+Q 2894 3584 3203 3211 
+Q 3513 2838 3513 2113 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-5b" d="M 550 4863 
+L 1875 4863 
+L 1875 4416 
+L 1125 4416 
+L 1125 -397 
+L 1875 -397 
+L 1875 -844 
+L 550 -844 
+L 550 4863 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-2d" d="M 313 2009 
+L 1997 2009 
+L 1997 1497 
+L 313 1497 
+L 313 2009 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-5d" d="M 1947 4863 
+L 1947 -844 
+L 622 -844 
+L 622 -397 
+L 1369 -397 
+L 1369 4416 
+L 622 4416 
+L 622 4863 
+L 1947 4863 
+z
+" transform="scale(0.015625)"/>
+      </defs>
+      <use xlink:href="#DejaVuSans-41"/>
+      <use xlink:href="#DejaVuSans-72" x="68.408203"/>
+      <use xlink:href="#DejaVuSans-72" x="107.771484"/>
+      <use xlink:href="#DejaVuSans-61" x="148.884766"/>
+      <use xlink:href="#DejaVuSans-79" x="210.164062"/>
+      <use xlink:href="#DejaVuSans-20" x="269.34375"/>
+      <use xlink:href="#DejaVuSans-6c" x="301.130859"/>
+      <use xlink:href="#DejaVuSans-65" x="328.914062"/>
+      <use xlink:href="#DejaVuSans-6e" x="390.4375"/>
+      <use xlink:href="#DejaVuSans-67" x="453.816406"/>
+      <use xlink:href="#DejaVuSans-74" x="517.292969"/>
+      <use xlink:href="#DejaVuSans-68" x="556.501953"/>
+      <use xlink:href="#DejaVuSans-20" x="619.880859"/>
+      <use xlink:href="#DejaVuSans-5b" x="651.667969"/>
+      <use xlink:href="#DejaVuSans-2d" x="690.681641"/>
+      <use xlink:href="#DejaVuSans-5d" x="726.765625"/>
+     </g>
+    </g>
+   </g>
+   <g id="matplotlib.axis_2">
+    <g id="ytick_1">
+     <g id="line2d_24">
+      <path d="M 45.478125 221.677305 
+L 445.372916 221.677305 
+" clip-path="url(#p314b7e48b1)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_25">
+      <defs>
+       <path id="m29e610597e" d="M 0 0 
+L -3.5 0 
+" style="stroke: #000000; stroke-width: 0.8"/>
+      </defs>
+      <g>
+       <use xlink:href="#m29e610597e" x="45.478125" y="221.677305" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_5">
+      <!-- $\mathdefault{10^{0}}$ -->
+      <g transform="translate(20.878125 225.476524) scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.765625)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.765625)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(128.203125 39.046875) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_2">
+     <g id="line2d_26">
+      <path d="M 45.478125 137.554172 
+L 445.372916 137.554172 
+" clip-path="url(#p314b7e48b1)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_27">
+      <g>
+       <use xlink:href="#m29e610597e" x="45.478125" y="137.554172" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_6">
+      <!-- $\mathdefault{10^{1}}$ -->
+      <g transform="translate(20.878125 141.353391) scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.684375)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.684375)"/>
+       <use xlink:href="#DejaVuSans-31" transform="translate(128.203125 38.965625) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_3">
+     <g id="line2d_28">
+      <path d="M 45.478125 53.431039 
+L 445.372916 53.431039 
+" clip-path="url(#p314b7e48b1)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_29">
+      <g>
+       <use xlink:href="#m29e610597e" x="45.478125" y="53.431039" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_7">
+      <!-- $\mathdefault{10^{2}}$ -->
+      <g transform="translate(20.878125 57.230258) scale(0.1 -0.1)">
+       <defs>
+        <path id="DejaVuSans-32" d="M 1228 531 
+L 3431 531 
+L 3431 0 
+L 469 0 
+L 469 531 
+Q 828 903 1448 1529 
+Q 2069 2156 2228 2338 
+Q 2531 2678 2651 2914 
+Q 2772 3150 2772 3378 
+Q 2772 3750 2511 3984 
+Q 2250 4219 1831 4219 
+Q 1534 4219 1204 4116 
+Q 875 4013 500 3803 
+L 500 4441 
+Q 881 4594 1212 4672 
+Q 1544 4750 1819 4750 
+Q 2544 4750 2975 4387 
+Q 3406 4025 3406 3419 
+Q 3406 3131 3298 2873 
+Q 3191 2616 2906 2266 
+Q 2828 2175 2409 1742 
+Q 1991 1309 1228 531 
+z
+" transform="scale(0.015625)"/>
+       </defs>
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.765625)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.765625)"/>
+       <use xlink:href="#DejaVuSans-32" transform="translate(128.203125 39.046875) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_4">
+     <g id="line2d_30">
+      <defs>
+       <path id="m01ad9a53f4" d="M 0 0 
+L -2 0 
+" style="stroke: #000000; stroke-width: 0.6"/>
+      </defs>
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="280.476851" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_5">
+     <g id="line2d_31">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="265.663503" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_6">
+     <g id="line2d_32">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="255.153265" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_7">
+     <g id="line2d_33">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="247.000891" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_8">
+     <g id="line2d_34">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="240.339917" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_9">
+     <g id="line2d_35">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="234.708143" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_10">
+     <g id="line2d_36">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="229.829679" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_11">
+     <g id="line2d_37">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="225.526568" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_12">
+     <g id="line2d_38">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="196.353718" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_13">
+     <g id="line2d_39">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="181.54037" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_14">
+     <g id="line2d_40">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="171.030132" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_15">
+     <g id="line2d_41">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="162.877758" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_16">
+     <g id="line2d_42">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="156.216784" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_17">
+     <g id="line2d_43">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="150.58501" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_18">
+     <g id="line2d_44">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="145.706546" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_19">
+     <g id="line2d_45">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="141.403435" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_20">
+     <g id="line2d_46">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="112.230585" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_21">
+     <g id="line2d_47">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="97.417237" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_22">
+     <g id="line2d_48">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="86.906999" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_23">
+     <g id="line2d_49">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="78.754625" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_24">
+     <g id="line2d_50">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="72.093651" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_25">
+     <g id="line2d_51">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="66.461877" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_26">
+     <g id="line2d_52">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="61.583413" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_27">
+     <g id="line2d_53">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="57.280302" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_28">
+     <g id="line2d_54">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="28.107452" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_29">
+     <g id="line2d_55">
+      <g>
+       <use xlink:href="#m01ad9a53f4" x="45.478125" y="13.294104" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="text_8">
+     <!-- Runtime [ms] -->
+     <g transform="translate(14.798438 185.812437) rotate(-90) scale(0.1 -0.1)">
+      <defs>
+       <path id="DejaVuSans-52" d="M 2841 2188 
+Q 3044 2119 3236 1894 
+Q 3428 1669 3622 1275 
+L 4263 0 
+L 3584 0 
+L 2988 1197 
+Q 2756 1666 2539 1819 
+Q 2322 1972 1947 1972 
+L 1259 1972 
+L 1259 0 
+L 628 0 
+L 628 4666 
+L 2053 4666 
+Q 2853 4666 3247 4331 
+Q 3641 3997 3641 3322 
+Q 3641 2881 3436 2590 
+Q 3231 2300 2841 2188 
+z
+M 1259 4147 
+L 1259 2491 
+L 2053 2491 
+Q 2509 2491 2742 2702 
+Q 2975 2913 2975 3322 
+Q 2975 3731 2742 3939 
+Q 2509 4147 2053 4147 
+L 1259 4147 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-75" d="M 544 1381 
+L 544 3500 
+L 1119 3500 
+L 1119 1403 
+Q 1119 906 1312 657 
+Q 1506 409 1894 409 
+Q 2359 409 2629 706 
+Q 2900 1003 2900 1516 
+L 2900 3500 
+L 3475 3500 
+L 3475 0 
+L 2900 0 
+L 2900 538 
+Q 2691 219 2414 64 
+Q 2138 -91 1772 -91 
+Q 1169 -91 856 284 
+Q 544 659 544 1381 
+z
+M 1991 3584 
+L 1991 3584 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-69" d="M 603 3500 
+L 1178 3500 
+L 1178 0 
+L 603 0 
+L 603 3500 
+z
+M 603 4863 
+L 1178 4863 
+L 1178 4134 
+L 603 4134 
+L 603 4863 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-6d" d="M 3328 2828 
+Q 3544 3216 3844 3400 
+Q 4144 3584 4550 3584 
+Q 5097 3584 5394 3201 
+Q 5691 2819 5691 2113 
+L 5691 0 
+L 5113 0 
+L 5113 2094 
+Q 5113 2597 4934 2840 
+Q 4756 3084 4391 3084 
+Q 3944 3084 3684 2787 
+Q 3425 2491 3425 1978 
+L 3425 0 
+L 2847 0 
+L 2847 2094 
+Q 2847 2600 2669 2842 
+Q 2491 3084 2119 3084 
+Q 1678 3084 1418 2786 
+Q 1159 2488 1159 1978 
+L 1159 0 
+L 581 0 
+L 581 3500 
+L 1159 3500 
+L 1159 2956 
+Q 1356 3278 1631 3431 
+Q 1906 3584 2284 3584 
+Q 2666 3584 2933 3390 
+Q 3200 3197 3328 2828 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-73" d="M 2834 3397 
+L 2834 2853 
+Q 2591 2978 2328 3040 
+Q 2066 3103 1784 3103 
+Q 1356 3103 1142 2972 
+Q 928 2841 928 2578 
+Q 928 2378 1081 2264 
+Q 1234 2150 1697 2047 
+L 1894 2003 
+Q 2506 1872 2764 1633 
+Q 3022 1394 3022 966 
+Q 3022 478 2636 193 
+Q 2250 -91 1575 -91 
+Q 1294 -91 989 -36 
+Q 684 19 347 128 
+L 347 722 
+Q 666 556 975 473 
+Q 1284 391 1588 391 
+Q 1994 391 2212 530 
+Q 2431 669 2431 922 
+Q 2431 1156 2273 1281 
+Q 2116 1406 1581 1522 
+L 1381 1569 
+Q 847 1681 609 1914 
+Q 372 2147 372 2553 
+Q 372 3047 722 3315 
+Q 1072 3584 1716 3584 
+Q 2034 3584 2315 3537 
+Q 2597 3491 2834 3397 
+z
+" transform="scale(0.015625)"/>
+      </defs>
+      <use xlink:href="#DejaVuSans-52"/>
+      <use xlink:href="#DejaVuSans-75" x="64.982422"/>
+      <use xlink:href="#DejaVuSans-6e" x="128.361328"/>
+      <use xlink:href="#DejaVuSans-74" x="191.740234"/>
+      <use xlink:href="#DejaVuSans-69" x="230.949219"/>
+      <use xlink:href="#DejaVuSans-6d" x="258.732422"/>
+      <use xlink:href="#DejaVuSans-65" x="356.144531"/>
+      <use xlink:href="#DejaVuSans-20" x="417.667969"/>
+      <use xlink:href="#DejaVuSans-5b" x="449.455078"/>
+      <use xlink:href="#DejaVuSans-6d" x="488.46875"/>
+      <use xlink:href="#DejaVuSans-73" x="585.880859"/>
+      <use xlink:href="#DejaVuSans-5d" x="637.980469"/>
+     </g>
+    </g>
+   </g>
+   <g id="line2d_56">
+    <path d="M 63.655161 256.022068 
+L 115.589549 231.126557 
+L 167.523938 203.130324 
+L 219.458326 179.968392 
+L 271.392715 154.843068 
+L 323.327103 128.434717 
+L 375.261491 103.449276 
+L 427.19588 78.566666 
+" clip-path="url(#p314b7e48b1)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square"/>
+   </g>
+   <g id="line2d_57">
+    <path d="M 63.655161 190.56981 
+L 115.589549 166.908029 
+L 167.523938 137.619224 
+L 219.458326 114.302365 
+L 271.392715 88.673534 
+L 323.327103 70.563224 
+L 375.261491 46.807226 
+L 427.19588 20.360364 
+" clip-path="url(#p314b7e48b1)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square"/>
+   </g>
+   <g id="line2d_58">
+    <path d="M 63.655161 199.722283 
+L 115.589549 185.013515 
+L 167.523938 161.715865 
+L 219.458326 145.832715 
+L 271.392715 123.549565 
+L 323.327103 100.947211 
+L 375.261491 76.717128 
+L 427.19588 52.127645 
+" clip-path="url(#p314b7e48b1)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square"/>
+   </g>
+   <g id="line2d_59">
+    <path d="M 63.655161 283.567636 
+L 115.589549 270.360375 
+L 167.523938 258.136064 
+L 219.458326 237.661958 
+L 271.392715 206.02795 
+L 323.327103 194.869217 
+L 375.261491 170.752615 
+L 427.19588 146.056755 
+" clip-path="url(#p314b7e48b1)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square"/>
+   </g>
+   <g id="patch_3">
+    <path d="M 45.478125 296.728 
+L 45.478125 7.2 
+" style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/>
+   </g>
+   <g id="patch_4">
+    <path d="M 445.372916 296.728 
+L 445.372916 7.2 
+" style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/>
+   </g>
+   <g id="patch_5">
+    <path d="M 45.478125 296.728 
+L 445.372916 296.728 
+" style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/>
+   </g>
+   <g id="patch_6">
+    <path d="M 45.478125 7.2 
+L 445.372916 7.2 
+" style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/>
+   </g>
+   <g id="legend_1">
+    <g id="patch_7">
+     <path d="M 52.478125 73.9125 
+L 149.4125 73.9125 
+Q 151.4125 73.9125 151.4125 71.9125 
+L 151.4125 14.2 
+Q 151.4125 12.2 149.4125 12.2 
+L 52.478125 12.2 
+Q 50.478125 12.2 50.478125 14.2 
+L 50.478125 71.9125 
+Q 50.478125 73.9125 52.478125 73.9125 
+z
+" style="fill: #ffffff; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter"/>
+    </g>
+    <g id="line2d_60">
+     <path d="M 54.478125 20.298437 
+L 64.478125 20.298437 
+L 74.478125 20.298437 
+" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square"/>
+    </g>
+    <g id="text_9">
+     <!-- CPU -->
+     <g transform="translate(82.478125 23.798437) scale(0.1 -0.1)">
+      <defs>
+       <path id="DejaVuSans-43" d="M 4122 4306 
+L 4122 3641 
+Q 3803 3938 3442 4084 
+Q 3081 4231 2675 4231 
+Q 1875 4231 1450 3742 
+Q 1025 3253 1025 2328 
+Q 1025 1406 1450 917 
+Q 1875 428 2675 428 
+Q 3081 428 3442 575 
+Q 3803 722 4122 1019 
+L 4122 359 
+Q 3791 134 3420 21 
+Q 3050 -91 2638 -91 
+Q 1578 -91 968 557 
+Q 359 1206 359 2328 
+Q 359 3453 968 4101 
+Q 1578 4750 2638 4750 
+Q 3056 4750 3426 4639 
+Q 3797 4528 4122 4306 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-50" d="M 1259 4147 
+L 1259 2394 
+L 2053 2394 
+Q 2494 2394 2734 2622 
+Q 2975 2850 2975 3272 
+Q 2975 3691 2734 3919 
+Q 2494 4147 2053 4147 
+L 1259 4147 
+z
+M 628 4666 
+L 2053 4666 
+Q 2838 4666 3239 4311 
+Q 3641 3956 3641 3272 
+Q 3641 2581 3239 2228 
+Q 2838 1875 2053 1875 
+L 1259 1875 
+L 1259 0 
+L 628 0 
+L 628 4666 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-55" d="M 556 4666 
+L 1191 4666 
+L 1191 1831 
+Q 1191 1081 1462 751 
+Q 1734 422 2344 422 
+Q 2950 422 3222 751 
+Q 3494 1081 3494 1831 
+L 3494 4666 
+L 4128 4666 
+L 4128 1753 
+Q 4128 841 3676 375 
+Q 3225 -91 2344 -91 
+Q 1459 -91 1007 375 
+Q 556 841 556 1753 
+L 556 4666 
+z
+" transform="scale(0.015625)"/>
+      </defs>
+      <use xlink:href="#DejaVuSans-43"/>
+      <use xlink:href="#DejaVuSans-50" x="69.824219"/>
+      <use xlink:href="#DejaVuSans-55" x="130.126953"/>
+     </g>
+    </g>
+    <g id="line2d_61">
+     <path d="M 54.478125 34.976562 
+L 64.478125 34.976562 
+L 74.478125 34.976562 
+" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square"/>
+    </g>
+    <g id="text_10">
+     <!-- GPU naive -->
+     <g transform="translate(82.478125 38.476562) scale(0.1 -0.1)">
+      <defs>
+       <path id="DejaVuSans-47" d="M 3809 666 
+L 3809 1919 
+L 2778 1919 
+L 2778 2438 
+L 4434 2438 
+L 4434 434 
+Q 4069 175 3628 42 
+Q 3188 -91 2688 -91 
+Q 1594 -91 976 548 
+Q 359 1188 359 2328 
+Q 359 3472 976 4111 
+Q 1594 4750 2688 4750 
+Q 3144 4750 3555 4637 
+Q 3966 4525 4313 4306 
+L 4313 3634 
+Q 3963 3931 3569 4081 
+Q 3175 4231 2741 4231 
+Q 1884 4231 1454 3753 
+Q 1025 3275 1025 2328 
+Q 1025 1384 1454 906 
+Q 1884 428 2741 428 
+Q 3075 428 3337 486 
+Q 3600 544 3809 666 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-76" d="M 191 3500 
+L 800 3500 
+L 1894 563 
+L 2988 3500 
+L 3597 3500 
+L 2284 0 
+L 1503 0 
+L 191 3500 
+z
+" transform="scale(0.015625)"/>
+      </defs>
+      <use xlink:href="#DejaVuSans-47"/>
+      <use xlink:href="#DejaVuSans-50" x="77.490234"/>
+      <use xlink:href="#DejaVuSans-55" x="137.792969"/>
+      <use xlink:href="#DejaVuSans-20" x="210.986328"/>
+      <use xlink:href="#DejaVuSans-6e" x="242.773438"/>
+      <use xlink:href="#DejaVuSans-61" x="306.152344"/>
+      <use xlink:href="#DejaVuSans-69" x="367.431641"/>
+      <use xlink:href="#DejaVuSans-76" x="395.214844"/>
+      <use xlink:href="#DejaVuSans-65" x="454.394531"/>
+     </g>
+    </g>
+    <g id="line2d_62">
+     <path d="M 54.478125 49.654687 
+L 64.478125 49.654687 
+L 74.478125 49.654687 
+" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square"/>
+    </g>
+    <g id="text_11">
+     <!-- GPU efficient -->
+     <g transform="translate(82.478125 53.154687) scale(0.1 -0.1)">
+      <defs>
+       <path id="DejaVuSans-66" d="M 2375 4863 
+L 2375 4384 
+L 1825 4384 
+Q 1516 4384 1395 4259 
+Q 1275 4134 1275 3809 
+L 1275 3500 
+L 2222 3500 
+L 2222 3053 
+L 1275 3053 
+L 1275 0 
+L 697 0 
+L 697 3053 
+L 147 3053 
+L 147 3500 
+L 697 3500 
+L 697 3744 
+Q 697 4328 969 4595 
+Q 1241 4863 1831 4863 
+L 2375 4863 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-63" d="M 3122 3366 
+L 3122 2828 
+Q 2878 2963 2633 3030 
+Q 2388 3097 2138 3097 
+Q 1578 3097 1268 2742 
+Q 959 2388 959 1747 
+Q 959 1106 1268 751 
+Q 1578 397 2138 397 
+Q 2388 397 2633 464 
+Q 2878 531 3122 666 
+L 3122 134 
+Q 2881 22 2623 -34 
+Q 2366 -91 2075 -91 
+Q 1284 -91 818 406 
+Q 353 903 353 1747 
+Q 353 2603 823 3093 
+Q 1294 3584 2113 3584 
+Q 2378 3584 2631 3529 
+Q 2884 3475 3122 3366 
+z
+" transform="scale(0.015625)"/>
+      </defs>
+      <use xlink:href="#DejaVuSans-47"/>
+      <use xlink:href="#DejaVuSans-50" x="77.490234"/>
+      <use xlink:href="#DejaVuSans-55" x="137.792969"/>
+      <use xlink:href="#DejaVuSans-20" x="210.986328"/>
+      <use xlink:href="#DejaVuSans-65" x="242.773438"/>
+      <use xlink:href="#DejaVuSans-66" x="304.296875"/>
+      <use xlink:href="#DejaVuSans-66" x="339.501953"/>
+      <use xlink:href="#DejaVuSans-69" x="374.707031"/>
+      <use xlink:href="#DejaVuSans-63" x="402.490234"/>
+      <use xlink:href="#DejaVuSans-69" x="457.470703"/>
+      <use xlink:href="#DejaVuSans-65" x="485.253906"/>
+      <use xlink:href="#DejaVuSans-6e" x="546.777344"/>
+      <use xlink:href="#DejaVuSans-74" x="610.15625"/>
+     </g>
+    </g>
+    <g id="line2d_63">
+     <path d="M 54.478125 64.332812 
+L 64.478125 64.332812 
+L 74.478125 64.332812 
+" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square"/>
+    </g>
+    <g id="text_12">
+     <!-- GPU thrust -->
+     <g transform="translate(82.478125 67.832812) scale(0.1 -0.1)">
+      <use xlink:href="#DejaVuSans-47"/>
+      <use xlink:href="#DejaVuSans-50" x="77.490234"/>
+      <use xlink:href="#DejaVuSans-55" x="137.792969"/>
+      <use xlink:href="#DejaVuSans-20" x="210.986328"/>
+      <use xlink:href="#DejaVuSans-74" x="242.773438"/>
+      <use xlink:href="#DejaVuSans-68" x="281.982422"/>
+      <use xlink:href="#DejaVuSans-72" x="345.361328"/>
+      <use xlink:href="#DejaVuSans-75" x="386.474609"/>
+      <use xlink:href="#DejaVuSans-73" x="449.853516"/>
+      <use xlink:href="#DejaVuSans-74" x="501.953125"/>
+     </g>
+    </g>
+   </g>
+  </g>
+ </g>
+ <defs>
+  <clipPath id="p314b7e48b1">
+   <rect x="45.478125" y="7.2" width="399.894791" height="289.528"/>
+  </clipPath>
+ </defs>
+</svg>
diff --git a/img/performance_pow2.svg b/img/performance_pow2.svg
new file mode 100644
index 0000000..35c2c07
--- /dev/null
+++ b/img/performance_pow2.svg
@@ -0,0 +1,1328 @@
+<?xml version="1.0" encoding="utf-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+  "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns:xlink="http://www.w3.org/1999/xlink" width="452.572916pt" height="334.28425pt" viewBox="0 0 452.572916 334.28425" xmlns="http://www.w3.org/2000/svg" version="1.1">
+ <metadata>
+  <rdf:RDF xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+   <cc:Work>
+    <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage"/>
+    <dc:date>2024-09-18T22:26:59.849723</dc:date>
+    <dc:format>image/svg+xml</dc:format>
+    <dc:creator>
+     <cc:Agent>
+      <dc:title>Matplotlib v3.9.2, https://matplotlib.org/</dc:title>
+     </cc:Agent>
+    </dc:creator>
+   </cc:Work>
+  </rdf:RDF>
+ </metadata>
+ <defs>
+  <style type="text/css">*{stroke-linejoin: round; stroke-linecap: butt}</style>
+ </defs>
+ <g id="figure_1">
+  <g id="patch_1">
+   <path d="M 0 334.28425 
+L 452.572916 334.28425 
+L 452.572916 0 
+L 0 0 
+z
+" style="fill: #ffffff"/>
+  </g>
+  <g id="axes_1">
+   <g id="patch_2">
+    <path d="M 45.478125 296.728 
+L 445.372916 296.728 
+L 445.372916 7.2 
+L 45.478125 7.2 
+z
+" style="fill: #ffffff"/>
+   </g>
+   <g id="matplotlib.axis_1">
+    <g id="xtick_1">
+     <g id="line2d_1">
+      <path d="M 60.101216 296.728 
+L 60.101216 7.2 
+" clip-path="url(#p19fa7cb5a1)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_2">
+      <defs>
+       <path id="mf2850f2670" d="M 0 0 
+L 0 3.5 
+" style="stroke: #000000; stroke-width: 0.8"/>
+      </defs>
+      <g>
+       <use xlink:href="#mf2850f2670" x="60.101216" y="296.728" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_1">
+      <!-- $\mathdefault{10^{6}}$ -->
+      <g transform="translate(51.301216 311.326437) scale(0.1 -0.1)">
+       <defs>
+        <path id="DejaVuSans-31" d="M 794 531 
+L 1825 531 
+L 1825 4091 
+L 703 3866 
+L 703 4441 
+L 1819 4666 
+L 2450 4666 
+L 2450 531 
+L 3481 531 
+L 3481 0 
+L 794 0 
+L 794 531 
+z
+" transform="scale(0.015625)"/>
+        <path id="DejaVuSans-30" d="M 2034 4250 
+Q 1547 4250 1301 3770 
+Q 1056 3291 1056 2328 
+Q 1056 1369 1301 889 
+Q 1547 409 2034 409 
+Q 2525 409 2770 889 
+Q 3016 1369 3016 2328 
+Q 3016 3291 2770 3770 
+Q 2525 4250 2034 4250 
+z
+M 2034 4750 
+Q 2819 4750 3233 4129 
+Q 3647 3509 3647 2328 
+Q 3647 1150 3233 529 
+Q 2819 -91 2034 -91 
+Q 1250 -91 836 529 
+Q 422 1150 422 2328 
+Q 422 3509 836 4129 
+Q 1250 4750 2034 4750 
+z
+" transform="scale(0.015625)"/>
+        <path id="DejaVuSans-36" d="M 2113 2584 
+Q 1688 2584 1439 2293 
+Q 1191 2003 1191 1497 
+Q 1191 994 1439 701 
+Q 1688 409 2113 409 
+Q 2538 409 2786 701 
+Q 3034 994 3034 1497 
+Q 3034 2003 2786 2293 
+Q 2538 2584 2113 2584 
+z
+M 3366 4563 
+L 3366 3988 
+Q 3128 4100 2886 4159 
+Q 2644 4219 2406 4219 
+Q 1781 4219 1451 3797 
+Q 1122 3375 1075 2522 
+Q 1259 2794 1537 2939 
+Q 1816 3084 2150 3084 
+Q 2853 3084 3261 2657 
+Q 3669 2231 3669 1497 
+Q 3669 778 3244 343 
+Q 2819 -91 2113 -91 
+Q 1303 -91 875 529 
+Q 447 1150 447 2328 
+Q 447 3434 972 4092 
+Q 1497 4750 2381 4750 
+Q 2619 4750 2861 4703 
+Q 3103 4656 3366 4563 
+z
+" transform="scale(0.015625)"/>
+       </defs>
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.765625)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.765625)"/>
+       <use xlink:href="#DejaVuSans-36" transform="translate(128.203125 39.046875) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_2">
+     <g id="line2d_3">
+      <path d="M 232.62352 296.728 
+L 232.62352 7.2 
+" clip-path="url(#p19fa7cb5a1)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_4">
+      <g>
+       <use xlink:href="#mf2850f2670" x="232.62352" y="296.728" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_2">
+      <!-- $\mathdefault{10^{7}}$ -->
+      <g transform="translate(223.82352 311.326437) scale(0.1 -0.1)">
+       <defs>
+        <path id="DejaVuSans-37" d="M 525 4666 
+L 3525 4666 
+L 3525 4397 
+L 1831 0 
+L 1172 0 
+L 2766 4134 
+L 525 4134 
+L 525 4666 
+z
+" transform="scale(0.015625)"/>
+       </defs>
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.684375)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.684375)"/>
+       <use xlink:href="#DejaVuSans-37" transform="translate(128.203125 38.965625) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_3">
+     <g id="line2d_5">
+      <path d="M 405.145824 296.728 
+L 405.145824 7.2 
+" clip-path="url(#p19fa7cb5a1)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_6">
+      <g>
+       <use xlink:href="#mf2850f2670" x="405.145824" y="296.728" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_3">
+      <!-- $\mathdefault{10^{8}}$ -->
+      <g transform="translate(396.345824 311.326437) scale(0.1 -0.1)">
+       <defs>
+        <path id="DejaVuSans-38" d="M 2034 2216 
+Q 1584 2216 1326 1975 
+Q 1069 1734 1069 1313 
+Q 1069 891 1326 650 
+Q 1584 409 2034 409 
+Q 2484 409 2743 651 
+Q 3003 894 3003 1313 
+Q 3003 1734 2745 1975 
+Q 2488 2216 2034 2216 
+z
+M 1403 2484 
+Q 997 2584 770 2862 
+Q 544 3141 544 3541 
+Q 544 4100 942 4425 
+Q 1341 4750 2034 4750 
+Q 2731 4750 3128 4425 
+Q 3525 4100 3525 3541 
+Q 3525 3141 3298 2862 
+Q 3072 2584 2669 2484 
+Q 3125 2378 3379 2068 
+Q 3634 1759 3634 1313 
+Q 3634 634 3220 271 
+Q 2806 -91 2034 -91 
+Q 1263 -91 848 271 
+Q 434 634 434 1313 
+Q 434 1759 690 2068 
+Q 947 2378 1403 2484 
+z
+M 1172 3481 
+Q 1172 3119 1398 2916 
+Q 1625 2713 2034 2713 
+Q 2441 2713 2670 2916 
+Q 2900 3119 2900 3481 
+Q 2900 3844 2670 4047 
+Q 2441 4250 2034 4250 
+Q 1625 4250 1398 4047 
+Q 1172 3844 1172 3481 
+z
+" transform="scale(0.015625)"/>
+       </defs>
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.765625)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.765625)"/>
+       <use xlink:href="#DejaVuSans-38" transform="translate(128.203125 39.046875) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_4">
+     <g id="line2d_7">
+      <defs>
+       <path id="m605e565844" d="M 0 0 
+L 0 2 
+" style="stroke: #000000; stroke-width: 0.6"/>
+      </defs>
+      <g>
+       <use xlink:href="#m605e565844" x="52.207029" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_5">
+     <g id="line2d_8">
+      <g>
+       <use xlink:href="#m605e565844" x="112.035605" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_6">
+     <g id="line2d_9">
+      <g>
+       <use xlink:href="#m605e565844" x="142.415275" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_7">
+     <g id="line2d_10">
+      <g>
+       <use xlink:href="#m605e565844" x="163.969993" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_8">
+     <g id="line2d_11">
+      <g>
+       <use xlink:href="#m605e565844" x="180.689132" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_9">
+     <g id="line2d_12">
+      <g>
+       <use xlink:href="#m605e565844" x="194.349663" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_10">
+     <g id="line2d_13">
+      <g>
+       <use xlink:href="#m605e565844" x="205.899477" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_11">
+     <g id="line2d_14">
+      <g>
+       <use xlink:href="#m605e565844" x="215.904382" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_12">
+     <g id="line2d_15">
+      <g>
+       <use xlink:href="#m605e565844" x="224.729333" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_13">
+     <g id="line2d_16">
+      <g>
+       <use xlink:href="#m605e565844" x="284.557909" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_14">
+     <g id="line2d_17">
+      <g>
+       <use xlink:href="#m605e565844" x="314.937579" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_15">
+     <g id="line2d_18">
+      <g>
+       <use xlink:href="#m605e565844" x="336.492297" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_16">
+     <g id="line2d_19">
+      <g>
+       <use xlink:href="#m605e565844" x="353.211436" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_17">
+     <g id="line2d_20">
+      <g>
+       <use xlink:href="#m605e565844" x="366.871967" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_18">
+     <g id="line2d_21">
+      <g>
+       <use xlink:href="#m605e565844" x="378.421781" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_19">
+     <g id="line2d_22">
+      <g>
+       <use xlink:href="#m605e565844" x="388.426686" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="xtick_20">
+     <g id="line2d_23">
+      <g>
+       <use xlink:href="#m605e565844" x="397.251637" y="296.728" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="text_4">
+     <!-- Array length [-] -->
+     <g transform="translate(207.137239 325.004562) scale(0.1 -0.1)">
+      <defs>
+       <path id="DejaVuSans-41" d="M 2188 4044 
+L 1331 1722 
+L 3047 1722 
+L 2188 4044 
+z
+M 1831 4666 
+L 2547 4666 
+L 4325 0 
+L 3669 0 
+L 3244 1197 
+L 1141 1197 
+L 716 0 
+L 50 0 
+L 1831 4666 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-72" d="M 2631 2963 
+Q 2534 3019 2420 3045 
+Q 2306 3072 2169 3072 
+Q 1681 3072 1420 2755 
+Q 1159 2438 1159 1844 
+L 1159 0 
+L 581 0 
+L 581 3500 
+L 1159 3500 
+L 1159 2956 
+Q 1341 3275 1631 3429 
+Q 1922 3584 2338 3584 
+Q 2397 3584 2469 3576 
+Q 2541 3569 2628 3553 
+L 2631 2963 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-61" d="M 2194 1759 
+Q 1497 1759 1228 1600 
+Q 959 1441 959 1056 
+Q 959 750 1161 570 
+Q 1363 391 1709 391 
+Q 2188 391 2477 730 
+Q 2766 1069 2766 1631 
+L 2766 1759 
+L 2194 1759 
+z
+M 3341 1997 
+L 3341 0 
+L 2766 0 
+L 2766 531 
+Q 2569 213 2275 61 
+Q 1981 -91 1556 -91 
+Q 1019 -91 701 211 
+Q 384 513 384 1019 
+Q 384 1609 779 1909 
+Q 1175 2209 1959 2209 
+L 2766 2209 
+L 2766 2266 
+Q 2766 2663 2505 2880 
+Q 2244 3097 1772 3097 
+Q 1472 3097 1187 3025 
+Q 903 2953 641 2809 
+L 641 3341 
+Q 956 3463 1253 3523 
+Q 1550 3584 1831 3584 
+Q 2591 3584 2966 3190 
+Q 3341 2797 3341 1997 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-79" d="M 2059 -325 
+Q 1816 -950 1584 -1140 
+Q 1353 -1331 966 -1331 
+L 506 -1331 
+L 506 -850 
+L 844 -850 
+Q 1081 -850 1212 -737 
+Q 1344 -625 1503 -206 
+L 1606 56 
+L 191 3500 
+L 800 3500 
+L 1894 763 
+L 2988 3500 
+L 3597 3500 
+L 2059 -325 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-20" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-6c" d="M 603 4863 
+L 1178 4863 
+L 1178 0 
+L 603 0 
+L 603 4863 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-65" d="M 3597 1894 
+L 3597 1613 
+L 953 1613 
+Q 991 1019 1311 708 
+Q 1631 397 2203 397 
+Q 2534 397 2845 478 
+Q 3156 559 3463 722 
+L 3463 178 
+Q 3153 47 2828 -22 
+Q 2503 -91 2169 -91 
+Q 1331 -91 842 396 
+Q 353 884 353 1716 
+Q 353 2575 817 3079 
+Q 1281 3584 2069 3584 
+Q 2775 3584 3186 3129 
+Q 3597 2675 3597 1894 
+z
+M 3022 2063 
+Q 3016 2534 2758 2815 
+Q 2500 3097 2075 3097 
+Q 1594 3097 1305 2825 
+Q 1016 2553 972 2059 
+L 3022 2063 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-6e" d="M 3513 2113 
+L 3513 0 
+L 2938 0 
+L 2938 2094 
+Q 2938 2591 2744 2837 
+Q 2550 3084 2163 3084 
+Q 1697 3084 1428 2787 
+Q 1159 2491 1159 1978 
+L 1159 0 
+L 581 0 
+L 581 3500 
+L 1159 3500 
+L 1159 2956 
+Q 1366 3272 1645 3428 
+Q 1925 3584 2291 3584 
+Q 2894 3584 3203 3211 
+Q 3513 2838 3513 2113 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-67" d="M 2906 1791 
+Q 2906 2416 2648 2759 
+Q 2391 3103 1925 3103 
+Q 1463 3103 1205 2759 
+Q 947 2416 947 1791 
+Q 947 1169 1205 825 
+Q 1463 481 1925 481 
+Q 2391 481 2648 825 
+Q 2906 1169 2906 1791 
+z
+M 3481 434 
+Q 3481 -459 3084 -895 
+Q 2688 -1331 1869 -1331 
+Q 1566 -1331 1297 -1286 
+Q 1028 -1241 775 -1147 
+L 775 -588 
+Q 1028 -725 1275 -790 
+Q 1522 -856 1778 -856 
+Q 2344 -856 2625 -561 
+Q 2906 -266 2906 331 
+L 2906 616 
+Q 2728 306 2450 153 
+Q 2172 0 1784 0 
+Q 1141 0 747 490 
+Q 353 981 353 1791 
+Q 353 2603 747 3093 
+Q 1141 3584 1784 3584 
+Q 2172 3584 2450 3431 
+Q 2728 3278 2906 2969 
+L 2906 3500 
+L 3481 3500 
+L 3481 434 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-74" d="M 1172 4494 
+L 1172 3500 
+L 2356 3500 
+L 2356 3053 
+L 1172 3053 
+L 1172 1153 
+Q 1172 725 1289 603 
+Q 1406 481 1766 481 
+L 2356 481 
+L 2356 0 
+L 1766 0 
+Q 1100 0 847 248 
+Q 594 497 594 1153 
+L 594 3053 
+L 172 3053 
+L 172 3500 
+L 594 3500 
+L 594 4494 
+L 1172 4494 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-68" d="M 3513 2113 
+L 3513 0 
+L 2938 0 
+L 2938 2094 
+Q 2938 2591 2744 2837 
+Q 2550 3084 2163 3084 
+Q 1697 3084 1428 2787 
+Q 1159 2491 1159 1978 
+L 1159 0 
+L 581 0 
+L 581 4863 
+L 1159 4863 
+L 1159 2956 
+Q 1366 3272 1645 3428 
+Q 1925 3584 2291 3584 
+Q 2894 3584 3203 3211 
+Q 3513 2838 3513 2113 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-5b" d="M 550 4863 
+L 1875 4863 
+L 1875 4416 
+L 1125 4416 
+L 1125 -397 
+L 1875 -397 
+L 1875 -844 
+L 550 -844 
+L 550 4863 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-2d" d="M 313 2009 
+L 1997 2009 
+L 1997 1497 
+L 313 1497 
+L 313 2009 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-5d" d="M 1947 4863 
+L 1947 -844 
+L 622 -844 
+L 622 -397 
+L 1369 -397 
+L 1369 4416 
+L 622 4416 
+L 622 4863 
+L 1947 4863 
+z
+" transform="scale(0.015625)"/>
+      </defs>
+      <use xlink:href="#DejaVuSans-41"/>
+      <use xlink:href="#DejaVuSans-72" x="68.408203"/>
+      <use xlink:href="#DejaVuSans-72" x="107.771484"/>
+      <use xlink:href="#DejaVuSans-61" x="148.884766"/>
+      <use xlink:href="#DejaVuSans-79" x="210.164062"/>
+      <use xlink:href="#DejaVuSans-20" x="269.34375"/>
+      <use xlink:href="#DejaVuSans-6c" x="301.130859"/>
+      <use xlink:href="#DejaVuSans-65" x="328.914062"/>
+      <use xlink:href="#DejaVuSans-6e" x="390.4375"/>
+      <use xlink:href="#DejaVuSans-67" x="453.816406"/>
+      <use xlink:href="#DejaVuSans-74" x="517.292969"/>
+      <use xlink:href="#DejaVuSans-68" x="556.501953"/>
+      <use xlink:href="#DejaVuSans-20" x="619.880859"/>
+      <use xlink:href="#DejaVuSans-5b" x="651.667969"/>
+      <use xlink:href="#DejaVuSans-2d" x="690.681641"/>
+      <use xlink:href="#DejaVuSans-5d" x="726.765625"/>
+     </g>
+    </g>
+   </g>
+   <g id="matplotlib.axis_2">
+    <g id="ytick_1">
+     <g id="line2d_24">
+      <path d="M 45.478125 248.368992 
+L 445.372916 248.368992 
+" clip-path="url(#p19fa7cb5a1)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_25">
+      <defs>
+       <path id="md78bd7114c" d="M 0 0 
+L -3.5 0 
+" style="stroke: #000000; stroke-width: 0.8"/>
+      </defs>
+      <g>
+       <use xlink:href="#md78bd7114c" x="45.478125" y="248.368992" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_5">
+      <!-- $\mathdefault{10^{0}}$ -->
+      <g transform="translate(20.878125 252.168211) scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.765625)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.765625)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(128.203125 39.046875) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_2">
+     <g id="line2d_26">
+      <path d="M 45.478125 154.350225 
+L 445.372916 154.350225 
+" clip-path="url(#p19fa7cb5a1)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_27">
+      <g>
+       <use xlink:href="#md78bd7114c" x="45.478125" y="154.350225" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_6">
+      <!-- $\mathdefault{10^{1}}$ -->
+      <g transform="translate(20.878125 158.149444) scale(0.1 -0.1)">
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.684375)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.684375)"/>
+       <use xlink:href="#DejaVuSans-31" transform="translate(128.203125 38.965625) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_3">
+     <g id="line2d_28">
+      <path d="M 45.478125 60.331458 
+L 445.372916 60.331458 
+" clip-path="url(#p19fa7cb5a1)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.8; stroke-linecap: square"/>
+     </g>
+     <g id="line2d_29">
+      <g>
+       <use xlink:href="#md78bd7114c" x="45.478125" y="60.331458" style="stroke: #000000; stroke-width: 0.8"/>
+      </g>
+     </g>
+     <g id="text_7">
+      <!-- $\mathdefault{10^{2}}$ -->
+      <g transform="translate(20.878125 64.130676) scale(0.1 -0.1)">
+       <defs>
+        <path id="DejaVuSans-32" d="M 1228 531 
+L 3431 531 
+L 3431 0 
+L 469 0 
+L 469 531 
+Q 828 903 1448 1529 
+Q 2069 2156 2228 2338 
+Q 2531 2678 2651 2914 
+Q 2772 3150 2772 3378 
+Q 2772 3750 2511 3984 
+Q 2250 4219 1831 4219 
+Q 1534 4219 1204 4116 
+Q 875 4013 500 3803 
+L 500 4441 
+Q 881 4594 1212 4672 
+Q 1544 4750 1819 4750 
+Q 2544 4750 2975 4387 
+Q 3406 4025 3406 3419 
+Q 3406 3131 3298 2873 
+Q 3191 2616 2906 2266 
+Q 2828 2175 2409 1742 
+Q 1991 1309 1228 531 
+z
+" transform="scale(0.015625)"/>
+       </defs>
+       <use xlink:href="#DejaVuSans-31" transform="translate(0 0.765625)"/>
+       <use xlink:href="#DejaVuSans-30" transform="translate(63.623047 0.765625)"/>
+       <use xlink:href="#DejaVuSans-32" transform="translate(128.203125 39.046875) scale(0.7)"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_4">
+     <g id="line2d_30">
+      <defs>
+       <path id="m8f710c7ce1" d="M 0 0 
+L -2 0 
+" style="stroke: #000000; stroke-width: 0.6"/>
+      </defs>
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="285.782821" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_5">
+     <g id="line2d_31">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="276.671461" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_6">
+     <g id="line2d_32">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="269.226938" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_7">
+     <g id="line2d_33">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="262.932683" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_8">
+     <g id="line2d_34">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="257.480352" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_9">
+     <g id="line2d_35">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="252.671055" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_10">
+     <g id="line2d_36">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="220.066523" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_11">
+     <g id="line2d_37">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="203.51064" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_12">
+     <g id="line2d_38">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="191.764054" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_13">
+     <g id="line2d_39">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="182.652694" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_14">
+     <g id="line2d_40">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="175.208171" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_15">
+     <g id="line2d_41">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="168.913916" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_16">
+     <g id="line2d_42">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="163.461585" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_17">
+     <g id="line2d_43">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="158.652288" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_18">
+     <g id="line2d_44">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="126.047756" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_19">
+     <g id="line2d_45">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="109.491873" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_20">
+     <g id="line2d_46">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="97.745287" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_21">
+     <g id="line2d_47">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="88.633927" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_22">
+     <g id="line2d_48">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="81.189404" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_23">
+     <g id="line2d_49">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="74.895149" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_24">
+     <g id="line2d_50">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="69.442818" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_25">
+     <g id="line2d_51">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="64.63352" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_26">
+     <g id="line2d_52">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="32.028989" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="ytick_27">
+     <g id="line2d_53">
+      <g>
+       <use xlink:href="#m8f710c7ce1" x="45.478125" y="15.473105" style="stroke: #000000; stroke-width: 0.6"/>
+      </g>
+     </g>
+    </g>
+    <g id="text_8">
+     <!-- Runtime [ms] -->
+     <g transform="translate(14.798438 185.812437) rotate(-90) scale(0.1 -0.1)">
+      <defs>
+       <path id="DejaVuSans-52" d="M 2841 2188 
+Q 3044 2119 3236 1894 
+Q 3428 1669 3622 1275 
+L 4263 0 
+L 3584 0 
+L 2988 1197 
+Q 2756 1666 2539 1819 
+Q 2322 1972 1947 1972 
+L 1259 1972 
+L 1259 0 
+L 628 0 
+L 628 4666 
+L 2053 4666 
+Q 2853 4666 3247 4331 
+Q 3641 3997 3641 3322 
+Q 3641 2881 3436 2590 
+Q 3231 2300 2841 2188 
+z
+M 1259 4147 
+L 1259 2491 
+L 2053 2491 
+Q 2509 2491 2742 2702 
+Q 2975 2913 2975 3322 
+Q 2975 3731 2742 3939 
+Q 2509 4147 2053 4147 
+L 1259 4147 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-75" d="M 544 1381 
+L 544 3500 
+L 1119 3500 
+L 1119 1403 
+Q 1119 906 1312 657 
+Q 1506 409 1894 409 
+Q 2359 409 2629 706 
+Q 2900 1003 2900 1516 
+L 2900 3500 
+L 3475 3500 
+L 3475 0 
+L 2900 0 
+L 2900 538 
+Q 2691 219 2414 64 
+Q 2138 -91 1772 -91 
+Q 1169 -91 856 284 
+Q 544 659 544 1381 
+z
+M 1991 3584 
+L 1991 3584 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-69" d="M 603 3500 
+L 1178 3500 
+L 1178 0 
+L 603 0 
+L 603 3500 
+z
+M 603 4863 
+L 1178 4863 
+L 1178 4134 
+L 603 4134 
+L 603 4863 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-6d" d="M 3328 2828 
+Q 3544 3216 3844 3400 
+Q 4144 3584 4550 3584 
+Q 5097 3584 5394 3201 
+Q 5691 2819 5691 2113 
+L 5691 0 
+L 5113 0 
+L 5113 2094 
+Q 5113 2597 4934 2840 
+Q 4756 3084 4391 3084 
+Q 3944 3084 3684 2787 
+Q 3425 2491 3425 1978 
+L 3425 0 
+L 2847 0 
+L 2847 2094 
+Q 2847 2600 2669 2842 
+Q 2491 3084 2119 3084 
+Q 1678 3084 1418 2786 
+Q 1159 2488 1159 1978 
+L 1159 0 
+L 581 0 
+L 581 3500 
+L 1159 3500 
+L 1159 2956 
+Q 1356 3278 1631 3431 
+Q 1906 3584 2284 3584 
+Q 2666 3584 2933 3390 
+Q 3200 3197 3328 2828 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-73" d="M 2834 3397 
+L 2834 2853 
+Q 2591 2978 2328 3040 
+Q 2066 3103 1784 3103 
+Q 1356 3103 1142 2972 
+Q 928 2841 928 2578 
+Q 928 2378 1081 2264 
+Q 1234 2150 1697 2047 
+L 1894 2003 
+Q 2506 1872 2764 1633 
+Q 3022 1394 3022 966 
+Q 3022 478 2636 193 
+Q 2250 -91 1575 -91 
+Q 1294 -91 989 -36 
+Q 684 19 347 128 
+L 347 722 
+Q 666 556 975 473 
+Q 1284 391 1588 391 
+Q 1994 391 2212 530 
+Q 2431 669 2431 922 
+Q 2431 1156 2273 1281 
+Q 2116 1406 1581 1522 
+L 1381 1569 
+Q 847 1681 609 1914 
+Q 372 2147 372 2553 
+Q 372 3047 722 3315 
+Q 1072 3584 1716 3584 
+Q 2034 3584 2315 3537 
+Q 2597 3491 2834 3397 
+z
+" transform="scale(0.015625)"/>
+      </defs>
+      <use xlink:href="#DejaVuSans-52"/>
+      <use xlink:href="#DejaVuSans-75" x="64.982422"/>
+      <use xlink:href="#DejaVuSans-6e" x="128.361328"/>
+      <use xlink:href="#DejaVuSans-74" x="191.740234"/>
+      <use xlink:href="#DejaVuSans-69" x="230.949219"/>
+      <use xlink:href="#DejaVuSans-6d" x="258.732422"/>
+      <use xlink:href="#DejaVuSans-65" x="356.144531"/>
+      <use xlink:href="#DejaVuSans-20" x="417.667969"/>
+      <use xlink:href="#DejaVuSans-5b" x="449.455078"/>
+      <use xlink:href="#DejaVuSans-6d" x="488.46875"/>
+      <use xlink:href="#DejaVuSans-73" x="585.880859"/>
+      <use xlink:href="#DejaVuSans-5d" x="637.980469"/>
+     </g>
+    </g>
+   </g>
+   <g id="line2d_54">
+    <path d="M 63.655161 283.567636 
+L 115.589549 252.816493 
+L 167.523938 230.080993 
+L 219.458326 197.032473 
+L 271.392715 172.877766 
+L 323.327103 144.699865 
+L 375.261491 115.844104 
+L 427.19588 88.587894 
+" clip-path="url(#p19fa7cb5a1)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square"/>
+   </g>
+   <g id="line2d_55">
+    <path d="M 63.655161 211.185285 
+L 115.589549 185.00026 
+L 167.523938 156.743746 
+L 219.458326 127.527874 
+L 271.392715 99.478103 
+L 323.327103 70.75704 
+L 375.261491 41.934025 
+L 427.19588 20.360364 
+" clip-path="url(#p19fa7cb5a1)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square"/>
+   </g>
+   <g id="line2d_56">
+    <path d="M 63.655161 222.156194 
+L 115.589549 206.007454 
+L 167.523938 187.03032 
+L 219.458326 159.450082 
+L 271.392715 138.019263 
+L 323.327103 112.934535 
+L 375.261491 86.398481 
+L 427.19588 58.745319 
+" clip-path="url(#p19fa7cb5a1)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square"/>
+   </g>
+   <g id="line2d_57">
+    <path d="M 63.655161 276.405777 
+L 115.589549 272.861815 
+L 167.523938 262.070156 
+L 219.458326 249.933877 
+L 271.392715 228.623039 
+L 323.327103 214.106 
+L 375.261491 186.554306 
+L 427.19588 162.110304 
+" clip-path="url(#p19fa7cb5a1)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square"/>
+   </g>
+   <g id="patch_3">
+    <path d="M 45.478125 296.728 
+L 45.478125 7.2 
+" style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/>
+   </g>
+   <g id="patch_4">
+    <path d="M 445.372916 296.728 
+L 445.372916 7.2 
+" style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/>
+   </g>
+   <g id="patch_5">
+    <path d="M 45.478125 296.728 
+L 445.372916 296.728 
+" style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/>
+   </g>
+   <g id="patch_6">
+    <path d="M 45.478125 7.2 
+L 445.372916 7.2 
+" style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square"/>
+   </g>
+   <g id="legend_1">
+    <g id="patch_7">
+     <path d="M 52.478125 73.9125 
+L 149.4125 73.9125 
+Q 151.4125 73.9125 151.4125 71.9125 
+L 151.4125 14.2 
+Q 151.4125 12.2 149.4125 12.2 
+L 52.478125 12.2 
+Q 50.478125 12.2 50.478125 14.2 
+L 50.478125 71.9125 
+Q 50.478125 73.9125 52.478125 73.9125 
+z
+" style="fill: #ffffff; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter"/>
+    </g>
+    <g id="line2d_58">
+     <path d="M 54.478125 20.298437 
+L 64.478125 20.298437 
+L 74.478125 20.298437 
+" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square"/>
+    </g>
+    <g id="text_9">
+     <!-- CPU -->
+     <g transform="translate(82.478125 23.798437) scale(0.1 -0.1)">
+      <defs>
+       <path id="DejaVuSans-43" d="M 4122 4306 
+L 4122 3641 
+Q 3803 3938 3442 4084 
+Q 3081 4231 2675 4231 
+Q 1875 4231 1450 3742 
+Q 1025 3253 1025 2328 
+Q 1025 1406 1450 917 
+Q 1875 428 2675 428 
+Q 3081 428 3442 575 
+Q 3803 722 4122 1019 
+L 4122 359 
+Q 3791 134 3420 21 
+Q 3050 -91 2638 -91 
+Q 1578 -91 968 557 
+Q 359 1206 359 2328 
+Q 359 3453 968 4101 
+Q 1578 4750 2638 4750 
+Q 3056 4750 3426 4639 
+Q 3797 4528 4122 4306 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-50" d="M 1259 4147 
+L 1259 2394 
+L 2053 2394 
+Q 2494 2394 2734 2622 
+Q 2975 2850 2975 3272 
+Q 2975 3691 2734 3919 
+Q 2494 4147 2053 4147 
+L 1259 4147 
+z
+M 628 4666 
+L 2053 4666 
+Q 2838 4666 3239 4311 
+Q 3641 3956 3641 3272 
+Q 3641 2581 3239 2228 
+Q 2838 1875 2053 1875 
+L 1259 1875 
+L 1259 0 
+L 628 0 
+L 628 4666 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-55" d="M 556 4666 
+L 1191 4666 
+L 1191 1831 
+Q 1191 1081 1462 751 
+Q 1734 422 2344 422 
+Q 2950 422 3222 751 
+Q 3494 1081 3494 1831 
+L 3494 4666 
+L 4128 4666 
+L 4128 1753 
+Q 4128 841 3676 375 
+Q 3225 -91 2344 -91 
+Q 1459 -91 1007 375 
+Q 556 841 556 1753 
+L 556 4666 
+z
+" transform="scale(0.015625)"/>
+      </defs>
+      <use xlink:href="#DejaVuSans-43"/>
+      <use xlink:href="#DejaVuSans-50" x="69.824219"/>
+      <use xlink:href="#DejaVuSans-55" x="130.126953"/>
+     </g>
+    </g>
+    <g id="line2d_59">
+     <path d="M 54.478125 34.976562 
+L 64.478125 34.976562 
+L 74.478125 34.976562 
+" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square"/>
+    </g>
+    <g id="text_10">
+     <!-- GPU naive -->
+     <g transform="translate(82.478125 38.476562) scale(0.1 -0.1)">
+      <defs>
+       <path id="DejaVuSans-47" d="M 3809 666 
+L 3809 1919 
+L 2778 1919 
+L 2778 2438 
+L 4434 2438 
+L 4434 434 
+Q 4069 175 3628 42 
+Q 3188 -91 2688 -91 
+Q 1594 -91 976 548 
+Q 359 1188 359 2328 
+Q 359 3472 976 4111 
+Q 1594 4750 2688 4750 
+Q 3144 4750 3555 4637 
+Q 3966 4525 4313 4306 
+L 4313 3634 
+Q 3963 3931 3569 4081 
+Q 3175 4231 2741 4231 
+Q 1884 4231 1454 3753 
+Q 1025 3275 1025 2328 
+Q 1025 1384 1454 906 
+Q 1884 428 2741 428 
+Q 3075 428 3337 486 
+Q 3600 544 3809 666 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-76" d="M 191 3500 
+L 800 3500 
+L 1894 563 
+L 2988 3500 
+L 3597 3500 
+L 2284 0 
+L 1503 0 
+L 191 3500 
+z
+" transform="scale(0.015625)"/>
+      </defs>
+      <use xlink:href="#DejaVuSans-47"/>
+      <use xlink:href="#DejaVuSans-50" x="77.490234"/>
+      <use xlink:href="#DejaVuSans-55" x="137.792969"/>
+      <use xlink:href="#DejaVuSans-20" x="210.986328"/>
+      <use xlink:href="#DejaVuSans-6e" x="242.773438"/>
+      <use xlink:href="#DejaVuSans-61" x="306.152344"/>
+      <use xlink:href="#DejaVuSans-69" x="367.431641"/>
+      <use xlink:href="#DejaVuSans-76" x="395.214844"/>
+      <use xlink:href="#DejaVuSans-65" x="454.394531"/>
+     </g>
+    </g>
+    <g id="line2d_60">
+     <path d="M 54.478125 49.654687 
+L 64.478125 49.654687 
+L 74.478125 49.654687 
+" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square"/>
+    </g>
+    <g id="text_11">
+     <!-- GPU efficient -->
+     <g transform="translate(82.478125 53.154687) scale(0.1 -0.1)">
+      <defs>
+       <path id="DejaVuSans-66" d="M 2375 4863 
+L 2375 4384 
+L 1825 4384 
+Q 1516 4384 1395 4259 
+Q 1275 4134 1275 3809 
+L 1275 3500 
+L 2222 3500 
+L 2222 3053 
+L 1275 3053 
+L 1275 0 
+L 697 0 
+L 697 3053 
+L 147 3053 
+L 147 3500 
+L 697 3500 
+L 697 3744 
+Q 697 4328 969 4595 
+Q 1241 4863 1831 4863 
+L 2375 4863 
+z
+" transform="scale(0.015625)"/>
+       <path id="DejaVuSans-63" d="M 3122 3366 
+L 3122 2828 
+Q 2878 2963 2633 3030 
+Q 2388 3097 2138 3097 
+Q 1578 3097 1268 2742 
+Q 959 2388 959 1747 
+Q 959 1106 1268 751 
+Q 1578 397 2138 397 
+Q 2388 397 2633 464 
+Q 2878 531 3122 666 
+L 3122 134 
+Q 2881 22 2623 -34 
+Q 2366 -91 2075 -91 
+Q 1284 -91 818 406 
+Q 353 903 353 1747 
+Q 353 2603 823 3093 
+Q 1294 3584 2113 3584 
+Q 2378 3584 2631 3529 
+Q 2884 3475 3122 3366 
+z
+" transform="scale(0.015625)"/>
+      </defs>
+      <use xlink:href="#DejaVuSans-47"/>
+      <use xlink:href="#DejaVuSans-50" x="77.490234"/>
+      <use xlink:href="#DejaVuSans-55" x="137.792969"/>
+      <use xlink:href="#DejaVuSans-20" x="210.986328"/>
+      <use xlink:href="#DejaVuSans-65" x="242.773438"/>
+      <use xlink:href="#DejaVuSans-66" x="304.296875"/>
+      <use xlink:href="#DejaVuSans-66" x="339.501953"/>
+      <use xlink:href="#DejaVuSans-69" x="374.707031"/>
+      <use xlink:href="#DejaVuSans-63" x="402.490234"/>
+      <use xlink:href="#DejaVuSans-69" x="457.470703"/>
+      <use xlink:href="#DejaVuSans-65" x="485.253906"/>
+      <use xlink:href="#DejaVuSans-6e" x="546.777344"/>
+      <use xlink:href="#DejaVuSans-74" x="610.15625"/>
+     </g>
+    </g>
+    <g id="line2d_61">
+     <path d="M 54.478125 64.332812 
+L 64.478125 64.332812 
+L 74.478125 64.332812 
+" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square"/>
+    </g>
+    <g id="text_12">
+     <!-- GPU thrust -->
+     <g transform="translate(82.478125 67.832812) scale(0.1 -0.1)">
+      <use xlink:href="#DejaVuSans-47"/>
+      <use xlink:href="#DejaVuSans-50" x="77.490234"/>
+      <use xlink:href="#DejaVuSans-55" x="137.792969"/>
+      <use xlink:href="#DejaVuSans-20" x="210.986328"/>
+      <use xlink:href="#DejaVuSans-74" x="242.773438"/>
+      <use xlink:href="#DejaVuSans-68" x="281.982422"/>
+      <use xlink:href="#DejaVuSans-72" x="345.361328"/>
+      <use xlink:href="#DejaVuSans-75" x="386.474609"/>
+      <use xlink:href="#DejaVuSans-73" x="449.853516"/>
+      <use xlink:href="#DejaVuSans-74" x="501.953125"/>
+     </g>
+    </g>
+   </g>
+  </g>
+ </g>
+ <defs>
+  <clipPath id="p19fa7cb5a1">
+   <rect x="45.478125" y="7.2" width="399.894791" height="289.528"/>
+  </clipPath>
+ </defs>
+</svg>

From 1dce29ca8fb9334a9da978f0f2fbaf1d39e2f1d2 Mon Sep 17 00:00:00 2001
From: DomIno0o <51756125+DomIno0o@users.noreply.github.com>
Date: Wed, 18 Sep 2024 23:34:19 -0400
Subject: [PATCH 08/10] Initial update of README.md

---
 README.md | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 110 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 0e38ddb..2908c8f 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,115 @@
-CUDA Stream Compaction
-======================
+Project 2 Stream Compaction
+===========================
 
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
+**University of Pennsylvania, CIS 5650: GPU Programming and Architecture**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Dominik Kau ([LinkedIn](https://www.linkedin.com/in/dominikkau/))
+* Tested on: Windows 10, i7-12700 @ 2.10 GHz, 32 GB, T1000 4096 MB (CETS machine)
 
-### (TODO: Your README)
+## Scan and Stream Compaction
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+Scan is an algorithm that outputs an array in which all previous elemnts of the input array have been summed up (https://en.wikipedia.org/wiki/Prefix_sum).
+Stream compaction outputs an array that contains only those elements of the input that fulfill some predicate (in this project being non-zero).
+The parallel implementation of stream compaction in this project will make use of the scan algorithm.
 
+This project contains 4 implementations of the scan algorithm:
+* straight-forward implementation on the CPU (CPU)
+* naive, parallel implementation on the GPU (GPU naive)
+* work-efficient implementation on the GPU (GPU efficient)
+* implementation using the thrust library (GPU thrust)
+
+The stream compatction algorithm is implememented in 3 variants:
+* straight-forward implementation on the CPU (CPU)
+* scan based implementation on the CPU
+* work-efficient scan based implementation on the GPU (GPU)
+
+These cases are labeled in the following graphs by the identifiers given in parantheses.
+All implementations were tested on array lengths that are powers of 2 and on array lengths that are not powers of 2.
+
+## Performance Analysis
+
+### Scan timings on arrays with sizes that are powers of 2
+
+![](img/performance_pow2.svg)
+
+Surprisingly, the CPU implementation is quicker than both of my GPU implementations.
+This is despite the fact, that for both GPU implementations, I am launching kernels only with as many threads as are needed (up to the block size).
+This prevents starting many threads that will not be active after a first if-condition.
+As far as I understand this is what Part 5, Extra Credit is alluding to.
+This result might be due to the usage of a CETS machine.
+
+Unfortunately, I haven't been able to make a detailed performance analysis using the Nsight suite, but obviously the memory access pattern is suboptimal.
+Firstly, using global memory will lead to high latencies in each kernel call.
+Secondly, the strided indexing pattern in both GPU implementations has the same performance as random memory access, which creates a big bottleneck.
+
+Those bottlenecks are not present in the thrust implementation, which is by far the fastest.
+
+### Scan timings on arrays with sizes that are not powers of 2
+
+![](img/performance_nonpow2.svg)
+
+The performance on arrays with non-power-of-2 lengths reflects the same trends as the results above.
+Interestingly, for smaller arrays the thrust implementation is quite a bit quicker in this scenario than in the above case of arrays with power-of-2 lengths.
+
+### Timing results of compacting algorithm
+
+![](img/performance_compact.svg)
+
+For the compacting algorithm, the GPU implementation is actually faster for large array sizes.
+Here, only the straight-forward CPU implementation is used, as it is faster than the scan based algorithm.
+I would have expected bigeer differences between the two GPU and the CPU implementation, but again, the surprisingly short CPU runtime could stem from the fact that I am using a CETS machine.
+There is not a big difference between the arrays with power of 2 lengths and those with non-power of 2 lengths.
+
+
+## Console Output
+
+This is the console ouput after running the project with an array size of $2^{20} = 1048576$.
+I removed the numeric outputs for clarity.
+
+```
+****************
+** SCAN TESTS **
+****************
+==== cpu scan, power-of-two ====
+   elapsed time: 0.4223ms    (std::chrono Measured)
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.3906ms    (std::chrono Measured)
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 2.48595ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 2.34307ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 1.90022ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 1.82384ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.503264ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.183776ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 2.1741ms    (std::chrono Measured)
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 2.1809ms    (std::chrono Measured)
+    passed
+==== cpu compact with scan ====
+   elapsed time: 3.3442ms    (std::chrono Measured)
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 3.5247ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 3.73046ms    (CUDA Measured)
+    passed
+```

From a9244875852d6db4864dd412d74a8dd8b85b1332 Mon Sep 17 00:00:00 2001
From: DomIno0o <51756125+DomIno0o@users.noreply.github.com>
Date: Wed, 18 Sep 2024 23:43:42 -0400
Subject: [PATCH 09/10] Update README.md

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 2908c8f..d2ee079 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ Project 2 Stream Compaction
 
 ## Scan and Stream Compaction
 
-Scan is an algorithm that outputs an array in which all previous elemnts of the input array have been summed up (https://en.wikipedia.org/wiki/Prefix_sum).
+Scan is an algorithm that outputs an array in which all previous elements of the input array have been summed up (https://en.wikipedia.org/wiki/Prefix_sum).
 Stream compaction outputs an array that contains only those elements of the input that fulfill some predicate (in this project being non-zero).
 The parallel implementation of stream compaction in this project will make use of the scan algorithm.
 
@@ -18,13 +18,14 @@ This project contains 4 implementations of the scan algorithm:
 * work-efficient implementation on the GPU (GPU efficient)
 * implementation using the thrust library (GPU thrust)
 
-The stream compatction algorithm is implememented in 3 variants:
+The stream compaction algorithm is implemented in 3 variants:
 * straight-forward implementation on the CPU (CPU)
 * scan based implementation on the CPU
 * work-efficient scan based implementation on the GPU (GPU)
 
-These cases are labeled in the following graphs by the identifiers given in parantheses.
+These cases are labeled in the following graphs by the identifiers given in parentheses.
 All implementations were tested on array lengths that are powers of 2 and on array lengths that are not powers of 2.
+As far as I understand Part 5 (Extra Credit), I implemented the optimization it alludes to by starting only as many threads as are needed (up to the block size) in every round of the algorithm.
 
 ## Performance Analysis
 
@@ -35,7 +36,6 @@ All implementations were tested on array lengths that are powers of 2 and on arr
 Surprisingly, the CPU implementation is quicker than both of my GPU implementations.
 This is despite the fact, that for both GPU implementations, I am launching kernels only with as many threads as are needed (up to the block size).
 This prevents starting many threads that will not be active after a first if-condition.
-As far as I understand this is what Part 5, Extra Credit is alluding to.
 This result might be due to the usage of a CETS machine.
 
 Unfortunately, I haven't been able to make a detailed performance analysis using the Nsight suite, but obviously the memory access pattern is suboptimal.
@@ -57,13 +57,13 @@ Interestingly, for smaller arrays the thrust implementation is quite a bit quick
 
 For the compacting algorithm, the GPU implementation is actually faster for large array sizes.
 Here, only the straight-forward CPU implementation is used, as it is faster than the scan based algorithm.
-I would have expected bigeer differences between the two GPU and the CPU implementation, but again, the surprisingly short CPU runtime could stem from the fact that I am using a CETS machine.
+I would have expected bigger differences between the two GPU and the CPU implementation, but again, the surprisingly short CPU runtime could stem from the fact that I am using a CETS machine.
 There is not a big difference between the arrays with power of 2 lengths and those with non-power of 2 lengths.
 
 
 ## Console Output
 
-This is the console ouput after running the project with an array size of $2^{20} = 1048576$.
+This is the console output after running the project with an array size of $2^{20} = 1048576$.
 I removed the numeric outputs for clarity.
 
 ```

From 7a541a69af34480cc14723f9d29faca5b463f698 Mon Sep 17 00:00:00 2001
From: Kau <dominikk@kite.upenn.edu>
Date: Wed, 18 Sep 2024 23:53:03 -0400
Subject: [PATCH 10/10] Minor code cleanup

---
 stream_compaction/cpu.cu       |  2 +-
 stream_compaction/efficient.cu | 18 ++++++------------
 stream_compaction/naive.cu     |  4 ++--
 stream_compaction/thrust.cu    |  3 ---
 4 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 956cced..044d37b 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -19,7 +19,7 @@ namespace StreamCompaction {
          */
         void scan(int n, int *odata, const int *idata, bool timed) {
             if (timed) timer().startCpuTimer();
-            // TODO
+            // TODO        
             int partialSum = 0;
             for (int i = 0; i < n; ++i) {
                 odata[i] = partialSum;
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 9c84ae3..89427ea 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -37,14 +37,6 @@ namespace StreamCompaction {
             buffer[index + pow2todp1 - 1] += tmp;
         }
 
-        __global__ void kernZeroPadding(int n, int d, int* buffer) {
-            int index = threadIdx.x + blockIdx.x * blockDim.x;
-
-            if (index >= 1 << (d + 1) - n) return;
-
-            buffer[n + index] = 0;
-        }
-
         dim3 computeBlocksPerGrid(int threads, int blockSize) {
             return dim3{ (unsigned int)(threads + blockSize - 1) / blockSize };
         }
@@ -53,7 +45,7 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata, bool timed) {
-            int blockSize = 64;
+            int blockSize = 128;
 
             bool isPower2Length = (n == (1 << ilog2(n)));
 
@@ -74,8 +66,8 @@ namespace StreamCompaction {
             for (int d = 0; d < ilog2ceil(n); ++d) {
                 dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize);
                 kernUpSweep<<<blocks, blockSize>>>(bufferLength, 1 << d, dev_tmpArray);
-                checkCUDAError("kernUpSweep failed!");
                 cudaDeviceSynchronize();
+                checkCUDAError("kernUpSweep failed!");
             }
 
             cudaMemset(dev_tmpArray + bufferLength - 1, 0, sizeof(int));
@@ -83,8 +75,8 @@ namespace StreamCompaction {
             for (int d = ilog2ceil(n) - 1; d >= 0; --d) {
                 dim3 blocks = computeBlocksPerGrid(bufferLength / (1 << (d + 1)), blockSize);
                 kernDownSweep<<<blocks, blockSize>>>(bufferLength, 1 << d, dev_tmpArray);
-                checkCUDAError("kernDownSweep failed!");
                 cudaDeviceSynchronize();
+                checkCUDAError("kernDownSweep failed!");
             }
             if (timed) timer().endGpuTimer();
 
@@ -120,6 +112,7 @@ namespace StreamCompaction {
             checkCUDAError("cudaMalloc dev_buffer2 failed!");
 
             cudaMemcpy(dev_buffer1, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy idata->dev_buffer1 failed!");
 
             timer().startGpuTimer();
 
@@ -130,10 +123,11 @@ namespace StreamCompaction {
             scan(n, dev_indices, dev_boolArray, 0);
 
             StreamCompaction::Common::kernScatter<<<blocks, blockSize>>>(n, dev_buffer2, dev_buffer1, dev_boolArray, dev_indices);
-            checkCUDAError("kernScatter failed!");
             cudaDeviceSynchronize();
+            checkCUDAError("kernScatter failed!");
             
             cudaMemcpy(odata, dev_buffer2, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy dev_buffer2->odata failed!");
             
             // Index that last element in idata would have, if it was valid
             int lastIndex;
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index f811076..b7c18cd 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -57,12 +57,12 @@ namespace StreamCompaction {
             timer().startGpuTimer();
             StreamCompaction::Common::shiftArrayElements<<<fullBlocksPerGrid, blockSize>>>(n, 1, dev_buffer2, dev_buffer1);
             checkCUDAError("shiftArrayElements failed!");
-            cudaDeviceSynchronize();
+            //cudaDeviceSynchronize();
 
             for (int d = 0; d < ilog2(n); ++d) {
                 kernNaiveScanStep <<<fullBlocksPerGrid, blockSize>>>(n, d, dev_buffer1, dev_buffer2);
                 checkCUDAError("naiveScanStep failed!");
-                cudaDeviceSynchronize();
+                //cudaDeviceSynchronize();
 
                 std::swap(dev_buffer1, dev_buffer2);
             }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 463ca2e..32a3a73 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -27,9 +27,6 @@ namespace StreamCompaction {
             cudaMemcpy(dev_buffer, idata, n * sizeof(int), cudaMemcpyHostToDevice);
 
             timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
             thrust::exclusive_scan(dev_thrustBuffer, dev_thrustBuffer + n, dev_thrustBuffer);
             timer().endGpuTimer();