From c8b629cfcd6b3928a914933016173731a7b271e1 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Wed, 19 Jul 2023 14:36:53 -0700
Subject: [PATCH 01/18] forall_with_streams and updated BenchmarkForall.cpp

---
 benchmarks/BenchmarkForall.cpp | 17 ++++++++++
 src/care/forall.h              | 62 +++++++++++++++++++++++-----------
 2 files changed, 60 insertions(+), 19 deletions(-)

diff --git a/benchmarks/BenchmarkForall.cpp b/benchmarks/BenchmarkForall.cpp
index e909ac7c..da904822 100644
--- a/benchmarks/BenchmarkForall.cpp
+++ b/benchmarks/BenchmarkForall.cpp
@@ -69,6 +69,23 @@ static void benchmark_gpu_loop(benchmark::State& state) {
 // Register the function as a benchmark
 BENCHMARK(benchmark_gpu_loop)->Range(1, INT_MAX);
 
+static void benchmark_gpu_loop_streams(benchmark::State& state) {
+   const int size = state.range(0);
+   care::host_device_ptr<int> data(size, "data");
+
+   for (auto _ : state) {
+      RAJA::resources::Cuda res;
+      care::forall_with_stream(care::gpu{}, res, "test", 0, 0, size, [=] CARE_DEVICE (int i) {
+         data[i] = i;
+      });
+   }
+
+   data.free();
+}
+
+// Register the function as a benchmark
+BENCHMARK(benchmark_gpu_loop_streams)->Range(1, INT_MAX);
+
 #endif
 
 // Run the benchmarks
diff --git a/src/care/forall.h b/src/care/forall.h
index d45433cc..4ac0043a 100644
--- a/src/care/forall.h
+++ b/src/care/forall.h
@@ -18,12 +18,11 @@
 
 // CARE headers
 #include "care/policies.h"
+#include "care/RAJAPlugin.h"
 #include "care/util.h"
-#include "care/PluginData.h"
 
 // other library headers
 #include "chai/ArrayManager.hpp"
-#include "chai/ExecutionSpaces.hpp"
 #include "RAJA/RAJA.hpp"
 
 namespace care {
@@ -76,9 +75,9 @@ namespace care {
       const int length = end - start;
 
       if (length != 0) {
-         PluginData::setFileName(fileName);
-         PluginData::setLineNumber(lineNumber);
-
+#ifndef CARE_DISABLE_RAJAPLUGIN
+         RAJAPlugin::pre_forall_hook(ExecutionPolicyToSpace<ExecutionPolicy>::value, fileName, lineNumber);
+#endif
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
          RAJA::RangeStrideSegment rangeSegment =
@@ -94,6 +93,10 @@ namespace care {
 #else
          RAJA::forall<ExecutionPolicy>(rangeSegment, std::forward<LB>(body));
 #endif
+
+#ifndef CARE_DISABLE_RAJAPLUGIN
+         RAJAPlugin::post_forall_hook(ExecutionPolicyToSpace<ExecutionPolicy>::value, fileName, lineNumber);
+#endif
       }
    }
 
@@ -189,6 +192,31 @@ namespace care {
 #endif
    }
 
+   template <typename LB>
+   void forall_with_stream(gpu, RAJA::resources::Cuda res, const char * fileName, const int lineNumber,
+                           const int start, const int end, LB&& body) {
+#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
+      s_reverseLoopOrder = true;
+#endif
+
+#if CARE_ENABLE_GPU_SIMULATION_MODE
+      forall(gpu_simulation{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
+#elif defined(__CUDACC__)
+      forall(RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
+             res, RAJA::RangeSegment(start, end), std::forward<LB>(body));
+#elif defined(__HIPCC__)
+      forall(RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
+             res, RAJA::RangeSegment(start, end), std::forward<LB>(body));
+#else
+      forall(RAJA::seq_exec{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
+#endif
+
+#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
+      s_reverseLoopOrder = false;
+#endif
+   }
+
+
 ////////////////////////////////////////////////////////////////////////////////
    ///
    /// @author Alan Dayton
@@ -211,8 +239,8 @@ namespace care {
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
       s_reverseLoopOrder = true;
 #endif
-      PluginData::setParallelContext(true);
-
+
+      RAJAPlugin::setParallelContext(true);
 #if CARE_ENABLE_GPU_SIMULATION_MODE
       forall(gpu_simulation{}, fileName, lineNumber, start, end, std::forward<LB>(body));
 #elif defined(__CUDACC__)
       forall(RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
              fileName, lineNumber, start, end, std::forward<LB>(body));
 #elif defined(__HIPCC__)
       forall(RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
              fileName, lineNumber, start, end, std::forward<LB>(body));
 #else
       forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, std::forward<LB>(body));
 #endif
-      PluginData::setParallelContext(false);
+      RAJAPlugin::setParallelContext(false);
 
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
       s_reverseLoopOrder = false;
 #endif
   }
@@ -487,10 +515,8 @@ namespace care {
    ///
 ////////////////////////////////////////////////////////////////////////////////
    template <typename Exec, typename LB>
-   void launch_2D_jagged(Exec /*policy*/, int xstart, int /*xend*/, int const * host_lengths, int ystart, int ylength, const char * /* fileName */, int /* lineNumber */, LB && body) {
-      chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
-      arrayManager->setExecutionSpace(ExecutionPolicyToSpace<Exec>::value);
-
+   void launch_2D_jagged(Exec /*policy*/, int xstart, int /*xend*/, int const * host_lengths, int ystart, int ylength, const char * fileName, int lineNumber, LB && body) {
+      care::RAJAPlugin::pre_forall_hook(chai::CPU, fileName, lineNumber);
       // intentional trigger of copy constructor for CHAI correctness
       LB body_to_call{body};
       for (int y = ystart; y < ylength; ++y) {
          for (int x = xstart; x < host_lengths[y]; ++x) {
            body_to_call(x, y);
         }
      }
-      arrayManager->setExecutionSpace(chai::ExecutionSpace::NONE);
+      care::RAJAPlugin::post_forall_hook(chai::CPU, fileName, lineNumber);
   }
 
 #ifdef CARE_GPUCC
@@ -541,22 +567,20 @@ namespace care {
    ///
 ////////////////////////////////////////////////////////////////////////////////
    template <typename LB>
-   void launch_2D_jagged(care::gpu, int xstart, int xend, int const * gpu_lengths, int ystart, int ylength, const char * fileName, int lineNumber , LB && body) {
+   void launch_2D_jagged(care::gpu, int xstart, int xend, int const * gpu_lengths, int ystart, int ylength, const char * fileName, int lineNumber, LB && body) {
      if (xend > 0 && ylength > 0) {
         // TODO launch this kernel in the camp or RAJA default stream - not sure how to do this - for now this is a synchronous call on the CUDA/HIP default stream
-         chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
-         arrayManager->setExecutionSpace(chai::GPU);
-
+         care::RAJAPlugin::pre_forall_hook(chai::GPU, fileName, lineNumber);
         dim3 dimBlock(CARE_CUDA_BLOCK_SIZE, 1);
         dim3 dimGrid;
         dimGrid.x = (xend/CARE_CUDA_BLOCK_SIZE)+(xend%CARE_CUDA_BLOCK_SIZE==0?0:1);
         dimGrid.y = ylength;
         care_kernel_2D<<<dimGrid, dimBlock>>>( body, gpu_lengths, ylength);
-
-         arrayManager->setExecutionSpace(chai::ExecutionSpace::NONE);
+         care::RAJAPlugin::post_forall_hook(chai::GPU, fileName, lineNumber);
      }
   }
 #endif
+
 } // namespace care
 
 #endif // !defined(_CARE_FORALL_H_)
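For orientation, a minimal sketch of how the new care::forall_with_stream is meant to be called (illustrative only, not part of the patches; the helper name fill_async is hypothetical, and this assumes a CUDA build). It follows the signature and the CARE_DEVICE usage shown in the benchmark above:

   #include "care/forall.h"
   #include "care/host_device_ptr.h"

   void fill_async(care::host_device_ptr<int> data, int size)
   {
      // Each RAJA::resources::Cuda wraps its own CUDA stream.
      RAJA::resources::Cuda res;

      // The kernel below is enqueued on res's stream instead of the
      // device-wide default stream.
      care::forall_with_stream(care::gpu{}, res, __FILE__, __LINE__, 0, size,
         [=] CARE_DEVICE (int i) {
            data[i] = i;
         });

      // Block the host until the work queued on this stream has finished.
      res.wait();
   }
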
From d7f25c9788099c897a733356586b48bd51c6f2f5 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Wed, 19 Jul 2023 14:45:00 -0700
Subject: [PATCH 02/18] branch wasn't up to date

---
 benchmarks/BenchmarkForall.cpp |  2 +-
 src/care/forall.h              | 57 ++++++++++++++++++++++------------
 2 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/benchmarks/BenchmarkForall.cpp b/benchmarks/BenchmarkForall.cpp
index da904822..eefd3392 100644
--- a/benchmarks/BenchmarkForall.cpp
+++ b/benchmarks/BenchmarkForall.cpp
@@ -75,7 +75,7 @@ static void benchmark_gpu_loop_streams(benchmark::State& state) {
 
    for (auto _ : state) {
       RAJA::resources::Cuda res;
-      care::forall_with_stream(care::gpu{}, res, "test", 0, 0, size, [=] CARE_DEVICE (int i) {
+      care::forall_with_stream(care::gpu{}, res, "BenchmarkForall.cpp", 78, 0, size, [=] CARE_DEVICE (int i) {
         data[i] = i;
      });
   }
diff --git a/src/care/forall.h b/src/care/forall.h
index 4ac0043a..76d9cf1f 100644
--- a/src/care/forall.h
+++ b/src/care/forall.h
@@ -18,11 +18,12 @@
 
 // CARE headers
 #include "care/policies.h"
-#include "care/RAJAPlugin.h"
 #include "care/util.h"
+#include "care/PluginData.h"
 
 // other library headers
 #include "chai/ArrayManager.hpp"
+#include "chai/ExecutionSpaces.hpp"
 #include "RAJA/RAJA.hpp"
 
 namespace care {
@@ -75,9 +76,9 @@ namespace care {
      const int length = end - start;
 
      if (length != 0) {
-#ifndef CARE_DISABLE_RAJAPLUGIN
-         RAJAPlugin::pre_forall_hook(ExecutionPolicyToSpace<ExecutionPolicy>::value, fileName, lineNumber);
-#endif
+         PluginData::setFileName(fileName);
+         PluginData::setLineNumber(lineNumber);
+
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
         RAJA::RangeStrideSegment rangeSegment =
@@ -93,10 +94,6 @@ namespace care {
 #else
         RAJA::forall<ExecutionPolicy>(rangeSegment, std::forward<LB>(body));
 #endif
-
-#ifndef CARE_DISABLE_RAJAPLUGIN
-         RAJAPlugin::post_forall_hook(ExecutionPolicyToSpace<ExecutionPolicy>::value, fileName, lineNumber);
-#endif
      }
   }
 
@@ -192,6 +189,23 @@ namespace care {
 #endif
   }
 
+   ////////////////////////////////////////////////////////////////////////////////
+   ///
+   /// @author Neela Kausik
+   ///
+   /// @brief If GPU is available, execute on the device. Otherwise, execute on
+   ///        the host. This specialization is needed for clang-query.
+   ///
+   /// @arg[in] gpu Used to choose this overload of forall
+   /// @arg[in] res Resource provided for execution
+   /// @arg[in] fileName The name of the file where this function is called
+   /// @arg[in] lineNumber The line number in the file where this function is called
+   /// @arg[in] start The starting index (inclusive)
+   /// @arg[in] end The ending index (exclusive)
+   /// @arg[in] body The loop body to execute at each index
+   ///
+   ////////////////////////////////////////////////////////////////////////////////
+
    template <typename LB>
    void forall_with_stream(gpu, RAJA::resources::Cuda res, const char * fileName, const int lineNumber,
                            const int start, const int end, LB&& body) {
@@ -214,8 +228,7 @@ namespace care {
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
      s_reverseLoopOrder = false;
 #endif
-   }
-
+   }
 
 ////////////////////////////////////////////////////////////////////////////////
    ///
    /// @author Alan Dayton
@@ -239,8 +252,8 @@ namespace care {
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
      s_reverseLoopOrder = true;
 #endif
-
-      RAJAPlugin::setParallelContext(true);
+      PluginData::setParallelContext(true);
+
 #if CARE_ENABLE_GPU_SIMULATION_MODE
      forall(gpu_simulation{}, fileName, lineNumber, start, end, std::forward<LB>(body));
 #elif defined(__CUDACC__)
      forall(RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
             fileName, lineNumber, start, end, std::forward<LB>(body));
 #elif defined(__HIPCC__)
      forall(RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
             fileName, lineNumber, start, end, std::forward<LB>(body));
 #else
      forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, std::forward<LB>(body));
 #endif
+      PluginData::setParallelContext(false);
 
-      RAJAPlugin::setParallelContext(false);
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
      s_reverseLoopOrder = false;
 #endif
@@ -515,8 +528,10 @@ namespace care {
    ///
 ////////////////////////////////////////////////////////////////////////////////
    template <typename Exec, typename LB>
-   void launch_2D_jagged(Exec /*policy*/, int xstart, int /*xend*/, int const * host_lengths, int ystart, int ylength, const char * fileName, int lineNumber, LB && body) {
-      care::RAJAPlugin::pre_forall_hook(chai::CPU, fileName, lineNumber);
+   void launch_2D_jagged(Exec /*policy*/, int xstart, int /*xend*/, int const * host_lengths, int ystart, int ylength, const char * /* fileName */, int /* lineNumber */, LB && body) {
+      chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
+      arrayManager->setExecutionSpace(ExecutionPolicyToSpace<Exec>::value);
+
      // intentional trigger of copy constructor for CHAI correctness
      LB body_to_call{body};
      for (int y = ystart; y < ylength; ++y) {
         for (int x = xstart; x < host_lengths[y]; ++x) {
           body_to_call(x, y);
         }
      }
-      care::RAJAPlugin::post_forall_hook(chai::CPU, fileName, lineNumber);
+      arrayManager->setExecutionSpace(chai::ExecutionSpace::NONE);
   }
 
 #ifdef CARE_GPUCC
@@ -567,20 +582,22 @@ namespace care {
    ///
 ////////////////////////////////////////////////////////////////////////////////
    template <typename LB>
-   void launch_2D_jagged(care::gpu, int xstart, int xend, int const * gpu_lengths, int ystart, int ylength, const char * fileName, int lineNumber, LB && body) {
+   void launch_2D_jagged(care::gpu, int xstart, int xend, int const * gpu_lengths, int ystart, int ylength, const char * fileName, int lineNumber , LB && body) {
      if (xend > 0 && ylength > 0) {
        // TODO launch this kernel in the camp or RAJA default stream - not sure how to do this - for now this is a synchronous call on the CUDA/HIP default stream
-         care::RAJAPlugin::pre_forall_hook(chai::GPU, fileName, lineNumber);
+         chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
+         arrayManager->setExecutionSpace(chai::GPU);
+
        dim3 dimBlock(CARE_CUDA_BLOCK_SIZE, 1);
        dim3 dimGrid;
        dimGrid.x = (xend/CARE_CUDA_BLOCK_SIZE)+(xend%CARE_CUDA_BLOCK_SIZE==0?0:1);
        dimGrid.y = ylength;
        care_kernel_2D<<<dimGrid, dimBlock>>>( body, gpu_lengths, ylength);
-         care::RAJAPlugin::post_forall_hook(chai::GPU, fileName, lineNumber);
+
+         arrayManager->setExecutionSpace(chai::ExecutionSpace::NONE);
      }
   }
 #endif
-
 } // namespace care
 
 #endif // !defined(_CARE_FORALL_H_)

From 26fbe1b6572c76b764a145f877f54eb49597ead9 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Mon, 24 Jul 2023 13:38:56 -0700
Subject: [PATCH 03/18] multiple kernels

---
 benchmarks/BenchmarkForall.cpp | 48 ++++++++++--------
 benchmarks/BenchmarkRaja.cpp   | 89 ++++++++++++++++++++++++++++++++++
 benchmarks/CMakeLists.txt      | 13 +++++
 3 files changed, 130 insertions(+), 20 deletions(-)
 create mode 100644 benchmarks/BenchmarkRaja.cpp

diff --git a/benchmarks/BenchmarkForall.cpp b/benchmarks/BenchmarkForall.cpp
index eefd3392..4bf536b2 100644
--- a/benchmarks/BenchmarkForall.cpp
+++ b/benchmarks/BenchmarkForall.cpp
@@ -8,6 +8,7 @@
 // CARE headers
 #include "care/DefaultMacros.h"
 #include "care/host_device_ptr.h"
+#include "care/util.h"
 
 // Other library headers
 #include <benchmark/benchmark.h>
@@ -19,9 +20,14 @@ static void benchmark_sequential_loop(benchmark::State& state) {
   const int size = state.range(0);
   care::host_device_ptr<int> data(size, "data");
 
+   // For consistency with the GPU case, which requires a warm up kernel
+   CARE_SEQUENTIAL_LOOP(i, 0, size) {
+      data[i] = 0;
+   } CARE_SEQUENTIAL_LOOP_END
+
   for (auto _ : state) {
      CARE_SEQUENTIAL_LOOP(i, 0, size) {
-         data[i] = i;
+         data[i] += i;
      } CARE_SEQUENTIAL_LOOP_END
   }
@@ -37,10 +43,19 @@ static void benchmark_openmp_loop(benchmark::State& state) {
   const int size = state.range(0);
   care::host_device_ptr<int> data(size, "data");
 
+   // For consistency with the GPU case, which requires a warm up kernel
+   CARE_OPENMP_LOOP(i, 0, size) {
+      data[i] = 0;
+   } CARE_OPENMP_LOOP_END
+
+   // TODO: Is a synchronize needed?
+
   for (auto _ : state) {
      CARE_OPENMP_LOOP(i, 0, size) {
-         data[i] = i;
+         data[i] += i;
      } CARE_OPENMP_LOOP_END
+
+      // TODO: Is a synchronize needed?
   }
 
   data.free();
@@ -57,34 +72,27 @@ static void benchmark_gpu_loop(benchmark::State& state) {
   const int size = state.range(0);
   care::host_device_ptr<int> data(size, "data");
 
+   // Warm up kernel
+   CARE_GPU_LOOP(i, 0, size) {
+      data[i] = 0;
+   } CARE_GPU_LOOP_END
+
+   care::gpuDeviceSynchronize(__FILE__, __LINE__);
+
   for (auto _ : state) {
      CARE_GPU_LOOP(i, 0, size) {
-         data[i] = i;
+         data[i] += i;
      } CARE_GPU_LOOP_END
-   }
-
-   data.free();
-}
-
-// Register the function as a benchmark
-BENCHMARK(benchmark_gpu_loop)->Range(1, INT_MAX);
-
-static void benchmark_gpu_loop_streams(benchmark::State& state) {
-   const int size = state.range(0);
-   care::host_device_ptr<int> data(size, "data");
-
-   for (auto _ : state) {
-      RAJA::resources::Cuda res;
-      care::forall_with_stream(care::gpu{}, res, "BenchmarkForall.cpp", 78, 0, size, [=] CARE_DEVICE (int i) {
-         data[i] = i;
-      });
+
+      // Timings are much more consistent with this synchronize
+      care::gpuDeviceSynchronize(__FILE__, __LINE__);
   }
 
   data.free();
 }
 
 // Register the function as a benchmark
-BENCHMARK(benchmark_gpu_loop_streams)->Range(1, INT_MAX);
+BENCHMARK(benchmark_gpu_loop)->Range(1, INT_MAX);
 
 #endif
 
diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
new file mode 100644
index 00000000..9c092f5f
--- /dev/null
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -0,0 +1,89 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// Copyright 2020 Lawrence Livermore National Security, LLC and other CARE developers.
+// See the top-level LICENSE file for details.
+//
+// SPDX-License-Identifier: BSD-3-Clause
+//////////////////////////////////////////////////////////////////////////////////////
+
+// CARE headers
+#include "care/DefaultMacros.h"
+#include "care/host_device_ptr.h"
+#include "care/forall.h"
+#include "care/policies.h"
+#include "RAJA/RAJA.hpp"
+
+// Other library headers
+#include <benchmark/benchmark.h>
+#include <omp.h>
+
+// Std library headers
+#include <climits>
+#include <cstdlib>
+
+#define NUM_KERNELS 4
+
+using namespace care;
+
+#if defined(CARE_GPUCC)
+//each kernel has a separate stream
+static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
+   int N = state.range(0);
+   const char * fileName = "test";
+
+   RAJA::resources::Cuda res_arr[NUM_KERNELS];
+   RAJA::resources::Event event_arr[NUM_KERNELS];
+   for(int i = 0; i < NUM_KERNELS; i++)
+   {
+      RAJA::resources::Cuda res;
+      res_arr[i] = res;
+      RAJA::resources::Event e = res.get_event();
+      event_arr[i] = e;
+   }
+
+   care::host_device_ptr<int> arr(N, "arr");
+   for (auto _ : state) {
+      //run num kernels
+      for(int j = 0; j < NUM_KERNELS; j++)
+      {
+         care::forall_with_stream(gpu{}, res_arr[j], fileName, 71, 0 , N, [=] RAJA_HOST_DEVICE (int i) {
+            arr[i] = i;
+         });
+
+         if(j > 0) res_arr[j].wait_for(&event_arr[j - 1]);
+      }
+   }
+   arr.free();
+}
+
+// Register the function as a benchmark
+BENCHMARK(benchmark_gpu_loop_separate_streams)->Range(1, INT_MAX);
+
+//all kernels on one stream
+static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
+int N = state.range(0);
+   const char * fileName = "test";
+   RAJA::resources::Cuda res;
+
+   care::host_device_ptr<int> arr(N, "arr");
+
+   RAJA::resources::Event e = res.get_event();
+   for (auto _ : state) {
+      //run num kernels
+      for(int j = 0; j < NUM_KERNELS; j++)
+      {
+         care::forall_with_stream(gpu{}, res, fileName, 71, 0 , N, [=] RAJA_HOST_DEVICE (int i) {
+            arr[i] = i;
+         });
+         res.wait();
+      }
+   }
+   arr.free();
+}
+
+// Register the function as a benchmark
+BENCHMARK(benchmark_gpu_loop_single_stream)->Range(1, INT_MAX);
+
+#endif
+
+// Run the benchmarks
+BENCHMARK_MAIN();
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 5f798b75..cfde099a 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -36,6 +36,19 @@ target_include_directories(BenchmarkForall
 blt_add_benchmark(NAME BenchmarkForall
                   COMMAND BenchmarkForall)
 
+blt_add_executable(NAME BenchmarkRaja
+                   SOURCES BenchmarkRaja.cpp
+                   DEPENDS_ON ${care_benchmark_depends})
+
+target_include_directories(BenchmarkRaja
+                           PRIVATE ${PROJECT_SOURCE_DIR}/src)
+
+target_include_directories(BenchmarkRaja
+                           PRIVATE ${PROJECT_BINARY_DIR}/include)
+
+blt_add_benchmark(NAME BenchmarkRaja
+                  COMMAND BenchmarkRaja)
+
 blt_add_executable(NAME BenchmarkNumeric
                    SOURCES BenchmarkNumeric.cpp
                    DEPENDS_ON ${care_benchmark_depends})

From 291fd6576dc5bbf8f0722768bedc26ca6ac78fc6 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Mon, 24 Jul 2023 16:16:25 -0700
Subject: [PATCH 04/18] care macros

---
 benchmarks/BenchmarkRaja.cpp |  8 ++++----
 src/care/DefaultMacros.h     | 13 +++++++++++++
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index 9c092f5f..46f82f46 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -45,9 +45,9 @@ static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
      //run num kernels
      for(int j = 0; j < NUM_KERNELS; j++)
      {
-         care::forall_with_stream(gpu{}, res_arr[j], fileName, 71, 0 , N, [=] RAJA_HOST_DEVICE (int i) {
+         CARE_STREAMED_LOOP(i, res_arr[j], 0 , N) {
           arr[i] = i;
-         });
+         } CARE_STREAMED_LOOP_END
 
        if(j > 0) res_arr[j].wait_for(&event_arr[j - 1]);
      }
@@ -71,9 +71,9 @@ int N = state.range(0);
      //run num kernels
      for(int j = 0; j < NUM_KERNELS; j++)
      {
-         care::forall_with_stream(gpu{}, res, fileName, 71, 0 , N, [=] RAJA_HOST_DEVICE (int i) {
+         CARE_STREAMED_LOOP(i, res, 0 , N) {
           arr[i] = i;
-         });
+         }CARE_STREAMED_LOOP_END
        res.wait();
      }
   }
diff --git a/src/care/DefaultMacros.h b/src/care/DefaultMacros.h
index 1de5ec51..37668c42 100644
--- a/src/care/DefaultMacros.h
+++ b/src/care/DefaultMacros.h
@@ -548,6 +548,15 @@
 #define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) }); \
    CARE_NEST_END(CHECK) }}
 
+#define CARE_CHECKED_STREAMED_LOOP_START(INDEX, RESOURCE, START_INDEX, END_INDEX, CHECK) { \
+   if (END_INDEX > START_INDEX) { \
+      CARE_NEST_BEGIN(CHECK) \
+      care::forall_with_stream(care::gpu{}, RESOURCE, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] CARE_DEVICE (const int INDEX) { \
+         CARE_SET_THREAD_ID(INDEX)
+
+#define CARE_CHECKED_STREAMED_LOOP_END(CHECK) }); \
+   CARE_NEST_END(CHECK) }}
+
 ////////////////////////////////////////////////////////////////////////////////
 ///
 /// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is
 
 #define CARE_PARALLEL_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_parallel_loop_check)
 
+#define CARE_STREAMED_LOOP(INDEX, RESOURCE, START_INDEX, END_INDEX) CARE_CHECKED_STREAMED_LOOP_START(INDEX, RESOURCE, START_INDEX, END_INDEX, care_streamed_loop_check)
+
+#define CARE_STREAMED_LOOP_END CARE_CHECKED_STREAMED_LOOP_END(care_streamed_loop_check)
+
 ////////////////////////////////////////////////////////////////////////////////
 ///
 /// @brief Macros that start and end a RAJA loop that uses at least one
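For orientation, a minimal sketch of the streamed-loop usage that patch 04's macros enable (illustrative only, not part of the patches; the helper name pipeline is hypothetical, and this uses patch 04's original argument order with the index first — patch 06 below swaps the resource to the front). The event wait mirrors the pattern already used in BenchmarkRaja.cpp:

   #include "care/DefaultMacros.h"
   #include "care/host_device_ptr.h"

   void pipeline(care::host_device_ptr<int> a, care::host_device_ptr<int> b, int n)
   {
      // Two kernels on two streams; an event makes the second stream wait on
      // the first kernel without synchronizing the whole device.
      RAJA::resources::Cuda res_a;
      RAJA::resources::Cuda res_b;

      CARE_STREAMED_LOOP(i, res_a, 0, n) {
         a[i] = i;
      } CARE_STREAMED_LOOP_END

      RAJA::resources::Event e = res_a.get_event();
      res_b.wait_for(&e);

      CARE_STREAMED_LOOP(i, res_b, 0, n) {
         b[i] = a[i];
      } CARE_STREAMED_LOOP_END
   }
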
From c31c924a9a114c6105674918318cc6f9c9200535 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Tue, 25 Jul 2023 08:51:47 -0700
Subject: [PATCH 05/18] try to fix cuda error

---
 src/care/forall.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/care/forall.h b/src/care/forall.h
index 76d9cf1f..f6ed5f7b 100644
--- a/src/care/forall.h
+++ b/src/care/forall.h
@@ -206,6 +206,7 @@ namespace care {
    ////////////////////////////////////////////////////////////////////////////////
 
+#if defined(CARE_GPUCC)
    template <typename LB>
    void forall_with_stream(gpu, RAJA::resources::Cuda res, const char * fileName, const int lineNumber,
                            const int start, const int end, LB&& body) {
@@ -229,6 +230,7 @@ namespace care {
      s_reverseLoopOrder = false;
 #endif
   }
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////
    ///

From 533e0a42a9bba1015c44ac5eac65042f354c1249 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Tue, 1 Aug 2023 11:38:32 -0700
Subject: [PATCH 06/18] fixed indentation and macros

---
 benchmarks/BenchmarkRaja.cpp | 76 +++++++++++++++++-------------------
 src/care/DefaultMacros.h     |  4 +-
 2 files changed, 38 insertions(+), 42 deletions(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index 46f82f46..89ae09a3 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -28,31 +28,29 @@ using namespace care;
 //each kernel has a separate stream
 static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
   int N = state.range(0);
-   const char * fileName = "test";
 
-   RAJA::resources::Cuda res_arr[NUM_KERNELS];
-   RAJA::resources::Event event_arr[NUM_KERNELS];
-   for(int i = 0; i < NUM_KERNELS; i++)
-   {
-      RAJA::resources::Cuda res;
-      res_arr[i] = res;
-      RAJA::resources::Event e = res.get_event();
-      event_arr[i] = e;
-   }
+   RAJA::resources::Cuda res_arr[NUM_KERNELS];
+   RAJA::resources::Event event_arr[NUM_KERNELS];
+   for(int i = 0; i < NUM_KERNELS; i++)
+   {
+      RAJA::resources::Cuda res;
+      res_arr[i] = res;
+      RAJA::resources::Event e = res.get_event();
+      event_arr[i] = e;
+   }
 
-   care::host_device_ptr<int> arr(N, "arr");
+   care::host_device_ptr<int> arr(N, "arr");
   for (auto _ : state) {
-      //run num kernels
-      for(int j = 0; j < NUM_KERNELS; j++)
-      {
-         CARE_STREAMED_LOOP(i, res_arr[j], 0 , N) {
-            arr[i] = i;
-         } CARE_STREAMED_LOOP_END
-
-         if(j > 0) res_arr[j].wait_for(&event_arr[j - 1]);
-      }
-   }
-   arr.free();
+      //run num kernels
+      for(int j = 0; j < NUM_KERNELS; j++)
+      {
+         CARE_STREAMED_LOOP(res_arr[j], i, 0 , N) {
+            arr[i] = i;
+         } CARE_STREAMED_LOOP_END
+         if(j > 0) res_arr[j].wait_for(&event_arr[j - 1]);
+      }
+   }
+   arr.free();
 }
 
 // Register the function as a benchmark
 BENCHMARK(benchmark_gpu_loop_separate_streams)->Range(1, INT_MAX);
 
 //all kernels on one stream
 static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
-int N = state.range(0);
-   const char * fileName = "test";
-   RAJA::resources::Cuda res;
-
-   care::host_device_ptr<int> arr(N, "arr");
-
-   RAJA::resources::Event e = res.get_event();
-   for (auto _ : state) {
-      //run num kernels
-      for(int j = 0; j < NUM_KERNELS; j++)
-      {
-         CARE_STREAMED_LOOP(i, res, 0 , N) {
-            arr[i] = i;
-         }CARE_STREAMED_LOOP_END
-         res.wait();
-      }
-   }
-   arr.free();
+   int N = state.range(0);
+
+   RAJA::resources::Cuda res;
+
+   care::host_device_ptr<int> arr(N, "arr");
+   for (auto _ : state) {
+      //run num kernels
+      for(int j = 0; j < NUM_KERNELS; j++)
+      {
+         CARE_STREAMED_LOOP(res, i, 0, N) {
+            arr[i] = i;
+         }CARE_STREAMED_LOOP_END
+         res.wait();
+      }
+   }
+   arr.free();
 }
 
 // Register the function as a benchmark
diff --git a/src/care/DefaultMacros.h b/src/care/DefaultMacros.h
index 37668c42..97aeb674 100644
--- a/src/care/DefaultMacros.h
+++ b/src/care/DefaultMacros.h
@@ -548,7 +548,7 @@
 #define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) }); \
    CARE_NEST_END(CHECK) }}
 
-#define CARE_CHECKED_STREAMED_LOOP_START(INDEX, RESOURCE, START_INDEX, END_INDEX, CHECK) { \
+#define CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, CHECK) { \
    if (END_INDEX > START_INDEX) { \
       CARE_NEST_BEGIN(CHECK) \
       care::forall_with_stream(care::gpu{}, RESOURCE, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] CARE_DEVICE (const int INDEX) { \
          CARE_SET_THREAD_ID(INDEX)
@@ -762,7 +762,7 @@
 #define CARE_PARALLEL_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_parallel_loop_check)
 
-#define CARE_STREAMED_LOOP(INDEX, RESOURCE, START_INDEX, END_INDEX) CARE_CHECKED_STREAMED_LOOP_START(INDEX, RESOURCE, START_INDEX, END_INDEX, care_streamed_loop_check)
+#define CARE_STREAMED_LOOP(RESOURCE, INDEX, START_INDEX, END_INDEX) CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, care_streamed_loop_check)
 
 #define CARE_STREAMED_LOOP_END CARE_CHECKED_STREAMED_LOOP_END(care_streamed_loop_check)

From 5d3a07ba365717351253f4b41ea40983430c150d Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Wed, 2 Aug 2023 16:02:20 -0700
Subject: [PATCH 07/18] legacy macros

---
 src/care/DefaultMacros.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/care/DefaultMacros.h b/src/care/DefaultMacros.h
index 97aeb674..6e055d25 100644
--- a/src/care/DefaultMacros.h
+++ b/src/care/DefaultMacros.h
@@ -261,6 +261,10 @@
 #define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK)
 
+#define CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK)
+
+#define CARE_CHECKED_STREAMED_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK)
+
 ////////////////////////////////////////////////////////////////////////////////
 ///
 /// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is

From 131dc67af6c29ced6a25c0f729fe2f6572bc9239 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Thu, 3 Aug 2023 11:59:22 -0700
Subject: [PATCH 08/18] warmup kernels and multiple arrays

---
 benchmarks/BenchmarkRaja.cpp | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index 89ae09a3..9e8b9437 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -31,26 +31,33 @@ static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
 
   RAJA::resources::Cuda res_arr[NUM_KERNELS];
   RAJA::resources::Event event_arr[NUM_KERNELS];
+   care::host_device_ptr<int> arrays[NUM_KERNELS];
   for(int i = 0; i < NUM_KERNELS; i++)
   {
      RAJA::resources::Cuda res;
      res_arr[i] = res;
      RAJA::resources::Event e = res.get_event();
      event_arr[i] = e;
+      care::host_device_ptr<int> arr(N, "arr");
+      arrays[i] = arr;
   }
+
+   //warmup kernel
+   RAJA::resources::Cuda warmup_res;
+   CARE_STREAMED_LOOP(warmup_res, i, 0 , N) {
+      arrays[0][i] = 0;
+   } CARE_STREAMED_LOOP_END
 
-   care::host_device_ptr<int> arr(N, "arr");
   for (auto _ : state) {
      //run num kernels
      for(int j = 0; j < NUM_KERNELS; j++)
      {
         CARE_STREAMED_LOOP(res_arr[j], i, 0 , N) {
-            arr[i] = i;
+            arrays[j][i] = i;
         } CARE_STREAMED_LOOP_END
         if(j > 0) res_arr[j].wait_for(&event_arr[j - 1]);
      }
   }
-   arr.free();
+   for(int i = 0; i < NUM_KERNELS; i++) {arrays[i].free();}
 }
 
 // Register the function as a benchmark
 BENCHMARK(benchmark_gpu_loop_separate_streams)->Range(1, INT_MAX);
 
 //all kernels on one stream
 static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
   int N = state.range(0);
 
   RAJA::resources::Cuda res;
 
-   care::host_device_ptr<int> arr(N, "arr");
+   care::host_device_ptr<int> arrays[NUM_KERNELS];
+   for(int i = 0; i < NUM_KERNELS; i++)
+   {
+      care::host_device_ptr<int> arr(N, "arr");
+      arrays[i] = arr;
+   }
+
+   //warmup kernel
+   RAJA::resources::Cuda warmup_res;
+   CARE_STREAMED_LOOP(warmup_res, i, 0, N) {
+      arrays[0][i] = i;
+   } CARE_STREAMED_LOOP_END
+
   for (auto _ : state) {
      //run num kernels
      for(int j = 0; j < NUM_KERNELS; j++)
      {
         CARE_STREAMED_LOOP(res, i, 0, N) {
-            arr[i] = i;
+            arrays[j][i] = i;
         } CARE_STREAMED_LOOP_END
         res.wait();
      }
   }
-   arr.free();
+   for(int i = 0; i < NUM_KERNELS; i++) {arrays[i].free();}
 }

From 0459805e22d6bd4b33d13dc9064a32562a6ec847 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Thu, 3 Aug 2023 13:19:24 -0700
Subject: [PATCH 09/18] fixed loop size

---
 benchmarks/BenchmarkRaja.cpp | 43 ++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index 9e8b9437..014eed2c 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -20,48 +20,47 @@
 #include <climits>
 #include <cstdlib>
 
-#define NUM_KERNELS 4
+#define size 1000000
 
-using namespace care;
+namespace care{
 
 #if defined(CARE_GPUCC)
 //each kernel has a separate stream
 static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
   int N = state.range(0);
-   RAJA::resources::Cuda res_arr[NUM_KERNELS];
-   RAJA::resources::Event event_arr[NUM_KERNELS];
-   care::host_device_ptr<int> arrays[NUM_KERNELS];
+   RAJA::resources::Cuda res_arr[N];
+   RAJA::resources::Event event_arr[N];
+   care::host_device_ptr<int> arrays[16];
   for(int i = 0; i < N; i++)
   {
      RAJA::resources::Cuda res;
      res_arr[i] = res;
      RAJA::resources::Event e = res.get_event();
      event_arr[i] = e;
-      care::host_device_ptr<int> arr(N, "arr");
+      care::host_device_ptr<int> arr(size, "arr");
      arrays[i] = arr;
   }
 
   //warmup kernel
   RAJA::resources::Cuda warmup_res;
-   CARE_STREAMED_LOOP(warmup_res, i, 0 , N) {
+   CARE_STREAMED_LOOP(warmup_res, i, 0 , size) {
      arrays[0][i] = 0;
   } CARE_STREAMED_LOOP_END
 
   for (auto _ : state) {
      //run num kernels
      for(int j = 0; j < N; j++)
      {
-         CARE_STREAMED_LOOP(res_arr[j], i, 0 , N) {
+         CARE_STREAMED_LOOP(res_arr[j], i, 0 , size) {
           arrays[j][i] = i;
         } CARE_STREAMED_LOOP_END
      }
   }
-   for(int i = 0; i < NUM_KERNELS; i++) {arrays[i].free();}
+   for(int i = 0; i < N; i++) {arrays[i].free();}
 }
 
 // Register the function as a benchmark
-BENCHMARK(benchmark_gpu_loop_separate_streams)->Range(1, INT_MAX);
+BENCHMARK(benchmark_gpu_loop_separate_streams)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);
 
 //all kernels on one stream
 static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
   int N = state.range(0);
 
   RAJA::resources::Cuda res;
 
-   care::host_device_ptr<int> arrays[NUM_KERNELS];
+   care::host_device_ptr<int> arrays[16];
   for(int i = 0; i < N; i++)
   {
-      care::host_device_ptr<int> arr(N, "arr");
+      care::host_device_ptr<int> arr(size, "arr");
      arrays[i] = arr;
   }
 
   //warmup kernel
   RAJA::resources::Cuda warmup_res;
-   CARE_STREAMED_LOOP(warmup_res, i, 0, N) {
+   CARE_STREAMED_LOOP(warmup_res, i, 0, size) {
      arrays[0][i] = i;
   } CARE_STREAMED_LOOP_END
 
   for (auto _ : state) {
      //run num kernels
      for(int j = 0; j < N; j++)
      {
-         CARE_STREAMED_LOOP(res, i, 0, N) {
+         CARE_STREAMED_LOOP(res, i, 0, size) {
           arrays[j][i] = i;
         } CARE_STREAMED_LOOP_END
         res.wait();
      }
   }
-   for(int i = 0; i < NUM_KERNELS; i++) {arrays[i].free();}
+   for(int i = 0; i < N; i++) {arrays[i].free();}
 }
 
 // Register the function as a benchmark
-BENCHMARK(benchmark_gpu_loop_single_stream)->Range(1, INT_MAX);
+BENCHMARK(benchmark_gpu_loop_single_stream)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);
 
 #endif
 
+} //namespace care
+
 // Run the benchmarks
 BENCHMARK_MAIN();
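A side note on the registration change in patch 09 (illustrative only, not part of the patches): Google Benchmark's ->Range(lo, hi) sweeps the argument by powers of a multiplier (8 by default), while an explicit ->Arg(...) chain pins exact values — here the kernel count, which is why the loop bound moved from the NUM_KERNELS constant to state.range(0). A minimal standalone illustration:

   #include <benchmark/benchmark.h>

   static void bm_demo(benchmark::State& state) {
      for (auto _ : state) {
         // state.range(0) is the registered argument for this iteration set
         benchmark::DoNotOptimize(state.range(0));
      }
   }

   BENCHMARK(bm_demo)->Range(1, 1 << 12);      // runs 1, 8, 64, 512, 4096
   BENCHMARK(bm_demo)->Arg(1)->Arg(2)->Arg(4); // runs exactly 1, 2, 4

   BENCHMARK_MAIN();
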
From 4915fb4c6dd4fc894d15e91837944f6b881e8169 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Thu, 3 Aug 2023 15:09:02 -0700
Subject: [PATCH 10/18] added CARE_DLL to PluginData.h

---
 src/care/PluginData.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/care/PluginData.h b/src/care/PluginData.h
index b248cd95..37788a50 100644
--- a/src/care/PluginData.h
+++ b/src/care/PluginData.h
@@ -29,17 +29,17 @@ namespace care{
      static void setParallelContext(bool isParallel);
 
-      static bool isParallelContext();
+      CARE_DLL_API static bool isParallelContext();
 
-      static bool post_parallel_forall_action_registered(void * key);
+      CARE_DLL_API static bool post_parallel_forall_action_registered(void * key);
 
      static ActionMap get_post_parallel_forall_actions();
 
-      static void register_post_parallel_forall_action(void * key, std::function<void()> action);
+      CARE_DLL_API static void register_post_parallel_forall_action(void * key, std::function<void()> action);
 
      static void clear_post_parallel_forall_actions();
 
-      static std::vector<const chai::PointerRecord*> getActivePointers();
+      CARE_DLL_API static std::vector<const chai::PointerRecord*> getActivePointers();
 
      static void addActivePointer(const chai::PointerRecord* record);
      static void removeActivePointer(const chai::PointerRecord* record);
      static void clearActivePointers();
 
-      static int s_threadID;
+      CARE_DLL_API static int s_threadID;
 
   private:
      static const char * s_file_name;

From bea52591236fec9e9c962c6e5ed75b50828fc5dc Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Thu, 3 Aug 2023 15:10:35 -0700
Subject: [PATCH 11/18] Revert "added CARE_DLL to PluginData.h"

This reverts commit 4915fb4c6dd4fc894d15e91837944f6b881e8169.

---
 src/care/PluginData.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/care/PluginData.h b/src/care/PluginData.h
index 37788a50..b248cd95 100644
--- a/src/care/PluginData.h
+++ b/src/care/PluginData.h
@@ -29,17 +29,17 @@ namespace care{
      static void setParallelContext(bool isParallel);
 
-      CARE_DLL_API static bool isParallelContext();
+      static bool isParallelContext();
 
-      CARE_DLL_API static bool post_parallel_forall_action_registered(void * key);
+      static bool post_parallel_forall_action_registered(void * key);
 
      static ActionMap get_post_parallel_forall_actions();
 
-      CARE_DLL_API static void register_post_parallel_forall_action(void * key, std::function<void()> action);
+      static void register_post_parallel_forall_action(void * key, std::function<void()> action);
 
      static void clear_post_parallel_forall_actions();
 
-      CARE_DLL_API static std::vector<const chai::PointerRecord*> getActivePointers();
+      static std::vector<const chai::PointerRecord*> getActivePointers();
 
      static void addActivePointer(const chai::PointerRecord* record);
      static void removeActivePointer(const chai::PointerRecord* record);
      static void clearActivePointers();
 
-      CARE_DLL_API static int s_threadID;
+      static int s_threadID;
 
   private:
      static const char * s_file_name;

From 1f159f14a3ee1a5ab682ae0645706dfee3fb5a6c Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Fri, 4 Aug 2023 11:51:59 -0700
Subject: [PATCH 12/18] addressed comments

---
 benchmarks/BenchmarkRaja.cpp | 51 +++++++++++++++++-------------------
 src/care/forall.h            |  8 ++++--
 2 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index 014eed2c..f143eedd 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -22,41 +22,42 @@
 #define size 1000000
 
-namespace care{
-
 #if defined(CARE_GPUCC)
 //each kernel has a separate stream
 static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
   int N = state.range(0);
-   RAJA::resources::Cuda res_arr[N];
-   RAJA::resources::Event event_arr[N];
+   care::Resource res_arr[16];
+   RAJA::resources::Event event_arr[16];
   care::host_device_ptr<int> arrays[16];
   for(int i = 0; i < N; i++)
   {
-      RAJA::resources::Cuda res;
-      res_arr[i] = res;
-      RAJA::resources::Event e = res.get_event();
-      event_arr[i] = e;
-      care::host_device_ptr<int> arr(size, "arr");
-      arrays[i] = arr;
+      res_arr[i] = care::Resource();
+      event_arr[i] = res_arr[i].get_event();
+      arrays[i] = care::host_device_ptr<int>(size, "arr");
   }
 
   //warmup kernel
-   RAJA::resources::Cuda warmup_res;
-   CARE_STREAMED_LOOP(warmup_res, i, 0 , size) {
+   CARE_GPU_LOOP(i, 0 , size) {
      arrays[0][i] = 0;
-   } CARE_STREAMED_LOOP_END
+   } CARE_GPU_LOOP_END
+
+   care::gpuDeviceSynchronize(__FILE__, __LINE__);
 
   for (auto _ : state) {
      //run num kernels
+      //#pragma omp parallel for
      for(int j = 0; j < N; j++)
      {
         CARE_STREAMED_LOOP(res_arr[j], i, 0 , size) {
           arrays[j][i] = i;
         } CARE_STREAMED_LOOP_END
      }
+      care::gpuDeviceSynchronize(__FILE__, __LINE__);
+   }
+
+   for(int i = 0; i < N; i++){
+      arrays[i].free();
   }
-   for(int i = 0; i < N; i++) {arrays[i].free();}
 }
 
 // Register the function as a benchmark
 BENCHMARK(benchmark_gpu_loop_separate_streams)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);
 
 //all kernels on one stream
 static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
   int N = state.range(0);
 
-   RAJA::resources::Cuda res;
+   care::Resource res;
 
   care::host_device_ptr<int> arrays[16];
   for(int i = 0; i < N; i++)
   {
-      care::host_device_ptr<int> arr(size, "arr");
-      arrays[i] = arr;
+      arrays[i] = care::host_device_ptr<int>(size, "arr");
   }
 
   //warmup kernel
-   RAJA::resources::Cuda warmup_res;
-   CARE_STREAMED_LOOP(warmup_res, i, 0, size) {
+   CARE_GPU_LOOP(i, 0, size) {
      arrays[0][i] = i;
-   } CARE_STREAMED_LOOP_END
+   } CARE_GPU_LOOP_END
+
+   care::gpuDeviceSynchronize(__FILE__, __LINE__);
 
   for (auto _ : state) {
      //run num kernels
+      //#pragma omp parallel for
      for(int j = 0; j < N; j++)
      {
         CARE_STREAMED_LOOP(res, i, 0, size) {
           arrays[j][i] = i;
         } CARE_STREAMED_LOOP_END
-         res.wait();
      }
+      res.wait();
+   }
+
+   for(int i = 0; i < N; i++){
+      arrays[i].free();
   }
-   for(int i = 0; i < N; i++) {arrays[i].free();}
 }
 
 // Register the function as a benchmark
 BENCHMARK(benchmark_gpu_loop_single_stream)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);
 
 #endif
 
-} //namespace care
-
 // Run the benchmarks
 BENCHMARK_MAIN();
diff --git a/src/care/forall.h b/src/care/forall.h
index f6ed5f7b..27158080 100644
--- a/src/care/forall.h
+++ b/src/care/forall.h
@@ -30,22 +30,26 @@ namespace care {
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
   static bool s_reverseLoopOrder = false;
 #endif
-
+   
   template <typename ExecutionPolicy>
   struct ExecutionPolicyToSpace {
      static constexpr const chai::ExecutionSpace value = chai::CPU;
   };
 
 #if defined(__CUDACC__)
+   typedef RAJA::resources::Cuda Resource;
   template <>
   struct ExecutionPolicyToSpace<RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>> {
      static constexpr const chai::ExecutionSpace value = chai::GPU;
   };
 #elif defined (__HIPCC__)
+   typedef RAJA::resources::Hip Resource;
   template <>
   struct ExecutionPolicyToSpace<RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>> {
      static constexpr const chai::ExecutionSpace value = chai::GPU;
   };
+#else
+   typedef RAJA::resources::Host Resource;
 #endif
 
 #if CARE_ENABLE_GPU_SIMULATION_MODE
@@ -208,7 +212,7 @@ namespace care {
 
 #if defined(CARE_GPUCC)
   template <typename LB>
-   void forall_with_stream(gpu, RAJA::resources::Cuda res, const char * fileName, const int lineNumber,
+   void forall_with_stream(gpu, Resource res, const char * fileName, const int lineNumber,
                           const int start, const int end, LB&& body) {

From ada1508b1e788108196ee23c40c82c6b11e6f265 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Fri, 4 Aug 2023 11:53:33 -0700
Subject: [PATCH 13/18] omp parallel for

---
 benchmarks/BenchmarkRaja.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index f143eedd..dbfc984e 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -45,7 +45,7 @@ static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
   for (auto _ : state) {
      //run num kernels
-      //#pragma omp parallel for
+      #pragma omp parallel for
      for(int j = 0; j < N; j++)
      {
@@ -84,7 +84,7 @@ static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
   for (auto _ : state) {
      //run num kernels
-      //#pragma omp parallel for
+      #pragma omp parallel for
      for(int j = 0; j < N; j++)
      {
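For orientation, a sketch of what the care::Resource typedef from patch 12 plus the OpenMP launch from patch 13 enable together (illustrative only, not part of the patches; the helper name fill_on_streams and the fixed bound of 16 resources are hypothetical, mirroring the benchmark). Each host thread drives its own stream, and the backend never has to be named:

   #include "care/DefaultMacros.h"
   #include "care/forall.h"
   #include "care/host_device_ptr.h"
   #include <omp.h>

   void fill_on_streams(care::host_device_ptr<int>* arrays, int num, int n)
   {
      // care::Resource is RAJA::resources::Cuda, ::Hip, or ::Host per build;
      // one resource (stream) per kernel, as in the benchmark (num <= 16).
      care::Resource res[16];

      #pragma omp parallel for
      for (int j = 0; j < num; ++j) {
         // Copy the handle so the device lambda captures it by value.
         care::host_device_ptr<int> arr = arrays[j];
         CARE_STREAMED_LOOP(res[j], i, 0, n) {
            arr[i] = i;
         } CARE_STREAMED_LOOP_END
      }

      for (int j = 0; j < num; ++j) {
         res[j].wait();   // drain each stream before returning
      }
   }
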
From aa901137250d7ca78d45bc4f933e0389b4df0766 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Fri, 4 Aug 2023 12:21:19 -0700
Subject: [PATCH 14/18] try to make benchmarks more similar

---
 benchmarks/BenchmarkRaja.cpp | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index dbfc984e..6d5a2393 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -49,7 +49,7 @@ static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
      for(int j = 0; j < N; j++)
      {
         CARE_STREAMED_LOOP(res_arr[j], i, 0 , size) {
-            arrays[j][i] = i;
+            arrays[j][i] = sqrtf(i) + cosf(j) * powf(i, j);
         } CARE_STREAMED_LOOP_END
      }
      care::gpuDeviceSynchronize(__FILE__, __LINE__);
   }
@@ -67,7 +67,7 @@ BENCHMARK(benchmark_gpu_loop_separate_streams)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->
 static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
   int N = state.range(0);
 
-   care::Resource res;
-
-   care::host_device_ptr<int> arrays[16];
+   care::host_device_ptr<int> arrays[16];
   for(int i = 0; i < N; i++)
   {
      arrays[i] = care::host_device_ptr<int>(size, "arr");
   }
 
   //warmup kernel
   CARE_GPU_LOOP(i, 0, size) {
-      arrays[0][i] = i;
+      arrays[0][i] = 0;
   } CARE_GPU_LOOP_END
 
   care::gpuDeviceSynchronize(__FILE__, __LINE__);
 
   for (auto _ : state) {
      //run num kernels
-      #pragma omp parallel for
      for(int j = 0; j < N; j++)
      {
-         CARE_STREAMED_LOOP(res, i, 0, size) {
-            arrays[j][i] = i;
-         } CARE_STREAMED_LOOP_END
+         CARE_GPU_LOOP(i, 0, size) {
+            arrays[j][i] = sqrtf(i) + cosf(j) * powf(i, j);
+         } CARE_GPU_LOOP_END
      }
-      res.wait();
+      care::gpuDeviceSynchronize(__FILE__, __LINE__);
   }
 
   for(int i = 0; i < N; i++){
      arrays[i].free();
   }

From 65ce8b07daf4635f90b936017f40450785293e9a Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Fri, 4 Aug 2023 12:42:33 -0700
Subject: [PATCH 15/18] set num threads

---
 benchmarks/BenchmarkRaja.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index 6d5a2393..9c88f0b8 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -45,10 +45,11 @@ static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
   for (auto _ : state) {
      //run num kernels
+      omp_set_num_threads(16);
      #pragma omp parallel for
      for(int j = 0; j < N; j++)
      {
-         CARE_STREAMED_LOOP(res_arr[j], i, 0 , size) {
+         CARE_STREAMED_LOOP(res_arr[j], i, 0 , size) {
           arrays[j][i] = sqrtf(i) + cosf(j) * powf(i, j);
         } CARE_STREAMED_LOOP_END
      }
@@ -67,7 +68,7 @@ static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
   int N = state.range(0);
 
-   care::host_device_ptr<int> arrays[16];
+   care::host_device_ptr<int> arrays[16];
   for(int i = 0; i < N; i++)
   {
      arrays[i] = care::host_device_ptr<int>(size, "arr");
   }

From 25b888581f4610c37c5e6289d3ec6d89968af0fe Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Fri, 4 Aug 2023 13:27:29 -0700
Subject: [PATCH 16/18] forall resource overload

---
 src/care/forall.h | 48 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/src/care/forall.h b/src/care/forall.h
index 27158080..4b638607 100644
--- a/src/care/forall.h
+++ b/src/care/forall.h
@@ -101,6 +101,50 @@ namespace care {
      }
   }
 
+   ////////////////////////////////////////////////////////////////////////////////
+   ///
+   /// @author Peter Robinson, Alan Dayton
+   ///
+   /// @brief Loops over the given indices and calls the loop body with each index.
+   ///        This overload is CHAI and RAJA aware and sets the execution space accordingly.
+   ///
+   /// @arg[in] policy Used to choose this overload of forall
+   /// @arg[in] res Resource to be used
+   /// @arg[in] fileName The name of the file where this function is called
+   /// @arg[in] lineNumber The line number in the file where this function is called
+   /// @arg[in] start The starting index (inclusive)
+   /// @arg[in] end The ending index (exclusive)
+   /// @arg[in] body The loop body to execute at each index
+   ///
+   ////////////////////////////////////////////////////////////////////////////////
+   template <typename ExecutionPolicy, typename LB>
+   void forall(ExecutionPolicy /* policy */, Resource res, const char * fileName, const int lineNumber,
+               const int start, const int end, LB&& body) {
+      const int length = end - start;
+
+      if (length != 0) {
+         PluginData::setFileName(fileName);
+         PluginData::setLineNumber(lineNumber);
+
+
+#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
+         RAJA::RangeStrideSegment rangeSegment =
+            s_reverseLoopOrder ?
+            RAJA::RangeStrideSegment(end - 1, start - 1, -1) :
+            RAJA::RangeStrideSegment(start, end, 1);
+#else
+         RAJA::RangeSegment rangeSegment = RAJA::RangeSegment(start, end);
+#endif
+
+#if CARE_ENABLE_GPU_SIMULATION_MODE
+         RAJA::forall<RAJA::seq_exec>(res, rangeSegment, std::forward<LB>(body));
+#else
+         RAJA::forall<ExecutionPolicy>(res, rangeSegment, std::forward<LB>(body));
+#endif
+      }
+   }
+
+
 ////////////////////////////////////////////////////////////////////////////////
    ///
    /// @author Alan Dayton
@@ -222,10 +266,10 @@ namespace care {
      forall(gpu_simulation{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
 #elif defined(__CUDACC__)
      forall(RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
-             res, RAJA::RangeSegment(start, end), std::forward<LB>(body));
+             res, fileName, lineNumber, start, end, std::forward<LB>(body));
 #elif defined(__HIPCC__)
     forall(RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
-             res, RAJA::RangeSegment(start, end), std::forward<LB>(body));
+             res, fileName, lineNumber, start, end, std::forward<LB>(body));
 #else
     forall(RAJA::seq_exec{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
 #endif

From f761e5fa8f79b02fb93860ad1f0c5d0f17c773eb Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Fri, 4 Aug 2023 14:23:03 -0700
Subject: [PATCH 17/18] N threads

---
 benchmarks/BenchmarkRaja.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index 9c88f0b8..8b3b3285 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -45,7 +45,7 @@ static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
   for (auto _ : state) {
      //run num kernels
-      omp_set_num_threads(16);
+      omp_set_num_threads(N);
      #pragma omp parallel for
      for(int j = 0; j < N; j++)
      {

From e28bfb5a3e233201cc554968f7d11d6e630c37df Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Fri, 4 Aug 2023 14:38:38 -0700
Subject: [PATCH 18/18] resource template parameter

---
 src/care/forall.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/care/forall.h b/src/care/forall.h
index 4b638607..20036cd4 100644
--- a/src/care/forall.h
+++ b/src/care/forall.h
@@ -117,8 +117,8 @@ namespace care {
-   template <typename ExecutionPolicy, typename LB>
-   void forall(ExecutionPolicy /* policy */, Resource res, const char * fileName, const int lineNumber,
+   template <typename ExecutionPolicy, typename R, typename LB>
+   void forall(ExecutionPolicy /* policy */, R res, const char * fileName, const int lineNumber,
               const int start, const int end, LB&& body) {
      const int length = end - start;