From c8b629cfcd6b3928a914933016173731a7b271e1 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Wed, 19 Jul 2023 14:36:53 -0700
Subject: [PATCH 01/18] forall_with_streams and updated BenchmarkForall.cpp

---
 benchmarks/BenchmarkForall.cpp | 17 ++++++++++
 src/care/forall.h              | 62 +++++++++++++++++++++++-----------
 2 files changed, 60 insertions(+), 19 deletions(-)

diff --git a/benchmarks/BenchmarkForall.cpp b/benchmarks/BenchmarkForall.cpp
index e909ac7c..da904822 100644
--- a/benchmarks/BenchmarkForall.cpp
+++ b/benchmarks/BenchmarkForall.cpp
@@ -69,6 +69,23 @@ static void benchmark_gpu_loop(benchmark::State& state) {
 // Register the function as a benchmark
 BENCHMARK(benchmark_gpu_loop)->Range(1, INT_MAX);
 
+static void benchmark_gpu_loop_streams(benchmark::State& state) {
+   const int size = state.range(0);
+   care::host_device_ptr<int> data(size, "data");
+
+   for (auto _ : state) {
+      RAJA::resources::Cuda res;
+      care::forall_with_stream(care::gpu{}, res, "test", 0, 0, size, [=] CARE_DEVICE (int i) {
+         data[i] = i;
+      });
+   }
+
+   data.free();
+}
+
+// Register the function as a benchmark
+BENCHMARK(benchmark_gpu_loop_streams)->Range(1, INT_MAX);
+
 #endif
 
 // Run the benchmarks
diff --git a/src/care/forall.h b/src/care/forall.h
index d45433cc..4ac0043a 100644
--- a/src/care/forall.h
+++ b/src/care/forall.h
@@ -18,12 +18,11 @@
 
 // CARE headers
 #include "care/policies.h"
+#include "care/RAJAPlugin.h"
 #include "care/util.h"
-#include "care/PluginData.h"
 
 // other library headers
 #include "chai/ArrayManager.hpp"
-#include "chai/ExecutionSpaces.hpp"
 #include "RAJA/RAJA.hpp"
 
 namespace care {
@@ -76,9 +75,9 @@ namespace care {
       const int length = end - start;
 
       if (length != 0) {
-         PluginData::setFileName(fileName);
-         PluginData::setLineNumber(lineNumber);
-
+#ifndef CARE_DISABLE_RAJAPLUGIN
+         RAJAPlugin::pre_forall_hook(ExecutionPolicyToSpace<ExecutionPolicy>::value, fileName, lineNumber);
+#endif
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
          RAJA::RangeStrideSegment rangeSegment =
@@ -94,6 +93,10 @@ namespace care {
 #else
          RAJA::forall<ExecutionPolicy>(rangeSegment, std::forward<LB>(body));
 #endif
+
+#ifndef CARE_DISABLE_RAJAPLUGIN
+         RAJAPlugin::post_forall_hook(ExecutionPolicyToSpace<ExecutionPolicy>::value, fileName, lineNumber);
+#endif
       }
    }
 
@@ -189,6 +192,31 @@ namespace care {
 #endif
    }
 
+   template <typename LB>
+   void forall_with_stream(gpu, RAJA::resources::Cuda res, const char * fileName, const int lineNumber,
+                           const int start, const int end, LB&& body) {
+#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
+      s_reverseLoopOrder = true;
+#endif
+
+#if CARE_ENABLE_GPU_SIMULATION_MODE
+      forall(gpu_simulation{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
+#elif defined(__CUDACC__)
+      forall(RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
+             res, RAJA::RangeSegment(start, end), std::forward<LB>(body));
+#elif defined(__HIPCC__)
+      forall(RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
+             res, RAJA::RangeSegment(start, end), std::forward<LB>(body));
+#else
+      forall(RAJA::seq_exec{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
+#endif
+
+#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
+      s_reverseLoopOrder = false;
+#endif
+   }
+
+
 ////////////////////////////////////////////////////////////////////////////////
    ///
    /// @author Alan Dayton
@@ -211,8 +239,8 @@ namespace care {
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
       s_reverseLoopOrder = true;
 #endif
-      PluginData::setParallelContext(true);
-
+
+      RAJAPlugin::setParallelContext(true);
 #if CARE_ENABLE_GPU_SIMULATION_MODE
       forall(gpu_simulation{}, fileName, lineNumber, start, end, std::forward<LB>(body));
 #elif defined(__CUDACC__)
       forall(RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
              fileName, lineNumber, start, end, std::forward<LB>(body));
 #elif defined(__HIPCC__)
       forall(RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
              fileName, lineNumber, start, end, std::forward<LB>(body));
 #else
       forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, std::forward<LB>(body));
 #endif
-      PluginData::setParallelContext(false);
+      RAJAPlugin::setParallelContext(false);
 
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
       s_reverseLoopOrder = false;
 #endif
   }
@@ -487,10 +515,8 @@ namespace care {
    ///
 ////////////////////////////////////////////////////////////////////////////////
    template <typename Exec, typename LB>
-   void launch_2D_jagged(Exec /*policy*/, int xstart, int /*xend*/, int const * host_lengths, int ystart, int ylength, const char * /* fileName */, int /* lineNumber */, LB && body) {
-      chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
-      arrayManager->setExecutionSpace(ExecutionPolicyToSpace<Exec>::value);
-
+   void launch_2D_jagged(Exec /*policy*/, int xstart, int /*xend*/, int const * host_lengths, int ystart, int ylength, const char * fileName, int lineNumber, LB && body) {
+      care::RAJAPlugin::pre_forall_hook(chai::CPU, fileName, lineNumber);
       // intentional trigger of copy constructor for CHAI correctness
       LB body_to_call{body};
       for (int y = ystart; y < ylength; ++y) {
          for (int x = xstart; x < host_lengths[y]; ++x) {
            body_to_call(x, y);
         }
      }
-      arrayManager->setExecutionSpace(chai::ExecutionSpace::NONE);
+      care::RAJAPlugin::post_forall_hook(chai::CPU, fileName, lineNumber);
   }
 
 #ifdef CARE_GPUCC
@@ -541,22 +567,20 @@ namespace care {
    ///
 ////////////////////////////////////////////////////////////////////////////////
    template <typename LB>
-   void launch_2D_jagged(care::gpu, int xstart, int xend, int const * gpu_lengths, int ystart, int ylength, const char * fileName, int lineNumber , LB && body) {
+   void launch_2D_jagged(care::gpu, int xstart, int xend, int const * gpu_lengths, int ystart, int ylength, const char * fileName, int lineNumber, LB && body) {
      if (xend > 0 && ylength > 0) {
         // TODO launch this kernel in the camp or RAJA default stream - not sure how to do this - for now this is a synchronous call on the CUDA/HIP default stream
-         chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
-         arrayManager->setExecutionSpace(chai::GPU);
-
+         care::RAJAPlugin::pre_forall_hook(chai::GPU, fileName, lineNumber);
         dim3 dimBlock(CARE_CUDA_BLOCK_SIZE, 1);
         dim3 dimGrid;
         dimGrid.x = (xend/CARE_CUDA_BLOCK_SIZE)+(xend%CARE_CUDA_BLOCK_SIZE==0?0:1);
         dimGrid.y = ylength;
         care_kernel_2D<<<dimGrid, dimBlock>>>( body, gpu_lengths, ylength);
-
-         arrayManager->setExecutionSpace(chai::ExecutionSpace::NONE);
+         care::RAJAPlugin::post_forall_hook(chai::GPU, fileName, lineNumber);
      }
   }
 #endif
+
 } // namespace care
 
 #endif // !defined(_CARE_FORALL_H_)
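For orientation, a minimal sketch of how the new care::forall_with_stream is meant to be called (illustrative only, not part of the patches; the helper name fill_async is hypothetical, and this assumes a CUDA build). It follows the signature and the CARE_DEVICE usage shown in the benchmark above:

   #include "care/forall.h"
   #include "care/host_device_ptr.h"

   void fill_async(care::host_device_ptr<int> data, int size)
   {
      // Each RAJA::resources::Cuda wraps its own CUDA stream.
      RAJA::resources::Cuda res;

      // The kernel below is enqueued on res's stream instead of the
      // device-wide default stream.
      care::forall_with_stream(care::gpu{}, res, __FILE__, __LINE__, 0, size,
         [=] CARE_DEVICE (int i) {
            data[i] = i;
         });

      // Block the host until the work queued on this stream has finished.
      res.wait();
   }
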
From d7f25c9788099c897a733356586b48bd51c6f2f5 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Wed, 19 Jul 2023 14:45:00 -0700
Subject: [PATCH 02/18] branch wasn't up to date

---
 benchmarks/BenchmarkForall.cpp |  2 +-
 src/care/forall.h              | 57 ++++++++++++++++++++++------------
 2 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/benchmarks/BenchmarkForall.cpp b/benchmarks/BenchmarkForall.cpp
index da904822..eefd3392 100644
--- a/benchmarks/BenchmarkForall.cpp
+++ b/benchmarks/BenchmarkForall.cpp
@@ -75,7 +75,7 @@ static void benchmark_gpu_loop_streams(benchmark::State& state) {
 
    for (auto _ : state) {
       RAJA::resources::Cuda res;
-      care::forall_with_stream(care::gpu{}, res, "test", 0, 0, size, [=] CARE_DEVICE (int i) {
+      care::forall_with_stream(care::gpu{}, res, "BenchmarkForall.cpp", 78, 0, size, [=] CARE_DEVICE (int i) {
         data[i] = i;
      });
   }
diff --git a/src/care/forall.h b/src/care/forall.h
index 4ac0043a..76d9cf1f 100644
--- a/src/care/forall.h
+++ b/src/care/forall.h
@@ -18,11 +18,12 @@
 
 // CARE headers
 #include "care/policies.h"
-#include "care/RAJAPlugin.h"
 #include "care/util.h"
+#include "care/PluginData.h"
 
 // other library headers
 #include "chai/ArrayManager.hpp"
+#include "chai/ExecutionSpaces.hpp"
 #include "RAJA/RAJA.hpp"
 
 namespace care {
@@ -75,9 +76,9 @@ namespace care {
      const int length = end - start;
 
      if (length != 0) {
-#ifndef CARE_DISABLE_RAJAPLUGIN
-         RAJAPlugin::pre_forall_hook(ExecutionPolicyToSpace<ExecutionPolicy>::value, fileName, lineNumber);
-#endif
+         PluginData::setFileName(fileName);
+         PluginData::setLineNumber(lineNumber);
+
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
         RAJA::RangeStrideSegment rangeSegment =
@@ -93,10 +94,6 @@ namespace care {
 #else
         RAJA::forall<ExecutionPolicy>(rangeSegment, std::forward<LB>(body));
 #endif
-
-#ifndef CARE_DISABLE_RAJAPLUGIN
-         RAJAPlugin::post_forall_hook(ExecutionPolicyToSpace<ExecutionPolicy>::value, fileName, lineNumber);
-#endif
      }
   }
 
@@ -192,6 +189,23 @@ namespace care {
 #endif
   }
 
+   ////////////////////////////////////////////////////////////////////////////////
+   ///
+   /// @author Neela Kausik
+   ///
+   /// @brief If GPU is available, execute on the device. Otherwise, execute on
+   ///        the host. This specialization is needed for clang-query.
+   ///
+   /// @arg[in] gpu Used to choose this overload of forall
+   /// @arg[in] res Resource provided for execution
+   /// @arg[in] fileName The name of the file where this function is called
+   /// @arg[in] lineNumber The line number in the file where this function is called
+   /// @arg[in] start The starting index (inclusive)
+   /// @arg[in] end The ending index (exclusive)
+   /// @arg[in] body The loop body to execute at each index
+   ///
+   ////////////////////////////////////////////////////////////////////////////////
+
    template <typename LB>
    void forall_with_stream(gpu, RAJA::resources::Cuda res, const char * fileName, const int lineNumber,
                            const int start, const int end, LB&& body) {
@@ -214,8 +228,7 @@ namespace care {
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
      s_reverseLoopOrder = false;
 #endif
-   }
-
+   }
 
 ////////////////////////////////////////////////////////////////////////////////
    ///
    /// @author Alan Dayton
@@ -239,8 +252,8 @@ namespace care {
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
      s_reverseLoopOrder = true;
 #endif
-
-      RAJAPlugin::setParallelContext(true);
+      PluginData::setParallelContext(true);
+
 #if CARE_ENABLE_GPU_SIMULATION_MODE
      forall(gpu_simulation{}, fileName, lineNumber, start, end, std::forward<LB>(body));
 #elif defined(__CUDACC__)
      forall(RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
             fileName, lineNumber, start, end, std::forward<LB>(body));
 #elif defined(__HIPCC__)
      forall(RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
             fileName, lineNumber, start, end, std::forward<LB>(body));
 #else
      forall(RAJA::seq_exec{}, fileName, lineNumber, start, end, std::forward<LB>(body));
 #endif
+      PluginData::setParallelContext(false);
 
-      RAJAPlugin::setParallelContext(false);
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
      s_reverseLoopOrder = false;
 #endif
@@ -515,8 +528,10 @@ namespace care {
    ///
 ////////////////////////////////////////////////////////////////////////////////
    template <typename Exec, typename LB>
-   void launch_2D_jagged(Exec /*policy*/, int xstart, int /*xend*/, int const * host_lengths, int ystart, int ylength, const char * fileName, int lineNumber, LB && body) {
-      care::RAJAPlugin::pre_forall_hook(chai::CPU, fileName, lineNumber);
+   void launch_2D_jagged(Exec /*policy*/, int xstart, int /*xend*/, int const * host_lengths, int ystart, int ylength, const char * /* fileName */, int /* lineNumber */, LB && body) {
+      chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
+      arrayManager->setExecutionSpace(ExecutionPolicyToSpace<Exec>::value);
+
      // intentional trigger of copy constructor for CHAI correctness
      LB body_to_call{body};
      for (int y = ystart; y < ylength; ++y) {
         for (int x = xstart; x < host_lengths[y]; ++x) {
           body_to_call(x, y);
         }
      }
-      care::RAJAPlugin::post_forall_hook(chai::CPU, fileName, lineNumber);
+      arrayManager->setExecutionSpace(chai::ExecutionSpace::NONE);
   }
 
 #ifdef CARE_GPUCC
@@ -567,20 +582,22 @@ namespace care {
    ///
 ////////////////////////////////////////////////////////////////////////////////
    template <typename LB>
-   void launch_2D_jagged(care::gpu, int xstart, int xend, int const * gpu_lengths, int ystart, int ylength, const char * fileName, int lineNumber, LB && body) {
+   void launch_2D_jagged(care::gpu, int xstart, int xend, int const * gpu_lengths, int ystart, int ylength, const char * fileName, int lineNumber , LB && body) {
      if (xend > 0 && ylength > 0) {
        // TODO launch this kernel in the camp or RAJA default stream - not sure how to do this - for now this is a synchronous call on the CUDA/HIP default stream
-         care::RAJAPlugin::pre_forall_hook(chai::GPU, fileName, lineNumber);
+         chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
+         arrayManager->setExecutionSpace(chai::GPU);
+
        dim3 dimBlock(CARE_CUDA_BLOCK_SIZE, 1);
        dim3 dimGrid;
        dimGrid.x = (xend/CARE_CUDA_BLOCK_SIZE)+(xend%CARE_CUDA_BLOCK_SIZE==0?0:1);
        dimGrid.y = ylength;
        care_kernel_2D<<<dimGrid, dimBlock>>>( body, gpu_lengths, ylength);
-         care::RAJAPlugin::post_forall_hook(chai::GPU, fileName, lineNumber);
+
+         arrayManager->setExecutionSpace(chai::ExecutionSpace::NONE);
      }
   }
 #endif
-
 } // namespace care
 
 #endif // !defined(_CARE_FORALL_H_)

From 26fbe1b6572c76b764a145f877f54eb49597ead9 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Mon, 24 Jul 2023 13:38:56 -0700
Subject: [PATCH 03/18] multiple kernels

---
 benchmarks/BenchmarkForall.cpp | 48 ++++++++++--------
 benchmarks/BenchmarkRaja.cpp   | 89 ++++++++++++++++++++++++++++++++++
 benchmarks/CMakeLists.txt      | 13 +++++
 3 files changed, 130 insertions(+), 20 deletions(-)
 create mode 100644 benchmarks/BenchmarkRaja.cpp

diff --git a/benchmarks/BenchmarkForall.cpp b/benchmarks/BenchmarkForall.cpp
index eefd3392..4bf536b2 100644
--- a/benchmarks/BenchmarkForall.cpp
+++ b/benchmarks/BenchmarkForall.cpp
@@ -8,6 +8,7 @@
 // CARE headers
 #include "care/DefaultMacros.h"
 #include "care/host_device_ptr.h"
+#include "care/util.h"
 
 // Other library headers
 #include <benchmark/benchmark.h>
@@ -19,9 +20,14 @@ static void benchmark_sequential_loop(benchmark::State& state) {
   const int size = state.range(0);
   care::host_device_ptr<int> data(size, "data");
 
+   // For consistency with the GPU case, which requires a warm up kernel
+   CARE_SEQUENTIAL_LOOP(i, 0, size) {
+      data[i] = 0;
+   } CARE_SEQUENTIAL_LOOP_END
+
   for (auto _ : state) {
      CARE_SEQUENTIAL_LOOP(i, 0, size) {
-         data[i] = i;
+         data[i] += i;
      } CARE_SEQUENTIAL_LOOP_END
   }
@@ -37,10 +43,19 @@ static void benchmark_openmp_loop(benchmark::State& state) {
   const int size = state.range(0);
   care::host_device_ptr<int> data(size, "data");
 
+   // For consistency with the GPU case, which requires a warm up kernel
+   CARE_OPENMP_LOOP(i, 0, size) {
+      data[i] = 0;
+   } CARE_OPENMP_LOOP_END
+
+   // TODO: Is a synchronize needed?
+
   for (auto _ : state) {
      CARE_OPENMP_LOOP(i, 0, size) {
-         data[i] = i;
+         data[i] += i;
      } CARE_OPENMP_LOOP_END
+
+      // TODO: Is a synchronize needed?
   }
 
   data.free();
@@ -57,34 +72,27 @@ static void benchmark_gpu_loop(benchmark::State& state) {
   const int size = state.range(0);
   care::host_device_ptr<int> data(size, "data");
 
+   // Warm up kernel
+   CARE_GPU_LOOP(i, 0, size) {
+      data[i] = 0;
+   } CARE_GPU_LOOP_END
+
+   care::gpuDeviceSynchronize(__FILE__, __LINE__);
+
   for (auto _ : state) {
      CARE_GPU_LOOP(i, 0, size) {
-         data[i] = i;
+         data[i] += i;
      } CARE_GPU_LOOP_END
-   }
-
-   data.free();
-}
-
-// Register the function as a benchmark
-BENCHMARK(benchmark_gpu_loop)->Range(1, INT_MAX);
-
-static void benchmark_gpu_loop_streams(benchmark::State& state) {
-   const int size = state.range(0);
-   care::host_device_ptr<int> data(size, "data");
-
-   for (auto _ : state) {
-      RAJA::resources::Cuda res;
-      care::forall_with_stream(care::gpu{}, res, "BenchmarkForall.cpp", 78, 0, size, [=] CARE_DEVICE (int i) {
-         data[i] = i;
-      });
+
+      // Timings are much more consistent with this synchronize
+      care::gpuDeviceSynchronize(__FILE__, __LINE__);
   }
 
   data.free();
 }
 
 // Register the function as a benchmark
-BENCHMARK(benchmark_gpu_loop_streams)->Range(1, INT_MAX);
+BENCHMARK(benchmark_gpu_loop)->Range(1, INT_MAX);
 
 #endif
 
diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
new file mode 100644
index 00000000..9c092f5f
--- /dev/null
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -0,0 +1,89 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// Copyright 2020 Lawrence Livermore National Security, LLC and other CARE developers.
+// See the top-level LICENSE file for details.
+//
+// SPDX-License-Identifier: BSD-3-Clause
+//////////////////////////////////////////////////////////////////////////////////////
+
+// CARE headers
+#include "care/DefaultMacros.h"
+#include "care/host_device_ptr.h"
+#include "care/forall.h"
+#include "care/policies.h"
+#include "RAJA/RAJA.hpp"
+
+// Other library headers
+#include <benchmark/benchmark.h>
+#include <omp.h>
+
+// Std library headers
+#include <climits>
+#include <cstdlib>
+
+#define NUM_KERNELS 4
+
+using namespace care;
+
+#if defined(CARE_GPUCC)
+//each kernel has a separate stream
+static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
+   int N = state.range(0);
+   const char * fileName = "test";
+
+   RAJA::resources::Cuda res_arr[NUM_KERNELS];
+   RAJA::resources::Event event_arr[NUM_KERNELS];
+   for(int i = 0; i < NUM_KERNELS; i++)
+   {
+      RAJA::resources::Cuda res;
+      res_arr[i] = res;
+      RAJA::resources::Event e = res.get_event();
+      event_arr[i] = e;
+   }
+
+   care::host_device_ptr<int> arr(N, "arr");
+   for (auto _ : state) {
+      //run num kernels
+      for(int j = 0; j < NUM_KERNELS; j++)
+      {
+         care::forall_with_stream(gpu{}, res_arr[j], fileName, 71, 0 , N, [=] RAJA_HOST_DEVICE (int i) {
+            arr[i] = i;
+         });
+
+         if(j > 0) res_arr[j].wait_for(&event_arr[j - 1]);
+      }
+   }
+   arr.free();
+}
+
+// Register the function as a benchmark
+BENCHMARK(benchmark_gpu_loop_separate_streams)->Range(1, INT_MAX);
+
+//all kernels on one stream
+static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
+int N = state.range(0);
+   const char * fileName = "test";
+   RAJA::resources::Cuda res;
+
+   care::host_device_ptr<int> arr(N, "arr");
+
+   RAJA::resources::Event e = res.get_event();
+   for (auto _ : state) {
+      //run num kernels
+      for(int j = 0; j < NUM_KERNELS; j++)
+      {
+         care::forall_with_stream(gpu{}, res, fileName, 71, 0 , N, [=] RAJA_HOST_DEVICE (int i) {
+            arr[i] = i;
+         });
+         res.wait();
+      }
+   }
+   arr.free();
+}
+
+// Register the function as a benchmark
+BENCHMARK(benchmark_gpu_loop_single_stream)->Range(1, INT_MAX);
+
+#endif
+
+// Run the benchmarks
+BENCHMARK_MAIN();
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 5f798b75..cfde099a 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -36,6 +36,19 @@ target_include_directories(BenchmarkForall
 blt_add_benchmark(NAME BenchmarkForall
                   COMMAND BenchmarkForall)
 
+blt_add_executable(NAME BenchmarkRaja
+                   SOURCES BenchmarkRaja.cpp
+                   DEPENDS_ON ${care_benchmark_depends})
+
+target_include_directories(BenchmarkRaja
+                           PRIVATE ${PROJECT_SOURCE_DIR}/src)
+
+target_include_directories(BenchmarkRaja
+                           PRIVATE ${PROJECT_BINARY_DIR}/include)
+
+blt_add_benchmark(NAME BenchmarkRaja
+                  COMMAND BenchmarkRaja)
+
 blt_add_executable(NAME BenchmarkNumeric
                    SOURCES BenchmarkNumeric.cpp
                    DEPENDS_ON ${care_benchmark_depends})

From 291fd6576dc5bbf8f0722768bedc26ca6ac78fc6 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Mon, 24 Jul 2023 16:16:25 -0700
Subject: [PATCH 04/18] care macros

---
 benchmarks/BenchmarkRaja.cpp |  8 ++++----
 src/care/DefaultMacros.h     | 13 +++++++++++++
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index 9c092f5f..46f82f46 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -45,9 +45,9 @@ static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
      //run num kernels
      for(int j = 0; j < NUM_KERNELS; j++)
      {
-         care::forall_with_stream(gpu{}, res_arr[j], fileName, 71, 0 , N, [=] RAJA_HOST_DEVICE (int i) {
+         CARE_STREAMED_LOOP(i, res_arr[j], 0 , N) {
           arr[i] = i;
-         });
+         } CARE_STREAMED_LOOP_END
 
        if(j > 0) res_arr[j].wait_for(&event_arr[j - 1]);
      }
@@ -71,9 +71,9 @@ int N = state.range(0);
      //run num kernels
      for(int j = 0; j < NUM_KERNELS; j++)
      {
-         care::forall_with_stream(gpu{}, res, fileName, 71, 0 , N, [=] RAJA_HOST_DEVICE (int i) {
+         CARE_STREAMED_LOOP(i, res, 0 , N) {
           arr[i] = i;
-         });
+         }CARE_STREAMED_LOOP_END
        res.wait();
      }
   }
diff --git a/src/care/DefaultMacros.h b/src/care/DefaultMacros.h
index 1de5ec51..37668c42 100644
--- a/src/care/DefaultMacros.h
+++ b/src/care/DefaultMacros.h
@@ -548,6 +548,15 @@
 #define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) }); \
    CARE_NEST_END(CHECK) }}
 
+#define CARE_CHECKED_STREAMED_LOOP_START(INDEX, RESOURCE, START_INDEX, END_INDEX, CHECK) { \
+   if (END_INDEX > START_INDEX) { \
+      CARE_NEST_BEGIN(CHECK) \
+      care::forall_with_stream(care::gpu{}, RESOURCE, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] CARE_DEVICE (const int INDEX) { \
+         CARE_SET_THREAD_ID(INDEX)
+
+#define CARE_CHECKED_STREAMED_LOOP_END(CHECK) }); \
+   CARE_NEST_END(CHECK) }}
+
 ////////////////////////////////////////////////////////////////////////////////
 ///
 /// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is
 
 #define CARE_PARALLEL_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_parallel_loop_check)
 
+#define CARE_STREAMED_LOOP(INDEX, RESOURCE, START_INDEX, END_INDEX) CARE_CHECKED_STREAMED_LOOP_START(INDEX, RESOURCE, START_INDEX, END_INDEX, care_streamed_loop_check)
+
+#define CARE_STREAMED_LOOP_END CARE_CHECKED_STREAMED_LOOP_END(care_streamed_loop_check)
+
 ////////////////////////////////////////////////////////////////////////////////
 ///
 /// @brief Macros that start and end a RAJA loop that uses at least one
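For orientation, a minimal sketch of the streamed-loop usage that patch 04's macros enable (illustrative only, not part of the patches; the helper name pipeline is hypothetical, and this uses patch 04's original argument order with the index first — patch 06 below swaps the resource to the front). The event wait mirrors the pattern already used in BenchmarkRaja.cpp:

   #include "care/DefaultMacros.h"
   #include "care/host_device_ptr.h"

   void pipeline(care::host_device_ptr<int> a, care::host_device_ptr<int> b, int n)
   {
      // Two kernels on two streams; an event makes the second stream wait on
      // the first kernel without synchronizing the whole device.
      RAJA::resources::Cuda res_a;
      RAJA::resources::Cuda res_b;

      CARE_STREAMED_LOOP(i, res_a, 0, n) {
         a[i] = i;
      } CARE_STREAMED_LOOP_END

      RAJA::resources::Event e = res_a.get_event();
      res_b.wait_for(&e);

      CARE_STREAMED_LOOP(i, res_b, 0, n) {
         b[i] = a[i];
      } CARE_STREAMED_LOOP_END
   }
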
From c31c924a9a114c6105674918318cc6f9c9200535 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Tue, 25 Jul 2023 08:51:47 -0700
Subject: [PATCH 05/18] try to fix cuda error

---
 src/care/forall.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/care/forall.h b/src/care/forall.h
index 76d9cf1f..f6ed5f7b 100644
--- a/src/care/forall.h
+++ b/src/care/forall.h
@@ -206,6 +206,7 @@ namespace care {
    ////////////////////////////////////////////////////////////////////////////////
 
+#if defined(CARE_GPUCC)
    template <typename LB>
    void forall_with_stream(gpu, RAJA::resources::Cuda res, const char * fileName, const int lineNumber,
                            const int start, const int end, LB&& body) {
@@ -229,6 +230,7 @@ namespace care {
      s_reverseLoopOrder = false;
 #endif
   }
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////
    ///

From 533e0a42a9bba1015c44ac5eac65042f354c1249 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Tue, 1 Aug 2023 11:38:32 -0700
Subject: [PATCH 06/18] fixed indentation and macros

---
 benchmarks/BenchmarkRaja.cpp | 76 +++++++++++++++++-------------------
 src/care/DefaultMacros.h     |  4 +-
 2 files changed, 38 insertions(+), 42 deletions(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index 46f82f46..89ae09a3 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -28,31 +28,29 @@ using namespace care;
 //each kernel has a separate stream
 static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
   int N = state.range(0);
-   const char * fileName = "test";
 
-   RAJA::resources::Cuda res_arr[NUM_KERNELS];
-   RAJA::resources::Event event_arr[NUM_KERNELS];
-   for(int i = 0; i < NUM_KERNELS; i++)
-   {
-      RAJA::resources::Cuda res;
-      res_arr[i] = res;
-      RAJA::resources::Event e = res.get_event();
-      event_arr[i] = e;
-   }
+   RAJA::resources::Cuda res_arr[NUM_KERNELS];
+   RAJA::resources::Event event_arr[NUM_KERNELS];
+   for(int i = 0; i < NUM_KERNELS; i++)
+   {
+      RAJA::resources::Cuda res;
+      res_arr[i] = res;
+      RAJA::resources::Event e = res.get_event();
+      event_arr[i] = e;
+   }
 
-   care::host_device_ptr<int> arr(N, "arr");
+   care::host_device_ptr<int> arr(N, "arr");
   for (auto _ : state) {
-      //run num kernels
-      for(int j = 0; j < NUM_KERNELS; j++)
-      {
-         CARE_STREAMED_LOOP(i, res_arr[j], 0 , N) {
-            arr[i] = i;
-         } CARE_STREAMED_LOOP_END
-
-         if(j > 0) res_arr[j].wait_for(&event_arr[j - 1]);
-      }
-   }
-   arr.free();
+      //run num kernels
+      for(int j = 0; j < NUM_KERNELS; j++)
+      {
+         CARE_STREAMED_LOOP(res_arr[j], i, 0 , N) {
+            arr[i] = i;
+         } CARE_STREAMED_LOOP_END
+         if(j > 0) res_arr[j].wait_for(&event_arr[j - 1]);
+      }
+   }
+   arr.free();
 }
 
 // Register the function as a benchmark
 BENCHMARK(benchmark_gpu_loop_separate_streams)->Range(1, INT_MAX);
 
 //all kernels on one stream
 static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
-int N = state.range(0);
-   const char * fileName = "test";
-   RAJA::resources::Cuda res;
-
-   care::host_device_ptr<int> arr(N, "arr");
-
-   RAJA::resources::Event e = res.get_event();
-   for (auto _ : state) {
-      //run num kernels
-      for(int j = 0; j < NUM_KERNELS; j++)
-      {
-         CARE_STREAMED_LOOP(i, res, 0 , N) {
-            arr[i] = i;
-         }CARE_STREAMED_LOOP_END
-         res.wait();
-      }
-   }
-   arr.free();
+   int N = state.range(0);
+
+   RAJA::resources::Cuda res;
+
+   care::host_device_ptr<int> arr(N, "arr");
+   for (auto _ : state) {
+      //run num kernels
+      for(int j = 0; j < NUM_KERNELS; j++)
+      {
+         CARE_STREAMED_LOOP(res, i, 0, N) {
+            arr[i] = i;
+         }CARE_STREAMED_LOOP_END
+         res.wait();
+      }
+   }
+   arr.free();
 }
 
 // Register the function as a benchmark
diff --git a/src/care/DefaultMacros.h b/src/care/DefaultMacros.h
index 37668c42..97aeb674 100644
--- a/src/care/DefaultMacros.h
+++ b/src/care/DefaultMacros.h
@@ -548,7 +548,7 @@
 #define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) }); \
    CARE_NEST_END(CHECK) }}
 
-#define CARE_CHECKED_STREAMED_LOOP_START(INDEX, RESOURCE, START_INDEX, END_INDEX, CHECK) { \
+#define CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, CHECK) { \
    if (END_INDEX > START_INDEX) { \
       CARE_NEST_BEGIN(CHECK) \
       care::forall_with_stream(care::gpu{}, RESOURCE, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] CARE_DEVICE (const int INDEX) { \
          CARE_SET_THREAD_ID(INDEX)
@@ -762,7 +762,7 @@
 #define CARE_PARALLEL_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_parallel_loop_check)
 
-#define CARE_STREAMED_LOOP(INDEX, RESOURCE, START_INDEX, END_INDEX) CARE_CHECKED_STREAMED_LOOP_START(INDEX, RESOURCE, START_INDEX, END_INDEX, care_streamed_loop_check)
+#define CARE_STREAMED_LOOP(RESOURCE, INDEX, START_INDEX, END_INDEX) CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, care_streamed_loop_check)
 
 #define CARE_STREAMED_LOOP_END CARE_CHECKED_STREAMED_LOOP_END(care_streamed_loop_check)

From 5d3a07ba365717351253f4b41ea40983430c150d Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Wed, 2 Aug 2023 16:02:20 -0700
Subject: [PATCH 07/18] legacy macros

---
 src/care/DefaultMacros.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/care/DefaultMacros.h b/src/care/DefaultMacros.h
index 97aeb674..6e055d25 100644
--- a/src/care/DefaultMacros.h
+++ b/src/care/DefaultMacros.h
@@ -261,6 +261,10 @@
 #define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK)
 
+#define CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK)
+
+#define CARE_CHECKED_STREAMED_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK)
+
 ////////////////////////////////////////////////////////////////////////////////
 ///
 /// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is

From 131dc67af6c29ced6a25c0f729fe2f6572bc9239 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Thu, 3 Aug 2023 11:59:22 -0700
Subject: [PATCH 08/18] warmup kernels and multiple arrays

---
 benchmarks/BenchmarkRaja.cpp | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index 89ae09a3..9e8b9437 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -31,26 +31,33 @@ static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
 
   RAJA::resources::Cuda res_arr[NUM_KERNELS];
   RAJA::resources::Event event_arr[NUM_KERNELS];
+   care::host_device_ptr<int> arrays[NUM_KERNELS];
   for(int i = 0; i < NUM_KERNELS; i++)
   {
      RAJA::resources::Cuda res;
      res_arr[i] = res;
      RAJA::resources::Event e = res.get_event();
      event_arr[i] = e;
+      care::host_device_ptr<int> arr(N, "arr");
+      arrays[i] = arr;
   }
+
+   //warmup kernel
+   RAJA::resources::Cuda warmup_res;
+   CARE_STREAMED_LOOP(warmup_res, i, 0 , N) {
+      arrays[0][i] = 0;
+   } CARE_STREAMED_LOOP_END
 
-   care::host_device_ptr<int> arr(N, "arr");
   for (auto _ : state) {
      //run num kernels
      for(int j = 0; j < NUM_KERNELS; j++)
      {
         CARE_STREAMED_LOOP(res_arr[j], i, 0 , N) {
-            arr[i] = i;
+            arrays[j][i] = i;
         } CARE_STREAMED_LOOP_END
         if(j > 0) res_arr[j].wait_for(&event_arr[j - 1]);
      }
   }
-   arr.free();
+   for(int i = 0; i < NUM_KERNELS; i++) {arrays[i].free();}
 }
 
 // Register the function as a benchmark
 BENCHMARK(benchmark_gpu_loop_separate_streams)->Range(1, INT_MAX);
 
 //all kernels on one stream
 static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
   int N = state.range(0);
 
   RAJA::resources::Cuda res;
 
-   care::host_device_ptr<int> arr(N, "arr");
+   care::host_device_ptr<int> arrays[NUM_KERNELS];
+   for(int i = 0; i < NUM_KERNELS; i++)
+   {
+      care::host_device_ptr<int> arr(N, "arr");
+      arrays[i] = arr;
+   }
+
+   //warmup kernel
+   RAJA::resources::Cuda warmup_res;
+   CARE_STREAMED_LOOP(warmup_res, i, 0, N) {
+      arrays[0][i] = i;
+   } CARE_STREAMED_LOOP_END
+
   for (auto _ : state) {
      //run num kernels
      for(int j = 0; j < NUM_KERNELS; j++)
      {
         CARE_STREAMED_LOOP(res, i, 0, N) {
-            arr[i] = i;
+            arrays[j][i] = i;
         } CARE_STREAMED_LOOP_END
         res.wait();
      }
   }
-   arr.free();
+   for(int i = 0; i < NUM_KERNELS; i++) {arrays[i].free();}
 }

From 0459805e22d6bd4b33d13dc9064a32562a6ec847 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Thu, 3 Aug 2023 13:19:24 -0700
Subject: [PATCH 09/18] fixed loop size

---
 benchmarks/BenchmarkRaja.cpp | 43 ++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index 9e8b9437..014eed2c 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -20,48 +20,47 @@
 #include <climits>
 #include <cstdlib>
 
-#define NUM_KERNELS 4
+#define size 1000000
 
-using namespace care;
+namespace care{
 
 #if defined(CARE_GPUCC)
 //each kernel has a separate stream
 static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
   int N = state.range(0);
-   RAJA::resources::Cuda res_arr[NUM_KERNELS];
-   RAJA::resources::Event event_arr[NUM_KERNELS];
-   care::host_device_ptr<int> arrays[NUM_KERNELS];
+   RAJA::resources::Cuda res_arr[N];
+   RAJA::resources::Event event_arr[N];
+   care::host_device_ptr<int> arrays[16];
   for(int i = 0; i < N; i++)
   {
      RAJA::resources::Cuda res;
      res_arr[i] = res;
      RAJA::resources::Event e = res.get_event();
      event_arr[i] = e;
-      care::host_device_ptr<int> arr(N, "arr");
+      care::host_device_ptr<int> arr(size, "arr");
      arrays[i] = arr;
   }
 
   //warmup kernel
   RAJA::resources::Cuda warmup_res;
-   CARE_STREAMED_LOOP(warmup_res, i, 0 , N) {
+   CARE_STREAMED_LOOP(warmup_res, i, 0 , size) {
      arrays[0][i] = 0;
   } CARE_STREAMED_LOOP_END
 
   for (auto _ : state) {
      //run num kernels
      for(int j = 0; j < N; j++)
      {
-         CARE_STREAMED_LOOP(res_arr[j], i, 0 , N) {
+         CARE_STREAMED_LOOP(res_arr[j], i, 0 , size) {
           arrays[j][i] = i;
         } CARE_STREAMED_LOOP_END
      }
   }
-   for(int i = 0; i < NUM_KERNELS; i++) {arrays[i].free();}
+   for(int i = 0; i < N; i++) {arrays[i].free();}
 }
 
 // Register the function as a benchmark
-BENCHMARK(benchmark_gpu_loop_separate_streams)->Range(1, INT_MAX);
+BENCHMARK(benchmark_gpu_loop_separate_streams)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);
 
 //all kernels on one stream
 static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
   int N = state.range(0);
 
   RAJA::resources::Cuda res;
 
-   care::host_device_ptr<int> arrays[NUM_KERNELS];
+   care::host_device_ptr<int> arrays[16];
   for(int i = 0; i < N; i++)
   {
-      care::host_device_ptr<int> arr(N, "arr");
+      care::host_device_ptr<int> arr(size, "arr");
      arrays[i] = arr;
   }
 
   //warmup kernel
   RAJA::resources::Cuda warmup_res;
-   CARE_STREAMED_LOOP(warmup_res, i, 0, N) {
+   CARE_STREAMED_LOOP(warmup_res, i, 0, size) {
      arrays[0][i] = i;
   } CARE_STREAMED_LOOP_END
 
   for (auto _ : state) {
      //run num kernels
      for(int j = 0; j < N; j++)
      {
-         CARE_STREAMED_LOOP(res, i, 0, N) {
+         CARE_STREAMED_LOOP(res, i, 0, size) {
           arrays[j][i] = i;
         } CARE_STREAMED_LOOP_END
         res.wait();
      }
   }
-   for(int i = 0; i < NUM_KERNELS; i++) {arrays[i].free();}
+   for(int i = 0; i < N; i++) {arrays[i].free();}
 }
 
 // Register the function as a benchmark
-BENCHMARK(benchmark_gpu_loop_single_stream)->Range(1, INT_MAX);
+BENCHMARK(benchmark_gpu_loop_single_stream)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);
 
 #endif
 
+} //namespace care
+
 // Run the benchmarks
 BENCHMARK_MAIN();
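A side note on the registration change in patch 09 (illustrative only, not part of the patches): Google Benchmark's ->Range(lo, hi) sweeps the argument by powers of a multiplier (8 by default), while an explicit ->Arg(...) chain pins exact values — here the kernel count, which is why the loop bound moved from the NUM_KERNELS constant to state.range(0). A minimal standalone illustration:

   #include <benchmark/benchmark.h>

   static void bm_demo(benchmark::State& state) {
      for (auto _ : state) {
         // state.range(0) is the registered argument for this iteration set
         benchmark::DoNotOptimize(state.range(0));
      }
   }

   BENCHMARK(bm_demo)->Range(1, 1 << 12);      // runs 1, 8, 64, 512, 4096
   BENCHMARK(bm_demo)->Arg(1)->Arg(2)->Arg(4); // runs exactly 1, 2, 4

   BENCHMARK_MAIN();
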
From 4915fb4c6dd4fc894d15e91837944f6b881e8169 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Thu, 3 Aug 2023 15:09:02 -0700
Subject: [PATCH 10/18] added CARE_DLL to PluginData.h

---
 src/care/PluginData.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/care/PluginData.h b/src/care/PluginData.h
index b248cd95..37788a50 100644
--- a/src/care/PluginData.h
+++ b/src/care/PluginData.h
@@ -29,17 +29,17 @@ namespace care{
      static void setParallelContext(bool isParallel);
 
-      static bool isParallelContext();
+      CARE_DLL_API static bool isParallelContext();
 
-      static bool post_parallel_forall_action_registered(void * key);
+      CARE_DLL_API static bool post_parallel_forall_action_registered(void * key);
 
      static ActionMap get_post_parallel_forall_actions();
 
-      static void register_post_parallel_forall_action(void * key, std::function<void()> action);
+      CARE_DLL_API static void register_post_parallel_forall_action(void * key, std::function<void()> action);
 
      static void clear_post_parallel_forall_actions();
 
-      static std::vector<const chai::PointerRecord*> getActivePointers();
+      CARE_DLL_API static std::vector<const chai::PointerRecord*> getActivePointers();
 
      static void addActivePointer(const chai::PointerRecord* record);
      static void removeActivePointer(const chai::PointerRecord* record);
      static void clearActivePointers();
 
-      static int s_threadID;
+      CARE_DLL_API static int s_threadID;
 
   private:
      static const char * s_file_name;

From bea52591236fec9e9c962c6e5ed75b50828fc5dc Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Thu, 3 Aug 2023 15:10:35 -0700
Subject: [PATCH 11/18] Revert "added CARE_DLL to PluginData.h"

This reverts commit 4915fb4c6dd4fc894d15e91837944f6b881e8169.

---
 src/care/PluginData.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/care/PluginData.h b/src/care/PluginData.h
index 37788a50..b248cd95 100644
--- a/src/care/PluginData.h
+++ b/src/care/PluginData.h
@@ -29,17 +29,17 @@ namespace care{
      static void setParallelContext(bool isParallel);
 
-      CARE_DLL_API static bool isParallelContext();
+      static bool isParallelContext();
 
-      CARE_DLL_API static bool post_parallel_forall_action_registered(void * key);
+      static bool post_parallel_forall_action_registered(void * key);
 
      static ActionMap get_post_parallel_forall_actions();
 
-      CARE_DLL_API static void register_post_parallel_forall_action(void * key, std::function<void()> action);
+      static void register_post_parallel_forall_action(void * key, std::function<void()> action);
 
      static void clear_post_parallel_forall_actions();
 
-      CARE_DLL_API static std::vector<const chai::PointerRecord*> getActivePointers();
+      static std::vector<const chai::PointerRecord*> getActivePointers();
 
      static void addActivePointer(const chai::PointerRecord* record);
      static void removeActivePointer(const chai::PointerRecord* record);
      static void clearActivePointers();
 
-      CARE_DLL_API static int s_threadID;
+      static int s_threadID;
 
   private:
      static const char * s_file_name;

From 1f159f14a3ee1a5ab682ae0645706dfee3fb5a6c Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Fri, 4 Aug 2023 11:51:59 -0700
Subject: [PATCH 12/18] addressed comments

---
 benchmarks/BenchmarkRaja.cpp | 51 +++++++++++++++++-------------------
 src/care/forall.h            |  8 ++++--
 2 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index 014eed2c..f143eedd 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -22,41 +22,42 @@
 #define size 1000000
 
-namespace care{
-
 #if defined(CARE_GPUCC)
 //each kernel has a separate stream
 static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
   int N = state.range(0);
-   RAJA::resources::Cuda res_arr[N];
-   RAJA::resources::Event event_arr[N];
+   care::Resource res_arr[16];
+   RAJA::resources::Event event_arr[16];
   care::host_device_ptr<int> arrays[16];
   for(int i = 0; i < N; i++)
   {
-      RAJA::resources::Cuda res;
-      res_arr[i] = res;
-      RAJA::resources::Event e = res.get_event();
-      event_arr[i] = e;
-      care::host_device_ptr<int> arr(size, "arr");
-      arrays[i] = arr;
+      res_arr[i] = care::Resource();
+      event_arr[i] = res_arr[i].get_event();
+      arrays[i] = care::host_device_ptr<int>(size, "arr");
   }
 
   //warmup kernel
-   RAJA::resources::Cuda warmup_res;
-   CARE_STREAMED_LOOP(warmup_res, i, 0 , size) {
+   CARE_GPU_LOOP(i, 0 , size) {
      arrays[0][i] = 0;
-   } CARE_STREAMED_LOOP_END
+   } CARE_GPU_LOOP_END
+
+   care::gpuDeviceSynchronize(__FILE__, __LINE__);
 
   for (auto _ : state) {
      //run num kernels
+      //#pragma omp parallel for
      for(int j = 0; j < N; j++)
      {
         CARE_STREAMED_LOOP(res_arr[j], i, 0 , size) {
           arrays[j][i] = i;
         } CARE_STREAMED_LOOP_END
      }
+      care::gpuDeviceSynchronize(__FILE__, __LINE__);
+   }
+
+   for(int i = 0; i < N; i++){
+      arrays[i].free();
   }
-   for(int i = 0; i < N; i++) {arrays[i].free();}
 }
 
 // Register the function as a benchmark
 BENCHMARK(benchmark_gpu_loop_separate_streams)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);
 
 //all kernels on one stream
 static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
   int N = state.range(0);
 
-   RAJA::resources::Cuda res;
+   care::Resource res;
 
   care::host_device_ptr<int> arrays[16];
   for(int i = 0; i < N; i++)
   {
-      care::host_device_ptr<int> arr(size, "arr");
-      arrays[i] = arr;
+      arrays[i] = care::host_device_ptr<int>(size, "arr");
   }
 
   //warmup kernel
-   RAJA::resources::Cuda warmup_res;
-   CARE_STREAMED_LOOP(warmup_res, i, 0, size) {
+   CARE_GPU_LOOP(i, 0, size) {
      arrays[0][i] = i;
-   } CARE_STREAMED_LOOP_END
+   } CARE_GPU_LOOP_END
+
+   care::gpuDeviceSynchronize(__FILE__, __LINE__);
 
   for (auto _ : state) {
      //run num kernels
+      //#pragma omp parallel for
      for(int j = 0; j < N; j++)
      {
         CARE_STREAMED_LOOP(res, i, 0, size) {
           arrays[j][i] = i;
         } CARE_STREAMED_LOOP_END
-         res.wait();
      }
+      res.wait();
+   }
+
+   for(int i = 0; i < N; i++){
+      arrays[i].free();
   }
-   for(int i = 0; i < N; i++) {arrays[i].free();}
 }
 
 // Register the function as a benchmark
 BENCHMARK(benchmark_gpu_loop_single_stream)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);
 
 #endif
 
-} //namespace care
-
 // Run the benchmarks
 BENCHMARK_MAIN();
diff --git a/src/care/forall.h b/src/care/forall.h
index f6ed5f7b..27158080 100644
--- a/src/care/forall.h
+++ b/src/care/forall.h
@@ -30,22 +30,26 @@ namespace care {
 #if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
   static bool s_reverseLoopOrder = false;
 #endif
-
+   
   template <typename ExecutionPolicy>
   struct ExecutionPolicyToSpace {
      static constexpr const chai::ExecutionSpace value = chai::CPU;
   };
 
 #if defined(__CUDACC__)
+   typedef RAJA::resources::Cuda Resource;
   template <>
   struct ExecutionPolicyToSpace<RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>> {
      static constexpr const chai::ExecutionSpace value = chai::GPU;
   };
 #elif defined (__HIPCC__)
+   typedef RAJA::resources::Hip Resource;
   template <>
   struct ExecutionPolicyToSpace<RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>> {
      static constexpr const chai::ExecutionSpace value = chai::GPU;
   };
+#else
+   typedef RAJA::resources::Host Resource;
 #endif
 
 #if CARE_ENABLE_GPU_SIMULATION_MODE
@@ -208,7 +212,7 @@ namespace care {
 
 #if defined(CARE_GPUCC)
   template <typename LB>
-   void forall_with_stream(gpu, RAJA::resources::Cuda res, const char * fileName, const int lineNumber,
+   void forall_with_stream(gpu, Resource res, const char * fileName, const int lineNumber,
                           const int start, const int end, LB&& body) {

From ada1508b1e788108196ee23c40c82c6b11e6f265 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Fri, 4 Aug 2023 11:53:33 -0700
Subject: [PATCH 13/18] omp parallel for

---
 benchmarks/BenchmarkRaja.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index f143eedd..dbfc984e 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -45,7 +45,7 @@ static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
   for (auto _ : state) {
      //run num kernels
-      //#pragma omp parallel for
+      #pragma omp parallel for
      for(int j = 0; j < N; j++)
      {
@@ -84,7 +84,7 @@ static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
   for (auto _ : state) {
      //run num kernels
-      //#pragma omp parallel for
+      #pragma omp parallel for
      for(int j = 0; j < N; j++)
      {
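For orientation, a sketch of what the care::Resource typedef from patch 12 plus the OpenMP launch from patch 13 enable together (illustrative only, not part of the patches; the helper name fill_on_streams and the fixed bound of 16 resources are hypothetical, mirroring the benchmark). Each host thread drives its own stream, and the backend never has to be named:

   #include "care/DefaultMacros.h"
   #include "care/forall.h"
   #include "care/host_device_ptr.h"
   #include <omp.h>

   void fill_on_streams(care::host_device_ptr<int>* arrays, int num, int n)
   {
      // care::Resource is RAJA::resources::Cuda, ::Hip, or ::Host per build;
      // one resource (stream) per kernel, as in the benchmark (num <= 16).
      care::Resource res[16];

      #pragma omp parallel for
      for (int j = 0; j < num; ++j) {
         // Copy the handle so the device lambda captures it by value.
         care::host_device_ptr<int> arr = arrays[j];
         CARE_STREAMED_LOOP(res[j], i, 0, n) {
            arr[i] = i;
         } CARE_STREAMED_LOOP_END
      }

      for (int j = 0; j < num; ++j) {
         res[j].wait();   // drain each stream before returning
      }
   }
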
From aa901137250d7ca78d45bc4f933e0389b4df0766 Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Fri, 4 Aug 2023 12:21:19 -0700
Subject: [PATCH 14/18] try to make benchmarks more similar

---
 benchmarks/BenchmarkRaja.cpp | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index dbfc984e..6d5a2393 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -49,7 +49,7 @@ static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
      for(int j = 0; j < N; j++)
      {
         CARE_STREAMED_LOOP(res_arr[j], i, 0 , size) {
-            arrays[j][i] = i;
+            arrays[j][i] = sqrtf(i) + cosf(j) * powf(i, j);
         } CARE_STREAMED_LOOP_END
      }
      care::gpuDeviceSynchronize(__FILE__, __LINE__);
   }
@@ -67,7 +67,7 @@ BENCHMARK(benchmark_gpu_loop_separate_streams)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->
 static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
   int N = state.range(0);
 
-   care::Resource res;
-
-   care::host_device_ptr<int> arrays[16];
+   care::host_device_ptr<int> arrays[16];
   for(int i = 0; i < N; i++)
   {
      arrays[i] = care::host_device_ptr<int>(size, "arr");
   }
 
   //warmup kernel
   CARE_GPU_LOOP(i, 0, size) {
-      arrays[0][i] = i;
+      arrays[0][i] = 0;
   } CARE_GPU_LOOP_END
 
   care::gpuDeviceSynchronize(__FILE__, __LINE__);
 
   for (auto _ : state) {
      //run num kernels
-      #pragma omp parallel for
      for(int j = 0; j < N; j++)
      {
-         CARE_STREAMED_LOOP(res, i, 0, size) {
-            arrays[j][i] = i;
-         } CARE_STREAMED_LOOP_END
+         CARE_GPU_LOOP(i, 0, size) {
+            arrays[j][i] = sqrtf(i) + cosf(j) * powf(i, j);
+         } CARE_GPU_LOOP_END
      }
-      res.wait();
+      care::gpuDeviceSynchronize(__FILE__, __LINE__);
   }
 
   for(int i = 0; i < N; i++){
      arrays[i].free();
   }

From 65ce8b07daf4635f90b936017f40450785293e9a Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Fri, 4 Aug 2023 12:42:33 -0700
Subject: [PATCH 15/18] set num threads

---
 benchmarks/BenchmarkRaja.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index 6d5a2393..9c88f0b8 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -45,10 +45,11 @@ static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
   for (auto _ : state) {
      //run num kernels
+      omp_set_num_threads(16);
      #pragma omp parallel for
      for(int j = 0; j < N; j++)
      {
-         CARE_STREAMED_LOOP(res_arr[j], i, 0 , size) {
+         CARE_STREAMED_LOOP(res_arr[j], i, 0 , size) {
           arrays[j][i] = sqrtf(i) + cosf(j) * powf(i, j);
         } CARE_STREAMED_LOOP_END
      }
@@ -67,7 +68,7 @@ static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
   int N = state.range(0);
 
-   care::host_device_ptr<int> arrays[16];
+   care::host_device_ptr<int> arrays[16];
   for(int i = 0; i < N; i++)
   {
      arrays[i] = care::host_device_ptr<int>(size, "arr");
   }

From 25b888581f4610c37c5e6289d3ec6d89968af0fe Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Fri, 4 Aug 2023 13:27:29 -0700
Subject: [PATCH 16/18] forall resource overload

---
 src/care/forall.h | 48 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/src/care/forall.h b/src/care/forall.h
index 27158080..4b638607 100644
--- a/src/care/forall.h
+++ b/src/care/forall.h
@@ -101,6 +101,50 @@ namespace care {
      }
   }
 
+   ////////////////////////////////////////////////////////////////////////////////
+   ///
+   /// @author Peter Robinson, Alan Dayton
+   ///
+   /// @brief Loops over the given indices and calls the loop body with each index.
+   ///        This overload is CHAI and RAJA aware and sets the execution space accordingly.
+   ///
+   /// @arg[in] policy Used to choose this overload of forall
+   /// @arg[in] res Resource to be used
+   /// @arg[in] fileName The name of the file where this function is called
+   /// @arg[in] lineNumber The line number in the file where this function is called
+   /// @arg[in] start The starting index (inclusive)
+   /// @arg[in] end The ending index (exclusive)
+   /// @arg[in] body The loop body to execute at each index
+   ///
+   ////////////////////////////////////////////////////////////////////////////////
+   template <typename ExecutionPolicy, typename LB>
+   void forall(ExecutionPolicy /* policy */, Resource res, const char * fileName, const int lineNumber,
+               const int start, const int end, LB&& body) {
+      const int length = end - start;
+
+      if (length != 0) {
+         PluginData::setFileName(fileName);
+         PluginData::setLineNumber(lineNumber);
+
+
+#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
+         RAJA::RangeStrideSegment rangeSegment =
+            s_reverseLoopOrder ?
+            RAJA::RangeStrideSegment(end - 1, start - 1, -1) :
+            RAJA::RangeStrideSegment(start, end, 1);
+#else
+         RAJA::RangeSegment rangeSegment = RAJA::RangeSegment(start, end);
+#endif
+
+#if CARE_ENABLE_GPU_SIMULATION_MODE
+         RAJA::forall<RAJA::seq_exec>(res, rangeSegment, std::forward<LB>(body));
+#else
+         RAJA::forall<ExecutionPolicy>(res, rangeSegment, std::forward<LB>(body));
+#endif
+      }
+   }
+
+
 ////////////////////////////////////////////////////////////////////////////////
    ///
    /// @author Alan Dayton
@@ -222,10 +266,10 @@ namespace care {
      forall(gpu_simulation{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
 #elif defined(__CUDACC__)
      forall(RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
-             res, RAJA::RangeSegment(start, end), std::forward<LB>(body));
+             res, fileName, lineNumber, start, end, std::forward<LB>(body));
 #elif defined(__HIPCC__)
     forall(RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
-             res, RAJA::RangeSegment(start, end), std::forward<LB>(body));
+             res, fileName, lineNumber, start, end, std::forward<LB>(body));
 #else
     forall(RAJA::seq_exec{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
 #endif

From f761e5fa8f79b02fb93860ad1f0c5d0f17c773eb Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Fri, 4 Aug 2023 14:23:03 -0700
Subject: [PATCH 17/18] N threads

---
 benchmarks/BenchmarkRaja.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/BenchmarkRaja.cpp b/benchmarks/BenchmarkRaja.cpp
index 9c88f0b8..8b3b3285 100644
--- a/benchmarks/BenchmarkRaja.cpp
+++ b/benchmarks/BenchmarkRaja.cpp
@@ -45,7 +45,7 @@ static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
   for (auto _ : state) {
      //run num kernels
-      omp_set_num_threads(16);
+      omp_set_num_threads(N);
      #pragma omp parallel for
      for(int j = 0; j < N; j++)
      {

From e28bfb5a3e233201cc554968f7d11d6e630c37df Mon Sep 17 00:00:00 2001
From: Neela Kausik
Date: Fri, 4 Aug 2023 14:38:38 -0700
Subject: [PATCH 18/18] resource template parameter

---
 src/care/forall.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/care/forall.h b/src/care/forall.h
index 4b638607..20036cd4 100644
--- a/src/care/forall.h
+++ b/src/care/forall.h
@@ -117,8 +117,8 @@ namespace care {
-   template <typename ExecutionPolicy, typename LB>
-   void forall(ExecutionPolicy /* policy */, Resource res, const char * fileName, const int lineNumber,
+   template <typename ExecutionPolicy, typename R, typename LB>
+   void forall(ExecutionPolicy /* policy */, R res, const char * fileName, const int lineNumber,
               const int start, const int end, LB&& body) {
      const int length = end - start;