From 9008e78364dace0deeb34ebd9780b343998b48bf Mon Sep 17 00:00:00 2001 From: Saransh Chopra Date: Mon, 1 Dec 2025 14:34:38 +0100 Subject: [PATCH 1/3] feat: concat kernel for Alpaka --- src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx | 54 ++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx index c828668..8db0467 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx @@ -316,7 +316,59 @@ return out.str(); } - }; + + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) { + std::string op; + op = "\n//------ CONCAT_KERNEL_ALPAKA\n"; + op += SP + "struct ConcatKernel {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * const * input_ptrs,"; + op += " T * output, std::size_t const * input_strides, std::size_t const * output_strides, "; + op += "std::size_t const * axis_offsets, std::size_t const * axis_sizes, std::size_t const num_inputs, "; + op += "std::size_t const concat_axis, std::size_t const ndim) const {\n"; + op += SP + SP + SP + SP + "auto elements = alpaka::uniformElementsND(acc, alpaka::Vec(output_shape));\n"; + op += SP + SP + SP + SP + "for (auto const& elem : elements) {\n"; + op += SP + SP + SP + SP + SP + "size_t out_idx = 0;\n"; + op += SP + SP + SP + SP + SP + "size_t in_idx = 0;\n"; + op += SP + SP + SP + SP + SP + "size_t axis_coord = elem[concat_axis];\n\n"; + op += SP + SP + SP + SP + SP + "size_t chosen_input = 0;\n"; + op += SP + SP + SP + SP + SP + "for (size_t i = 0; i < num_inputs; ++i) {\n"; + op += SP + SP + SP + SP + SP + SP + "size_t start = axis_offsets[i];\n"; + op += SP + SP + SP + SP + SP + SP + "size_t size = axis_sizes[i];\n"; + op += SP + SP + SP + SP + SP + SP + "if (axis_coord >= start && axis_coord < start + size) { chosen_input = i; break; }\n"; + op += SP + SP + SP + SP + SP + "}\n"; + op += SP + SP + SP + SP + SP + "for (int d = 0; d < (int)ndim; ++d) {\n"; + op += SP + SP + SP + SP + SP + SP + "size_t out_coord = elem[d];\n"; + op += SP + SP + SP + SP + SP + SP + "size_t in_coord = (d == (int)concat_axis) ? (out_coord - axis_offsets[chosen_input]) : out_coord;\n"; + op += SP + SP + SP + SP + SP + SP + "in_idx += in_coord * input_strides[d];\n"; + op += SP + SP + SP + SP + SP + SP + "out_idx += out_coord * output_strides[d];\n"; + op += SP + SP + SP + SP + SP + "}\n"; + op += SP + SP + SP + SP + SP + "T const * src = input_ptrs[chosen_input];\n"; + op += SP + SP + SP + SP + SP + "output[out_idx] = src[in_idx];\n"; + op += SP + SP + SP + SP + "}\n"; // end for elements + op += SP + SP + "}\n"; // end operator() + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "ConcatKernel concatKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Transpose called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertDynamicShapeToLength(fShape); + out << "\n//------ CONCAT_GPU_ALPAKA\n"; + out << SP << "alpaka::WorkDivMembers workDiv_"<::all("<<(stoi(length)+256-1)/256<<"), alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX << ", concatKernel, alpaka::getPtrNative(deviceBuf_" << fNX << "), static_cast(" << length << ")); \n"; + return out.str(); + } + }; }//SOFIE From 80c4fc5fe01770bd95a540ab447702b235b239d1 Mon Sep 17 00:00:00 2001 From: Saransh Chopra Date: Tue, 9 Dec 2025 17:21:54 +0100 Subject: [PATCH 2/3] fix kernel and call signature + do not depend on offsets --- src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx | 57 +++++++++++-------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx index 8db0467..a232167 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx @@ -322,32 +322,39 @@ op = "\n//------ CONCAT_KERNEL_ALPAKA\n"; op += SP + "struct ConcatKernel {\n"; op += SP + SP + "template\n"; - op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * const * input_ptrs,"; - op += " T * output, std::size_t const * input_strides, std::size_t const * output_strides, "; - op += "std::size_t const * axis_offsets, std::size_t const * axis_sizes, std::size_t const num_inputs, "; - op += "std::size_t const concat_axis, std::size_t const ndim) const {\n"; - op += SP + SP + SP + SP + "auto elements = alpaka::uniformElementsND(acc, alpaka::Vec(output_shape));\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* const* input_ptrs, T* output,"; + op += " std::size_t const* const* input_strides_ptrs, std::size_t const* axis_sizes,"; + op += " std::size_t num_inputs, std::size_t axis, std::size_t const* output_strides,"; + op += " std::size_t const* output_shape) const {\n"; + op += SP + SP + SP + SP + "using DimAcc = alpaka::Dim;\n"; + op += SP + SP + SP + SP + "using IdxAcc = alpaka::Idx;\n"; + op += SP + SP + SP + SP + "constexpr std::size_t D = static_cast(DimAcc::value);\n"; + op += SP + SP + SP + SP + "alpaka::Vec shapeVec{};\n"; + op += SP + SP + SP + SP + "for (std::size_t d = 0; d < D; ++d) shapeVec[d] = output_shape[d];\n"; + op += SP + SP + SP + SP + "auto elements = alpaka::uniformElementsND(acc, shapeVec);\n"; op += SP + SP + SP + SP + "for (auto const& elem : elements) {\n"; - op += SP + SP + SP + SP + SP + "size_t out_idx = 0;\n"; - op += SP + SP + SP + SP + SP + "size_t in_idx = 0;\n"; - op += SP + SP + SP + SP + SP + "size_t axis_coord = elem[concat_axis];\n\n"; - op += SP + SP + SP + SP + SP + "size_t chosen_input = 0;\n"; - op += SP + SP + SP + SP + SP + "for (size_t i = 0; i < num_inputs; ++i) {\n"; - op += SP + SP + SP + SP + SP + SP + "size_t start = axis_offsets[i];\n"; + op += SP + SP + SP + SP + SP + "std::size_t out_idx = 0;\n"; + op += SP + SP + SP + SP + SP + "for (std::size_t d = 0; d < D; ++d) out_idx += idx[d] * output_strides[d];\n"; + op += SP + SP + SP + SP + SP + "std::size_t axis_coord = idx[axis];\n"; + op += SP + SP + SP + SP + SP + "std::size_t chosen = 0;\n"; + op += SP + SP + SP + SP + SP + "std::size_t offset = 0;\n"; + op += SP + SP + SP + SP + SP + "for (std::size_t k = 0; k < num_inputs; ++k) {\n"; + op += SP + SP + SP + SP + SP + SP + "std::size_t sz = axis_sizes[k];\n"; op += SP + SP + SP + SP + SP + SP + "size_t size = axis_sizes[i];\n"; - op += SP + SP + SP + SP + SP + SP + "if (axis_coord >= start && axis_coord < start + size) { chosen_input = i; break; }\n"; + op += SP + SP + SP + SP + SP + SP + "if (axis_coord < offset + sz) { chosen = k; break; }\n"; + op += SP + SP + SP + SP + SP + SP + "offset += sz;\n"; op += SP + SP + SP + SP + SP + "}\n"; - op += SP + SP + SP + SP + SP + "for (int d = 0; d < (int)ndim; ++d) {\n"; - op += SP + SP + SP + SP + SP + SP + "size_t out_coord = elem[d];\n"; - op += SP + SP + SP + SP + SP + SP + "size_t in_coord = (d == (int)concat_axis) ? (out_coord - axis_offsets[chosen_input]) : out_coord;\n"; - op += SP + SP + SP + SP + SP + SP + "in_idx += in_coord * input_strides[d];\n"; - op += SP + SP + SP + SP + SP + SP + "out_idx += out_coord * output_strides[d];\n"; + op += SP + SP + SP + SP + SP + "std::size_t in_idx = 0;"; + op += SP + SP + SP + SP + SP + "for (std::size_t d = 0; d < D; ++d) {\n"; + op += SP + SP + SP + SP + SP + SP + "std::size_t coord_out = idx[d];\n"; + op += SP + SP + SP + SP + SP + SP + "std::size_t coord_in = (d == axis) ? (coord_out - offset) : coord_out;\n"; + op += SP + SP + SP + SP + SP + SP + "in_idx += coord_in * input_strides_ptrs[chosen][d];\n"; op += SP + SP + SP + SP + SP + "}\n"; - op += SP + SP + SP + SP + SP + "T const * src = input_ptrs[chosen_input];\n"; + op += SP + SP + SP + SP + SP + "T const* src = input_ptrs[chosen];\n"; op += SP + SP + SP + SP + SP + "output[out_idx] = src[in_idx];\n"; - op += SP + SP + SP + SP + "}\n"; // end for elements - op += SP + SP + "}\n"; // end operator() - op += SP + "};\n"; + op += SP + SP + SP + SP + "}\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "};\n"; return op; } @@ -359,13 +366,15 @@ std::string Generate_GPU_ALPAKA(std::string OpName) override { OpName = "op_" + OpName; if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Operator Transpose called to Generate without being initialized first"); + throw std::runtime_error("TMVA SOFIE Operator Concat called to Generate without being initialized first"); } std::stringstream out; auto length = ConvertDynamicShapeToLength(fShape); out << "\n//------ CONCAT_GPU_ALPAKA\n"; - out << SP << "alpaka::WorkDivMembers workDiv_"<::all("<<(stoi(length)+256-1)/256<<"), alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; - out << SP << "alpaka::exec(queue, workDiv_" << fNX << ", concatKernel, alpaka::getPtrNative(deviceBuf_" << fNX << "), static_cast(" << length << ")); \n"; + out << SP << "alpaka::WorkDivMembers workDiv_"<::all("<<(stoi(length)+256-1)/256<<"), alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fInputs << ", concatKernel, alpaka::getPtrNative(deviceBuf_" << fInputs << "), alpaka::getPtrNative(deviceBuf_" << fOutput << "), " + << ConvertShapeToString(UTILITY::ComputeStrideFromShape(fInputShapes)) << ", " << fInputShapes[fAxis] << ", " << fInputs.size() << ", " << fAxis << ", " + << ConvertShapeToString(UTILITY::ComputeStrideFromShape(fOutputShape)) << ", " << fOutputShape << ");\n"; return out.str(); } }; From 0075a97a5e22bf9396fae5e21995c3f08d9e3159 Mon Sep 17 00:00:00 2001 From: Saransh Chopra Date: Sun, 14 Dec 2025 15:02:14 +0100 Subject: [PATCH 3/3] fix build --- src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx | 12 +++--- src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx | 5 +++ src/SOFIE_core/src/SOFIE_common.cxx | 40 +++++++++++++++++++ 3 files changed, 51 insertions(+), 6 deletions(-) diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx index a232167..892e4a5 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx @@ -365,16 +365,16 @@ std::string Generate_GPU_ALPAKA(std::string OpName) override { OpName = "op_" + OpName; - if (fShape.empty()) { + if (fOutputShape.empty()) { throw std::runtime_error("TMVA SOFIE Operator Concat called to Generate without being initialized first"); } std::stringstream out; - auto length = ConvertDynamicShapeToLength(fShape); + auto length = ConvertDynamicShapeToLength(fOutputShape); out << "\n//------ CONCAT_GPU_ALPAKA\n"; - out << SP << "alpaka::WorkDivMembers workDiv_"<::all("<<(stoi(length)+256-1)/256<<"), alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; - out << SP << "alpaka::exec(queue, workDiv_" << fInputs << ", concatKernel, alpaka::getPtrNative(deviceBuf_" << fInputs << "), alpaka::getPtrNative(deviceBuf_" << fOutput << "), " - << ConvertShapeToString(UTILITY::ComputeStrideFromShape(fInputShapes)) << ", " << fInputShapes[fAxis] << ", " << fInputs.size() << ", " << fAxis << ", " - << ConvertShapeToString(UTILITY::ComputeStrideFromShape(fOutputShape)) << ", " << fOutputShape << ");\n"; + out << SP << "alpaka::WorkDivMembers workDiv_"<::all("<< length << " + 256 - 1) / 256), alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << ConvertShapeToString(fInputs) << ", concatKernel, alpaka::getPtrNative(deviceBuf_" << ConvertShapeToString(fInputs) << "), alpaka::getPtrNative(deviceBuf_" << fOutput << "), " + << ConvertShapeToString(UTILITY::ComputeStrideFromShape(fInputShapes)) << ", " << ConvertShapeToString(fInputShapes[fAxis]) << ", " << fInputs.size() << ", " << fAxis << ", " + << ConvertShapeToString(UTILITY::ComputeStrideFromShape(fOutputShape)) << ", " << ConvertShapeToString(fOutputShape) << ");\n"; return out.str(); } }; diff --git a/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx b/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx index 8b9727b..b3a56e9 100644 --- a/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx +++ b/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx @@ -202,8 +202,12 @@ inline std::size_t ConvertShapeToLength(const std::vector & shape){ } std::string ConvertShapeToString(const std::vector & shape); +std::string ConvertStringShapeToString(const std::vector & shape); +std::string ConvertShapeToString(const std::vector & shape); std::string ConvertDimShapeToString(const std::vector & shape); std::string ConvertShapeToString(const std::vector & shape); +std::string ConvertVectorDimShapeToString(const std::vector> & shapes); +std::string ConvertShapeToString(const std::vector> & shapes); @@ -513,6 +517,7 @@ void UnidirectionalBroadcast(const T* data, const std::vector& shape, co /// compute stride of a tensor given its shape (assume layout is row-major) std::vector ComputeStrideFromShape(const std::vector & shape); std::vector ComputeStrideFromShape(const std::vector & shape); +std::vector> ComputeStrideFromShape(const std::vector> & shapes); /// function to check if a >> 0 and a < MAX using a single comparison //// use trick casting to unsigned values so it becomes a single comparison diff --git a/src/SOFIE_core/src/SOFIE_common.cxx b/src/SOFIE_core/src/SOFIE_common.cxx index 05f873b..2ccf49f 100644 --- a/src/SOFIE_core/src/SOFIE_common.cxx +++ b/src/SOFIE_core/src/SOFIE_common.cxx @@ -116,6 +116,20 @@ std::string ConvertShapeToString(const std::vector & shape) { return out.str(); } +std::string ConvertStringShapeToString(const std::vector & shape) { + std::stringstream out; + out << "{ "; + for (size_t i = 0; i < shape.size(); i++) { + out << shape[i]; + if (i < shape.size()-1) out << " , "; + } + out << " }"; + return out.str(); +} +std::string ConvertShapeToString(const std::vector & shape) { + return ConvertStringShapeToString(shape); +} + std::string ConvertDimShapeToString(const std::vector & shape) { std::stringstream out; out << "{ "; @@ -164,6 +178,20 @@ std::string ConvertDynamicShapeToLength(const std::vector & shape) { return ConvertDimShapeToLength(shape); } +std::string ConvertVectorDimShapeToString(const std::vector> & shapes) { + std::stringstream out; + out << "{ "; + for (size_t i = 0; i < shapes.size(); i++) { + out << ConvertShapeToString(shapes[i]); + if (i < shapes.size() - 1) out << " , "; + } + out << " }"; + return out.str(); +} +std::string ConvertShapeToString(const std::vector> & shapes) { + return ConvertVectorDimShapeToString(shapes); +} + namespace{ template @@ -537,4 +565,16 @@ std::vector UTILITY::ComputeStrideFromShape(const std::vector & shape) return strides; } +std::vector> UTILITY::ComputeStrideFromShape(const std::vector> & shapes) { + std::vector> all_strides; + all_strides.reserve(shapes.size()); + + // Process each shape individually using the existing single-vector implementation + for (const auto& shape : shapes) { + all_strides.push_back(ComputeStrideFromShape(shape)); + } + + return all_strides; +} + } // namespace SOFIE