diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..182ccd4 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,81 @@ +{ + "files.associations": { + "*.icc": "cpp", + "limits": "cpp", + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "cstdint": "cpp", + "deque": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "regex": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "cinttypes": "cpp", + "typeinfo": "cpp", + "charconv": "cpp", + "chrono": "cpp", + "condition_variable": "cpp", + "list": "cpp", + "ratio": "cpp", + "future": "cpp", + "mutex": "cpp", + "semaphore": "cpp", + "shared_mutex": "cpp", + "span": "cpp", + "stop_token": "cpp", + "thread": "cpp", + "cfenv": "cpp", + "variant": "cpp", + "format": "cpp", + "any": "cpp", + "source_location": "cpp", + "run_inference_particle_net.C": "cpp", + "test.C": "cpp" + } +} diff --git a/README.md b/README.md index 97902f8..597cb56 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,11 @@ source setup.sh ``` Now ROOT should also access the SOFIE libraries while it runs. 
This helps to accelerate development. Submit your developments here and we will proceed with the developments in ROOT carefull. - +3. To enable testing generated code with alpaka implementations, build using the following command: +```bash +cmake -Dtesting=ON -DENABLE_ALPAKA_TESTS=ON -DCMAKE_INSTALL_PREFIX=../install -DCMAKE_BUILD_TYPE=RelWithDebInfo .. +``` +The default architecture is CUDA, but can be configured using an additional `-DALPAKA_BACKEND=hip` CMake option. ## Inspiration The standalone version of SOFIE is developed with inspiration from the standalone version of RooFit developed by Jonas Rembser that can be found [here](https://github.com/guitargeek/roofit). diff --git a/settings-dev.cmake b/settings-dev.cmake new file mode 100644 index 0000000..6a8496f --- /dev/null +++ b/settings-dev.cmake @@ -0,0 +1,7 @@ +set (CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "" FORCE) +set (CMAKE_INSTALL_PREFIX ../install CACHE STRING "" FORCE) +set (CMAKE_INSTALL_BINDIR bin CACHE STRING "" FORCE) +set (CMAKE_INSTALL_INCLUDEDIR include CACHE STRING "" FORCE) +set (CMAKE_INSTALL_LIBDIR lib CACHE STRING "" FORCE) +set (testing ON CACHE BOOL "" FORCE) +set (mathmore ON CACHE BOOL "" FORCE) diff --git a/src/.vscode/settings.json b/src/.vscode/settings.json new file mode 100644 index 0000000..8bc121a --- /dev/null +++ b/src/.vscode/settings.json @@ -0,0 +1,61 @@ +{ + "files.associations": { + "*.icc": "cpp", + "iostream": "cpp", + "ostream": "cpp", + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "bitset": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "cstdint": "cpp", + "deque": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + 
"algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "regex": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "istream": "cpp", + "limits": "cpp", + "new": "cpp", + "numbers": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "cinttypes": "cpp", + "typeinfo": "cpp" + } +} \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c48e8d1..102ca3b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,3 +8,4 @@ set(sofie_legacy_eval_backend ON CACHE BOOL "" FORCE) add_subdirectory(SOFIE_core) add_subdirectory(SOFIE_parsers) +add_subdirectory(utils) diff --git a/src/SOFIE_core/CMakeLists.txt b/src/SOFIE_core/CMakeLists.txt index 7297957..4cab8e0 100644 --- a/src/SOFIE_core/CMakeLists.txt +++ b/src/SOFIE_core/CMakeLists.txt @@ -76,6 +76,7 @@ list(TRANSFORM sources_headers PREPEND "inc/") set(sources_cxx src/RModel_Base.cxx src/RModel.cxx + src/RModel_ALPAKA.cxx src/RModel_GNN.cxx src/RModel_GraphIndependent.cxx src/RFunction.cxx @@ -87,18 +88,24 @@ set(sources_cxx target_sources(SOFIE_core PRIVATE ${sources_headers} ${sources_cxx}) target_include_directories(SOFIE_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/inc) +target_link_libraries(SOFIE_core PUBLIC utils) target_link_libraries(SOFIE_core PUBLIC Tree Core RIO ) -ROOT_GENERATE_DICTIONARY(G__SOFIE ${sources_headers} +ROOT_GENERATE_DICTIONARY(G__SOFIE_core ${sources_headers} LINKDEF inc/LinkDef.h MODULE SOFIE_core OPTIONS --deep ) +# Install the dictionaries. 
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_core_rdict.pcm + ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_core.rootmap + DESTINATION lib) + install(TARGETS SOFIE_core LIBRARY DESTINATION lib ) diff --git a/src/SOFIE_core/README.md b/src/SOFIE_core/README.md index 033cad4..2259d7a 100644 --- a/src/SOFIE_core/README.md +++ b/src/SOFIE_core/README.md @@ -25,7 +25,6 @@ SOFIE works in a parser-generator working architecture. With SOFIE, the user get From ROOT command line, or in a ROOT macro, we can proceed with an ONNX model: ```c++ -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; SOFIE::RModel model = parser.Parse(“./example_model.onnx”); model.Generate(); @@ -73,7 +72,6 @@ SOFIE also supports generating inference code with RDataFrame as inputs, refer t Here is the updated list of supported ONNX operators. You can obtain this list by doing ```cpp -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; std::vector supportedOperators = parser.GetRegisteredOperators(); ``` @@ -164,7 +162,6 @@ The above operators are supported for tensors of the following types: You can also check your model whether all operators are implemented by doing the following: ```c++ -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; parser.CheckModel("example_model.ONNX"); ``` diff --git a/src/SOFIE_core/inc/SOFIE/RFunction.hxx b/src/SOFIE_core/inc/SOFIE/RFunction.hxx index 53c30e3..f79691a 100644 --- a/src/SOFIE_core/inc/SOFIE/RFunction.hxx +++ b/src/SOFIE_core/inc/SOFIE/RFunction.hxx @@ -3,6 +3,7 @@ #include "SOFIE/RModel_Base.hxx" #include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" #include #include diff --git a/src/SOFIE_core/inc/SOFIE/RModel.hxx b/src/SOFIE_core/inc/SOFIE/RModel.hxx index 79541af..ed28b48 100644 --- a/src/SOFIE_core/inc/SOFIE/RModel.hxx +++ b/src/SOFIE_core/inc/SOFIE/RModel.hxx @@ -16,14 +16,21 @@ private: int fVerbose = 0; int fBatchSize = -1; long fReadPos = 0; // reading file position + size_t 
fConstantTensorSize = 0; // size (in Bytes) of the allocated constant tensors + size_t fWeightsTensorSize = 0; // size (in Bytes) of the allocated weight tensors + size_t fOtherTensorSize = 0; // size (in Bytes) of intermediate tensors which are not managed by the memory pool + + OptimizationLevel fOptimizationLevel = OptimizationLevel::kExtended; std::unordered_map fInputTensorInfos; // input tensors where shape may not fully defined or other graph inputs? std::unordered_map fReadyInputTensorInfos; // input tensors where shape is full defined std::unordered_map fInitializedTensors; std::unordered_map fIntermediateTensorInfos; std::unordered_map fDynamicTensorInfos; + std::unordered_map, bool>> fShapeTensors; // constant tensors describing a shape std::unordered_map fShapeParams; // parameters defining the dynamic shape (e.g. batch size), store also its default value + std::vector fDimShapeNames; // parameter names used to define the shapes std::vector fOutputTensorNames; std::vector fInputTensorNames; // input tensor names using ONNX order @@ -58,9 +65,14 @@ public: int Verbose() const { return fVerbose;} - const std::vector &GetTensorShape(std::string name) const; - std::vector GetDynamicTensorShape(std::string name) const; - const ETensorType &GetTensorType(std::string name) const; + const std::vector &GetTensorShape(const std::string & name) const; + std::vector GetDimTensorShape(const std::string & name) const; + const ETensorType &GetTensorType(const std::string & name) const; + std::vector GetDynamicTensorShape(const std::string & name) const ; + + // get the values for the tensor representing a shape + const std::vector & GetShapeTensorValues(const std::string & tensor_name) const; + bool CheckIfTensorAlreadyExist(std::string tensor_name); void AddInputTensorInfo(std::string input_name, ETensorType type, std::vector shape); @@ -102,6 +114,8 @@ public: AddInitializedTensor(tensor_name, GetTemplatedType(T()), shape, data); } + void AddShapeTensor(const 
std::string & name, const std::vector & shapeValues, bool scalar = false); + // add and initialize subgraph to the model void InitializeSubGraph(std::shared_ptr graph); @@ -118,13 +132,15 @@ public: bool IsDimInputTensor(const std::string &name) const; // check if tensor is a fully specified input tensor bool IsReadyInputTensor(const std::string &name) const; + /// check if a tensor is a shape tensor + bool IsShapeTensor(const std::string & name) const; // Add intermediate tensor void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector dim_shape); void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector shape); // Add an intermediate dynamic tensor void AddDynamicTensor(std::string tensor_name, ETensorType type, std::vector shape); - + void AddShapeParam(const std::string & name, size_t def_value = 0); void AddInputTensorName(std::string name); void AddOutputTensorNameList(std::vector output_tensor_names); void @@ -132,6 +148,8 @@ public: void UpdateInitializedTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data); std::shared_ptr GetInitializedTensorData(std::string tensor_name); + template + std::vector GetTensorData(const std::string & name); void Initialize(int batchSize = -1, bool verbose = false); void Initialize(const std::map & inputParams, bool verbose = false); @@ -141,34 +159,64 @@ public: { Generate(static_cast>(options), batchSize, pos, verbose); } + void GenerateGPU_ALPAKA(std::underlying_type_t options, int batchSize = -1, bool verbose = false); + void GenerateGPU_ALPAKA(Options options = Options::kDefault, int batchSize = -1, bool verbose = false) + { + GenerateGPU_ALPAKA(static_cast>(options), batchSize, verbose); + } // generate the infer function signature. If isdecl= false generate the calling infer function // used to infer the sub-graphs std::string GenerateInferSignature(bool isdecl = true); + // generate the infer function signature for inference on ALPAKA. 
If isdecl= false generate the calling infer function + // used to infer the sub-graphs + std::string GenerateInferSignature_GPU_ALPAKA(bool isdecl = true); + + void RemoveIntermediateTensor(const std::string& tensor_name){ + fIntermediateTensorInfos.erase(tensor_name); + } + // calculate total intermediate memory and position intermediate tensor addresses - std::string AllocateIntermediateMemory(std::span op_output_tensors); - void CheckAndFlushIntermediateMemory(std::span op_output_tensors, const size_t& op_idx); + std::string AllocateIntermediateMemory(std::span op_output_tensors); + void CheckAndFlushIntermediateMemory(std::span op_output_tensors, const size_t& op_idx); protected: // internal functions // generate code for the initialized tensors void GenerateInitializedTensorInfo(); + + void GenerateInitializedTensorInfo_GPU_ALPAKA(); // generate code for the intermediate tensors void GenerateIntermediateTensorInfo(); + + // generate code for the temporary initialized tensors containers + void GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA(); + // generate code for the dynamic tensors void GenerateDynamicTensorInfo(); + + void GenerateDynamicTensorInfo_GPU_ALPAKA(); // generate code for declarations needed by operators void GenerateOperatorDeclarations(); // generate code for inference void GenerateOutput(); + + void GenerateOutput_GPU_ALPAKA(); + + void MoveInitializedTensorsToBuffers_ALPAKA(); // generate code for initializing memory pool for intermediate tensors void GenerateIntermediateMemoryPool(); // Generate all session code void GenerateSessionCode(); + void GenerateSessionCode_GPU_ALPAKA(); + void GenerateGPU_ALPAKA_Buffers(); + + void CheckAndFuseOperators(); public: const std::vector &GetInputTensorNames() const { return fInputTensorNames; } const std::vector &GetOutputTensorNames() const { return fOutputTensorNames; } + const std::vector & GetDimShapeNames() const { return fDimShapeNames; } void ReadInitializedTensorsFromFile(long); long 
WriteInitializedTensorsToFile(std::string filename = ""); @@ -203,6 +251,21 @@ public: ClassDefNV(RModel, 3); }; +template +inline std::vector RModel::GetTensorData(const std::string & name) { + if (!IsInitializedTensor(name)) return std::vector{}; + T * data = static_cast(GetInitializedTensorData(name).get()); + size_t size = ConvertShapeToLength(GetTensorShape(name)); + return std::vector(data, data+size); +} + +template<> +inline std::vector RModel::GetTensorData(const std::string & name) { + if (!IsShapeTensor(name)) return std::vector{}; + return GetShapeTensorValues(name); +} + + } // namespace SOFIE #endif // SOFIE_RMODEL diff --git a/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx b/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx index f8a9d34..deac58b 100644 --- a/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx +++ b/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx @@ -12,7 +12,6 @@ #include #include #include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" #include "TBuffer.h" @@ -27,10 +26,26 @@ enum class Options { kGNNComponent = 0x10, }; +// Optimization levels inspired by ONNXRuntime. +// We only get Operator Fusion with the Basic, and +// memory reuse with Extended. 
kExtended is enabled +// by default +enum class OptimizationLevel { + kBasic = 0x0, + kExtended = 0x1, +}; + enum class WeightFileType { None, RootBinary, Text }; -std::underlying_type_t operator|(Options opA, Options opB); -std::underlying_type_t operator|(std::underlying_type_t opA, Options opB); + +inline std::underlying_type_t operator|(Options opA, Options opB) { + return static_cast>(opA) | + static_cast>(opB); +} + +inline std::underlying_type_t operator|(std::underlying_type_t opA, Options opB) { + return opA | static_cast>(opB); +} class RModel_Base { @@ -53,6 +68,43 @@ protected: bool fIsGNN = false; bool fIsGNNComponent = false; + // Function to generate the code for declaring and initializing constant tensors + // This is for tensors which are not part of weight files and can be created from the Constant operator + template + std::string GenerateConstantTensorCode(const std::pair &t) + { + std::stringstream strs; + std::string type = ConvertTypeToString(t.second.type()); + size_t length = ConvertShapeToLength(t.second.shape()); + // avoid using stack sizes for constant tensors to reduce compilation time + bool allocateOnStack = (length > 100) ? 
false : true; + + const T *data = t.second.data(); + + // and check if all values are the same + bool sameData = false; + // for non stack allocation check if data are the same + if (!allocateOnStack && length > 1) { + size_t idx = 1; + do { + sameData = (data[idx] == data[idx - 1]); + idx++; + } while (sameData && idx < length); + } + if (allocateOnStack) { + strs << type << " tensor_" << t.first << "[" << length << "] = " << ConvertValuesToString(length, data) << ";\n"; + } else { + strs << "std::vector<" << type << "> fTensor_" << t.first << " = "; + if (sameData) + strs << "std::vector<" << type << ">(" << length << ", " << ConvertValToString(data[0]) << ");\n"; + else { + strs << ConvertValuesToString(length, data) << ";\n"; + } + strs << "const " << type << " * tensor_" + t.first + " = fTensor_" + t.first + ".data();\n"; + } + return strs.str(); + } + public: /** Default constructor. Needed to allow serialization of ROOT objects. See @@ -82,6 +134,7 @@ public: fCustomOpHeaders.insert(filename); } void GenerateHeaderInfo(std::string &hgname); + void GenerateHeaderInfo_GPU_ALPAKA(std::string& hgname); void PrintGenerated() { std::cout << fGC; } std::string ReturnGenerated() { return fGC; } diff --git a/src/SOFIE_core/inc/SOFIE/ROperator.hxx b/src/SOFIE_core/inc/SOFIE/ROperator.hxx index edbec58..17b62f6 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator.hxx @@ -2,63 +2,105 @@ #define SOFIE_ROPERATOR #include +#include #include #include "SOFIE/SOFIE_common.hxx" -//#include "RModel.hxx" - - namespace SOFIE{ class RModel; +enum class OperatorKind { + GEMM = 0, + LAYERNORM = 1, + RELU = 2, + CONSTANT = 3, + CONSTANTOFSHAPE = 4, + UNDEFINED = 5, + CONV=6, + BATCHNORM=7 +}; + +inline const char* toString(OperatorKind kind) { + switch (kind) { + case OperatorKind::GEMM: return "GEMM"; + case OperatorKind::LAYERNORM: return "LAYERNORM"; + case OperatorKind::RELU: return "RELU"; + case OperatorKind::CONSTANT: return "CONSTANT"; 
+ case OperatorKind::CONSTANTOFSHAPE: return "CONSTANTOFSHAPE"; + case OperatorKind::BATCHNORM: return "BATCHNORM"; + case OperatorKind::CONV: return "CONV"; + case OperatorKind::UNDEFINED: return "UNDEFINED"; + default: return "UNKNOWN"; + } +} + +inline std::set FusableKinds = { OperatorKind::RELU, OperatorKind::LAYERNORM, OperatorKind::BATCHNORM}; + class ROperator{ public: virtual std::vector GetBlasRoutines() { return {}; } virtual std::vector GetStdLibs() { return {}; } - virtual std::vector> ShapeInference(std::vector>) = 0; - virtual std::vector TypeInference(std::vector) = 0; + virtual std::vector> ShapeInference(std::vector>) { return {}; }; + virtual std::vector TypeInference(std::vector) { return {}; }; virtual void Initialize(RModel&) = 0; virtual std::string Generate(std::string OpName) = 0; //expect unique opName for each operator within the same RModel + virtual std::string Generate_GPU_ALPAKA(std::string OpName){ return "";} //expect unique opName for each operator within the same RModel // generate initialization code for session constructor virtual std::string GenerateInitCode() { return "";} + virtual std::string GenerateInitCode_GPU_ALPAKA() { return "";}; // generate some specific declaration code for Session virtual std::string GenerateDeclCode() { return "";} // generate session data members specific to operator virtual std::string GenerateSessionMembersCode(std::string /*opName*/) { return ""; } + virtual std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) { return ""; } + virtual std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) { return ""; } virtual std::string Header() { return "";} + virtual std::string GetFusableOutputTensorName() { return "";} + virtual std::string GetBlasConfig() { return ""; } + virtual void UpdateFusableTensorName(std::string, const std::function& removal_func){ return;}; //virtual void Forward_reference() = 0; //virtual void Forward_blas() = 0; virtual ~ROperator(){} protected: 
- + OperatorKind fKind = OperatorKind::UNDEFINED; + size_t fOpOrder = 0; const std::string SP = " "; ///< space used to correctly indent the generated C++ code bool fUseSession = false; ///< flag to identify if using the session class bool fIsOutputConstant = false; ///< flag to identify if operator has a constant output (no need to generate code) - - mutable std::vector fInputTensorNames; - mutable std::vector fOutputTensorNames; + bool fIsOutputParamShape = false; ///< flag to identify if the output represents a parametric shape (can be known at compile time) + + mutable std::vector fInputTensorNames; + mutable std::vector fOutputTensorNames; public: - std::span GetOpInputTensors() const { + std::span GetOpInputTensors() const { return fInputTensorNames; } - std::span GetOpOutputTensors() const { + std::span GetOpOutputTensors() const { return fOutputTensorNames; } - + + OperatorKind GetKind() const { return fKind; } + + void RegisterOperatorOrder(const size_t ord){ + fOpOrder = ord; + } + size_t GetOpOrder(){ + return fOpOrder; + } + }; }//SOFIE - #endif //SOFIE_OPERATOR diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx index 127eaff..2d0e6cb 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx @@ -1,5 +1,5 @@ -#ifndef SOFIE_ROperator_BasicBinary -#define SOFIE_ROperator_BasicBinary +#ifndef SOFIE_SOFIE_ROperator_BasicBinary +#define SOFIE_SOFIE_ROperator_BasicBinary #include "SOFIE/SOFIE_common.hxx" #include "SOFIE/ROperator.hxx" @@ -7,9 +7,15 @@ #include -namespace SOFIE{ +namespace SOFIE { -enum EBasicBinaryOperator { Add, Sub, Mul, Div, Pow }; +enum EBasicBinaryOperator { + Add, + Sub, + Mul, + Div, + Pow +}; template struct BinaryOperatorTrait {}; @@ -17,42 +23,42 @@ struct BinaryOperatorTrait {}; template struct BinaryOperatorTrait { static const std::string Name() { return "Add"; } - static std::string Op(const std::string
& t1, const std::string t2) { return t1 + " + " + t2; } - static T Func(T t1, T t2) {return t1 + t2;} + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " + " + t2; } + static T Func(T t1, T t2) { return t1 + t2; } }; template struct BinaryOperatorTrait { static const std::string Name() { return "Sub"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " - " + t2; } - static T Func (T t1, T t2) { return t1 - t2;} + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " - " + t2; } + static T Func(T t1, T t2) { return t1 - t2; } }; template struct BinaryOperatorTrait { static const std::string Name() { return "Mul"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " * " + t2; } - static T Func (T t1, T t2) { return t1 * t2;} + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " * " + t2; } + static T Func(T t1, T t2) { return t1 * t2; } }; template struct BinaryOperatorTrait { static const std::string Name() { return "Div"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " / " + t2; } - static T Func (T t1, T t2) { return t1/t2;} + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " / " + t2; } + static T Func(T t1, T t2) { return t1 / t2; } }; template struct BinaryOperatorTrait { static const std::string Name() { return "Pow"; } - static std::string Op(const std::string & t1, const std::string t2) { return "std::pow(" + t1 + "," + t2 + ")"; } - static T Func (T t1, T t2) { return std::pow(t1,t2);} + static std::string Op(const std::string &t1, const std::string t2) { return "std::pow(" + t1 + "," + t2 + ")"; } + static T Func(T t1, T t2) { return std::pow(t1, t2); } }; -template -class ROperator_BasicBinary final : public ROperator{ +template +class ROperator_BasicBinary final : public ROperator { private: - + int fBroadcastFlag = 0; 
std::string fNA; std::string fNB; std::string fNBroadcastedA; @@ -63,154 +69,446 @@ private: std::vector fShapeB; std::vector fShapeY; + std::vector fDimShapeA; + std::vector fDimShapeB; + std::vector fDimShapeY; + public: - ROperator_BasicBinary(){} - ROperator_BasicBinary(std::string nameA, std::string nameB, std::string nameY): - fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNA, fNB }; - fOutputTensorNames = { fNY }; - } + ROperator_BasicBinary() {} + ROperator_BasicBinary(std::string nameA, std::string nameB, std::string nameY) + : fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)) + { + fInputTensorNames = {fNA, fNB}; + fOutputTensorNames = {fNY}; + } // type of output given input - std::vector TypeInference(std::vector input) override { - return input; - } + std::vector TypeInference(std::vector input) override { return input; } // shape of output tensors given input tensors - std::vector> ShapeInference(std::vector> input) override { + std::vector> ShapeInference(std::vector> input) override + { // assume now inputs have same shape (no broadcasting) auto ret = std::vector>(1, input[0]); // return vector size 1 with first input return ret; } - void Initialize(RModel& model) override { + void Initialize(RModel &model) override + { // input must be a graph input, or already initialized intermediate tensor - if (!model.CheckIfTensorAlreadyExist(fNA)){ + if (!model.CheckIfTensorAlreadyExist(fNA)) { throw std::runtime_error(std::string("TMVA SOFIE Binary Op Input Tensor ") + fNA + "is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNB)) { throw std::runtime_error(std::string("TMVA SOFIE Binary Op Input Tensor ") + fNB + "is not found in model"); } - fShapeA = model.GetTensorShape(fNA); - fShapeB = model.GetTensorShape(fNB); - bool broadcast = !UTILITY::AreSameShape(fShapeA, fShapeB); - if (broadcast) { - // Y is the common 
shape of A and B - fShapeY = UTILITY::UnidirectionalBroadcastShape(fShapeA, fShapeB); - bool broadcastA = !UTILITY::AreSameShape(fShapeA, fShapeY); - bool broadcastB = !UTILITY::AreSameShape(fShapeB, fShapeY); - // Broadcast A to Y - if (broadcastA) { - fNBroadcastedA = "Broadcasted" + fNA + "to" + fNY; - if (model.IsInitializedTensor(fNA)) { - auto data = model.GetInitializedTensorData(fNA); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), - std::default_delete()); - // Update the data and the shape of A - model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData); - fShapeA = fShapeY; + int dynamicInputs = 0; + if (model.IsDynamicTensor(fNA)) { + fDimShapeA = model.GetDimTensorShape(fNA); + dynamicInputs |= 1; + } else { + fShapeA = model.GetTensorShape(fNA); + fDimShapeA = ConvertShapeToDim(fShapeA); + } + if (model.IsDynamicTensor(fNB)) { + dynamicInputs |= 2; + fDimShapeB = model.GetDimTensorShape(fNB); + } else { + fShapeB = model.GetTensorShape(fNB); + fDimShapeB = ConvertShapeToDim(fShapeB); + } + if (dynamicInputs & 1 && model.Verbose()) + std::cout << BinaryOperatorTrait::Name() << " : input " << fNA << " is dynamic " + << ConvertDimShapeToString(fDimShapeA) << " "; + if (dynamicInputs & 2 && model.Verbose()) + std::cout << BinaryOperatorTrait::Name() << " : input " << fNB << " is dynamic " + << ConvertDimShapeToString(fDimShapeB) << " "; + std::cout << std::endl; + // check if need to broadcast at initialization time if shapes are known and different + // (we could broadcast the tensor tensor to maximum values of dynamic shapes - to be done) + // case of known shapes + // if shapes are known find the output shape from broadcasting + if (dynamicInputs == 0) { + auto ret = UTILITY::MultidirectionalBroadcastShape(fShapeA, fShapeB); + fBroadcastFlag = ret.first; + fShapeY = ret.second; + if (model.IsConstantTensor(fNA) && model.IsConstantTensor(fNB)) { + bool 
broadcast = fBroadcastFlag > 0; + if (broadcast) { + // Y is the common shape of A and B + bool broadcastA = fBroadcastFlag & 2; + bool broadcastB = fBroadcastFlag & 1; + // Broadcast A to Y + if (broadcastA) { + fNBroadcastedA = "Broadcasted" + fNA + "to" + fNY; + auto data = model.GetInitializedTensorData(fNA); + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), + std::default_delete()); + if (model.Verbose()) + std::cout << "broadcasted data A " << ConvertShapeToString(fShapeY) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeY), + static_cast(broadcastedData.get())) + << std::endl; + // Update the data and the shape of A + model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData); + fShapeA = fShapeY; + fDimShapeA = ConvertShapeToDim(fShapeA); + } + // Broadcast B to Y + if (broadcastB) { + fNBroadcastedB = "Broadcasted" + fNB + "to" + fNY; + auto data = model.GetInitializedTensorData(fNB); + if (model.Verbose()) + std::cout << "data B " << ConvertShapeToString(fShapeB) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeB), static_cast(data.get())) + << std::endl; + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), + std::default_delete()); + // do not update tensor B but add broadcasted one (since it can be input to some other operators) + if (model.Verbose()) + std::cout << "broadcasted data B " << ConvertShapeToString(fShapeY) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeY), + static_cast(broadcastedData.get())) + << std::endl; + model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData); + fShapeB = fShapeY; + fDimShapeB = ConvertShapeToDim(fShapeB); + } } else { - // Add an intermediate tensor for broadcasting A - model.AddIntermediateTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY); + fShapeY = fShapeA; } - } - // 
Broadcast B to Y - if (broadcastB) { - fNBroadcastedB = "Broadcasted" + fNB + "to" + fNY; - if (model.IsInitializedTensor(fNB)) { - auto data = model.GetInitializedTensorData(fNB); - std::cout << "data B " << ConvertShapeToString(fShapeB) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeB), static_cast(data.get())) << std::endl; - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), - std::default_delete()); - // do not update tensor B but add broadcasted one (since it can be input to some other operators) - std::cout << "broadcasted data B " << ConvertShapeToString(fShapeY) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeY), static_cast(broadcastedData.get())) << std::endl; - model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData); - fShapeB = fShapeY; - } else { - // Add an intermediate tensor for broadcasting B - model.AddIntermediateTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY); + // tensors are constant: perform here the binary operation + + const std::string &nameA = fNBroadcastedA.empty() ? fNA : fNBroadcastedA; + const std::string &nameB = fNBroadcastedB.empty() ? 
fNB : fNBroadcastedB; + auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); + auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); + std::vector dataY(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < dataY.size(); i++) { + dataY[i] = BinaryOperatorTrait::Func(dataA[i], dataB[i]); } + model.AddConstantTensor(fNY, fShapeY, dataY.data()); + // flag tensors to not be written in the weight file + model.SetNotWritableInitializedTensor(nameA); + model.SetNotWritableInitializedTensor(nameB); + fIsOutputConstant = true; + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << fNA << " " << ConvertShapeToString(fShapeA) + << " , " << fNB << " " << ConvertShapeToString(fShapeB) << " ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << " : " << ConvertValuesToString(dataY) << std::endl; + } + } else { + // case of defined and non-constant tensors + model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << fNA << " " << ConvertShapeToString(fShapeA) + << " , " << fNB << " " << ConvertShapeToString(fShapeB) << " ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << std::endl; + } + // we convert non-dim shapes to Dim shapes + fDimShapeY = ConvertShapeToDim(fShapeY); } } else { - fShapeY = fShapeA; - } - // check case of constant output (if all inputs are defined) - if (model.IsInitializedTensor(fNA) && model.IsInitializedTensor(fNB)) { - const std::string& nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - const std::string& nameB = fNBroadcastedB.empty()? 
fNB : fNBroadcastedB; - auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); - auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); - std::vector dataY(ConvertShapeToLength(fShapeY)); - for (size_t i = 0; i < dataY.size(); i++) { - dataY[i] = BinaryOperatorTrait::Func(dataA[i], dataB[i]); + // case A or B have dynamic shapes. We need to broadcast if shape are not same + auto ret = UTILITY::MultidirectionalBroadcastShape(fDimShapeA, fDimShapeB); + fBroadcastFlag = ret.first; + fDimShapeY = ret.second; + // case of all parametric shapes and MultiDirectionalBroadcastShape return the max of the 2 + // need to do before we declare the output tensor shape and the broadcasted ones + if (ret.first & 4) { + // check if one of the parameter is an input dimension + // define function to find this + auto IsInputDimParam = [&](const std::string &p) { + auto inputNames = model.GetInputTensorNames(); + for (auto &input : inputNames) { + for (auto &i_s : model.GetDimTensorShape(input)) { + if (i_s.isParam && i_s.param == p) + return true; + } + } + return false; + }; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + auto &s = fDimShapeY[i]; + if (s.isParam && s.param.find("std::max") != std::string::npos) { + if (IsInputDimParam(fDimShapeA[i].param)) { + // case dim is 1 we indicate that the input parameter is equal to 1 + if (fDimShapeA[i].dim != 1) + s = fDimShapeA[i]; + else + s = fDimShapeB[i]; + } else if (IsInputDimParam(fDimShapeB[i].param)) { + if (fDimShapeB[i].dim != 1) + s = fDimShapeB[i]; + else + s = fDimShapeA[i]; + } + } + } + } + + model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fDimShapeY); + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << ConvertShapeToString(fDimShapeA) << " , " + << ConvertShapeToString(fDimShapeB) << " --> " << ConvertShapeToString(fDimShapeY) << std::endl; } - model.AddConstantTensor(fNY, fShapeY, dataY.data()); - // flag tensors to not be written in a fil - 
model.SetNotWritableInitializedTensor(nameA); - model.SetNotWritableInitializedTensor(nameB); - fIsOutputConstant = true; - if (model.Verbose()) - std::cout << "Binary op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(dataY) << std::endl; - } - else { - model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); } } - std::string GenerateInitCode() override { + std::string GenerateInitCode() override + { std::stringstream out; return out.str(); } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override + { - if (fIsOutputConstant) return ""; + if (fIsOutputConstant) + return ""; - OpName = "op_" + OpName; + opName = "op_" + opName; - if (fShapeY.empty()) { + if (fDimShapeY.empty()) { throw std::runtime_error("TMVA SOFIE Binary Op called to Generate without being initialized first"); } std::stringstream out; - out << SP << "\n//------ " << BinaryOperatorTrait::Name() << "\n"; - size_t length = ConvertShapeToLength(fShapeY); + out << SP << "\n//------ " << opName << " " << BinaryOperatorTrait::Name() << " --> " + << ConvertDimShapeToString(fDimShapeY) << "\n"; + auto length = ConvertDimShapeToLength(fDimShapeY); std::string typeName = TensorType::Name(); - // Broadcast A if it's uninitialized - // use broadcasting function where we pass an already allocated tensor to minimize memory allocations - if (fShapeA != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNA << "\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNA << ", " << ConvertShapeToString(fShapeA) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedA << ");\n"; - } - // Broadcast B if it's uninitialized - if (fShapeB != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNB << "\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNB << ", " << 
ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedB << ");\n"; - } - const std::string& nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - const std::string& nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; - out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = " << BinaryOperatorTrait::Op( "tensor_" + nameA + "[id]" , "tensor_" + nameB + "[id]") << " ;\n"; - out << SP << "}\n"; + + // we need to check if we can broadcast (case flag has bit 4 set) + + if (fBroadcastFlag & 4) { + // need to check if shapes are the same + auto lengthA = ConvertDimShapeToLength(fDimShapeA); + auto lengthB = ConvertDimShapeToLength(fDimShapeB); + out << SP << "if (" << lengthA << "!=" << lengthB << ") {\n"; + // check if A->B or B->A + // bool broadcastable = true; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + if (fBroadcastFlag & 5 && fDimShapeY[i] == fDimShapeA[i] && fDimShapeA[i].dim > 1 && + fDimShapeB[i].isParam) { + // B->A B[i] needs to be 1 + out << SP << SP << "if (" << fDimShapeB[i] << "!= 1)\n"; + out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast B->A in operator " + << opName << "\");\n"; + } + if (fBroadcastFlag & 6 && fDimShapeY[i] == fDimShapeB[i] && fDimShapeB[i].dim > 1 && + fDimShapeA[i].isParam) { + // A-> B A[i] needs to be 1 + out << SP << SP << "if (" << fDimShapeA[i] << "!= 1)\n"; + out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast A->B in operator " + << opName << "\");\n"; + } else if (fDimShapeA[i].isParam && fDimShapeB[i].isParam) { + // both shapes are parametric and we broadcast to maximum + // we allocate here output vector + out << SP << SP << "if (" << fDimShapeA[i] << " != " << fDimShapeB[i] << " && (" << fDimShapeA[i] + << " != 1 || " << fDimShapeB[i] << " != 1))\n"; + out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast shapes in 
operator " << opName + << "\");\n"; + } + } + out << SP << "}\n"; + } + + auto stridesA = UTILITY::ComputeStrideFromShape(fDimShapeA); + auto stridesB = UTILITY::ComputeStrideFromShape(fDimShapeB); + auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY); + + std::string compute_idx_A, compute_idx_B, compute_idx_Y; + if (fDimShapeA.empty() || + std::all_of(fDimShapeA.begin(), fDimShapeA.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_A = "0"; + } else { + for (size_t i = 0; i < fDimShapeA.size(); ++i) { + if (fDimShapeA[i].dim == 1 || fDimShapeA[i].GetVal() == "1") + continue; + compute_idx_A += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeA.size())); + if (stridesA[i].GetVal() != "1") + compute_idx_A += " * " + stridesA[i].GetVal(); + compute_idx_A += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_A.pop_back(); + } + if (fDimShapeB.empty() || + std::all_of(fDimShapeB.begin(), fDimShapeB.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_B = "0"; + } else { + for (size_t i = 0; i < fDimShapeB.size(); ++i) { + if (fDimShapeB[i].dim == 1 || fDimShapeB[i].GetVal() == "1") + continue; + compute_idx_B += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeB.size())); + if (stridesB[i].GetVal() != "1") + compute_idx_B += " * " + stridesB[i].GetVal(); + compute_idx_B += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_B.pop_back(); + } + int nloop = 0; + if (fDimShapeY.empty() || + std::all_of(fDimShapeY.begin(), fDimShapeY.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_Y = "0"; + } else { + for (size_t i = 0; i < fDimShapeY.size(); ++i) { + if (fDimShapeY[i].dim != 1 && fDimShapeY[i].GetVal() != "1") { + nloop++; + for (int j = 0; j < nloop; j++) out << SP; + out << "for (size_t idx_" << i << " = 0; idx_" << i << " < " << fDimShapeY[i] + << "; ++idx_" << i << "){\n"; + 
compute_idx_Y += "idx_" + std::to_string(i); + if (stridesY[i].GetVal() != "1") + compute_idx_Y += " * " + stridesY[i].GetVal(); + compute_idx_Y += " + "; + } + } + // remove last 3 characters " + " + for (int j = 0; j < 3; j++) + compute_idx_Y.pop_back(); + } + for (int j = 0; j < nloop + 1; j++) out << SP; + out << "tensor_" << fNY << "[" << compute_idx_Y << "] = " + << BinaryOperatorTrait::Op("tensor_" + fNA + "[" + compute_idx_A + "]", + "tensor_" + fNB + "[" + compute_idx_B + "]") + << " ;\n"; + + for (int i = nloop; i > 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; + } return out.str(); } - std::vector GetStdLibs() override { + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) { + if (fIsOutputConstant) + return ""; + + std::string op; + op = "\n//------ "+opName+"_"+BinaryOperatorTrait::Name()+"_KERNEL_ALPAKA\n"; + op += SP + "struct Binary"+BinaryOperatorTrait::Name()+"Kernel {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * A, T const * B, T * C,\n"; + for( size_t i=0; i("; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + op += "size_" + std::to_string(i); + } + op.pop_back(); + op += "));\n"; + op += SP + SP + SP + SP + "for (auto const& elem : elements) {\n"; + + auto stridesA = UTILITY::ComputeStrideFromShape(fDimShapeA); + auto stridesB = UTILITY::ComputeStrideFromShape(fDimShapeB); + auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY); + + std::string compute_idx_A, compute_idx_B, compute_idx_Y; + if (fDimShapeA.empty() || + std::all_of(fDimShapeA.begin(), fDimShapeA.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_A = "0"; + } else { + for (size_t i = 0; i < fDimShapeA.size(); ++i) { + if (fDimShapeA[i].dim == 1 || fDimShapeA[i].GetVal() == "1") + continue; + compute_idx_A += "elem[" + std::to_string(i + (fDimShapeY.size() - fDimShapeA.size())) + "]"; + if (stridesA[i].GetVal() != "1") + compute_idx_A += " * " + 
stridesA[i].GetVal(); + compute_idx_A += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_A.pop_back(); + } + if (fDimShapeB.empty() || + std::all_of(fDimShapeB.begin(), fDimShapeB.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_B = "0"; + } else { + for (size_t i = 0; i < fDimShapeB.size(); ++i) { + if (fDimShapeB[i].dim == 1 || fDimShapeB[i].GetVal() == "1") + continue; + compute_idx_B += "elem[" + std::to_string(i + (fDimShapeY.size() - fDimShapeB.size())) + "]"; + if (stridesB[i].GetVal() != "1") + compute_idx_B += " * " + stridesB[i].GetVal(); + compute_idx_B += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_B.pop_back(); + } + int nloop = 0; + if (fDimShapeY.empty() || + std::all_of(fDimShapeY.begin(), fDimShapeY.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_Y = "0"; + } else { + for (size_t i = 0; i < fDimShapeY.size(); ++i) { + if (fDimShapeY[i].dim != 1 && fDimShapeY[i].GetVal() != "1") { + nloop++; + for (int j = 0; j < nloop; j++) op += SP; + compute_idx_Y += "elem[" + std::to_string(i) + "]"; + if (stridesY[i].GetVal() != "1") + compute_idx_Y += " * " + stridesY[i].GetVal(); + compute_idx_Y += " + "; + } + } + // remove last 3 characters " + " + for (int j = 0; j < 3; j++) + compute_idx_Y.pop_back(); + } + for (int j = 0; j < nloop + 1; j++) op += SP; + op += "C[" + compute_idx_Y + "] = " + + BinaryOperatorTrait::Op("A[" + compute_idx_A + "]", + "B[" + compute_idx_B + "]") + + " ;\n"; + for (int i = nloop; i > 0; i--) { + for (int j = 0; j < i; j++) op += SP; + op += "}\n"; + } + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) { + if (fIsOutputConstant) + return ""; + + return SP + "Binary"+BinaryOperatorTrait::Name()+"Kernel " + OpName + "Kernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) { + if (fIsOutputConstant) + return ""; + + if 
(fDimShapeY.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Basic Binary called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertDimShapeToLength(fDimShapeY); + out << "\n//------ "+OpName+"_ALPAKA\n"; + out << SP << "alpaka::WorkDivMembers workDiv_"<::all("<<(stoi(length)+256-1)/256<<"), alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNY << ", " << OpName << "Kernel, alpaka::getPtrNative(deviceBuf_" << fNA << "), alpaka::getPtrNative(deviceBuf_"< GetStdLibs() override + { if (Op == EBasicBinaryOperator::Pow) { - return { std::string("cmath") }; + return {std::string("cmath")}; } else { return {}; } } -}; -}//SOFIE + +}; +} // namespace SOFIE -#endif //SOFIE_ROperator_BasicBinary +#endif // SOFIE_ROperator_BasicBinary diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx index c18c17e..b98ded5 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx @@ -107,6 +107,33 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA() { + std::string op; + op = "\n//------ " + UnaryOpTraits::Name() + "_KERNEL_ALPAKA\n"; + op += SP + "struct Unary" + UnaryOpTraits::Name() + "Kernel{\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T* data, std::size_t numElements) const {\n"; + op += SP + SP + SP + "for (auto i : alpaka::uniformElements(acc, numElements)) {\n"; + op += SP + SP + SP + "data[i] = " << UnaryOpTraits::Op("data[i]") << ";\n"; + op += SP + SP + "}\n"; + op += SP + "}\n};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "Unary" + UnaryOpTraits::Name() + "Kernel " + UnaryOpTraits::Name() + "Kernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + 
OpName; + std::stringstream out; + auto length = ConvertShapeToLength(fShapeX); + out << "\n//------ "+OpName+"_ALPAKA\n"; + out << SP << "alpaka::WorkDivMembers workDiv_"<::all("<<(length+255)/256<<"), alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX << ", " << UnaryOpTraits::Name() << "Kernel, alpaka::getPtrNative(deviceBuf_" << fNX << "), static_cast(" << length << ")); \n"; + return out.str(); + } + std::vector GetStdLibs() override { if (Op == EBasicUnaryOperator::kSqrt || Op == EBasicUnaryOperator::kExp || Op == EBasicUnaryOperator::kLog) { return { std::string("cmath") }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx index a27cea4..1a6098d 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx @@ -1,9 +1,9 @@ #ifndef SOFIE_ROPERATOR_BatchNormalization #define SOFIE_ROPERATOR_BatchNormalization -#include "SOFIE_common.hxx" -#include "ROperator.hxx" -#include "RModel.hxx" +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" #include diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx index 47c3d66..c813f7c 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx @@ -90,6 +90,36 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA() { + std::string op; + op = "\n//------ CAST_KERNEL_ALPAKA\n"; + op += SP + "struct CastKernel{\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, SrcT const * src, DstT * dst, std::size_t numElements) const {\n"; + op += SP + SP + SP + "for (auto i : alpaka::uniformElements(acc, numElements)) {\n"; + op += SP + SP + SP + "dst[i] = static_cast(src[i]);\n"; + op += SP + SP + "}\n"; + op += SP + "}\n};\n"; + 
return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) { + return SP + "CastKernel castKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Cast called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertShapeToLength(fShape); + out << "\n//------ CAST_GPU_ALPAKA\n"; + out << SP << "alpaka::WorkDivMembers workDiv_"<::all("<< (length+255)/256 <<"), alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX << ", castKernel, alpaka::getPtrNative(deviceBuf_" << fNX << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << ")); \n"; + return out.str(); + } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx index 0d5e574..c828668 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx @@ -1,5 +1,5 @@ #ifndef SOFIE_ROPERATOR_Concat - #define SOFIE_ROPERATOR_Concat +#define SOFIE_ROPERATOR_Concat #include "SOFIE/SOFIE_common.hxx" @@ -25,6 +25,7 @@ std::vector> fInputShapes; public: + ROperator_Concat(){} ROperator_Concat(std::vector inputs, int axis, int newAxis, std::string output): fAxis(axis), fnewAxis(newAxis), fOutput(UTILITY::Clean_name(output)) { @@ -53,6 +54,7 @@ throw std::runtime_error("TMVA SOFIE Concat Op - invalid axis value "); int concat_dim=0; + // case of Concat (fNewAxis = 0) and not ConcatFromSequence if(fnewAxis == 0){ for (size_t i = 0; i < inputs.size(); i++) { if (i > 0 && inputs[i].size() != inputs[i - 1].size()) @@ -73,6 +75,7 @@ ret[0][fAxis] = concat_dim; } std::vector stack; + // case ConCatFromSequence if(fnewAxis == 1){ for(size_t i = 0; i < inputs.size(); i++) { if (i > 0 && inputs[i].size() != inputs[i-1].size() ) @@ -96,8 +99,8 @@ } // get 
shape of output given inputs. It is going to be called after initialized - std::vector> ShapeInference(const std::vector> & inputs) { - std::vector> ret(1); + std::vector ShapeInference(const std::vector> & inputs, const RModel & model) { + std::vector ret(inputs[0].size()); // treat negative axis case if (fAxis<0) { fAxis = inputs[0].size()+fAxis; @@ -105,31 +108,54 @@ if (fAxis < 0 || fAxis >= (int) inputs[0].size()) throw std::runtime_error("TMVA SOFIE Concat Op - invalid axis value "); - int concat_dim=0; + Dim concat_dim; if(fnewAxis == 0){ for (size_t i = 0; i < inputs.size(); i++) { if (i > 0 && inputs[i].size() != inputs[i - 1].size()) throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have different shapes " + fInputs[i] + " : " + - ConvertDynamicShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertDynamicShapeToString(inputs[i - 1])); + ConvertShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertShapeToString(inputs[i - 1])); for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) { if ((int)iaxis == fAxis) { - // support only non-params shape for the concatenation axis - if (inputs[i][iaxis].isParam) - throw std::runtime_error("TMVA SOFIE Concat Op - not supporting input param dimensions for concatenation axis. 
Input shape is " + - ConvertDynamicShapeToString(inputs[i])); - concat_dim += inputs[i][iaxis].dim; + // support both integer and params shape for the concatenation axis + if (concat_dim.param.empty() && concat_dim.dim == 0) + concat_dim = inputs[i][iaxis]; + else if (inputs[i][iaxis].isParam || concat_dim.isParam) { + concat_dim = + Dim{ concat_dim.GetVal() + std::string("+ ") + inputs[i][iaxis].GetVal(), + static_cast(-1)}; + } else { + concat_dim = Dim { concat_dim.dim + inputs[i][iaxis].dim }; + } + } + else if (i == 0) { + ret[iaxis] = inputs[i][iaxis]; } - // other dimensions must be the same - else if (i > 0 && inputs[i][iaxis].GetVal() != inputs[i - 1][iaxis].GetVal()) + else if ((!inputs[i][iaxis].isParam && !ret[iaxis].isParam) && (inputs[i][iaxis].dim != ret[iaxis].dim)) { throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have wrong shapes " + - ConvertDynamicShapeToString(inputs[i]) + " and " + - ConvertDynamicShapeToString(inputs[i - 1])); + ConvertShapeToString(inputs[i]) + " and " + + ConvertShapeToString(inputs[i - 1])); + } + else if (!inputs[i][iaxis].isParam && ret[iaxis].isParam){ + // if shape is not parametric use it + ret[iaxis] = inputs[i][iaxis]; + } + else if (inputs[i][iaxis].isParam && ret[iaxis].isParam) { + // check which parameter is first in RModel list + auto & dimNames = model.GetDimShapeNames(); + auto p1 = std::find(dimNames.begin(), dimNames.end(), inputs[i][iaxis].param); + auto p2 = std::find(dimNames.begin(), dimNames.end(), ret[iaxis].param); + if (p1 < p2) ret[iaxis] = inputs[i][iaxis]; + } + } + // add parenthesis in case is an expression + if (concat_dim.isParam && concat_dim.dim == static_cast(-1)) + concat_dim = Dim{ std::string("(") + concat_dim.GetVal() + std::string(")"), concat_dim.dim }; } - // output shape - ret[0] = inputs[0]; - ret[0][fAxis].dim = concat_dim; + // output shape for concatenated axis + ret[fAxis] = Dim{concat_dim}; + } // case of stacking (not supported yet) // here we need to check 
that input shapes are the same @@ -141,24 +167,30 @@ return ret; } - void Initialize(RModel& model) override { + void Initialize(RModel& model) override { for (auto &it : fInputs) { if (model.CheckIfTensorAlreadyExist(it) == false) { throw std::runtime_error("TMVA SOFIE Concat Op Input Tensor " + it + " is not found in model"); } - fInputShapes.push_back(model.GetDynamicTensorShape(it)); + fInputShapes.push_back(model.GetDimTensorShape(it)); } - fOutputShape = ShapeInference(fInputShapes)[0]; + fOutputShape = ShapeInference(fInputShapes, model); if (model.Verbose()) - std::cout << "Output of concat operator has shape " << ConvertDynamicShapeToString(fOutputShape) << std::endl; + std::cout << "Output of concat operator has shape " << ConvertDimShapeToString(fOutputShape) << std::endl; // check if concat has constant inputs , axis 0(concat contigous memory and type is integer) + bool isOutputShape = false; if (model.GetTensorType(fInputs[0]) == ETensorType::INT64 && fAxis == 0) { fIsOutputConstant = true; + isOutputShape = true; + for ( auto & input : fInputs) { if (!model.IsInitializedTensor(input)) { fIsOutputConstant = false; - break; + if (!model.IsShapeTensor(input)) { + isOutputShape = false; + break; + } } } if (fIsOutputConstant) { @@ -177,26 +209,53 @@ model.AddConstantTensor(fOutput, outputShape, outputData.data()); if (model.Verbose()) { std::cout << "output of Concat is a constant tensor " << ConvertShapeToString(outputShape) << " : " - << ConvertValuesToString(outputData) << std::endl; + << ConvertValuesToString(outputData) << " (constant)" << std::endl; } + } else if (isOutputShape) { + auto outputShape = ConvertShapeToInt(fOutputShape); // conversion must be possible + std::vector outputData(ConvertShapeToLength(outputShape)); + size_t offset = 0; + for ( auto & input : fInputs) { + std::vector inputData; + auto inputShape = model.GetTensorShape(input); // shape is not dynamic + size_t inputLength = ConvertShapeToLength(inputShape); // shape can be a 
scalar + if (model.IsShapeTensor(input)) + inputData = model.GetShapeTensorValues(input); + else if (model.IsConstantTensor(input)) { + inputData.resize(inputLength); + auto intData = static_cast(model.GetInitializedTensorData(input).get()); + for (size_t i = 0; i < inputData.size(); i++) + inputData[i] = Dim{ static_cast(intData[i])}; + } + std::cout << "concatenating input data " << inputLength << " " << inputData[0] << std::endl; + std::copy(inputData.begin(), inputData.end(), outputData.begin() + offset ); + offset += inputLength; + } + // add output tensor + model.AddShapeTensor(fOutput,outputData, false); // cannot be a scalar + if (model.Verbose()) { + std::cout << "output of Concat is a shape tensor " << ConvertShapeToString(outputShape) << " : " + << ConvertShapeToString(outputData) << " (shape)" << std::endl; + } + fIsOutputConstant = true; } } if (!fIsOutputConstant) { model.AddIntermediateTensor(fOutput, model.GetTensorType(fInputs[0]), fOutputShape); if (model.Verbose()) { - std::cout << "Concat ---> " << fOutput << " " << ConvertDynamicShapeToString(fOutputShape) << std::endl; + std::cout << "Concat ---> " << fOutput << " " << ConvertDimShapeToString(fOutputShape) << std::endl; } } } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override { if (fIsOutputConstant) return ""; - OpName = "op_"+OpName; + opName = "op_" + opName; if(fOutputShape.empty()){ throw std::runtime_error("TMVA SOFIE Concat called to Generate without being initialized first"); } std::stringstream out; - out<<"\n//--------- Concat\n"; + out<<"\n//--------- Concat " << opName << " --> " << ConvertShapeToString(fOutputShape) << "\n"; // special case when memory is contiguous bool hasShapeOnes = true; for(int i = 0; i 0) out << offset; offset += " + " + length; @@ -260,4 +319,5 @@ }; }//SOFIE - #endif //SOFIE_ROPERATOR_CONCAT + + #endif //SOFIE_ROPERATOR_CONCAT \ No newline at end of file diff --git 
a/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx index 0d08432..6590909 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx @@ -101,6 +101,11 @@ public: // no code to generate here. Tensor are defined in Session constructor return "//---------------------------------------\n"; } + + std::string Generate_GPU_ALPAKA(std::string /* OpName */) override { + // no code to generate here. Tensor are defined in Session constructor + return "//---------------------------------------\n"; + } }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx index 0467385..b9d917b 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx @@ -1,9 +1,9 @@ #ifndef SOFIE_SOFIE_ROPERATOR_CONVTRANSPOSE_HXX #define SOFIE_SOFIE_ROPERATOR_CONVTRANSPOSE_HXX -#include -#include -#include +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" #include #include diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx index c834a06..bf163b7 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx @@ -122,6 +122,60 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA() { + std::string op; + op = "\n//------ Expand_KERNEL_ALPAKA\n"; + op += SP + "struct ExpandKernel {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * input, T * output, const size_t * input_shape, const size_t * output_shape, const size_t * input_strides, const size_t * output_strides, const size_t ndim){\n"; + op += SP + SP + SP + SP + "size_t input_idx = 0;\n"; + op += SP + SP + SP + SP + "size_t output_idx = 0;\n"; + op += SP + SP + SP + SP + "size_t 
coord_out;\n"; + op += SP + SP + SP + SP + "size_t coord_in;\n"; + op += SP + SP + SP + SP + "auto elements = alpaka::uniformElementsND(acc, alpaka::Vec(output_shape));\n"; + op += SP + SP + SP + SP + "for (auto const& elem : elements) {\n"; + op += SP + SP + SP + SP + "input_idx = 0;\n"; + op += SP + SP + SP + SP + "output_idx = 0;\n"; + op += SP + SP + SP + SP + "for (int i = 0; i < ndim; ++i) {\n"; + op += SP + SP + SP + SP + SP + "coord_out = elem[i];\n"; + op += SP + SP + SP + SP + SP + "coord_in = (input_shape[i] == 1) ? 0 : coord_out;\n"; + op += SP + SP + SP + SP + SP + "input_idx += coord_in * input_strides[i];\n}\n"; + op += SP + SP + SP + SP + SP + "output_idx += coord_out * output_strides[i];\n}\n"; + op += SP + SP + SP + SP + SP + "output[output_idx] = input[input_idx];\n"; + op += SP + SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "ExpandKernel expandKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Expand called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertShapeToLength(fShape); + out << "\n//------ EXPAND_GPU_ALPAKA\n"; + out << SP << "alpaka::WorkDivMembers workDiv_" << fNX + << "(alpaka::Vec::all((" << length << " + 256 - 1) / 256), " + << "alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + + out << SP << "alpaka::exec(queue, workDiv_" << fNX + << ", expandKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY + << "), "<< ConvertShapeToString(fShapeX) <<", "< #include - namespace SOFIE { /*! 
\brief Gated Recurrent Unit operator @@ -91,7 +90,7 @@ template class ROperator_GRU final : public ROperator { fNSequence_lens(UTILITY::Clean_name(nameSequence_lens)), fNInitial_h(UTILITY::Clean_name(nameInitial_h)), fNY(UTILITY::Clean_name(nameY)), fNY_h(UTILITY::Clean_name(nameY_h)) { - + fInputTensorNames = { fNX, fNW, fNR }; if (!fNB.empty()){ fInputTensorNames.emplace_back(fNB); @@ -123,39 +122,34 @@ template class ROperator_GRU final : public ROperator { * * \param input type of the input tensors */ - std::vector TypeInference(std::vector /*input*/); + std::vector TypeInference(std::vector /*input*/) override; /*! \brief Infers the shape of the output tensors * * \param input shape of the input tensors */ - std::vector> ShapeInference(std::vector> /*input*/); + std::vector> ShapeInference(std::vector> /*input*/) override; /*! \brief Initialize the model * * \param model Model */ - void Initialize(RModel &); + void Initialize(RModel &) override; /*! \brief Generate the inference code * * \param OpName name of the operator */ - std::string Generate(std::string /*OpName*/); - - /*! \brief Generate the code for the Session internal data vectors - * - * \param opName name of the operator - */ - std::string GenerateSessionMembersCode(std::string opName); + std::string Generate(std::string /*OpName*/) override; /*! 
\brief Returns the blas routines needed to compile the generated code */ - std::vector GetBlasRoutines() { return { std::string("Gemm"), std::string("Axpy") }; } + std::vector GetBlasRoutines() override { return { std::string("Gemm"), std::string("Axpy") }; } }; } // namespace SOFIE + // Implementation of the ROperator_GRU class #include "SOFIE/ROperator_GRU.icc" diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc b/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc index f3813c2..38030d1 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc +++ b/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc @@ -175,51 +175,45 @@ void ROperator_GRU::Initialize(RModel& model){ fAttrActivations = {"Sigmoid", "Tanh"}; } } -} -// generate code for Session data members (e.g. internal vectors) -template -std::string ROperator_GRU::GenerateSessionMembersCode(std::string opName) -{ - opName = "op_" + opName; - std::stringstream out; + // To get unique intermediate tensor names, we add the name of the input + // tensor. One might also consider using the index of the operator in the + // RMode, but this information is not available in the current scope. + std::string opName = "op_gru_" + fNX; size_t num_directions = fShapeW[0]; size_t seq_length = (fAttrLayout == 0) ? fShapeX[0] : fShapeX[1]; size_t batch_size = (fAttrLayout == 0) ? 
fShapeX[1] : fShapeX[0]; size_t input_size = fShapeX[2]; + auto declareVector = [&](std::string const &name, std::size_t n){ + std::string fullName = opName + "_" + name; + model.AddIntermediateTensor(fullName, ConvertStringToType(fType), std::vector{n}); + }; + if (fAttrLayout != 0) { - out << "std::vector<" << fType << "> fVec_" << opName << "_input = std::vector<" << fType << ">(" - << seq_length * batch_size * input_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_hidden_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_cell_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; + declareVector("input", seq_length * batch_size * input_size); + declareVector("initial_hidden_state", num_directions * batch_size * fAttrHiddenSize); + declareVector("initial_cell_state", num_directions * batch_size * fAttrHiddenSize); } // Set the feedforward size_t ff_size = seq_length * batch_size * fAttrHiddenSize; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_update_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_reset_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_hidden_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; + declareVector("f_update_gate", ff_size); + declareVector("f_reset_gate", ff_size); + declareVector("f_hidden_gate", ff_size); // gate results size_t hs_size = seq_length * num_directions * batch_size * fAttrHiddenSize; - out << "std::vector<" << fType << "> fVec_" << opName << "_update_gate = std::vector<" << fType << ">(" << hs_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_reset_gate = std::vector<" << fType << ">(" << hs_size << ");\n"; - out << 
"std::vector<" << fType << "> fVec_" << opName << "_hidden_gate = std::vector<" << fType << ">(" << hs_size << ");\n"; + declareVector("update_gate", hs_size); + declareVector("reset_gate", hs_size); + declareVector("hidden_gate", hs_size); // feedback - out << "std::vector<" << fType << "> fVec_" << opName << "_feedback = std::vector<" << fType << ">(" - << batch_size * fAttrHiddenSize << ");\n"; + declareVector("feedback", batch_size * fAttrHiddenSize); // hiddden state if (fAttrLayout != 0 || fNY.empty()) { - out << "std::vector<" << fType << "> fVec_" << opName << "_hidden_state = std::vector<" << fType << ">(" << hs_size << ");\n"; + declareVector("hidden_state", hs_size); } - - out << "\n"; - - return out.str(); } @@ -234,12 +228,14 @@ auto ROperator_GRU::Generate(std::string OpName) size_t input_size = fShapeX[2]; size_t num_directions = fShapeW[0]; + auto getVec = [&](std::string const &name) { return "tensor_op_gru_" + fNX + "_" + name; }; + // set the input if (fAttrLayout == 0) { - out << SP << fType << " *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << fType << " const* " << OpName << "_input = tensor_" << fNX << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_input = fVec_" << OpName << "_input.data();\n"; + out << SP << fType << " * " << OpName << "_input = " << getVec("input") << ";\n"; } else { out << SP << fType << " " << OpName << "_input[" << seq_length * batch_size * input_size << "];\n"; } @@ -261,8 +257,7 @@ auto ROperator_GRU::Generate(std::string OpName) << fNInitial_h << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_initial_hidden_state = fVec_" << OpName - << "_initial_hidden_state.data();\n"; + out << SP << fType << " * " << OpName << "_initial_hidden_state = " << getVec("initial_hidden_state") << ";\n"; } else { out << SP << fType << " " << OpName << "_initial_hidden_state[" << num_directions * batch_size * fAttrHiddenSize << "];\n"; @@ -283,9 +278,9 @@ 
auto ROperator_GRU::Generate(std::string OpName) // Set the feedforward size_t feedforward_size = seq_length * batch_size * fAttrHiddenSize; if (fUseSession) { - out << SP << fType << " * " << OpName << "_f_update_gate = fVec_" << OpName << "_f_update_gate.data();\n"; - out << SP << fType << " * " << OpName << "_f_reset_gate = fVec_" << OpName << "_f_reset_gate.data();\n"; - out << SP << fType << " * " << OpName << "_f_hidden_gate = fVec_" << OpName << "_f_hidden_gate.data();\n"; + out << SP << fType << " * " << OpName << "_f_update_gate = " << getVec("f_update_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_f_reset_gate = " << getVec("f_reset_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_f_hidden_gate = " << getVec("f_hidden_gate") << ";\n"; } else { out << SP << fType << " " << OpName << "_f_update_gate[" << feedforward_size << "] = {0};\n"; out << SP << fType << " " << OpName << "_f_reset_gate[" << feedforward_size << "] = {0};\n"; @@ -294,9 +289,9 @@ auto ROperator_GRU::Generate(std::string OpName) // Set the gates size_t hidden_state_size = seq_length * num_directions * batch_size * fAttrHiddenSize; if (fUseSession) { - out << SP << fType << " * " << OpName << "_update_gate = fVec_" << OpName << "_update_gate.data();\n"; - out << SP << fType << " * " << OpName << "_reset_gate = fVec_" << OpName << "_reset_gate.data();\n"; - out << SP << fType << " * " << OpName << "_hidden_gate = fVec_" << OpName << "_hidden_gate.data();\n"; + out << SP << fType << " * " << OpName << "_update_gate = " << getVec("update_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_reset_gate = " << getVec("reset_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_hidden_gate = " << getVec("hidden_gate") << ";\n"; } else { out << SP << fType << " " << OpName << "_update_gate[" << hidden_state_size << "] = {0};\n"; out << SP << fType << " " << OpName << "_reset_gate[" << hidden_state_size << "] = {0};\n"; @@ -307,14 +302,14 @@ auto 
ROperator_GRU::Generate(std::string OpName) out << SP << fType << " *" << OpName << "_hidden_state = tensor_" << fNY << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_hidden_state = fVec_" << OpName << "_hidden_state.data();\n"; + out << SP << fType << " * " << OpName << "_hidden_state = " << getVec("hidden_state") << ";\n"; } else { out << SP << fType << " " << OpName << "_hidden_state[" << hidden_state_size << "] = {0};\n"; } } if (fUseSession) { - out << SP << fType << " * " << OpName << "_feedback = fVec_" << OpName << "_feedback.data();\n"; + out << SP << fType << " * " << OpName << "_feedback = " << getVec("feedback") << ";\n"; } else { out << SP << fType << " " << OpName << "_feedback[" << batch_size * fAttrHiddenSize << "] = {0};\n"; } diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx index 4d34846..4f685d7 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx @@ -212,6 +212,73 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA() { + std::string op; + op = "\n//------ GATHER_KERNEL_ALPAKA\n"; + op += SP + "struct GatherKernel {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * input, T const * indices, T * output, std::size_t const * output_shape, std::size_t const axis, std::size_t const axisDim, std::size_t const indicesNumElements, std::size_t const * output_strides, std::size_t const * input_strides, std::size_t const ndim) const {\n"; + op += SP + SP + SP + SP + "auto elements = alpaka::uniformElementsND(acc, alpaka::Vec(output_shape));\n"; + op += SP + SP + SP + SP + "for (auto const& elem : elements) {\n"; + + // find flattened index for indices tensor + op += SP + SP + SP + SP + "int64_t idxLinear = 0;\n{\n"; + op += SP + SP + SP + SP + SP + "int64_t stride = 1;\n"; + op += SP + SP + SP + SP + SP + "for (int i = ndim - 1; 
i >= axis; --i) {;\n"; + op += SP + SP + SP + SP + SP + "stride *= (i > axis ? output_shape[i] : 1);\n}\n"; + op += SP + SP + SP + SP + SP + "idxLinear = elem[axis];\n"; + op += SP + SP + SP + SP + SP + "if (idxLinear >= indicesNumElements) idxLinear %= indicesNumElements;\n}\n"; + + // load gather index and wrap negative if any + op += SP + SP + SP + SP + "int64_t k = indices[idxLinear];\n"; + op += SP + SP + SP + SP + "if (k < 0) k += axisDim;\n"; + op += SP + SP + SP + SP + "if (k < 0) k = 0;\n"; + op += SP + SP + SP + SP + "if (k >= axisDim) k = axisDim - 1;\n"; + + // compute input flattened index + op += SP + SP + SP + SP + "size_t input_idx = 0;\n"; + op += SP + SP + SP + SP + "size_t output_idx = 0;\n"; + op += SP + SP + SP + SP + "for (int i = 0; i < ndim; ++i) {\n"; + op += SP + SP + SP + SP + SP + "size_t coord = elem[i];\n"; + op += SP + SP + SP + SP + SP + "output_idx += coord * output_strides[i];\n}\n"; + op += SP + SP + SP + SP + SP + "if (i == axis) coord = k;\n"; + op += SP + SP + SP + SP + SP + "input_idx += coord * input_strides[i];\n}\n"; + + // write to output tensor + op += SP + SP + SP + SP + "output[output_idx] = input[input_idx];\n"; + op += SP + SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "GatherKernel gatherKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + std::stringstream out; + auto length = ConvertShapeToLength(fShapeY); + out << "\n//------ "< workDiv_" << fNY + << "(alpaka::Vec::all((" << length << " + 256 - 1) / 256), " + << "alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + + out << SP << "alpaka::exec(queue, workDiv_" << fNY + << ", gatherKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNIndices + << "), alpaka::getPtrNative(deviceBuf_" << fNY + << "), "<< 
ConvertShapeToString(fShapeY) <<", "<< fAttrAxis <<", "<< fShapeX[fAttrAxis] <<", " + << fShapeIndices.size() <<", " + << ConvertShapeToString(UTILITY::ComputeStrideFromShape(fShapeY)) <<", " + << ConvertShapeToString(UTILITY::ComputeStrideFromShape(fShapeX)) <<", "<< fShapeY.size() + << ",static_cast(" << length << "));\n"; + + return out.str(); + } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx index 046bf56..1c43724 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx @@ -48,6 +48,8 @@ namespace SOFIE{ fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)) { + + fKind = OperatorKind::GEMM; fActivation = activation; fType = "float"; static_assert(std::is_same_v, @@ -60,9 +62,11 @@ namespace SOFIE{ fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNC(UTILITY::Clean_name(nameC)), fNY(UTILITY::Clean_name(nameY)), fActivation(activation) { + fKind = OperatorKind::GEMM; fActivation = activation; fType = "float"; + fInputTensorNames = { fNA, fNB, fNC }; fOutputTensorNames = { fNY }; } @@ -148,7 +152,7 @@ namespace SOFIE{ } } if (model.IsDynamicTensor(fNA) || model.IsDimInputTensor(fNA) ) { - fShapeA = model.GetDynamicTensorShape(fNA); + fShapeA = model.GetDimTensorShape(fNA); fIsDynamic = true; } else { auto shapeA_int = model.GetTensorShape(fNA); @@ -162,7 +166,7 @@ namespace SOFIE{ } if (model.IsDynamicTensor(fNB) || model.IsDimInputTensor(fNB)) { - fShapeB = model.GetDynamicTensorShape(fNB); + fShapeB = model.GetDimTensorShape(fNB); fIsDynamic = true; } else { @@ -191,7 +195,7 @@ namespace SOFIE{ if (!fIsDynamic) { shapeY = ConvertShapeToInt(fShapeY); if (shapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Gemm Op " + fNY + " 
has invalid shape" + ConvertDynamicShapeToString(fShapeY)); + throw std::runtime_error("TMVA SOFIE Gemm Op " + fNY + " has invalid shape" + ConvertDimShapeToString(fShapeY)); } } @@ -252,15 +256,16 @@ namespace SOFIE{ shapeY.erase(shapeY.end()-1); } - if (!fIsDynamic) + if (!fIsDynamic){ model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), shapeY); + } else model.AddDynamicTensor(fNY, model.GetTensorType(fNA), fShapeY); if (model.Verbose()){ std::cout << "Gemm (or MatMul) " << " ---> " << fNY << " shape "; if (fIsDynamic) - std::cout << ConvertDynamicShapeToString(fShapeY) << std::endl; + std::cout << ConvertDimShapeToString(fShapeY) << std::endl; else std::cout << ConvertShapeToString(shapeY) << std::endl; } @@ -279,9 +284,9 @@ namespace SOFIE{ // include a separate scope to avoid defining unique operator temp variables out << "//--- broadcast bias tensor " << fNC << "for Gemm op\n"; out << SP << "{\n"; - out << " float * data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" - << fNC << "," << ConvertShapeToString(fShapeC) << ", " << ConvertDynamicShapeToString(fShapeY) << ");\n"; - auto length = SOFIE::ConvertDynamicShapeToLength(fShapeY); // output size + out << " float * data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" + << fNC << "," << ConvertShapeToString(fShapeC) << ", " << ConvertDimShapeToString(fShapeY) << ");\n"; + auto length = SOFIE::ConvertDimShapeToLength(fShapeY); // output size out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNC2 << ");\n"; out << SP << SP << "delete [] data;\n"; out << SP << "}\n"; @@ -289,6 +294,29 @@ namespace SOFIE{ return out.str(); } + std::string GenerateInitCode_GPU_ALPAKA() override { + std::stringstream out; + // generate initialization code for broadcasting of bias tensor + if (fShapeC.size() != fShapeY.size() && fNC != fNC2) { + // we broadcast here always C in Y output, so target shape is the one of Y + // no need to call UTILITY::UnidirectionalBroadcastShape. 
+ // here in case of parametric shape we need to assume that the parameters will be defined in the initialization code. + auto targetShape = fShapeY; + // include a separate scope to avoid defining unique operator temp variables + out << "//--- broadcast bias tensor " << fNC << "for Gemm op\n"; + out << SP << "{\n"; + out << " float * data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" + << fNC << ".data()," << ConvertShapeToString(fShapeC) << ", " << ConvertDimShapeToString(fShapeY) << ");\n"; + auto length = SOFIE::ConvertDimShapeToLength(fShapeY); // output size + out << SP << SP << "auto hostBuf_"<< fNC2 << " = alpaka::allocBuf(hostAcc, Ext1D::all(Idx{" << length << "}));\n"; + out << SP << SP << "std::memcpy(alpaka::getPtrNative(hostBuf_"<< fNC2 <<"), data, "<< length << " * sizeof(float));\n"; + out << SP << SP << "alpaka::memcpy(queue, deviceBuf_"<< fNC2 << ", hostBuf_"<< fNC2 << ");\n"; + out << SP << SP << "delete [] data;\n"; + out << SP << "}\n"; + } + return out.str(); + } + std::string Generate(std::string opName) override { opName = "op_" + opName; @@ -389,8 +417,105 @@ namespace SOFIE{ return out.str(); } + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + + if (fShapeA.empty() || fShapeB.empty() || fShapeY.empty() || (fNC != "" && fShapeC.empty())) { + throw std::runtime_error("TMVA SOFIE Gemm Op called to Generate without being initialized first"); + } + std::stringstream out; + out << "\n//--------- Gemm_GPU_ALPAKA\n"; + out << SP << "char " << opName << "_transA = " << (fAttrTransA ? "\'t\'" : "\'n\'") << ";\n"; + out << SP << "char " << opName << "_transB = " << (fAttrTransB ? 
"\'t\'" : "\'n\'") << ";\n"; + // need to consider case A and B have dim > 2 (for MatMul) + int64_t dimA = fShapeA.size(); + int64_t dimB = fShapeB.size(); + int64_t dimY = fShapeY.size(); + if (dimA != dimB || dimA != dimY) { + throw std::runtime_error("TMVA SOFIE Gemm(MatMul) has invalid shape for inputs or output"); + } + auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); + auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); + auto k = (fAttrTransA ? fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + std::vector sY = {fShapeY[dimY-2], fShapeY[dimY-1]}; + // extra dimensions in case of stacked MatMul + std::vector sA; + for (int64_t i = 0; i < dimY-2; i++) { + sA.push_back(fShapeY[i]); + } + auto lengthGemm = ConvertDynamicShapeToLength(sY); // size of the Gemm operation + auto lengthExtra = ConvertDynamicShapeToLength(sA); // extra length in case input tensors are of dim>2 (MatMul) + + out << SP << "int " << opName << "_m = " << m << ";\n"; + out << SP << "int " << opName << "_n = " << n << ";\n"; + out << SP << "int " << opName << "_k = " << k << ";\n"; + out << SP << "float " << opName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ";\n"; + + // restricting to a 0 beta since BIAS is configured separately through sofieBLAS interface + out << SP << "float " << opName << "_beta = 0;\n"; + + // case bias is present + if (!fNC.empty()){ + if (fNC2 == fNC) { + // add a check in case broadcasting was not needed or done outside of session + // C should have smaller dimension of Y + if (!fIsDynamic) { + if (std::stoi(lengthGemm) != static_cast(ConvertShapeToLength(fShapeC))) + throw std::runtime_error("TMVA SOFIE Gemm Op " + opName + " Bias tensor has not correct size " + + ConvertShapeToString(fShapeC) + " output length " + lengthGemm); + } else { + // add a dynamic check (C should not be a dynamic tensor) + out << SP << "assert(" << lengthGemm << " != " << 
ConvertShapeToLength(fShapeC) << ");\n"; + } + } + } else { + //in this case fAttrBeta needs to be equal to zero otherwise second time we run we will use + // the previous result + if (fAttrBeta != 0) { + throw std::runtime_error("TMVA SOFIE Gemm Op " + opName + " Bias tensor is not present but beta value in Gemm is not zero"); + } + } + + // include MatMul case where we stack the Gemm operations + // exclude case where we have only 1's in the additional dims + bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra) > 1); + if (doStackMul) { + out << SP << "size_t " << opName << "_yoffset = 0;\n"; // needed if we stack the gemm operations + out << SP << "for (int i = 0; i < " << lengthExtra << "; i++){\n"; + out << SP; + } + // in the case of bias + if (!fNC.empty()){ + if (fActivation == EActivationType::RELU){ + out << SP << "blas.gemmrelu("< GetBlasRoutines() override { return { std::string("Gemm"), std::string("Gemv") }; } + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNY); + fNY = fusable_tensor_name; + fOutputTensorNames[0] = fNY; + } + std::string GetBlasConfig(){ + int64_t dimA = fShapeA.size(); + int64_t dimB = fShapeB.size(); + auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); + auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); + auto k = (fAttrTransA ? 
fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + return n+", "+m+", "+k; + } }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc b/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc index bec7760..ebf4daf 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc +++ b/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc @@ -1,7 +1,6 @@ #ifndef SOFIE_ROPERATOR_LSTM_I #define SOFIE_ROPERATOR_LSTM_I - namespace SOFIE { template @@ -291,7 +290,7 @@ auto ROperator_LSTM::Generate(std::string OpName) // set the input if (fAttrLayout == 0) { - out << SP << fType << " *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << fType << " const *" << OpName << "_input = tensor_" << fNX << ";\n"; } else { if (fUseSession) out << SP << fType << " * " << OpName << "_input = fVec_" << OpName << "_input.data();\n"; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx index 17b77b3..4a328de 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx @@ -81,7 +81,7 @@ public: throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found."); } bool isDynamic = model.IsDynamicTensor(fNX); - fShapeX = model.GetDynamicTensorShape(fNX); + fShapeX = model.GetDimTensorShape(fNX); fShapeY = fShapeX; model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); // Type of the output @@ -93,13 +93,13 @@ public: // Shape of fShapeX[0, ..., fAxis) fAxesShape = std::vector(fShapeX.begin(), fShapeX.begin() + fAxis); // Length of the axes - fAxesLength = ConvertDynamicShapeToLength(fAxesShape); + fAxesLength = ConvertDimShapeToLength(fAxesShape); // Shape of fShapeX[fAxis, ..., fSize) fNormalizedShape = std::vector(fShapeX.begin() + fAxis, fShapeX.end()); // Length of the normalized axis - fNormalizedLength = ConvertDynamicShapeToLength(fNormalizedShape); + fNormalizedLength = ConvertDimShapeToLength(fNormalizedShape); // 
length of the input - fLength = ConvertDynamicShapeToLength(fShapeX); + fLength = ConvertDimShapeToLength(fShapeX); // Type of mean and std ETensorType type = (fAttrStashType == 1) ? ETensorType::FLOAT : model.GetTensorType(fNX); // Mean @@ -146,7 +146,7 @@ public: out << SP << "// Broadcasting the bias of LayerNormalization op\n"; out << SP << "{\n"; out << SP << SP << "float* data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_"; - out << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertDynamicShapeToString(fShapeX) << ");\n"; + out << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeX) << ");\n"; out << SP << "std::copy(data, data + " << fLength << ", tensor_" << fNBroadcastedB << ");\n"; out << SP << "delete[] data;\n"; out << SP << "}\n"; @@ -339,5 +339,4 @@ public: } // namespace SOFIE - #endif diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx index 8fefa6d..0f3b699 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx @@ -75,6 +75,60 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override { + std::string op; + op = "\n//------ LEAKY_RELU_KERNEL_ALPAKA\n"; + op += "struct LeakyReluKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements, T alpha) const {\n"; + op += SP + SP + "const auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + "if(idx < numElements) {\n"; + op += SP + SP + "out[idx] = data[idx] >= 0 ? 
data[idx] : alpha * data[idx];\n"; + op += SP + SP + "}\n"; + op += SP + "}\n"; + op += "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return "LeakyReluKernel leakyReluKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator LeakyRelu called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertShapeToLength(fShape); + out << "\n//------ LEAKY_RELU_GPU_ALPAKA\n"; + out << SP << "constexpr float " << OpName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << falpha << ";\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const kernelCfg_" << fNX << " = {elementsPerGrid_" << fNX << ", elementsPerThread_" << fNX << "};\n"; + out << SP << "auto const workDiv_" << fNX << " = alpaka::getValidWorkDiv(kernelCfg_" << fNX << ", devAcc, leakyReluKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "), " << OpName << "_alpha);\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX + << ", leakyReluKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "), " << OpName << "_alpha);\n"; + return out.str(); + } + + + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc b/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc index c03c1c2..c10c2a5 100644 --- 
a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc +++ b/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc @@ -1,7 +1,6 @@ #ifndef SOFIE_ROPERATOR_RNN_I #define SOFIE_ROPERATOR_RNN_I - namespace SOFIE { template @@ -230,7 +229,7 @@ auto ROperator_RNN::Generate(std::string OpName) // set the input if (fAttrLayout == 0) { if (fType == "float") { - out << SP << "float *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << "float const*" << OpName << "_input = tensor_" << fNX << ";\n"; } } else { if (fUseSession) diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx index 8af272d..3e8605e 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx @@ -8,7 +8,6 @@ #include #include - namespace SOFIE{ template @@ -89,9 +88,9 @@ public: model.AddDynamicTensor(fNOutput, type, fShape); } if (model.Verbose()) { - std::cout << "Range -> output is " << fNOutput << " "; - if (fIsOutputConstant) std::cout << ConvertDynamicShapeToString(fShape) << std::endl; - else std::cout << ConvertDynamicShapeToString(model.GetDynamicTensorShape(fNOutput)) << std::endl; + std::cout << "Range -> output is " << fNOutput << " : " << ConvertShapeToString(fShape); + if (fIsOutputConstant) std::cout << " : " << ConvertValuesToString(model.GetTensorData(fNOutput)); + std::cout << std::endl; } } @@ -121,5 +120,5 @@ public: }; }//SOFIE - + #endif //SOFIE_ROPERATOR_RANGE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx index 8062dca..5fb2f04 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx @@ -24,6 +24,7 @@ public: ROperator_Relu(){} ROperator_Relu(std::string nameX, std::string nameY): fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ + fKind = OperatorKind::RELU; fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; } @@ -42,11 +43,11 @@ public: throw 
std::runtime_error("TMVA SOFIE Relu Op Input Tensor " + fNX + " is not found in model"); } - fShape = model.GetDynamicTensorShape(fNX); + fShape = model.GetDimTensorShape(fNX); model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); if (model.Verbose()) { - std::cout << "Relu : " << fNX << " -> " << fNY << " " << ConvertDynamicShapeToString(fShape) << std::endl; + std::cout << "Relu : " << fNX << " -> " << fNY << " " << ConvertDimShapeToString(fShape) << std::endl; } } @@ -65,6 +66,48 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA() { + std::string op; + op = "\n//------ RELU_KERNEL_ALPAKA\n"; + op += SP + "struct ReluKernel{\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T* data, std::size_t numElements) const {\n"; + op += SP + SP + SP + "for (auto i : alpaka::uniformElements(acc, numElements)) {\n"; + op += SP + SP + SP + "data[i] = (data[i] < 0) ? 0 : data[i];\n"; + op += SP + SP + "}\n"; + op += SP + "}\n};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "ReluKernel reluKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Relu called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertDynamicShapeToLength(fShape); + out << "\n//------ RELU_GPU_ALPAKA\n"; + out << SP << "alpaka::WorkDivMembers workDiv_"<::all("<<(stoi(length)+256-1)/256<<"), alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX << ", reluKernel, alpaka::getPtrNative(deviceBuf_" << fNX << "), static_cast(" << length << ")); \n"; + return out.str(); + } + + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const 
std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx index 66a7e09..0a21709 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx @@ -6,7 +6,10 @@ #include "SOFIE/RModel.hxx" #include +#include #include +#include + namespace SOFIE{ @@ -19,17 +22,20 @@ class ROperator_Reshape final : public ROperator private: bool fVerbose = false; + bool fDimInput = false; + bool fDynamicShape = false; ReshapeOpMode fOpMode = Reshape; // type of Reshape operator int fAllowZero = 0; // (for Reshape) zero in tensor shape makes output shape equal to input tensor shape int fAxis = 1; // (for Flatten) std::string fNData; // input data tensor name - std::string fNShape; // reshape tensor name + std::string fNInput2; // reshape or axes tensor name depending on operator std::string fNOutput; // output tensor name - std::vector fShapeInput; // input shape data - std::vector fShapeOutput; // output shape data + std::vector fShapeInput; // input shape data + std::vector fShapeOutput; // output shape data std::vector fAttrAxes; // axes attributes (provided for all version of Squeeze/Unsqueeze) + std::vector fShape; // shape tensor values provided for Reshape public: @@ -42,16 +48,16 @@ public: } ROperator_Reshape(){} - ROperator_Reshape(ReshapeOpMode opMode, int attr_value, std::string nameData, std::string nameShape, std::string nameOutput) - : fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), fNShape(UTILITY::Clean_name(nameShape)), - fNOutput(UTILITY::Clean_name(nameOutput)) + ROperator_Reshape(ReshapeOpMode opMode, int attr_value, std::string nameData, std::string nameInput2, std::string nameOutput) + : fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), 
fNInput2(UTILITY::Clean_name(nameInput2)), + fNOutput(UTILITY::Clean_name(nameOutput)) { if (opMode == Reshape) fAllowZero = attr_value; if (opMode == Flatten) fAxis = attr_value; fInputTensorNames = { fNData }; - if(!fNShape.empty()){ - fInputTensorNames.emplace_back(fNShape); + if(!fNInput2.empty()){ + fInputTensorNames.emplace_back(fNInput2); } fOutputTensorNames = { fNOutput }; } @@ -70,94 +76,153 @@ public: auto ret = std::vector(1, input[0]); return ret; } + std::vector> ShapeInference(std::vector> input) override { + return input; + } // output shape - std::vector> ShapeInference(std::vector> input) override { - std::vector> ret; + std::vector> ShapeInference(const std::vector> & input) { + std::vector> ret; auto & input_shape = input[0]; - if (fOpMode == Reshape) { - if (input.size() != 2) throw std::runtime_error("TMVA SOFIE Reshape Op needs 2 input tensors"); - auto output_shape = input[1]; // the provided shape - size_t input_length = ConvertShapeToLength(input_shape); - size_t output_length = ConvertShapeToLength(output_shape); - // (input_length == output_length) is the easy case : (2,3,4) -> (2,12) - if (input_length != output_length) { - if ((output_length == 0 && fAllowZero == 0) || static_cast(output_length) < 0) { - // in this case value 0 or -1 in shape are automatically corrected - bool replacementDone = false; - for (size_t i = 0; i < output_shape.size(); i++) { - if (output_shape[i] == 0 || output_shape[i] == static_cast(-1)) { - if (replacementDone) { - throw std::runtime_error("TMVA Reshape Op : output shape has multiple negative or zero values"); + // correct the provided shape (here we have the value) for 0 or -1 + std::vector output_shape(fShape.size()); + assert(!fShape.empty() && !fDynamicShape); + for (size_t i = 0; i < output_shape.size(); i++) { + if (fShape[i] > 0 || (fAllowZero && fShape[i] >= 0)) + output_shape[i] = Dim{ static_cast(fShape[i]) }; + else if (!fAllowZero && fShape[i] == 0) + output_shape[i] = input_shape[i]; + } + 
// now case of -1 in shape + for (size_t i = 0; i < output_shape.size(); i++) { + if (fShape[i] == -1) { + auto tmp = output_shape; + tmp.erase(tmp.begin() + i); + auto tmp_length = ConvertDimShapeToLength(tmp); + auto input_length = ConvertDimShapeToLength(input_shape); + if (fVerbose) + std::cout << "reshape- try simplifying " << ConvertDimShapeToString(input_shape) << " with length " + << input_length << " to " << tmp_length << std::endl; + + if (IsInteger(tmp_length) && IsInteger(input_length)) + output_shape[i] = Dim{static_cast(std::stoi(input_length) / std::stoi(tmp_length))}; + else { + //we can try simplifying expression if tmp_length is integer and part of input_length + // contains tmp_length + bool canSimplify = false; + std::vector reduced_input; + if (IsInteger(tmp_length)) { + + // try to tokenize with * the input length + + std::stringstream ss(input_length); + + std::string token; + + // Tokenizing w.r.t. space '*' + while(getline(ss, token, '*')) + { + // remove any whitespace + token.erase(std::remove_if(token.begin(), token.end(), + [](unsigned char x) { return std::isspace(x); }), token.end()); + if (token != tmp_length) { + if (IsInteger(token)) { + size_t il = static_cast(std::stoi(input_length)); + size_t tl = static_cast(std::stoi(tmp_length)); + if ((il % tl) == 0) { + canSimplify = true; + reduced_input.push_back(Dim{il / tl}); + } + } else { + reduced_input.push_back(Dim{token}); + } + } else { + // token is equal to tmp_length, can be not considered and is simplified + canSimplify = true; + } } - auto tmp = output_shape; - tmp.erase(tmp.begin() + i); - auto tmp_length = ConvertShapeToLength(tmp); - output_shape[i] = input_length / tmp_length; - replacementDone = true; } + if (canSimplify) { + // if length contains * we need to add some brackets + std::string res_shape = ConvertDimShapeToLength(reduced_input); + if (res_shape.find('*') != std::string::npos) + output_shape[i] = Dim{std::string("(") + res_shape + ")", static_cast(-1)}; + 
else + output_shape[i] = Dim{res_shape}; + } + if (!canSimplify) + output_shape[i] = Dim{std::string("(") + input_length + " / (" + tmp_length + "))", static_cast(-1)}; } - if (fVerbose) - std::cout << "Reshape: correct output shape from " << ConvertShapeToString(input[1]) - << " to " << ConvertShapeToString(output_shape) << std::endl; - } - if (ConvertShapeToLength(output_shape) != input_length) { - throw std::runtime_error("TMVA Reshape Op : Invalid shapes : " + ConvertShapeToString(input_shape) + - ConvertShapeToString(output_shape)); + + break; // cannot have more than -1 } + // throw std::runtime_error( + // "TMVA Reshape Op : output shape has multiple negative or zero values"); + } + + if (fVerbose) + std::cout << "Reshape: correct output shape to " << ConvertShapeToString(output_shape) << std::endl; + + if (!fDimInput && ConvertDimShapeToLength(output_shape) != ConvertDimShapeToLength(input_shape)) { + throw std::runtime_error("TMVA Reshape Op : Invalid shapes : " + ConvertShapeToString(input_shape) + + ConvertShapeToString(output_shape)); } ret.push_back(output_shape); } else if (fOpMode == Flatten) { - // flattenig case - size_t inputSize = ConvertShapeToLength(input_shape); - size_t b = input[0][0]; - std::vector newShape = {b, inputSize / b}; + // flatten case + if (fAxis < 0) + fAxis += input_shape.size(); + auto s1 = std::vector(input_shape.begin(), input_shape.begin() + fAxis); + auto s2 = std::vector(input_shape.begin() + fAxis, input_shape.end()); + auto l1 = ConvertDimShapeToLength(s1); + auto l2 = ConvertDimShapeToLength(s2); + std::vector newShape = {Dim{l1}, Dim{l2}}; ret.push_back(newShape); - } else if (fOpMode == Squeeze) { // squeeze // assume no axis is provided - remove all axes with value equal to 1 - auto output_shape = input[0]; - if (input.size() == 1) { + auto output_shape = input_shape; + if (fAttrAxes.empty()) { size_t i = 0; while (i < output_shape.size()) { - if (output_shape[i] == 1 ) { + if (output_shape[i] == Dim{1}) { 
output_shape.erase(output_shape.begin() + i); - } - else { + } else { i++; } } - } else if (input.size() == 2) { - auto & axes = input[1]; - for (size_t i = 0; i < axes.size(); i++){ - if (output_shape[axes[i]] != 1) - throw std::runtime_error("TMVA Squeeze Op : Invalid axes : " + ConvertShapeToString(axes) + - ConvertShapeToString(output_shape)); + } else { + auto &axes = fAttrAxes; + for (size_t i = 0; i < axes.size(); i++) { + if (axes[i] < 0) + axes[i] += input_shape.size(); + if (!(output_shape[axes[i]] == Dim{1})) + throw std::runtime_error("TMVA Squeeze Op : Invalid axis value " + std::to_string(axes[i]) + + " for " + ConvertShapeToString(output_shape)); output_shape.erase(output_shape.begin() + axes[i]); } } ret.push_back(output_shape); } - else if (fOpMode == Unsqueeze) { // unsqueeze - assert(input.size() == 2); - auto output_shape = input[0]; - auto &axes = input[1]; + std::cout << "doing unsqueeze....\n"; + assert(!fAttrAxes.empty()); + auto output_shape = input_shape; + auto &axes = fAttrAxes; // output rank int64_t r = input[0].size() + axes.size(); - for (auto & a : axes) { + for (auto &a : axes) { int64_t i = static_cast(a); - if ( i < -r || i > r - 1 ) + if (i < -r || i > r - 1) throw std::runtime_error("TMVA Unsqueeze Op - axes input is not in correct range"); if (i >= 0) - output_shape.insert(output_shape.begin() + i, 1); + output_shape.insert(output_shape.begin() + i, Dim{1}); else - //negative axes - output_shape.insert(output_shape.end() + i + 1, 1); + // negative axes + output_shape.insert(output_shape.end() + i + 1, Dim{1}); } ret.push_back(output_shape); } @@ -166,34 +231,51 @@ public: void Initialize(RModel& model) override { + std::cout << "initialize reshape op type " << fOpMode << " - " << fNInput2 << " " << fNData << std::endl; fVerbose = model.Verbose(); if (model.CheckIfTensorAlreadyExist(fNData) == false) { // input must be a graph input, or already initialized intermediate tensor throw std::runtime_error("TMVA Reshape Op Input 
Tensor " + fNData + " is not found in model"); } - fShapeInput = model.GetTensorShape(fNData); - // check if optional shape tensor exist - if (!fNShape.empty()) { - if (model.CheckIfTensorAlreadyExist(fNShape)) { - auto dptr = model.GetInitializedTensorData(fNShape); - auto input_shape = static_cast(dptr.get()); - auto vec = model.GetTensorShape(fNShape); - assert(vec.size() == 1); - size_t n = vec[0]; // size of shape input tensor - - std::vector descShape(n); - std::copy(input_shape, input_shape + n, descShape.begin()); - fShapeOutput = ShapeInference({fShapeInput, descShape})[0]; - // set flag to not write tensor in weight file. Its data will be hard-coded in way model is constructed - model.SetNotWritableInitializedTensor(fNShape); + fShapeInput = model.GetDimTensorShape(fNData); + fDimInput = model.IsDynamicTensor(fNData); + // check if optional tensor exists defining shape or axes + if (!fNInput2.empty()) { + if (model.CheckIfTensorAlreadyExist(fNInput2)) { + if (model.IsConstantTensor(fNInput2) || model.IsInitializedTensor(fNInput2)) { + // assume input shape is an initialized tensor + auto dptr = model.GetInitializedTensorData(fNInput2); + auto values = static_cast(dptr.get()); + auto vec = model.GetTensorShape(fNInput2); + size_t n = 1; + if (vec.size() > 0) + n = vec[0]; // size of shape input tensor + // copy values in fShape vector or fAttrAxes + if (fOpMode == Reshape) + fShape = std::vector(values, values + n); + else + fAttrAxes = std::vector(values, values + n); + + fShapeOutput = ShapeInference({fShapeInput})[0]; + // set flag to not write tensor in weight file. 
Its data will be hard-coded in way model is constructed + model.SetNotWritableInitializedTensor(fNInput2); + } else { + // we cannot get shape at initialization time but at run-time + fDynamicShape = true; + // size of shape output us given by size of shape input tensor + auto shapeInput2 = model.GetTensorShape(fNInput2); + fShapeOutput.resize(shapeInput2[0]); + for (size_t i = 0; i < fShapeOutput.size(); i++) { + fShapeOutput[i] = Dim{ std::string("s_") + fNOutput + "_" + std::to_string(i)}; + } + } } else { - throw std::runtime_error("TMVA Reshape Op Shape Tensor " + fNShape + " is not found in model"); + throw std::runtime_error("TMVA Reshape Op 2nd input Tensor " + fNInput2 + " is not found in model"); } } else if (!fAttrAxes.empty()) { - // case fNShape is empty and axes are provided as attributes - std::vector descShape(fAttrAxes.size()); - std::copy(fAttrAxes.begin(), fAttrAxes.end(), descShape.begin()); - fShapeOutput = ShapeInference({fShapeInput, descShape})[0]; + // case fNShape is empty and axes are provided as attributes (e.g. 
for Unsqueeze) + std::cout << "attribute axes exists\n"; + fShapeOutput = ShapeInference({fShapeInput})[0]; } else if (fOpMode == Flatten || fOpMode == Squeeze) { fShapeOutput = ShapeInference({fShapeInput})[0]; } else { @@ -203,14 +285,26 @@ public: if (model.IsInitializedTensor(fNData) && model.GetTensorType(fNData) == ETensorType::INT64) { fIsOutputConstant = true; auto inputData = static_cast(model.GetInitializedTensorData(fNData).get()); - if (ConvertShapeToLength(fShapeInput) != ConvertShapeToLength(fShapeOutput)) + auto o_shape = ConvertShapeToInt(fShapeOutput); + if (ConvertShapeToLength(ConvertShapeToInt(fShapeInput)) != ConvertShapeToLength(o_shape) ) throw std::runtime_error("TMVA Reshape Op : Invalid Input/Output lengths"); - model.AddConstantTensor(fNOutput, fShapeOutput, inputData); + model.AddConstantTensor(fNOutput, o_shape, inputData); if (model.Verbose()) { std::cout << Name() << " : " << fNData << " " << ConvertShapeToString(fShapeInput) << " --> " << fNOutput << " (constant) " << ConvertShapeToString(fShapeOutput) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeOutput), inputData) << std::endl; + ConvertValuesToString(ConvertShapeToLength(o_shape), inputData) << std::endl; } - } else { + } + // for shape tensors we can have it if output shape is size==1 or a scalar + else if (model.IsShapeTensor(fNData) && fShapeOutput.size() <=1) { + fIsOutputConstant = true; + auto inputData = model.GetShapeTensorValues(fNData); + model.AddShapeTensor(fNOutput, inputData); + if (model.Verbose()) { + std::cout << Name() << " : " << fNData << " " << ConvertShapeToString(fShapeInput) << " --> " << fNOutput << " (shape) " << ConvertShapeToString(fShapeOutput) << " : " << + ConvertShapeToString(inputData) << std::endl; + } + } + else { // non-constant case model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput); if (model.Verbose()) @@ -218,32 +312,74 @@ public: } } - std::string Generate(std::string OpName) override { + 
std::string Generate(std::string opName) override { if (fIsOutputConstant) return ""; //no op for constant tensors - OpName = "op_" + OpName; + std::stringstream out; + std::string opType = "Reshape"; + if (fOpMode == Flatten) + opType = "Flatten"; + else if (fOpMode == Squeeze) + opType = "Squeeze"; + else if (fOpMode == Unsqueeze) + opType = "Unsquueze"; + + out << SP << "///--------" << opType << " operator " << opName << " --> " << ConvertShapeToString(fShapeOutput) << "\n"; + + // in case of dynamic output shape we need to set the shape value from input shape tensor + // and take case of the zero values + if (fDynamicShape) { + for (size_t i = 0; i < fShapeOutput.size(); i++) { + // since fNInput2 values are int64_t, should we check if they are negative? + out << SP << "size_t " << fShapeOutput[i].param << " = " << "tensor_" << fNInput2 << "[" << i << "];\n"; + if (!fAllowZero) + out << SP << "if (tensor_" << fNInput2 << "[" << i << "] <= 0 ) " + << fShapeOutput[i].param << " = " << fShapeInput[i] << ";\n"; + } + } // output of reshape is same as input - size_t length = ConvertShapeToLength(fShapeOutput); - if (length != ConvertShapeToLength(fShapeInput)) { + auto lengthOut = ConvertDimShapeToLength(fShapeOutput); + auto lengthIn = ConvertDimShapeToLength(fShapeInput); + if (lengthOut != lengthIn) { + // check needs to be done at run-time + out << SP << "if (" << lengthOut << "!=" << lengthIn << ")\n"; + out << "throw std::runtime_error(\"TMVA SOFIE Reshape Op : output lengths is different than input one\");\n"; + } + + + out << SP << "std::copy( tensor_" << fNData << ", tensor_" << fNData << " + " << lengthIn << ", " << "tensor_" << fNOutput + << ");\n"; + return out.str(); + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; //no op for constant tensors + + opName = "op_" + opName; + + // output of reshape is same as input + auto length = ConvertDimShapeToLength(fShapeOutput); + if (length != 
ConvertDimShapeToLength(fShapeInput)) { throw std::runtime_error("TMVA SOFIE Reshape Op : wrong output shape - is " + - ConvertShapeToString(fShapeOutput) + " and input is " + - ConvertShapeToString(fShapeInput)); + ConvertDimShapeToString(fShapeOutput) + " and input is " + + ConvertDimShapeToString(fShapeInput)); } std::stringstream out; - std::string opName = "Reshape"; + opName += "_Reshape"; if (fOpMode == Flatten) - opName = "Flatten"; + opName += "_Flatten"; else if (fOpMode == Squeeze) - opName = "Squeeze"; + opName += "_Squeeze"; else if (fOpMode == Unsqueeze) - opName = "Unsquueze"; + opName += "_Unsquueze"; + - out << SP << "///--------" << opName << " operator\n" << std::endl; - out << SP << "std::copy( tensor_" << fNData << ", tensor_" << fNData << " + " << length << ", " << "tensor_" << fNOutput - << ");\n"; + out << SP << "///-------" << opName << " operator\n" << std::endl; + out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNOutput << ", deviceBuf_" << fNData << ");\n"; return out.str(); } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx index 52bdeae..34e69eb 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx @@ -101,6 +101,26 @@ public: return out.str(); } + std::string Generate_GPU_ALPAKA(std::string OpName) override { + // no need to generate code if the output is constant + if (fIsOutputConstant) return ""; + + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Shape op called to Generate without being initialized first"); + } + std::stringstream out; + + out << "\n//------ Shape\n"; + // add a dummy statement to avoid warning for unused input + out << SP << "(void) deviceBuf_" << fNX << ";\n"; + size_t length = ConvertShapeToLength(fOutput_shape); + for (size_t id = 0; id < length; id++) { + out << SP << "deviceBuf_" << fNY << "["<< id << "] = " << fShape[fStart+id] << ";\n"; + } 
+ return out.str(); + } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx index 68edd01..5edbcf9 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx @@ -61,6 +61,59 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override { + std::string op; + op = "\n//------ SIGMOID_KERNEL_ALPAKA\n"; + op += "struct SigmoidKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements) const {\n"; + op += SP + SP + "const auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + "if(idx < numElements) {\n"; + op += SP + SP + SP + SP + "out[idx] = static_cast(1) / (static_cast(1) + exp(-data[idx]));\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "SigmoidKernel sigmoidKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Sigmoid called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertShapeToLength(fShape); + out << "\n//------ SIGMOID_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const kernelCfg_" << fNX << " = {elementsPerGrid_" << fNX << ", elementsPerThread_" << fNX << "};\n"; + out << SP << "auto const workDiv_" << fNX << " = alpaka::getValidWorkDiv(kernelCfg_" << fNX << ", devAcc, sigmoidKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX + << ", 
sigmoidKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + return out.str(); + } + + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } + std::vector GetStdLibs() override { return { std::string("cmath") };} }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx index 63fbcb3..6335db3 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx @@ -153,6 +153,72 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) { + std::string op; + op = "\n//------ SPLIT_KERNEL_ALPAKA\n"; + op += SP + "struct SplitKernel {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * input, T * output,"; + op += "std::size_t const * input_strides, std::size_t const * output_strides, std::size_t const split_axis, "; + op += "std::size_t const axis_offset, std::size_t const ndim) const {\n"; + op += SP + SP + SP + SP + "auto elements = alpaka::uniformElementsND(acc, alpaka::Vec(output_shape));\n"; + op += SP + SP + SP + SP + "for (auto const& elem : elements) {\n"; + op += SP + SP + SP + SP + SP + "size_t input_idx = 0;\n"; + op += SP + SP + SP + SP + SP + "size_t output_idx = 0;\n"; + op += SP + SP + SP + SP + SP + "for (int i = 0; i < ndim; ++i) {\n"; + op += SP + SP + SP + SP + SP + SP + "size_t output_coord = elem[i];\n"; + op += SP + SP + SP + SP + SP + SP + "size_t input_coord = (i == split_axis) ? 
(output_coord + axis_offset) : output_coord;\n"; + op += SP + SP + SP + SP + SP + SP + "input_idx += input_coord * input_strides[i];\n"; + op += SP + SP + SP + SP + SP + SP + "output_idx += output_coord * output_strides[i];\n}\n"; + op += SP + SP + SP + SP + SP + "output[output_idx] = input[input_idx];\n"; + op += SP + SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "SplitKernel splitKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fOutputShapes.empty()){ + throw std::runtime_error("TMVA SOFIE Operator Split called to Generate without being initialized first"); + } + + std::stringstream out; + out << "\n//------ SPLIT_GPU_ALPAKA\n"; + + bool axis_is_innermost = (fAxis == static_cast(fInputShape.size()) - 1) + && (UTILITY::ComputeStrideFromShape(fInputShape)[fInputShape.size()-1] == 1); + out << SP <<"size_t "<(" << length << ") * sizeof(float);\n"; + out << SP << SP << SP << "alpaka::memcpy(queue, "< workDiv_" << fNYs[i] + << "(alpaka::Vec::all((" << length << " + 256 - 1) / 256), " + << "alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + + out << SP << "alpaka::exec(queue, workDiv_" << fNYs[i] + << ", splitKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNYs[i] + << "), "<< ConvertShapeToString(UTILITY::ComputeStrideFromShape(fInputShape)) <<", "<\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * __restrict__ tensor_X,"; + op += SP + SP + SP + "T * __restrict__ tensor_Y, const int64_t * __restrict__ shape_X,"; + op += SP + SP + SP + "const int64_t * __restrict__ stride_X, const int64_t * __restrict__ shape_Y,"; + op += SP + SP + SP + "const int64_t * __restrict__ stride_Y, std::size_t const ndim) const {\n"; + op += SP + SP + SP + SP + "auto elements = alpaka::uniformElementsND(acc, 
alpaka::Vec(shape_Y));\n"; + op += SP + SP + SP + SP + "for (auto const& elem: elements) {\n"; + op += SP + SP + SP + SP + SP + "size_t input_idx = 0;\n"; + op += SP + SP + SP + SP + SP + "size_t output_idx = 0;\n"; + op += SP + SP + SP + SP + SP + "for (int i = 0; i < ndim; ++i) {\n"; + op += SP + SP + SP + SP + SP + SP + "size_t input_coord = elem[i] % shape_X[i];\n"; + op += SP + SP + SP + SP + SP + SP + "input_idx += input_coord * stride_X[i];\n"; + op += SP + SP + SP + SP + SP + "output_idx += elem[i] * stride_Y[i];\n}\n"; + op += SP + SP + SP + SP + SP + "tensor_Y[output_idx] = tensor_X[input_idx];\n"; + op += SP + SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "TileKernel tileKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShapeInput.empty() || fShapeY.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Tile called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertShapeToLength(fShapeY); + out << "\n//------ TILE_GPU_ALPAKA\n"; + out << SP << "alpaka::WorkDivMembers workDiv_" << fNY + << "(alpaka::Vec::all((" << length << " + 256 - 1) / 256), " + << "alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + + out << SP << "alpaka::exec(queue, workDiv_" << fNY + << ", tileKernel, alpaka::getPtrNative(deviceBuf_" << fNInput + << "), alpaka::getPtrNative(deviceBuf_" << fNY + << "), "<< ConvertShapeToString(fShapeInput)<<", "<< ConvertShapeToString(UTILITY::ComputeStrideFromShape(fShapeInput)) <<", " + <\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * input, T const * output, std::size_t * shape, std::size_t * strides) const {\n"; + op += SP + SP + SP + "for (auto i : alpaka::uniformElementsND(acc, shape)) {\n"; + op += SP + SP + SP + SP + "size_t input_idx = 0;\n"; + + 
return op; + } + }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx index 28ac093..19d217d 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx @@ -7,7 +7,6 @@ #include - namespace SOFIE{ @@ -150,32 +149,86 @@ public: fShapeY = fShapeA; } // check case of constant output (if all inputs are defined) - if (model.IsInitializedTensor(fNA) && model.IsInitializedTensor(fNB) && model.IsInitializedTensor(fNC)) { - std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; + if (model.IsInitializedTensor(fNC)) { + std::string nameC = fNBroadcastedC.empty()? fNC : fNBroadcastedC; - auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); - auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); auto dataC = static_cast(model.GetInitializedTensorData(nameC).get()); - std::vector dataY(ConvertShapeToLength(fShapeY)); - for (size_t i = 0; i < dataY.size(); i++) - dataY[i] = (dataC[i]) ? dataA[i] : dataB[i]; - model.AddConstantTensor(fNY, fShapeY, dataY.data()); - // flag tensors to not be written in a file - model.SetNotWritableInitializedTensor(nameA); - model.SetNotWritableInitializedTensor(nameB); model.SetNotWritableInitializedTensor(nameC); + T * dataA = nullptr; + T * dataB = nullptr; + std::vector shapeDataA; + std::vector shapeDataB; + if (model.IsInitializedTensor(fNA)) { + std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; + dataA = static_cast(model.GetInitializedTensorData(nameA).get()); + // flag tensors to not be written in a file + model.SetNotWritableInitializedTensor(nameA); + } else if (model.IsShapeTensor(fNA)) + shapeDataA = model.GetShapeTensorValues(fNA); + if (model.IsInitializedTensor(fNB)) { + std::string nameB = fNBroadcastedB.empty()? 
fNB : fNBroadcastedB; + dataB = static_cast(model.GetInitializedTensorData(nameB).get()); + model.SetNotWritableInitializedTensor(nameB); + } else if (model.IsShapeTensor(fNB)) + shapeDataB = model.GetShapeTensorValues(fNB); - fIsOutputConstant = true; - if (model.Verbose()) + std::vector dataY; + std::vector shapeDataY; + + bool isOutputConstantTensor = true; + if (dataA && dataB) { + dataY.resize(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < dataY.size(); i++) + dataY[i] = (dataC[i]) ? dataA[i] : dataB[i]; + } + else if (dataA && shapeDataB.size()>0 ) { + shapeDataY.resize(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < shapeDataY.size(); i++) { + shapeDataY[i] = (dataC[i]) ? Dim{size_t(dataA[i])} : shapeDataB[i]; + isOutputConstantTensor &= !shapeDataY[i].isParam; + } + } + else if (dataB && shapeDataA.size()>0 ) { + shapeDataY.resize(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < shapeDataY.size(); i++) { + shapeDataY[i] = (dataC[i]) ? shapeDataB[i] : Dim{size_t(dataB[i])}; + isOutputConstantTensor &= !shapeDataY[i].isParam; + } + } + else if (shapeDataB.size() > 0 && shapeDataA.size()>0 ) { + shapeDataY.resize(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < shapeDataY.size(); i++) { + shapeDataY[i] = (dataC[i]) ? 
shapeDataA[i] : shapeDataB[i]; + isOutputConstantTensor &= !shapeDataY[i].isParam; + } + } + fIsOutputConstant = true; // this contains both case constant tensor output ans shape tensor output + if (isOutputConstantTensor && dataY.empty()) { + dataY.resize(shapeDataY.size()); + for (size_t i = 0; i < shapeDataY.size(); i++) + dataY[i] = static_cast(shapeDataY[i].dim); + } + if (dataY.size() > 0) + model.AddConstantTensor(fNY, fShapeY, dataY.data()); + else if (shapeDataY.size() > 0 ) + model.AddShapeTensor(fNY, shapeDataY, fShapeY.size() == 0); + else { + fIsOutputConstant = false; + } + if (fIsOutputConstant && model.Verbose()) std::cout << "Where op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(dataY) << std::endl; - + << ((dataY.size() > 0) ? ConvertValuesToString(dataY) : ConvertShapeToString(shapeDataY) ) + << ((dataY.size() > 0) ? " (constant)" : " (shape)") << std::endl; + // output is a constant tensor - fOutputTensorNames.pop_back(); + if (fIsOutputConstant) fOutputTensorNames.pop_back(); } - else { + if (!fIsOutputConstant) { model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); + if (model.Verbose()) + std::cout << "Where op " << " condition : " << fNC << " " << ConvertShapeToString(fShapeC) << + " X " << fNA << " " << ConvertShapeToString(fShapeA) << " Y " << fNB << " " << ConvertShapeToString(fShapeB) + << " ---> " << fNY << " " << ConvertShapeToString(fShapeY) << std::endl; } } @@ -184,17 +237,17 @@ public: return out.str(); } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override { if (fIsOutputConstant) return ""; - OpName = "op_" + OpName; + opName = "op_" + opName; if (fShapeY.empty()) { throw std::runtime_error("TMVA SOFIE Where Op called to Generate without being initialized first"); } std::stringstream out; - out << SP << "\n//-------- Where \n"; + out << SP << "\n//-------- Where " << opName << " --> " << ConvertShapeToString(fShapeY) 
<< "\n"; size_t length = ConvertShapeToLength(fShapeY); std::string typeName = TensorType::Name(); // Broadcast A if it's uninitialized @@ -216,19 +269,18 @@ public: // special case if C is an input tensor if (fIsInputBoolTensor) { size_t inputLength = ConvertShapeToLength(fShapeC); - out << SP << "std::vector fTensor_" << fNC << "(tensor_" << fNC << ", tensor_" << fNC << " + " << inputLength << ");\n"; + out << SP << "std::vector fTensor_" << fNC << "(tensor_" << fNC << ", tensor_" << fNC << " + " << inputLength << ");\n"; } out << SP << "// Broadcasting uninitialized tensor " << fNC << "\n"; //out << SP << "{\n"; - // for boolean we need to pass vector and use the non-template version of the function - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast(fTensor_" << fNC << ", " << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) + out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast(fTensor_" << fNC << ".data(), " << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) << ", fTensor_" << fNBroadcastedC << ");\n"; } std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; std::string nameC = fNBroadcastedC.empty()? fNC : fNBroadcastedC; out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; - // get output tensor applying condition (note we need to use directly the vector since v.data(), i.e the data pointer, does not exist) + // get output tensor applying condition out << SP << SP << "tensor_" << fNY << "[id] = " << "(fTensor_" << nameC << "[id]) ? 
tensor_" << nameA << "[id] : tensor_" + nameB + "[id];\n"; out << SP << "}\n"; @@ -240,4 +292,4 @@ public: }//SOFIE -#endif //SOFIE_ROperator_Where +#endif // SOFIE_ROperator_Where diff --git a/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx b/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx index d183052..8b9727b 100644 --- a/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx +++ b/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx @@ -1,7 +1,7 @@ #ifndef SOFIE_SOFIE_COMMON #define SOFIE_SOFIE_COMMON -#include "TMVA/RTensor.hxx" +#include "SOFIE/RTensor.hxx" #include "ROOT/RSpan.hxx" @@ -21,13 +21,10 @@ #include #include - -namespace SOFIE{ - -//typedef RTensor tensor_t; +namespace SOFIE { enum class ETensorType{ - UNDEFINED = 0, FLOAT = 1, UNINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5, INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, //order sensitive + UNDEFINED = 0, FLOAT = 1, UINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5, INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, //order sensitive FLOAT16 = 10, DOUBLE = 11, UINT32 = 12, UINT64 = 13, COMPLEX64 = 14, COMPLEX28 = 15, BFLOAT16 = 16 }; @@ -39,7 +36,7 @@ constexpr size_t GetTypeSize(ETensorType type) { switch (type) { case ETensorType::FLOAT: return sizeof(float); case ETensorType::DOUBLE: return sizeof(double); - case ETensorType::UNINT8: return sizeof(uint8_t); + case ETensorType::UINT8: return sizeof(uint8_t); case ETensorType::INT8: return sizeof(int8_t); case ETensorType::UINT16: return sizeof(uint16_t); case ETensorType::INT16: return sizeof(int16_t); @@ -58,6 +55,9 @@ typedef std::int64_t int_t; std::string ConvertTypeToString(ETensorType type); ETensorType ConvertStringToType(std::string type); +// find if a string represents a number +bool IsInteger(const std::string & s); + struct Dim{ bool isParam = false; size_t dim = 0; @@ -67,16 +67,42 @@ struct Dim{ Dim() {} // constructor for a parametric dimension with the option to pass a default dim value - Dim(const std::string & p, size_t d = 0) : isParam(true), dim(d), param(p) {} + // We use -1 for 
dim to indicate that the param dimension is an expression (e.g. "d1+d2") + // in case the string represents a number make Dim not parametric + Dim(const std::string & p, size_t d = 0) : isParam(true), dim(d), param(p) + { + if (IsInteger(p)) { + isParam = false; + dim = std::stoi(p); + } + } // constructor for a non-parametric dimension Dim(size_t d) : dim(d) {} std::string GetVal() const { - return (isParam) ? param : std::to_string(dim); + // cast to int64_t for negative shape values + return (isParam) ? param : std::to_string(static_cast(dim)); + } + + std::ostream& operator<< (std::ostream& os) const { + os << GetVal(); + return os; + } + + bool operator==(const Dim& rhs) const { + return (isParam && rhs.isParam) ? param == rhs.param : dim == rhs.dim; + } + bool operator!=(const Dim& rhs) const { + return !(*this == rhs); } }; +//bool operator==(const Dim& lhs, const Dim& rhs); +inline std::ostream & operator<< (std::ostream &os, const Dim &d) { + os << d.GetVal(); + return os; +} struct InputTensorInfo{ ETensorType type; @@ -93,6 +119,18 @@ struct DynamicTensorInfo{ std::vector shape; }; +// template traits for Tensor Shape +template +struct TensorShape {}; +template<> +struct TensorShape { + static bool IsDim() { return true; } +}; +template<> +struct TensorShape { + static bool IsDim() { return false; } +}; + // template traits for Tensor type template struct TensorType {}; @@ -120,6 +158,10 @@ template<> struct TensorType { static const std::string Name() { return "uint64_t"; } }; +template<> +struct TensorType { + static const std::string Name() { return "bool"; } +}; struct TensorMemoryInfo { std::string_view tensor_name; @@ -148,19 +190,26 @@ struct MemoryPoolInfo { std::map available_stack; }; -std::vector ConvertShapeToDim(std::vector shape); +std::vector ConvertShapeToDim(const std::vector & shape); + +std::vector ConvertShapeToInt(const std::vector & shape); + +inline std::size_t ConvertShapeToLength(const std::vector & shape){ + // Empty shape 
represent scalar values, so we return a length=1 + std::size_t fLength = 1; + for (auto& dim: shape) fLength *= dim; + return fLength; +} + +std::string ConvertShapeToString(const std::vector & shape); +std::string ConvertDimShapeToString(const std::vector & shape); +std::string ConvertShapeToString(const std::vector & shape); -std::vector ConvertShapeToInt(std::vector shape); -std::size_t ConvertShapeToLength(std::vector shape); -std::string ConvertShapeToString(std::vector shape); -std::string ConvertDynamicShapeToString(std::vector shape); -// std::string ConvertShapeToString(std::vector shape) { -// return ConvertDynamicShapeToString(shape); -// } +std::string ConvertDimShapeToLength(const std::vector & shape); +std::string ConvertDynamicShapeToLength(const std::vector & shape); -std::string ConvertDynamicShapeToLength(std::vector shape); template std::string ConvertValToString(T value) { @@ -271,7 +320,7 @@ private: template ETensorType GetTemplatedType(T /*obj*/ ){ if (std::is_same::value) return ETensorType::FLOAT; - if (std::is_same::value) return ETensorType::UNINT8; + if (std::is_same::value) return ETensorType::UINT8; if (std::is_same::value) return ETensorType::INT8; if (std::is_same::value) return ETensorType::UINT16; if (std::is_same::value) return ETensorType::INT16; @@ -287,6 +336,12 @@ ETensorType GetTemplatedType(T /*obj*/ ){ } namespace UTILITY{ + + + +// clean operator and tensor names +std::string Clean_name(std::string input_tensor_name); + // Check if two shapes are equal bool AreSameShape(const std::vector&, const std::vector&); bool AreSameShape(const std::vector&, const std::vector&); @@ -296,10 +351,14 @@ bool AreSameShape(const std::vector&, const std::vector&); // Multidirectional broadcast a list of tensors to the same shape std::vector MultidirectionalBroadcastShape(std::vector>); -// Unidirectional broadcast two shapes to the same shape -std::vector UnidirectionalBroadcastShape(std::vector, std::vector); +// Multidirectional 
broadcast two shapes to the same shape + +std::pair> MultidirectionalBroadcastShape(std::vector &, std::vector &); +std::vector UnidirectionalBroadcastShape(std::vector &, std::vector &); + +std::pair> MultidirectionalBroadcastShape(std::vector &, std::vector &); + -std::string Clean_name(std::string input_tensor_name); template T* BroadcastConvBias(const T* data, const size_t channel, const std::vector& targetShape) { @@ -352,7 +411,7 @@ void BroadcastTensor(ConstContT data, const std::vector& shape, const st size_t targetLength = broadcastedData.size(); assert(ConvertShapeToLength(targetShape) == targetLength); // special case when broadcasting last dimensions (initial shapes must be the same) - if (shape.front() == targetShape.front() && shape.back() == 1 && size > 1) { + if (size > 1 && shape.front() == targetShape.front() && shape.back() == 1) { size_t bsize = targetShape.back(); // compute the size of the data to broadcast for (int k = int(size)-2; k >=0; k--) { @@ -419,6 +478,7 @@ T* CreateBroadcastTensor(const T* data, const std::vector& shape, const BroadcastTensor, std::span>(inData, shape, targetShape, bData); return broadcastedData; } + // Unidirectional broadcasting shape to targetShape// In unidirectional broadcast - only tensor B can have the shape changed not // tensor A - otherwise is a multidirectional broadcast template @@ -449,8 +509,6 @@ void UnidirectionalBroadcast(const T* data, const std::vector& shape, co } BroadcastTensor>(inData, shape, targetShape, broadcastedData); } -// specialization for vector of boolean -void UnidirectionalBroadcast(const std::vector & data, const std::vector& shape, const std::vector& targetShape, std::vector & broadcastedData); /// compute stride of a tensor given its shape (assume layout is row-major) std::vector ComputeStrideFromShape(const std::vector & shape); @@ -619,7 +677,15 @@ void col2im(const Dtype* data_col, const int channels, //std::cout << "finishing col2imp" << std::endl; } - +// Used at the end of 
infer() to fill the return object. +template +void FillOutput(T const *arr, std::vector &out, std::size_t n) +{ + out.resize(n); + for (std::size_t i = 0; i < n; ++i) { + out[i] = arr[i]; + } +} } // end namespace UTILITY @@ -631,20 +697,20 @@ extern "C" void sgemm_(const char * transa, const char * transb, const int * m, struct GNN_Data { - TMVA::Experimental::RTensor node_data; // the node feature data, tensor with shape (num_nodes, num_node_features) - TMVA::Experimental::RTensor edge_data; // the edge feature data, tensor with shape (num_edges, num_edge_features) - TMVA::Experimental::RTensor global_data; // the global features, tensor with shape (1, num_global_features) - TMVA::Experimental::RTensor edge_index; // the edge index (receivers and senders for each edge), tensor with shape (2, num_edges) + SOFIE::RTensor node_data; // the node feature data, tensor with shape (num_nodes, num_node_features) + SOFIE::RTensor edge_data; // the edge feature data, tensor with shape (num_edges, num_edge_features) + SOFIE::RTensor global_data; // the global features, tensor with shape (1, num_global_features) + SOFIE::RTensor edge_index; // the edge index (receivers and senders for each edge), tensor with shape (2, num_edges) // edge_index[0,:] are the receivers and edge_index[1,:] are the senders // need to have default constructor since RTensor has not one - GNN_Data(): node_data(TMVA::Experimental::RTensor({})), edge_data(TMVA::Experimental::RTensor({})), global_data(TMVA::Experimental::RTensor({})), edge_index(TMVA::Experimental::RTensor({})) {} + GNN_Data(): node_data(SOFIE::RTensor({})), edge_data(SOFIE::RTensor({})), global_data(SOFIE::RTensor({})), edge_index(SOFIE::RTensor({})) {} }; template -TMVA::Experimental::RTensor Concatenate( TMVA::Experimental::RTensor & t1, TMVA::Experimental::RTensor & t2, int axis = 0) +SOFIE::RTensor Concatenate( SOFIE::RTensor & t1, SOFIE::RTensor & t2, int axis = 0) { // concatenate tensor along axis. 
Shape must be the same except in the dimension of the concatenated axis if (t1.GetMemoryLayout() != t2.GetMemoryLayout()) @@ -659,8 +725,8 @@ TMVA::Experimental::RTensor Concatenate( TMVA::Experimental::RTensor & t1, } std::vector outShape = shape1; outShape[axis] = shape1[axis] + shape2[axis]; - TMVA::Experimental::RTensor tout(outShape, t1.GetMemoryLayout()); - if (t1.GetMemoryLayout() == TMVA::Experimental::MemoryLayout::ColumnMajor) { + SOFIE::RTensor tout(outShape, t1.GetMemoryLayout()); + if (t1.GetMemoryLayout() == SOFIE::MemoryLayout::ColumnMajor) { throw std::runtime_error("TMVA RTensor Concatenate is not yet supported for column major tensors"); } @@ -693,10 +759,10 @@ inline GNN_Data Concatenate(GNN_Data & data1, GNN_Data & data2, int axis = 0) { inline GNN_Data Copy(const GNN_Data & data) { GNN_Data out; - out.node_data = TMVA::Experimental::RTensor(data.node_data.GetShape()); - out.edge_data = TMVA::Experimental::RTensor(data.edge_data.GetShape()); - out.global_data = TMVA::Experimental::RTensor(data.global_data.GetShape()); - out.edge_index = TMVA::Experimental::RTensor(data.edge_index.GetShape()); + out.node_data = SOFIE::RTensor(data.node_data.GetShape()); + out.edge_data = SOFIE::RTensor(data.edge_data.GetShape()); + out.global_data = SOFIE::RTensor(data.global_data.GetShape()); + out.edge_index = SOFIE::RTensor(data.edge_index.GetShape()); std::copy(data.node_data.GetData(), data.node_data.GetData()+ data.node_data.GetSize(), out.node_data.GetData()); std::copy(data.edge_data.GetData(), data.edge_data.GetData()+ data.edge_data.GetSize(), out.edge_data.GetData()); std::copy(data.global_data.GetData(), data.global_data.GetData()+ data.global_data.GetSize(), out.global_data.GetData()); @@ -704,6 +770,45 @@ inline GNN_Data Copy(const GNN_Data & data) { return out; } -}//SOFIE +inline void Gemm_Call(float *output, bool transa, bool transb, int m, int n, int k, float alpha, const float *A, + const float *B, float beta, const float *C) +{ + char ct = 
't'; + char cn = 'n'; + const int *lda = transa ? &k : &m; + const int *ldb = transb ? &n : &k; + const int *ldc = &m; + if (C != nullptr) { + std::copy(C, C + m * n, output); + } + SOFIE::BLAS::sgemm_(transa ? &ct : &cn, transb ? &ct : &cn, &m, &n, &k, &alpha, A, lda, B, ldb, + &beta, output, ldc); +} + +template +void ReadTensorFromStream(std::istream &is, T &target, std::string const &expectedName, std::size_t expectedLength) +{ + std::string name; + std::size_t length; + is >> name >> length; + if (name != expectedName) { + std::string err_msg = + "TMVA-SOFIE failed to read the correct tensor name; expected name is " + expectedName + " , read " + name; + throw std::runtime_error(err_msg); + } + if (length != expectedLength) { + std::string err_msg = "TMVA-SOFIE failed to read the correct tensor size; expected size is " + + std::to_string(expectedLength) + " , read " + std::to_string(length); + throw std::runtime_error(err_msg); + } + for (size_t i = 0; i < length; ++i) { + is >> target[i]; + } + if (is.fail()) { + throw std::runtime_error("TMVA-SOFIE failed to read the values for tensor " + expectedName); + } +} + +} // namespace SOFIE -#endif //TMVA_SOFIE_RMODEL +#endif //SOFIE_COMMON diff --git a/src/SOFIE_core/src/RModel.cxx b/src/SOFIE_core/src/RModel.cxx index e5495ed..0eab8d1 100644 --- a/src/SOFIE_core/src/RModel.cxx +++ b/src/SOFIE_core/src/RModel.cxx @@ -4,55 +4,21 @@ #include #include +#ifdef SOFIE_SUPPORT_ROOT_BINARY #include "TFile.h" +#endif #include "SOFIE/RModel.hxx" #include "SOFIE/SOFIE_common.hxx" - namespace SOFIE { -std::underlying_type_t operator|(Options opA, Options opB) { - return static_cast>(opA) | static_cast>(opB); -} -std::underlying_type_t operator|(std::underlying_type_t opA, Options opB) { - return opA | static_cast>(opB); -} - -RModel::RModel(RModel&& other) { - fInputTensorInfos = std::move(other.fInputTensorInfos); - fReadyInputTensorInfos = std::move(other.fReadyInputTensorInfos); - fOutputTensorNames = 
other.fOutputTensorNames; - fInputTensorNames = other.fInputTensorNames; - fOperators = std::move(other.fOperators); - fInitializedTensors = std::move(other.fInitializedTensors); - fIntermediateTensorInfos = std::move(other.fIntermediateTensorInfos); - fName = other.fName; - fFileName = other.fFileName; - fParseTime = other.fParseTime; - fGC = other.fGC; - fNeededBlasRoutines = other.fNeededBlasRoutines; - fNeededStdLib = other.fNeededStdLib; +namespace { +const std::string SP = " "; } -RModel& RModel::operator=(RModel&& other) { - fInputTensorInfos = std::move(other.fInputTensorInfos); - fReadyInputTensorInfos = std::move(other.fReadyInputTensorInfos); - fOutputTensorNames = other.fOutputTensorNames; - fInputTensorNames = other.fInputTensorNames; - fOperators = std::move(other.fOperators); - fInitializedTensors = std::move(other.fInitializedTensors); - fIntermediateTensorInfos = std::move(other.fIntermediateTensorInfos); - fName = other.fName; - fFileName = other.fFileName; - fParseTime = other.fParseTime; - fGC = other.fGC; - fNeededBlasRoutines = other.fNeededBlasRoutines; - fNeededStdLib = other.fNeededStdLib; - return *this; -} -const std::vector& RModel::GetTensorShape(std::string name) const { +const std::vector& RModel::GetTensorShape(const std::string & name) const { auto f = fReadyInputTensorInfos.find(name); if (f != fReadyInputTensorInfos.end()) { return f->second.shape; @@ -69,6 +35,16 @@ const std::vector& RModel::GetTensorShape(std::string name) const { if (f4 != fIntermediateTensorInfos.end()) { return f4->second.shape; } + // case of shape tensors + auto f5 = fShapeTensors.find(name); + if (f5 != fShapeTensors.end()) { + // shape is vector of size 1 with size of shape values or just a scalar + if (f5->second.second) // check scalar flag + return std::vector{}; + else + return std::vector{f5->second.first.size()}; + } + if (fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) throw std::runtime_error("TMVA SOFIE tensor [" + name + "] is a 
dynamic tensor. Use GetDynamicTensorShape instead of GetTensorShape"); @@ -78,7 +54,7 @@ const std::vector& RModel::GetTensorShape(std::string name) const { throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the shape is requested is not found"); } -std::vector RModel::GetDynamicTensorShape(std::string name) const { +std::vector RModel::GetDimTensorShape(const std::string & name) const { if (auto f = fDynamicTensorInfos.find(name); f != fDynamicTensorInfos.end()) { return f->second.shape; } @@ -89,8 +65,21 @@ std::vector RModel::GetDynamicTensorShape(std::string name) const { // for this we need to return the vector by value return ConvertShapeToDim(GetTensorShape(name)); } +std::vector RModel::GetDynamicTensorShape(const std::string & name) const { + if (auto f = fDynamicTensorInfos.find(name); f != fDynamicTensorInfos.end()) { + return f->second.shape; + } + if (auto f = fInputTensorInfos.find(name); f != fInputTensorInfos.end()) { + return f->second.shape; + } + // throw error if shape is not dynamic + if (!IsDynamicTensor(name)) + throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the shape is requested is not dynamic"); + + throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the shape is requested is not found"); +} -const ETensorType& RModel::GetTensorType(std::string name) const { +const ETensorType& RModel::GetTensorType(const std::string & name) const { auto f = fReadyInputTensorInfos.find(name); if (f != fReadyInputTensorInfos.end()) { return f->second.type; @@ -111,6 +100,10 @@ const ETensorType& RModel::GetTensorType(std::string name) const { if (f5 != fDynamicTensorInfos.end()){ return f5->second.type; } + // case of shape tensor type is INT64 + if (fShapeTensors.find(name) != fShapeTensors.end()){ + return ETensorType::INT64; + } if (fIsSubGraph && fParentGraph) return fParentGraph->GetTensorType(name); @@ -124,6 +117,7 @@ bool RModel::CheckIfTensorAlreadyExist(std::string tensor_name) { if 
(fInitializedTensors.find(tensor_name) != fInitializedTensors.end()) return true; if (fIntermediateTensorInfos.find(tensor_name) != fIntermediateTensorInfos.end()) return true; if (fDynamicTensorInfos.find(tensor_name) != fDynamicTensorInfos.end()) return true; + if (fShapeTensors.find(tensor_name) != fShapeTensors.end()) return true; if (fIsSubGraph && fParentGraph) return fParentGraph->CheckIfTensorAlreadyExist(tensor_name); return false; } @@ -192,16 +186,34 @@ void RModel::AddConstantTensor(std::string tensor_name, ETensorType type, std::v tensor_name = UTILITY::Clean_name(tensor_name); //NB: own data if (CheckIfTensorAlreadyExist(tensor_name)) { - throw std::runtime_error("TMVA-SOFIE: initialized tensor with name " + tensor_name + " already exists \n"); + throw std::runtime_error("TMVA-SOFIE: constant tensor with name " + tensor_name + " already exists \n"); } InitializedTensor new_tensor {type, shape, data, true}; // add here flag to specify is a constant tensor fInitializedTensors[tensor_name] = new_tensor; } +void RModel::AddShapeTensor(const std::string & name, const std::vector & shape_values, bool scalar){ + auto tensor_name = UTILITY::Clean_name(name); + if (fShapeTensors.count(tensor_name) != 0) { + throw std::runtime_error("TMVA-SOFIE: shape tensor with name " + tensor_name + " already exists \n"); + } + fShapeTensors[tensor_name] = std::make_pair(shape_values, scalar); +} + +bool RModel::IsShapeTensor(const std::string & tensor_name) const { + return fShapeTensors.count(tensor_name) != 0; +} + +const std::vector & RModel::GetShapeTensorValues(const std::string & tensor_name) const { + //if (!IsShapeTensor(tensor_name) ) return std::vector{}; + return fShapeTensors.at(tensor_name).first; +} + bool RModel::IsInitializedTensor(const std::string& tensorName) const { std::string name = UTILITY::Clean_name(tensorName); return fInitializedTensors.find(name) != fInitializedTensors.end(); } + bool RModel::IsConstantTensor(const std::string& tensorName) const 
{ std::string name = UTILITY::Clean_name(tensorName); auto itr = fInitializedTensors.find(name); @@ -209,9 +221,11 @@ bool RModel::IsConstantTensor(const std::string& tensorName) const { return itr->second.IsConstantTensor(); } +// dynamic tensors include also Dim input tensors bool RModel::IsDynamicTensor(const std::string& tensorName) const { std::string name = UTILITY::Clean_name(tensorName); - return fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end(); + bool ret = fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end(); + return (ret) ? true : IsDimInputTensor(tensorName); } bool RModel::IsDimInputTensor(const std::string& tensorName) const { std::string name = UTILITY::Clean_name(tensorName); @@ -250,17 +264,21 @@ void RModel::AddDynamicTensor(std::string tensor_name, ETensorType type, std::ve // store shape parameter if not existing for (auto &d : shape) { if (d.isParam) { - if (fShapeParams.count(d.param) == 0) { - // case parameter is an expression of some other existing parameter, no need to - // register it - if (d.dim != size_t(-1)) { - fShapeParams[d.param] = std::to_string(d.dim); - } + if (d.dim != size_t(-1)) { + AddShapeParam(d.param, d.dim); } } } } +void RModel::AddShapeParam(const std::string & param, size_t default_value) { + if (fShapeParams.count(param) == 0) { + fShapeParams[param] = std::to_string(default_value); + // add also in the vector list (used to keep the order) + fDimShapeNames.push_back(param); + } +} + void RModel::AddOutputTensorNameList(std::vector outputtensornames) { fOutputTensorNames.clear(); for(auto& it : outputtensornames) { @@ -301,100 +319,180 @@ void RModel::SetNotWritableInitializedTensor(const std::string & tensor_name) { t->second.SetNotWritable(); } -std::string RModel:: AllocateIntermediateMemory(std::span op_output_tensors) { +std::string RModel::AllocateIntermediateMemory(std::span op_output_tensors) +{ + std::stringstream code; - std::string memory_allocation_string = ""; - bool allocated; + if 
(fVerbose) { + std::cout << "Total chunks allocated\n"; + for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) { + std::cout << "..... chunk " << chunk->first << " size " << chunk->second.tensor_size << " " << chunk->second.tensor_name << std::endl; + } + } - for (auto& it : op_output_tensors) { - allocated = false; - if (GetTensorType(std::string(it)) == ETensorType::BOOL || - fInitializedTensors.find(std::string(it)) != fInitializedTensors.end() || - fDynamicTensorInfos.find(std::string(it)) != fDynamicTensorInfos.end()) continue; + auto declareIntermediateTensor = [this, &code](std::string const &name, size_t size, size_t location) { + std::string typeName = ConvertTypeToString(GetTensorType(name)); + code << "\n // Allocating memory for intermediate tensor " << name << " with size " << size << " bytes"; + code << "\n" + << typeName << "* tensor_" << name << " = reinterpret_cast<" << typeName + << "*>(fIntermediateMemoryPool.data() + " << location << ");\n"; + }; + + if (fVerbose) std::cout << "*** AllocateIntermediateMemory: Loop on op output tensors\n"; + // order output tensors by size + std::vector ordered_output_tensors; + + for (auto &it : op_output_tensors) { + auto name = std::string(it); + if (GetTensorType(name) == ETensorType::BOOL || fInitializedTensors.find(name) != fInitializedTensors.end() || + fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) + continue; + + auto tensor_size = GetTypeSize(GetTensorType(name)) * ConvertShapeToLength(GetTensorShape(name)); + // important fill the pair in the ordered output tensors with the string view and not the string + TensorMemoryInfo tmi = {it, tensor_size}; + ordered_output_tensors.push_back(tmi); + } + std::sort(ordered_output_tensors.begin(), ordered_output_tensors.end(), + [](const TensorMemoryInfo &a, const TensorMemoryInfo &b) { return a.tensor_size > b.tensor_size; }); - auto tensor_size = 
GetTypeSize(GetTensorType(std::string(it))) * ConvertShapeToLength(GetTensorShape(std::string(it))); - memory_allocation_string += "\n // Allocating memory for intermediate tensor " + std::string(it) + " with size " + std::to_string(tensor_size) + " bytes"; + for (auto &it : ordered_output_tensors) { + bool allocated = false; + std::string name = std::string{it.tensor_name}; + size_t tensor_size = it.tensor_size; + if (fVerbose) + std::cout << "output tensor " << name << " size " << tensor_size << std::endl; - for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); chunk != fIntermediateMemoryInfo.available_stack.end(); ) { + for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); + chunk != fIntermediateMemoryInfo.available_stack.end();) { - // check if available memory chunks can accommodate the tensor - if (chunk->second >= tensor_size) { - auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it, tensor_size); - auto new_chunk_location = chunk->first+chunk->second-tensor_size; - fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk; + if (fVerbose) std::cout << ".. available chunk " << chunk->first << " with size = " << chunk->second; + // check if available memory chunks can accommodate the tensor + if (chunk->second >= tensor_size) { + // need to use here string_view (i.e it.tensor_name) + // split returns the new chunk with size of new tensor. 
The free chunk is before the used one + auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it.tensor_name, tensor_size); + auto new_chunk_location = chunk->first + chunk->second - tensor_size; + fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk; - memory_allocation_string += "\n" + ConvertTypeToString(GetTensorType(std::string(it))) + - "* tensor_" + std::string(it) + - " = reinterpret_cast<"+ConvertTypeToString(GetTensorType(std::string(it)))+"*>(fIntermediateMemoryPool + " + std::to_string(new_chunk_location) + ");\n"; - chunk->second -= tensor_size; + declareIntermediateTensor(name, tensor_size, new_chunk_location); + chunk->second -= tensor_size; - allocated = true; + allocated = true; - if (chunk->second == 0) { - chunk = fIntermediateMemoryInfo.available_stack.erase(chunk); - } + if (fVerbose) std::cout << " is re-used and split in a new of size " << new_chunk.tensor_size << " at " << new_chunk_location; - break; - } - ++chunk; + if (chunk->second == 0) { + if (fVerbose) std::cout << " and deleted since size matches"; + fIntermediateMemoryInfo.available_stack.erase(chunk); } + if (fVerbose) std::cout << std::endl; + break; + } else if (chunk->first == fIntermediateMemoryInfo.available_stack.rbegin()->first && + fIntermediateMemoryInfo.total_stack.rbegin()->first == chunk->first) { + // case last available chunk is the last in the memory, we can increase that one + fIntermediateMemoryInfo.total_stack[chunk->first] = {it.tensor_name, tensor_size}; + declareIntermediateTensor(name, tensor_size, chunk->first); + fIntermediateMemoryInfo.available_stack.erase(chunk); + allocated = true; + if (fVerbose) std::cout << " is extended with a bigger one of size " << tensor_size << std::endl; + break; + } + ++chunk; + if (fVerbose) std::cout << std::endl; + } - if (!allocated) { - size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty() - ? 
0 - : fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size; + if (!allocated) { + size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty() + ? 0 + : fIntermediateMemoryInfo.total_stack.rbegin()->first + + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size; - fIntermediateMemoryInfo.total_stack[chunk_idx] = - { - it, - tensor_size - }; + fIntermediateMemoryInfo.total_stack[chunk_idx] = it; - memory_allocation_string += "\n"+ConvertTypeToString(GetTensorType(std::string(it)))+"* tensor_"+ std::string(it) + "= reinterpret_cast<"+ConvertTypeToString(GetTensorType(std::string(it)))+"*>(fIntermediateMemoryPool + " + std::to_string(chunk_idx) + ");\n"; - } + declareIntermediateTensor(name, tensor_size, chunk_idx); + + if (fVerbose) std::cout << "no chunk available - add in total stack a new chunk with size of tensor and idx : " << chunk_idx + << std::endl; + } } - return memory_allocation_string; + return code.str(); } -void RModel::CheckAndFlushIntermediateMemory(std::span op_input_tensors, const size_t& op_idx){ - for (auto &it : op_input_tensors){ +void RModel::CheckAndFlushIntermediateMemory(std::span op_input_tensors, const size_t& op_idx){ + if (fVerbose) std::cout << "*** CheckAndFlushIntermediateMemory: Loop on input tensors for op " << op_idx << "\n"; + //print available chunks + if (fVerbose) std::cout << "available chunks before freeing them : \n"; + for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); + chunk != fIntermediateMemoryInfo.available_stack.end(); chunk++) { + if (fVerbose) std::cout << "-- free chunk " << chunk->first << " size = " << chunk->second << std::endl; + } + for (auto &it : op_input_tensors) { // last occurence of the tensor is reached => flush it from memory + if (fVerbose) std::cout << ".. 
input tensors : " << it; if (fIntermediateTensorFrequencyLookup[it] == op_idx) { + if (fVerbose) std::cout << " flash condition is met - looping on chunks to find matching one \n"; for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); - chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk ) { - if (chunk->second.tensor_name == it) { - - // check if nearby chunks in available memory can coalesce - auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound(chunk->first); // smallest element greater than the flushed chunk idx - auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin()) ? fIntermediateMemoryInfo.available_stack.end() : std::prev(first_greater); // largest element smaller than the flushed chunk idx - - // check if the next stack entry is actually adjacent in memory - if (last_smaller->first+last_smaller->second + 1 == chunk->first){ - last_smaller->second += chunk->second.tensor_size; - fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second); - - if (last_smaller->first + last_smaller->second + 1 == first_greater->first){ - fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]); - first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater); - } - } else{ - if (chunk->first + chunk->second.tensor_size + 1 == first_greater->first){ - fIntermediateMemoryInfo.total_stack[chunk->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]); - first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater); - } - fIntermediateMemoryInfo.available_stack.insert({ - chunk->first, - chunk->second.tensor_size - }); - } + chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) { + if (fVerbose) std::cout << "--- chunk " << chunk->first << " , " << chunk->second.tensor_name << " size " << chunk->second.tensor_size; + if (chunk->second.tensor_name == it) { + if (fVerbose) 
std::cout << " -- Found chunk corresponding to input tensor: " << chunk->first; + // check if nearby chunks in available memory can coalesce + auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound( + chunk->first); // smallest element greater than the flushed chunk idx + auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin()) + ? fIntermediateMemoryInfo.available_stack.end() + : std::prev(first_greater); // largest element smaller than the flushed chunk idx + + // check if the next stack entry is actually adjacent in memory + + if (last_smaller != fIntermediateMemoryInfo.available_stack.end() && + last_smaller->first + last_smaller->second == chunk->first) { + // merge chunk with previous one + last_smaller->second += chunk->second.tensor_size; + fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second); + if (fVerbose) std::cout << " is adjacent in memory with previous one - merge "; + if (first_greater != fIntermediateMemoryInfo.available_stack.end() && + last_smaller->first + last_smaller->second == first_greater->first) { + // merge also with following one + last_smaller->second += first_greater->second; + fIntermediateMemoryInfo.total_stack[last_smaller->first].merge( + fIntermediateMemoryInfo.total_stack[first_greater->first]); + // delete merged one in available stack and in total stack + fIntermediateMemoryInfo.total_stack.erase(first_greater->first); + fIntermediateMemoryInfo.available_stack.erase(first_greater); + if (fVerbose) std::cout << " merge also with following that is free "; + } + fIntermediateMemoryInfo.total_stack.erase(chunk->first); + if (fVerbose) std::cout << std::endl; + break; + } else if (first_greater != fIntermediateMemoryInfo.available_stack.end() && + chunk->first + chunk->second.tensor_size == first_greater->first) { + // merge with first greater + if (fVerbose) std::cout << " is adjacent in memory with following one - merge \n"; + // cannot modify idx of 
first_greter. Insert a new one and delete previous one + size_t new_size = chunk->second.tensor_size + first_greater->second; + size_t first_greater_idx = first_greater->first; + fIntermediateMemoryInfo.available_stack.erase(first_greater); + // cannot use anymore first_greater + fIntermediateMemoryInfo.available_stack.insert({chunk->first, new_size}); + fIntermediateMemoryInfo.total_stack[chunk->first].merge( + fIntermediateMemoryInfo.total_stack[first_greater_idx]); + fIntermediateMemoryInfo.total_stack.erase(first_greater_idx); + } else { + fIntermediateMemoryInfo.available_stack.insert({chunk->first, chunk->second.tensor_size}); + if (fVerbose) std::cout << " insert in the available stack the chunk with size " << chunk->second.tensor_size << std::endl; } + chunk->second.tensor_name = "free"; + break; + } } + } else { + if (fVerbose) std::cout << std::endl; } } } - - void RModel::Initialize(int batchSize, bool verbose) { std::map inputParams; if (batchSize > 0) { @@ -442,7 +540,7 @@ void RModel::Initialize(const std::map & inputParams, bool auto shape = ConvertShapeToInt(input.second.shape); if (verbose) std::cout << "converting input shape for " << input.first << " " << ConvertShapeToString(shape) << " from " - << ConvertDynamicShapeToString(input.second.shape) << std::endl; + << ConvertShapeToString(input.second.shape) << std::endl; if (!shape.empty()) { // case shape is defined (not parametric) we add the tensor in the fReadyInputTensorInfos map and // we remove the tensor from the fInputTensorInfo where th eold parametric shape was stored @@ -456,8 +554,12 @@ void RModel::Initialize(const std::map & inputParams, bool else { // store the found parametric shape parameters for (auto &d : input.second.shape) { - if (d.isParam) - fShapeParams[d.param] = std::to_string(d.dim); + if (d.isParam) { + if (fShapeParams.count(d.param) == 0) { + fDimShapeNames.push_back(d.param); + fShapeParams[d.param] = std::to_string(d.dim); + } + } } } } @@ -492,10 +594,11 @@ void 
RModel::Initialize(const std::map & inputParams, bool } fOperators[op_idx]->Initialize(*this); for(auto &it:fOperators[op_idx]->GetOpOutputTensors()){ + std::string name = std::string{it}; if (fIntermediateTensorFrequencyLookup.find(it) == fIntermediateTensorFrequencyLookup.end() && - std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), std::string(it)) == fOutputTensorNames.end() && - fInitializedTensors.find(std::string(it)) == fInitializedTensors.end() && - fDynamicTensorInfos.find(std::string(it)) == fDynamicTensorInfos.end()){ + std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) == fOutputTensorNames.end() && + fInitializedTensors.find(name) == fInitializedTensors.end() && + fDynamicTensorInfos.find(name) == fDynamicTensorInfos.end()){ fIntermediateTensorFrequencyLookup[it] = op_idx; } } @@ -534,43 +637,6 @@ void RModel::InitializeSubGraph(std::shared_ptr graph) { } -// Function to generate the code for declaring and initializing constant tensors -// This is for tensors which are not part of weight files and can be created from the Constant operator -template -std::string GenerateConstantTensorCode(const std::pair &t) -{ - std::stringstream strs; - std::string type = ConvertTypeToString(t.second.type()); - size_t length = ConvertShapeToLength(t.second.shape()); - // avoid using stack sizes for constant tensors to reduce compilation time - bool allocateOnStack = (length > 100) ? 
false : true; - - const T *data = t.second.data(); - - // and check if all values are the same - bool sameData = false; - // for non stack allocation check if data are the same - if (!allocateOnStack && length > 1) { - size_t idx = 1; - do { - sameData = (data[idx] == data[idx - 1]); - idx++; - } while (sameData && idx < length); - } - if (allocateOnStack) { - strs << type << " tensor_" << t.first << "[" << length << "] = " << ConvertValuesToString(length, data) << ";\n"; - } else { - strs << "std::vector<" << type << "> fTensor_" << t.first << " = "; - if (sameData) - strs << "std::vector<" << type << ">(" << length << ", " << ConvertValToString(data[0]) << ");\n"; - else { - strs << ConvertValuesToString(length, data) << ";\n"; - } - strs << "const " << type << " * tensor_" + t.first + " = fTensor_" + t.first + ".data();\n"; - } - return strs.str(); -} - void RModel::GenerateInitializedTensorInfo() { if (!fInitializedTensors.empty()) @@ -578,10 +644,13 @@ void RModel::GenerateInitializedTensorInfo() for (auto &i : fInitializedTensors) { if (!fUseWeightFile || i.second.IsConstantTensor()) { - if (i.second.type() == ETensorType::FLOAT) + if (i.second.type() == ETensorType::FLOAT) { fGC += GenerateConstantTensorCode(i); - else if (i.second.type() == ETensorType::INT64) + fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 4; + } else if (i.second.type() == ETensorType::INT64) { fGC += GenerateConstantTensorCode(i); + fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 8; + } } else { // case of tensors which are read from a file @@ -589,43 +658,55 @@ void RModel::GenerateInitializedTensorInfo() if (i.second.type() == ETensorType::FLOAT) { fGC += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; fGC += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + fWeightsTensorSize += ConvertShapeToLength(i.second.shape()) * 4; } } } } void RModel::GenerateIntermediateMemoryPool() { - if 
(fIntermediateMemoryInfo.total_stack.size() == 0) return; + if (fIntermediateMemoryInfo.total_stack.empty()) return; fGC += "\n//--- Allocating session memory pool to be used for allocating intermediate tensors\n"; // char memory block is allocated since char takes 1 byte, thus easier to allocate tensors // of other data types - fGC += "char* fIntermediateMemoryPool = new char[" + std::to_string(fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size)+ "];\n\n"; + auto const &totalStack = fIntermediateMemoryInfo.total_stack; + const size_t memPoolSize = totalStack.rbegin()->first + totalStack.rbegin()->second.tensor_size; + fGC += "std::vector fIntermediateMemoryPool = std::vector(" + std::to_string(memPoolSize) + ");\n\n"; } void RModel::GenerateIntermediateTensorInfo() { if (!fIntermediateTensorInfos.empty()) { std::string tensor_declaration_block = ""; - for (auto &i : fIntermediateTensorInfos) { if (i.second.type == ETensorType::BOOL) { - tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; - // No pointer allocation needed for BOOL + tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; + tensor_declaration_block += "std::uint8_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + continue; } - if (fIntermediateTensorFrequencyLookup.find(i.first) == fIntermediateTensorFrequencyLookup.end() && std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) == fOutputTensorNames.end()) { + bool is_extended = (fOptimizationLevel == OptimizationLevel::kExtended); + bool not_in_freq_map = + (fIntermediateTensorFrequencyLookup.find(i.first) == fIntermediateTensorFrequencyLookup.end()); + bool not_in_output_names = + (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) == 
fOutputTensorNames.end()); + + if ((not_in_freq_map && not_in_output_names) || (!not_in_freq_map && !is_extended && not_in_output_names)) { size_t length = ConvertShapeToLength(i.second.shape); if (i.second.type == ETensorType::FLOAT) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; tensor_declaration_block += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 4 * length; } else if (i.second.type == ETensorType::DOUBLE) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; tensor_declaration_block += "double * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 8 * length; } else if (i.second.type == ETensorType::INT64) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; tensor_declaration_block += "int64_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 8 * length; } } } @@ -664,17 +745,17 @@ void RModel::GenerateOperatorDeclarations() { fGC += "\n"; } -void RModel::GenerateDynamicTensorInfo() { - fGC += "//---- allocate the intermediate dynamic tensors\n"; - std::stringstream out; - for (auto & i: fDynamicTensorInfos) { - auto length = ConvertDynamicShapeToLength(i.second.shape); - out << SP << "if (" << length << " > 0) {\n"; - out << SP << SP << "fTensor_" << i.first << ".resize(" << length << ");\n"; - out << SP << SP << "tensor_" << i.first << " = fTensor_" << i.first << ".data();\n"; - out << SP << "}\n"; - } - fGC += out.str(); +void RModel::GenerateDynamicTensorInfo() +{ + std::stringstream out; + for (auto &i : fDynamicTensorInfos) { + auto length = ConvertDynamicShapeToLength(i.second.shape); + out << SP << "if (" << length << " > 0) {\n"; + out << SP << SP << "fTensor_" << i.first << ".resize(" << length << ");\n"; + out << SP << SP << "tensor_" << 
i.first << " = fTensor_" << i.first << ".data();\n"; + out << SP << "}\n"; + } + fGC += out.str(); } std::string RModel::GenerateInferSignature(bool isdecl) { @@ -702,7 +783,7 @@ std::string RModel::GenerateInferSignature(bool isdecl) { if (type == "other") throw std::runtime_error("TMVA-SOFIE: input tensor " + name + " is of a data type which is not yet supported."); - rGC += type + "* "; + rGC += type + " const* "; } rGC += "tensor_" + name + ","; i_input++; @@ -714,94 +795,82 @@ std::string RModel::GenerateInferSignature(bool isdecl) { namespace { -std::string createOutputTensor(RModel const &rmodel, std::string const &name, bool isIntermediateTensor) -{ - if(name.empty()) return "{}"; - ETensorType eOutputType = rmodel.GetTensorType(name); - std::string outputType = ConvertTypeToString(eOutputType); - if (isIntermediateTensor) { - - if (eOutputType == ETensorType::BOOL) { - return "fTensor_" + name; - } else { - // need to check is size is the same(don't want to return a vector with larger size) - // in that case better to copy - return "std::vector<" + ConvertTypeToString(eOutputType) + ">(tensor_" + name + ", tensor_" + name + " + " + - std::to_string(ConvertShapeToLength(rmodel.GetTensorShape(name))) + ")"; - } - } - // include also dynamic tensors since the vectors can be allocated with a size larger than their output - // we need a special handling for bool type allocated as vector - auto outputLength = ConvertDynamicShapeToLength(rmodel.GetDynamicTensorShape(name)); - if (rmodel.IsDynamicTensor(name) && eOutputType == ETensorType::BOOL) { - return "std::vector(fTensor_" + name + ".begin(), fTensor_" + name + ".begin() + " + outputLength + ")"; - } - return "std::vector<" + outputType + ">(tensor_" + name + ", tensor_" + name + " + " + outputLength + ")"; +std::string typeForOutput(ETensorType t) { + // The std::vector is a special type that is not wrapping continuous memory. + // We don't want to use it as a return type. 
+ if (t == ETensorType::BOOL) t = ETensorType::UINT8; + return ConvertTypeToString(t); } -} // namespace - -void RModel::GenerateOutput() { - - if (fVerbose) - std::cout << "Generating main inference code for " << fName << std::endl; +} +void RModel::GenerateOutput() +{ size_t outputSize = fOutputTensorNames.size(); // assume output types are all the same - if (outputSize == 0) - throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); bool sameOutputTypes = true; std::string inferReturnType; // type return by infer function - ETensorType eOutputType = GetTensorType(*fOutputTensorNames.begin()); - std::string outputType = ConvertTypeToString(eOutputType); + ETensorType eFirstOutputType = GetTensorType(*fOutputTensorNames.begin()); fGC += "\n\n"; if (outputSize == 1) { - fGC += "std::vector<" + outputType + ">"; + fGC += "std::vector<" + typeForOutput(eFirstOutputType) + ">"; } else { // if all output types are the same we return an std::vector - otherwise a tuple - for (size_t i = 1; i < outputSize; i++) { - if (GetTensorType(fOutputTensorNames[i]) != eOutputType) + for (std::string const &name : fOutputTensorNames) { + if (GetTensorType(name) != eFirstOutputType) sameOutputTypes = false; } if (sameOutputTypes) - fGC += "std::vector>"; + fGC += "std::vector>"; else { inferReturnType = "std::tuple<"; for (size_t i = 0; i < outputSize; i++) { - inferReturnType += "std::vector<" + ConvertTypeToString(GetTensorType(fOutputTensorNames[i])) + ">"; - if (i < outputSize-1) inferReturnType += ","; + inferReturnType += "std::vector<" + typeForOutput(GetTensorType(fOutputTensorNames[i])) + ">"; + if (i < outputSize - 1) + inferReturnType += ","; } inferReturnType += ">"; fGC += inferReturnType; } } - fGC += " infer("; - - fGC += GenerateInferSignature(); - - fGC += "){\n"; + fGC += " infer(" + GenerateInferSignature() + "){\n"; - for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { - if (fVerbose) std::cout << "Generating code for operator .... 
" << op_idx << std::endl; - fGC += (fOperators[op_idx]->Generate(std::to_string(op_idx))); + std::string doInferArgs = GenerateInferSignature(false); + if (!doInferArgs.empty()) + doInferArgs += ","; + for (std::string const &name : fOutputTensorNames) { + fGC += SP + "std::vector<" + typeForOutput(GetTensorType(name)) + " > output_tensor_" + name + ";\n"; + doInferArgs += " output_tensor_" + name + ","; } + if (!doInferArgs.empty()) + doInferArgs.back() = ' '; + + fGC += SP + "doInfer(" + doInferArgs + ");\n"; fGC += SP + "return {"; - for (size_t i = 0; i < outputSize; i++) { - std::string tensorName = *(fOutputTensorNames.begin() + i); - bool isIntermediate = fIntermediateTensorInfos.count(tensorName) > 0; - fGC += createOutputTensor(*this, tensorName, isIntermediate); - if (i < outputSize - 1) + for (size_t i = 0; i < fOutputTensorNames.size(); i++) { + fGC += "output_tensor_" + fOutputTensorNames[i]; + if (i < fOutputTensorNames.size() - 1) fGC += ","; } fGC += "};\n"; - fGC += "}\n"; // end of infer function scope + fGC += "}\n"; // end of infer function scope } void RModel::GenerateSessionCode() { + // Determine the signature of the actual inference function + std::string doInferSignature = GenerateInferSignature(); + if (!doInferSignature.empty()) + doInferSignature += ", "; + for (auto const &name : fOutputTensorNames) { + doInferSignature += " std::vector<" + typeForOutput(GetTensorType(name)) + "> &output_tensor_" + name + ","; + } + doInferSignature.back() = ' '; + + doInferSignature = "void doInfer(" + doInferSignature + ")"; // define the Session struct (for GNN this is generated in RModel_GNN) if (fUseSession && !fIsGNNComponent) { @@ -814,24 +883,31 @@ void RModel::GenerateSessionCode() // generate code for declaring the initialized tensors GenerateInitializedTensorInfo(); - // evaluate total intermediate memory and position intermediate tensor addresses - std::string intermediate_memory_alloc_string = ""; - intermediate_memory_alloc_string += "\n// 
--- Positioning intermediate tensor memory --"; - for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { - intermediate_memory_alloc_string += AllocateIntermediateMemory(fOperators[op_idx]->GetOpOutputTensors()); - CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx); - } + if (fOptimizationLevel == OptimizationLevel::kExtended) { + // evaluate total intermediate memory and position intermediate tensor addresses + std::string intermediate_memory_alloc_string = ""; + intermediate_memory_alloc_string += "\n// --- Positioning intermediate tensor memory --"; + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) { + auto op = fOperators[op_idx].get(); + std::cout << "\n******************\n analyzing input/output operator " << op_idx << " " + << typeid(*op).name() << std::endl; + } + intermediate_memory_alloc_string += AllocateIntermediateMemory(fOperators[op_idx]->GetOpOutputTensors()); + CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx); + } - // to check remaining unused fragments after memory allocation (lesser the better) - // for (const auto &it: fIntermediateMemoryInfo.available_stack){ - // std::cout<<"chunk_idx: "<Generate(std::to_string(op_idx))); + } + + fGC += SP + "using SOFIE::UTILITY::FillOutput;\n\n"; + + for (std::string const &name : fOutputTensorNames) { + // need to check is size is the same (don't want to return a vector with + // larger size) in that case better to copy + bool isIntermediate = fIntermediateTensorInfos.count(name) > 0; + std::string n = isIntermediate ? 
std::to_string(ConvertShapeToLength(GetTensorShape(name))) + : ConvertDimShapeToLength(GetDimTensorShape(name)); + fGC += SP + "FillOutput(tensor_" + name + ", output_tensor_" + name + ", " + n + ");\n"; + } + + fGC += "}\n\n"; + + // generate the inference overload that returns an output struct GenerateOutput(); // end of session if (fUseSession && !fIsGNNComponent) { - fGC += "}; // end of Session\n"; + fGC += "}; // end of Session\n\n"; } } @@ -982,8 +1088,7 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { fGC += " f.seekg(" + std::to_string(pos) + ");\n"; } - fGC += " std::string tensor_name;\n"; - fGC += " size_t length;\n"; + fGC += " using SOFIE::ReadTensorFromStream;\n"; // loop on tensors and parse the file for (auto& i: fInitializedTensors) { @@ -991,25 +1096,8 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { if (!i.second.IsWeightTensor()) continue; std::string tensor_name = "tensor_" + i.first; if (i.second.type() == ETensorType::FLOAT) { - size_t length = 1; - length = ConvertShapeToLength(i.second.shape()); - std::string slength = std::to_string(length); - fGC += " f >> tensor_name >> length;\n"; - fGC += " if (tensor_name != \"" + tensor_name + "\" ) {\n"; - fGC += " std::string err_msg = \"TMVA-SOFIE failed to read the correct tensor name; expected name is " + - tensor_name + " , read \" + tensor_name;\n"; - fGC += " throw std::runtime_error(err_msg);\n"; - fGC += " }\n"; - fGC += " if (length != " + slength + ") {\n"; - fGC += " std::string err_msg = \"TMVA-SOFIE failed to read the correct tensor size; expected size is " + - slength + " , read \" + std::to_string(length) ;\n"; - fGC += " throw std::runtime_error(err_msg);\n"; - fGC += " }\n"; - fGC += " for (size_t i = 0; i < length; ++i)\n"; - fGC += " f >> " + tensor_name + "[i];\n"; - fGC += " if (f.fail()) {\n"; - fGC += " throw std::runtime_error(\"TMVA-SOFIE failed to read the values for tensor " + tensor_name + "\");\n"; - fGC += " }\n"; + std::string length = 
std::to_string(ConvertShapeToLength(i.second.shape())); + fGC += " ReadTensorFromStream(f, " + tensor_name + ", \"" + tensor_name + "\", " + length + ");\n"; } else { std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a file"); } @@ -1019,6 +1107,7 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { // generate the code to read initialized tensors from a ROOT data file if(fWeightFile == WeightFileType::RootBinary) { +#ifdef SOFIE_SUPPORT_ROOT_BINARY fGC += " {\n"; fGC += " std::unique_ptr rootFile(TFile::Open(filename.c_str(), \"READ\"));\n"; fGC += " if (!rootFile->IsOpen()) {\n"; @@ -1050,6 +1139,9 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { fGC += " }\n"; } fGC += " }\n"; +#else + throw std::runtime_error("SOFIE was not built with ROOT file support."); +#endif // SOFIE_SUPPORT_ROOT_BINARY } } @@ -1075,6 +1167,7 @@ long RModel::WriteInitializedTensorsToFile(std::string filename) { // Write the initialized tensors to the file if (fWeightFile == WeightFileType::RootBinary) { +#ifdef SOFIE_SUPPORT_ROOT_BINARY if(fIsGNNComponent || fIsGNN) { throw std::runtime_error("SOFIE-GNN yet not supports writing to a ROOT file."); } @@ -1118,6 +1211,9 @@ long RModel::WriteInitializedTensorsToFile(std::string filename) { // this needs to be changed, similar to the text file return -1; +#else + throw std::runtime_error("SOFIE was not built with ROOT file support."); +#endif // SOFIE_SUPPORT_ROOT_BINARY } else if (fWeightFile == WeightFileType::Text) { std::ofstream f; if(fIsGNNComponent) { @@ -1244,9 +1340,9 @@ void RModel::PrintOutputTensors() { for (auto& it: fOutputTensorNames) { std::cout << "Tensor name: \"" << it << "\"\t"; if (!IsDynamicTensor(it)) - std::cout << "shape: " << ConvertShapeToString(GetTensorShape(it)) << std::endl; - else - std::cout << "shape: " << ConvertDynamicShapeToString(GetDynamicTensorShape(it)) << std::endl; + std::cout << "shape: " << 
ConvertShapeToString(GetTensorShape(it)) << std::endl; + else + std::cout << "shape: " << ConvertShapeToString(GetDynamicTensorShape(it)) << std::endl; } std::cout << "\n"; } @@ -1312,13 +1408,13 @@ void RModel::OutputGenerated(std::string filename, bool append) { void RModel::Streamer(TBuffer &R__b) { if (R__b.IsReading()) { RModel::Class()->ReadBuffer(R__b, this); - for(auto i=RModel::fInitializedTensors.begin(); i!=RModel::fInitializedTensors.end(); ++i) { - i->second.CastPersistentToShared(); + for (auto & i : fInitializedTensors) { + i.second.CastPersistentToShared(); } } else { - for(auto i=RModel::fInitializedTensors.begin(); i!=RModel::fInitializedTensors.end(); ++i) { - i->second.CastSharedToPersistent(); + for (auto & i : fInitializedTensors) { + i.second.CastSharedToPersistent(); } RModel::Class()->WriteBuffer(R__b, this); } diff --git a/src/SOFIE_core/src/RModel_ALPAKA.cxx b/src/SOFIE_core/src/RModel_ALPAKA.cxx new file mode 100644 index 0000000..9ff300a --- /dev/null +++ b/src/SOFIE_core/src/RModel_ALPAKA.cxx @@ -0,0 +1,403 @@ +#include +#include +#include +#include +#include + +#include "TFile.h" +#include "SOFIE/RModel.hxx" +#include "SOFIE/SOFIE_common.hxx" + +namespace SOFIE { + +void RModel::GenerateInitializedTensorInfo_GPU_ALPAKA() { + if (!fInitializedTensors.empty()){ + fGC += "\n// initialized tensors for weights\n"; + } + + for (auto &i : fInitializedTensors) { + if (!fUseWeightFile || i.second.IsConstantTensor()) { + if (i.second.type() == ETensorType::FLOAT) + fGC += GenerateConstantTensorCode(i); + else if (i.second.type() == ETensorType::INT64) + fGC += GenerateConstantTensorCode(i); + + } else { + // case of tensors which are read from a file + size_t length = ConvertShapeToLength(i.second.shape()); + if (i.second.type() == ETensorType::FLOAT) { + fGC += "BufF1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } + } + } +} + +void 
RModel::GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA() +{ + if (!fInitializedTensors.empty()) + fGC += "// temporary initialized tensors for loading weights\n"; + + for (auto &i : fInitializedTensors) { + if (!fUseWeightFile || i.second.IsConstantTensor()) { + if (i.second.type() == ETensorType::FLOAT) + fGC += GenerateConstantTensorCode(i); + else if (i.second.type() == ETensorType::INT64) + fGC += GenerateConstantTensorCode(i); + + } else { + // case of tensors which are read from a file + size_t length = ConvertShapeToLength(i.second.shape()); + if (i.second.type() == ETensorType::FLOAT) { + fGC += "std::vector tensor_" + i.first + "(" + std::to_string(length) + ");\n"; + } + } + } +} + +void RModel::GenerateGPU_ALPAKA_Buffers() { + if (!fIntermediateTensorInfos.empty()) { + std::string tensor_declaration_block = ""; + + for (auto &i : fIntermediateTensorInfos) { + if (i.second.type == ETensorType::BOOL) { + tensor_declaration_block += "std::vector fTensor_" + i.first + + " = std::vector(" + + std::to_string(ConvertShapeToLength(i.second.shape)) + + ");\n"; + // No pointer allocation needed for BOOL + } + + size_t length = ConvertShapeToLength(i.second.shape); + + if (i.second.type == ETensorType::FLOAT) { + tensor_declaration_block += "BufF1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type == ETensorType::DOUBLE) { + tensor_declaration_block += "BufD1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type == ETensorType::INT64) { + tensor_declaration_block += "BufI641D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } + } + + if (tensor_declaration_block.length()) { + fGC += "\n//--- declare and allocate the intermediate tensors\n" + tensor_declaration_block; + } + } + + // add also the dynamic tensors (only 
declarations, allocation will be done later) + if (!fDynamicTensorInfos.empty()) { + fGC += "//--- declare the dynamic tensors\n"; + fGC += "using bufDev_float = alpaka::Buf, size_t>;\n"; + fGC += "using bufDev_double = alpaka::Buf, size_t>;\n"; + fGC += "using bufDev_int64 = alpaka::Buf, size_t>;\n"; + + for (auto &i : fDynamicTensorInfos) { + if (i.second.type == ETensorType::FLOAT) { + fGC += "bufDev_float bufDev_" + i.first + ";\n"; + } else if (i.second.type == ETensorType::DOUBLE) { + fGC += "bufDev_double bufDev_" + i.first + ";\n"; + } else if (i.second.type == ETensorType::INT64) { + fGC += "bufDev_int64 bufDev_" + i.first + ";\n"; + } + } + } +} + +void RModel::GenerateDynamicTensorInfo_GPU_ALPAKA() { + fGC += "//---- allocate the intermediate dynamic tensors\n"; + std::stringstream out; + + for (auto &i : fDynamicTensorInfos) { + auto length = ConvertDynamicShapeToLength(i.second.shape); + out << SP << "if (" << length << " > 0) {\n"; + out << "auto bufDev_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" << length << "}));\n"; + out << SP << "}\n"; + } + fGC += out.str(); +} + +// only supports BufF1D buffer data types for now +std::string RModel::GenerateInferSignature_GPU_ALPAKA(bool isdecl) { + // generate the infer signature given the inputs: eg. "BufF1D const deviceBuf_A, BufF1D const deviceBuf_B" + // if (decl = false) generate only calling signature (deviceBuf_A, deviceBuf_B, ....) 
+ std::string rGC; + std::unordered_map inputParams; + int i_input = 0; + for (auto &name : fInputTensorNames) { + // if is a dynamic tensor pass initial parameters + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + std::string pName = d.param; + // need to check if the input parameters is already existing in another input tensor + if (d.isParam && inputParams.count(pName) == 0) { + if (isdecl) rGC += "size_t "; + rGC += d.param + ","; + inputParams[pName] = i_input; + } + } + } + if (isdecl) { + std::string type = "BufF1D"; + if (type == "other") + throw std::runtime_error("TMVA-SOFIE: input tensor " + name + + " is of a data type which is not yet supported."); + rGC += type + " const "; + } + rGC += "deviceBuf_" + name + ","; + i_input++; + } + + if (fInputTensorNames.size() > 0) rGC.pop_back();// remove last "," + return rGC; +} + +void RModel::GenerateOutput_GPU_ALPAKA() { + if (fVerbose) + std::cout << "Generating main inference code for " << fName << std::endl; + + size_t outputSize = fOutputTensorNames.size(); + if (outputSize == 0) + throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); + + bool sameOutputTypes = true; + std::string inferReturnType; + ETensorType eOutputType = GetTensorType(*fOutputTensorNames.begin()); + std::string outputType = ConvertTypeToString(eOutputType); + + fGC += "\n\n"; + if (outputSize == 1) { + fGC += "alpaka::Buf"; + } else { + throw std::runtime_error("TMVA-SOFIE: multiple output tensors are not supported in ALPAKA code generation"); + } + + fGC += " infer("; + fGC += GenerateInferSignature_GPU_ALPAKA(); + fGC += "){\n"; + + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) + std::cout << "Generating code for operator .... 
" << op_idx << std::endl; + fGC += (fOperators[op_idx]->Generate_GPU_ALPAKA(std::to_string(op_idx))); + } + + fGC += "\n\n alpaka::wait(queue);\n"; + fGC += SP + "return "; + if (outputSize>1) fGC += " {"; + for (size_t i = 0; i < outputSize; i++) { + std::string tensorName = *(fOutputTensorNames.begin() + i); + bool isIntermediate = fIntermediateTensorInfos.count(tensorName) > 0; + fGC += "deviceBuf_"+tensorName; + if (i < outputSize - 1) + fGC += ","; + } + if (outputSize>1) fGC += " };\n"; + else fGC += ";\n"; + fGC += "}\n"; // end of infer function scope +} + +void RModel::GenerateSessionCode_GPU_ALPAKA() { + + std::set registered_operators; + + fGC += "\n//--- ALPAKA Kernels\n"; + for (size_t id = 0; id < fOperators.size(); id++) { + if(registered_operators.find(fOperators[id]->GetKind()) == registered_operators.end()) { + + if (fVerbose) + std::cout<<"Generating ALPAKA kernel for operator"<< toString(fOperators[id]->GetKind()) << std::endl; + + fGC += fOperators[id]->Generate_GPU_Kernel_ALPAKA(std::to_string(id)); + registered_operators.insert(fOperators[id]->GetKind()); + } + } + + // define the Session struct (for GNN this is generated in RModel_GNN) + fGC += "\n\ntemplate \n"; + if (fUseSession) { + if (!fIsSubGraph) + fGC += "struct Session {\n\n"; + else + fGC += "struct Session_" + fName + " {\n\n"; + } + + // define host and device accelerators + fGC += "using Idx = std::size_t;\n"; + fGC += "using Dim = alpaka::DimInt<1>;\n"; + fGC += "using Acc = alpaka::TagToAcc;\n"; + fGC += "using DevAcc = alpaka::Dev;\n\n"; + fGC += "using QueueProperty = alpaka::NonBlocking;\n"; + fGC += "using QueueAcc = alpaka::Queue;\n\n"; + fGC += "using BufF1D = alpaka::Buf;\n"; + fGC += "using BufD1D = alpaka::Buf;\n"; + fGC += "using BufI641D = alpaka::Buf;\n\n"; + + fGC += "\nalpaka::Platform const platform{};\n"; + fGC += "DevAcc devAcc = alpaka::getDevByIdx(platform, 0);\n"; + fGC += "alpaka::PlatformCpu platformHost{};\n"; + fGC += "alpaka::DevCpu hostAcc = 
alpaka::getDevByIdx(platformHost, 0);\n"; + fGC += "QueueAcc queue{devAcc};\n"; + fGC += "Idx threadsPerBlock = 256;\n"; + fGC += "\nusing Ext1D = alpaka::Vec;\n"; + fGC += "using Vec = alpaka::Vec;\n"; + if (registered_operators.find(SOFIE::OperatorKind::GEMM) != registered_operators.end()) { + fGC += "\n\n// BLAS declarations\n"; + fGC += "sofieBLAS blas{queue};\n"; + } + + GenerateInitializedTensorInfo_GPU_ALPAKA(); + GenerateGPU_ALPAKA_Buffers(); + GenerateOperatorDeclarations(); + + // add subgraph session + if (!fSubGraphs.empty()) + fGC += "// subgraph sessions\n"; + for (auto &graph : fSubGraphs) { + fGC += "Session_" + graph->fName + " fSession_" + graph->fName + ";\n"; + } + + // Session constructor + if (fUseSession) { + std::string sessionName = "\n\nSession"; + if (fIsSubGraph) + sessionName += "_" + fName; + + if (fUseWeightFile) { + std::string fileName = fName; + if (fWeightFile == WeightFileType::Text) + fileName += ".dat"; + if (fWeightFile == WeightFileType::RootBinary) + fileName += ".root"; + + fGC += sessionName + "(std::string filename =\"" + fileName + "\""; + } else { + fGC += sessionName + "(std::string = \"\""; + } + + if (!fShapeParams.empty()) { + for (auto &p : fShapeParams) { + fGC += ",\n"; + fGC += " size_t " + p.first + " = " + p.second; + } + } + fGC += ") {\n"; + + GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA(); + if (fUseWeightFile) { + fGC += "\n//--- reading weights from file\n"; + ReadInitializedTensorsFromFile(0); + fGC += "\n"; + } + + MoveInitializedTensorsToBuffers_ALPAKA(); + GenerateDynamicTensorInfo_GPU_ALPAKA(); + + for (size_t id = 0; id < fOperators.size(); id++) { + fGC += fOperators[id]->GenerateInitCode_GPU_ALPAKA(); + if (fOperators[id]->GetKind() == OperatorKind::GEMM){ + fGC += "\nblas.AddLayoutConfig("+fOperators[id]->GetBlasConfig()+");\n"; + } + } + + fGC += "\nalpaka::wait(queue);\n"; + fGC += "}\n\n"; + } + + registered_operators.clear(); + for (size_t id = 0; id < fOperators.size(); id++) { + 
if(registered_operators.find(fOperators[id]->GetKind()) == registered_operators.end()) { + + if (fVerbose) + std::cout<<"Declaring ALPAKA kernel for operator"<< toString(fOperators[id]->GetKind())<Generate_GPU_Kernel_Definitions_ALPAKA(std::to_string(id)); + registered_operators.insert(fOperators[id]->GetKind()); + } + } + + GenerateOutput_GPU_ALPAKA(); + + if (fUseSession && !fIsGNNComponent) { + fGC += "}; // end of Session\n"; + } +} + +void RModel::GenerateGPU_ALPAKA(std::underlying_type_t options, int batchSize, bool verbose) { + fVerbose = verbose; + fBatchSize = batchSize; + + if (static_cast>(Options::kNoSession) & options) { + fUseSession = false; + fWeightFile = WeightFileType::None; + } + if (static_cast>(Options::kNoWeightFile) & options) { + fUseWeightFile = false; + fWeightFile = WeightFileType::None; + } + if (static_cast>(Options::kRootBinaryWeightFile) & options) { + fUseWeightFile = true; + fWeightFile = WeightFileType::RootBinary; + } + if (fUseWeightFile && !fUseSession) { + throw std::runtime_error( + "TMVA-SOFIE: RModel::Generate: cannot use a separate weight file without generating a Session class"); + } + + if (static_cast>(Options::kGNN) & options || + static_cast>(Options::kGNNComponent) & options) + throw std::runtime_error("SOFIE GPU does not yet supports GNN Inference."); + + Initialize(batchSize, verbose); + + std::string hgname; + if (!fIsSubGraph) { + fGC.clear(); + GenerateHeaderInfo_GPU_ALPAKA(hgname); + } + + if (fVerbose) + std::cout << "generate Main session code - model " << fName << std::endl; + + GenerateSessionCode_GPU_ALPAKA(); + + if (!fIsSubGraph) { + fGC += ("} //SOFIE_" + fName + "\n"); + fGC += "\n#endif // " + hgname + "\n"; + } +} + +void RModel::MoveInitializedTensorsToBuffers_ALPAKA(){ + for (auto &i : fInitializedTensors) { + // skip Constant and shape tensors + if (!i.second.IsWeightTensor()) continue; + std::string tensor_name = "tensor_" + i.first; + auto length = ConvertShapeToLength(i.second.shape()); + 
std::string slength = std::to_string(length); + if (i.second.type() == ETensorType::FLOAT) { + fGC += " auto hostBuf_"+i.first+" = alpaka::allocBuf(hostAcc, Ext1D::all(Idx{"+ slength+"}));\n"; + fGC += " std::memcpy(alpaka::getPtrNative(hostBuf_"+i.first+"), tensor_"+i.first+".data(), "+slength+"* sizeof(float));\n"; + fGC += " alpaka::memcpy(queue, deviceBuf_"+i.first+", hostBuf_"+i.first+");\n"; + } else if (i.second.type() == ETensorType::DOUBLE) { + fGC += " auto hostBuf_"+i.first+" = alpaka::allocBuf(hostAcc, Ext1D::all(Idx{"+ slength+"}));\n"; + fGC += " std::memcpy(alpaka::getPtrNative(hostBuf_"+i.first+"), tensor_"+i.first+".data(), "+slength+"* sizeof(double));\n"; + fGC += " alpaka::memcpy(queue, deviceBuf_"+i.first+", hostBuf_"+i.first+");\n"; + } else if (i.second.type() == ETensorType::INT64) { + fGC += " auto hostBuf_"+i.first+" = alpaka::allocBuf(hostAcc, Ext1D::all(Idx{" + slength + "}));\n"; + fGC += " std::memcpy(alpaka::getPtrNative(hostBuf_"+i.first+"), tensor_"+i.first+".data(), "+slength+"* sizeof(int64_t));"; + fGC += " alpaka::memcpy(queue, deviceBuf_"+i.first+", hostBuf_"+i.first+");\n"; + } else { + std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a ROOT file"); + } + } + } + +} // namespace SOFIE diff --git a/src/SOFIE_core/src/RModel_Base.cxx b/src/SOFIE_core/src/RModel_Base.cxx index d4d1f1c..f212c53 100644 --- a/src/SOFIE_core/src/RModel_Base.cxx +++ b/src/SOFIE_core/src/RModel_Base.cxx @@ -58,6 +58,38 @@ void RModel_Base::GenerateHeaderInfo(std::string& hgname) { } } +void RModel_Base::GenerateHeaderInfo_GPU_ALPAKA(std::string& hgname) { + fGC += ("//Code generated automatically by TMVA for GPU Inference using ALPAKA of Model file [" + fFileName + "] at [" + fParseTime.substr(0, fParseTime.length()-1) +"] \n"); + // add header guards + hgname = fName; + std::transform(hgname.begin(), hgname.end(), hgname.begin(), [](unsigned char c) { + return 
std::toupper(c); + } ); + hgname = "SOFIE_" + hgname; + fGC += "\n#ifndef " + hgname + "\n"; + fGC += "#define " + hgname + "\n\n"; + for (auto& i: fNeededStdLib) { + fGC += "#include <" + i + ">\n"; + } + for (auto& i: fCustomOpHeaders) { + fGC += "#include \"" + i + "\"\n"; + } + fGC += "#include \n"; + fGC += "#include \n"; + + // for the session we need to include SOFIE_Common functions + //needed for convolution operator (need to add a flag) + fGC += "#include \"SOFIE/SOFIE_common.hxx\"\n"; + if (fUseWeightFile) + fGC += "#include \n"; + // Include TFile when saving the weights in a binary ROOT file + if (fWeightFile == WeightFileType::RootBinary) + fGC += "#include \"TFile.h\"\n"; + + fGC += "\nusing Dim1D = alpaka::DimInt<1>;\n"; + fGC += "\nnamespace SOFIE_" + fName + "{\n"; +} + void RModel_Base::OutputGenerated(std::string filename, bool append) { // the model can be appended only if a file name is provided if (filename.empty()) { diff --git a/src/SOFIE_core/src/RModel_GNN.cxx b/src/SOFIE_core/src/RModel_GNN.cxx index a1dfe06..3dae254 100644 --- a/src/SOFIE_core/src/RModel_GNN.cxx +++ b/src/SOFIE_core/src/RModel_GNN.cxx @@ -94,7 +94,7 @@ void RModel_GNN::Generate() { // the number of output edges features can be smaller, so we need to correct here auto num_edge_features_input = num_edge_features; - auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDynamicTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDimTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!edges_update_output_shape[1].isParam && edges_update_output_shape[1].dim != num_edge_features_input) { num_edge_features = edges_update_output_shape[1].dim; } @@ -117,7 +117,7 @@ void RModel_GNN::Generate() { // we need to correct the output number of node features auto num_node_features_input = num_node_features; - auto 
nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDynamicTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDimTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!nodes_update_output_shape[1].isParam && nodes_update_output_shape[1].dim != num_node_features_input) { num_node_features = nodes_update_output_shape[1].dim; } diff --git a/src/SOFIE_core/src/RModel_GraphIndependent.cxx b/src/SOFIE_core/src/RModel_GraphIndependent.cxx index bab06b3..cd62d0c 100644 --- a/src/SOFIE_core/src/RModel_GraphIndependent.cxx +++ b/src/SOFIE_core/src/RModel_GraphIndependent.cxx @@ -81,7 +81,7 @@ void RModel_GraphIndependent::Generate() { // the number of output edges features can be smaller, so we need to correct here // assume num_edge_features is not a parametric shape - auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDynamicTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDimTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!edges_update_output_shape[1].isParam && edges_update_output_shape[1].dim != num_edge_features_input) { num_edge_features = edges_update_output_shape[1].dim; } @@ -100,7 +100,7 @@ void RModel_GraphIndependent::Generate() { fGC+="};\n}\n"; // we need to correct the output number of node features - auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDynamicTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDimTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!nodes_update_output_shape[1].isParam && nodes_update_output_shape[1].dim != num_node_features_input) { num_node_features 
= nodes_update_output_shape[1].dim; } @@ -119,7 +119,7 @@ void RModel_GraphIndependent::Generate() { // we need to correct the output number of global features // global features are in shape[1] #if 0 - auto globals_update_output_shape = globals_update_block->GetFunctionBlock()->GetDynamicTensorShape(globals_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto globals_update_output_shape = globals_update_block->GetFunctionBlock()->GetDimTensorShape(globals_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!globals_update_output_shape[1].isParam && globals_update_output_shape[1].dim != num_global_features_input) { num_global_features = globals_update_output_shape[1].dim; } diff --git a/src/SOFIE_core/src/SOFIE_common.cxx b/src/SOFIE_core/src/SOFIE_common.cxx index ad74313..05f873b 100644 --- a/src/SOFIE_core/src/SOFIE_common.cxx +++ b/src/SOFIE_core/src/SOFIE_common.cxx @@ -1,15 +1,16 @@ #include "SOFIE/SOFIE_common.hxx" -#include + +#include #include #include +#include - -namespace SOFIE{ +namespace SOFIE { /// @brief Convert shape from integer format to dynamic one (based on Dim) /// @param shape /// @return shape based on Dim -std::vector ConvertShapeToDim(std::vector shape){ +std::vector ConvertShapeToDim(const std::vector & shape){ std::vector ret_shape(shape.size()); for (size_t i =0; i < shape.size(); i++){ ret_shape[i].dim = shape[i]; @@ -20,7 +21,7 @@ std::vector ConvertShapeToDim(std::vector shape){ /// @brief Convert shape based on Dim to integer format /// @param shape /// @return shape based on integer. 
Return an empty shape in case shape is dynamic (has a parameter) -std::vector ConvertShapeToInt(std::vector shape){ +std::vector ConvertShapeToInt(const std::vector & shape){ std::vector ret_shape(shape.size()); for (size_t i =0; i < shape.size(); i++){ if (shape[i].isParam) { @@ -45,19 +46,14 @@ std::vector ConvertShapeToInt(std::vector shape){ return ret_shape; } - -std::size_t ConvertShapeToLength(std::vector shape){ - // Empty shape represent scalar values, so we return a length=1 - std::size_t fLength = 1; - for (auto& dim: shape) fLength *= dim; - return fLength; -} - std::string ConvertTypeToString(ETensorType type){ switch(type){ case ETensorType::FLOAT : { return "float"; } + case ETensorType::INT8 : { + return "int8_t"; + } case ETensorType::INT16 : { return "int16_t"; } @@ -67,6 +63,9 @@ std::string ConvertTypeToString(ETensorType type){ case ETensorType::INT64 : { return "int64_t"; } + case ETensorType::UINT8 : { + return "uint8_t"; + } case ETensorType::UINT16 : { return "uint16_t"; } @@ -106,7 +105,7 @@ ETensorType ConvertStringToType(std::string type){ } } -std::string ConvertShapeToString(std::vector shape) { +std::string ConvertShapeToString(const std::vector & shape) { std::stringstream out; out << "{ "; for (size_t i = 0; i < shape.size(); i++) { @@ -117,7 +116,7 @@ std::string ConvertShapeToString(std::vector shape) { return out.str(); } -std::string ConvertDynamicShapeToString(std::vector shape) { +std::string ConvertDimShapeToString(const std::vector & shape) { std::stringstream out; out << "{ "; for (size_t i = 0; i < shape.size(); i++) { @@ -128,10 +127,12 @@ std::string ConvertDynamicShapeToString(std::vector shape) { return out.str(); } -std::string ConvertDynamicShapeToLength(std::vector shape) { +std::string ConvertDimShapeToLength(const std::vector & shape) { // convert generic shape to a string // multiply all the integer specified dimensions of the shape std::string length; + // case of empty vectors return 1 + if (shape.empty()) 
return "1"; size_t int_length = 0; for (size_t i = 0; i < shape.size(); i++) { if (shape[i].isParam) { @@ -145,12 +146,24 @@ std::string ConvertDynamicShapeToLength(std::vector shape) { } } // multiply the integer components to the parametric one + // if larger than 1 if (int_length > 0) { - if (!length.empty()) length += " * "; - length += std::to_string(int_length); + if (!length.empty() && int_length > 1) { + length += " * "; + length += std::to_string(int_length); + } else if (length.empty()) { // case is full known shape + length = std::to_string(int_length); + } } return length; } +std::string ConvertShapeToString(const std::vector & shape) { + return ConvertDimShapeToString(shape); +} +std::string ConvertDynamicShapeToLength(const std::vector & shape) { + return ConvertDimShapeToLength(shape); +} + namespace{ template @@ -169,6 +182,12 @@ static inline void copy_vector_data(int_t no_of_copies, int_t input_size, T* inp } } +bool IsInteger(const std::string & s) { + int value; + auto [ptr, ec] = std::from_chars(s.data(), s.data() + s.size(), value); + return ec == std::errc() && ptr == s.data() + s.size(); +} + bool UTILITY::AreSameShape(const std::vector& shapeA, const std::vector& shapeB) { if (shapeA.size() != shapeB.size()) { return false; @@ -330,17 +349,24 @@ std::vector UTILITY::MultidirectionalBroadcastShape(std::vector UTILITY::UnidirectionalBroadcastShape(std::vector shapeA, std::vector shapeB) +// check multi-directional broadcasting of two shapes (need to pass inputs by non const ref. 
since we might prepends with one's +// return a pair of integer flag and new broadcasted shape +// if flag = 0: shape are identical +// flag = 1: return shape is equal to A, we broadcast B +// flag = 2: return shape is equal to B we broadcast A +// flag = 3: return shape is common of two we broadcast A and B to output +std::pair> UTILITY::MultidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) { size_t sizeA = shapeA.size(); size_t sizeB = shapeB.size(); // Check if A and B have the same shape if (UTILITY::AreSameShape(shapeA, shapeB)){ - return shapeA; + return std::make_pair(0, shapeA); } // Find the common shape of A and B size_t size = std::max(sizeA, sizeB); if (sizeA < size) { + // prepend 1's in A to make of same shape as B std::vector newShapeA(size, 1); size_t offset = size - sizeA; std::copy(shapeA.begin(), shapeA.end(), newShapeA.begin() + offset); @@ -359,36 +385,116 @@ std::vector UTILITY::UnidirectionalBroadcastShape(std::vector s break; } } + int broadcastFlag = 0; if (broadcastable) { // The output shape is max(outShape, targetShape) std::vector targetShape(size, 1); for (size_t i = 0; i < size; i++) { targetShape[i] = std::max(shapeA[i], shapeB[i]); + if (shapeB[i] < targetShape[i]) broadcastFlag |= 1; + if (shapeA[i] < targetShape[i]) broadcastFlag |= 2; } - return targetShape; + return std::make_pair(broadcastFlag, targetShape); } else { throw - std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting tensors of shape " + std::runtime_error("TMVA::SOFIE - Error multidirectional broadcasting tensors of shape " + + ConvertShapeToString(shapeA) + " and " + ConvertShapeToString(shapeB) + + " to a common shape."); + } +} +// unidirectional broadcast- only B changes +std::vector UTILITY::UnidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) +{ + auto ret = UTILITY::MultidirectionalBroadcastShape(shapeA, shapeB); + if (ret.first > 1) { + std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting 
tensors of shape " + ConvertShapeToString(shapeA) + " and " + ConvertShapeToString(shapeB) + " to a common shape."); } + return ret.second; } -// UNidirectional boradcast specializaiton for vector +// for broadcasting Dim shapes +// flag indicates also which vector needs to be broadcasted +// flag & 1 == 1 : broadcast B -> A +// flag & 2 == 2 : broadcast A -> B +// flag & 4 == 4 a run time check is needed on shapes with values +std::pair> UTILITY::MultidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) { + size_t sizeA = shapeA.size(); + size_t sizeB = shapeB.size(); + // Check if A and B have the same shape + if (UTILITY::AreSameShape(shapeA, shapeB)){ + return std::make_pair(0, shapeA); + } + // Find the common shape of A and B + size_t size = std::max(sizeA, sizeB); + if (sizeA < size) { + // prepend 1's in A to make of same shape as B + std::vector newShapeA(size, Dim{1}); + size_t offset = size - sizeA; + std::copy(shapeA.begin(), shapeA.end(), newShapeA.begin() + offset); + shapeA = std::move(newShapeA); + } + if (sizeB < size) { + std::vector newShapeB(size, Dim{1}); + size_t offset = size - sizeB; + std::copy(shapeB.begin(), shapeB.end(), newShapeB.begin() + offset); + shapeB = std::move(newShapeB); + } + + int broadcastFlag = 0; + // The output shape is targetShape + std::vector targetShape(size); + for (size_t i = 0; i < size; i++) { + // assume we broadcast to the parametric value + if (shapeA[i] == shapeB[i]) { + targetShape[i] = shapeA[i]; + } else if (shapeA[i].isParam && shapeB[i].GetVal() == "1" ) { + // broadcast B to A (case A is parametric with ) + targetShape[i] = shapeA[i]; + broadcastFlag |= 1; + } else if (shapeA[i].GetVal() == "1" && shapeB[i].isParam) { + // broadcast A to B + targetShape[i] = shapeB[i]; + broadcastFlag |= 2; + } else if (!shapeA[i].isParam && !shapeB[i].isParam) { + if (shapeB[i].dim == 1) { + targetShape[i] = shapeA[i]; + broadcastFlag |= 1; + } else if (shapeA[i].dim == 1) { + targetShape[i] = 
shapeB[i]; + broadcastFlag |= 2; + } else { + // non broadcastable case cannot have A and B two different defined shapes different than one + broadcastFlag = -1; + } + } else if (shapeA[i].isParam && shapeB[i].isParam) { + // full dynamic case - we will decided at run time + std::stringstream s; + s << "std::max(" << shapeA[i] << "," << shapeB[i] << ")"; + // use -1 for dim to indicate is an expression + targetShape[i] = Dim { s.str() , static_cast(-1)}; + broadcastFlag |= 4; + } else if (shapeA[i].isParam && !shapeB[i].isParam) { + // A -> B need to check at run time if consistent + targetShape[i] = shapeB[i]; + broadcastFlag |= 6; + } else if (!shapeA[i].isParam && shapeB[i].isParam) { + // B -> A need to check at run time if consistent + targetShape[i] = shapeA[i]; + broadcastFlag |= 5; + } else { + // all cases should be covered + throw std::runtime_error("TMVA::SOFIE - Fatal error in MultiDirectionalBroadCastDimShape"); + } + } + if (broadcastFlag == -1) { + throw std::runtime_error("TMVA::SOFIE - Error multidirectional broadcasting tensors of shape " + + ConvertDimShapeToString(shapeA) + " and " + ConvertDimShapeToString(shapeB) + + " to a common shape."); + } -// specialization for vector of boolean -void UTILITY::UnidirectionalBroadcast(const std::vector & data, const std::vector& shape, const std::vector& targetShape, std::vector & broadcastedData) - { - // Prepend shape with ones - auto ncdata = const_cast &>(data); - if (shape.size() < targetShape.size()) { - size_t targetSize = targetShape.size(); - std::vector newShape(targetSize, 1); - size_t offset = targetSize - shape.size(); - std::copy(shape.begin(), shape.end(), newShape.begin() + offset); - UTILITY::BroadcastTensor &, std::vector &>(ncdata, newShape, targetShape, broadcastedData); - } - UTILITY::BroadcastTensor &, std::vector &>(ncdata, shape, targetShape, broadcastedData); + return std::make_pair(broadcastFlag, targetShape); } std::string UTILITY::Clean_name(std::string input_tensor_name){ @@ 
-413,15 +519,22 @@ std::vector UTILITY::ComputeStrideFromShape(const std::vector & shape) // assume row major layout const auto size = shape.size(); std::vector strides(size); - strides[size-1] = Dim{1}; - for (std::size_t i = 1; i < size; i++) { - if (!shape[size-i].isParam && !strides[size-i].isParam) - strides[size - 1 - i] = Dim{strides[size-i].dim * shape[size-i].dim}; - else - strides[size - 1 - i] = Dim{std::string(strides[size-i].GetVal() + "*" + shape[size-i].GetVal())}; + if (size > 0) { + strides[size-1] = Dim{1}; + for (std::size_t i = 1; i < size; i++) { + if (!shape[size-i].isParam && !strides[size-i].isParam) + strides[size - 1 - i] = Dim{strides[size-i].dim * shape[size-i].dim}; + else { + if (strides[size-i].GetVal() == "1") + strides[size - 1 - i] = shape[size-i]; + else if (shape[size-i].GetVal() == "1") + strides[size - 1 - i] = strides[size-i]; + else + strides[size - 1 - i] = Dim{std::string(strides[size-i].GetVal() + "*" + shape[size-i].GetVal())}; + } + } } return strides; } - -}//SOFIE +} // namespace SOFIE diff --git a/src/SOFIE_core/test/CMakeLists.txt b/src/SOFIE_core/test/CMakeLists.txt index 34bb49f..76e5e29 100644 --- a/src/SOFIE_core/test/CMakeLists.txt +++ b/src/SOFIE_core/test/CMakeLists.txt @@ -9,123 +9,243 @@ # @author Federico Sossai, Sanjiban Sengupta ############################################################################ +cmake_minimum_required(VERSION 3.14) +include(FetchContent) + include_directories(${CMAKE_SOURCE_DIR}/src/SOFIE_core/inc) include_directories(${CMAKE_SOURCE_DIR}/src/SOFIE_parsers/inc) +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + if (NOT ONNX_MODELS_DIR) set(ONNX_MODELS_DIR input_models) endif() -# Finding .onnx files to be parsed and creating the appropriate code to -# parse all file. 
It is much faster to combine all parsing in a single executable -# which will avoid initialization time (especially when using ROOT) -set(CAPTURE_STR "EmitModel( \"@1\", \"@2\");") +option(ENABLE_ALPAKA_TESTS "Enable Alpaka-based SOFIE tests" OFF) +set(ALPAKA_BACKEND "cuda" + CACHE STRING "Alpaka backend to test (cuda, cpu, hip, sycl)") +set_property(CACHE ALPAKA_BACKEND PROPERTY STRINGS cuda cpu hip sycl) + +# String template used to produce calls to EmitModel(...) per file. +set(CAPTURE_STR +"try {\n\ + EmitModel(\"@1\", \"@2\");\n\ +} catch (const std::exception& e) {\n\ + std::string msg = e.what();\n\ + if (msg.find(\"multiple output tensors are not supported\") != std::string::npos) {\n\ + std::cerr << \"[SKIP] Multiple outputs are not supported for @1\" << std::endl;\n\ + } else {\n\ + std::cerr << \"[ERROR] Failed processing @1: \" << msg << std::endl;\n\ + failures++;\n\ + }\n\ +} catch (...) {\n\ + std::cerr << \"[ERROR] Unknown failure processing @1\" << std::endl;\n\ + failures++;\n\ +}\n\ +") + +# --- Collect ONNX files --- set(ALL_CAPTURES "") -# Finding .onnx files to be parsed and creating the appropriate command file(GLOB ONNX_FILES "${ONNX_MODELS_DIR}/*.onnx") + foreach(onnx_file ${ONNX_FILES}) get_filename_component(fname ${onnx_file} NAME_WE) - get_filename_component(fdir ${onnx_file} DIRECTORY) - string(REPLACE "@1" ${onnx_file} cap ${CAPTURE_STR}) - string(REPLACE "@2" ${fname} cap ${cap}) - list(APPEND ALL_CAPTURES ${cap}) + string(REPLACE "@1" "${onnx_file}" cap "${CAPTURE_STR}") + string(REPLACE "@2" "${fname}" cap "${cap}") + string(APPEND ALL_CAPTURES "${cap}") endforeach() -string(REPLACE ";" ";\n" EMIT_CAPTURES "${ALL_CAPTURES}") + +set(EMIT_CAPTURES "${ALL_CAPTURES}") + configure_file(EmitFromONNX.cxx.in EmitFromONNX_all.cxx @ONLY) -configure_file(EmitFromRoot.cxx.in EmitFromRoot_all.cxx @ONLY) +configure_file(EmitFromONNX_GPU_ALPAKA.cxx.in EmitFromONNX_GPU_ALPAKA_all.cxx @ONLY) + +# --- CPU emitter --- +# 
ROOTTEST_GENERATE_EXECUTABLE(emitFromONNX EmitFromONNX_all.cxx +# LIBRARIES protobuf::libprotobuf SOFIE_core SOFIE_parsers +# FIXTURES_SETUP sofie-compile-models-onnx-build) + +# target_compile_options(emitFromONNX PRIVATE -Wno-unused-parameter -Wno-array-bounds) + +# ROOTTEST_ADD_TEST(SofieCompileModels_ONNX +# COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromONNX +# FIXTURES_REQUIRED sofie-compile-models-onnx-build +# FIXTURES_SETUP sofie-compile-models-onnx +# ) + +# # --- Custom model tests --- +# if (BLAS_FOUND) +# ROOTTEST_GENERATE_EXECUTABLE(TestCustomModelsFromONNX TestCustomModelsFromONNX.cxx +# LIBRARIES +# MathCore +# SOFIE_core +# BLAS::BLAS +# GTest::gtest +# GTest::gtest_main +# FIXTURES_REQUIRED sofie-compile-models-onnx +# FIXTURES_SETUP sofie-test-models-onnx-build +# ) + +# target_include_directories(TestCustomModelsFromONNX PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + +# ROOTTEST_ADD_TEST(TestCustomModelsFromONNX +# EXEC ./TestCustomModelsFromONNX +# FIXTURES_REQUIRED sofie-test-models-onnx-build +# ) +# endif() + +# # --- Python-based generators --- +# ROOT_FIND_PYTHON_MODULE(torch) +# if (ROOT_TORCH_FOUND) +# configure_file(Conv1dModelGenerator.py Conv1dModelGenerator.py COPYONLY) +# configure_file(Conv2dModelGenerator.py Conv2dModelGenerator.py COPYONLY) +# configure_file(Conv3dModelGenerator.py Conv3dModelGenerator.py COPYONLY) +# configure_file(ConvTrans2dModelGenerator.py ConvTrans2dModelGenerator.py COPYONLY) +# configure_file(LinearModelGenerator.py LinearModelGenerator.py COPYONLY) +# configure_file(RecurrentModelGenerator.py RecurrentModelGenerator.py COPYONLY) + +# if (BLAS_FOUND) +# ROOT_ADD_GTEST(TestSofieModels TestSofieModels.cxx +# LIBRARIES SOFIE_core SOFIE_parsers BLAS::BLAS +# INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR} +# ) +# endif() +# endif() + +# ROOT_EXECUTABLE(emitGNN GNN/EmitGNN.cxx LIBRARIES SOFIE_core) +# ROOT_ADD_TEST(tmva-sofie-EmitGNN COMMAND emitGNN) + +# ROOT_EXECUTABLE(EmitGraphIndependent 
GNN/EmitGraphIndependent.cxx LIBRARIES SOFIE_core) +# ROOT_ADD_TEST(tmva-sofie-EmitGraphIndependent COMMAND EmitGraphIndependent) + +# ========================= +# ALPAKA TESTS +# ========================= +if (ENABLE_ALPAKA_TESTS) + + string(TOLOWER "${ALPAKA_BACKEND}" _alpaka_backend) + + if (NOT _alpaka_backend IN_LIST ALPAKA_BACKEND) + message(FATAL_ERROR + "Unsupported ALPAKA_BACKEND='${ALPAKA_BACKEND}'. " + "Valid values: cuda, cpu, hip, sycl") + endif() -ROOTTEST_GENERATE_EXECUTABLE(emitFromONNX EmitFromONNX_all.cxx - LIBRARIES protobuf::libprotobuf SOFIE_core SOFIE_parsers - FIXTURES_SETUP sofie-compile-models-onnx-build) + FetchContent_Declare( + sofieBLAS + GIT_REPOSITORY https://github.com/ML4EP/sofieBLAS + GIT_TAG edf2259876e9f4fb5a8f72db20b2dfb5dc26b517 + ) + FetchContent_MakeAvailable(sofieBLAS) -# silence protobuf warnings seen in version 3.0 and 3.6. Not needed from protobuf version 3.17 -target_compile_options(emitFromONNX PRIVATE -Wno-unused-parameter -Wno-array-bounds) + FetchContent_Declare( + alpaka + GIT_REPOSITORY https://github.com/alpaka-group/alpaka + GIT_TAG 2fa91a34ed11b2076e474c5507d920e85cf9b79d + ) + FetchContent_MakeAvailable(alpaka) -ROOTTEST_ADD_TEST(SofieCompileModels_ONNX - COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromONNX ${onnx_file} ${CMAKE_CURRENT_BINARY_DIR}/${fname} - FIXTURES_REQUIRED sofie-compile-models-onnx-build - FIXTURES_SETUP sofie-compile-models-onnx -) + # --- ALPAKA emitter --- + ROOTTEST_GENERATE_EXECUTABLE(emitFromONNXAlpaka EmitFromONNX_GPU_ALPAKA_all.cxx + LIBRARIES protobuf::libprotobuf SOFIE_core SOFIE_parsers + FIXTURES_SETUP sofie-compile-models-onnx-alpaka-build + ) -# Creating a Google Test -if (BLAS_FOUND) # we need BLAS for compiling the models - ROOTTEST_GENERATE_EXECUTABLE(TestCustomModelsFromONNX TestCustomModelsFromONNX.cxx - LIBRARIES - MathCore - SOFIE_core - BLAS::BLAS - GTest::gtest - GTest::gtest_main - FIXTURES_REQUIRED - sofie-compile-models-onnx - FIXTURES_SETUP - 
sofie-test-models-onnx-build + target_compile_options(emitFromONNXAlpaka PRIVATE -Wno-unused-parameter -Wno-array-bounds) + + if (ONNX_FILES) + foreach(onnx_file ${ONNX_FILES}) + get_filename_component(fname ${onnx_file} NAME_WE) + add_custom_command(TARGET emitFromONNXAlpaka POST_BUILD + COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 + $ + "${onnx_file}" "${CMAKE_CURRENT_BINARY_DIR}/${fname}" + COMMENT "Running ALPAKA emitter on ${onnx_file}") + endforeach() + endif() + + ROOTTEST_ADD_TEST(SofieCompileModels_ONNX_Alpaka + COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromONNXAlpaka + FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka-build + FIXTURES_SETUP sofie-compile-models-onnx-alpaka ) - target_include_directories(TestCustomModelsFromONNX PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) - ROOTTEST_ADD_TEST(TestCustomModelsFromONNX - EXEC ./TestCustomModelsFromONNX - FIXTURES_REQUIRED sofie-test-models-onnx-build) -endif() -# For testing serialisation of RModel object + set(CXXFLAGS -O2 -g -DALPAKA_HAS_STD_ATOMIC_REF) + set(CXX_HOST_FLAGS -fPIC -pthread) + + # ---- Backend selection ---- + if (_alpaka_backend STREQUAL "cuda") + message(STATUS "Enabling Alpaka CUDA tests") + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) + + set(CUDA_ARCH "sm_86") + set(CXX_CUDA_FLAGS + -arch=${CUDA_ARCH} + -Wno-deprecated-gpu-targets + --extended-lambda + --expt-relaxed-constexpr) + + set_source_files_properties( + TestCustomModelsFromONNXForAlpakaCuda.cxx + PROPERTIES LANGUAGE CUDA + ) + -ROOTTEST_GENERATE_EXECUTABLE(emitFromROOT EmitFromRoot_all.cxx - LIBRARIES protobuf::libprotobuf RIO SOFIE_core SOFIE_parsers - FIXTURES_SETUP sofie-compile-models-onnx-root -) -# silence protobuf warnings seen in version 3.0 and 3.6. 
Not needed from protobuf version 3.17 -target_compile_options(emitFromROOT PRIVATE -Wno-unused-parameter -Wno-array-bounds) - -# Automatic compilation of headers from root files -ROOTTEST_ADD_TEST(SofieCompileModels_ROOT - COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromROOT - FIXTURES_REQUIRED sofie-compile-models-onnx-root - FIXTURES_SETUP sofie-compile-models-root +ROOTTEST_GENERATE_EXECUTABLE( + TestCustomModelsFromONNXForAlpakaCuda + TestCustomModelsFromONNXForAlpakaCuda.cxx + LIBRARIES MathCore SOFIE_core GTest::gtest GTest::gtest_main + FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka + FIXTURES_SETUP sofie-test-models-onnx-alpaka-build ) -if (BLAS_FOUND) - # Creating a Google Test for Serialisation of RModel - ROOTTEST_GENERATE_EXECUTABLE(TestCustomModelsFromROOT TestCustomModelsFromROOT.cxx - LIBRARIES - SOFIE_core - BLAS::BLAS - GTest::gtest - GTest::gtest_main - FIXTURES_REQUIRED - sofie-compile-models-root - FIXTURES_SETUP - sofie-test-models-root-build - ) - target_include_directories(TestCustomModelsFromROOT PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) - ROOTTEST_ADD_TEST(TestCustomModelsFromROOT - EXEC ./TestCustomModelsFromROOT - FIXTURES_REQUIRED sofie-test-models-root-build) -endif() -# Look for needed Python modules -ROOT_FIND_PYTHON_MODULE(torch) -if (ROOT_TORCH_FOUND) - configure_file(Conv1dModelGenerator.py Conv1dModelGenerator.py COPYONLY) - configure_file(Conv2dModelGenerator.py Conv2dModelGenerator.py COPYONLY) - configure_file(Conv3dModelGenerator.py Conv3dModelGenerator.py COPYONLY) - configure_file(ConvTrans2dModelGenerator.py ConvTrans2dModelGenerator.py COPYONLY) - configure_file(LinearModelGenerator.py LinearModelGenerator.py COPYONLY) - configure_file(RecurrentModelGenerator.py RecurrentModelGenerator.py COPYONLY) - - if (BLAS_FOUND) - ROOT_ADD_GTEST(TestSofieModels TestSofieModels.cxx - LIBRARIES - SOFIE_core - SOFIE_parsers - BLAS::BLAS - INCLUDE_DIRS - ${CMAKE_CURRENT_BINARY_DIR} + target_include_directories( + 
TestCustomModelsFromONNXForAlpakaCuda PRIVATE + ${CMAKE_CURRENT_BINARY_DIR} + ${alpaka_SOURCE_DIR}/include + ${SOFIE_INCLUDE} + ${sofieblas_SOURCE_DIR}/include + ${ROOT_INCLUDE_DIRS} + ${CUDA_BASE}/include + ${CMAKE_CURRENT_SOURCE_DIR} + ) + + set_target_properties( + TestCustomModelsFromONNXForAlpakaCuda + PROPERTIES CUDA_SEPARABLE_COMPILATION ON + ) + + target_compile_definitions( + TestCustomModelsFromONNXForAlpakaCuda PRIVATE + ALPAKA_ACC_GPU_CUDA_ENABLED + ) + + target_link_directories( + TestCustomModelsFromONNXForAlpakaCuda PRIVATE + ${CUDA_BASE}/lib64 + ) + + target_link_libraries(TestCustomModelsFromONNXForAlpakaCuda + CUDA::cublas + CUDA::cublasLt + CUDA::cudart + ${ROOT_LIBRARIES} + ) + + ROOTTEST_ADD_TEST(TestCustomModelsFromONNXForAlpakaCuda + EXEC ./TestCustomModelsFromONNXForAlpakaCuda + FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka ) - endif() -endif() -ROOT_EXECUTABLE(emitGNN GNN/EmitGNN.cxx LIBRARIES SOFIE_core) -ROOT_ADD_TEST(tmva-sofie-EmitGNN COMMAND emitGNN) + elseif (_alpaka_backend STREQUAL "cpu") + message(STATUS "Alpaka CPU backend selected (not yet implemented)") + elseif (_alpaka_backend STREQUAL "hip") + message(STATUS "Alpaka HIP backend selected (not yet implemented)") + elseif (_alpaka_backend STREQUAL "sycl") + message(STATUS "Alpaka SYCL backend selected (not yet implemented)") + endif() # backend -ROOT_EXECUTABLE(EmitGraphIndependent GNN/EmitGraphIndependent.cxx LIBRARIES SOFIE_core) -ROOT_ADD_TEST(tmva-sofie-EmitGraphIndependent COMMAND EmitGraphIndependent) +endif() # ENABLE_ALPAKA_TESTS diff --git a/src/SOFIE_core/test/EmitFromONNX.cxx.in b/src/SOFIE_core/test/EmitFromONNX.cxx.in index f7a56e2..c464f4d 100644 --- a/src/SOFIE_core/test/EmitFromONNX.cxx.in +++ b/src/SOFIE_core/test/EmitFromONNX.cxx.in @@ -23,7 +23,13 @@ int EmitModel(std::string filename, std::string outname) { int main(int argc, char *argv[]){ -@EMIT_CAPTURES@ ; + + int failures = 0; + + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ONNX] 
Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 0 : 1; } diff --git a/src/SOFIE_core/test/EmitFromONNX_GPU_ALPAKA.cxx.in b/src/SOFIE_core/test/EmitFromONNX_GPU_ALPAKA.cxx.in new file mode 100644 index 0000000..58198c1 --- /dev/null +++ b/src/SOFIE_core/test/EmitFromONNX_GPU_ALPAKA.cxx.in @@ -0,0 +1,27 @@ +// Author: Sanjiban Sengupta + +#include "SOFIE/RModel_Base.hxx" +#include "SOFIE/RModel.hxx" +#include "SOFIE/RModelParser_ONNX.hxx" + +using namespace SOFIE; + +int EmitModel(std::string filename, std::string outname) { + + RModelParser_ONNX parser; + RModel model = parser.Parse(filename); + model.GenerateGPU_ALPAKA(); + model.OutputGenerated(outname+"_FromONNX_GPU_ALPAKA.hxx"); + + return 0; +} + +int main(int argc, char *argv[]) { + + int failures = 0; + + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ONNX with ALPAKA] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 0 : 1; +} diff --git a/src/SOFIE_core/test/EmitFromRoot.cxx.in b/src/SOFIE_core/test/EmitFromRoot.cxx.in index 4a630c7..88c0789 100644 --- a/src/SOFIE_core/test/EmitFromRoot.cxx.in +++ b/src/SOFIE_core/test/EmitFromRoot.cxx.in @@ -43,6 +43,15 @@ int EmitModel(std::string inputfile, std::string outname){ int main(int argc, char *argv[]){ -@EMIT_CAPTURES@ ; + int failures = 0; + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ROOT] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 0 : 1; + + @EMIT_CAPTURES@; + + std::cout << "[SUMMARY] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 
0 : 1; } diff --git a/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx b/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx index d02dc5e..14eb6a3 100644 --- a/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx +++ b/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx @@ -812,7 +812,7 @@ TEST(ONNX, LinearWithLeakyRelu) { constexpr float TOLERANCE = 1; - // Preparing the standard all-ones input + // Preparing input std::vector input({ 0.4369, -0.6882, 1.0309, -1.0263, -0.1519, 1.2237, -0.7054, -0.1762, -0.6811, -2.2597, 1.0388, -0.7993, 0.1468, 1.3257, -0.4714, -0.0958, @@ -2515,7 +2515,7 @@ TEST(ONNX, Equal){ }); SOFIE_Equal::Session s("Equal_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(Equal_ExpectedOutput::outputs) / sizeof(bool)); @@ -2540,7 +2540,7 @@ TEST(ONNX, LessOrEqual){ }); SOFIE_LessOrEqual::Session s("LessOrEqual_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(LessOrEqual_ExpectedOutput::outputs) / sizeof(bool)); @@ -2565,7 +2565,7 @@ TEST(ONNX, GreaterOrEqual){ }); SOFIE_GreaterOrEqual::Session s("GreaterOrEqual_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(GreaterOrEqual_ExpectedOutput::outputs) / sizeof(bool)); @@ -2590,7 +2590,7 @@ TEST(ONNX, Greater){ }); SOFIE_Greater::Session s("Greater_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(Greater_ExpectedOutput::outputs) / sizeof(bool)); @@ -2615,7 +2615,7 @@ TEST(ONNX, Less){ }); SOFIE_Less::Session s("Less_FromONNX.dat"); - 
std::vector<bool> output = s.infer(input1.data(),input2.data());
+   std::vector<std::uint8_t> output = s.infer(input1.data(),input2.data());
 
    // Checking output size
    EXPECT_EQ(output.size(), sizeof(Less_ExpectedOutput::outputs) / sizeof(bool));
diff --git a/src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
new file mode 100644
index 0000000..1537ea4
--- /dev/null
+++ b/src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
@@ -0,0 +1,244 @@
+#include <cstddef>
+#include <cstdint>
+
+#include "Linear_16_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/Linear_16.ref.hxx"
+
+#include "Linear_32_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/Linear_32.ref.hxx"
+
+#include "Linear_64_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/Linear_64.ref.hxx"
+
+#include "LinearWithLeakyRelu_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/LinearWithLeakyRelu.ref.hxx"
+
+#include "LinearWithSigmoid_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/LinearWithSigmoid.ref.hxx"
+
+#include <alpaka/alpaka.hpp>
+#include <cmath>
+#include <vector>
+#include "gtest/gtest.h"
+
+constexpr float DEFAULT_TOLERANCE = 1e-3f;
+
+using Idx = std::size_t;
+using Dim = alpaka::DimInt<1u>;
+using Ext1D = alpaka::Vec<Dim, Idx>;
+
+// Fixture providing the CPU host device, the CUDA device and a blocking
+// queue shared by every ALPAKA inference test below.
+class SofieAlpakaTest : public ::testing::Test {
+protected:
+   // Shared devices and platforms
+   alpaka::PlatformCpu hostPlatform;
+   alpaka::DevCpu host;
+   alpaka::PlatformCudaRt platform;
+   alpaka::DevCudaRt device;
+   alpaka::Queue<alpaka::DevCudaRt, alpaka::Blocking> queue;
+
+   SofieAlpakaTest()
+      : hostPlatform{}
+      , host(alpaka::getDevByIdx(hostPlatform, 0u))
+      , platform{}
+      , device(alpaka::getDevByIdx(platform, 0u))
+      , queue(device)
+   {
+   }
+
+   void SetUp() override {
+      cudaDeviceSynchronize();
+   }
+
+   void TearDown() override {
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+   }
+
+   ~SofieAlpakaTest() override {
+      cudaDeviceSynchronize();
+   }
+};
+
+TEST_F(SofieAlpakaTest, Linear16)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   // All-ones input staged on the host, then copied to the device.
+   auto A = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{1600}));
+   float *A_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(A));
+
+   for (Idx i = 0; i < 1600; ++i) {
+      A_ptr[i] = 1.0;
+   }
+
+   auto A_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{1600}));
+   alpaka::memcpy(queue, A_d, A);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{160}));
+
+   {
+      SOFIE_Linear_16::Session session("Linear_16_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(A_d);
+
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+   }
+
+   float *res_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(result_h));
+   float *correct = Linear_16_ExpectedOutput::all_ones;
+
+   for (size_t i = 0; i < 160; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+   }
+}
+
+TEST_F(SofieAlpakaTest, Linear32)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   auto A = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{1600}));
+   float *A_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(A));
+
+   for (Idx i = 0; i < 1600; ++i) {
+      A_ptr[i] = 1.0;
+   }
+
+   auto A_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{1600}));
+   alpaka::memcpy(queue, A_d, A);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{160}));
+
+   {
+      SOFIE_Linear_32::Session session("Linear_32_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(A_d);
+
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+   }
+
+   float *res_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(result_h));
+   float *correct = Linear_32_ExpectedOutput::all_ones;
+
+   for (size_t i = 0; i < 160; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+   }
+}
+
+TEST_F(SofieAlpakaTest, Linear64)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   auto A = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{1600}));
+   float *A_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(A));
+
+   for (Idx i = 0; i < 1600; ++i) {
+      A_ptr[i] = 1.0;
+   }
+
+   auto A_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{1600}));
+   alpaka::memcpy(queue, A_d, A);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{160}));
+
+   {
+      SOFIE_Linear_64::Session session("Linear_64_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(A_d);
+
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+   }
+
+   float *res_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(result_h));
+   float *correct = Linear_64_ExpectedOutput::all_ones;
+
+   for (size_t i = 0; i < 160; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+   }
+}
+
+TEST_F(SofieAlpakaTest, LinearWithLeakyRelu)
+{
+   // Use the fixture's host/device/queue directly; re-declaring them here
+   // shadowed the shared members and created a second, divergent queue.
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   std::vector<float> input({
+      0.4369, -0.6882, 1.0309, -1.0263, -0.1519, 1.2237, -0.7054, -0.1762,
+      -0.6811, -2.2597, 1.0388, -0.7993, 0.1468, 1.3257, -0.4714, -0.0958,
+      0.7057, -0.3749, -0.3310, 0.0986, -0.1370, 0.0832, -1.6465, -0.2793
+   });
+
+   auto A = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+   float *A_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(A));
+
+   for (Idx i = 0; i < input.size(); ++i) {
+      A_ptr[i] = input[i];
+   }
+
+   auto A_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+   alpaka::memcpy(queue, A_d, A);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{24}));
+
+   {
+      // NOTE(review): unlike the other tests this Session takes no explicit
+      // .dat weight file — presumably the generated default matches; confirm.
+      SOFIE_LinearWithLeakyRelu::Session session;
+      auto result = session.infer(A_d);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+   }
+
+   float *res_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(result_h));
+   float *correct = LinearWithLeakyRelu_ExpectedOutput::outputs;
+
+   for (size_t i = 0; i < 24; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+   }
+}
+
+TEST_F(SofieAlpakaTest, LinearWithSigmoid)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   auto A = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{48}));
+   float *A_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(A));
+
+   for (Idx i = 0; i < 48; ++i) {
+      A_ptr[i] = 1.0;
+   }
+
+   auto A_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{48}));
+   alpaka::memcpy(queue, A_d, A);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{48}));
+
+   {
+      SOFIE_LinearWithSigmoid::Session session("LinearWithSigmoid_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(A_d);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+   }
+
+   float *res_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(result_h));
+   float *correct = LinearWithSigmoid_ExpectedOutput::all_ones;
+   // Verify the full 48-element output (previously only the first 24
+   // elements were checked even though 48 were transferred).
+   for (size_t i = 0; i < 48; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+   }
+}
diff --git a/src/SOFIE_parsers/CMakeLists.txt b/src/SOFIE_parsers/CMakeLists.txt
index 379b7d7..d77d1e6 100644
--- a/src/SOFIE_parsers/CMakeLists.txt
+++ b/src/SOFIE_parsers/CMakeLists.txt
@@ -102,6 +102,15 @@ target_include_directories(SOFIE_parsers PUBLIC
 
 set_target_properties(SOFIE_parsers PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
 
+ROOT_GENERATE_DICTIONARY(G__SOFIE_parsers ${sources_headers}
+   LINKDEF inc/LinkDef.h
+   MODULE SOFIE_parsers
+   OPTIONS --deep
+)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_parsers_rdict.pcm
+   ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_parsers.rootmap
+   DESTINATION lib)
+
 install(TARGETS SOFIE_parsers
    LIBRARY DESTINATION lib
 )
diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt
new file mode 100644
index 0000000..2ede060
--- /dev/null
+++ b/src/utils/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_library(utils INTERFACE)
+
+target_include_directories(utils INTERFACE
+   $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+   $<INSTALL_INTERFACE:include>
+)
+
+install(
+   DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/SOFIE
+   DESTINATION include
+)
diff --git
a/src/utils/SOFIE/RTensor.hxx b/src/utils/SOFIE/RTensor.hxx
new file mode 100644
index 0000000..db82dc9
--- /dev/null
+++ b/src/utils/SOFIE/RTensor.hxx
@@ -0,0 +1,628 @@
+#ifndef SOFIE_RTENSOR
+#define SOFIE_RTENSOR
+
+#include <vector>
+#include <cstddef>     // std::size_t
+#include <iostream>
+#include <stdexcept>   // std::runtime_error
+#include <sstream>     // std::stringstream
+#include <memory>      // std::shared_ptr
+#include <type_traits> // std::is_convertible
+#include <algorithm>   // std::reverse
+#include <iterator>    // std::random_access_iterator_tag
+
+namespace SOFIE {
+
+/// Memory layout type
+enum class MemoryLayout : uint8_t {
+   RowMajor = 0x01,
+   ColumnMajor = 0x02
+};
+
+namespace Internal {
+
+/// \brief Get size of tensor from shape vector
+/// \param[in] shape Shape vector
+/// \return Size of contiguous memory
+template <typename T>
+inline std::size_t GetSizeFromShape(const T &shape)
+{
+   if (shape.size() == 0)
+      return 0;
+   std::size_t size = 1;
+   for (auto &s : shape)
+      size *= s;
+   return size;
+}
+
+/// \brief Compute strides from shape vector.
+/// \param[in] shape Shape vector
+/// \param[in] layout Memory layout
+/// \return Size of contiguous memory
+///
+/// This information is needed for the multi-dimensional indexing. See here:
+/// https://en.wikipedia.org/wiki/Row-_and_column-major_order
+/// https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.strides.html
+template <typename T>
+inline std::vector<typename T::value_type> ComputeStridesFromShape(const T &shape, MemoryLayout layout)
+{
+   const auto size = shape.size();
+   T strides(size);
+   if (layout == MemoryLayout::RowMajor) {
+      for (std::size_t i = 0; i < size; i++) {
+         if (i == 0) {
+            strides[size - 1 - i] = 1;
+         } else {
+            strides[size - 1 - i] = strides[size - 1 - i + 1] * shape[size - 1 - i + 1];
+         }
+      }
+   } else if (layout == MemoryLayout::ColumnMajor) {
+      for (std::size_t i = 0; i < size; i++) {
+         if (i == 0) {
+            strides[i] = 1;
+         } else {
+            strides[i] = strides[i - 1] * shape[i - 1];
+         }
+      }
+   } else {
+      std::stringstream ss;
+      ss << "Memory layout type is not valid for calculating strides.";
+      throw std::runtime_error(ss.str());
+   }
+   return strides;
+}
+
+/// \brief Compute indices from global index
+/// \param[in] shape Shape vector
+/// \param[in] idx Global index
+/// \param[in] layout Memory layout
+/// \return Indice vector
+template <typename T>
+inline T ComputeIndicesFromGlobalIndex(const T& shape, MemoryLayout layout, const typename T::value_type idx)
+{
+   const auto size = shape.size();
+   auto strides = ComputeStridesFromShape(shape, layout);
+   T indices(size);
+   auto r = idx;
+   for (std::size_t i = 0; i < size; i++) {
+      indices[i] = int(r / strides[i]);
+      r = r % strides[i];
+   }
+   return indices;
+}
+
+/// \brief Compute global index from indices
+/// \param[in] strides Strides vector
+/// \param[in] idx Indice vector
+/// \return Global index
+template <typename U, typename V>
+inline std::size_t ComputeGlobalIndex(const U& strides, const V& idx)
+{
+   std::size_t globalIndex = 0;
+   const auto size = idx.size();
+   for (std::size_t i = 0; i < size; i++) {
+      globalIndex += strides[size - 1 - i] * idx[size - 1 - i];
+   }
+   return globalIndex;
+}
+
+/// \brief Type checking for all types of a parameter pack, e.g., used in combination with std::is_convertible
+template <class... Ts>
+struct and_types : std::true_type {
+};
+
+template <class T0, class... Ts>
+struct and_types<T0, Ts...> : std::integral_constant<bool, T0{} && and_types<Ts...>{}> {
+};
+
+/// \brief Copy slice of a tensor recursively from here to there
+/// \param[in] here Source tensor
+/// \param[in] there Target tensor (slice of source tensor)
+/// \param[in] mins Minimum of indices for each dimension
+/// \param[in] maxs Maximum of indices for each dimension
+/// \param[in] idx Current indices
+/// \param[in] active Active index needed to stop the recursion
+///
+/// Copy the content of a slice of a tensor from source to target. This is done
+/// by recursively iterating over the ranges of the slice for each dimension.
+template <typename T>
+void RecursiveCopy(const T &here, T &there,
+                   const std::vector<std::size_t> &mins, const std::vector<std::size_t> &maxs,
+                   std::vector<std::size_t> idx, std::size_t active)
+{
+   const auto size = idx.size();
+   for (std::size_t i = mins[active]; i < maxs[active]; i++) {
+      idx[active] = i;
+      if (active == size - 1) {
+         auto idxThere = idx;
+         for (std::size_t j = 0; j < size; j++) {
+            idxThere[j] -= mins[j];
+         }
+         there(idxThere) = here(idx);
+      } else {
+         Internal::RecursiveCopy(here, there, mins, maxs, idx, active + 1);
+      }
+   }
+}
+
+} // namespace SOFIE::Internal
+
+/// \class SOFIE::RTensor
+/// \brief RTensor is a container with contiguous memory and shape information.
+/// \tparam T Data-type of the tensor
+///
+/// An RTensor is a vector-like container, which has additional shape information.
+/// The elements of the multi-dimensional container can be accessed by their
+/// indices in a coherent way without taking care about the one-dimensional memory
+/// layout of the contiguous storage. This also allows to manipulate the shape
+/// of the container without moving the actual elements in memory. Another feature
+/// is that an RTensor can own the underlying contiguous memory but can also represent
+/// only a view on existing data without owning it.
+template <typename V, typename C = std::vector<V>>
+class RTensor {
+public:
+   // Typedefs
+   using Value_t = V;
+   using Shape_t = std::vector<std::size_t>;
+   using Index_t = Shape_t;
+   using Slice_t = std::vector<Shape_t>;
+   using Container_t = C;
+
+private:
+   Shape_t fShape;
+   Shape_t fStrides;
+   std::size_t fSize;
+   MemoryLayout fLayout;
+   Value_t *fData;
+   std::shared_ptr<Container_t> fContainer;
+
+protected:
+   void ReshapeInplace(const Shape_t &shape);
+
+public:
+   // Constructors
+
+   /// \brief Construct a tensor as view on data
+   /// \param[in] data Pointer to data contiguous in memory
+   /// \param[in] shape Shape vector
+   /// \param[in] layout Memory layout
+   RTensor(Value_t *data, Shape_t shape, MemoryLayout layout = MemoryLayout::RowMajor)
+      : fShape(shape), fLayout(layout), fData(data), fContainer(nullptr)
+   {
+      fSize = Internal::GetSizeFromShape(shape);
+      fStrides = Internal::ComputeStridesFromShape(shape, layout);
+   }
+
+   /// \brief Construct a tensor as view on data
+   /// \param[in] data Pointer to data contiguous in memory
+   /// \param[in] shape Shape vector
+   /// \param[in] strides Strides vector
+   /// \param[in] layout Memory layout
+   RTensor(Value_t *data, Shape_t shape, Shape_t strides, MemoryLayout layout = MemoryLayout::RowMajor)
+      : fShape(shape), fStrides(strides), fLayout(layout), fData(data), fContainer(nullptr)
+   {
+      fSize = Internal::GetSizeFromShape(shape);
+   }
+
+   /// \brief Construct a tensor owning externally provided data
+   /// \param[in] container Shared pointer to data container
+   /// \param[in] shape Shape vector
+   /// \param[in] layout Memory layout
+   RTensor(std::shared_ptr<Container_t> container, Shape_t shape,
+           MemoryLayout layout = MemoryLayout::RowMajor)
+      : fShape(shape), fLayout(layout), fContainer(container)
+   {
+      fSize = Internal::GetSizeFromShape(shape);
+      fStrides = Internal::ComputeStridesFromShape(shape, layout);
+      fData = std::data(*fContainer);
+   }
+
+   /// \brief Construct a tensor owning data initialized with new container
+   /// \param[in] shape Shape vector
+   /// \param[in] layout Memory layout
+   RTensor(Shape_t shape, MemoryLayout layout = MemoryLayout::RowMajor)
+      : fShape(shape), fLayout(layout)
+   {
+      // TODO: Document how data pointer is determined using STL iterator interface.
+      // TODO: Sanitize given container type with type traits
+      fSize = Internal::GetSizeFromShape(shape);
+      fStrides = Internal::ComputeStridesFromShape(shape, layout);
+      fContainer = std::make_shared<Container_t>(fSize);
+      fData = std::data(*fContainer);
+   }
+
+   // Access elements
+   Value_t &operator()(const Index_t &idx);
+   const Value_t &operator() (const Index_t &idx) const;
+   template <typename... Idx> Value_t &operator()(Idx... idx);
+   template <typename... Idx> const Value_t &operator() (Idx... idx) const;
+
+   // Access properties
+   std::size_t GetSize() const { return fSize; }
+   const Shape_t &GetShape() const { return fShape; }
+   const Shape_t &GetStrides() const { return fStrides; }
+   Value_t *GetData() { return fData; }
+   const Value_t *GetData() const { return fData; }
+   std::shared_ptr<Container_t> GetContainer() { return fContainer; }
+   const std::shared_ptr<Container_t> GetContainer() const { return fContainer; }
+   MemoryLayout GetMemoryLayout() const { return fLayout; }
+   bool IsView() const { return fContainer == nullptr; }
+   bool IsOwner() const { return !IsView(); }
+
+   // Copy
+   RTensor<Value_t, Container_t> Copy(MemoryLayout layout = MemoryLayout::RowMajor) const;
+
+   // Transformations
+   RTensor<Value_t, Container_t> Transpose() const;
+   RTensor<Value_t, Container_t> Squeeze() const;
+   RTensor<Value_t, Container_t> ExpandDims(int idx) const;
+   RTensor<Value_t, Container_t> Reshape(const Shape_t &shape) const;
+   RTensor<Value_t, Container_t> Resize(const Shape_t &shape);
+   RTensor<Value_t, Container_t> Slice(const Slice_t &slice);
+
+   // Iterator class
+   class Iterator {
+   private:
+      RTensor<Value_t, Container_t>& fTensor;
+      Index_t::value_type fGlobalIndex;
+   public:
+      using iterator_category = std::random_access_iterator_tag;
+      using value_type = Value_t;
+      using difference_type = std::ptrdiff_t;
+      using pointer = Value_t *;
+      using reference = Value_t &;
+
+      Iterator(RTensor<Value_t, Container_t>& x, typename Index_t::value_type idx) : fTensor(x), fGlobalIndex(idx) {}
+      Iterator& operator++() { fGlobalIndex++; return *this; }
+      Iterator operator++(int) { auto tmp = *this; operator++(); return tmp; }
+      Iterator& operator--() { fGlobalIndex--; return *this; }
+      Iterator operator--(int) { auto tmp = *this; operator--(); return tmp; }
+      Iterator operator+(difference_type rhs) const { return Iterator(fTensor, fGlobalIndex + rhs); }
+      Iterator operator-(difference_type rhs) const { return Iterator(fTensor, fGlobalIndex - rhs); }
+      difference_type operator-(const Iterator& rhs) { return fGlobalIndex - rhs.GetGlobalIndex(); }
+      Iterator& operator+=(difference_type rhs) { fGlobalIndex += rhs; return *this; }
+      Iterator& operator-=(difference_type rhs) { fGlobalIndex -= rhs; return *this; }
+      Value_t& operator*()
+      {
+         auto idx = Internal::ComputeIndicesFromGlobalIndex(fTensor.GetShape(), fTensor.GetMemoryLayout(), fGlobalIndex);
+         return fTensor(idx);
+      }
+      bool operator==(const Iterator& rhs) const
+      {
+         if (fGlobalIndex == rhs.GetGlobalIndex()) return true;
+         return false;
+      }
+      bool operator!=(const Iterator& rhs) const { return !operator==(rhs); };
+      bool operator>(const Iterator& rhs) const { return fGlobalIndex > rhs.GetGlobalIndex(); }
+      bool operator<(const Iterator& rhs) const { return fGlobalIndex < rhs.GetGlobalIndex(); }
+      bool operator>=(const Iterator& rhs) const { return fGlobalIndex >= rhs.GetGlobalIndex(); }
+      bool operator<=(const Iterator& rhs) const { return fGlobalIndex <= rhs.GetGlobalIndex(); }
+      typename Index_t::value_type GetGlobalIndex() const { return fGlobalIndex; };
+   };
+
+   // Iterator interface
+   // TODO: Document that the iterator always iterates following the physical memory layout.
+   Iterator begin() noexcept {
+      return Iterator(*this, 0);
+   }
+   Iterator end() noexcept {
+      return Iterator(*this, fSize);
+   }
+};
+
+/// \brief Reshape tensor in place
+/// \param[in] shape Shape vector
+/// Reshape tensor without changing the overall size
+template <typename Value_t, typename Container_t>
+inline void RTensor<Value_t, Container_t>::ReshapeInplace(const Shape_t &shape)
+{
+   const auto size = Internal::GetSizeFromShape(shape);
+   if (size != fSize) {
+      std::stringstream ss;
+      ss << "Cannot reshape tensor with size " << fSize << " into shape { ";
+      for (std::size_t i = 0; i < shape.size(); i++) {
+         if (i != shape.size() - 1) {
+            ss << shape[i] << ", ";
+         } else {
+            ss << shape[i] << " }.";
+         }
+      }
+      throw std::runtime_error(ss.str());
+   }
+
+   // Compute new strides from shape
+   auto strides = Internal::ComputeStridesFromShape(shape, fLayout);
+   fShape = shape;
+   fStrides = strides;
+}
+
+
+/// \brief Access elements
+/// \param[in] idx Index vector
+/// \return Reference to element
+template <typename Value_t, typename Container_t>
+inline Value_t &RTensor<Value_t, Container_t>::operator()(const Index_t &idx)
+{
+   const auto globalIndex = Internal::ComputeGlobalIndex(fStrides, idx);
+   return fData[globalIndex];
+}
+
+/// \brief Access elements
+/// \param[in] idx Index vector
+/// \return Reference to element
+template <typename Value_t, typename Container_t>
+inline const Value_t &RTensor<Value_t, Container_t>::operator() (const Index_t &idx) const
+{
+   const auto globalIndex = Internal::ComputeGlobalIndex(fStrides, idx);
+   return fData[globalIndex];
+}
+
+/// \brief Access elements
+/// \param[in] idx Indices
+/// \return Reference to element
+template <typename Value_t, typename Container_t>
+template <typename... Idx>
+Value_t &RTensor<Value_t, Container_t>::operator()(Idx... idx)
+{
+   static_assert(Internal::and_types<std::is_convertible<Idx, std::size_t>...>{},
+                 "Indices are not convertible to std::size_t.");
+   return operator()({static_cast<std::size_t>(idx)...});
+}
+
+/// \brief Access elements
+/// \param[in] idx Indices
+/// \return Reference to element
+template <typename Value_t, typename Container_t>
+template <typename... Idx>
+const Value_t &RTensor<Value_t, Container_t>::operator() (Idx... idx) const
+{
+   static_assert(Internal::and_types<std::is_convertible<Idx, std::size_t>...>{},
+                 "Indices are not convertible to std::size_t.");
+   return operator()({static_cast<std::size_t>(idx)...});
+}
+
+/// \brief Transpose
+/// \returns New RTensor
+/// The tensor is transposed by inverting the associated memory layout from row-
+/// major to column-major and vice versa. Therefore, the underlying data is not
+/// touched.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Transpose() const
+{
+   MemoryLayout layout;
+   // Transpose by inverting memory layout
+   if (fLayout == MemoryLayout::RowMajor) {
+      layout = MemoryLayout::ColumnMajor;
+   } else if (fLayout == MemoryLayout::ColumnMajor) {
+      layout = MemoryLayout::RowMajor;
+   } else {
+      throw std::runtime_error("Memory layout is not known.");
+   }
+
+   // Create copy of container
+   RTensor<Value_t, Container_t> x(fData, fShape, fStrides, layout);
+
+   // Reverse shape
+   std::reverse(x.fShape.begin(), x.fShape.end());
+
+   // Reverse strides
+   std::reverse(x.fStrides.begin(), x.fStrides.end());
+
+   return x;
+}
+
+/// \brief Squeeze dimensions
+/// \returns New RTensor
+/// Squeeze removes the dimensions of size one from the shape.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Squeeze() const
+{
+   // Remove dimensions of one and associated strides
+   Shape_t shape;
+   Shape_t strides;
+   for (std::size_t i = 0; i < fShape.size(); i++) {
+      if (fShape[i] != 1) {
+         shape.emplace_back(fShape[i]);
+         strides.emplace_back(fStrides[i]);
+      }
+   }
+
+   // If all dimensions are 1, we need to keep one.
+   // This does not apply if the inital shape is already empty. Then, return
+   // the empty shape.
+   if (shape.size() == 0 && fShape.size() != 0) {
+      shape.emplace_back(1);
+      strides.emplace_back(1);
+   }
+
+   // Create copy, attach new shape and strides and return
+   RTensor<Value_t, Container_t> x(*this);
+   x.fShape = shape;
+   x.fStrides = strides;
+   return x;
+}
+
+/// \brief Expand dimensions
+/// \param[in] idx Index in shape vector where dimension is added
+/// \returns New RTensor
+/// Inserts a dimension of one into the shape.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::ExpandDims(int idx) const
+{
+   // Compose shape vector with additional dimensions and adjust strides
+   const int len = fShape.size();
+   auto shape = fShape;
+   auto strides = fStrides;
+   if (idx < 0) {
+      idx = len + 1 + idx;
+   }
+   if (idx < 0) {
+      throw std::runtime_error("Given negative index is invalid.");
+   }
+   else if (idx > len) {
+      throw std::runtime_error("Given index is invalid.");
+   }
+   shape.insert(shape.begin() + idx, 1);
+   strides = Internal::ComputeStridesFromShape(shape, fLayout);
+
+   // Create view copy, attach new shape and strides and return
+   RTensor<Value_t, Container_t> x(*this);
+   x.fShape = shape;
+   x.fStrides = strides;
+   return x;
+}
+
+/// \brief Reshape tensor
+/// \param[in] shape Shape vector
+/// \returns New RTensor
+/// Reshape tensor without changing the overall size
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Reshape(const Shape_t &shape) const
+{
+   // Create copy, replace and return
+   RTensor<Value_t, Container_t> x(*this);
+   x.ReshapeInplace(shape);
+   return x;
+}
+
+/// \brief Resize tensor
+/// \param[in] shape Shape vector
+/// \returns New RTensor
+/// Resize tensor into new shape
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Resize(const Shape_t &shape)
+{
+   // Create new tensor with the specified shape
+   RTensor<Value_t, Container_t> x(shape, fLayout);
+
+   // Copying contents from previous tensor
+   size_t n = (x.GetSize()>fSize) ? fSize : x.GetSize();
+   std::copy(this->GetData(), this->GetData() + n, x.GetData() );
+
+   return x;
+}
+
+/// \brief Create a slice of the tensor
+/// \param[in] slice Slice vector
+/// \returns New RTensor
+/// A slice is a subset of the tensor defined by a vector of pairs of indices.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Slice(const Slice_t &slice)
+{
+   // Sanitize size of slice
+   const auto sliceSize = slice.size();
+   const auto shapeSize = fShape.size();
+   if (sliceSize != shapeSize) {
+      std::stringstream ss;
+      ss << "Size of slice (" << sliceSize << ") is unequal number of dimensions (" << shapeSize << ").";
+      throw std::runtime_error(ss.str());
+   }
+
+   // Sanitize slice indices
+   // TODO: Sanitize slice indices
+   /*
+   for (std::size_t i = 0; i < sliceSize; i++) {
+   }
+   */
+
+   // Convert -1 in slice to proper pair of indices
+   // TODO
+
+   // Recompute shape and size
+   Shape_t shape(sliceSize);
+   for (std::size_t i = 0; i < sliceSize; i++) {
+      shape[i] = slice[i][1] - slice[i][0];
+   }
+   auto size = Internal::GetSizeFromShape(shape);
+
+   // Determine first element contributing to the slice and get the data pointer
+   Value_t *data;
+   Shape_t idx(sliceSize);
+   for (std::size_t i = 0; i < sliceSize; i++) {
+      idx[i] = slice[i][0];
+   }
+   data = &operator()(idx);
+
+   // Create copy and modify properties
+   RTensor<Value_t, Container_t> x(*this);
+   x.fData = data;
+   x.fShape = shape;
+   x.fSize = size;
+
+   // Squeeze tensor and return
+   return x.Squeeze();
+}
+
+/// Copy RTensor to new object
+/// \param[in] layout Memory layout of the new RTensor
+/// \returns New RTensor
+/// The operation copies all elements of the current RTensor to a new RTensor
+/// with the given layout contiguous in memory. Note that this copies by default
+/// to a row major memory layout.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Copy(MemoryLayout layout) const
+{
+   // Create new tensor with zeros owning the memory
+   RTensor<Value_t, Container_t> r(fShape, layout);
+
+   // Copy over the elements from this tensor
+   const auto mins = Shape_t(fShape.size());
+   const auto maxs = fShape;
+   auto idx = mins;
+   Internal::RecursiveCopy(*this, r, mins, maxs, idx, 0);
+
+   return r;
+}
+
+/// \brief Pretty printing
+/// \param[in] os Output stream
+/// \param[in] x RTensor
+/// \return Modified output stream
+template <typename T>
+std::ostream &operator<<(std::ostream &os, RTensor<T> &x)
+{
+   const auto shapeSize = x.GetShape().size();
+   if (shapeSize == 1) {
+      os << "{ ";
+      const auto size = x.GetSize();
+      for (std::size_t i = 0; i < size; i++) {
+         os << x({i});
+         if (i != size - 1)
+            os << ", ";
+      }
+      os << " }";
+   } else if (shapeSize == 2) {
+      os << "{";
+      const auto shape = x.GetShape();
+      for (std::size_t i = 0; i < shape[0]; i++) {
+         os << " { ";
+         for (std::size_t j = 0; j < shape[1]; j++) {
+            os << x({i, j});
+            if (j < shape[1] - 1) {
+               os << ", ";
+            } else {
+               os << " ";
+            }
+         }
+         os << "}";
+      }
+      os << " }";
+   } else {
+      os << "{ printing not yet implemented for this rank }";
+   }
+   return os;
+}
+
+} // namespace SOFIE
+
+namespace cling {
+template <typename T>
+std::string printValue(SOFIE::RTensor<T> *x)
+{
+   std::stringstream ss;
+   ss << *x;
+   return ss.str();
+}
+} // namespace cling
+
+#endif // SOFIE_RTENSOR