diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..182ccd4 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,81 @@ +{ + "files.associations": { + "*.icc": "cpp", + "limits": "cpp", + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "cstdint": "cpp", + "deque": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "regex": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "cinttypes": "cpp", + "typeinfo": "cpp", + "charconv": "cpp", + "chrono": "cpp", + "condition_variable": "cpp", + "list": "cpp", + "ratio": "cpp", + "future": "cpp", + "mutex": "cpp", + "semaphore": "cpp", + "shared_mutex": "cpp", + "span": "cpp", + "stop_token": "cpp", + "thread": "cpp", + "cfenv": "cpp", + "variant": "cpp", + "format": "cpp", + "any": "cpp", + "source_location": "cpp", + "run_inference_particle_net.C": "cpp", + "test.C": "cpp" + } +} diff --git a/README.md b/README.md index 97902f8..597cb56 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,11 @@ source setup.sh ``` Now ROOT should also access the SOFIE libraries while it runs. 
This helps to accelerate development. Submit your developments here and we will proceed with the developments in ROOT carefull. - +3. To enable testing generated code with alpaka implementations, build using the following command: +```bash +cmake -Dtesting=ON -DENABLE_ALPAKA_TESTS=ON -DCMAKE_INSTALL_PREFIX=../install -DCMAKE_BUILD_TYPE=RelWithDebInfo .. +``` +The default architecture is CUDA, but can be configured using an additional `-DALPAKA_BACKEND=hip` CMake option. ## Inspiration The standalone version of SOFIE is developed with inspiration from the standalone version of RooFit developed by Jonas Rembser that can be found [here](https://github.com/guitargeek/roofit). diff --git a/settings-dev.cmake b/settings-dev.cmake new file mode 100644 index 0000000..6a8496f --- /dev/null +++ b/settings-dev.cmake @@ -0,0 +1,7 @@ +set (CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "" FORCE) +set (CMAKE_INSTALL_PREFIX ../install CACHE STRING "" FORCE) +set (CMAKE_INSTALL_BINDIR bin CACHE STRING "" FORCE) +set (CMAKE_INSTALL_INCLUDEDIR include CACHE STRING "" FORCE) +set (CMAKE_INSTALL_LIBDIR lib CACHE STRING "" FORCE) +set (testing ON CACHE BOOL "" FORCE) +set (mathmore ON CACHE BOOL "" FORCE) diff --git a/src/.vscode/settings.json b/src/.vscode/settings.json new file mode 100644 index 0000000..8bc121a --- /dev/null +++ b/src/.vscode/settings.json @@ -0,0 +1,61 @@ +{ + "files.associations": { + "*.icc": "cpp", + "iostream": "cpp", + "ostream": "cpp", + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "bitset": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "cstdint": "cpp", + "deque": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + 
"algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "regex": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "istream": "cpp", + "limits": "cpp", + "new": "cpp", + "numbers": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "cinttypes": "cpp", + "typeinfo": "cpp" + } +} \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c48e8d1..102ca3b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,3 +8,4 @@ set(sofie_legacy_eval_backend ON CACHE BOOL "" FORCE) add_subdirectory(SOFIE_core) add_subdirectory(SOFIE_parsers) +add_subdirectory(utils) diff --git a/src/SOFIE_core/CMakeLists.txt b/src/SOFIE_core/CMakeLists.txt index 7297957..4cab8e0 100644 --- a/src/SOFIE_core/CMakeLists.txt +++ b/src/SOFIE_core/CMakeLists.txt @@ -76,6 +76,7 @@ list(TRANSFORM sources_headers PREPEND "inc/") set(sources_cxx src/RModel_Base.cxx src/RModel.cxx + src/RModel_ALPAKA.cxx src/RModel_GNN.cxx src/RModel_GraphIndependent.cxx src/RFunction.cxx @@ -87,18 +88,24 @@ set(sources_cxx target_sources(SOFIE_core PRIVATE ${sources_headers} ${sources_cxx}) target_include_directories(SOFIE_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/inc) +target_link_libraries(SOFIE_core PUBLIC utils) target_link_libraries(SOFIE_core PUBLIC Tree Core RIO ) -ROOT_GENERATE_DICTIONARY(G__SOFIE ${sources_headers} +ROOT_GENERATE_DICTIONARY(G__SOFIE_core ${sources_headers} LINKDEF inc/LinkDef.h MODULE SOFIE_core OPTIONS --deep ) +# Install the dictionaries. 
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_core_rdict.pcm + ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_core.rootmap + DESTINATION lib) + install(TARGETS SOFIE_core LIBRARY DESTINATION lib ) diff --git a/src/SOFIE_core/README.md b/src/SOFIE_core/README.md index 033cad4..2259d7a 100644 --- a/src/SOFIE_core/README.md +++ b/src/SOFIE_core/README.md @@ -25,7 +25,6 @@ SOFIE works in a parser-generator working architecture. With SOFIE, the user get From ROOT command line, or in a ROOT macro, we can proceed with an ONNX model: ```c++ -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; SOFIE::RModel model = parser.Parse(“./example_model.onnx”); model.Generate(); @@ -73,7 +72,6 @@ SOFIE also supports generating inference code with RDataFrame as inputs, refer t Here is the updated list of supported ONNX operators. You can obtain this list by doing ```cpp -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; std::vector supportedOperators = parser.GetRegisteredOperators(); ``` @@ -164,7 +162,6 @@ The above operators are supported for tensors of the following types: You can also check your model whether all operators are implemented by doing the following: ```c++ -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; parser.CheckModel("example_model.ONNX"); ``` diff --git a/src/SOFIE_core/inc/SOFIE/RFunction.hxx b/src/SOFIE_core/inc/SOFIE/RFunction.hxx index 53c30e3..f79691a 100644 --- a/src/SOFIE_core/inc/SOFIE/RFunction.hxx +++ b/src/SOFIE_core/inc/SOFIE/RFunction.hxx @@ -3,6 +3,7 @@ #include "SOFIE/RModel_Base.hxx" #include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" #include #include diff --git a/src/SOFIE_core/inc/SOFIE/RModel.hxx b/src/SOFIE_core/inc/SOFIE/RModel.hxx index 79541af..ed28b48 100644 --- a/src/SOFIE_core/inc/SOFIE/RModel.hxx +++ b/src/SOFIE_core/inc/SOFIE/RModel.hxx @@ -16,14 +16,21 @@ private: int fVerbose = 0; int fBatchSize = -1; long fReadPos = 0; // reading file position + size_t 
fConstantTensorSize = 0; // size (in Bytes) of the allocated constant tensors + size_t fWeightsTensorSize = 0; // size (in Bytes) of the allocated weight tensors + size_t fOtherTensorSize = 0; // size (in Bytes) of intermediate tensors which are not managed by the memory pool + + OptimizationLevel fOptimizationLevel = OptimizationLevel::kExtended; std::unordered_map fInputTensorInfos; // input tensors where shape may not fully defined or other graph inputs? std::unordered_map fReadyInputTensorInfos; // input tensors where shape is full defined std::unordered_map fInitializedTensors; std::unordered_map fIntermediateTensorInfos; std::unordered_map fDynamicTensorInfos; + std::unordered_map, bool>> fShapeTensors; // constant tensors describing a shape std::unordered_map fShapeParams; // parameters defining the dynamic shape (e.g. batch size), store also its default value + std::vector fDimShapeNames; // parameter names used to define the shapes std::vector fOutputTensorNames; std::vector fInputTensorNames; // input tensor names using ONNX order @@ -58,9 +65,14 @@ public: int Verbose() const { return fVerbose;} - const std::vector &GetTensorShape(std::string name) const; - std::vector GetDynamicTensorShape(std::string name) const; - const ETensorType &GetTensorType(std::string name) const; + const std::vector &GetTensorShape(const std::string & name) const; + std::vector GetDimTensorShape(const std::string & name) const; + const ETensorType &GetTensorType(const std::string & name) const; + std::vector GetDynamicTensorShape(const std::string & name) const ; + + // get the values for the tensor representing a shape + const std::vector & GetShapeTensorValues(const std::string & tensor_name) const; + bool CheckIfTensorAlreadyExist(std::string tensor_name); void AddInputTensorInfo(std::string input_name, ETensorType type, std::vector shape); @@ -102,6 +114,8 @@ public: AddInitializedTensor(tensor_name, GetTemplatedType(T()), shape, data); } + void AddShapeTensor(const 
std::string & name, const std::vector & shapeValues, bool scalar = false); + // add and initialize subgraph to the model void InitializeSubGraph(std::shared_ptr graph); @@ -118,13 +132,15 @@ public: bool IsDimInputTensor(const std::string &name) const; // check if tensor is a fully specified input tensor bool IsReadyInputTensor(const std::string &name) const; + /// check if a tensor is a shape tensor + bool IsShapeTensor(const std::string & name) const; // Add intermediate tensor void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector dim_shape); void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector shape); // Add an intermediate dynamic tensor void AddDynamicTensor(std::string tensor_name, ETensorType type, std::vector shape); - + void AddShapeParam(const std::string & name, size_t def_value = 0); void AddInputTensorName(std::string name); void AddOutputTensorNameList(std::vector output_tensor_names); void @@ -132,6 +148,8 @@ public: void UpdateInitializedTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data); std::shared_ptr GetInitializedTensorData(std::string tensor_name); + template + std::vector GetTensorData(const std::string & name); void Initialize(int batchSize = -1, bool verbose = false); void Initialize(const std::map & inputParams, bool verbose = false); @@ -141,34 +159,64 @@ public: { Generate(static_cast>(options), batchSize, pos, verbose); } + void GenerateGPU_ALPAKA(std::underlying_type_t options, int batchSize = -1, bool verbose = false); + void GenerateGPU_ALPAKA(Options options = Options::kDefault, int batchSize = -1, bool verbose = false) + { + GenerateGPU_ALPAKA(static_cast>(options), batchSize, verbose); + } // generate the infer function signature. If isdecl= false generate the calling infer function // used to infer the sub-graphs std::string GenerateInferSignature(bool isdecl = true); + // generate the infer function signature for inference on ALPAKA. 
If isdecl= false generate the calling infer function + // used to infer the sub-graphs + std::string GenerateInferSignature_GPU_ALPAKA(bool isdecl = true); + + void RemoveIntermediateTensor(const std::string& tensor_name){ + fIntermediateTensorInfos.erase(tensor_name); + } + // calculate total intermediate memory and position intermediate tensor addresses - std::string AllocateIntermediateMemory(std::span op_output_tensors); - void CheckAndFlushIntermediateMemory(std::span op_output_tensors, const size_t& op_idx); + std::string AllocateIntermediateMemory(std::span op_output_tensors); + void CheckAndFlushIntermediateMemory(std::span op_output_tensors, const size_t& op_idx); protected: // internal functions // generate code for the initialized tensors void GenerateInitializedTensorInfo(); + + void GenerateInitializedTensorInfo_GPU_ALPAKA(); // generate code for the intermediate tensors void GenerateIntermediateTensorInfo(); + + // generate code for the temporary initialized tensors containers + void GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA(); + // generate code for the dynamic tensors void GenerateDynamicTensorInfo(); + + void GenerateDynamicTensorInfo_GPU_ALPAKA(); // generate code for declarations needed by operators void GenerateOperatorDeclarations(); // generate code for inference void GenerateOutput(); + + void GenerateOutput_GPU_ALPAKA(); + + void MoveInitializedTensorsToBuffers_ALPAKA(); // generate code for initializing memory pool for intermediate tensors void GenerateIntermediateMemoryPool(); // Generate all session code void GenerateSessionCode(); + void GenerateSessionCode_GPU_ALPAKA(); + void GenerateGPU_ALPAKA_Buffers(); + + void CheckAndFuseOperators(); public: const std::vector &GetInputTensorNames() const { return fInputTensorNames; } const std::vector &GetOutputTensorNames() const { return fOutputTensorNames; } + const std::vector & GetDimShapeNames() const { return fDimShapeNames; } void ReadInitializedTensorsFromFile(long); long 
WriteInitializedTensorsToFile(std::string filename = ""); @@ -203,6 +251,21 @@ public: ClassDefNV(RModel, 3); }; +template +inline std::vector RModel::GetTensorData(const std::string & name) { + if (!IsInitializedTensor(name)) return std::vector{}; + T * data = static_cast(GetInitializedTensorData(name).get()); + size_t size = ConvertShapeToLength(GetTensorShape(name)); + return std::vector(data, data+size); +} + +template<> +inline std::vector RModel::GetTensorData(const std::string & name) { + if (!IsShapeTensor(name)) return std::vector{}; + return GetShapeTensorValues(name); +} + + } // namespace SOFIE #endif // SOFIE_RMODEL diff --git a/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx b/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx index f8a9d34..deac58b 100644 --- a/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx +++ b/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx @@ -12,7 +12,6 @@ #include #include #include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" #include "TBuffer.h" @@ -27,10 +26,26 @@ enum class Options { kGNNComponent = 0x10, }; +// Optimization levels inspired by ONNXRuntime. +// We only get Operator Fusion with the Basic, and +// memory reuse with Extended. 
kExtended is enabled +// by default +enum class OptimizationLevel { + kBasic = 0x0, + kExtended = 0x1, +}; + enum class WeightFileType { None, RootBinary, Text }; -std::underlying_type_t operator|(Options opA, Options opB); -std::underlying_type_t operator|(std::underlying_type_t opA, Options opB); + +inline std::underlying_type_t operator|(Options opA, Options opB) { + return static_cast>(opA) | + static_cast>(opB); +} + +inline std::underlying_type_t operator|(std::underlying_type_t opA, Options opB) { + return opA | static_cast>(opB); +} class RModel_Base { @@ -53,6 +68,43 @@ protected: bool fIsGNN = false; bool fIsGNNComponent = false; + // Function to generate the code for declaring and initializing constant tensors + // This is for tensors which are not part of weight files and can be created from the Constant operator + template + std::string GenerateConstantTensorCode(const std::pair &t) + { + std::stringstream strs; + std::string type = ConvertTypeToString(t.second.type()); + size_t length = ConvertShapeToLength(t.second.shape()); + // avoid using stack sizes for constant tensors to reduce compilation time + bool allocateOnStack = (length > 100) ? 
false : true; + + const T *data = t.second.data(); + + // and check if all values are the same + bool sameData = false; + // for non stack allocation check if data are the same + if (!allocateOnStack && length > 1) { + size_t idx = 1; + do { + sameData = (data[idx] == data[idx - 1]); + idx++; + } while (sameData && idx < length); + } + if (allocateOnStack) { + strs << type << " tensor_" << t.first << "[" << length << "] = " << ConvertValuesToString(length, data) << ";\n"; + } else { + strs << "std::vector<" << type << "> fTensor_" << t.first << " = "; + if (sameData) + strs << "std::vector<" << type << ">(" << length << ", " << ConvertValToString(data[0]) << ");\n"; + else { + strs << ConvertValuesToString(length, data) << ";\n"; + } + strs << "const " << type << " * tensor_" + t.first + " = fTensor_" + t.first + ".data();\n"; + } + return strs.str(); + } + public: /** Default constructor. Needed to allow serialization of ROOT objects. See @@ -82,6 +134,7 @@ public: fCustomOpHeaders.insert(filename); } void GenerateHeaderInfo(std::string &hgname); + void GenerateHeaderInfo_GPU_ALPAKA(std::string& hgname); void PrintGenerated() { std::cout << fGC; } std::string ReturnGenerated() { return fGC; } diff --git a/src/SOFIE_core/inc/SOFIE/ROperator.hxx b/src/SOFIE_core/inc/SOFIE/ROperator.hxx index edbec58..17b62f6 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator.hxx @@ -2,63 +2,105 @@ #define SOFIE_ROPERATOR #include +#include #include #include "SOFIE/SOFIE_common.hxx" -//#include "RModel.hxx" - - namespace SOFIE{ class RModel; +enum class OperatorKind { + GEMM = 0, + LAYERNORM = 1, + RELU = 2, + CONSTANT = 3, + CONSTANTOFSHAPE = 4, + UNDEFINED = 5, + CONV=6, + BATCHNORM=7 +}; + +inline const char* toString(OperatorKind kind) { + switch (kind) { + case OperatorKind::GEMM: return "GEMM"; + case OperatorKind::LAYERNORM: return "LAYERNORM"; + case OperatorKind::RELU: return "RELU"; + case OperatorKind::CONSTANT: return "CONSTANT"; 
+ case OperatorKind::CONSTANTOFSHAPE: return "CONSTANTOFSHAPE"; + case OperatorKind::BATCHNORM: return "BATCHNORM"; + case OperatorKind::CONV: return "CONV"; + case OperatorKind::UNDEFINED: return "UNDEFINED"; + default: return "UNKNOWN"; + } +} + +inline std::set FusableKinds = { OperatorKind::RELU, OperatorKind::LAYERNORM, OperatorKind::BATCHNORM}; + class ROperator{ public: virtual std::vector GetBlasRoutines() { return {}; } virtual std::vector GetStdLibs() { return {}; } - virtual std::vector> ShapeInference(std::vector>) = 0; - virtual std::vector TypeInference(std::vector) = 0; + virtual std::vector> ShapeInference(std::vector>) { return {}; }; + virtual std::vector TypeInference(std::vector) { return {}; }; virtual void Initialize(RModel&) = 0; virtual std::string Generate(std::string OpName) = 0; //expect unique opName for each operator within the same RModel + virtual std::string Generate_GPU_ALPAKA(std::string OpName){ return "";} //expect unique opName for each operator within the same RModel // generate initialization code for session constructor virtual std::string GenerateInitCode() { return "";} + virtual std::string GenerateInitCode_GPU_ALPAKA() { return "";}; // generate some specific declaration code for Session virtual std::string GenerateDeclCode() { return "";} // generate session data members specific to operator virtual std::string GenerateSessionMembersCode(std::string /*opName*/) { return ""; } + virtual std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) { return ""; } + virtual std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) { return ""; } virtual std::string Header() { return "";} + virtual std::string GetFusableOutputTensorName() { return "";} + virtual std::string GetBlasConfig() { return ""; } + virtual void UpdateFusableTensorName(std::string, const std::function& removal_func){ return;}; //virtual void Forward_reference() = 0; //virtual void Forward_blas() = 0; virtual ~ROperator(){} protected: 
- + OperatorKind fKind = OperatorKind::UNDEFINED; + size_t fOpOrder = 0; const std::string SP = " "; ///< space used to correctly indent the generated C++ code bool fUseSession = false; ///< flag to identify if using the session class bool fIsOutputConstant = false; ///< flag to identify if operator has a constant output (no need to generate code) - - mutable std::vector fInputTensorNames; - mutable std::vector fOutputTensorNames; + bool fIsOutputParamShape = false; ///< flag to identify if the output represents a parametric shape (can be known at compile time) + + mutable std::vector fInputTensorNames; + mutable std::vector fOutputTensorNames; public: - std::span GetOpInputTensors() const { + std::span GetOpInputTensors() const { return fInputTensorNames; } - std::span GetOpOutputTensors() const { + std::span GetOpOutputTensors() const { return fOutputTensorNames; } - + + OperatorKind GetKind() const { return fKind; } + + void RegisterOperatorOrder(const size_t ord){ + fOpOrder = ord; + } + size_t GetOpOrder(){ + return fOpOrder; + } + }; }//SOFIE - #endif //SOFIE_OPERATOR diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx index 127eaff..2d0e6cb 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx @@ -1,5 +1,5 @@ -#ifndef SOFIE_ROperator_BasicBinary -#define SOFIE_ROperator_BasicBinary +#ifndef SOFIE_SOFIE_ROperator_BasicBinary +#define SOFIE_SOFIE_ROperator_BasicBinary #include "SOFIE/SOFIE_common.hxx" #include "SOFIE/ROperator.hxx" @@ -7,9 +7,15 @@ #include -namespace SOFIE{ +namespace SOFIE { -enum EBasicBinaryOperator { Add, Sub, Mul, Div, Pow }; +enum EBasicBinaryOperator { + Add, + Sub, + Mul, + Div, + Pow +}; template struct BinaryOperatorTrait {}; @@ -17,42 +23,42 @@ struct BinaryOperatorTrait {}; template struct BinaryOperatorTrait { static const std::string Name() { return "Add"; } - static std::string Op(const std::string
& t1, const std::string t2) { return t1 + " + " + t2; } - static T Func(T t1, T t2) {return t1 + t2;} + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " + " + t2; } + static T Func(T t1, T t2) { return t1 + t2; } }; template struct BinaryOperatorTrait { static const std::string Name() { return "Sub"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " - " + t2; } - static T Func (T t1, T t2) { return t1 - t2;} + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " - " + t2; } + static T Func(T t1, T t2) { return t1 - t2; } }; template struct BinaryOperatorTrait { static const std::string Name() { return "Mul"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " * " + t2; } - static T Func (T t1, T t2) { return t1 * t2;} + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " * " + t2; } + static T Func(T t1, T t2) { return t1 * t2; } }; template struct BinaryOperatorTrait { static const std::string Name() { return "Div"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " / " + t2; } - static T Func (T t1, T t2) { return t1/t2;} + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " / " + t2; } + static T Func(T t1, T t2) { return t1 / t2; } }; template struct BinaryOperatorTrait { static const std::string Name() { return "Pow"; } - static std::string Op(const std::string & t1, const std::string t2) { return "std::pow(" + t1 + "," + t2 + ")"; } - static T Func (T t1, T t2) { return std::pow(t1,t2);} + static std::string Op(const std::string &t1, const std::string t2) { return "std::pow(" + t1 + "," + t2 + ")"; } + static T Func(T t1, T t2) { return std::pow(t1, t2); } }; -template -class ROperator_BasicBinary final : public ROperator{ +template +class ROperator_BasicBinary final : public ROperator { private: - + int fBroadcastFlag = 0; 
std::string fNA; std::string fNB; std::string fNBroadcastedA; @@ -63,154 +69,446 @@ private: std::vector fShapeB; std::vector fShapeY; + std::vector fDimShapeA; + std::vector fDimShapeB; + std::vector fDimShapeY; + public: - ROperator_BasicBinary(){} - ROperator_BasicBinary(std::string nameA, std::string nameB, std::string nameY): - fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNA, fNB }; - fOutputTensorNames = { fNY }; - } + ROperator_BasicBinary() {} + ROperator_BasicBinary(std::string nameA, std::string nameB, std::string nameY) + : fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)) + { + fInputTensorNames = {fNA, fNB}; + fOutputTensorNames = {fNY}; + } // type of output given input - std::vector TypeInference(std::vector input) override { - return input; - } + std::vector TypeInference(std::vector input) override { return input; } // shape of output tensors given input tensors - std::vector> ShapeInference(std::vector> input) override { + std::vector> ShapeInference(std::vector> input) override + { // assume now inputs have same shape (no broadcasting) auto ret = std::vector>(1, input[0]); // return vector size 1 with first input return ret; } - void Initialize(RModel& model) override { + void Initialize(RModel &model) override + { // input must be a graph input, or already initialized intermediate tensor - if (!model.CheckIfTensorAlreadyExist(fNA)){ + if (!model.CheckIfTensorAlreadyExist(fNA)) { throw std::runtime_error(std::string("TMVA SOFIE Binary Op Input Tensor ") + fNA + "is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNB)) { throw std::runtime_error(std::string("TMVA SOFIE Binary Op Input Tensor ") + fNB + "is not found in model"); } - fShapeA = model.GetTensorShape(fNA); - fShapeB = model.GetTensorShape(fNB); - bool broadcast = !UTILITY::AreSameShape(fShapeA, fShapeB); - if (broadcast) { - // Y is the common 
shape of A and B - fShapeY = UTILITY::UnidirectionalBroadcastShape(fShapeA, fShapeB); - bool broadcastA = !UTILITY::AreSameShape(fShapeA, fShapeY); - bool broadcastB = !UTILITY::AreSameShape(fShapeB, fShapeY); - // Broadcast A to Y - if (broadcastA) { - fNBroadcastedA = "Broadcasted" + fNA + "to" + fNY; - if (model.IsInitializedTensor(fNA)) { - auto data = model.GetInitializedTensorData(fNA); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), - std::default_delete()); - // Update the data and the shape of A - model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData); - fShapeA = fShapeY; + int dynamicInputs = 0; + if (model.IsDynamicTensor(fNA)) { + fDimShapeA = model.GetDimTensorShape(fNA); + dynamicInputs |= 1; + } else { + fShapeA = model.GetTensorShape(fNA); + fDimShapeA = ConvertShapeToDim(fShapeA); + } + if (model.IsDynamicTensor(fNB)) { + dynamicInputs |= 2; + fDimShapeB = model.GetDimTensorShape(fNB); + } else { + fShapeB = model.GetTensorShape(fNB); + fDimShapeB = ConvertShapeToDim(fShapeB); + } + if (dynamicInputs & 1 && model.Verbose()) + std::cout << BinaryOperatorTrait::Name() << " : input " << fNA << " is dynamic " + << ConvertDimShapeToString(fDimShapeA) << " "; + if (dynamicInputs & 2 && model.Verbose()) + std::cout << BinaryOperatorTrait::Name() << " : input " << fNB << " is dynamic " + << ConvertDimShapeToString(fDimShapeB) << " "; + std::cout << std::endl; + // check if need to broadcast at initialization time if shapes are known and different + // (we could broadcast the tensor tensor to maximum values of dynamic shapes - to be done) + // case of known shapes + // if shapes are known find the output shape from broadcasting + if (dynamicInputs == 0) { + auto ret = UTILITY::MultidirectionalBroadcastShape(fShapeA, fShapeB); + fBroadcastFlag = ret.first; + fShapeY = ret.second; + if (model.IsConstantTensor(fNA) && model.IsConstantTensor(fNB)) { + bool 
broadcast = fBroadcastFlag > 0; + if (broadcast) { + // Y is the common shape of A and B + bool broadcastA = fBroadcastFlag & 2; + bool broadcastB = fBroadcastFlag & 1; + // Broadcast A to Y + if (broadcastA) { + fNBroadcastedA = "Broadcasted" + fNA + "to" + fNY; + auto data = model.GetInitializedTensorData(fNA); + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), + std::default_delete()); + if (model.Verbose()) + std::cout << "broadcasted data A " << ConvertShapeToString(fShapeY) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeY), + static_cast(broadcastedData.get())) + << std::endl; + // Update the data and the shape of A + model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData); + fShapeA = fShapeY; + fDimShapeA = ConvertShapeToDim(fShapeA); + } + // Broadcast B to Y + if (broadcastB) { + fNBroadcastedB = "Broadcasted" + fNB + "to" + fNY; + auto data = model.GetInitializedTensorData(fNB); + if (model.Verbose()) + std::cout << "data B " << ConvertShapeToString(fShapeB) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeB), static_cast(data.get())) + << std::endl; + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), + std::default_delete()); + // do not update tensor B but add broadcasted one (since it can be input to some other operators) + if (model.Verbose()) + std::cout << "broadcasted data B " << ConvertShapeToString(fShapeY) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeY), + static_cast(broadcastedData.get())) + << std::endl; + model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData); + fShapeB = fShapeY; + fDimShapeB = ConvertShapeToDim(fShapeB); + } } else { - // Add an intermediate tensor for broadcasting A - model.AddIntermediateTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY); + fShapeY = fShapeA; } - } - // 
Broadcast B to Y - if (broadcastB) { - fNBroadcastedB = "Broadcasted" + fNB + "to" + fNY; - if (model.IsInitializedTensor(fNB)) { - auto data = model.GetInitializedTensorData(fNB); - std::cout << "data B " << ConvertShapeToString(fShapeB) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeB), static_cast(data.get())) << std::endl; - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), - std::default_delete()); - // do not update tensor B but add broadcasted one (since it can be input to some other operators) - std::cout << "broadcasted data B " << ConvertShapeToString(fShapeY) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeY), static_cast(broadcastedData.get())) << std::endl; - model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData); - fShapeB = fShapeY; - } else { - // Add an intermediate tensor for broadcasting B - model.AddIntermediateTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY); + // tensors are constant: perform here the binary operation + + const std::string &nameA = fNBroadcastedA.empty() ? fNA : fNBroadcastedA; + const std::string &nameB = fNBroadcastedB.empty() ? 
fNB : fNBroadcastedB; + auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); + auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); + std::vector dataY(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < dataY.size(); i++) { + dataY[i] = BinaryOperatorTrait::Func(dataA[i], dataB[i]); } + model.AddConstantTensor(fNY, fShapeY, dataY.data()); + // flag tensors to not be written in the weight file + model.SetNotWritableInitializedTensor(nameA); + model.SetNotWritableInitializedTensor(nameB); + fIsOutputConstant = true; + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << fNA << " " << ConvertShapeToString(fShapeA) + << " , " << fNB << " " << ConvertShapeToString(fShapeB) << " ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << " : " << ConvertValuesToString(dataY) << std::endl; + } + } else { + // case of defined and non-constant tensors + model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << fNA << " " << ConvertShapeToString(fShapeA) + << " , " << fNB << " " << ConvertShapeToString(fShapeB) << " ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << std::endl; + } + // we convert non-dim shapes to Dim shapes + fDimShapeY = ConvertShapeToDim(fShapeY); } } else { - fShapeY = fShapeA; - } - // check case of constant output (if all inputs are defined) - if (model.IsInitializedTensor(fNA) && model.IsInitializedTensor(fNB)) { - const std::string& nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - const std::string& nameB = fNBroadcastedB.empty()? 
fNB : fNBroadcastedB; - auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); - auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); - std::vector dataY(ConvertShapeToLength(fShapeY)); - for (size_t i = 0; i < dataY.size(); i++) { - dataY[i] = BinaryOperatorTrait::Func(dataA[i], dataB[i]); + // case A or B have dynamic shapes. We need to broadcast if shape are not same + auto ret = UTILITY::MultidirectionalBroadcastShape(fDimShapeA, fDimShapeB); + fBroadcastFlag = ret.first; + fDimShapeY = ret.second; + // case of all parametric shapes and MultiDirectionalBroadcastShape return the max of the 2 + // need to do before we declare the output tensor shape and the broadcasted ones + if (ret.first & 4) { + // check if one of the parameter is an input dimension + // define function to find this + auto IsInputDimParam = [&](const std::string &p) { + auto inputNames = model.GetInputTensorNames(); + for (auto &input : inputNames) { + for (auto &i_s : model.GetDimTensorShape(input)) { + if (i_s.isParam && i_s.param == p) + return true; + } + } + return false; + }; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + auto &s = fDimShapeY[i]; + if (s.isParam && s.param.find("std::max") != std::string::npos) { + if (IsInputDimParam(fDimShapeA[i].param)) { + // case dim is 1 we indicate that the input parameter is equal to 1 + if (fDimShapeA[i].dim != 1) + s = fDimShapeA[i]; + else + s = fDimShapeB[i]; + } else if (IsInputDimParam(fDimShapeB[i].param)) { + if (fDimShapeB[i].dim != 1) + s = fDimShapeB[i]; + else + s = fDimShapeA[i]; + } + } + } + } + + model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fDimShapeY); + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << ConvertShapeToString(fDimShapeA) << " , " + << ConvertShapeToString(fDimShapeB) << " --> " << ConvertShapeToString(fDimShapeY) << std::endl; } - model.AddConstantTensor(fNY, fShapeY, dataY.data()); - // flag tensors to not be written in a fil - 
model.SetNotWritableInitializedTensor(nameA); - model.SetNotWritableInitializedTensor(nameB); - fIsOutputConstant = true; - if (model.Verbose()) - std::cout << "Binary op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(dataY) << std::endl; - } - else { - model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); } } - std::string GenerateInitCode() override { + std::string GenerateInitCode() override + { std::stringstream out; return out.str(); } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override + { - if (fIsOutputConstant) return ""; + if (fIsOutputConstant) + return ""; - OpName = "op_" + OpName; + opName = "op_" + opName; - if (fShapeY.empty()) { + if (fDimShapeY.empty()) { throw std::runtime_error("TMVA SOFIE Binary Op called to Generate without being initialized first"); } std::stringstream out; - out << SP << "\n//------ " << BinaryOperatorTrait::Name() << "\n"; - size_t length = ConvertShapeToLength(fShapeY); + out << SP << "\n//------ " << opName << " " << BinaryOperatorTrait::Name() << " --> " + << ConvertDimShapeToString(fDimShapeY) << "\n"; + auto length = ConvertDimShapeToLength(fDimShapeY); std::string typeName = TensorType::Name(); - // Broadcast A if it's uninitialized - // use broadcasting function where we pass an already allocated tensor to minimize memory allocations - if (fShapeA != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNA << "\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNA << ", " << ConvertShapeToString(fShapeA) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedA << ");\n"; - } - // Broadcast B if it's uninitialized - if (fShapeB != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNB << "\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNB << ", " << 
ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedB << ");\n"; - } - const std::string& nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - const std::string& nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; - out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = " << BinaryOperatorTrait::Op( "tensor_" + nameA + "[id]" , "tensor_" + nameB + "[id]") << " ;\n"; - out << SP << "}\n"; + + // we need to check if we can broadcast (case flag has bit 4 set) + + if (fBroadcastFlag & 4) { + // need to check if shapes are the same + auto lengthA = ConvertDimShapeToLength(fDimShapeA); + auto lengthB = ConvertDimShapeToLength(fDimShapeB); + out << SP << "if (" << lengthA << "!=" << lengthB << ") {\n"; + // check if A->B or B->A + // bool broadcastable = true; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + if (fBroadcastFlag & 5 && fDimShapeY[i] == fDimShapeA[i] && fDimShapeA[i].dim > 1 && + fDimShapeB[i].isParam) { + // B->A B[i] needs to be 1 + out << SP << SP << "if (" << fDimShapeB[i] << "!= 1)\n"; + out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast B->A in operator " + << opName << "\");\n"; + } + if (fBroadcastFlag & 6 && fDimShapeY[i] == fDimShapeB[i] && fDimShapeB[i].dim > 1 && + fDimShapeA[i].isParam) { + // A-> B A[i] needs to be 1 + out << SP << SP << "if (" << fDimShapeA[i] << "!= 1)\n"; + out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast A->B in operator " + << opName << "\");\n"; + } else if (fDimShapeA[i].isParam && fDimShapeB[i].isParam) { + // both shapes are parametric and we broadcast to maximum + // we allocate here output vector + out << SP << SP << "if (" << fDimShapeA[i] << " != " << fDimShapeB[i] << " && (" << fDimShapeA[i] + << " != 1 || " << fDimShapeB[i] << " != 1))\n"; + out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast shapes in 
operator " << opName + << "\");\n"; + } + } + out << SP << "}\n"; + } + + auto stridesA = UTILITY::ComputeStrideFromShape(fDimShapeA); + auto stridesB = UTILITY::ComputeStrideFromShape(fDimShapeB); + auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY); + + std::string compute_idx_A, compute_idx_B, compute_idx_Y; + if (fDimShapeA.empty() || + std::all_of(fDimShapeA.begin(), fDimShapeA.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_A = "0"; + } else { + for (size_t i = 0; i < fDimShapeA.size(); ++i) { + if (fDimShapeA[i].dim == 1 || fDimShapeA[i].GetVal() == "1") + continue; + compute_idx_A += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeA.size())); + if (stridesA[i].GetVal() != "1") + compute_idx_A += " * " + stridesA[i].GetVal(); + compute_idx_A += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_A.pop_back(); + } + if (fDimShapeB.empty() || + std::all_of(fDimShapeB.begin(), fDimShapeB.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_B = "0"; + } else { + for (size_t i = 0; i < fDimShapeB.size(); ++i) { + if (fDimShapeB[i].dim == 1 || fDimShapeB[i].GetVal() == "1") + continue; + compute_idx_B += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeB.size())); + if (stridesB[i].GetVal() != "1") + compute_idx_B += " * " + stridesB[i].GetVal(); + compute_idx_B += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_B.pop_back(); + } + int nloop = 0; + if (fDimShapeY.empty() || + std::all_of(fDimShapeY.begin(), fDimShapeY.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_Y = "0"; + } else { + for (size_t i = 0; i < fDimShapeY.size(); ++i) { + if (fDimShapeY[i].dim != 1 && fDimShapeY[i].GetVal() != "1") { + nloop++; + for (int j = 0; j < nloop; j++) out << SP; + out << "for (size_t idx_" << i << " = 0; idx_" << i << " < " << fDimShapeY[i] + << "; ++idx_" << i << "){\n"; + 
compute_idx_Y += "idx_" + std::to_string(i); + if (stridesY[i].GetVal() != "1") + compute_idx_Y += " * " + stridesY[i].GetVal(); + compute_idx_Y += " + "; + } + } + // remove last 3 characters " + " + for (int j = 0; j < 3; j++) + compute_idx_Y.pop_back(); + } + for (int j = 0; j < nloop + 1; j++) out << SP; + out << "tensor_" << fNY << "[" << compute_idx_Y << "] = " + << BinaryOperatorTrait::Op("tensor_" + fNA + "[" + compute_idx_A + "]", + "tensor_" + fNB + "[" + compute_idx_B + "]") + << " ;\n"; + + for (int i = nloop; i > 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; + } return out.str(); } - std::vector GetStdLibs() override { + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) { + if (fIsOutputConstant) + return ""; + + std::string op; + op = "\n//------ "+opName+"_"+BinaryOperatorTrait::Name()+"_KERNEL_ALPAKA\n"; + op += SP + "struct Binary"+BinaryOperatorTrait::Name()+"Kernel {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * A, T const * B, T * C,\n"; + for( size_t i=0; i("; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + op += "size_" + std::to_string(i); + } + op.pop_back(); + op += "));\n"; + op += SP + SP + SP + SP + "for (auto const& elem : elements) {\n"; + + auto stridesA = UTILITY::ComputeStrideFromShape(fDimShapeA); + auto stridesB = UTILITY::ComputeStrideFromShape(fDimShapeB); + auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY); + + std::string compute_idx_A, compute_idx_B, compute_idx_Y; + if (fDimShapeA.empty() || + std::all_of(fDimShapeA.begin(), fDimShapeA.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_A = "0"; + } else { + for (size_t i = 0; i < fDimShapeA.size(); ++i) { + if (fDimShapeA[i].dim == 1 || fDimShapeA[i].GetVal() == "1") + continue; + compute_idx_A += "elem[" + std::to_string(i + (fDimShapeY.size() - fDimShapeA.size())) + "]"; + if (stridesA[i].GetVal() != "1") + compute_idx_A += " * " + 
stridesA[i].GetVal(); + compute_idx_A += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_A.pop_back(); + } + if (fDimShapeB.empty() || + std::all_of(fDimShapeB.begin(), fDimShapeB.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_B = "0"; + } else { + for (size_t i = 0; i < fDimShapeB.size(); ++i) { + if (fDimShapeB[i].dim == 1 || fDimShapeB[i].GetVal() == "1") + continue; + compute_idx_B += "elem[" + std::to_string(i + (fDimShapeY.size() - fDimShapeB.size())) + "]"; + if (stridesB[i].GetVal() != "1") + compute_idx_B += " * " + stridesB[i].GetVal(); + compute_idx_B += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_B.pop_back(); + } + int nloop = 0; + if (fDimShapeY.empty() || + std::all_of(fDimShapeY.begin(), fDimShapeY.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_Y = "0"; + } else { + for (size_t i = 0; i < fDimShapeY.size(); ++i) { + if (fDimShapeY[i].dim != 1 && fDimShapeY[i].GetVal() != "1") { + nloop++; + for (int j = 0; j < nloop; j++) op += SP; + compute_idx_Y += "elem[" + std::to_string(i) + "]"; + if (stridesY[i].GetVal() != "1") + compute_idx_Y += " * " + stridesY[i].GetVal(); + compute_idx_Y += " + "; + } + } + // remove last 3 characters " + " + for (int j = 0; j < 3; j++) + compute_idx_Y.pop_back(); + } + for (int j = 0; j < nloop + 1; j++) op += SP; + op += "C[" + compute_idx_Y + "] = " + + BinaryOperatorTrait::Op("A[" + compute_idx_A + "]", + "B[" + compute_idx_B + "]") + + " ;\n"; + for (int i = nloop; i > 0; i--) { + for (int j = 0; j < i; j++) op += SP; + op += "}\n"; + } + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) { + if (fIsOutputConstant) + return ""; + + return SP + "Binary"+BinaryOperatorTrait::Name()+"Kernel " + OpName + "Kernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) { + if (fIsOutputConstant) + return ""; + + if 
(fDimShapeY.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Basic Binary called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertDimShapeToLength(fDimShapeY); + out << "\n//------ "+OpName+"_ALPAKA\n"; + out << SP << "alpaka::WorkDivMembers workDiv_"<::all("<<(stoi(length)+256-1)/256<<"), alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNY << ", " << OpName << "Kernel, alpaka::getPtrNative(deviceBuf_" << fNA << "), alpaka::getPtrNative(deviceBuf_"< GetStdLibs() override + { if (Op == EBasicBinaryOperator::Pow) { - return { std::string("cmath") }; + return {std::string("cmath")}; } else { return {}; } } -}; -}//SOFIE + +}; +} // namespace SOFIE -#endif //SOFIE_ROperator_BasicBinary +#endif // SOFIE_ROperator_BasicBinary diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx index c18c17e..b98ded5 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx @@ -107,6 +107,33 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA() { + std::string op; + op = "\n//------ " + UnaryOpTraits::Name() + "_KERNEL_ALPAKA\n"; + op += SP + "struct Unary" + UnaryOpTraits::Name() + "Kernel{\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T* data, std::size_t numElements) const {\n"; + op += SP + SP + SP + "for (auto i : alpaka::uniformElements(acc, numElements)) {\n"; + op += SP + SP + SP + "data[i] = " << UnaryOpTraits::Op("data[i]") << ";\n"; + op += SP + SP + "}\n"; + op += SP + "}\n};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "Unary" + UnaryOpTraits::Name() + "Kernel " + UnaryOpTraits::Name() + "Kernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + 
OpName; + std::stringstream out; + auto length = ConvertShapeToLength(fShapeX); + out << "\n//------ "+OpName+"_ALPAKA\n"; + out << SP << "alpaka::WorkDivMembers workDiv_"<::all("<<(length+255)/256<<"), alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX << ", " << UnaryOpTraits::Name() << "Kernel, alpaka::getPtrNative(deviceBuf_" << fNX << "), static_cast(" << length << ")); \n"; + return out.str(); + } + std::vector GetStdLibs() override { if (Op == EBasicUnaryOperator::kSqrt || Op == EBasicUnaryOperator::kExp || Op == EBasicUnaryOperator::kLog) { return { std::string("cmath") }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx index a27cea4..1a6098d 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx @@ -1,9 +1,9 @@ #ifndef SOFIE_ROPERATOR_BatchNormalization #define SOFIE_ROPERATOR_BatchNormalization -#include "SOFIE_common.hxx" -#include "ROperator.hxx" -#include "RModel.hxx" +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" #include diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx index 47c3d66..c813f7c 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx @@ -90,6 +90,36 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA() { + std::string op; + op = "\n//------ CAST_KERNEL_ALPAKA\n"; + op += SP + "struct CastKernel{\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, SrcT const * src, DstT * dst, std::size_t numElements) const {\n"; + op += SP + SP + SP + "for (auto i : alpaka::uniformElements(acc, numElements)) {\n"; + op += SP + SP + SP + "dst[i] = static_cast(src[i]);\n"; + op += SP + SP + "}\n"; + op += SP + "}\n};\n"; + 
return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) { + return SP + "CastKernel castKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Cast called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertShapeToLength(fShape); + out << "\n//------ CAST_GPU_ALPAKA\n"; + out << SP << "alpaka::WorkDivMembers workDiv_"<::all("<< (length+255)/256 <<"), alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX << ", castKernel, alpaka::getPtrNative(deviceBuf_" << fNX << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << ")); \n"; + return out.str(); + } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx index 0d5e574..c828668 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx @@ -1,5 +1,5 @@ #ifndef SOFIE_ROPERATOR_Concat - #define SOFIE_ROPERATOR_Concat +#define SOFIE_ROPERATOR_Concat #include "SOFIE/SOFIE_common.hxx" @@ -25,6 +25,7 @@ std::vector> fInputShapes; public: + ROperator_Concat(){} ROperator_Concat(std::vector inputs, int axis, int newAxis, std::string output): fAxis(axis), fnewAxis(newAxis), fOutput(UTILITY::Clean_name(output)) { @@ -53,6 +54,7 @@ throw std::runtime_error("TMVA SOFIE Concat Op - invalid axis value "); int concat_dim=0; + // case of Concat (fNewAxis = 0) and not ConcatFromSequence if(fnewAxis == 0){ for (size_t i = 0; i < inputs.size(); i++) { if (i > 0 && inputs[i].size() != inputs[i - 1].size()) @@ -73,6 +75,7 @@ ret[0][fAxis] = concat_dim; } std::vector stack; + // case ConCatFromSequence if(fnewAxis == 1){ for(size_t i = 0; i < inputs.size(); i++) { if (i > 0 && inputs[i].size() != inputs[i-1].size() ) @@ -96,8 +99,8 @@ } // get 
shape of output given inputs. It is going to be called after initialized - std::vector> ShapeInference(const std::vector> & inputs) { - std::vector> ret(1); + std::vector ShapeInference(const std::vector> & inputs, const RModel & model) { + std::vector ret(inputs[0].size()); // treat negative axis case if (fAxis<0) { fAxis = inputs[0].size()+fAxis; @@ -105,31 +108,54 @@ if (fAxis < 0 || fAxis >= (int) inputs[0].size()) throw std::runtime_error("TMVA SOFIE Concat Op - invalid axis value "); - int concat_dim=0; + Dim concat_dim; if(fnewAxis == 0){ for (size_t i = 0; i < inputs.size(); i++) { if (i > 0 && inputs[i].size() != inputs[i - 1].size()) throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have different shapes " + fInputs[i] + " : " + - ConvertDynamicShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertDynamicShapeToString(inputs[i - 1])); + ConvertShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertShapeToString(inputs[i - 1])); for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) { if ((int)iaxis == fAxis) { - // support only non-params shape for the concatenation axis - if (inputs[i][iaxis].isParam) - throw std::runtime_error("TMVA SOFIE Concat Op - not supporting input param dimensions for concatenation axis. 
Input shape is " + - ConvertDynamicShapeToString(inputs[i])); - concat_dim += inputs[i][iaxis].dim; + // support both integer and params shape for the concatenation axis + if (concat_dim.param.empty() && concat_dim.dim == 0) + concat_dim = inputs[i][iaxis]; + else if (inputs[i][iaxis].isParam || concat_dim.isParam) { + concat_dim = + Dim{ concat_dim.GetVal() + std::string("+ ") + inputs[i][iaxis].GetVal(), + static_cast(-1)}; + } else { + concat_dim = Dim { concat_dim.dim + inputs[i][iaxis].dim }; + } + } + else if (i == 0) { + ret[iaxis] = inputs[i][iaxis]; } - // other dimensions must be the same - else if (i > 0 && inputs[i][iaxis].GetVal() != inputs[i - 1][iaxis].GetVal()) + else if ((!inputs[i][iaxis].isParam && !ret[iaxis].isParam) && (inputs[i][iaxis].dim != ret[iaxis].dim)) { throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have wrong shapes " + - ConvertDynamicShapeToString(inputs[i]) + " and " + - ConvertDynamicShapeToString(inputs[i - 1])); + ConvertShapeToString(inputs[i]) + " and " + + ConvertShapeToString(inputs[i - 1])); + } + else if (!inputs[i][iaxis].isParam && ret[iaxis].isParam){ + // if shape is not parametric use it + ret[iaxis] = inputs[i][iaxis]; + } + else if (inputs[i][iaxis].isParam && ret[iaxis].isParam) { + // check which parameter is first in RModel list + auto & dimNames = model.GetDimShapeNames(); + auto p1 = std::find(dimNames.begin(), dimNames.end(), inputs[i][iaxis].param); + auto p2 = std::find(dimNames.begin(), dimNames.end(), ret[iaxis].param); + if (p1 < p2) ret[iaxis] = inputs[i][iaxis]; + } + } + // add parenthesis in case is an expression + if (concat_dim.isParam && concat_dim.dim == static_cast(-1)) + concat_dim = Dim{ std::string("(") + concat_dim.GetVal() + std::string(")"), concat_dim.dim }; } - // output shape - ret[0] = inputs[0]; - ret[0][fAxis].dim = concat_dim; + // output shape for concatenated axis + ret[fAxis] = Dim{concat_dim}; + } // case of stacking (not supported yet) // here we need to check 
that input shapes are the same @@ -141,24 +167,30 @@ return ret; } - void Initialize(RModel& model) override { + void Initialize(RModel& model) override { for (auto &it : fInputs) { if (model.CheckIfTensorAlreadyExist(it) == false) { throw std::runtime_error("TMVA SOFIE Concat Op Input Tensor " + it + " is not found in model"); } - fInputShapes.push_back(model.GetDynamicTensorShape(it)); + fInputShapes.push_back(model.GetDimTensorShape(it)); } - fOutputShape = ShapeInference(fInputShapes)[0]; + fOutputShape = ShapeInference(fInputShapes, model); if (model.Verbose()) - std::cout << "Output of concat operator has shape " << ConvertDynamicShapeToString(fOutputShape) << std::endl; + std::cout << "Output of concat operator has shape " << ConvertDimShapeToString(fOutputShape) << std::endl; // check if concat has constant inputs , axis 0(concat contigous memory and type is integer) + bool isOutputShape = false; if (model.GetTensorType(fInputs[0]) == ETensorType::INT64 && fAxis == 0) { fIsOutputConstant = true; + isOutputShape = true; + for ( auto & input : fInputs) { if (!model.IsInitializedTensor(input)) { fIsOutputConstant = false; - break; + if (!model.IsShapeTensor(input)) { + isOutputShape = false; + break; + } } } if (fIsOutputConstant) { @@ -177,26 +209,53 @@ model.AddConstantTensor(fOutput, outputShape, outputData.data()); if (model.Verbose()) { std::cout << "output of Concat is a constant tensor " << ConvertShapeToString(outputShape) << " : " - << ConvertValuesToString(outputData) << std::endl; + << ConvertValuesToString(outputData) << " (constant)" << std::endl; } + } else if (isOutputShape) { + auto outputShape = ConvertShapeToInt(fOutputShape); // conversion must be possible + std::vector outputData(ConvertShapeToLength(outputShape)); + size_t offset = 0; + for ( auto & input : fInputs) { + std::vector inputData; + auto inputShape = model.GetTensorShape(input); // shape is not dynamic + size_t inputLength = ConvertShapeToLength(inputShape); // shape can be a 
scalar + if (model.IsShapeTensor(input)) + inputData = model.GetShapeTensorValues(input); + else if (model.IsConstantTensor(input)) { + inputData.resize(inputLength); + auto intData = static_cast(model.GetInitializedTensorData(input).get()); + for (size_t i = 0; i < inputData.size(); i++) + inputData[i] = Dim{ static_cast(intData[i])}; + } + std::cout << "concatenating input data " << inputLength << " " << inputData[0] << std::endl; + std::copy(inputData.begin(), inputData.end(), outputData.begin() + offset ); + offset += inputLength; + } + // add output tensor + model.AddShapeTensor(fOutput,outputData, false); // cannot be a scalar + if (model.Verbose()) { + std::cout << "output of Concat is a shape tensor " << ConvertShapeToString(outputShape) << " : " + << ConvertShapeToString(outputData) << " (shape)" << std::endl; + } + fIsOutputConstant = true; } } if (!fIsOutputConstant) { model.AddIntermediateTensor(fOutput, model.GetTensorType(fInputs[0]), fOutputShape); if (model.Verbose()) { - std::cout << "Concat ---> " << fOutput << " " << ConvertDynamicShapeToString(fOutputShape) << std::endl; + std::cout << "Concat ---> " << fOutput << " " << ConvertDimShapeToString(fOutputShape) << std::endl; } } } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override { if (fIsOutputConstant) return ""; - OpName = "op_"+OpName; + opName = "op_" + opName; if(fOutputShape.empty()){ throw std::runtime_error("TMVA SOFIE Concat called to Generate without being initialized first"); } std::stringstream out; - out<<"\n//--------- Concat\n"; + out<<"\n//--------- Concat " << opName << " --> " << ConvertShapeToString(fOutputShape) << "\n"; // special case when memory is contiguous bool hasShapeOnes = true; for(int i = 0; i 0) out << offset; offset += " + " + length; @@ -260,4 +319,5 @@ }; }//SOFIE - #endif //SOFIE_ROPERATOR_CONCAT + + #endif //SOFIE_ROPERATOR_CONCAT \ No newline at end of file diff --git 
a/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx index 0d08432..6590909 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx @@ -101,6 +101,11 @@ public: // no code to generate here. Tensor are defined in Session constructor return "//---------------------------------------\n"; } + + std::string Generate_GPU_ALPAKA(std::string /* OpName */) override { + // no code to generate here. Tensor are defined in Session constructor + return "//---------------------------------------\n"; + } }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx index 0467385..b9d917b 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx @@ -1,9 +1,9 @@ #ifndef SOFIE_SOFIE_ROPERATOR_CONVTRANSPOSE_HXX #define SOFIE_SOFIE_ROPERATOR_CONVTRANSPOSE_HXX -#include -#include -#include +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" #include #include diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx index c834a06..bf163b7 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx @@ -122,6 +122,60 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA() { + std::string op; + op = "\n//------ Expand_KERNEL_ALPAKA\n"; + op += SP + "struct ExpandKernel {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * input, T * output, const size_t * input_shape, const size_t * output_shape, const size_t * input_strides, const size_t * output_strides, const size_t ndim){\n"; + op += SP + SP + SP + SP + "size_t input_idx = 0;\n"; + op += SP + SP + SP + SP + "size_t output_idx = 0;\n"; + op += SP + SP + SP + SP + "size_t 
coord_out;\n"; + op += SP + SP + SP + SP + "size_t coord_in;\n"; + op += SP + SP + SP + SP + "auto elements = alpaka::uniformElementsND(acc, alpaka::Vec(output_shape));\n"; + op += SP + SP + SP + SP + "for (auto const& elem : elements) {\n"; + op += SP + SP + SP + SP + "input_idx = 0;\n"; + op += SP + SP + SP + SP + "output_idx = 0;\n"; + op += SP + SP + SP + SP + "for (int i = 0; i < ndim; ++i) {\n"; + op += SP + SP + SP + SP + SP + "coord_out = elem[i];\n"; + op += SP + SP + SP + SP + SP + "coord_in = (input_shape[i] == 1) ? 0 : coord_out;\n"; + op += SP + SP + SP + SP + SP + "input_idx += coord_in * input_strides[i];\n}\n"; + op += SP + SP + SP + SP + SP + "output_idx += coord_out * output_strides[i];\n}\n"; + op += SP + SP + SP + SP + SP + "output[output_idx] = input[input_idx];\n"; + op += SP + SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "ExpandKernel expandKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Expand called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertShapeToLength(fShape); + out << "\n//------ EXPAND_GPU_ALPAKA\n"; + out << SP << "alpaka::WorkDivMembers workDiv_" << fNX + << "(alpaka::Vec::all((" << length << " + 256 - 1) / 256), " + << "alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + + out << SP << "alpaka::exec(queue, workDiv_" << fNX + << ", expandKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY + << "), "<< ConvertShapeToString(fShapeX) <<", "< #include - namespace SOFIE { /*! 
\brief Gated Recurrent Unit operator @@ -91,7 +90,7 @@ template class ROperator_GRU final : public ROperator { fNSequence_lens(UTILITY::Clean_name(nameSequence_lens)), fNInitial_h(UTILITY::Clean_name(nameInitial_h)), fNY(UTILITY::Clean_name(nameY)), fNY_h(UTILITY::Clean_name(nameY_h)) { - + fInputTensorNames = { fNX, fNW, fNR }; if (!fNB.empty()){ fInputTensorNames.emplace_back(fNB); @@ -123,39 +122,34 @@ template class ROperator_GRU final : public ROperator { * * \param input type of the input tensors */ - std::vector TypeInference(std::vector /*input*/); + std::vector TypeInference(std::vector /*input*/) override; /*! \brief Infers the shape of the output tensors * * \param input shape of the input tensors */ - std::vector> ShapeInference(std::vector> /*input*/); + std::vector> ShapeInference(std::vector> /*input*/) override; /*! \brief Initialize the model * * \param model Model */ - void Initialize(RModel &); + void Initialize(RModel &) override; /*! \brief Generate the inference code * * \param OpName name of the operator */ - std::string Generate(std::string /*OpName*/); - - /*! \brief Generate the code for the Session internal data vectors - * - * \param opName name of the operator - */ - std::string GenerateSessionMembersCode(std::string opName); + std::string Generate(std::string /*OpName*/) override; /*! 
\brief Returns the blas routines needed to compile the generated code */ - std::vector GetBlasRoutines() { return { std::string("Gemm"), std::string("Axpy") }; } + std::vector GetBlasRoutines() override { return { std::string("Gemm"), std::string("Axpy") }; } }; } // namespace SOFIE + // Implementation of the ROperator_GRU class #include "SOFIE/ROperator_GRU.icc" diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc b/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc index f3813c2..38030d1 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc +++ b/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc @@ -175,51 +175,45 @@ void ROperator_GRU::Initialize(RModel& model){ fAttrActivations = {"Sigmoid", "Tanh"}; } } -} -// generate code for Session data members (e.g. internal vectors) -template -std::string ROperator_GRU::GenerateSessionMembersCode(std::string opName) -{ - opName = "op_" + opName; - std::stringstream out; + // To get unique intermediate tensor names, we add the name of the input + // tensor. One might also consider using the index of the operator in the + // RMode, but this information is not available in the current scope. + std::string opName = "op_gru_" + fNX; size_t num_directions = fShapeW[0]; size_t seq_length = (fAttrLayout == 0) ? fShapeX[0] : fShapeX[1]; size_t batch_size = (fAttrLayout == 0) ? 
fShapeX[1] : fShapeX[0]; size_t input_size = fShapeX[2]; + auto declareVector = [&](std::string const &name, std::size_t n){ + std::string fullName = opName + "_" + name; + model.AddIntermediateTensor(fullName, ConvertStringToType(fType), std::vector{n}); + }; + if (fAttrLayout != 0) { - out << "std::vector<" << fType << "> fVec_" << opName << "_input = std::vector<" << fType << ">(" - << seq_length * batch_size * input_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_hidden_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_cell_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; + declareVector("input", seq_length * batch_size * input_size); + declareVector("initial_hidden_state", num_directions * batch_size * fAttrHiddenSize); + declareVector("initial_cell_state", num_directions * batch_size * fAttrHiddenSize); } // Set the feedforward size_t ff_size = seq_length * batch_size * fAttrHiddenSize; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_update_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_reset_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_hidden_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; + declareVector("f_update_gate", ff_size); + declareVector("f_reset_gate", ff_size); + declareVector("f_hidden_gate", ff_size); // gate results size_t hs_size = seq_length * num_directions * batch_size * fAttrHiddenSize; - out << "std::vector<" << fType << "> fVec_" << opName << "_update_gate = std::vector<" << fType << ">(" << hs_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_reset_gate = std::vector<" << fType << ">(" << hs_size << ");\n"; - out << 
"std::vector<" << fType << "> fVec_" << opName << "_hidden_gate = std::vector<" << fType << ">(" << hs_size << ");\n"; + declareVector("update_gate", hs_size); + declareVector("reset_gate", hs_size); + declareVector("hidden_gate", hs_size); // feedback - out << "std::vector<" << fType << "> fVec_" << opName << "_feedback = std::vector<" << fType << ">(" - << batch_size * fAttrHiddenSize << ");\n"; + declareVector("feedback", batch_size * fAttrHiddenSize); // hiddden state if (fAttrLayout != 0 || fNY.empty()) { - out << "std::vector<" << fType << "> fVec_" << opName << "_hidden_state = std::vector<" << fType << ">(" << hs_size << ");\n"; + declareVector("hidden_state", hs_size); } - - out << "\n"; - - return out.str(); } @@ -234,12 +228,14 @@ auto ROperator_GRU::Generate(std::string OpName) size_t input_size = fShapeX[2]; size_t num_directions = fShapeW[0]; + auto getVec = [&](std::string const &name) { return "tensor_op_gru_" + fNX + "_" + name; }; + // set the input if (fAttrLayout == 0) { - out << SP << fType << " *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << fType << " const* " << OpName << "_input = tensor_" << fNX << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_input = fVec_" << OpName << "_input.data();\n"; + out << SP << fType << " * " << OpName << "_input = " << getVec("input") << ";\n"; } else { out << SP << fType << " " << OpName << "_input[" << seq_length * batch_size * input_size << "];\n"; } @@ -261,8 +257,7 @@ auto ROperator_GRU::Generate(std::string OpName) << fNInitial_h << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_initial_hidden_state = fVec_" << OpName - << "_initial_hidden_state.data();\n"; + out << SP << fType << " * " << OpName << "_initial_hidden_state = " << getVec("initial_hidden_state") << ";\n"; } else { out << SP << fType << " " << OpName << "_initial_hidden_state[" << num_directions * batch_size * fAttrHiddenSize << "];\n"; @@ -283,9 +278,9 @@ 
auto ROperator_GRU::Generate(std::string OpName) // Set the feedforward size_t feedforward_size = seq_length * batch_size * fAttrHiddenSize; if (fUseSession) { - out << SP << fType << " * " << OpName << "_f_update_gate = fVec_" << OpName << "_f_update_gate.data();\n"; - out << SP << fType << " * " << OpName << "_f_reset_gate = fVec_" << OpName << "_f_reset_gate.data();\n"; - out << SP << fType << " * " << OpName << "_f_hidden_gate = fVec_" << OpName << "_f_hidden_gate.data();\n"; + out << SP << fType << " * " << OpName << "_f_update_gate = " << getVec("f_update_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_f_reset_gate = " << getVec("f_reset_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_f_hidden_gate = " << getVec("f_hidden_gate") << ";\n"; } else { out << SP << fType << " " << OpName << "_f_update_gate[" << feedforward_size << "] = {0};\n"; out << SP << fType << " " << OpName << "_f_reset_gate[" << feedforward_size << "] = {0};\n"; @@ -294,9 +289,9 @@ auto ROperator_GRU::Generate(std::string OpName) // Set the gates size_t hidden_state_size = seq_length * num_directions * batch_size * fAttrHiddenSize; if (fUseSession) { - out << SP << fType << " * " << OpName << "_update_gate = fVec_" << OpName << "_update_gate.data();\n"; - out << SP << fType << " * " << OpName << "_reset_gate = fVec_" << OpName << "_reset_gate.data();\n"; - out << SP << fType << " * " << OpName << "_hidden_gate = fVec_" << OpName << "_hidden_gate.data();\n"; + out << SP << fType << " * " << OpName << "_update_gate = " << getVec("update_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_reset_gate = " << getVec("reset_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_hidden_gate = " << getVec("hidden_gate") << ";\n"; } else { out << SP << fType << " " << OpName << "_update_gate[" << hidden_state_size << "] = {0};\n"; out << SP << fType << " " << OpName << "_reset_gate[" << hidden_state_size << "] = {0};\n"; @@ -307,14 +302,14 @@ auto 
ROperator_GRU::Generate(std::string OpName) out << SP << fType << " *" << OpName << "_hidden_state = tensor_" << fNY << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_hidden_state = fVec_" << OpName << "_hidden_state.data();\n"; + out << SP << fType << " * " << OpName << "_hidden_state = " << getVec("hidden_state") << ";\n"; } else { out << SP << fType << " " << OpName << "_hidden_state[" << hidden_state_size << "] = {0};\n"; } } if (fUseSession) { - out << SP << fType << " * " << OpName << "_feedback = fVec_" << OpName << "_feedback.data();\n"; + out << SP << fType << " * " << OpName << "_feedback = " << getVec("feedback") << ";\n"; } else { out << SP << fType << " " << OpName << "_feedback[" << batch_size * fAttrHiddenSize << "] = {0};\n"; } diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx index 4d34846..4f685d7 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx @@ -212,6 +212,73 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA() { + std::string op; + op = "\n//------ GATHER_KERNEL_ALPAKA\n"; + op += SP + "struct GatherKernel {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * input, T const * indices, T * output, std::size_t const * output_shape, std::size_t const axis, std::size_t const axisDim, std::size_t const indicesNumElements, std::size_t const * output_strides, std::size_t const * input_strides, std::size_t const ndim) const {\n"; + op += SP + SP + SP + SP + "auto elements = alpaka::uniformElementsND(acc, alpaka::Vec(output_shape));\n"; + op += SP + SP + SP + SP + "for (auto const& elem : elements) {\n"; + + // find flattened index for indices tensor + op += SP + SP + SP + SP + "int64_t idxLinear = 0;\n{\n"; + op += SP + SP + SP + SP + SP + "int64_t stride = 1;\n"; + op += SP + SP + SP + SP + SP + "for (int i = ndim - 1; 
i >= axis; --i) {;\n"; + op += SP + SP + SP + SP + SP + "stride *= (i > axis ? output_shape[i] : 1);\n}\n"; + op += SP + SP + SP + SP + SP + "idxLinear = elem[axis];\n"; + op += SP + SP + SP + SP + SP + "if (idxLinear >= indicesNumElements) idxLinear %= indicesNumElements;\n}\n"; + + // load gather index and wrap negative if any + op += SP + SP + SP + SP + "int64_t k = indices[idxLinear];\n"; + op += SP + SP + SP + SP + "if (k < 0) k += axisDim;\n"; + op += SP + SP + SP + SP + "if (k < 0) k = 0;\n"; + op += SP + SP + SP + SP + "if (k >= axisDim) k = axisDim - 1;\n"; + + // compute input flattened index + op += SP + SP + SP + SP + "size_t input_idx = 0;\n"; + op += SP + SP + SP + SP + "size_t output_idx = 0;\n"; + op += SP + SP + SP + SP + "for (int i = 0; i < ndim; ++i) {\n"; + op += SP + SP + SP + SP + SP + "size_t coord = elem[i];\n"; + op += SP + SP + SP + SP + SP + "output_idx += coord * output_strides[i];\n}\n"; + op += SP + SP + SP + SP + SP + "if (i == axis) coord = k;\n"; + op += SP + SP + SP + SP + SP + "input_idx += coord * input_strides[i];\n}\n"; + + // write to output tensor + op += SP + SP + SP + SP + "output[output_idx] = input[input_idx];\n"; + op += SP + SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "GatherKernel gatherKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + std::stringstream out; + auto length = ConvertShapeToLength(fShapeY); + out << "\n//------ "< workDiv_" << fNY + << "(alpaka::Vec::all((" << length << " + 256 - 1) / 256), " + << "alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + + out << SP << "alpaka::exec(queue, workDiv_" << fNY + << ", gatherKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNIndices + << "), alpaka::getPtrNative(deviceBuf_" << fNY + << "), "<< 
ConvertShapeToString(fShapeY) <<", "<< fAttrAxis <<", "<< fShapeX[fAttrAxis] <<", " + << fShapeIndices.size() <<", " + << ConvertShapeToString(UTILITY::ComputeStrideFromShape(fShapeY)) <<", " + << ConvertShapeToString(UTILITY::ComputeStrideFromShape(fShapeX)) <<", "<< fShapeY.size() + << ",static_cast(" << length << "));\n"; + + return out.str(); + } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx index 046bf56..1c43724 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx @@ -48,6 +48,8 @@ namespace SOFIE{ fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)) { + + fKind = OperatorKind::GEMM; fActivation = activation; fType = "float"; static_assert(std::is_same_v, @@ -60,9 +62,11 @@ namespace SOFIE{ fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNC(UTILITY::Clean_name(nameC)), fNY(UTILITY::Clean_name(nameY)), fActivation(activation) { + fKind = OperatorKind::GEMM; fActivation = activation; fType = "float"; + fInputTensorNames = { fNA, fNB, fNC }; fOutputTensorNames = { fNY }; } @@ -148,7 +152,7 @@ namespace SOFIE{ } } if (model.IsDynamicTensor(fNA) || model.IsDimInputTensor(fNA) ) { - fShapeA = model.GetDynamicTensorShape(fNA); + fShapeA = model.GetDimTensorShape(fNA); fIsDynamic = true; } else { auto shapeA_int = model.GetTensorShape(fNA); @@ -162,7 +166,7 @@ namespace SOFIE{ } if (model.IsDynamicTensor(fNB) || model.IsDimInputTensor(fNB)) { - fShapeB = model.GetDynamicTensorShape(fNB); + fShapeB = model.GetDimTensorShape(fNB); fIsDynamic = true; } else { @@ -191,7 +195,7 @@ namespace SOFIE{ if (!fIsDynamic) { shapeY = ConvertShapeToInt(fShapeY); if (shapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Gemm Op " + fNY + " 
has invalid shape" + ConvertDynamicShapeToString(fShapeY)); + throw std::runtime_error("TMVA SOFIE Gemm Op " + fNY + " has invalid shape" + ConvertDimShapeToString(fShapeY)); } } @@ -252,15 +256,16 @@ namespace SOFIE{ shapeY.erase(shapeY.end()-1); } - if (!fIsDynamic) + if (!fIsDynamic){ model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), shapeY); + } else model.AddDynamicTensor(fNY, model.GetTensorType(fNA), fShapeY); if (model.Verbose()){ std::cout << "Gemm (or MatMul) " << " ---> " << fNY << " shape "; if (fIsDynamic) - std::cout << ConvertDynamicShapeToString(fShapeY) << std::endl; + std::cout << ConvertDimShapeToString(fShapeY) << std::endl; else std::cout << ConvertShapeToString(shapeY) << std::endl; } @@ -279,9 +284,9 @@ namespace SOFIE{ // include a separate scope to avoid defining unique operator temp variables out << "//--- broadcast bias tensor " << fNC << "for Gemm op\n"; out << SP << "{\n"; - out << " float * data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" - << fNC << "," << ConvertShapeToString(fShapeC) << ", " << ConvertDynamicShapeToString(fShapeY) << ");\n"; - auto length = SOFIE::ConvertDynamicShapeToLength(fShapeY); // output size + out << " float * data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" + << fNC << "," << ConvertShapeToString(fShapeC) << ", " << ConvertDimShapeToString(fShapeY) << ");\n"; + auto length = SOFIE::ConvertDimShapeToLength(fShapeY); // output size out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNC2 << ");\n"; out << SP << SP << "delete [] data;\n"; out << SP << "}\n"; @@ -289,6 +294,29 @@ namespace SOFIE{ return out.str(); } + std::string GenerateInitCode_GPU_ALPAKA() override { + std::stringstream out; + // generate initialization code for broadcasting of bias tensor + if (fShapeC.size() != fShapeY.size() && fNC != fNC2) { + // we broadcast here always C in Y output, so target shape is the one of Y + // no need to call UTILITY::UnidirectionalBroadcastShape. 
+ // here in case of parametric shape we need to assume that the parameters will be defined in the initialization code. + auto targetShape = fShapeY; + // include a separate scope to avoid defining unique operator temp variables + out << "//--- broadcast bias tensor " << fNC << "for Gemm op\n"; + out << SP << "{\n"; + out << " float * data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" + << fNC << ".data()," << ConvertShapeToString(fShapeC) << ", " << ConvertDimShapeToString(fShapeY) << ");\n"; + auto length = SOFIE::ConvertDimShapeToLength(fShapeY); // output size + out << SP << SP << "auto hostBuf_"<< fNC2 << " = alpaka::allocBuf(hostAcc, Ext1D::all(Idx{" << length << "}));\n"; + out << SP << SP << "std::memcpy(alpaka::getPtrNative(hostBuf_"<< fNC2 <<"), data, "<< length << " * sizeof(float));\n"; + out << SP << SP << "alpaka::memcpy(queue, deviceBuf_"<< fNC2 << ", hostBuf_"<< fNC2 << ");\n"; + out << SP << SP << "delete [] data;\n"; + out << SP << "}\n"; + } + return out.str(); + } + std::string Generate(std::string opName) override { opName = "op_" + opName; @@ -389,8 +417,105 @@ namespace SOFIE{ return out.str(); } + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + + if (fShapeA.empty() || fShapeB.empty() || fShapeY.empty() || (fNC != "" && fShapeC.empty())) { + throw std::runtime_error("TMVA SOFIE Gemm Op called to Generate without being initialized first"); + } + std::stringstream out; + out << "\n//--------- Gemm_GPU_ALPAKA\n"; + out << SP << "char " << opName << "_transA = " << (fAttrTransA ? "\'t\'" : "\'n\'") << ";\n"; + out << SP << "char " << opName << "_transB = " << (fAttrTransB ? 
"\'t\'" : "\'n\'") << ";\n"; + // need to consider case A and B have dim > 2 (for MatMul) + int64_t dimA = fShapeA.size(); + int64_t dimB = fShapeB.size(); + int64_t dimY = fShapeY.size(); + if (dimA != dimB || dimA != dimY) { + throw std::runtime_error("TMVA SOFIE Gemm(MatMul) has invalid shape for inputs or output"); + } + auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); + auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); + auto k = (fAttrTransA ? fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + std::vector sY = {fShapeY[dimY-2], fShapeY[dimY-1]}; + // extra dimensions in case of stacked MatMul + std::vector sA; + for (int64_t i = 0; i < dimY-2; i++) { + sA.push_back(fShapeY[i]); + } + auto lengthGemm = ConvertDynamicShapeToLength(sY); // size of the Gemm operation + auto lengthExtra = ConvertDynamicShapeToLength(sA); // extra length in case input tensors are of dim>2 (MatMul) + + out << SP << "int " << opName << "_m = " << m << ";\n"; + out << SP << "int " << opName << "_n = " << n << ";\n"; + out << SP << "int " << opName << "_k = " << k << ";\n"; + out << SP << "float " << opName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ";\n"; + + // restricting to a 0 beta since BIAS is configured separately through sofieBLAS interface + out << SP << "float " << opName << "_beta = 0;\n"; + + // case bias is present + if (!fNC.empty()){ + if (fNC2 == fNC) { + // add a check in case broadcasting was not needed or done outside of session + // C should have smaller dimension of Y + if (!fIsDynamic) { + if (std::stoi(lengthGemm) != static_cast(ConvertShapeToLength(fShapeC))) + throw std::runtime_error("TMVA SOFIE Gemm Op " + opName + " Bias tensor has not correct size " + + ConvertShapeToString(fShapeC) + " output length " + lengthGemm); + } else { + // add a dynamic check (C should not be a dynamic tensor) + out << SP << "assert(" << lengthGemm << " != " << 
ConvertShapeToLength(fShapeC) << ");\n"; + } + } + } else { + //in this case fAttrBeta needs to be equal to zero otherwise second time we run we will use + // the previous result + if (fAttrBeta != 0) { + throw std::runtime_error("TMVA SOFIE Gemm Op " + opName + " Bias tensor is not present but beta value in Gemm is not zero"); + } + } + + // include MatMul case where we stack the Gemm operations + // exclude case where we have only 1's in the additional dims + bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra) > 1); + if (doStackMul) { + out << SP << "size_t " << opName << "_yoffset = 0;\n"; // needed if we stack the gemm operations + out << SP << "for (int i = 0; i < " << lengthExtra << "; i++){\n"; + out << SP; + } + // in the case of bias + if (!fNC.empty()){ + if (fActivation == EActivationType::RELU){ + out << SP << "blas.gemmrelu("< GetBlasRoutines() override { return { std::string("Gemm"), std::string("Gemv") }; } + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNY); + fNY = fusable_tensor_name; + fOutputTensorNames[0] = fNY; + } + std::string GetBlasConfig(){ + int64_t dimA = fShapeA.size(); + int64_t dimB = fShapeB.size(); + auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); + auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); + auto k = (fAttrTransA ? 
fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + return n+", "+m+", "+k; + } }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc b/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc index bec7760..ebf4daf 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc +++ b/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc @@ -1,7 +1,6 @@ #ifndef SOFIE_ROPERATOR_LSTM_I #define SOFIE_ROPERATOR_LSTM_I - namespace SOFIE { template @@ -291,7 +290,7 @@ auto ROperator_LSTM::Generate(std::string OpName) // set the input if (fAttrLayout == 0) { - out << SP << fType << " *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << fType << " const *" << OpName << "_input = tensor_" << fNX << ";\n"; } else { if (fUseSession) out << SP << fType << " * " << OpName << "_input = fVec_" << OpName << "_input.data();\n"; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx index 17b77b3..4a328de 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx @@ -81,7 +81,7 @@ public: throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found."); } bool isDynamic = model.IsDynamicTensor(fNX); - fShapeX = model.GetDynamicTensorShape(fNX); + fShapeX = model.GetDimTensorShape(fNX); fShapeY = fShapeX; model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); // Type of the output @@ -93,13 +93,13 @@ public: // Shape of fShapeX[0, ..., fAxis) fAxesShape = std::vector(fShapeX.begin(), fShapeX.begin() + fAxis); // Length of the axes - fAxesLength = ConvertDynamicShapeToLength(fAxesShape); + fAxesLength = ConvertDimShapeToLength(fAxesShape); // Shape of fShapeX[fAxis, ..., fSize) fNormalizedShape = std::vector(fShapeX.begin() + fAxis, fShapeX.end()); // Length of the normalized axis - fNormalizedLength = ConvertDynamicShapeToLength(fNormalizedShape); + fNormalizedLength = ConvertDimShapeToLength(fNormalizedShape); // 
length of the input - fLength = ConvertDynamicShapeToLength(fShapeX); + fLength = ConvertDimShapeToLength(fShapeX); // Type of mean and std ETensorType type = (fAttrStashType == 1) ? ETensorType::FLOAT : model.GetTensorType(fNX); // Mean @@ -146,7 +146,7 @@ public: out << SP << "// Broadcasting the bias of LayerNormalization op\n"; out << SP << "{\n"; out << SP << SP << "float* data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_"; - out << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertDynamicShapeToString(fShapeX) << ");\n"; + out << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeX) << ");\n"; out << SP << "std::copy(data, data + " << fLength << ", tensor_" << fNBroadcastedB << ");\n"; out << SP << "delete[] data;\n"; out << SP << "}\n"; @@ -339,5 +339,4 @@ public: } // namespace SOFIE - #endif diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx index 8fefa6d..0f3b699 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx @@ -75,6 +75,60 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override { + std::string op; + op = "\n//------ LEAKY_RELU_KERNEL_ALPAKA\n"; + op += "struct LeakyReluKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements, T alpha) const {\n"; + op += SP + SP + "const auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + "if(idx < numElements) {\n"; + op += SP + SP + "out[idx] = data[idx] >= 0 ? 
data[idx] : alpha * data[idx];\n"; + op += SP + SP + "}\n"; + op += SP + "}\n"; + op += "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return "LeakyReluKernel leakyReluKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator LeakyRelu called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertShapeToLength(fShape); + out << "\n//------ LEAKY_RELU_GPU_ALPAKA\n"; + out << SP << "constexpr float " << OpName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << falpha << ";\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const kernelCfg_" << fNX << " = {elementsPerGrid_" << fNX << ", elementsPerThread_" << fNX << "};\n"; + out << SP << "auto const workDiv_" << fNX << " = alpaka::getValidWorkDiv(kernelCfg_" << fNX << ", devAcc, leakyReluKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "), " << OpName << "_alpha);\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX + << ", leakyReluKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "), " << OpName << "_alpha);\n"; + return out.str(); + } + + + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc b/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc index c03c1c2..c10c2a5 100644 --- 
a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc +++ b/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc @@ -1,7 +1,6 @@ #ifndef SOFIE_ROPERATOR_RNN_I #define SOFIE_ROPERATOR_RNN_I - namespace SOFIE { template @@ -230,7 +229,7 @@ auto ROperator_RNN::Generate(std::string OpName) // set the input if (fAttrLayout == 0) { if (fType == "float") { - out << SP << "float *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << "float const*" << OpName << "_input = tensor_" << fNX << ";\n"; } } else { if (fUseSession) diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx index 8af272d..3e8605e 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx @@ -8,7 +8,6 @@ #include #include - namespace SOFIE{ template @@ -89,9 +88,9 @@ public: model.AddDynamicTensor(fNOutput, type, fShape); } if (model.Verbose()) { - std::cout << "Range -> output is " << fNOutput << " "; - if (fIsOutputConstant) std::cout << ConvertDynamicShapeToString(fShape) << std::endl; - else std::cout << ConvertDynamicShapeToString(model.GetDynamicTensorShape(fNOutput)) << std::endl; + std::cout << "Range -> output is " << fNOutput << " : " << ConvertShapeToString(fShape); + if (fIsOutputConstant) std::cout << " : " << ConvertValuesToString(model.GetTensorData(fNOutput)); + std::cout << std::endl; } } @@ -121,5 +120,5 @@ public: }; }//SOFIE - + #endif //SOFIE_ROPERATOR_RANGE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx index 8062dca..5fb2f04 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx @@ -24,6 +24,7 @@ public: ROperator_Relu(){} ROperator_Relu(std::string nameX, std::string nameY): fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ + fKind = OperatorKind::RELU; fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; } @@ -42,11 +43,11 @@ public: throw 
std::runtime_error("TMVA SOFIE Relu Op Input Tensor " + fNX + " is not found in model"); } - fShape = model.GetDynamicTensorShape(fNX); + fShape = model.GetDimTensorShape(fNX); model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); if (model.Verbose()) { - std::cout << "Relu : " << fNX << " -> " << fNY << " " << ConvertDynamicShapeToString(fShape) << std::endl; + std::cout << "Relu : " << fNX << " -> " << fNY << " " << ConvertDimShapeToString(fShape) << std::endl; } } @@ -65,6 +66,48 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA() { + std::string op; + op = "\n//------ RELU_KERNEL_ALPAKA\n"; + op += SP + "struct ReluKernel{\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T* data, std::size_t numElements) const {\n"; + op += SP + SP + SP + "for (auto i : alpaka::uniformElements(acc, numElements)) {\n"; + op += SP + SP + SP + "data[i] = (data[i] < 0) ? 0 : data[i];\n"; + op += SP + SP + "}\n"; + op += SP + "}\n};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "ReluKernel reluKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Relu called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertDynamicShapeToLength(fShape); + out << "\n//------ RELU_GPU_ALPAKA\n"; + out << SP << "alpaka::WorkDivMembers workDiv_"<::all("<<(stoi(length)+256-1)/256<<"), alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX << ", reluKernel, alpaka::getPtrNative(deviceBuf_" << fNX << "), static_cast(" << length << ")); \n"; + return out.str(); + } + + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const 
std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx index 66a7e09..0a21709 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx @@ -6,7 +6,10 @@ #include "SOFIE/RModel.hxx" #include +#include #include +#include + namespace SOFIE{ @@ -19,17 +22,20 @@ class ROperator_Reshape final : public ROperator private: bool fVerbose = false; + bool fDimInput = false; + bool fDynamicShape = false; ReshapeOpMode fOpMode = Reshape; // type of Reshape operator int fAllowZero = 0; // (for Reshape) zero in tensor shape makes output shape equal to input tensor shape int fAxis = 1; // (for Flatten) std::string fNData; // input data tensor name - std::string fNShape; // reshape tensor name + std::string fNInput2; // reshape or axes tensor name depending on operator std::string fNOutput; // output tensor name - std::vector fShapeInput; // input shape data - std::vector fShapeOutput; // output shape data + std::vector fShapeInput; // input shape data + std::vector fShapeOutput; // output shape data std::vector fAttrAxes; // axes attributes (provided for all version of Squeeze/Unsqueeze) + std::vector fShape; // shape tensor values provided for Reshape public: @@ -42,16 +48,16 @@ public: } ROperator_Reshape(){} - ROperator_Reshape(ReshapeOpMode opMode, int attr_value, std::string nameData, std::string nameShape, std::string nameOutput) - : fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), fNShape(UTILITY::Clean_name(nameShape)), - fNOutput(UTILITY::Clean_name(nameOutput)) + ROperator_Reshape(ReshapeOpMode opMode, int attr_value, std::string nameData, std::string nameInput2, std::string nameOutput) + : fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), 
fNInput2(UTILITY::Clean_name(nameInput2)), + fNOutput(UTILITY::Clean_name(nameOutput)) { if (opMode == Reshape) fAllowZero = attr_value; if (opMode == Flatten) fAxis = attr_value; fInputTensorNames = { fNData }; - if(!fNShape.empty()){ - fInputTensorNames.emplace_back(fNShape); + if(!fNInput2.empty()){ + fInputTensorNames.emplace_back(fNInput2); } fOutputTensorNames = { fNOutput }; } @@ -70,94 +76,153 @@ public: auto ret = std::vector(1, input[0]); return ret; } + std::vector> ShapeInference(std::vector> input) override { + return input; + } // output shape - std::vector> ShapeInference(std::vector> input) override { - std::vector> ret; + std::vector> ShapeInference(const std::vector> & input) { + std::vector> ret; auto & input_shape = input[0]; - if (fOpMode == Reshape) { - if (input.size() != 2) throw std::runtime_error("TMVA SOFIE Reshape Op needs 2 input tensors"); - auto output_shape = input[1]; // the provided shape - size_t input_length = ConvertShapeToLength(input_shape); - size_t output_length = ConvertShapeToLength(output_shape); - // (input_length == output_length) is the easy case : (2,3,4) -> (2,12) - if (input_length != output_length) { - if ((output_length == 0 && fAllowZero == 0) || static_cast(output_length) < 0) { - // in this case value 0 or -1 in shape are automatically corrected - bool replacementDone = false; - for (size_t i = 0; i < output_shape.size(); i++) { - if (output_shape[i] == 0 || output_shape[i] == static_cast(-1)) { - if (replacementDone) { - throw std::runtime_error("TMVA Reshape Op : output shape has multiple negative or zero values"); + // correct the provided shape (here we have the value) for 0 or -1 + std::vector output_shape(fShape.size()); + assert(!fShape.empty() && !fDynamicShape); + for (size_t i = 0; i < output_shape.size(); i++) { + if (fShape[i] > 0 || (fAllowZero && fShape[i] >= 0)) + output_shape[i] = Dim{ static_cast(fShape[i]) }; + else if (!fAllowZero && fShape[i] == 0) + output_shape[i] = input_shape[i]; + } + 
// now case of -1 in shape + for (size_t i = 0; i < output_shape.size(); i++) { + if (fShape[i] == -1) { + auto tmp = output_shape; + tmp.erase(tmp.begin() + i); + auto tmp_length = ConvertDimShapeToLength(tmp); + auto input_length = ConvertDimShapeToLength(input_shape); + if (fVerbose) + std::cout << "reshape- try simplifying " << ConvertDimShapeToString(input_shape) << " with length " + << input_length << " to " << tmp_length << std::endl; + + if (IsInteger(tmp_length) && IsInteger(input_length)) + output_shape[i] = Dim{static_cast(std::stoi(input_length) / std::stoi(tmp_length))}; + else { + //we can try simplifying expression if tmp_length is integer and part of input_length + // contains tmp_length + bool canSimplify = false; + std::vector reduced_input; + if (IsInteger(tmp_length)) { + + // try to tokenize with * the input length + + std::stringstream ss(input_length); + + std::string token; + + // Tokenizing w.r.t. space '*' + while(getline(ss, token, '*')) + { + // remove any whitespace + token.erase(std::remove_if(token.begin(), token.end(), + [](unsigned char x) { return std::isspace(x); }), token.end()); + if (token != tmp_length) { + if (IsInteger(token)) { + size_t il = static_cast(std::stoi(input_length)); + size_t tl = static_cast(std::stoi(tmp_length)); + if ((il % tl) == 0) { + canSimplify = true; + reduced_input.push_back(Dim{il / tl}); + } + } else { + reduced_input.push_back(Dim{token}); + } + } else { + // token is equal to tmp_length, can be not considered and is simplified + canSimplify = true; + } } - auto tmp = output_shape; - tmp.erase(tmp.begin() + i); - auto tmp_length = ConvertShapeToLength(tmp); - output_shape[i] = input_length / tmp_length; - replacementDone = true; } + if (canSimplify) { + // if length contains * we need to add some brackets + std::string res_shape = ConvertDimShapeToLength(reduced_input); + if (res_shape.find('*') != std::string::npos) + output_shape[i] = Dim{std::string("(") + res_shape + ")", static_cast(-1)}; + 
else + output_shape[i] = Dim{res_shape}; + } + if (!canSimplify) + output_shape[i] = Dim{std::string("(") + input_length + " / (" + tmp_length + "))", static_cast(-1)}; } - if (fVerbose) - std::cout << "Reshape: correct output shape from " << ConvertShapeToString(input[1]) - << " to " << ConvertShapeToString(output_shape) << std::endl; - } - if (ConvertShapeToLength(output_shape) != input_length) { - throw std::runtime_error("TMVA Reshape Op : Invalid shapes : " + ConvertShapeToString(input_shape) + - ConvertShapeToString(output_shape)); + + break; // cannot have more than -1 } + // throw std::runtime_error( + // "TMVA Reshape Op : output shape has multiple negative or zero values"); + } + + if (fVerbose) + std::cout << "Reshape: correct output shape to " << ConvertShapeToString(output_shape) << std::endl; + + if (!fDimInput && ConvertDimShapeToLength(output_shape) != ConvertDimShapeToLength(input_shape)) { + throw std::runtime_error("TMVA Reshape Op : Invalid shapes : " + ConvertShapeToString(input_shape) + + ConvertShapeToString(output_shape)); } ret.push_back(output_shape); } else if (fOpMode == Flatten) { - // flattenig case - size_t inputSize = ConvertShapeToLength(input_shape); - size_t b = input[0][0]; - std::vector newShape = {b, inputSize / b}; + // flatten case + if (fAxis < 0) + fAxis += input_shape.size(); + auto s1 = std::vector(input_shape.begin(), input_shape.begin() + fAxis); + auto s2 = std::vector(input_shape.begin() + fAxis, input_shape.end()); + auto l1 = ConvertDimShapeToLength(s1); + auto l2 = ConvertDimShapeToLength(s2); + std::vector newShape = {Dim{l1}, Dim{l2}}; ret.push_back(newShape); - } else if (fOpMode == Squeeze) { // squeeze // assume no axis is provided - remove all axes with value equal to 1 - auto output_shape = input[0]; - if (input.size() == 1) { + auto output_shape = input_shape; + if (fAttrAxes.empty()) { size_t i = 0; while (i < output_shape.size()) { - if (output_shape[i] == 1 ) { + if (output_shape[i] == Dim{1}) { 
output_shape.erase(output_shape.begin() + i); - } - else { + } else { i++; } } - } else if (input.size() == 2) { - auto & axes = input[1]; - for (size_t i = 0; i < axes.size(); i++){ - if (output_shape[axes[i]] != 1) - throw std::runtime_error("TMVA Squeeze Op : Invalid axes : " + ConvertShapeToString(axes) + - ConvertShapeToString(output_shape)); + } else { + auto &axes = fAttrAxes; + for (size_t i = 0; i < axes.size(); i++) { + if (axes[i] < 0) + axes[i] += input_shape.size(); + if (!(output_shape[axes[i]] == Dim{1})) + throw std::runtime_error("TMVA Squeeze Op : Invalid axis value " + std::to_string(axes[i]) + + " for " + ConvertShapeToString(output_shape)); output_shape.erase(output_shape.begin() + axes[i]); } } ret.push_back(output_shape); } - else if (fOpMode == Unsqueeze) { // unsqueeze - assert(input.size() == 2); - auto output_shape = input[0]; - auto &axes = input[1]; + std::cout << "doing unsqueeze....\n"; + assert(!fAttrAxes.empty()); + auto output_shape = input_shape; + auto &axes = fAttrAxes; // output rank int64_t r = input[0].size() + axes.size(); - for (auto & a : axes) { + for (auto &a : axes) { int64_t i = static_cast(a); - if ( i < -r || i > r - 1 ) + if (i < -r || i > r - 1) throw std::runtime_error("TMVA Unsqueeze Op - axes input is not in correct range"); if (i >= 0) - output_shape.insert(output_shape.begin() + i, 1); + output_shape.insert(output_shape.begin() + i, Dim{1}); else - //negative axes - output_shape.insert(output_shape.end() + i + 1, 1); + // negative axes + output_shape.insert(output_shape.end() + i + 1, Dim{1}); } ret.push_back(output_shape); } @@ -166,34 +231,51 @@ public: void Initialize(RModel& model) override { + std::cout << "initialize reshape op type " << fOpMode << " - " << fNInput2 << " " << fNData << std::endl; fVerbose = model.Verbose(); if (model.CheckIfTensorAlreadyExist(fNData) == false) { // input must be a graph input, or already initialized intermediate tensor throw std::runtime_error("TMVA Reshape Op Input 
Tensor " + fNData + " is not found in model"); } - fShapeInput = model.GetTensorShape(fNData); - // check if optional shape tensor exist - if (!fNShape.empty()) { - if (model.CheckIfTensorAlreadyExist(fNShape)) { - auto dptr = model.GetInitializedTensorData(fNShape); - auto input_shape = static_cast(dptr.get()); - auto vec = model.GetTensorShape(fNShape); - assert(vec.size() == 1); - size_t n = vec[0]; // size of shape input tensor - - std::vector descShape(n); - std::copy(input_shape, input_shape + n, descShape.begin()); - fShapeOutput = ShapeInference({fShapeInput, descShape})[0]; - // set flag to not write tensor in weight file. Its data will be hard-coded in way model is constructed - model.SetNotWritableInitializedTensor(fNShape); + fShapeInput = model.GetDimTensorShape(fNData); + fDimInput = model.IsDynamicTensor(fNData); + // check if optional tensor exists defining shape or axes + if (!fNInput2.empty()) { + if (model.CheckIfTensorAlreadyExist(fNInput2)) { + if (model.IsConstantTensor(fNInput2) || model.IsInitializedTensor(fNInput2)) { + // assume input shape is an initialized tensor + auto dptr = model.GetInitializedTensorData(fNInput2); + auto values = static_cast(dptr.get()); + auto vec = model.GetTensorShape(fNInput2); + size_t n = 1; + if (vec.size() > 0) + n = vec[0]; // size of shape input tensor + // copy values in fShape vector or fAttrAxes + if (fOpMode == Reshape) + fShape = std::vector(values, values + n); + else + fAttrAxes = std::vector(values, values + n); + + fShapeOutput = ShapeInference({fShapeInput})[0]; + // set flag to not write tensor in weight file. 
Its data will be hard-coded in way model is constructed + model.SetNotWritableInitializedTensor(fNInput2); + } else { + // we cannot get shape at initialization time but at run-time + fDynamicShape = true; + // size of shape output us given by size of shape input tensor + auto shapeInput2 = model.GetTensorShape(fNInput2); + fShapeOutput.resize(shapeInput2[0]); + for (size_t i = 0; i < fShapeOutput.size(); i++) { + fShapeOutput[i] = Dim{ std::string("s_") + fNOutput + "_" + std::to_string(i)}; + } + } } else { - throw std::runtime_error("TMVA Reshape Op Shape Tensor " + fNShape + " is not found in model"); + throw std::runtime_error("TMVA Reshape Op 2nd input Tensor " + fNInput2 + " is not found in model"); } } else if (!fAttrAxes.empty()) { - // case fNShape is empty and axes are provided as attributes - std::vector descShape(fAttrAxes.size()); - std::copy(fAttrAxes.begin(), fAttrAxes.end(), descShape.begin()); - fShapeOutput = ShapeInference({fShapeInput, descShape})[0]; + // case fNShape is empty and axes are provided as attributes (e.g. 
for Unsqueeze) + std::cout << "attribute axes exists\n"; + fShapeOutput = ShapeInference({fShapeInput})[0]; } else if (fOpMode == Flatten || fOpMode == Squeeze) { fShapeOutput = ShapeInference({fShapeInput})[0]; } else { @@ -203,14 +285,26 @@ public: if (model.IsInitializedTensor(fNData) && model.GetTensorType(fNData) == ETensorType::INT64) { fIsOutputConstant = true; auto inputData = static_cast(model.GetInitializedTensorData(fNData).get()); - if (ConvertShapeToLength(fShapeInput) != ConvertShapeToLength(fShapeOutput)) + auto o_shape = ConvertShapeToInt(fShapeOutput); + if (ConvertShapeToLength(ConvertShapeToInt(fShapeInput)) != ConvertShapeToLength(o_shape) ) throw std::runtime_error("TMVA Reshape Op : Invalid Input/Output lengths"); - model.AddConstantTensor(fNOutput, fShapeOutput, inputData); + model.AddConstantTensor(fNOutput, o_shape, inputData); if (model.Verbose()) { std::cout << Name() << " : " << fNData << " " << ConvertShapeToString(fShapeInput) << " --> " << fNOutput << " (constant) " << ConvertShapeToString(fShapeOutput) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeOutput), inputData) << std::endl; + ConvertValuesToString(ConvertShapeToLength(o_shape), inputData) << std::endl; } - } else { + } + // for shape tensors we can have it if output shape is size==1 or a scalar + else if (model.IsShapeTensor(fNData) && fShapeOutput.size() <=1) { + fIsOutputConstant = true; + auto inputData = model.GetShapeTensorValues(fNData); + model.AddShapeTensor(fNOutput, inputData); + if (model.Verbose()) { + std::cout << Name() << " : " << fNData << " " << ConvertShapeToString(fShapeInput) << " --> " << fNOutput << " (shape) " << ConvertShapeToString(fShapeOutput) << " : " << + ConvertShapeToString(inputData) << std::endl; + } + } + else { // non-constant case model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput); if (model.Verbose()) @@ -218,32 +312,74 @@ public: } } - std::string Generate(std::string OpName) override { + 
std::string Generate(std::string opName) override { if (fIsOutputConstant) return ""; //no op for constant tensors - OpName = "op_" + OpName; + std::stringstream out; + std::string opType = "Reshape"; + if (fOpMode == Flatten) + opType = "Flatten"; + else if (fOpMode == Squeeze) + opType = "Squeeze"; + else if (fOpMode == Unsqueeze) + opType = "Unsquueze"; + + out << SP << "///--------" << opType << " operator " << opName << " --> " << ConvertShapeToString(fShapeOutput) << "\n"; + + // in case of dynamic output shape we need to set the shape value from input shape tensor + // and take case of the zero values + if (fDynamicShape) { + for (size_t i = 0; i < fShapeOutput.size(); i++) { + // since fNInput2 values are int64_t, should we check if they are negative? + out << SP << "size_t " << fShapeOutput[i].param << " = " << "tensor_" << fNInput2 << "[" << i << "];\n"; + if (!fAllowZero) + out << SP << "if (tensor_" << fNInput2 << "[" << i << "] <= 0 ) " + << fShapeOutput[i].param << " = " << fShapeInput[i] << ";\n"; + } + } // output of reshape is same as input - size_t length = ConvertShapeToLength(fShapeOutput); - if (length != ConvertShapeToLength(fShapeInput)) { + auto lengthOut = ConvertDimShapeToLength(fShapeOutput); + auto lengthIn = ConvertDimShapeToLength(fShapeInput); + if (lengthOut != lengthIn) { + // check needs to be done at run-time + out << SP << "if (" << lengthOut << "!=" << lengthIn << ")\n"; + out << "throw std::runtime_error(\"TMVA SOFIE Reshape Op : output lengths is different than input one\");\n"; + } + + + out << SP << "std::copy( tensor_" << fNData << ", tensor_" << fNData << " + " << lengthIn << ", " << "tensor_" << fNOutput + << ");\n"; + return out.str(); + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; //no op for constant tensors + + opName = "op_" + opName; + + // output of reshape is same as input + auto length = ConvertDimShapeToLength(fShapeOutput); + if (length != 
ConvertDimShapeToLength(fShapeInput)) { throw std::runtime_error("TMVA SOFIE Reshape Op : wrong output shape - is " + - ConvertShapeToString(fShapeOutput) + " and input is " + - ConvertShapeToString(fShapeInput)); + ConvertDimShapeToString(fShapeOutput) + " and input is " + + ConvertDimShapeToString(fShapeInput)); } std::stringstream out; - std::string opName = "Reshape"; + opName += "_Reshape"; if (fOpMode == Flatten) - opName = "Flatten"; + opName += "_Flatten"; else if (fOpMode == Squeeze) - opName = "Squeeze"; + opName += "_Squeeze"; else if (fOpMode == Unsqueeze) - opName = "Unsquueze"; + opName += "_Unsquueze"; + - out << SP << "///--------" << opName << " operator\n" << std::endl; - out << SP << "std::copy( tensor_" << fNData << ", tensor_" << fNData << " + " << length << ", " << "tensor_" << fNOutput - << ");\n"; + out << SP << "///-------" << opName << " operator\n" << std::endl; + out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNOutput << ", deviceBuf_" << fNData << ");\n"; return out.str(); } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx index 52bdeae..34e69eb 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx @@ -101,6 +101,26 @@ public: return out.str(); } + std::string Generate_GPU_ALPAKA(std::string OpName) override { + // no need to generate code if the output is constant + if (fIsOutputConstant) return ""; + + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Shape op called to Generate without being initialized first"); + } + std::stringstream out; + + out << "\n//------ Shape\n"; + // add a dummy statement to avoid warning for unused input + out << SP << "(void) deviceBuf_" << fNX << ";\n"; + size_t length = ConvertShapeToLength(fOutput_shape); + for (size_t id = 0; id < length; id++) { + out << SP << "deviceBuf_" << fNY << "["<< id << "] = " << fShape[fStart+id] << ";\n"; + } 
+ return out.str(); + } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx index 68edd01..5edbcf9 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx @@ -61,6 +61,59 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override { + std::string op; + op = "\n//------ SIGMOID_KERNEL_ALPAKA\n"; + op += "struct SigmoidKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements) const {\n"; + op += SP + SP + "const auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + "if(idx < numElements) {\n"; + op += SP + SP + SP + SP + "out[idx] = static_cast(1) / (static_cast(1) + exp(-data[idx]));\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "SigmoidKernel sigmoidKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Sigmoid called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertShapeToLength(fShape); + out << "\n//------ SIGMOID_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const kernelCfg_" << fNX << " = {elementsPerGrid_" << fNX << ", elementsPerThread_" << fNX << "};\n"; + out << SP << "auto const workDiv_" << fNX << " = alpaka::getValidWorkDiv(kernelCfg_" << fNX << ", devAcc, sigmoidKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX + << ", 
sigmoidKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + return out.str(); + } + + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } + std::vector GetStdLibs() override { return { std::string("cmath") };} }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx index 63fbcb3..6335db3 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx @@ -153,6 +153,72 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) { + std::string op; + op = "\n//------ SPLIT_KERNEL_ALPAKA\n"; + op += SP + "struct SplitKernel {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * input, T * output,"; + op += "std::size_t const * input_strides, std::size_t const * output_strides, std::size_t const split_axis, "; + op += "std::size_t const axis_offset, std::size_t const ndim) const {\n"; + op += SP + SP + SP + SP + "auto elements = alpaka::uniformElementsND(acc, alpaka::Vec(output_shape));\n"; + op += SP + SP + SP + SP + "for (auto const& elem : elements) {\n"; + op += SP + SP + SP + SP + SP + "size_t input_idx = 0;\n"; + op += SP + SP + SP + SP + SP + "size_t output_idx = 0;\n"; + op += SP + SP + SP + SP + SP + "for (int i = 0; i < ndim; ++i) {\n"; + op += SP + SP + SP + SP + SP + SP + "size_t output_coord = elem[i];\n"; + op += SP + SP + SP + SP + SP + SP + "size_t input_coord = (i == split_axis) ? 
(output_coord + axis_offset) : output_coord;\n"; + op += SP + SP + SP + SP + SP + SP + "input_idx += input_coord * input_strides[i];\n"; + op += SP + SP + SP + SP + SP + SP + "output_idx += output_coord * output_strides[i];\n}\n"; + op += SP + SP + SP + SP + SP + "output[output_idx] = input[input_idx];\n"; + op += SP + SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "SplitKernel splitKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fOutputShapes.empty()){ + throw std::runtime_error("TMVA SOFIE Operator Split called to Generate without being initialized first"); + } + + std::stringstream out; + out << "\n//------ SPLIT_GPU_ALPAKA\n"; + + bool axis_is_innermost = (fAxis == static_cast(fInputShape.size()) - 1) + && (UTILITY::ComputeStrideFromShape(fInputShape)[fInputShape.size()-1] == 1); + out << SP <<"size_t "<(" << length << ") * sizeof(float);\n"; + out << SP << SP << SP << "alpaka::memcpy(queue, "< workDiv_" << fNYs[i] + << "(alpaka::Vec::all((" << length << " + 256 - 1) / 256), " + << "alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + + out << SP << "alpaka::exec(queue, workDiv_" << fNYs[i] + << ", splitKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNYs[i] + << "), "<< ConvertShapeToString(UTILITY::ComputeStrideFromShape(fInputShape)) <<", "<\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * __restrict__ tensor_X,"; + op += SP + SP + SP + "T * __restrict__ tensor_Y, const int64_t * __restrict__ shape_X,"; + op += SP + SP + SP + "const int64_t * __restrict__ stride_X, const int64_t * __restrict__ shape_Y,"; + op += SP + SP + SP + "const int64_t * __restrict__ stride_Y, std::size_t const ndim) const {\n"; + op += SP + SP + SP + SP + "auto elements = alpaka::uniformElementsND(acc, 
alpaka::Vec(shape_Y));\n"; + op += SP + SP + SP + SP + "for (auto const& elem: elements) {\n"; + op += SP + SP + SP + SP + SP + "size_t input_idx = 0;\n"; + op += SP + SP + SP + SP + SP + "size_t output_idx = 0;\n"; + op += SP + SP + SP + SP + SP + "for (int i = 0; i < ndim; ++i) {\n"; + op += SP + SP + SP + SP + SP + SP + "size_t input_coord = elem[i] % shape_X[i];\n"; + op += SP + SP + SP + SP + SP + SP + "input_idx += input_coord * stride_X[i];\n"; + op += SP + SP + SP + SP + SP + "output_idx += elem[i] * stride_Y[i];\n}\n"; + op += SP + SP + SP + SP + SP + "tensor_Y[output_idx] = tensor_X[input_idx];\n"; + op += SP + SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "TileKernel tileKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShapeInput.empty() || fShapeY.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Tile called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertShapeToLength(fShapeY); + out << "\n//------ TILE_GPU_ALPAKA\n"; + out << SP << "alpaka::WorkDivMembers workDiv_" << fNY + << "(alpaka::Vec::all((" << length << " + 256 - 1) / 256), " + << "alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + + out << SP << "alpaka::exec(queue, workDiv_" << fNY + << ", tileKernel, alpaka::getPtrNative(deviceBuf_" << fNInput + << "), alpaka::getPtrNative(deviceBuf_" << fNY + << "), "<< ConvertShapeToString(fShapeInput)<<", "<< ConvertShapeToString(UTILITY::ComputeStrideFromShape(fShapeInput)) <<", " + <\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * input, T const * output, std::size_t * shape, std::size_t * strides) const {\n"; + op += SP + SP + SP + "for (auto i : alpaka::uniformElementsND(acc, shape)) {\n"; + op += SP + SP + SP + SP + "size_t input_idx = 0;\n"; + + 
return op; + } + }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx index 28ac093..19d217d 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx @@ -7,7 +7,6 @@ #include - namespace SOFIE{ @@ -150,32 +149,86 @@ public: fShapeY = fShapeA; } // check case of constant output (if all inputs are defined) - if (model.IsInitializedTensor(fNA) && model.IsInitializedTensor(fNB) && model.IsInitializedTensor(fNC)) { - std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; + if (model.IsInitializedTensor(fNC)) { + std::string nameC = fNBroadcastedC.empty()? fNC : fNBroadcastedC; - auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); - auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); auto dataC = static_cast(model.GetInitializedTensorData(nameC).get()); - std::vector dataY(ConvertShapeToLength(fShapeY)); - for (size_t i = 0; i < dataY.size(); i++) - dataY[i] = (dataC[i]) ? dataA[i] : dataB[i]; - model.AddConstantTensor(fNY, fShapeY, dataY.data()); - // flag tensors to not be written in a file - model.SetNotWritableInitializedTensor(nameA); - model.SetNotWritableInitializedTensor(nameB); model.SetNotWritableInitializedTensor(nameC); + T * dataA = nullptr; + T * dataB = nullptr; + std::vector shapeDataA; + std::vector shapeDataB; + if (model.IsInitializedTensor(fNA)) { + std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; + dataA = static_cast(model.GetInitializedTensorData(nameA).get()); + // flag tensors to not be written in a file + model.SetNotWritableInitializedTensor(nameA); + } else if (model.IsShapeTensor(fNA)) + shapeDataA = model.GetShapeTensorValues(fNA); + if (model.IsInitializedTensor(fNB)) { + std::string nameB = fNBroadcastedB.empty()? 
fNB : fNBroadcastedB; + dataB = static_cast(model.GetInitializedTensorData(nameB).get()); + model.SetNotWritableInitializedTensor(nameB); + } else if (model.IsShapeTensor(fNB)) + shapeDataB = model.GetShapeTensorValues(fNB); - fIsOutputConstant = true; - if (model.Verbose()) + std::vector dataY; + std::vector shapeDataY; + + bool isOutputConstantTensor = true; + if (dataA && dataB) { + dataY.resize(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < dataY.size(); i++) + dataY[i] = (dataC[i]) ? dataA[i] : dataB[i]; + } + else if (dataA && shapeDataB.size()>0 ) { + shapeDataY.resize(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < shapeDataY.size(); i++) { + shapeDataY[i] = (dataC[i]) ? Dim{size_t(dataA[i])} : shapeDataB[i]; + isOutputConstantTensor &= !shapeDataY[i].isParam; + } + } + else if (dataB && shapeDataA.size()>0 ) { + shapeDataY.resize(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < shapeDataY.size(); i++) { + shapeDataY[i] = (dataC[i]) ? shapeDataB[i] : Dim{size_t(dataB[i])}; + isOutputConstantTensor &= !shapeDataY[i].isParam; + } + } + else if (shapeDataB.size() > 0 && shapeDataA.size()>0 ) { + shapeDataY.resize(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < shapeDataY.size(); i++) { + shapeDataY[i] = (dataC[i]) ? 
shapeDataA[i] : shapeDataB[i]; + isOutputConstantTensor &= !shapeDataY[i].isParam; + } + } + fIsOutputConstant = true; // this contains both case constant tensor output ans shape tensor output + if (isOutputConstantTensor && dataY.empty()) { + dataY.resize(shapeDataY.size()); + for (size_t i = 0; i < shapeDataY.size(); i++) + dataY[i] = static_cast(shapeDataY[i].dim); + } + if (dataY.size() > 0) + model.AddConstantTensor(fNY, fShapeY, dataY.data()); + else if (shapeDataY.size() > 0 ) + model.AddShapeTensor(fNY, shapeDataY, fShapeY.size() == 0); + else { + fIsOutputConstant = false; + } + if (fIsOutputConstant && model.Verbose()) std::cout << "Where op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(dataY) << std::endl; - + << ((dataY.size() > 0) ? ConvertValuesToString(dataY) : ConvertShapeToString(shapeDataY) ) + << ((dataY.size() > 0) ? " (constant)" : " (shape)") << std::endl; + // output is a constant tensor - fOutputTensorNames.pop_back(); + if (fIsOutputConstant) fOutputTensorNames.pop_back(); } - else { + if (!fIsOutputConstant) { model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); + if (model.Verbose()) + std::cout << "Where op " << " condition : " << fNC << " " << ConvertShapeToString(fShapeC) << + " X " << fNA << " " << ConvertShapeToString(fShapeA) << " Y " << fNB << " " << ConvertShapeToString(fShapeB) + << " ---> " << fNY << " " << ConvertShapeToString(fShapeY) << std::endl; } } @@ -184,17 +237,17 @@ public: return out.str(); } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override { if (fIsOutputConstant) return ""; - OpName = "op_" + OpName; + opName = "op_" + opName; if (fShapeY.empty()) { throw std::runtime_error("TMVA SOFIE Where Op called to Generate without being initialized first"); } std::stringstream out; - out << SP << "\n//-------- Where \n"; + out << SP << "\n//-------- Where " << opName << " --> " << ConvertShapeToString(fShapeY) 
<< "\n"; size_t length = ConvertShapeToLength(fShapeY); std::string typeName = TensorType::Name(); // Broadcast A if it's uninitialized @@ -216,19 +269,18 @@ public: // special case if C is an input tensor if (fIsInputBoolTensor) { size_t inputLength = ConvertShapeToLength(fShapeC); - out << SP << "std::vector fTensor_" << fNC << "(tensor_" << fNC << ", tensor_" << fNC << " + " << inputLength << ");\n"; + out << SP << "std::vector fTensor_" << fNC << "(tensor_" << fNC << ", tensor_" << fNC << " + " << inputLength << ");\n"; } out << SP << "// Broadcasting uninitialized tensor " << fNC << "\n"; //out << SP << "{\n"; - // for boolean we need to pass vector and use the non-template version of the function - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast(fTensor_" << fNC << ", " << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) + out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast(fTensor_" << fNC << ".data(), " << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) << ", fTensor_" << fNBroadcastedC << ");\n"; } std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; std::string nameC = fNBroadcastedC.empty()? fNC : fNBroadcastedC; out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; - // get output tensor applying condition (note we need to use directly the vector since v.data(), i.e the data pointer, does not exist) + // get output tensor applying condition out << SP << SP << "tensor_" << fNY << "[id] = " << "(fTensor_" << nameC << "[id]) ? 
tensor_" << nameA << "[id] : tensor_" + nameB + "[id];\n"; out << SP << "}\n"; @@ -240,4 +292,4 @@ public: }//SOFIE -#endif //SOFIE_ROperator_Where +#endif // SOFIE_ROperator_Where diff --git a/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx b/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx index d183052..8b9727b 100644 --- a/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx +++ b/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx @@ -1,7 +1,7 @@ #ifndef SOFIE_SOFIE_COMMON #define SOFIE_SOFIE_COMMON -#include "TMVA/RTensor.hxx" +#include "SOFIE/RTensor.hxx" #include "ROOT/RSpan.hxx" @@ -21,13 +21,10 @@ #include #include - -namespace SOFIE{ - -//typedef RTensor tensor_t; +namespace SOFIE { enum class ETensorType{ - UNDEFINED = 0, FLOAT = 1, UNINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5, INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, //order sensitive + UNDEFINED = 0, FLOAT = 1, UINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5, INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, //order sensitive FLOAT16 = 10, DOUBLE = 11, UINT32 = 12, UINT64 = 13, COMPLEX64 = 14, COMPLEX28 = 15, BFLOAT16 = 16 }; @@ -39,7 +36,7 @@ constexpr size_t GetTypeSize(ETensorType type) { switch (type) { case ETensorType::FLOAT: return sizeof(float); case ETensorType::DOUBLE: return sizeof(double); - case ETensorType::UNINT8: return sizeof(uint8_t); + case ETensorType::UINT8: return sizeof(uint8_t); case ETensorType::INT8: return sizeof(int8_t); case ETensorType::UINT16: return sizeof(uint16_t); case ETensorType::INT16: return sizeof(int16_t); @@ -58,6 +55,9 @@ typedef std::int64_t int_t; std::string ConvertTypeToString(ETensorType type); ETensorType ConvertStringToType(std::string type); +// find if a string represents a number +bool IsInteger(const std::string & s); + struct Dim{ bool isParam = false; size_t dim = 0; @@ -67,16 +67,42 @@ struct Dim{ Dim() {} // constructor for a parametric dimension with the option to pass a default dim value - Dim(const std::string & p, size_t d = 0) : isParam(true), dim(d), param(p) {} + // We use -1 for 
dim to indicate that the param dimension is an expression (e.g. "d1+d2") + // in case the string represents a number make Dim not parametric + Dim(const std::string & p, size_t d = 0) : isParam(true), dim(d), param(p) + { + if (IsInteger(p)) { + isParam = false; + dim = std::stoi(p); + } + } // constructor for a non-parametric dimension Dim(size_t d) : dim(d) {} std::string GetVal() const { - return (isParam) ? param : std::to_string(dim); + // cast to int64_t for negative shape values + return (isParam) ? param : std::to_string(static_cast(dim)); + } + + std::ostream& operator<< (std::ostream& os) const { + os << GetVal(); + return os; + } + + bool operator==(const Dim& rhs) const { + return (isParam && rhs.isParam) ? param == rhs.param : dim == rhs.dim; + } + bool operator!=(const Dim& rhs) const { + return !(*this == rhs); } }; +//bool operator==(const Dim& lhs, const Dim& rhs); +inline std::ostream & operator<< (std::ostream &os, const Dim &d) { + os << d.GetVal(); + return os; +} struct InputTensorInfo{ ETensorType type; @@ -93,6 +119,18 @@ struct DynamicTensorInfo{ std::vector shape; }; +// template traits for Tensor Shape +template +struct TensorShape {}; +template<> +struct TensorShape { + static bool IsDim() { return true; } +}; +template<> +struct TensorShape { + static bool IsDim() { return false; } +}; + // template traits for Tensor type template struct TensorType {}; @@ -120,6 +158,10 @@ template<> struct TensorType { static const std::string Name() { return "uint64_t"; } }; +template<> +struct TensorType { + static const std::string Name() { return "bool"; } +}; struct TensorMemoryInfo { std::string_view tensor_name; @@ -148,19 +190,26 @@ struct MemoryPoolInfo { std::map available_stack; }; -std::vector ConvertShapeToDim(std::vector shape); +std::vector ConvertShapeToDim(const std::vector & shape); + +std::vector ConvertShapeToInt(const std::vector & shape); + +inline std::size_t ConvertShapeToLength(const std::vector & shape){ + // Empty shape 
represent scalar values, so we return a length=1 + std::size_t fLength = 1; + for (auto& dim: shape) fLength *= dim; + return fLength; +} + +std::string ConvertShapeToString(const std::vector & shape); +std::string ConvertDimShapeToString(const std::vector & shape); +std::string ConvertShapeToString(const std::vector & shape); -std::vector ConvertShapeToInt(std::vector shape); -std::size_t ConvertShapeToLength(std::vector shape); -std::string ConvertShapeToString(std::vector shape); -std::string ConvertDynamicShapeToString(std::vector shape); -// std::string ConvertShapeToString(std::vector shape) { -// return ConvertDynamicShapeToString(shape); -// } +std::string ConvertDimShapeToLength(const std::vector & shape); +std::string ConvertDynamicShapeToLength(const std::vector & shape); -std::string ConvertDynamicShapeToLength(std::vector shape); template std::string ConvertValToString(T value) { @@ -271,7 +320,7 @@ private: template ETensorType GetTemplatedType(T /*obj*/ ){ if (std::is_same::value) return ETensorType::FLOAT; - if (std::is_same::value) return ETensorType::UNINT8; + if (std::is_same::value) return ETensorType::UINT8; if (std::is_same::value) return ETensorType::INT8; if (std::is_same::value) return ETensorType::UINT16; if (std::is_same::value) return ETensorType::INT16; @@ -287,6 +336,12 @@ ETensorType GetTemplatedType(T /*obj*/ ){ } namespace UTILITY{ + + + +// clean operator and tensor names +std::string Clean_name(std::string input_tensor_name); + // Check if two shapes are equal bool AreSameShape(const std::vector&, const std::vector&); bool AreSameShape(const std::vector&, const std::vector&); @@ -296,10 +351,14 @@ bool AreSameShape(const std::vector&, const std::vector&); // Multidirectional broadcast a list of tensors to the same shape std::vector MultidirectionalBroadcastShape(std::vector>); -// Unidirectional broadcast two shapes to the same shape -std::vector UnidirectionalBroadcastShape(std::vector, std::vector); +// Multidirectional 
broadcast two shapes to the same shape + +std::pair> MultidirectionalBroadcastShape(std::vector &, std::vector &); +std::vector UnidirectionalBroadcastShape(std::vector &, std::vector &); + +std::pair> MultidirectionalBroadcastShape(std::vector &, std::vector &); + -std::string Clean_name(std::string input_tensor_name); template T* BroadcastConvBias(const T* data, const size_t channel, const std::vector& targetShape) { @@ -352,7 +411,7 @@ void BroadcastTensor(ConstContT data, const std::vector& shape, const st size_t targetLength = broadcastedData.size(); assert(ConvertShapeToLength(targetShape) == targetLength); // special case when broadcasting last dimensions (initial shapes must be the same) - if (shape.front() == targetShape.front() && shape.back() == 1 && size > 1) { + if (size > 1 && shape.front() == targetShape.front() && shape.back() == 1) { size_t bsize = targetShape.back(); // compute the size of the data to broadcast for (int k = int(size)-2; k >=0; k--) { @@ -419,6 +478,7 @@ T* CreateBroadcastTensor(const T* data, const std::vector& shape, const BroadcastTensor, std::span>(inData, shape, targetShape, bData); return broadcastedData; } + // Unidirectional broadcasting shape to targetShape// In unidirectional broadcast - only tensor B can have the shape changed not // tensor A - otherwise is a multidirectional broadcast template @@ -449,8 +509,6 @@ void UnidirectionalBroadcast(const T* data, const std::vector& shape, co } BroadcastTensor>(inData, shape, targetShape, broadcastedData); } -// specialization for vector of boolean -void UnidirectionalBroadcast(const std::vector & data, const std::vector& shape, const std::vector& targetShape, std::vector & broadcastedData); /// compute stride of a tensor given its shape (assume layout is row-major) std::vector ComputeStrideFromShape(const std::vector & shape); @@ -619,7 +677,15 @@ void col2im(const Dtype* data_col, const int channels, //std::cout << "finishing col2imp" << std::endl; } - +// Used at the end of 
infer() to fill the return object. +template +void FillOutput(T const *arr, std::vector &out, std::size_t n) +{ + out.resize(n); + for (std::size_t i = 0; i < n; ++i) { + out[i] = arr[i]; + } +} } // end namespace UTILITY @@ -631,20 +697,20 @@ extern "C" void sgemm_(const char * transa, const char * transb, const int * m, struct GNN_Data { - TMVA::Experimental::RTensor node_data; // the node feature data, tensor with shape (num_nodes, num_node_features) - TMVA::Experimental::RTensor edge_data; // the edge feature data, tensor with shape (num_edges, num_edge_features) - TMVA::Experimental::RTensor global_data; // the global features, tensor with shape (1, num_global_features) - TMVA::Experimental::RTensor edge_index; // the edge index (receivers and senders for each edge), tensor with shape (2, num_edges) + SOFIE::RTensor node_data; // the node feature data, tensor with shape (num_nodes, num_node_features) + SOFIE::RTensor edge_data; // the edge feature data, tensor with shape (num_edges, num_edge_features) + SOFIE::RTensor global_data; // the global features, tensor with shape (1, num_global_features) + SOFIE::RTensor edge_index; // the edge index (receivers and senders for each edge), tensor with shape (2, num_edges) // edge_index[0,:] are the receivers and edge_index[1,:] are the senders // need to have default constructor since RTensor has not one - GNN_Data(): node_data(TMVA::Experimental::RTensor({})), edge_data(TMVA::Experimental::RTensor({})), global_data(TMVA::Experimental::RTensor({})), edge_index(TMVA::Experimental::RTensor({})) {} + GNN_Data(): node_data(SOFIE::RTensor({})), edge_data(SOFIE::RTensor({})), global_data(SOFIE::RTensor({})), edge_index(SOFIE::RTensor({})) {} }; template -TMVA::Experimental::RTensor Concatenate( TMVA::Experimental::RTensor & t1, TMVA::Experimental::RTensor & t2, int axis = 0) +SOFIE::RTensor Concatenate( SOFIE::RTensor & t1, SOFIE::RTensor & t2, int axis = 0) { // concatenate tensor along axis. 
Shape must be the same except in the dimension of the concatenated axis if (t1.GetMemoryLayout() != t2.GetMemoryLayout()) @@ -659,8 +725,8 @@ TMVA::Experimental::RTensor Concatenate( TMVA::Experimental::RTensor & t1, } std::vector outShape = shape1; outShape[axis] = shape1[axis] + shape2[axis]; - TMVA::Experimental::RTensor tout(outShape, t1.GetMemoryLayout()); - if (t1.GetMemoryLayout() == TMVA::Experimental::MemoryLayout::ColumnMajor) { + SOFIE::RTensor tout(outShape, t1.GetMemoryLayout()); + if (t1.GetMemoryLayout() == SOFIE::MemoryLayout::ColumnMajor) { throw std::runtime_error("TMVA RTensor Concatenate is not yet supported for column major tensors"); } @@ -693,10 +759,10 @@ inline GNN_Data Concatenate(GNN_Data & data1, GNN_Data & data2, int axis = 0) { inline GNN_Data Copy(const GNN_Data & data) { GNN_Data out; - out.node_data = TMVA::Experimental::RTensor(data.node_data.GetShape()); - out.edge_data = TMVA::Experimental::RTensor(data.edge_data.GetShape()); - out.global_data = TMVA::Experimental::RTensor(data.global_data.GetShape()); - out.edge_index = TMVA::Experimental::RTensor(data.edge_index.GetShape()); + out.node_data = SOFIE::RTensor(data.node_data.GetShape()); + out.edge_data = SOFIE::RTensor(data.edge_data.GetShape()); + out.global_data = SOFIE::RTensor(data.global_data.GetShape()); + out.edge_index = SOFIE::RTensor(data.edge_index.GetShape()); std::copy(data.node_data.GetData(), data.node_data.GetData()+ data.node_data.GetSize(), out.node_data.GetData()); std::copy(data.edge_data.GetData(), data.edge_data.GetData()+ data.edge_data.GetSize(), out.edge_data.GetData()); std::copy(data.global_data.GetData(), data.global_data.GetData()+ data.global_data.GetSize(), out.global_data.GetData()); @@ -704,6 +770,45 @@ inline GNN_Data Copy(const GNN_Data & data) { return out; } -}//SOFIE +inline void Gemm_Call(float *output, bool transa, bool transb, int m, int n, int k, float alpha, const float *A, + const float *B, float beta, const float *C) +{ + char ct = 
't'; + char cn = 'n'; + const int *lda = transa ? &k : &m; + const int *ldb = transb ? &n : &k; + const int *ldc = &m; + if (C != nullptr) { + std::copy(C, C + m * n, output); + } + SOFIE::BLAS::sgemm_(transa ? &ct : &cn, transb ? &ct : &cn, &m, &n, &k, &alpha, A, lda, B, ldb, + &beta, output, ldc); +} + +template +void ReadTensorFromStream(std::istream &is, T &target, std::string const &expectedName, std::size_t expectedLength) +{ + std::string name; + std::size_t length; + is >> name >> length; + if (name != expectedName) { + std::string err_msg = + "TMVA-SOFIE failed to read the correct tensor name; expected name is " + expectedName + " , read " + name; + throw std::runtime_error(err_msg); + } + if (length != expectedLength) { + std::string err_msg = "TMVA-SOFIE failed to read the correct tensor size; expected size is " + + std::to_string(expectedLength) + " , read " + std::to_string(length); + throw std::runtime_error(err_msg); + } + for (size_t i = 0; i < length; ++i) { + is >> target[i]; + } + if (is.fail()) { + throw std::runtime_error("TMVA-SOFIE failed to read the values for tensor " + expectedName); + } +} + +} // namespace SOFIE -#endif //TMVA_SOFIE_RMODEL +#endif //SOFIE_COMMON diff --git a/src/SOFIE_core/src/RModel.cxx b/src/SOFIE_core/src/RModel.cxx index e5495ed..0eab8d1 100644 --- a/src/SOFIE_core/src/RModel.cxx +++ b/src/SOFIE_core/src/RModel.cxx @@ -4,55 +4,21 @@ #include #include +#ifdef SOFIE_SUPPORT_ROOT_BINARY #include "TFile.h" +#endif #include "SOFIE/RModel.hxx" #include "SOFIE/SOFIE_common.hxx" - namespace SOFIE { -std::underlying_type_t operator|(Options opA, Options opB) { - return static_cast>(opA) | static_cast>(opB); -} -std::underlying_type_t operator|(std::underlying_type_t opA, Options opB) { - return opA | static_cast>(opB); -} - -RModel::RModel(RModel&& other) { - fInputTensorInfos = std::move(other.fInputTensorInfos); - fReadyInputTensorInfos = std::move(other.fReadyInputTensorInfos); - fOutputTensorNames = 
other.fOutputTensorNames; - fInputTensorNames = other.fInputTensorNames; - fOperators = std::move(other.fOperators); - fInitializedTensors = std::move(other.fInitializedTensors); - fIntermediateTensorInfos = std::move(other.fIntermediateTensorInfos); - fName = other.fName; - fFileName = other.fFileName; - fParseTime = other.fParseTime; - fGC = other.fGC; - fNeededBlasRoutines = other.fNeededBlasRoutines; - fNeededStdLib = other.fNeededStdLib; +namespace { +const std::string SP = " "; } -RModel& RModel::operator=(RModel&& other) { - fInputTensorInfos = std::move(other.fInputTensorInfos); - fReadyInputTensorInfos = std::move(other.fReadyInputTensorInfos); - fOutputTensorNames = other.fOutputTensorNames; - fInputTensorNames = other.fInputTensorNames; - fOperators = std::move(other.fOperators); - fInitializedTensors = std::move(other.fInitializedTensors); - fIntermediateTensorInfos = std::move(other.fIntermediateTensorInfos); - fName = other.fName; - fFileName = other.fFileName; - fParseTime = other.fParseTime; - fGC = other.fGC; - fNeededBlasRoutines = other.fNeededBlasRoutines; - fNeededStdLib = other.fNeededStdLib; - return *this; -} -const std::vector& RModel::GetTensorShape(std::string name) const { +const std::vector& RModel::GetTensorShape(const std::string & name) const { auto f = fReadyInputTensorInfos.find(name); if (f != fReadyInputTensorInfos.end()) { return f->second.shape; @@ -69,6 +35,16 @@ const std::vector& RModel::GetTensorShape(std::string name) const { if (f4 != fIntermediateTensorInfos.end()) { return f4->second.shape; } + // case of shape tensors + auto f5 = fShapeTensors.find(name); + if (f5 != fShapeTensors.end()) { + // shape is vector of size 1 with size of shape values or just a scalar + if (f5->second.second) // check scalar flag + return std::vector{}; + else + return std::vector{f5->second.first.size()}; + } + if (fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) throw std::runtime_error("TMVA SOFIE tensor [" + name + "] is a 
dynamic tensor. Use GetDynamicTensorShape instead of GetTensorShape"); @@ -78,7 +54,7 @@ const std::vector& RModel::GetTensorShape(std::string name) const { throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the shape is requested is not found"); } -std::vector RModel::GetDynamicTensorShape(std::string name) const { +std::vector RModel::GetDimTensorShape(const std::string & name) const { if (auto f = fDynamicTensorInfos.find(name); f != fDynamicTensorInfos.end()) { return f->second.shape; } @@ -89,8 +65,21 @@ std::vector RModel::GetDynamicTensorShape(std::string name) const { // for this we need to return the vector by value return ConvertShapeToDim(GetTensorShape(name)); } +std::vector RModel::GetDynamicTensorShape(const std::string & name) const { + if (auto f = fDynamicTensorInfos.find(name); f != fDynamicTensorInfos.end()) { + return f->second.shape; + } + if (auto f = fInputTensorInfos.find(name); f != fInputTensorInfos.end()) { + return f->second.shape; + } + // throw error if shape is not dynamic + if (!IsDynamicTensor(name)) + throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the shape is requested is not dynamic"); + + throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the shape is requested is not found"); +} -const ETensorType& RModel::GetTensorType(std::string name) const { +const ETensorType& RModel::GetTensorType(const std::string & name) const { auto f = fReadyInputTensorInfos.find(name); if (f != fReadyInputTensorInfos.end()) { return f->second.type; @@ -111,6 +100,10 @@ const ETensorType& RModel::GetTensorType(std::string name) const { if (f5 != fDynamicTensorInfos.end()){ return f5->second.type; } + // case of shape tensor type is INT64 + if (fShapeTensors.find(name) != fShapeTensors.end()){ + return ETensorType::INT64; + } if (fIsSubGraph && fParentGraph) return fParentGraph->GetTensorType(name); @@ -124,6 +117,7 @@ bool RModel::CheckIfTensorAlreadyExist(std::string tensor_name) { if 
(fInitializedTensors.find(tensor_name) != fInitializedTensors.end()) return true; if (fIntermediateTensorInfos.find(tensor_name) != fIntermediateTensorInfos.end()) return true; if (fDynamicTensorInfos.find(tensor_name) != fDynamicTensorInfos.end()) return true; + if (fShapeTensors.find(tensor_name) != fShapeTensors.end()) return true; if (fIsSubGraph && fParentGraph) return fParentGraph->CheckIfTensorAlreadyExist(tensor_name); return false; } @@ -192,16 +186,34 @@ void RModel::AddConstantTensor(std::string tensor_name, ETensorType type, std::v tensor_name = UTILITY::Clean_name(tensor_name); //NB: own data if (CheckIfTensorAlreadyExist(tensor_name)) { - throw std::runtime_error("TMVA-SOFIE: initialized tensor with name " + tensor_name + " already exists \n"); + throw std::runtime_error("TMVA-SOFIE: constant tensor with name " + tensor_name + " already exists \n"); } InitializedTensor new_tensor {type, shape, data, true}; // add here flag to specify is a constant tensor fInitializedTensors[tensor_name] = new_tensor; } +void RModel::AddShapeTensor(const std::string & name, const std::vector & shape_values, bool scalar){ + auto tensor_name = UTILITY::Clean_name(name); + if (fShapeTensors.count(tensor_name) != 0) { + throw std::runtime_error("TMVA-SOFIE: shape tensor with name " + tensor_name + " already exists \n"); + } + fShapeTensors[tensor_name] = std::make_pair(shape_values, scalar); +} + +bool RModel::IsShapeTensor(const std::string & tensor_name) const { + return fShapeTensors.count(tensor_name) != 0; +} + +const std::vector & RModel::GetShapeTensorValues(const std::string & tensor_name) const { + //if (!IsShapeTensor(tensor_name) ) return std::vector{}; + return fShapeTensors.at(tensor_name).first; +} + bool RModel::IsInitializedTensor(const std::string& tensorName) const { std::string name = UTILITY::Clean_name(tensorName); return fInitializedTensors.find(name) != fInitializedTensors.end(); } + bool RModel::IsConstantTensor(const std::string& tensorName) const 
{ std::string name = UTILITY::Clean_name(tensorName); auto itr = fInitializedTensors.find(name); @@ -209,9 +221,11 @@ bool RModel::IsConstantTensor(const std::string& tensorName) const { return itr->second.IsConstantTensor(); } +// dynamic tensors include also Dim input tensors bool RModel::IsDynamicTensor(const std::string& tensorName) const { std::string name = UTILITY::Clean_name(tensorName); - return fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end(); + bool ret = fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end(); + return (ret) ? true : IsDimInputTensor(tensorName); } bool RModel::IsDimInputTensor(const std::string& tensorName) const { std::string name = UTILITY::Clean_name(tensorName); @@ -250,17 +264,21 @@ void RModel::AddDynamicTensor(std::string tensor_name, ETensorType type, std::ve // store shape parameter if not existing for (auto &d : shape) { if (d.isParam) { - if (fShapeParams.count(d.param) == 0) { - // case parameter is an expression of some other existing parameter, no need to - // register it - if (d.dim != size_t(-1)) { - fShapeParams[d.param] = std::to_string(d.dim); - } + if (d.dim != size_t(-1)) { + AddShapeParam(d.param, d.dim); } } } } +void RModel::AddShapeParam(const std::string & param, size_t default_value) { + if (fShapeParams.count(param) == 0) { + fShapeParams[param] = std::to_string(default_value); + // add also in the vector list (used to keep the order) + fDimShapeNames.push_back(param); + } +} + void RModel::AddOutputTensorNameList(std::vector outputtensornames) { fOutputTensorNames.clear(); for(auto& it : outputtensornames) { @@ -301,100 +319,180 @@ void RModel::SetNotWritableInitializedTensor(const std::string & tensor_name) { t->second.SetNotWritable(); } -std::string RModel:: AllocateIntermediateMemory(std::span op_output_tensors) { +std::string RModel::AllocateIntermediateMemory(std::span op_output_tensors) +{ + std::stringstream code; - std::string memory_allocation_string = ""; - bool allocated; + if 
(fVerbose) { + std::cout << "Total chunks allocated\n"; + for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) { + std::cout << "..... chunk " << chunk->first << " size " << chunk->second.tensor_size << " " << chunk->second.tensor_name << std::endl; + } + } - for (auto& it : op_output_tensors) { - allocated = false; - if (GetTensorType(std::string(it)) == ETensorType::BOOL || - fInitializedTensors.find(std::string(it)) != fInitializedTensors.end() || - fDynamicTensorInfos.find(std::string(it)) != fDynamicTensorInfos.end()) continue; + auto declareIntermediateTensor = [this, &code](std::string const &name, size_t size, size_t location) { + std::string typeName = ConvertTypeToString(GetTensorType(name)); + code << "\n // Allocating memory for intermediate tensor " << name << " with size " << size << " bytes"; + code << "\n" + << typeName << "* tensor_" << name << " = reinterpret_cast<" << typeName + << "*>(fIntermediateMemoryPool.data() + " << location << ");\n"; + }; + + if (fVerbose) std::cout << "*** AllocateIntermediateMemory: Loop on op output tensors\n"; + // order output tensors by size + std::vector ordered_output_tensors; + + for (auto &it : op_output_tensors) { + auto name = std::string(it); + if (GetTensorType(name) == ETensorType::BOOL || fInitializedTensors.find(name) != fInitializedTensors.end() || + fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) + continue; + + auto tensor_size = GetTypeSize(GetTensorType(name)) * ConvertShapeToLength(GetTensorShape(name)); + // important fill the pair in the ordered output tensors with the string view and not the string + TensorMemoryInfo tmi = {it, tensor_size}; + ordered_output_tensors.push_back(tmi); + } + std::sort(ordered_output_tensors.begin(), ordered_output_tensors.end(), + [](const TensorMemoryInfo &a, const TensorMemoryInfo &b) { return a.tensor_size > b.tensor_size; }); - auto tensor_size = 
GetTypeSize(GetTensorType(std::string(it))) * ConvertShapeToLength(GetTensorShape(std::string(it))); - memory_allocation_string += "\n // Allocating memory for intermediate tensor " + std::string(it) + " with size " + std::to_string(tensor_size) + " bytes"; + for (auto &it : ordered_output_tensors) { + bool allocated = false; + std::string name = std::string{it.tensor_name}; + size_t tensor_size = it.tensor_size; + if (fVerbose) + std::cout << "output tensor " << name << " size " << tensor_size << std::endl; - for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); chunk != fIntermediateMemoryInfo.available_stack.end(); ) { + for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); + chunk != fIntermediateMemoryInfo.available_stack.end();) { - // check if available memory chunks can accommodate the tensor - if (chunk->second >= tensor_size) { - auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it, tensor_size); - auto new_chunk_location = chunk->first+chunk->second-tensor_size; - fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk; + if (fVerbose) std::cout << ".. available chunk " << chunk->first << " with size = " << chunk->second; + // check if available memory chunks can accommodate the tensor + if (chunk->second >= tensor_size) { + // need to use here string_view (i.e it.tensor_name) + // split returns the new chunk with size of new tensor. 
The free chunk is before the used one + auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it.tensor_name, tensor_size); + auto new_chunk_location = chunk->first + chunk->second - tensor_size; + fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk; - memory_allocation_string += "\n" + ConvertTypeToString(GetTensorType(std::string(it))) + - "* tensor_" + std::string(it) + - " = reinterpret_cast<"+ConvertTypeToString(GetTensorType(std::string(it)))+"*>(fIntermediateMemoryPool + " + std::to_string(new_chunk_location) + ");\n"; - chunk->second -= tensor_size; + declareIntermediateTensor(name, tensor_size, new_chunk_location); + chunk->second -= tensor_size; - allocated = true; + allocated = true; - if (chunk->second == 0) { - chunk = fIntermediateMemoryInfo.available_stack.erase(chunk); - } + if (fVerbose) std::cout << " is re-used and split in a new of size " << new_chunk.tensor_size << " at " << new_chunk_location; - break; - } - ++chunk; + if (chunk->second == 0) { + if (fVerbose) std::cout << " and deleted since size matches"; + fIntermediateMemoryInfo.available_stack.erase(chunk); } + if (fVerbose) std::cout << std::endl; + break; + } else if (chunk->first == fIntermediateMemoryInfo.available_stack.rbegin()->first && + fIntermediateMemoryInfo.total_stack.rbegin()->first == chunk->first) { + // case last available chunk is the last in the memory, we can increase that one + fIntermediateMemoryInfo.total_stack[chunk->first] = {it.tensor_name, tensor_size}; + declareIntermediateTensor(name, tensor_size, chunk->first); + fIntermediateMemoryInfo.available_stack.erase(chunk); + allocated = true; + if (fVerbose) std::cout << " is extended with a bigger one of size " << tensor_size << std::endl; + break; + } + ++chunk; + if (fVerbose) std::cout << std::endl; + } - if (!allocated) { - size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty() - ? 
0 - : fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size; + if (!allocated) { + size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty() + ? 0 + : fIntermediateMemoryInfo.total_stack.rbegin()->first + + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size; - fIntermediateMemoryInfo.total_stack[chunk_idx] = - { - it, - tensor_size - }; + fIntermediateMemoryInfo.total_stack[chunk_idx] = it; - memory_allocation_string += "\n"+ConvertTypeToString(GetTensorType(std::string(it)))+"* tensor_"+ std::string(it) + "= reinterpret_cast<"+ConvertTypeToString(GetTensorType(std::string(it)))+"*>(fIntermediateMemoryPool + " + std::to_string(chunk_idx) + ");\n"; - } + declareIntermediateTensor(name, tensor_size, chunk_idx); + + if (fVerbose) std::cout << "no chunk available - add in total stack a new chunk with size of tensor and idx : " << chunk_idx + << std::endl; + } } - return memory_allocation_string; + return code.str(); } -void RModel::CheckAndFlushIntermediateMemory(std::span op_input_tensors, const size_t& op_idx){ - for (auto &it : op_input_tensors){ +void RModel::CheckAndFlushIntermediateMemory(std::span op_input_tensors, const size_t& op_idx){ + if (fVerbose) std::cout << "*** CheckAndFlushIntermediateMemory: Loop on input tensors for op " << op_idx << "\n"; + //print available chunks + if (fVerbose) std::cout << "available chunks before freeing them : \n"; + for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); + chunk != fIntermediateMemoryInfo.available_stack.end(); chunk++) { + if (fVerbose) std::cout << "-- free chunk " << chunk->first << " size = " << chunk->second << std::endl; + } + for (auto &it : op_input_tensors) { // last occurence of the tensor is reached => flush it from memory + if (fVerbose) std::cout << ".. 
input tensors : " << it; if (fIntermediateTensorFrequencyLookup[it] == op_idx) { + if (fVerbose) std::cout << " flash condition is met - looping on chunks to find matching one \n"; for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); - chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk ) { - if (chunk->second.tensor_name == it) { - - // check if nearby chunks in available memory can coalesce - auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound(chunk->first); // smallest element greater than the flushed chunk idx - auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin()) ? fIntermediateMemoryInfo.available_stack.end() : std::prev(first_greater); // largest element smaller than the flushed chunk idx - - // check if the next stack entry is actually adjacent in memory - if (last_smaller->first+last_smaller->second + 1 == chunk->first){ - last_smaller->second += chunk->second.tensor_size; - fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second); - - if (last_smaller->first + last_smaller->second + 1 == first_greater->first){ - fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]); - first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater); - } - } else{ - if (chunk->first + chunk->second.tensor_size + 1 == first_greater->first){ - fIntermediateMemoryInfo.total_stack[chunk->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]); - first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater); - } - fIntermediateMemoryInfo.available_stack.insert({ - chunk->first, - chunk->second.tensor_size - }); - } + chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) { + if (fVerbose) std::cout << "--- chunk " << chunk->first << " , " << chunk->second.tensor_name << " size " << chunk->second.tensor_size; + if (chunk->second.tensor_name == it) { + if (fVerbose) 
std::cout << " -- Found chunk corresponding to input tensor: " << chunk->first; + // check if nearby chunks in available memory can coalesce + auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound( + chunk->first); // smallest element greater than the flushed chunk idx + auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin()) + ? fIntermediateMemoryInfo.available_stack.end() + : std::prev(first_greater); // largest element smaller than the flushed chunk idx + + // check if the next stack entry is actually adjacent in memory + + if (last_smaller != fIntermediateMemoryInfo.available_stack.end() && + last_smaller->first + last_smaller->second == chunk->first) { + // merge chunk with previous one + last_smaller->second += chunk->second.tensor_size; + fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second); + if (fVerbose) std::cout << " is adjacent in memory with previous one - merge "; + if (first_greater != fIntermediateMemoryInfo.available_stack.end() && + last_smaller->first + last_smaller->second == first_greater->first) { + // merge also with following one + last_smaller->second += first_greater->second; + fIntermediateMemoryInfo.total_stack[last_smaller->first].merge( + fIntermediateMemoryInfo.total_stack[first_greater->first]); + // delete merged one in available stack and in total stack + fIntermediateMemoryInfo.total_stack.erase(first_greater->first); + fIntermediateMemoryInfo.available_stack.erase(first_greater); + if (fVerbose) std::cout << " merge also with following that is free "; + } + fIntermediateMemoryInfo.total_stack.erase(chunk->first); + if (fVerbose) std::cout << std::endl; + break; + } else if (first_greater != fIntermediateMemoryInfo.available_stack.end() && + chunk->first + chunk->second.tensor_size == first_greater->first) { + // merge with first greater + if (fVerbose) std::cout << " is adjacent in memory with following one - merge \n"; + // cannot modify idx of 
first_greter. Insert a new one and delete previous one + size_t new_size = chunk->second.tensor_size + first_greater->second; + size_t first_greater_idx = first_greater->first; + fIntermediateMemoryInfo.available_stack.erase(first_greater); + // cannot use anymore first_greater + fIntermediateMemoryInfo.available_stack.insert({chunk->first, new_size}); + fIntermediateMemoryInfo.total_stack[chunk->first].merge( + fIntermediateMemoryInfo.total_stack[first_greater_idx]); + fIntermediateMemoryInfo.total_stack.erase(first_greater_idx); + } else { + fIntermediateMemoryInfo.available_stack.insert({chunk->first, chunk->second.tensor_size}); + if (fVerbose) std::cout << " insert in the available stack the chunk with size " << chunk->second.tensor_size << std::endl; } + chunk->second.tensor_name = "free"; + break; + } } + } else { + if (fVerbose) std::cout << std::endl; } } } - - void RModel::Initialize(int batchSize, bool verbose) { std::map inputParams; if (batchSize > 0) { @@ -442,7 +540,7 @@ void RModel::Initialize(const std::map & inputParams, bool auto shape = ConvertShapeToInt(input.second.shape); if (verbose) std::cout << "converting input shape for " << input.first << " " << ConvertShapeToString(shape) << " from " - << ConvertDynamicShapeToString(input.second.shape) << std::endl; + << ConvertShapeToString(input.second.shape) << std::endl; if (!shape.empty()) { // case shape is defined (not parametric) we add the tensor in the fReadyInputTensorInfos map and // we remove the tensor from the fInputTensorInfo where th eold parametric shape was stored @@ -456,8 +554,12 @@ void RModel::Initialize(const std::map & inputParams, bool else { // store the found parametric shape parameters for (auto &d : input.second.shape) { - if (d.isParam) - fShapeParams[d.param] = std::to_string(d.dim); + if (d.isParam) { + if (fShapeParams.count(d.param) == 0) { + fDimShapeNames.push_back(d.param); + fShapeParams[d.param] = std::to_string(d.dim); + } + } } } } @@ -492,10 +594,11 @@ void 
RModel::Initialize(const std::map & inputParams, bool } fOperators[op_idx]->Initialize(*this); for(auto &it:fOperators[op_idx]->GetOpOutputTensors()){ + std::string name = std::string{it}; if (fIntermediateTensorFrequencyLookup.find(it) == fIntermediateTensorFrequencyLookup.end() && - std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), std::string(it)) == fOutputTensorNames.end() && - fInitializedTensors.find(std::string(it)) == fInitializedTensors.end() && - fDynamicTensorInfos.find(std::string(it)) == fDynamicTensorInfos.end()){ + std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) == fOutputTensorNames.end() && + fInitializedTensors.find(name) == fInitializedTensors.end() && + fDynamicTensorInfos.find(name) == fDynamicTensorInfos.end()){ fIntermediateTensorFrequencyLookup[it] = op_idx; } } @@ -534,43 +637,6 @@ void RModel::InitializeSubGraph(std::shared_ptr graph) { } -// Function to generate the code for declaring and initializing constant tensors -// This is for tensors which are not part of weight files and can be created from the Constant operator -template -std::string GenerateConstantTensorCode(const std::pair &t) -{ - std::stringstream strs; - std::string type = ConvertTypeToString(t.second.type()); - size_t length = ConvertShapeToLength(t.second.shape()); - // avoid using stack sizes for constant tensors to reduce compilation time - bool allocateOnStack = (length > 100) ? 
false : true; - - const T *data = t.second.data(); - - // and check if all values are the same - bool sameData = false; - // for non stack allocation check if data are the same - if (!allocateOnStack && length > 1) { - size_t idx = 1; - do { - sameData = (data[idx] == data[idx - 1]); - idx++; - } while (sameData && idx < length); - } - if (allocateOnStack) { - strs << type << " tensor_" << t.first << "[" << length << "] = " << ConvertValuesToString(length, data) << ";\n"; - } else { - strs << "std::vector<" << type << "> fTensor_" << t.first << " = "; - if (sameData) - strs << "std::vector<" << type << ">(" << length << ", " << ConvertValToString(data[0]) << ");\n"; - else { - strs << ConvertValuesToString(length, data) << ";\n"; - } - strs << "const " << type << " * tensor_" + t.first + " = fTensor_" + t.first + ".data();\n"; - } - return strs.str(); -} - void RModel::GenerateInitializedTensorInfo() { if (!fInitializedTensors.empty()) @@ -578,10 +644,13 @@ void RModel::GenerateInitializedTensorInfo() for (auto &i : fInitializedTensors) { if (!fUseWeightFile || i.second.IsConstantTensor()) { - if (i.second.type() == ETensorType::FLOAT) + if (i.second.type() == ETensorType::FLOAT) { fGC += GenerateConstantTensorCode(i); - else if (i.second.type() == ETensorType::INT64) + fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 4; + } else if (i.second.type() == ETensorType::INT64) { fGC += GenerateConstantTensorCode(i); + fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 8; + } } else { // case of tensors which are read from a file @@ -589,43 +658,55 @@ void RModel::GenerateInitializedTensorInfo() if (i.second.type() == ETensorType::FLOAT) { fGC += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; fGC += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + fWeightsTensorSize += ConvertShapeToLength(i.second.shape()) * 4; } } } } void RModel::GenerateIntermediateMemoryPool() { - if 
(fIntermediateMemoryInfo.total_stack.size() == 0) return; + if (fIntermediateMemoryInfo.total_stack.empty()) return; fGC += "\n//--- Allocating session memory pool to be used for allocating intermediate tensors\n"; // char memory block is allocated since char takes 1 byte, thus easier to allocate tensors // of other data types - fGC += "char* fIntermediateMemoryPool = new char[" + std::to_string(fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size)+ "];\n\n"; + auto const &totalStack = fIntermediateMemoryInfo.total_stack; + const size_t memPoolSize = totalStack.rbegin()->first + totalStack.rbegin()->second.tensor_size; + fGC += "std::vector fIntermediateMemoryPool = std::vector(" + std::to_string(memPoolSize) + ");\n\n"; } void RModel::GenerateIntermediateTensorInfo() { if (!fIntermediateTensorInfos.empty()) { std::string tensor_declaration_block = ""; - for (auto &i : fIntermediateTensorInfos) { if (i.second.type == ETensorType::BOOL) { - tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; - // No pointer allocation needed for BOOL + tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; + tensor_declaration_block += "std::uint8_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + continue; } - if (fIntermediateTensorFrequencyLookup.find(i.first) == fIntermediateTensorFrequencyLookup.end() && std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) == fOutputTensorNames.end()) { + bool is_extended = (fOptimizationLevel == OptimizationLevel::kExtended); + bool not_in_freq_map = + (fIntermediateTensorFrequencyLookup.find(i.first) == fIntermediateTensorFrequencyLookup.end()); + bool not_in_output_names = + (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) == 
fOutputTensorNames.end()); + + if ((not_in_freq_map && not_in_output_names) || (!not_in_freq_map && !is_extended && not_in_output_names)) { size_t length = ConvertShapeToLength(i.second.shape); if (i.second.type == ETensorType::FLOAT) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; tensor_declaration_block += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 4 * length; } else if (i.second.type == ETensorType::DOUBLE) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; tensor_declaration_block += "double * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 8 * length; } else if (i.second.type == ETensorType::INT64) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; tensor_declaration_block += "int64_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 8 * length; } } } @@ -664,17 +745,17 @@ void RModel::GenerateOperatorDeclarations() { fGC += "\n"; } -void RModel::GenerateDynamicTensorInfo() { - fGC += "//---- allocate the intermediate dynamic tensors\n"; - std::stringstream out; - for (auto & i: fDynamicTensorInfos) { - auto length = ConvertDynamicShapeToLength(i.second.shape); - out << SP << "if (" << length << " > 0) {\n"; - out << SP << SP << "fTensor_" << i.first << ".resize(" << length << ");\n"; - out << SP << SP << "tensor_" << i.first << " = fTensor_" << i.first << ".data();\n"; - out << SP << "}\n"; - } - fGC += out.str(); +void RModel::GenerateDynamicTensorInfo() +{ + std::stringstream out; + for (auto &i : fDynamicTensorInfos) { + auto length = ConvertDynamicShapeToLength(i.second.shape); + out << SP << "if (" << length << " > 0) {\n"; + out << SP << SP << "fTensor_" << i.first << ".resize(" << length << ");\n"; + out << SP << SP << "tensor_" << 
i.first << " = fTensor_" << i.first << ".data();\n"; + out << SP << "}\n"; + } + fGC += out.str(); } std::string RModel::GenerateInferSignature(bool isdecl) { @@ -702,7 +783,7 @@ std::string RModel::GenerateInferSignature(bool isdecl) { if (type == "other") throw std::runtime_error("TMVA-SOFIE: input tensor " + name + " is of a data type which is not yet supported."); - rGC += type + "* "; + rGC += type + " const* "; } rGC += "tensor_" + name + ","; i_input++; @@ -714,94 +795,82 @@ std::string RModel::GenerateInferSignature(bool isdecl) { namespace { -std::string createOutputTensor(RModel const &rmodel, std::string const &name, bool isIntermediateTensor) -{ - if(name.empty()) return "{}"; - ETensorType eOutputType = rmodel.GetTensorType(name); - std::string outputType = ConvertTypeToString(eOutputType); - if (isIntermediateTensor) { - - if (eOutputType == ETensorType::BOOL) { - return "fTensor_" + name; - } else { - // need to check is size is the same(don't want to return a vector with larger size) - // in that case better to copy - return "std::vector<" + ConvertTypeToString(eOutputType) + ">(tensor_" + name + ", tensor_" + name + " + " + - std::to_string(ConvertShapeToLength(rmodel.GetTensorShape(name))) + ")"; - } - } - // include also dynamic tensors since the vectors can be allocated with a size larger than their output - // we need a special handling for bool type allocated as vector - auto outputLength = ConvertDynamicShapeToLength(rmodel.GetDynamicTensorShape(name)); - if (rmodel.IsDynamicTensor(name) && eOutputType == ETensorType::BOOL) { - return "std::vector(fTensor_" + name + ".begin(), fTensor_" + name + ".begin() + " + outputLength + ")"; - } - return "std::vector<" + outputType + ">(tensor_" + name + ", tensor_" + name + " + " + outputLength + ")"; +std::string typeForOutput(ETensorType t) { + // The std::vector is a special type that is not wrapping continuous memory. + // We don't want to use it as a return type. 
+ if (t == ETensorType::BOOL) t = ETensorType::UINT8; + return ConvertTypeToString(t); } -} // namespace - -void RModel::GenerateOutput() { - - if (fVerbose) - std::cout << "Generating main inference code for " << fName << std::endl; +} +void RModel::GenerateOutput() +{ size_t outputSize = fOutputTensorNames.size(); // assume output types are all the same - if (outputSize == 0) - throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); bool sameOutputTypes = true; std::string inferReturnType; // type return by infer function - ETensorType eOutputType = GetTensorType(*fOutputTensorNames.begin()); - std::string outputType = ConvertTypeToString(eOutputType); + ETensorType eFirstOutputType = GetTensorType(*fOutputTensorNames.begin()); fGC += "\n\n"; if (outputSize == 1) { - fGC += "std::vector<" + outputType + ">"; + fGC += "std::vector<" + typeForOutput(eFirstOutputType) + ">"; } else { // if all output types are the same we return an std::vector - otherwise a tuple - for (size_t i = 1; i < outputSize; i++) { - if (GetTensorType(fOutputTensorNames[i]) != eOutputType) + for (std::string const &name : fOutputTensorNames) { + if (GetTensorType(name) != eFirstOutputType) sameOutputTypes = false; } if (sameOutputTypes) - fGC += "std::vector>"; + fGC += "std::vector>"; else { inferReturnType = "std::tuple<"; for (size_t i = 0; i < outputSize; i++) { - inferReturnType += "std::vector<" + ConvertTypeToString(GetTensorType(fOutputTensorNames[i])) + ">"; - if (i < outputSize-1) inferReturnType += ","; + inferReturnType += "std::vector<" + typeForOutput(GetTensorType(fOutputTensorNames[i])) + ">"; + if (i < outputSize - 1) + inferReturnType += ","; } inferReturnType += ">"; fGC += inferReturnType; } } - fGC += " infer("; - - fGC += GenerateInferSignature(); - - fGC += "){\n"; + fGC += " infer(" + GenerateInferSignature() + "){\n"; - for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { - if (fVerbose) std::cout << "Generating code for operator .... 
" << op_idx << std::endl; - fGC += (fOperators[op_idx]->Generate(std::to_string(op_idx))); + std::string doInferArgs = GenerateInferSignature(false); + if (!doInferArgs.empty()) + doInferArgs += ","; + for (std::string const &name : fOutputTensorNames) { + fGC += SP + "std::vector<" + typeForOutput(GetTensorType(name)) + " > output_tensor_" + name + ";\n"; + doInferArgs += " output_tensor_" + name + ","; } + if (!doInferArgs.empty()) + doInferArgs.back() = ' '; + + fGC += SP + "doInfer(" + doInferArgs + ");\n"; fGC += SP + "return {"; - for (size_t i = 0; i < outputSize; i++) { - std::string tensorName = *(fOutputTensorNames.begin() + i); - bool isIntermediate = fIntermediateTensorInfos.count(tensorName) > 0; - fGC += createOutputTensor(*this, tensorName, isIntermediate); - if (i < outputSize - 1) + for (size_t i = 0; i < fOutputTensorNames.size(); i++) { + fGC += "output_tensor_" + fOutputTensorNames[i]; + if (i < fOutputTensorNames.size() - 1) fGC += ","; } fGC += "};\n"; - fGC += "}\n"; // end of infer function scope + fGC += "}\n"; // end of infer function scope } void RModel::GenerateSessionCode() { + // Determine the signature of the actual inference function + std::string doInferSignature = GenerateInferSignature(); + if (!doInferSignature.empty()) + doInferSignature += ", "; + for (auto const &name : fOutputTensorNames) { + doInferSignature += " std::vector<" + typeForOutput(GetTensorType(name)) + "> &output_tensor_" + name + ","; + } + doInferSignature.back() = ' '; + + doInferSignature = "void doInfer(" + doInferSignature + ")"; // define the Session struct (for GNN this is generated in RModel_GNN) if (fUseSession && !fIsGNNComponent) { @@ -814,24 +883,31 @@ void RModel::GenerateSessionCode() // generate code for declaring the initialized tensors GenerateInitializedTensorInfo(); - // evaluate total intermediate memory and position intermediate tensor addresses - std::string intermediate_memory_alloc_string = ""; - intermediate_memory_alloc_string += "\n// 
--- Positioning intermediate tensor memory --"; - for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { - intermediate_memory_alloc_string += AllocateIntermediateMemory(fOperators[op_idx]->GetOpOutputTensors()); - CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx); - } + if (fOptimizationLevel == OptimizationLevel::kExtended) { + // evaluate total intermediate memory and position intermediate tensor addresses + std::string intermediate_memory_alloc_string = ""; + intermediate_memory_alloc_string += "\n// --- Positioning intermediate tensor memory --"; + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) { + auto op = fOperators[op_idx].get(); + std::cout << "\n******************\n analyzing input/output operator " << op_idx << " " + << typeid(*op).name() << std::endl; + } + intermediate_memory_alloc_string += AllocateIntermediateMemory(fOperators[op_idx]->GetOpOutputTensors()); + CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx); + } - // to check remaining unused fragments after memory allocation (lesser the better) - // for (const auto &it: fIntermediateMemoryInfo.available_stack){ - // std::cout<<"chunk_idx: "<Generate(std::to_string(op_idx))); + } + + fGC += SP + "using SOFIE::UTILITY::FillOutput;\n\n"; + + for (std::string const &name : fOutputTensorNames) { + // need to check is size is the same (don't want to return a vector with + // larger size) in that case better to copy + bool isIntermediate = fIntermediateTensorInfos.count(name) > 0; + std::string n = isIntermediate ? 
std::to_string(ConvertShapeToLength(GetTensorShape(name))) + : ConvertDimShapeToLength(GetDimTensorShape(name)); + fGC += SP + "FillOutput(tensor_" + name + ", output_tensor_" + name + ", " + n + ");\n"; + } + + fGC += "}\n\n"; + + // generate the inference overload that returns an output struct GenerateOutput(); // end of session if (fUseSession && !fIsGNNComponent) { - fGC += "}; // end of Session\n"; + fGC += "}; // end of Session\n\n"; } } @@ -982,8 +1088,7 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { fGC += " f.seekg(" + std::to_string(pos) + ");\n"; } - fGC += " std::string tensor_name;\n"; - fGC += " size_t length;\n"; + fGC += " using SOFIE::ReadTensorFromStream;\n"; // loop on tensors and parse the file for (auto& i: fInitializedTensors) { @@ -991,25 +1096,8 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { if (!i.second.IsWeightTensor()) continue; std::string tensor_name = "tensor_" + i.first; if (i.second.type() == ETensorType::FLOAT) { - size_t length = 1; - length = ConvertShapeToLength(i.second.shape()); - std::string slength = std::to_string(length); - fGC += " f >> tensor_name >> length;\n"; - fGC += " if (tensor_name != \"" + tensor_name + "\" ) {\n"; - fGC += " std::string err_msg = \"TMVA-SOFIE failed to read the correct tensor name; expected name is " + - tensor_name + " , read \" + tensor_name;\n"; - fGC += " throw std::runtime_error(err_msg);\n"; - fGC += " }\n"; - fGC += " if (length != " + slength + ") {\n"; - fGC += " std::string err_msg = \"TMVA-SOFIE failed to read the correct tensor size; expected size is " + - slength + " , read \" + std::to_string(length) ;\n"; - fGC += " throw std::runtime_error(err_msg);\n"; - fGC += " }\n"; - fGC += " for (size_t i = 0; i < length; ++i)\n"; - fGC += " f >> " + tensor_name + "[i];\n"; - fGC += " if (f.fail()) {\n"; - fGC += " throw std::runtime_error(\"TMVA-SOFIE failed to read the values for tensor " + tensor_name + "\");\n"; - fGC += " }\n"; + std::string length = 
std::to_string(ConvertShapeToLength(i.second.shape())); + fGC += " ReadTensorFromStream(f, " + tensor_name + ", \"" + tensor_name + "\", " + length + ");\n"; } else { std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a file"); } @@ -1019,6 +1107,7 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { // generate the code to read initialized tensors from a ROOT data file if(fWeightFile == WeightFileType::RootBinary) { +#ifdef SOFIE_SUPPORT_ROOT_BINARY fGC += " {\n"; fGC += " std::unique_ptr rootFile(TFile::Open(filename.c_str(), \"READ\"));\n"; fGC += " if (!rootFile->IsOpen()) {\n"; @@ -1050,6 +1139,9 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { fGC += " }\n"; } fGC += " }\n"; +#else + throw std::runtime_error("SOFIE was not built with ROOT file support."); +#endif // SOFIE_SUPPORT_ROOT_BINARY } } @@ -1075,6 +1167,7 @@ long RModel::WriteInitializedTensorsToFile(std::string filename) { // Write the initialized tensors to the file if (fWeightFile == WeightFileType::RootBinary) { +#ifdef SOFIE_SUPPORT_ROOT_BINARY if(fIsGNNComponent || fIsGNN) { throw std::runtime_error("SOFIE-GNN yet not supports writing to a ROOT file."); } @@ -1118,6 +1211,9 @@ long RModel::WriteInitializedTensorsToFile(std::string filename) { // this needs to be changed, similar to the text file return -1; +#else + throw std::runtime_error("SOFIE was not built with ROOT file support."); +#endif // SOFIE_SUPPORT_ROOT_BINARY } else if (fWeightFile == WeightFileType::Text) { std::ofstream f; if(fIsGNNComponent) { @@ -1244,9 +1340,9 @@ void RModel::PrintOutputTensors() { for (auto& it: fOutputTensorNames) { std::cout << "Tensor name: \"" << it << "\"\t"; if (!IsDynamicTensor(it)) - std::cout << "shape: " << ConvertShapeToString(GetTensorShape(it)) << std::endl; - else - std::cout << "shape: " << ConvertDynamicShapeToString(GetDynamicTensorShape(it)) << std::endl; + std::cout << "shape: " << 
ConvertShapeToString(GetTensorShape(it)) << std::endl; + else + std::cout << "shape: " << ConvertShapeToString(GetDynamicTensorShape(it)) << std::endl; } std::cout << "\n"; } @@ -1312,13 +1408,13 @@ void RModel::OutputGenerated(std::string filename, bool append) { void RModel::Streamer(TBuffer &R__b) { if (R__b.IsReading()) { RModel::Class()->ReadBuffer(R__b, this); - for(auto i=RModel::fInitializedTensors.begin(); i!=RModel::fInitializedTensors.end(); ++i) { - i->second.CastPersistentToShared(); + for (auto & i : fInitializedTensors) { + i.second.CastPersistentToShared(); } } else { - for(auto i=RModel::fInitializedTensors.begin(); i!=RModel::fInitializedTensors.end(); ++i) { - i->second.CastSharedToPersistent(); + for (auto & i : fInitializedTensors) { + i.second.CastSharedToPersistent(); } RModel::Class()->WriteBuffer(R__b, this); } diff --git a/src/SOFIE_core/src/RModel_ALPAKA.cxx b/src/SOFIE_core/src/RModel_ALPAKA.cxx new file mode 100644 index 0000000..9ff300a --- /dev/null +++ b/src/SOFIE_core/src/RModel_ALPAKA.cxx @@ -0,0 +1,403 @@ +#include +#include +#include +#include +#include + +#include "TFile.h" +#include "SOFIE/RModel.hxx" +#include "SOFIE/SOFIE_common.hxx" + +namespace SOFIE { + +void RModel::GenerateInitializedTensorInfo_GPU_ALPAKA() { + if (!fInitializedTensors.empty()){ + fGC += "\n// initialized tensors for weights\n"; + } + + for (auto &i : fInitializedTensors) { + if (!fUseWeightFile || i.second.IsConstantTensor()) { + if (i.second.type() == ETensorType::FLOAT) + fGC += GenerateConstantTensorCode(i); + else if (i.second.type() == ETensorType::INT64) + fGC += GenerateConstantTensorCode(i); + + } else { + // case of tensors which are read from a file + size_t length = ConvertShapeToLength(i.second.shape()); + if (i.second.type() == ETensorType::FLOAT) { + fGC += "BufF1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } + } + } +} + +void 
RModel::GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA() +{ + if (!fInitializedTensors.empty()) + fGC += "// temporary initialized tensors for loading weights\n"; + + for (auto &i : fInitializedTensors) { + if (!fUseWeightFile || i.second.IsConstantTensor()) { + if (i.second.type() == ETensorType::FLOAT) + fGC += GenerateConstantTensorCode(i); + else if (i.second.type() == ETensorType::INT64) + fGC += GenerateConstantTensorCode(i); + + } else { + // case of tensors which are read from a file + size_t length = ConvertShapeToLength(i.second.shape()); + if (i.second.type() == ETensorType::FLOAT) { + fGC += "std::vector tensor_" + i.first + "(" + std::to_string(length) + ");\n"; + } + } + } +} + +void RModel::GenerateGPU_ALPAKA_Buffers() { + if (!fIntermediateTensorInfos.empty()) { + std::string tensor_declaration_block = ""; + + for (auto &i : fIntermediateTensorInfos) { + if (i.second.type == ETensorType::BOOL) { + tensor_declaration_block += "std::vector fTensor_" + i.first + + " = std::vector(" + + std::to_string(ConvertShapeToLength(i.second.shape)) + + ");\n"; + // No pointer allocation needed for BOOL + } + + size_t length = ConvertShapeToLength(i.second.shape); + + if (i.second.type == ETensorType::FLOAT) { + tensor_declaration_block += "BufF1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type == ETensorType::DOUBLE) { + tensor_declaration_block += "BufD1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type == ETensorType::INT64) { + tensor_declaration_block += "BufI641D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } + } + + if (tensor_declaration_block.length()) { + fGC += "\n//--- declare and allocate the intermediate tensors\n" + tensor_declaration_block; + } + } + + // add also the dynamic tensors (only 
declarations, allocation will be done later) + if (!fDynamicTensorInfos.empty()) { + fGC += "//--- declare the dynamic tensors\n"; + fGC += "using bufDev_float = alpaka::Buf, size_t>;\n"; + fGC += "using bufDev_double = alpaka::Buf, size_t>;\n"; + fGC += "using bufDev_int64 = alpaka::Buf, size_t>;\n"; + + for (auto &i : fDynamicTensorInfos) { + if (i.second.type == ETensorType::FLOAT) { + fGC += "bufDev_float bufDev_" + i.first + ";\n"; + } else if (i.second.type == ETensorType::DOUBLE) { + fGC += "bufDev_double bufDev_" + i.first + ";\n"; + } else if (i.second.type == ETensorType::INT64) { + fGC += "bufDev_int64 bufDev_" + i.first + ";\n"; + } + } + } +} + +void RModel::GenerateDynamicTensorInfo_GPU_ALPAKA() { + fGC += "//---- allocate the intermediate dynamic tensors\n"; + std::stringstream out; + + for (auto &i : fDynamicTensorInfos) { + auto length = ConvertDynamicShapeToLength(i.second.shape); + out << SP << "if (" << length << " > 0) {\n"; + out << "auto bufDev_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" << length << "}));\n"; + out << SP << "}\n"; + } + fGC += out.str(); +} + +// only supports BufF1D buffer data types for now +std::string RModel::GenerateInferSignature_GPU_ALPAKA(bool isdecl) { + // generate the infer signature given the inputs: eg. "BufF1D const deviceBuf_A, BufF1D const deviceBuf_B" + // if (decl = false) generate only calling signature (deviceBuf_A, deviceBuf_B, ....) 
+ std::string rGC; + std::unordered_map inputParams; + int i_input = 0; + for (auto &name : fInputTensorNames) { + // if is a dynamic tensor pass initial parameters + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + std::string pName = d.param; + // need to check if the input parameters is already existing in another input tensor + if (d.isParam && inputParams.count(pName) == 0) { + if (isdecl) rGC += "size_t "; + rGC += d.param + ","; + inputParams[pName] = i_input; + } + } + } + if (isdecl) { + std::string type = "BufF1D"; + if (type == "other") + throw std::runtime_error("TMVA-SOFIE: input tensor " + name + + " is of a data type which is not yet supported."); + rGC += type + " const "; + } + rGC += "deviceBuf_" + name + ","; + i_input++; + } + + if (fInputTensorNames.size() > 0) rGC.pop_back();// remove last "," + return rGC; +} + +void RModel::GenerateOutput_GPU_ALPAKA() { + if (fVerbose) + std::cout << "Generating main inference code for " << fName << std::endl; + + size_t outputSize = fOutputTensorNames.size(); + if (outputSize == 0) + throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); + + bool sameOutputTypes = true; + std::string inferReturnType; + ETensorType eOutputType = GetTensorType(*fOutputTensorNames.begin()); + std::string outputType = ConvertTypeToString(eOutputType); + + fGC += "\n\n"; + if (outputSize == 1) { + fGC += "alpaka::Buf"; + } else { + throw std::runtime_error("TMVA-SOFIE: multiple output tensors are not supported in ALPAKA code generation"); + } + + fGC += " infer("; + fGC += GenerateInferSignature_GPU_ALPAKA(); + fGC += "){\n"; + + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) + std::cout << "Generating code for operator .... 
" << op_idx << std::endl; + fGC += (fOperators[op_idx]->Generate_GPU_ALPAKA(std::to_string(op_idx))); + } + + fGC += "\n\n alpaka::wait(queue);\n"; + fGC += SP + "return "; + if (outputSize>1) fGC += " {"; + for (size_t i = 0; i < outputSize; i++) { + std::string tensorName = *(fOutputTensorNames.begin() + i); + bool isIntermediate = fIntermediateTensorInfos.count(tensorName) > 0; + fGC += "deviceBuf_"+tensorName; + if (i < outputSize - 1) + fGC += ","; + } + if (outputSize>1) fGC += " };\n"; + else fGC += ";\n"; + fGC += "}\n"; // end of infer function scope +} + +void RModel::GenerateSessionCode_GPU_ALPAKA() { + + std::set registered_operators; + + fGC += "\n//--- ALPAKA Kernels\n"; + for (size_t id = 0; id < fOperators.size(); id++) { + if(registered_operators.find(fOperators[id]->GetKind()) == registered_operators.end()) { + + if (fVerbose) + std::cout<<"Generating ALPAKA kernel for operator"<< toString(fOperators[id]->GetKind()) << std::endl; + + fGC += fOperators[id]->Generate_GPU_Kernel_ALPAKA(std::to_string(id)); + registered_operators.insert(fOperators[id]->GetKind()); + } + } + + // define the Session struct (for GNN this is generated in RModel_GNN) + fGC += "\n\ntemplate \n"; + if (fUseSession) { + if (!fIsSubGraph) + fGC += "struct Session {\n\n"; + else + fGC += "struct Session_" + fName + " {\n\n"; + } + + // define host and device accelerators + fGC += "using Idx = std::size_t;\n"; + fGC += "using Dim = alpaka::DimInt<1>;\n"; + fGC += "using Acc = alpaka::TagToAcc;\n"; + fGC += "using DevAcc = alpaka::Dev;\n\n"; + fGC += "using QueueProperty = alpaka::NonBlocking;\n"; + fGC += "using QueueAcc = alpaka::Queue;\n\n"; + fGC += "using BufF1D = alpaka::Buf;\n"; + fGC += "using BufD1D = alpaka::Buf;\n"; + fGC += "using BufI641D = alpaka::Buf;\n\n"; + + fGC += "\nalpaka::Platform const platform{};\n"; + fGC += "DevAcc devAcc = alpaka::getDevByIdx(platform, 0);\n"; + fGC += "alpaka::PlatformCpu platformHost{};\n"; + fGC += "alpaka::DevCpu hostAcc = 
alpaka::getDevByIdx(platformHost, 0);\n"; + fGC += "QueueAcc queue{devAcc};\n"; + fGC += "Idx threadsPerBlock = 256;\n"; + fGC += "\nusing Ext1D = alpaka::Vec;\n"; + fGC += "using Vec = alpaka::Vec;\n"; + if (registered_operators.find(SOFIE::OperatorKind::GEMM) != registered_operators.end()) { + fGC += "\n\n// BLAS declarations\n"; + fGC += "sofieBLAS blas{queue};\n"; + } + + GenerateInitializedTensorInfo_GPU_ALPAKA(); + GenerateGPU_ALPAKA_Buffers(); + GenerateOperatorDeclarations(); + + // add subgraph session + if (!fSubGraphs.empty()) + fGC += "// subgraph sessions\n"; + for (auto &graph : fSubGraphs) { + fGC += "Session_" + graph->fName + " fSession_" + graph->fName + ";\n"; + } + + // Session constructor + if (fUseSession) { + std::string sessionName = "\n\nSession"; + if (fIsSubGraph) + sessionName += "_" + fName; + + if (fUseWeightFile) { + std::string fileName = fName; + if (fWeightFile == WeightFileType::Text) + fileName += ".dat"; + if (fWeightFile == WeightFileType::RootBinary) + fileName += ".root"; + + fGC += sessionName + "(std::string filename =\"" + fileName + "\""; + } else { + fGC += sessionName + "(std::string = \"\""; + } + + if (!fShapeParams.empty()) { + for (auto &p : fShapeParams) { + fGC += ",\n"; + fGC += " size_t " + p.first + " = " + p.second; + } + } + fGC += ") {\n"; + + GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA(); + if (fUseWeightFile) { + fGC += "\n//--- reading weights from file\n"; + ReadInitializedTensorsFromFile(0); + fGC += "\n"; + } + + MoveInitializedTensorsToBuffers_ALPAKA(); + GenerateDynamicTensorInfo_GPU_ALPAKA(); + + for (size_t id = 0; id < fOperators.size(); id++) { + fGC += fOperators[id]->GenerateInitCode_GPU_ALPAKA(); + if (fOperators[id]->GetKind() == OperatorKind::GEMM){ + fGC += "\nblas.AddLayoutConfig("+fOperators[id]->GetBlasConfig()+");\n"; + } + } + + fGC += "\nalpaka::wait(queue);\n"; + fGC += "}\n\n"; + } + + registered_operators.clear(); + for (size_t id = 0; id < fOperators.size(); id++) { + 
if(registered_operators.find(fOperators[id]->GetKind()) == registered_operators.end()) { + + if (fVerbose) + std::cout<<"Declaring ALPAKA kernel for operator"<< toString(fOperators[id]->GetKind())<Generate_GPU_Kernel_Definitions_ALPAKA(std::to_string(id)); + registered_operators.insert(fOperators[id]->GetKind()); + } + } + + GenerateOutput_GPU_ALPAKA(); + + if (fUseSession && !fIsGNNComponent) { + fGC += "}; // end of Session\n"; + } +} + +void RModel::GenerateGPU_ALPAKA(std::underlying_type_t options, int batchSize, bool verbose) { + fVerbose = verbose; + fBatchSize = batchSize; + + if (static_cast>(Options::kNoSession) & options) { + fUseSession = false; + fWeightFile = WeightFileType::None; + } + if (static_cast>(Options::kNoWeightFile) & options) { + fUseWeightFile = false; + fWeightFile = WeightFileType::None; + } + if (static_cast>(Options::kRootBinaryWeightFile) & options) { + fUseWeightFile = true; + fWeightFile = WeightFileType::RootBinary; + } + if (fUseWeightFile && !fUseSession) { + throw std::runtime_error( + "TMVA-SOFIE: RModel::Generate: cannot use a separate weight file without generating a Session class"); + } + + if (static_cast>(Options::kGNN) & options || + static_cast>(Options::kGNNComponent) & options) + throw std::runtime_error("SOFIE GPU does not yet supports GNN Inference."); + + Initialize(batchSize, verbose); + + std::string hgname; + if (!fIsSubGraph) { + fGC.clear(); + GenerateHeaderInfo_GPU_ALPAKA(hgname); + } + + if (fVerbose) + std::cout << "generate Main session code - model " << fName << std::endl; + + GenerateSessionCode_GPU_ALPAKA(); + + if (!fIsSubGraph) { + fGC += ("} //SOFIE_" + fName + "\n"); + fGC += "\n#endif // " + hgname + "\n"; + } +} + +void RModel::MoveInitializedTensorsToBuffers_ALPAKA(){ + for (auto &i : fInitializedTensors) { + // skip Constant and shape tensors + if (!i.second.IsWeightTensor()) continue; + std::string tensor_name = "tensor_" + i.first; + auto length = ConvertShapeToLength(i.second.shape()); + 
std::string slength = std::to_string(length); + if (i.second.type() == ETensorType::FLOAT) { + fGC += " auto hostBuf_"+i.first+" = alpaka::allocBuf(hostAcc, Ext1D::all(Idx{"+ slength+"}));\n"; + fGC += " std::memcpy(alpaka::getPtrNative(hostBuf_"+i.first+"), tensor_"+i.first+".data(), "+slength+"* sizeof(float));\n"; + fGC += " alpaka::memcpy(queue, deviceBuf_"+i.first+", hostBuf_"+i.first+");\n"; + } else if (i.second.type() == ETensorType::DOUBLE) { + fGC += " auto hostBuf_"+i.first+" = alpaka::allocBuf(hostAcc, Ext1D::all(Idx{"+ slength+"}));\n"; + fGC += " std::memcpy(alpaka::getPtrNative(hostBuf_"+i.first+"), tensor_"+i.first+".data(), "+slength+"* sizeof(double));\n"; + fGC += " alpaka::memcpy(queue, deviceBuf_"+i.first+", hostBuf_"+i.first+");\n"; + } else if (i.second.type() == ETensorType::INT64) { + fGC += " auto hostBuf_"+i.first+" = alpaka::allocBuf(hostAcc, Ext1D::all(Idx{" + slength + "}));\n"; + fGC += " std::memcpy(alpaka::getPtrNative(hostBuf_"+i.first+"), tensor_"+i.first+".data(), "+slength+"* sizeof(int64_t));"; + fGC += " alpaka::memcpy(queue, deviceBuf_"+i.first+", hostBuf_"+i.first+");\n"; + } else { + std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a ROOT file"); + } + } + } + +} // namespace SOFIE diff --git a/src/SOFIE_core/src/RModel_Base.cxx b/src/SOFIE_core/src/RModel_Base.cxx index d4d1f1c..f212c53 100644 --- a/src/SOFIE_core/src/RModel_Base.cxx +++ b/src/SOFIE_core/src/RModel_Base.cxx @@ -58,6 +58,38 @@ void RModel_Base::GenerateHeaderInfo(std::string& hgname) { } } +void RModel_Base::GenerateHeaderInfo_GPU_ALPAKA(std::string& hgname) { + fGC += ("//Code generated automatically by TMVA for GPU Inference using ALPAKA of Model file [" + fFileName + "] at [" + fParseTime.substr(0, fParseTime.length()-1) +"] \n"); + // add header guards + hgname = fName; + std::transform(hgname.begin(), hgname.end(), hgname.begin(), [](unsigned char c) { + return 
std::toupper(c); + } ); + hgname = "SOFIE_" + hgname; + fGC += "\n#ifndef " + hgname + "\n"; + fGC += "#define " + hgname + "\n\n"; + for (auto& i: fNeededStdLib) { + fGC += "#include <" + i + ">\n"; + } + for (auto& i: fCustomOpHeaders) { + fGC += "#include \"" + i + "\"\n"; + } + fGC += "#include \n"; + fGC += "#include \n"; + + // for the session we need to include SOFIE_Common functions + //needed for convolution operator (need to add a flag) + fGC += "#include \"SOFIE/SOFIE_common.hxx\"\n"; + if (fUseWeightFile) + fGC += "#include \n"; + // Include TFile when saving the weights in a binary ROOT file + if (fWeightFile == WeightFileType::RootBinary) + fGC += "#include \"TFile.h\"\n"; + + fGC += "\nusing Dim1D = alpaka::DimInt<1>;\n"; + fGC += "\nnamespace SOFIE_" + fName + "{\n"; +} + void RModel_Base::OutputGenerated(std::string filename, bool append) { // the model can be appended only if a file name is provided if (filename.empty()) { diff --git a/src/SOFIE_core/src/RModel_GNN.cxx b/src/SOFIE_core/src/RModel_GNN.cxx index a1dfe06..3dae254 100644 --- a/src/SOFIE_core/src/RModel_GNN.cxx +++ b/src/SOFIE_core/src/RModel_GNN.cxx @@ -94,7 +94,7 @@ void RModel_GNN::Generate() { // the number of output edges features can be smaller, so we need to correct here auto num_edge_features_input = num_edge_features; - auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDynamicTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDimTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!edges_update_output_shape[1].isParam && edges_update_output_shape[1].dim != num_edge_features_input) { num_edge_features = edges_update_output_shape[1].dim; } @@ -117,7 +117,7 @@ void RModel_GNN::Generate() { // we need to correct the output number of node features auto num_node_features_input = num_node_features; - auto 
nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDynamicTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDimTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!nodes_update_output_shape[1].isParam && nodes_update_output_shape[1].dim != num_node_features_input) { num_node_features = nodes_update_output_shape[1].dim; } diff --git a/src/SOFIE_core/src/RModel_GraphIndependent.cxx b/src/SOFIE_core/src/RModel_GraphIndependent.cxx index bab06b3..cd62d0c 100644 --- a/src/SOFIE_core/src/RModel_GraphIndependent.cxx +++ b/src/SOFIE_core/src/RModel_GraphIndependent.cxx @@ -81,7 +81,7 @@ void RModel_GraphIndependent::Generate() { // the number of output edges features can be smaller, so we need to correct here // assume num_edge_features is not a parametric shape - auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDynamicTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDimTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!edges_update_output_shape[1].isParam && edges_update_output_shape[1].dim != num_edge_features_input) { num_edge_features = edges_update_output_shape[1].dim; } @@ -100,7 +100,7 @@ void RModel_GraphIndependent::Generate() { fGC+="};\n}\n"; // we need to correct the output number of node features - auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDynamicTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDimTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!nodes_update_output_shape[1].isParam && nodes_update_output_shape[1].dim != num_node_features_input) { num_node_features 
= nodes_update_output_shape[1].dim; } @@ -119,7 +119,7 @@ void RModel_GraphIndependent::Generate() { // we need to correct the output number of global features // global features are in shape[1] #if 0 - auto globals_update_output_shape = globals_update_block->GetFunctionBlock()->GetDynamicTensorShape(globals_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto globals_update_output_shape = globals_update_block->GetFunctionBlock()->GetDimTensorShape(globals_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!globals_update_output_shape[1].isParam && globals_update_output_shape[1].dim != num_global_features_input) { num_global_features = globals_update_output_shape[1].dim; } diff --git a/src/SOFIE_core/src/SOFIE_common.cxx b/src/SOFIE_core/src/SOFIE_common.cxx index ad74313..05f873b 100644 --- a/src/SOFIE_core/src/SOFIE_common.cxx +++ b/src/SOFIE_core/src/SOFIE_common.cxx @@ -1,15 +1,16 @@ #include "SOFIE/SOFIE_common.hxx" -#include + +#include #include #include +#include - -namespace SOFIE{ +namespace SOFIE { /// @brief Convert shape from integer format to dynamic one (based on Dim) /// @param shape /// @return shape based on Dim -std::vector ConvertShapeToDim(std::vector shape){ +std::vector ConvertShapeToDim(const std::vector & shape){ std::vector ret_shape(shape.size()); for (size_t i =0; i < shape.size(); i++){ ret_shape[i].dim = shape[i]; @@ -20,7 +21,7 @@ std::vector ConvertShapeToDim(std::vector shape){ /// @brief Convert shape based on Dim to integer format /// @param shape /// @return shape based on integer. 
Return an empty shape in case shape is dynamic (has a parameter) -std::vector ConvertShapeToInt(std::vector shape){ +std::vector ConvertShapeToInt(const std::vector & shape){ std::vector ret_shape(shape.size()); for (size_t i =0; i < shape.size(); i++){ if (shape[i].isParam) { @@ -45,19 +46,14 @@ std::vector ConvertShapeToInt(std::vector shape){ return ret_shape; } - -std::size_t ConvertShapeToLength(std::vector shape){ - // Empty shape represent scalar values, so we return a length=1 - std::size_t fLength = 1; - for (auto& dim: shape) fLength *= dim; - return fLength; -} - std::string ConvertTypeToString(ETensorType type){ switch(type){ case ETensorType::FLOAT : { return "float"; } + case ETensorType::INT8 : { + return "int8_t"; + } case ETensorType::INT16 : { return "int16_t"; } @@ -67,6 +63,9 @@ std::string ConvertTypeToString(ETensorType type){ case ETensorType::INT64 : { return "int64_t"; } + case ETensorType::UINT8 : { + return "uint8_t"; + } case ETensorType::UINT16 : { return "uint16_t"; } @@ -106,7 +105,7 @@ ETensorType ConvertStringToType(std::string type){ } } -std::string ConvertShapeToString(std::vector shape) { +std::string ConvertShapeToString(const std::vector & shape) { std::stringstream out; out << "{ "; for (size_t i = 0; i < shape.size(); i++) { @@ -117,7 +116,7 @@ std::string ConvertShapeToString(std::vector shape) { return out.str(); } -std::string ConvertDynamicShapeToString(std::vector shape) { +std::string ConvertDimShapeToString(const std::vector & shape) { std::stringstream out; out << "{ "; for (size_t i = 0; i < shape.size(); i++) { @@ -128,10 +127,12 @@ std::string ConvertDynamicShapeToString(std::vector shape) { return out.str(); } -std::string ConvertDynamicShapeToLength(std::vector shape) { +std::string ConvertDimShapeToLength(const std::vector & shape) { // convert generic shape to a string // multiply all the integer specified dimensions of the shape std::string length; + // case of empty vectors return 1 + if (shape.empty()) 
return "1"; size_t int_length = 0; for (size_t i = 0; i < shape.size(); i++) { if (shape[i].isParam) { @@ -145,12 +146,24 @@ std::string ConvertDynamicShapeToLength(std::vector shape) { } } // multiply the integer components to the parametric one + // if larger than 1 if (int_length > 0) { - if (!length.empty()) length += " * "; - length += std::to_string(int_length); + if (!length.empty() && int_length > 1) { + length += " * "; + length += std::to_string(int_length); + } else if (length.empty()) { // case is full known shape + length = std::to_string(int_length); + } } return length; } +std::string ConvertShapeToString(const std::vector & shape) { + return ConvertDimShapeToString(shape); +} +std::string ConvertDynamicShapeToLength(const std::vector & shape) { + return ConvertDimShapeToLength(shape); +} + namespace{ template @@ -169,6 +182,12 @@ static inline void copy_vector_data(int_t no_of_copies, int_t input_size, T* inp } } +bool IsInteger(const std::string & s) { + int value; + auto [ptr, ec] = std::from_chars(s.data(), s.data() + s.size(), value); + return ec == std::errc() && ptr == s.data() + s.size(); +} + bool UTILITY::AreSameShape(const std::vector& shapeA, const std::vector& shapeB) { if (shapeA.size() != shapeB.size()) { return false; @@ -330,17 +349,24 @@ std::vector UTILITY::MultidirectionalBroadcastShape(std::vector UTILITY::UnidirectionalBroadcastShape(std::vector shapeA, std::vector shapeB) +// check multi-directional broadcasting of two shapes (need to pass inputs by non const ref. 
since we might prepends with one's +// return a pair of integer flag and new broadcasted shape +// if flag = 0: shape are identical +// flag = 1: return shape is equal to A, we broadcast B +// flag = 2: return shape is equal to B we broadcast A +// flag = 3: return shape is common of two we broadcast A and B to output +std::pair> UTILITY::MultidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) { size_t sizeA = shapeA.size(); size_t sizeB = shapeB.size(); // Check if A and B have the same shape if (UTILITY::AreSameShape(shapeA, shapeB)){ - return shapeA; + return std::make_pair(0, shapeA); } // Find the common shape of A and B size_t size = std::max(sizeA, sizeB); if (sizeA < size) { + // prepend 1's in A to make of same shape as B std::vector newShapeA(size, 1); size_t offset = size - sizeA; std::copy(shapeA.begin(), shapeA.end(), newShapeA.begin() + offset); @@ -359,36 +385,116 @@ std::vector UTILITY::UnidirectionalBroadcastShape(std::vector s break; } } + int broadcastFlag = 0; if (broadcastable) { // The output shape is max(outShape, targetShape) std::vector targetShape(size, 1); for (size_t i = 0; i < size; i++) { targetShape[i] = std::max(shapeA[i], shapeB[i]); + if (shapeB[i] < targetShape[i]) broadcastFlag |= 1; + if (shapeA[i] < targetShape[i]) broadcastFlag |= 2; } - return targetShape; + return std::make_pair(broadcastFlag, targetShape); } else { throw - std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting tensors of shape " + std::runtime_error("TMVA::SOFIE - Error multidirectional broadcasting tensors of shape " + + ConvertShapeToString(shapeA) + " and " + ConvertShapeToString(shapeB) + + " to a common shape."); + } +} +// unidirectional broadcast- only B changes +std::vector UTILITY::UnidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) +{ + auto ret = UTILITY::MultidirectionalBroadcastShape(shapeA, shapeB); + if (ret.first > 1) { + std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting 
tensors of shape " + ConvertShapeToString(shapeA) + " and " + ConvertShapeToString(shapeB) + " to a common shape."); } + return ret.second; } -// UNidirectional boradcast specializaiton for vector +// for broadcasting Dim shapes +// flag indicates also which vector needs to be broadcasted +// flag & 1 == 1 : broadcast B -> A +// flag & 2 == 2 : broadcast A -> B +// flag & 4 == 4 a run time check is needed on shapes with values +std::pair> UTILITY::MultidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) { + size_t sizeA = shapeA.size(); + size_t sizeB = shapeB.size(); + // Check if A and B have the same shape + if (UTILITY::AreSameShape(shapeA, shapeB)){ + return std::make_pair(0, shapeA); + } + // Find the common shape of A and B + size_t size = std::max(sizeA, sizeB); + if (sizeA < size) { + // prepend 1's in A to make of same shape as B + std::vector newShapeA(size, Dim{1}); + size_t offset = size - sizeA; + std::copy(shapeA.begin(), shapeA.end(), newShapeA.begin() + offset); + shapeA = std::move(newShapeA); + } + if (sizeB < size) { + std::vector newShapeB(size, Dim{1}); + size_t offset = size - sizeB; + std::copy(shapeB.begin(), shapeB.end(), newShapeB.begin() + offset); + shapeB = std::move(newShapeB); + } + + int broadcastFlag = 0; + // The output shape is targetShape + std::vector targetShape(size); + for (size_t i = 0; i < size; i++) { + // assume we broadcast to the parametric value + if (shapeA[i] == shapeB[i]) { + targetShape[i] = shapeA[i]; + } else if (shapeA[i].isParam && shapeB[i].GetVal() == "1" ) { + // broadcast B to A (case A is parametric with ) + targetShape[i] = shapeA[i]; + broadcastFlag |= 1; + } else if (shapeA[i].GetVal() == "1" && shapeB[i].isParam) { + // broadcast A to B + targetShape[i] = shapeB[i]; + broadcastFlag |= 2; + } else if (!shapeA[i].isParam && !shapeB[i].isParam) { + if (shapeB[i].dim == 1) { + targetShape[i] = shapeA[i]; + broadcastFlag |= 1; + } else if (shapeA[i].dim == 1) { + targetShape[i] = 
shapeB[i]; + broadcastFlag |= 2; + } else { + // non broadcastable case cannot have A and B two different defined shapes different than one + broadcastFlag = -1; + } + } else if (shapeA[i].isParam && shapeB[i].isParam) { + // full dynamic case - we will decided at run time + std::stringstream s; + s << "std::max(" << shapeA[i] << "," << shapeB[i] << ")"; + // use -1 for dim to indicate is an expression + targetShape[i] = Dim { s.str() , static_cast(-1)}; + broadcastFlag |= 4; + } else if (shapeA[i].isParam && !shapeB[i].isParam) { + // A -> B need to check at run time if consistent + targetShape[i] = shapeB[i]; + broadcastFlag |= 6; + } else if (!shapeA[i].isParam && shapeB[i].isParam) { + // B -> A need to check at run time if consistent + targetShape[i] = shapeA[i]; + broadcastFlag |= 5; + } else { + // all cases should be covered + throw std::runtime_error("TMVA::SOFIE - Fatal error in MultiDirectionalBroadCastDimShape"); + } + } + if (broadcastFlag == -1) { + throw std::runtime_error("TMVA::SOFIE - Error multidirectional broadcasting tensors of shape " + + ConvertDimShapeToString(shapeA) + " and " + ConvertDimShapeToString(shapeB) + + " to a common shape."); + } -// specialization for vector of boolean -void UTILITY::UnidirectionalBroadcast(const std::vector & data, const std::vector& shape, const std::vector& targetShape, std::vector & broadcastedData) - { - // Prepend shape with ones - auto ncdata = const_cast &>(data); - if (shape.size() < targetShape.size()) { - size_t targetSize = targetShape.size(); - std::vector newShape(targetSize, 1); - size_t offset = targetSize - shape.size(); - std::copy(shape.begin(), shape.end(), newShape.begin() + offset); - UTILITY::BroadcastTensor &, std::vector &>(ncdata, newShape, targetShape, broadcastedData); - } - UTILITY::BroadcastTensor &, std::vector &>(ncdata, shape, targetShape, broadcastedData); + return std::make_pair(broadcastFlag, targetShape); } std::string UTILITY::Clean_name(std::string input_tensor_name){ @@ 
-413,15 +519,22 @@ std::vector UTILITY::ComputeStrideFromShape(const std::vector & shape) // assume row major layout const auto size = shape.size(); std::vector strides(size); - strides[size-1] = Dim{1}; - for (std::size_t i = 1; i < size; i++) { - if (!shape[size-i].isParam && !strides[size-i].isParam) - strides[size - 1 - i] = Dim{strides[size-i].dim * shape[size-i].dim}; - else - strides[size - 1 - i] = Dim{std::string(strides[size-i].GetVal() + "*" + shape[size-i].GetVal())}; + if (size > 0) { + strides[size-1] = Dim{1}; + for (std::size_t i = 1; i < size; i++) { + if (!shape[size-i].isParam && !strides[size-i].isParam) + strides[size - 1 - i] = Dim{strides[size-i].dim * shape[size-i].dim}; + else { + if (strides[size-i].GetVal() == "1") + strides[size - 1 - i] = shape[size-i]; + else if (shape[size-i].GetVal() == "1") + strides[size - 1 - i] = strides[size-i]; + else + strides[size - 1 - i] = Dim{std::string(strides[size-i].GetVal() + "*" + shape[size-i].GetVal())}; + } + } } return strides; } - -}//SOFIE +} // namespace SOFIE diff --git a/src/SOFIE_core/test/CMakeLists.txt b/src/SOFIE_core/test/CMakeLists.txt index 34bb49f..76e5e29 100644 --- a/src/SOFIE_core/test/CMakeLists.txt +++ b/src/SOFIE_core/test/CMakeLists.txt @@ -9,123 +9,243 @@ # @author Federico Sossai, Sanjiban Sengupta ############################################################################ +cmake_minimum_required(VERSION 3.14) +include(FetchContent) + include_directories(${CMAKE_SOURCE_DIR}/src/SOFIE_core/inc) include_directories(${CMAKE_SOURCE_DIR}/src/SOFIE_parsers/inc) +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + if (NOT ONNX_MODELS_DIR) set(ONNX_MODELS_DIR input_models) endif() -# Finding .onnx files to be parsed and creating the appropriate code to -# parse all file. 
It is much faster to combine all parsing in a single executable -# which will avoid initialization time (especially when using ROOT) -set(CAPTURE_STR "EmitModel( \"@1\", \"@2\");") +option(ENABLE_ALPAKA_TESTS "Enable Alpaka-based SOFIE tests" OFF) +set(ALPAKA_BACKEND "cuda" + CACHE STRING "Alpaka backend to test (cuda, cpu, hip, sycl)") +set_property(CACHE ALPAKA_BACKEND PROPERTY STRINGS cuda cpu hip sycl) + +# String template used to produce calls to EmitModel(...) per file. +set(CAPTURE_STR +"try {\n\ + EmitModel(\"@1\", \"@2\");\n\ +} catch (const std::exception& e) {\n\ + std::string msg = e.what();\n\ + if (msg.find(\"multiple output tensors are not supported\") != std::string::npos) {\n\ + std::cerr << \"[SKIP] Multiple outputs are not supported for @1\" << std::endl;\n\ + } else {\n\ + std::cerr << \"[ERROR] Failed processing @1: \" << msg << std::endl;\n\ + failures++;\n\ + }\n\ +} catch (...) {\n\ + std::cerr << \"[ERROR] Unknown failure processing @1\" << std::endl;\n\ + failures++;\n\ +}\n\ +") + +# --- Collect ONNX files --- set(ALL_CAPTURES "") -# Finding .onnx files to be parsed and creating the appropriate command file(GLOB ONNX_FILES "${ONNX_MODELS_DIR}/*.onnx") + foreach(onnx_file ${ONNX_FILES}) get_filename_component(fname ${onnx_file} NAME_WE) - get_filename_component(fdir ${onnx_file} DIRECTORY) - string(REPLACE "@1" ${onnx_file} cap ${CAPTURE_STR}) - string(REPLACE "@2" ${fname} cap ${cap}) - list(APPEND ALL_CAPTURES ${cap}) + string(REPLACE "@1" "${onnx_file}" cap "${CAPTURE_STR}") + string(REPLACE "@2" "${fname}" cap "${cap}") + string(APPEND ALL_CAPTURES "${cap}") endforeach() -string(REPLACE ";" ";\n" EMIT_CAPTURES "${ALL_CAPTURES}") + +set(EMIT_CAPTURES "${ALL_CAPTURES}") + configure_file(EmitFromONNX.cxx.in EmitFromONNX_all.cxx @ONLY) -configure_file(EmitFromRoot.cxx.in EmitFromRoot_all.cxx @ONLY) +configure_file(EmitFromONNX_GPU_ALPAKA.cxx.in EmitFromONNX_GPU_ALPAKA_all.cxx @ONLY) + +# --- CPU emitter --- +# 
ROOTTEST_GENERATE_EXECUTABLE(emitFromONNX EmitFromONNX_all.cxx +# LIBRARIES protobuf::libprotobuf SOFIE_core SOFIE_parsers +# FIXTURES_SETUP sofie-compile-models-onnx-build) + +# target_compile_options(emitFromONNX PRIVATE -Wno-unused-parameter -Wno-array-bounds) + +# ROOTTEST_ADD_TEST(SofieCompileModels_ONNX +# COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromONNX +# FIXTURES_REQUIRED sofie-compile-models-onnx-build +# FIXTURES_SETUP sofie-compile-models-onnx +# ) + +# # --- Custom model tests --- +# if (BLAS_FOUND) +# ROOTTEST_GENERATE_EXECUTABLE(TestCustomModelsFromONNX TestCustomModelsFromONNX.cxx +# LIBRARIES +# MathCore +# SOFIE_core +# BLAS::BLAS +# GTest::gtest +# GTest::gtest_main +# FIXTURES_REQUIRED sofie-compile-models-onnx +# FIXTURES_SETUP sofie-test-models-onnx-build +# ) + +# target_include_directories(TestCustomModelsFromONNX PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + +# ROOTTEST_ADD_TEST(TestCustomModelsFromONNX +# EXEC ./TestCustomModelsFromONNX +# FIXTURES_REQUIRED sofie-test-models-onnx-build +# ) +# endif() + +# # --- Python-based generators --- +# ROOT_FIND_PYTHON_MODULE(torch) +# if (ROOT_TORCH_FOUND) +# configure_file(Conv1dModelGenerator.py Conv1dModelGenerator.py COPYONLY) +# configure_file(Conv2dModelGenerator.py Conv2dModelGenerator.py COPYONLY) +# configure_file(Conv3dModelGenerator.py Conv3dModelGenerator.py COPYONLY) +# configure_file(ConvTrans2dModelGenerator.py ConvTrans2dModelGenerator.py COPYONLY) +# configure_file(LinearModelGenerator.py LinearModelGenerator.py COPYONLY) +# configure_file(RecurrentModelGenerator.py RecurrentModelGenerator.py COPYONLY) + +# if (BLAS_FOUND) +# ROOT_ADD_GTEST(TestSofieModels TestSofieModels.cxx +# LIBRARIES SOFIE_core SOFIE_parsers BLAS::BLAS +# INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR} +# ) +# endif() +# endif() + +# ROOT_EXECUTABLE(emitGNN GNN/EmitGNN.cxx LIBRARIES SOFIE_core) +# ROOT_ADD_TEST(tmva-sofie-EmitGNN COMMAND emitGNN) + +# ROOT_EXECUTABLE(EmitGraphIndependent 
GNN/EmitGraphIndependent.cxx LIBRARIES SOFIE_core) +# ROOT_ADD_TEST(tmva-sofie-EmitGraphIndependent COMMAND EmitGraphIndependent) + +# ========================= +# ALPAKA TESTS +# ========================= +if (ENABLE_ALPAKA_TESTS) + + string(TOLOWER "${ALPAKA_BACKEND}" _alpaka_backend) + + if (NOT _alpaka_backend IN_LIST ALPAKA_BACKEND) + message(FATAL_ERROR + "Unsupported ALPAKA_BACKEND='${ALPAKA_BACKEND}'. " + "Valid values: cuda, cpu, hip, sycl") + endif() -ROOTTEST_GENERATE_EXECUTABLE(emitFromONNX EmitFromONNX_all.cxx - LIBRARIES protobuf::libprotobuf SOFIE_core SOFIE_parsers - FIXTURES_SETUP sofie-compile-models-onnx-build) + FetchContent_Declare( + sofieBLAS + GIT_REPOSITORY https://github.com/ML4EP/sofieBLAS + GIT_TAG edf2259876e9f4fb5a8f72db20b2dfb5dc26b517 + ) + FetchContent_MakeAvailable(sofieBLAS) -# silence protobuf warnings seen in version 3.0 and 3.6. Not needed from protobuf version 3.17 -target_compile_options(emitFromONNX PRIVATE -Wno-unused-parameter -Wno-array-bounds) + FetchContent_Declare( + alpaka + GIT_REPOSITORY https://github.com/alpaka-group/alpaka + GIT_TAG 2fa91a34ed11b2076e474c5507d920e85cf9b79d + ) + FetchContent_MakeAvailable(alpaka) -ROOTTEST_ADD_TEST(SofieCompileModels_ONNX - COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromONNX ${onnx_file} ${CMAKE_CURRENT_BINARY_DIR}/${fname} - FIXTURES_REQUIRED sofie-compile-models-onnx-build - FIXTURES_SETUP sofie-compile-models-onnx -) + # --- ALPAKA emitter --- + ROOTTEST_GENERATE_EXECUTABLE(emitFromONNXAlpaka EmitFromONNX_GPU_ALPAKA_all.cxx + LIBRARIES protobuf::libprotobuf SOFIE_core SOFIE_parsers + FIXTURES_SETUP sofie-compile-models-onnx-alpaka-build + ) -# Creating a Google Test -if (BLAS_FOUND) # we need BLAS for compiling the models - ROOTTEST_GENERATE_EXECUTABLE(TestCustomModelsFromONNX TestCustomModelsFromONNX.cxx - LIBRARIES - MathCore - SOFIE_core - BLAS::BLAS - GTest::gtest - GTest::gtest_main - FIXTURES_REQUIRED - sofie-compile-models-onnx - FIXTURES_SETUP - 
sofie-test-models-onnx-build + target_compile_options(emitFromONNXAlpaka PRIVATE -Wno-unused-parameter -Wno-array-bounds) + + if (ONNX_FILES) + foreach(onnx_file ${ONNX_FILES}) + get_filename_component(fname ${onnx_file} NAME_WE) + add_custom_command(TARGET emitFromONNXAlpaka POST_BUILD + COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 + $ + "${onnx_file}" "${CMAKE_CURRENT_BINARY_DIR}/${fname}" + COMMENT "Running ALPAKA emitter on ${onnx_file}") + endforeach() + endif() + + ROOTTEST_ADD_TEST(SofieCompileModels_ONNX_Alpaka + COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromONNXAlpaka + FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka-build + FIXTURES_SETUP sofie-compile-models-onnx-alpaka ) - target_include_directories(TestCustomModelsFromONNX PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) - ROOTTEST_ADD_TEST(TestCustomModelsFromONNX - EXEC ./TestCustomModelsFromONNX - FIXTURES_REQUIRED sofie-test-models-onnx-build) -endif() -# For testing serialisation of RModel object + set(CXXFLAGS -O2 -g -DALPAKA_HAS_STD_ATOMIC_REF) + set(CXX_HOST_FLAGS -fPIC -pthread) + + # ---- Backend selection ---- + if (_alpaka_backend STREQUAL "cuda") + message(STATUS "Enabling Alpaka CUDA tests") + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) + + set(CUDA_ARCH "sm_86") + set(CXX_CUDA_FLAGS + -arch=${CUDA_ARCH} + -Wno-deprecated-gpu-targets + --extended-lambda + --expt-relaxed-constexpr) + + set_source_files_properties( + TestCustomModelsFromONNXForAlpakaCuda.cxx + PROPERTIES LANGUAGE CUDA + ) + -ROOTTEST_GENERATE_EXECUTABLE(emitFromROOT EmitFromRoot_all.cxx - LIBRARIES protobuf::libprotobuf RIO SOFIE_core SOFIE_parsers - FIXTURES_SETUP sofie-compile-models-onnx-root -) -# silence protobuf warnings seen in version 3.0 and 3.6. 
Not needed from protobuf version 3.17 -target_compile_options(emitFromROOT PRIVATE -Wno-unused-parameter -Wno-array-bounds) - -# Automatic compilation of headers from root files -ROOTTEST_ADD_TEST(SofieCompileModels_ROOT - COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromROOT - FIXTURES_REQUIRED sofie-compile-models-onnx-root - FIXTURES_SETUP sofie-compile-models-root +ROOTTEST_GENERATE_EXECUTABLE( + TestCustomModelsFromONNXForAlpakaCuda + TestCustomModelsFromONNXForAlpakaCuda.cxx + LIBRARIES MathCore SOFIE_core GTest::gtest GTest::gtest_main + FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka + FIXTURES_SETUP sofie-test-models-onnx-alpaka-build ) -if (BLAS_FOUND) - # Creating a Google Test for Serialisation of RModel - ROOTTEST_GENERATE_EXECUTABLE(TestCustomModelsFromROOT TestCustomModelsFromROOT.cxx - LIBRARIES - SOFIE_core - BLAS::BLAS - GTest::gtest - GTest::gtest_main - FIXTURES_REQUIRED - sofie-compile-models-root - FIXTURES_SETUP - sofie-test-models-root-build - ) - target_include_directories(TestCustomModelsFromROOT PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) - ROOTTEST_ADD_TEST(TestCustomModelsFromROOT - EXEC ./TestCustomModelsFromROOT - FIXTURES_REQUIRED sofie-test-models-root-build) -endif() -# Look for needed Python modules -ROOT_FIND_PYTHON_MODULE(torch) -if (ROOT_TORCH_FOUND) - configure_file(Conv1dModelGenerator.py Conv1dModelGenerator.py COPYONLY) - configure_file(Conv2dModelGenerator.py Conv2dModelGenerator.py COPYONLY) - configure_file(Conv3dModelGenerator.py Conv3dModelGenerator.py COPYONLY) - configure_file(ConvTrans2dModelGenerator.py ConvTrans2dModelGenerator.py COPYONLY) - configure_file(LinearModelGenerator.py LinearModelGenerator.py COPYONLY) - configure_file(RecurrentModelGenerator.py RecurrentModelGenerator.py COPYONLY) - - if (BLAS_FOUND) - ROOT_ADD_GTEST(TestSofieModels TestSofieModels.cxx - LIBRARIES - SOFIE_core - SOFIE_parsers - BLAS::BLAS - INCLUDE_DIRS - ${CMAKE_CURRENT_BINARY_DIR} + target_include_directories( + 
TestCustomModelsFromONNXForAlpakaCuda PRIVATE + ${CMAKE_CURRENT_BINARY_DIR} + ${alpaka_SOURCE_DIR}/include + ${SOFIE_INCLUDE} + ${sofieblas_SOURCE_DIR}/include + ${ROOT_INCLUDE_DIRS} + ${CUDA_BASE}/include + ${CMAKE_CURRENT_SOURCE_DIR} + ) + + set_target_properties( + TestCustomModelsFromONNXForAlpakaCuda + PROPERTIES CUDA_SEPARABLE_COMPILATION ON + ) + + target_compile_definitions( + TestCustomModelsFromONNXForAlpakaCuda PRIVATE + ALPAKA_ACC_GPU_CUDA_ENABLED + ) + + target_link_directories( + TestCustomModelsFromONNXForAlpakaCuda PRIVATE + ${CUDA_BASE}/lib64 + ) + + target_link_libraries(TestCustomModelsFromONNXForAlpakaCuda + CUDA::cublas + CUDA::cublasLt + CUDA::cudart + ${ROOT_LIBRARIES} + ) + + ROOTTEST_ADD_TEST(TestCustomModelsFromONNXForAlpakaCuda + EXEC ./TestCustomModelsFromONNXForAlpakaCuda + FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka ) - endif() -endif() -ROOT_EXECUTABLE(emitGNN GNN/EmitGNN.cxx LIBRARIES SOFIE_core) -ROOT_ADD_TEST(tmva-sofie-EmitGNN COMMAND emitGNN) + elseif (_alpaka_backend STREQUAL "cpu") + message(STATUS "Alpaka CPU backend selected (not yet implemented)") + elseif (_alpaka_backend STREQUAL "hip") + message(STATUS "Alpaka HIP backend selected (not yet implemented)") + elseif (_alpaka_backend STREQUAL "sycl") + message(STATUS "Alpaka SYCL backend selected (not yet implemented)") + endif() # backend -ROOT_EXECUTABLE(EmitGraphIndependent GNN/EmitGraphIndependent.cxx LIBRARIES SOFIE_core) -ROOT_ADD_TEST(tmva-sofie-EmitGraphIndependent COMMAND EmitGraphIndependent) +endif() # ENABLE_ALPAKA_TESTS diff --git a/src/SOFIE_core/test/EmitFromONNX.cxx.in b/src/SOFIE_core/test/EmitFromONNX.cxx.in index f7a56e2..c464f4d 100644 --- a/src/SOFIE_core/test/EmitFromONNX.cxx.in +++ b/src/SOFIE_core/test/EmitFromONNX.cxx.in @@ -23,7 +23,13 @@ int EmitModel(std::string filename, std::string outname) { int main(int argc, char *argv[]){ -@EMIT_CAPTURES@ ; + + int failures = 0; + + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ONNX] 
Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 0 : 1; } diff --git a/src/SOFIE_core/test/EmitFromONNX_GPU_ALPAKA.cxx.in b/src/SOFIE_core/test/EmitFromONNX_GPU_ALPAKA.cxx.in new file mode 100644 index 0000000..58198c1 --- /dev/null +++ b/src/SOFIE_core/test/EmitFromONNX_GPU_ALPAKA.cxx.in @@ -0,0 +1,27 @@ +// Author: Sanjiban Sengupta + +#include "SOFIE/RModel_Base.hxx" +#include "SOFIE/RModel.hxx" +#include "SOFIE/RModelParser_ONNX.hxx" + +using namespace SOFIE; + +int EmitModel(std::string filename, std::string outname) { + + RModelParser_ONNX parser; + RModel model = parser.Parse(filename); + model.GenerateGPU_ALPAKA(); + model.OutputGenerated(outname+"_FromONNX_GPU_ALPAKA.hxx"); + + return 0; +} + +int main(int argc, char *argv[]) { + + int failures = 0; + + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ONNX with ALPAKA] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 0 : 1; +} diff --git a/src/SOFIE_core/test/EmitFromRoot.cxx.in b/src/SOFIE_core/test/EmitFromRoot.cxx.in index 4a630c7..88c0789 100644 --- a/src/SOFIE_core/test/EmitFromRoot.cxx.in +++ b/src/SOFIE_core/test/EmitFromRoot.cxx.in @@ -43,6 +43,15 @@ int EmitModel(std::string inputfile, std::string outname){ int main(int argc, char *argv[]){ -@EMIT_CAPTURES@ ; + int failures = 0; + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ROOT] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 0 : 1; + + @EMIT_CAPTURES@; + + std::cout << "[SUMMARY] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 
0 : 1; } diff --git a/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx b/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx index d02dc5e..14eb6a3 100644 --- a/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx +++ b/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx @@ -812,7 +812,7 @@ TEST(ONNX, LinearWithLeakyRelu) { constexpr float TOLERANCE = 1; - // Preparing the standard all-ones input + // Preparing input std::vector input({ 0.4369, -0.6882, 1.0309, -1.0263, -0.1519, 1.2237, -0.7054, -0.1762, -0.6811, -2.2597, 1.0388, -0.7993, 0.1468, 1.3257, -0.4714, -0.0958, @@ -2515,7 +2515,7 @@ TEST(ONNX, Equal){ }); SOFIE_Equal::Session s("Equal_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(Equal_ExpectedOutput::outputs) / sizeof(bool)); @@ -2540,7 +2540,7 @@ TEST(ONNX, LessOrEqual){ }); SOFIE_LessOrEqual::Session s("LessOrEqual_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(LessOrEqual_ExpectedOutput::outputs) / sizeof(bool)); @@ -2565,7 +2565,7 @@ TEST(ONNX, GreaterOrEqual){ }); SOFIE_GreaterOrEqual::Session s("GreaterOrEqual_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(GreaterOrEqual_ExpectedOutput::outputs) / sizeof(bool)); @@ -2590,7 +2590,7 @@ TEST(ONNX, Greater){ }); SOFIE_Greater::Session s("Greater_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(Greater_ExpectedOutput::outputs) / sizeof(bool)); @@ -2615,7 +2615,7 @@ TEST(ONNX, Less){ }); SOFIE_Less::Session s("Less_FromONNX.dat"); - 
std::vector<bool> output = s.infer(input1.data(),input2.data());
+   std::vector<std::uint8_t> output = s.infer(input1.data(),input2.data());
 
    // Checking output size
    EXPECT_EQ(output.size(), sizeof(Less_ExpectedOutput::outputs) / sizeof(bool));
diff --git a/src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
new file mode 100644
index 0000000..1537ea4
--- /dev/null
+++ b/src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
@@ -0,0 +1,244 @@
+#include <cstddef>
+#include <cstdint>
+
+#include "Linear_16_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/Linear_16.ref.hxx"
+
+#include "Linear_32_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/Linear_32.ref.hxx"
+
+#include "Linear_64_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/Linear_64.ref.hxx"
+
+#include "LinearWithLeakyRelu_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/LinearWithLeakyRelu.ref.hxx"
+
+#include "LinearWithSigmoid_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/LinearWithSigmoid.ref.hxx"
+
+#include <alpaka/alpaka.hpp>
+#include <cmath>
+#include <vector>
+#include "gtest/gtest.h"
+
+constexpr float DEFAULT_TOLERANCE = 1e-3f;
+
+using Idx = std::size_t;
+using Dim = alpaka::DimInt<1u>;
+using Ext1D = alpaka::Vec<Dim, Idx>;
+
+// Fixture providing the CPU host device, the CUDA device and a blocking
+// queue shared by every ALPAKA inference test below.
+class SofieAlpakaTest : public ::testing::Test {
+protected:
+   // Shared devices and platforms
+   alpaka::PlatformCpu hostPlatform;
+   alpaka::DevCpu host;
+   alpaka::PlatformCudaRt platform;
+   alpaka::DevCudaRt device;
+   alpaka::Queue<alpaka::DevCudaRt, alpaka::Blocking> queue;
+
+   SofieAlpakaTest()
+      : hostPlatform{}
+      , host(alpaka::getDevByIdx(hostPlatform, 0u))
+      , platform{}
+      , device(alpaka::getDevByIdx(platform, 0u))
+      , queue(device)
+   {
+   }
+
+   void SetUp() override {
+      cudaDeviceSynchronize();
+   }
+
+   void TearDown() override {
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+   }
+
+   ~SofieAlpakaTest() override {
+      cudaDeviceSynchronize();
+   }
+};
+
+TEST_F(SofieAlpakaTest, Linear16)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   // All-ones input staged on the host, then copied to the device.
+   auto A = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{1600}));
+   float *A_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(A));
+
+   for (Idx i = 0; i < 1600; ++i) {
+      A_ptr[i] = 1.0;
+   }
+
+   auto A_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{1600}));
+   alpaka::memcpy(queue, A_d, A);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{160}));
+
+   {
+      SOFIE_Linear_16::Session session("Linear_16_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(A_d);
+
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+   }
+
+   float *res_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(result_h));
+   float *correct = Linear_16_ExpectedOutput::all_ones;
+
+   for (size_t i = 0; i < 160; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+   }
+}
+
+TEST_F(SofieAlpakaTest, Linear32)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   auto A = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{1600}));
+   float *A_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(A));
+
+   for (Idx i = 0; i < 1600; ++i) {
+      A_ptr[i] = 1.0;
+   }
+
+   auto A_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{1600}));
+   alpaka::memcpy(queue, A_d, A);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{160}));
+
+   {
+      SOFIE_Linear_32::Session session("Linear_32_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(A_d);
+
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+   }
+
+   float *res_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(result_h));
+   float *correct = Linear_32_ExpectedOutput::all_ones;
+
+   for (size_t i = 0; i < 160; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+   }
+}
+
+TEST_F(SofieAlpakaTest, Linear64)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   auto A = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{1600}));
+   float *A_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(A));
+
+   for (Idx i = 0; i < 1600; ++i) {
+      A_ptr[i] = 1.0;
+   }
+
+   auto A_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{1600}));
+   alpaka::memcpy(queue, A_d, A);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{160}));
+
+   {
+      SOFIE_Linear_64::Session session("Linear_64_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(A_d);
+
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+   }
+
+   float *res_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(result_h));
+   float *correct = Linear_64_ExpectedOutput::all_ones;
+
+   for (size_t i = 0; i < 160; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+   }
+}
+
+TEST_F(SofieAlpakaTest, LinearWithLeakyRelu)
+{
+   // Use the fixture's host/device/queue directly; re-declaring them here
+   // shadowed the shared members and created a second, divergent queue.
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   std::vector<float> input({
+      0.4369, -0.6882, 1.0309, -1.0263, -0.1519, 1.2237, -0.7054, -0.1762,
+      -0.6811, -2.2597, 1.0388, -0.7993, 0.1468, 1.3257, -0.4714, -0.0958,
+      0.7057, -0.3749, -0.3310, 0.0986, -0.1370, 0.0832, -1.6465, -0.2793
+   });
+
+   auto A = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+   float *A_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(A));
+
+   for (Idx i = 0; i < input.size(); ++i) {
+      A_ptr[i] = input[i];
+   }
+
+   auto A_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+   alpaka::memcpy(queue, A_d, A);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{24}));
+
+   {
+      // NOTE(review): unlike the other tests this Session takes no explicit
+      // .dat weight file — presumably the generated default matches; confirm.
+      SOFIE_LinearWithLeakyRelu::Session session;
+      auto result = session.infer(A_d);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+   }
+
+   float *res_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(result_h));
+   float *correct = LinearWithLeakyRelu_ExpectedOutput::outputs;
+
+   for (size_t i = 0; i < 24; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+   }
+}
+
+TEST_F(SofieAlpakaTest, LinearWithSigmoid)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   auto A = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{48}));
+   float *A_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(A));
+
+   for (Idx i = 0; i < 48; ++i) {
+      A_ptr[i] = 1.0;
+   }
+
+   auto A_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{48}));
+   alpaka::memcpy(queue, A_d, A);
+   alpaka::wait(queue);
+
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{48}));
+
+   {
+      SOFIE_LinearWithSigmoid::Session session("LinearWithSigmoid_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(A_d);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);
+   }
+
+   float *res_ptr = reinterpret_cast<float *>(alpaka::getPtrNative(result_h));
+   float *correct = LinearWithSigmoid_ExpectedOutput::all_ones;
+   // Verify the full 48-element output (previously only the first 24
+   // elements were checked even though 48 were transferred).
+   for (size_t i = 0; i < 48; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+   }
+}
diff --git a/src/SOFIE_parsers/CMakeLists.txt b/src/SOFIE_parsers/CMakeLists.txt
index 379b7d7..d77d1e6 100644
--- a/src/SOFIE_parsers/CMakeLists.txt
+++ b/src/SOFIE_parsers/CMakeLists.txt
@@ -102,6 +102,15 @@ target_include_directories(SOFIE_parsers PUBLIC
 
 set_target_properties(SOFIE_parsers PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
 
+ROOT_GENERATE_DICTIONARY(G__SOFIE_parsers ${sources_headers}
+   LINKDEF inc/LinkDef.h
+   MODULE SOFIE_parsers
+   OPTIONS --deep
+)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_parsers_rdict.pcm
+   ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_parsers.rootmap
+   DESTINATION lib)
+
 install(TARGETS SOFIE_parsers
    LIBRARY DESTINATION lib
 )
diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt
new file mode 100644
index 0000000..2ede060
--- /dev/null
+++ b/src/utils/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_library(utils INTERFACE)
+
+target_include_directories(utils INTERFACE
+   $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+   $<INSTALL_INTERFACE:include>
+)
+
+install(
+   DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/SOFIE
+   DESTINATION include
+)
diff --git
a/src/utils/SOFIE/RTensor.hxx b/src/utils/SOFIE/RTensor.hxx
new file mode 100644
index 0000000..db82dc9
--- /dev/null
+++ b/src/utils/SOFIE/RTensor.hxx
@@ -0,0 +1,628 @@
+#ifndef SOFIE_RTENSOR
+#define SOFIE_RTENSOR
+
+#include <vector>
+#include <cstddef>     // std::size_t
+#include <iostream>
+#include <stdexcept>   // std::runtime_error
+#include <sstream>     // std::stringstream
+#include <memory>      // std::shared_ptr
+#include <type_traits> // std::is_convertible
+#include <algorithm>   // std::reverse
+#include <iterator>    // std::random_access_iterator_tag
+
+namespace SOFIE {
+
+/// Memory layout type
+enum class MemoryLayout : uint8_t {
+   RowMajor = 0x01,
+   ColumnMajor = 0x02
+};
+
+namespace Internal {
+
+/// \brief Get size of tensor from shape vector
+/// \param[in] shape Shape vector
+/// \return Size of contiguous memory
+template <typename T>
+inline std::size_t GetSizeFromShape(const T &shape)
+{
+   if (shape.size() == 0)
+      return 0;
+   std::size_t size = 1;
+   for (auto &s : shape)
+      size *= s;
+   return size;
+}
+
+/// \brief Compute strides from shape vector.
+/// \param[in] shape Shape vector
+/// \param[in] layout Memory layout
+/// \return Size of contiguous memory
+///
+/// This information is needed for the multi-dimensional indexing. See here:
+/// https://en.wikipedia.org/wiki/Row-_and_column-major_order
+/// https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.strides.html
+template <typename T>
+inline std::vector<typename T::value_type> ComputeStridesFromShape(const T &shape, MemoryLayout layout)
+{
+   const auto size = shape.size();
+   T strides(size);
+   if (layout == MemoryLayout::RowMajor) {
+      for (std::size_t i = 0; i < size; i++) {
+         if (i == 0) {
+            strides[size - 1 - i] = 1;
+         } else {
+            strides[size - 1 - i] = strides[size - 1 - i + 1] * shape[size - 1 - i + 1];
+         }
+      }
+   } else if (layout == MemoryLayout::ColumnMajor) {
+      for (std::size_t i = 0; i < size; i++) {
+         if (i == 0) {
+            strides[i] = 1;
+         } else {
+            strides[i] = strides[i - 1] * shape[i - 1];
+         }
+      }
+   } else {
+      std::stringstream ss;
+      ss << "Memory layout type is not valid for calculating strides.";
+      throw std::runtime_error(ss.str());
+   }
+   return strides;
+}
+
+/// \brief Compute indices from global index
+/// \param[in] shape Shape vector
+/// \param[in] idx Global index
+/// \param[in] layout Memory layout
+/// \return Indice vector
+template <typename T>
+inline T ComputeIndicesFromGlobalIndex(const T& shape, MemoryLayout layout, const typename T::value_type idx)
+{
+   const auto size = shape.size();
+   auto strides = ComputeStridesFromShape(shape, layout);
+   T indices(size);
+   auto r = idx;
+   for (std::size_t i = 0; i < size; i++) {
+      indices[i] = int(r / strides[i]);
+      r = r % strides[i];
+   }
+   return indices;
+}
+
+/// \brief Compute global index from indices
+/// \param[in] strides Strides vector
+/// \param[in] idx Indice vector
+/// \return Global index
+template <typename U, typename V>
+inline std::size_t ComputeGlobalIndex(const U& strides, const V& idx)
+{
+   std::size_t globalIndex = 0;
+   const auto size = idx.size();
+   for (std::size_t i = 0; i < size; i++) {
+      globalIndex += strides[size - 1 - i] * idx[size - 1 - i];
+   }
+   return globalIndex;
+}
+
+/// \brief Type checking for all types of a parameter pack, e.g., used in combination with std::is_convertible
+template <class... Ts>
+struct and_types : std::true_type {
+};
+
+template <class T0, class... Ts>
+struct and_types<T0, Ts...> : std::integral_constant<bool, T0{} && and_types<Ts...>{}> {
+};
+
+/// \brief Copy slice of a tensor recursively from here to there
+/// \param[in] here Source tensor
+/// \param[in] there Target tensor (slice of source tensor)
+/// \param[in] mins Minimum of indices for each dimension
+/// \param[in] maxs Maximum of indices for each dimension
+/// \param[in] idx Current indices
+/// \param[in] active Active index needed to stop the recursion
+///
+/// Copy the content of a slice of a tensor from source to target. This is done
+/// by recursively iterating over the ranges of the slice for each dimension.
+template <typename T>
+void RecursiveCopy(const T &here, T &there,
+                   const std::vector<std::size_t> &mins, const std::vector<std::size_t> &maxs,
+                   std::vector<std::size_t> idx, std::size_t active)
+{
+   const auto size = idx.size();
+   for (std::size_t i = mins[active]; i < maxs[active]; i++) {
+      idx[active] = i;
+      if (active == size - 1) {
+         auto idxThere = idx;
+         for (std::size_t j = 0; j < size; j++) {
+            idxThere[j] -= mins[j];
+         }
+         there(idxThere) = here(idx);
+      } else {
+         Internal::RecursiveCopy(here, there, mins, maxs, idx, active + 1);
+      }
+   }
+}
+
+} // namespace SOFIE::Internal
+
+/// \class SOFIE::RTensor
+/// \brief RTensor is a container with contiguous memory and shape information.
+/// \tparam T Data-type of the tensor
+///
+/// An RTensor is a vector-like container, which has additional shape information.
+/// The elements of the multi-dimensional container can be accessed by their
+/// indices in a coherent way without taking care about the one-dimensional memory
+/// layout of the contiguous storage. This also allows to manipulate the shape
+/// of the container without moving the actual elements in memory. Another feature
+/// is that an RTensor can own the underlying contiguous memory but can also represent
+/// only a view on existing data without owning it.
+template <typename V, typename C = std::vector<V>>
+class RTensor {
+public:
+   // Typedefs
+   using Value_t = V;
+   using Shape_t = std::vector<std::size_t>;
+   using Index_t = Shape_t;
+   using Slice_t = std::vector<Shape_t>;
+   using Container_t = C;
+
+private:
+   Shape_t fShape;
+   Shape_t fStrides;
+   std::size_t fSize;
+   MemoryLayout fLayout;
+   Value_t *fData;
+   std::shared_ptr<Container_t> fContainer;
+
+protected:
+   void ReshapeInplace(const Shape_t &shape);
+
+public:
+   // Constructors
+
+   /// \brief Construct a tensor as view on data
+   /// \param[in] data Pointer to data contiguous in memory
+   /// \param[in] shape Shape vector
+   /// \param[in] layout Memory layout
+   RTensor(Value_t *data, Shape_t shape, MemoryLayout layout = MemoryLayout::RowMajor)
+      : fShape(shape), fLayout(layout), fData(data), fContainer(nullptr)
+   {
+      fSize = Internal::GetSizeFromShape(shape);
+      fStrides = Internal::ComputeStridesFromShape(shape, layout);
+   }
+
+   /// \brief Construct a tensor as view on data
+   /// \param[in] data Pointer to data contiguous in memory
+   /// \param[in] shape Shape vector
+   /// \param[in] strides Strides vector
+   /// \param[in] layout Memory layout
+   RTensor(Value_t *data, Shape_t shape, Shape_t strides, MemoryLayout layout = MemoryLayout::RowMajor)
+      : fShape(shape), fStrides(strides), fLayout(layout), fData(data), fContainer(nullptr)
+   {
+      fSize = Internal::GetSizeFromShape(shape);
+   }
+
+   /// \brief Construct a tensor owning externally provided data
+   /// \param[in] container Shared pointer to data container
+   /// \param[in] shape Shape vector
+   /// \param[in] layout Memory layout
+   RTensor(std::shared_ptr<Container_t> container, Shape_t shape,
+           MemoryLayout layout = MemoryLayout::RowMajor)
+      : fShape(shape), fLayout(layout), fContainer(container)
+   {
+      fSize = Internal::GetSizeFromShape(shape);
+      fStrides = Internal::ComputeStridesFromShape(shape, layout);
+      fData = std::data(*fContainer);
+   }
+
+   /// \brief Construct a tensor owning data initialized with new container
+   /// \param[in] shape Shape vector
+   /// \param[in] layout Memory layout
+   RTensor(Shape_t shape, MemoryLayout layout = MemoryLayout::RowMajor)
+      : fShape(shape), fLayout(layout)
+   {
+      // TODO: Document how data pointer is determined using STL iterator interface.
+      // TODO: Sanitize given container type with type traits
+      fSize = Internal::GetSizeFromShape(shape);
+      fStrides = Internal::ComputeStridesFromShape(shape, layout);
+      fContainer = std::make_shared<Container_t>(fSize);
+      fData = std::data(*fContainer);
+   }
+
+   // Access elements
+   Value_t &operator()(const Index_t &idx);
+   const Value_t &operator() (const Index_t &idx) const;
+   template <typename... Idx> Value_t &operator()(Idx... idx);
+   template <typename... Idx> const Value_t &operator() (Idx... idx) const;
+
+   // Access properties
+   std::size_t GetSize() const { return fSize; }
+   const Shape_t &GetShape() const { return fShape; }
+   const Shape_t &GetStrides() const { return fStrides; }
+   Value_t *GetData() { return fData; }
+   const Value_t *GetData() const { return fData; }
+   std::shared_ptr<Container_t> GetContainer() { return fContainer; }
+   const std::shared_ptr<Container_t> GetContainer() const { return fContainer; }
+   MemoryLayout GetMemoryLayout() const { return fLayout; }
+   bool IsView() const { return fContainer == nullptr; }
+   bool IsOwner() const { return !IsView(); }
+
+   // Copy
+   RTensor<Value_t, Container_t> Copy(MemoryLayout layout = MemoryLayout::RowMajor) const;
+
+   // Transformations
+   RTensor<Value_t, Container_t> Transpose() const;
+   RTensor<Value_t, Container_t> Squeeze() const;
+   RTensor<Value_t, Container_t> ExpandDims(int idx) const;
+   RTensor<Value_t, Container_t> Reshape(const Shape_t &shape) const;
+   RTensor<Value_t, Container_t> Resize(const Shape_t &shape);
+   RTensor<Value_t, Container_t> Slice(const Slice_t &slice);
+
+   // Iterator class
+   class Iterator {
+   private:
+      RTensor<Value_t, Container_t>& fTensor;
+      Index_t::value_type fGlobalIndex;
+   public:
+      using iterator_category = std::random_access_iterator_tag;
+      using value_type = Value_t;
+      using difference_type = std::ptrdiff_t;
+      using pointer = Value_t *;
+      using reference = Value_t &;
+
+      Iterator(RTensor<Value_t, Container_t>& x, typename Index_t::value_type idx) : fTensor(x), fGlobalIndex(idx) {}
+      Iterator& operator++() { fGlobalIndex++; return *this; }
+      Iterator operator++(int) { auto tmp = *this; operator++(); return tmp; }
+      Iterator& operator--() { fGlobalIndex--; return *this; }
+      Iterator operator--(int) { auto tmp = *this; operator--(); return tmp; }
+      Iterator operator+(difference_type rhs) const { return Iterator(fTensor, fGlobalIndex + rhs); }
+      Iterator operator-(difference_type rhs) const { return Iterator(fTensor, fGlobalIndex - rhs); }
+      difference_type operator-(const Iterator& rhs) { return fGlobalIndex - rhs.GetGlobalIndex(); }
+      Iterator& operator+=(difference_type rhs) { fGlobalIndex += rhs; return *this; }
+      Iterator& operator-=(difference_type rhs) { fGlobalIndex -= rhs; return *this; }
+      Value_t& operator*()
+      {
+         auto idx = Internal::ComputeIndicesFromGlobalIndex(fTensor.GetShape(), fTensor.GetMemoryLayout(), fGlobalIndex);
+         return fTensor(idx);
+      }
+      bool operator==(const Iterator& rhs) const
+      {
+         if (fGlobalIndex == rhs.GetGlobalIndex()) return true;
+         return false;
+      }
+      bool operator!=(const Iterator& rhs) const { return !operator==(rhs); };
+      bool operator>(const Iterator& rhs) const { return fGlobalIndex > rhs.GetGlobalIndex(); }
+      bool operator<(const Iterator& rhs) const { return fGlobalIndex < rhs.GetGlobalIndex(); }
+      bool operator>=(const Iterator& rhs) const { return fGlobalIndex >= rhs.GetGlobalIndex(); }
+      bool operator<=(const Iterator& rhs) const { return fGlobalIndex <= rhs.GetGlobalIndex(); }
+      typename Index_t::value_type GetGlobalIndex() const { return fGlobalIndex; };
+   };
+
+   // Iterator interface
+   // TODO: Document that the iterator always iterates following the physical memory layout.
+   Iterator begin() noexcept {
+      return Iterator(*this, 0);
+   }
+   Iterator end() noexcept {
+      return Iterator(*this, fSize);
+   }
+};
+
+/// \brief Reshape tensor in place
+/// \param[in] shape Shape vector
+/// Reshape tensor without changing the overall size
+template <typename Value_t, typename Container_t>
+inline void RTensor<Value_t, Container_t>::ReshapeInplace(const Shape_t &shape)
+{
+   const auto size = Internal::GetSizeFromShape(shape);
+   if (size != fSize) {
+      std::stringstream ss;
+      ss << "Cannot reshape tensor with size " << fSize << " into shape { ";
+      for (std::size_t i = 0; i < shape.size(); i++) {
+         if (i != shape.size() - 1) {
+            ss << shape[i] << ", ";
+         } else {
+            ss << shape[i] << " }.";
+         }
+      }
+      throw std::runtime_error(ss.str());
+   }
+
+   // Compute new strides from shape
+   auto strides = Internal::ComputeStridesFromShape(shape, fLayout);
+   fShape = shape;
+   fStrides = strides;
+}
+
+
+/// \brief Access elements
+/// \param[in] idx Index vector
+/// \return Reference to element
+template <typename Value_t, typename Container_t>
+inline Value_t &RTensor<Value_t, Container_t>::operator()(const Index_t &idx)
+{
+   const auto globalIndex = Internal::ComputeGlobalIndex(fStrides, idx);
+   return fData[globalIndex];
+}
+
+/// \brief Access elements
+/// \param[in] idx Index vector
+/// \return Reference to element
+template <typename Value_t, typename Container_t>
+inline const Value_t &RTensor<Value_t, Container_t>::operator() (const Index_t &idx) const
+{
+   const auto globalIndex = Internal::ComputeGlobalIndex(fStrides, idx);
+   return fData[globalIndex];
+}
+
+/// \brief Access elements
+/// \param[in] idx Indices
+/// \return Reference to element
+template <typename Value_t, typename Container_t>
+template <typename... Idx>
+Value_t &RTensor<Value_t, Container_t>::operator()(Idx... idx)
+{
+   static_assert(Internal::and_types<std::is_convertible<Idx, std::size_t>...>{},
+                 "Indices are not convertible to std::size_t.");
+   return operator()({static_cast<std::size_t>(idx)...});
+}
+
+/// \brief Access elements
+/// \param[in] idx Indices
+/// \return Reference to element
+template <typename Value_t, typename Container_t>
+template <typename... Idx>
+const Value_t &RTensor<Value_t, Container_t>::operator() (Idx... idx) const
+{
+   static_assert(Internal::and_types<std::is_convertible<Idx, std::size_t>...>{},
+                 "Indices are not convertible to std::size_t.");
+   return operator()({static_cast<std::size_t>(idx)...});
+}
+
+/// \brief Transpose
+/// \returns New RTensor
+/// The tensor is transposed by inverting the associated memory layout from row-
+/// major to column-major and vice versa. Therefore, the underlying data is not
+/// touched.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Transpose() const
+{
+   MemoryLayout layout;
+   // Transpose by inverting memory layout
+   if (fLayout == MemoryLayout::RowMajor) {
+      layout = MemoryLayout::ColumnMajor;
+   } else if (fLayout == MemoryLayout::ColumnMajor) {
+      layout = MemoryLayout::RowMajor;
+   } else {
+      throw std::runtime_error("Memory layout is not known.");
+   }
+
+   // Create copy of container
+   RTensor<Value_t, Container_t> x(fData, fShape, fStrides, layout);
+
+   // Reverse shape
+   std::reverse(x.fShape.begin(), x.fShape.end());
+
+   // Reverse strides
+   std::reverse(x.fStrides.begin(), x.fStrides.end());
+
+   return x;
+}
+
+/// \brief Squeeze dimensions
+/// \returns New RTensor
+/// Squeeze removes the dimensions of size one from the shape.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Squeeze() const
+{
+   // Remove dimensions of one and associated strides
+   Shape_t shape;
+   Shape_t strides;
+   for (std::size_t i = 0; i < fShape.size(); i++) {
+      if (fShape[i] != 1) {
+         shape.emplace_back(fShape[i]);
+         strides.emplace_back(fStrides[i]);
+      }
+   }
+
+   // If all dimensions are 1, we need to keep one.
+   // This does not apply if the inital shape is already empty. Then, return
+   // the empty shape.
+   if (shape.size() == 0 && fShape.size() != 0) {
+      shape.emplace_back(1);
+      strides.emplace_back(1);
+   }
+
+   // Create copy, attach new shape and strides and return
+   RTensor<Value_t, Container_t> x(*this);
+   x.fShape = shape;
+   x.fStrides = strides;
+   return x;
+}
+
+/// \brief Expand dimensions
+/// \param[in] idx Index in shape vector where dimension is added
+/// \returns New RTensor
+/// Inserts a dimension of one into the shape.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::ExpandDims(int idx) const
+{
+   // Compose shape vector with additional dimensions and adjust strides
+   const int len = fShape.size();
+   auto shape = fShape;
+   auto strides = fStrides;
+   if (idx < 0) {
+      idx = len + 1 + idx;
+   }
+   if (idx < 0) {
+      throw std::runtime_error("Given negative index is invalid.");
+   }
+   else if (idx > len) {
+      throw std::runtime_error("Given index is invalid.");
+   }
+   shape.insert(shape.begin() + idx, 1);
+   strides = Internal::ComputeStridesFromShape(shape, fLayout);
+
+   // Create view copy, attach new shape and strides and return
+   RTensor<Value_t, Container_t> x(*this);
+   x.fShape = shape;
+   x.fStrides = strides;
+   return x;
+}
+
+/// \brief Reshape tensor
+/// \param[in] shape Shape vector
+/// \returns New RTensor
+/// Reshape tensor without changing the overall size
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Reshape(const Shape_t &shape) const
+{
+   // Create copy, replace and return
+   RTensor<Value_t, Container_t> x(*this);
+   x.ReshapeInplace(shape);
+   return x;
+}
+
+/// \brief Resize tensor
+/// \param[in] shape Shape vector
+/// \returns New RTensor
+/// Resize tensor into new shape
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Resize(const Shape_t &shape)
+{
+   // Create new tensor with the specified shape
+   RTensor<Value_t, Container_t> x(shape, fLayout);
+
+   // Copying contents from previous tensor
+   size_t n = (x.GetSize()>fSize) ? fSize : x.GetSize();
+   std::copy(this->GetData(), this->GetData() + n, x.GetData() );
+
+   return x;
+}
+
+/// \brief Create a slice of the tensor
+/// \param[in] slice Slice vector
+/// \returns New RTensor
+/// A slice is a subset of the tensor defined by a vector of pairs of indices.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Slice(const Slice_t &slice)
+{
+   // Sanitize size of slice
+   const auto sliceSize = slice.size();
+   const auto shapeSize = fShape.size();
+   if (sliceSize != shapeSize) {
+      std::stringstream ss;
+      ss << "Size of slice (" << sliceSize << ") is unequal number of dimensions (" << shapeSize << ").";
+      throw std::runtime_error(ss.str());
+   }
+
+   // Sanitize slice indices
+   // TODO: Sanitize slice indices
+   /*
+   for (std::size_t i = 0; i < sliceSize; i++) {
+   }
+   */
+
+   // Convert -1 in slice to proper pair of indices
+   // TODO
+
+   // Recompute shape and size
+   Shape_t shape(sliceSize);
+   for (std::size_t i = 0; i < sliceSize; i++) {
+      shape[i] = slice[i][1] - slice[i][0];
+   }
+   auto size = Internal::GetSizeFromShape(shape);
+
+   // Determine first element contributing to the slice and get the data pointer
+   Value_t *data;
+   Shape_t idx(sliceSize);
+   for (std::size_t i = 0; i < sliceSize; i++) {
+      idx[i] = slice[i][0];
+   }
+   data = &operator()(idx);
+
+   // Create copy and modify properties
+   RTensor<Value_t, Container_t> x(*this);
+   x.fData = data;
+   x.fShape = shape;
+   x.fSize = size;
+
+   // Squeeze tensor and return
+   return x.Squeeze();
+}
+
+/// Copy RTensor to new object
+/// \param[in] layout Memory layout of the new RTensor
+/// \returns New RTensor
+/// The operation copies all elements of the current RTensor to a new RTensor
+/// with the given layout contiguous in memory. Note that this copies by default
+/// to a row major memory layout.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Copy(MemoryLayout layout) const
+{
+   // Create new tensor with zeros owning the memory
+   RTensor<Value_t, Container_t> r(fShape, layout);
+
+   // Copy over the elements from this tensor
+   const auto mins = Shape_t(fShape.size());
+   const auto maxs = fShape;
+   auto idx = mins;
+   Internal::RecursiveCopy(*this, r, mins, maxs, idx, 0);
+
+   return r;
+}
+
+/// \brief Pretty printing
+/// \param[in] os Output stream
+/// \param[in] x RTensor
+/// \return Modified output stream
+template <typename T>
+std::ostream &operator<<(std::ostream &os, RTensor<T> &x)
+{
+   const auto shapeSize = x.GetShape().size();
+   if (shapeSize == 1) {
+      os << "{ ";
+      const auto size = x.GetSize();
+      for (std::size_t i = 0; i < size; i++) {
+         os << x({i});
+         if (i != size - 1)
+            os << ", ";
+      }
+      os << " }";
+   } else if (shapeSize == 2) {
+      os << "{";
+      const auto shape = x.GetShape();
+      for (std::size_t i = 0; i < shape[0]; i++) {
+         os << " { ";
+         for (std::size_t j = 0; j < shape[1]; j++) {
+            os << x({i, j});
+            if (j < shape[1] - 1) {
+               os << ", ";
+            } else {
+               os << " ";
+            }
+         }
+         os << "}";
+      }
+      os << " }";
+   } else {
+      os << "{ printing not yet implemented for this rank }";
+   }
+   return os;
+}
+
+} // namespace SOFIE
+
+namespace cling {
+template <typename T>
+std::string printValue(SOFIE::RTensor<T> *x)
+{
+   std::stringstream ss;
+   ss << *x;
+   return ss.str();
+}
+} // namespace cling
+
+#endif // SOFIE_RTENSOR