From a41bfc43632b09168561faf53da93d40a5348ba2 Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Wed, 6 Aug 2025 14:20:09 +0200 Subject: [PATCH 1/6] ITS: GPU: put trackleting properly on different streams Signed-off-by: Felix Schlepper --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 9 +- .../ITS/tracking/GPU/ITStrackingGPU/Utils.h | 25 ++++-- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 82 ++++++++++++------- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 1 + .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 10 ++- 5 files changed, 80 insertions(+), 47 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 0ad08fd88ccf5..4656dd1d14846 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -79,6 +79,7 @@ class TimeFrameGPU : public TimeFrame return mGpuStreams[stream]; } auto& getStreams() { return mGpuStreams; } + void syncStreams(); virtual void wipe() final; /// interface @@ -108,7 +109,7 @@ class TimeFrameGPU : public TimeFrame std::vector getClusterSizes(); const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; } const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; } - Tracklet** getDeviceArrayTracklets() { return mTrackletsDeviceArray; } + Tracklet** getDeviceArrayTracklets() { return mTrackletsDevice.data(); } int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; } int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; } @@ -140,7 +141,8 @@ class TimeFrameGPU : public TimeFrame int getNumberOfNeighbours() const final; private: - void allocMemAsync(void**, size_t, Stream&, bool); // Abstract owned and unowned memory allocations + void allocMemAsync(void**, size_t, Stream&, bool); // 
Abstract owned and unowned memory allocations on specific stream + void allocMem(void**, size_t, bool); // Abstract owned and unowned memory allocations on default stream bool mHostRegistered = false; TimeFrameGPUParameters mGpuParams; @@ -167,7 +169,6 @@ class TimeFrameGPU : public TimeFrame const unsigned char** mUsedClustersDeviceArray; const int** mROFrameClustersDeviceArray; std::array mTrackletsDevice; - Tracklet** mTrackletsDeviceArray; std::array mTrackletsLUTDevice; std::array mCellsLUTDevice; std::array mNeighboursLUTDevice; @@ -195,8 +196,6 @@ class TimeFrameGPU : public TimeFrame // State Streams mGpuStreams; - size_t mAvailMemGB; - bool mFirstInit = true; // Temporary buffer for storing output tracks from GPU tracking bounded_vector mTrackITSExt; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h index 802973d5f4000..9d25bd22484fe 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h @@ -94,16 +94,21 @@ class Stream public: #if defined(__HIPCC__) using Handle = hipStream_t; - static constexpr Handle Default = 0; + static constexpr Handle DefaultStream = 0; + // static constexpr unsigned int DefaultFlag = hipStreamNonBlocking; TODO replace once ready + static constexpr unsigned int DefaultFlag = 0; #elif defined(__CUDACC__) using Handle = cudaStream_t; - static constexpr Handle Default = 0; + static constexpr Handle DefaultStream = 0; + // static constexpr unsigned int DefaultFlag = cudaStreamNonBlocking; TODO replace once ready + static constexpr unsigned int DefaultFlag = 0; #else using Handle = void*; - static constexpr Handle Default = nullptr; + static constexpr Handle DefaultStream = nullptr; + static constexpr unsigned int DefaultFlag = 0; #endif - Stream(unsigned int flags = 0) + Stream(unsigned int flags = DefaultFlag) { #if defined(__HIPCC__) GPUChkErrS(hipStreamCreateWithFlags(&mHandle, 
flags)); @@ -115,7 +120,7 @@ class Stream Stream(Handle h) : mHandle(h) {} ~Stream() { - if (mHandle != Default) { + if (mHandle != DefaultStream) { #if defined(__HIPCC__) GPUChkErrS(hipStreamDestroy(mHandle)); #elif defined(__CUDACC__) @@ -124,7 +129,7 @@ class Stream } } - operator bool() const { return mHandle != Default; } + operator bool() const { return mHandle != DefaultStream; } const Handle& get() { return mHandle; } void sync() const { @@ -136,7 +141,7 @@ class Stream } private: - Handle mHandle{Default}; + Handle mHandle{DefaultStream}; }; static_assert(sizeof(Stream) == sizeof(void*), "Stream type must match pointer type!"); @@ -150,6 +155,12 @@ class Streams void clear() { mStreams.clear(); } auto& operator[](size_t i) { return mStreams[i % mStreams.size()]; } void push_back(const Stream& stream) { mStreams.push_back(stream); } + void sync() + { + for (auto& s : mStreams) { + s.sync(); + } + } private: std::vector mStreams; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index f6d9157b0da68..dafd218c1e811 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -122,6 +122,17 @@ void TimeFrameGPU::allocMemAsync(void** ptr, size_t size, Stream& strea } } +template +void TimeFrameGPU::allocMem(void** ptr, size_t size, bool extAllocator) +{ + if (extAllocator) { + *ptr = this->mAllocator->allocate(size); + } else { + GPULog("Calling default CUDA allocator"); + GPUChkErrS(cudaMalloc(reinterpret_cast(ptr), size)); + } +} + template void TimeFrameGPU::setDevicePropagator(const o2::base::PropagatorImpl* propagator) { @@ -134,10 +145,10 @@ void TimeFrameGPU::loadIndexTableUtils(const int iteration) GPUTimer timer(mGpuStreams[0], "loading indextable utils"); if (!iteration) { GPULog("gpu-allocation: allocating IndexTableUtils buffer, for {:.2f} MB.", sizeof(IndexTableUtils) / constants::MB); - 
allocMemAsync(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), mGpuStreams[0], this->getExtAllocator()); + allocMem(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), this->getExtAllocator()); } GPULog("gpu-transfer: loading IndexTableUtils object, for {:.2f} MB.", sizeof(IndexTableUtils) / constants::MB); - GPUChkErrS(cudaMemcpyAsync(mIndexTableUtilsDevice, &(this->mIndexTableUtils), sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpy(mIndexTableUtilsDevice, &(this->mIndexTableUtils), sizeof(IndexTableUtils), cudaMemcpyHostToDevice)); } template @@ -151,9 +162,10 @@ void TimeFrameGPU::loadUnsortedClustersDevice(const int iteration) GPUChkErrS(cudaHostRegister(this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); } - allocMemAsync(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), mGpuStreams[0], this->getExtAllocator()); + mGpuStreams.sync(); + allocMem(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpy(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice)); } } @@ -164,13 +176,14 @@ void TimeFrameGPU::loadClustersDevice(const int iteration) GPUTimer timer(mGpuStreams[0], "loading sorted clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { GPULog("gpu-transfer: 
loading {} clusters on layer {}, for {:.2f} MB.", this->mClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(Cluster) / constants::MB); - allocMemAsync(reinterpret_cast(&mClustersDevice[iLayer]), this->mClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[0], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mClustersDevice[iLayer]), this->mClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[iLayer], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mClustersDevice[iLayer], this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mClustersDevice[iLayer], this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); } - allocMemAsync(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), mGpuStreams[0], this->getExtAllocator()); + mGpuStreams.sync(); + allocMem(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpy(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice)); } } @@ -181,11 +194,12 @@ void TimeFrameGPU::loadClustersIndexTables(const int iteration) GPUTimer timer(mGpuStreams[0], "loading sorted clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { GPULog("gpu-transfer: loading clusters indextable for layer {} with {} elements, for {:.2f} MB.", iLayer, this->mIndexTables[iLayer].size(), this->mIndexTables[iLayer].size() * 
sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[iLayer]), this->mIndexTables[iLayer].size() * sizeof(int), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], this->mIndexTables[iLayer].data(), this->mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[iLayer]), this->mIndexTables[iLayer].size() * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], this->mIndexTables[iLayer].data(), this->mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); } - allocMemAsync(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + mGpuStreams.sync(); + allocMem(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice)); } } @@ -196,10 +210,11 @@ void TimeFrameGPU::createUsedClustersDevice(const int iteration) GPUTimer timer(mGpuStreams[0], "creating used clusters flags"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { GPULog("gpu-transfer: creating {} used clusters flags on layer {}, for {:.2f} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mUsedClusters[iLayer].size() * sizeof(unsigned char) / constants::MB); - allocMemAsync(reinterpret_cast(&mUsedClustersDevice[iLayer]), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, 
this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get())); + allocMemAsync(reinterpret_cast(&mUsedClustersDevice[iLayer]), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[iLayer], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[iLayer].get())); } - allocMemAsync(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), mGpuStreams[0], this->getExtAllocator()); + mGpuStreams.sync(); + allocMem(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), this->getExtAllocator()); GPUChkErrS(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } } @@ -221,11 +236,12 @@ void TimeFrameGPU::loadROframeClustersDevice(const int iteration) GPUTimer timer(mGpuStreams[0], "loading ROframe clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { GPULog("gpu-transfer: loading {} ROframe clusters info on layer {}, for {:.2f} MB.", this->mROFramesClusters[iLayer].size(), iLayer, this->mROFramesClusters[iLayer].size() * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[iLayer]), this->mROFramesClusters[iLayer].size() * sizeof(int), mGpuStreams[0], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[iLayer]), this->mROFramesClusters[iLayer].size() * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); GPUChkErrS(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], this->mROFramesClusters[iLayer].data(), this->mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mROFrameClustersDeviceArray, 
mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + mGpuStreams.sync(); + allocMem(reinterpret_cast(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), this->getExtAllocator()); + GPUChkErrS(cudaMemcpy(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice)); } } @@ -236,10 +252,11 @@ void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) if (!iteration) { for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { GPULog("gpu-transfer: loading {} tfinfo on layer {}, for {:.2f} MB.", this->mTrackingFrameInfo[iLayer].size(), iLayer, this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / constants::MB); - allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[iLayer]), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), mGpuStreams[0], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[iLayer]), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), mGpuStreams[iLayer], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); } + mGpuStreams.sync(); allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), 
cudaHostRegisterPortable)); GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); @@ -253,9 +270,9 @@ void TimeFrameGPU::loadMultiplicityCutMask(const int iteration) GPUTimer timer(mGpuStreams[0], "loading multiplicity cut mask"); GPULog("gpu-transfer: iteration {} loading multiplicity cut mask with {} elements, for {:.2f} MB.", iteration, this->mMultiplicityCutMask.size(), this->mMultiplicityCutMask.size() * sizeof(bool) / constants::MB); if (!iteration) { // only allocate on first call - allocMemAsync(reinterpret_cast(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), mGpuStreams[0], this->getExtAllocator()); + allocMem(reinterpret_cast(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), this->getExtAllocator()); } - GPUChkErrS(cudaMemcpyAsync(mMultMaskDevice, this->mMultiplicityCutMask.data(), this->mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpy(mMultMaskDevice, this->mMultiplicityCutMask.data(), this->mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice)); } } @@ -265,11 +282,11 @@ void TimeFrameGPU::loadVertices(const int iteration) if (!iteration) { GPUTimer timer(mGpuStreams[0], "loading seeding vertices"); GPULog("gpu-transfer: loading {} ROframes vertices, for {:.2f} MB.", this->mROFramesPV.size(), this->mROFramesPV.size() * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mROFramesPVDevice, this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + allocMem(reinterpret_cast(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), this->getExtAllocator()); + GPUChkErrS(cudaMemcpy(mROFramesPVDevice, 
this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice)); GPULog("gpu-transfer: loading {} seeding vertices, for {:.2f} MB.", this->mPrimaryVertices.size(), this->mPrimaryVertices.size() * sizeof(Vertex) / constants::MB); - allocMemAsync(reinterpret_cast(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mPrimaryVerticesDevice, this->mPrimaryVertices.data(), this->mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + allocMem(reinterpret_cast(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), this->getExtAllocator()); + GPUChkErrS(cudaMemcpy(mPrimaryVerticesDevice, this->mPrimaryVertices.data(), this->mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice)); } } @@ -294,16 +311,13 @@ void TimeFrameGPU::createTrackletsLUTDevice(const int iteration) template void TimeFrameGPU::createTrackletsBuffers() { - GPUTimer timer(mGpuStreams[0], "creating tracklet buffers"); for (int iLayer{0}; iLayer < nLayers - 1; ++iLayer) { + GPUTimer timer(mGpuStreams[iLayer], "creating tracklet buffers"); mNTracklets[iLayer] = 0; GPUChkErrS(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + this->mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[iLayer].get())); GPULog("gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {:.2f} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / constants::MB); allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), mGpuStreams[iLayer], this->getExtAllocator()); } - allocMemAsync(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable)); - 
GPUChkErrS(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } template @@ -524,7 +538,7 @@ void TimeFrameGPU::unregisterRest() GPUTimer timer(mGpuStreams[0], "unregistering rest of the host memory"); GPULog("unregistering rest of the host memory..."); GPUChkErrS(cudaHostUnregister(mCellsDevice.data())); - GPUChkErrS(cudaHostUnregister(mTrackletsDevice.data())); + // GPUChkErrS(cudaHostUnregister(mTrackletsDevice.data())); } template @@ -553,6 +567,12 @@ void TimeFrameGPU::initialise(const int iteration, o2::its::TimeFrame::initialise(iteration, trkParam, maxLayers); } +template +void TimeFrameGPU::syncStreams() +{ + mGpuStreams.sync(); +} + template void TimeFrameGPU::wipe() { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index eaa6080761fec..9a6fe4e050519 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -116,6 +116,7 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i conf.nBlocksLayerTracklets[iteration], conf.nThreadsLayerTracklets[iteration], mTimeFrameGPU->getStreams()); + mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed } template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 02be19b1e3a08..6e678118686ab 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -876,9 +876,9 @@ GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stre void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - GPUChkErrS(cudaMalloc(&d_temp_storage, 
temp_storage_bytes)); + GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream)); GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - GPUChkErrS(cudaFree(d_temp_storage)); + GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); } template @@ -893,9 +893,9 @@ GPUhi() void cubInclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stre void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream)); GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - GPUChkErrS(cudaFree(d_temp_storage)); + GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); } template @@ -1030,6 +1030,8 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, resolutions[iLayer], radii[iLayer + 1] - radii[iLayer], mulScatAng[iLayer]); + /// Internal thrust allocation serializes this part to a degree + /// TODO switch to cub equivalent and do all work on one stream thrust::device_ptr tracklets_ptr(spanTracklets[iLayer]); auto nosync_policy = THRUST_NAMESPACE::par_nosync.on(streams[iLayer].get()); thrust::sort(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets()); From 313947653b3bd009c920dccaf1c8ae38c89745bc Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Wed, 6 Aug 2025 15:33:05 +0200 Subject: [PATCH 2/6] ITS: GPU: put cell finding on different streams Signed-off-by: Felix Schlepper --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 6 +--- .../GPU/ITStrackingGPU/TrackingKernels.h | 11 ++++++-- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 8 +++++- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 28 ++++++++++++++----- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 12 ++++----
5 files changed, 44 insertions(+), 21 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 4656dd1d14846..afe31b14a4a0a 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -73,12 +73,8 @@ class TimeFrameGPU : public TimeFrame void downloadCellsDevice(); void downloadCellsLUTDevice(); void unregisterRest(); - template - auto& getStream(const size_t stream) - { - return mGpuStreams[stream]; - } auto& getStreams() { return mGpuStreams; } + void syncStream(const size_t stream); void syncStreams(); virtual void wipe() final; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index b0fb443513fef..567aa07f42f7c 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -25,7 +25,10 @@ namespace gpu #ifdef GPUCA_GPUCODE // GPUg() global kernels must only when compiled by GPU compiler -GPUdi() int4 getEmptyBinsRect() { return int4{0, 0, 0, 0}; } +GPUdi() int4 getEmptyBinsRect() +{ + return int4{0, 0, 0, 0}; +} GPUd() bool fitTrack(TrackITSExt& track, int start, @@ -137,7 +140,8 @@ void countCellsHandler(const Cluster** sortedClusters, const float cellDeltaTanLambdaSigma, const float nSigmaCut, const int nBlocks, - const int nThreads); + const int nThreads, + gpu::Streams& streams); void computeCellsHandler(const Cluster** sortedClusters, const Cluster** unsortedClusters, @@ -155,7 +159,8 @@ void computeCellsHandler(const Cluster** sortedClusters, const float cellDeltaTanLambdaSigma, const float nSigmaCut, const int nBlocks, - const int nThreads); + const int nThreads, + gpu::Streams& streams); unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice, int* 
neighboursLUTs, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index dafd218c1e811..da91373611a2d 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -402,7 +402,7 @@ void TimeFrameGPU::createCellsLUTDevice() template void TimeFrameGPU::createCellsBuffers(const int layer) { - GPUTimer timer(mGpuStreams[0], "creating cells buffers"); + GPUTimer timer(mGpuStreams[layer], "creating cells buffers"); mNCells[layer] = 0; GPUChkErrS(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); GPULog("gpu-transfer: creating cell buffer for {} elements on layer {}, for {:.2f} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / constants::MB); @@ -567,6 +567,12 @@ void TimeFrameGPU::initialise(const int iteration, o2::its::TimeFrame::initialise(iteration, trkParam, maxLayers); } +template +void TimeFrameGPU::syncStream(const size_t stream) +{ + mGpuStreams[stream].sync(); +} + template void TimeFrameGPU::syncStreams() { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 9a6fe4e050519..2ae4f0aa91819 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -116,7 +116,6 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i conf.nBlocksLayerTracklets[iteration], conf.nThreadsLayerTracklets[iteration], mTimeFrameGPU->getStreams()); - mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed } template @@ -125,18 +124,30 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) mTimeFrameGPU->createCellsLUTDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); + std::vector 
isTrackletStreamSynched(this->mTrkParams[iteration].TrackletsPerRoad()); + auto syncOnce = [&](const int iLayer) { + if (!isTrackletStreamSynched[iLayer]) { + mTimeFrameGPU->syncStream(iLayer); + isTrackletStreamSynched[iLayer] = true; + } + }; + for (int iLayer = 0; iLayer < this->mTrkParams[iteration].CellsPerRoad(); ++iLayer) { - if (!mTimeFrameGPU->getNTracklets()[iLayer + 1] || !mTimeFrameGPU->getNTracklets()[iLayer]) { + // need to ensure that trackleting on layers iLayer and iLayer + 1 are done (only once) + syncOnce(iLayer); + syncOnce(iLayer + 1); + // if there are no tracklets skip entirely + const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getNTracklets()[iLayer])}; + if (!currentLayerTrackletsNum || !mTimeFrameGPU->getNTracklets()[iLayer + 1]) { mTimeFrameGPU->getNCells()[iLayer] = 0; continue; } - const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getNTracklets()[iLayer])}; countCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getDeviceArrayUnsortedClusters(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), mTimeFrameGPU->getDeviceArrayTracklets(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getNTracklets()[iLayer], + currentLayerTrackletsNum, iLayer, nullptr, mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -147,14 +158,15 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) this->mTrkParams[iteration].CellDeltaTanLambdaSigma, this->mTrkParams[iteration].NSigmaCut, conf.nBlocksLayerCells[iteration], - conf.nThreadsLayerCells[iteration]); + conf.nThreadsLayerCells[iteration], + mTimeFrameGPU->getStreams()); mTimeFrameGPU->createCellsBuffers(iLayer); computeCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getDeviceArrayUnsortedClusters(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), mTimeFrameGPU->getDeviceArrayTracklets(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getNTracklets()[iLayer], + currentLayerTrackletsNum, iLayer, 
mTimeFrameGPU->getDeviceCells()[iLayer], mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -165,8 +177,10 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) this->mTrkParams[iteration].CellDeltaTanLambdaSigma, this->mTrkParams[iteration].NSigmaCut, conf.nBlocksLayerCells[iteration], - conf.nThreadsLayerCells[iteration]); + conf.nThreadsLayerCells[iteration], + mTimeFrameGPU->getStreams()); } + mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed } template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 6e678118686ab..d4dcda067b26f 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -1065,9 +1065,10 @@ void countCellsHandler( const float cellDeltaTanLambdaSigma, const float nSigmaCut, const int nBlocks, - const int nThreads) + const int nThreads, + gpu::Streams& streams) { - gpu::computeLayerCellsKernel<<>>( + gpu::computeLayerCellsKernel<<>>( sortedClusters, // const Cluster** unsortedClusters, // const Cluster** tfInfo, // const TrackingFrameInfo** @@ -1082,7 +1083,7 @@ void countCellsHandler( maxChi2ClusterAttachment, // const float cellDeltaTanLambdaSigma, // const float nSigmaCut); // const float - gpu::cubExclusiveScanInPlace(cellsLUTsHost, nTracklets + 1); + gpu::cubExclusiveScanInPlace(cellsLUTsHost, nTracklets + 1, streams[layer].get()); } void computeCellsHandler( @@ -1102,9 +1103,10 @@ void computeCellsHandler( const float cellDeltaTanLambdaSigma, const float nSigmaCut, const int nBlocks, - const int nThreads) + const int nThreads, + gpu::Streams& streams) { - gpu::computeLayerCellsKernel<<>>( + gpu::computeLayerCellsKernel<<>>( sortedClusters, // const Cluster** unsortedClusters, // const Cluster** tfInfo, // const TrackingFrameInfo** From 85287db75519f8b1459d202a7d1df01669068619 Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Wed, 6 Aug 2025 16:28:15 
+0200 Subject: [PATCH 3/6] ITS: GPU: put cell neighbour finding on different streams Signed-off-by: Felix Schlepper --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 12 ++-- .../GPU/ITStrackingGPU/TrackingKernels.h | 35 +++++----- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 59 ++++------------- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 66 +++++++++++-------- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 58 ++++++++-------- 5 files changed, 101 insertions(+), 129 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index afe31b14a4a0a..5c10b01412b4e 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -62,17 +62,15 @@ class TimeFrameGPU : public TimeFrame void createCellsDevice(); void createCellsLUTDevice(); void createNeighboursIndexTablesDevice(); - void createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours); - void createNeighboursDevice(const unsigned int layer, std::vector>& neighbours); + void createNeighboursDevice(const unsigned int layer); void createNeighboursLUTDevice(const int, const unsigned int); - void createNeighboursDeviceArray(); void createTrackITSExtDevice(bounded_vector&); void downloadTrackITSExtDevice(bounded_vector&); void downloadCellsNeighboursDevice(std::vector>>&, const int); void downloadNeighboursLUTDevice(bounded_vector&, const int); void downloadCellsDevice(); void downloadCellsLUTDevice(); - void unregisterRest(); + auto& getStream(const size_t stream) { return mGpuStreams[stream]; } auto& getStreams() { return mGpuStreams; } void syncStream(const size_t stream); void syncStreams(); @@ -96,7 +94,7 @@ class TimeFrameGPU : public TimeFrame gpuPair* getDeviceNeighbourPairs(const int layer) { return mNeighbourPairsDevice[layer]; } std::array& getDeviceNeighboursAll() { return mNeighboursDevice; } int* 
getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; } - int** getDeviceNeighboursArray() { return mNeighboursDeviceArray; } + int** getDeviceNeighboursArray() { return mNeighboursDevice.data(); } TrackingFrameInfo* getDeviceTrackingFrameInfo(const int); const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; } const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; } @@ -109,7 +107,7 @@ class TimeFrameGPU : public TimeFrame int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; } int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; } - CellSeed** getDeviceArrayCells() const { return mCellsDeviceArray; } + CellSeed** getDeviceArrayCells() { return mCellsDevice.data(); } CellSeed* getDeviceTrackSeeds() { return mTrackSeedsDevice; } o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; } float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; } @@ -176,7 +174,6 @@ class TimeFrameGPU : public TimeFrame std::array mCellsDevice; std::array mNeighboursIndexTablesDevice; CellSeed* mTrackSeedsDevice; - CellSeed** mCellsDeviceArray; std::array mCellSeedsDevice; o2::track::TrackParCovF** mCellSeedsDeviceArray; std::array mCellSeedsChi2Device; @@ -186,7 +183,6 @@ class TimeFrameGPU : public TimeFrame TrackITSExt* mTrackITSExtDevice; std::array*, nLayers - 2> mNeighbourPairsDevice; std::array mNeighboursDevice; - int** mNeighboursDeviceArray; std::array mTrackingFrameInfoDevice; const TrackingFrameInfo** mTrackingFrameInfoDeviceArray; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 567aa07f42f7c..caa7675756db6 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ 
b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -162,21 +162,22 @@ void computeCellsHandler(const Cluster** sortedClusters, const int nThreads, gpu::Streams& streams); -unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice, - int* neighboursLUTs, - int** cellsLUTs, - gpuPair* cellNeighbours, - int* neighboursIndexTable, - const Tracklet** tracklets, - const int deltaROF, - const float maxChi2ClusterAttachment, - const float bz, - const int layerIndex, - const unsigned int nCells, - const unsigned int nCellsNext, - const int maxCellNeighbours, - const int nBlocks, - const int nThreads); +void countCellNeighboursHandler(CellSeed** cellsLayersDevice, + int* neighboursLUTs, + int** cellsLUTs, + gpuPair* cellNeighbours, + int* neighboursIndexTable, + const Tracklet** tracklets, + const int deltaROF, + const float maxChi2ClusterAttachment, + const float bz, + const int layerIndex, + const unsigned int nCells, + const unsigned int nCellsNext, + const int maxCellNeighbours, + const int nBlocks, + const int nThreads, + gpu::Stream& stream); void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, int* neighboursLUTs, @@ -192,11 +193,13 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, const unsigned int nCellsNext, const int maxCellNeighbours, const int nBlocks, - const int nThreads); + const int nThreads, + gpu::Stream& stream); int filterCellNeighboursHandler(gpuPair*, int*, unsigned int, + gpu::Stream&, o2::its::ExternalAllocator* = nullptr); template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index da91373611a2d..72a1f98d1b78b 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -349,26 +349,20 @@ void TimeFrameGPU::createNeighboursIndexTablesDevice() { GPUTimer timer(mGpuStreams[0], "creating cells neighbours"); // Here we do also the creation of 
the CellsDeviceArray, as the cells buffers are populated separately in the previous steps. - allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { GPULog("gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {:.2f} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get())); - if (iLayer < nLayers - 3) { - mNNeighbours[iLayer] = 0; - } + allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer].get())); } } template void TimeFrameGPU::createNeighboursLUTDevice(const int layer, const unsigned int nCells) { - GPUTimer timer(mGpuStreams[0], "reserving neighboursLUT"); + GPUTimer timer(mGpuStreams[layer], "reserving neighboursLUT"); GPULog("gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {:.2f} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); // We need one element more to move exc -> inc - GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * 
sizeof(int), mGpuStreams[0].get())); + allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); // We need one element more to move exc -> inc + GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[layer].get())); } template @@ -382,8 +376,6 @@ void TimeFrameGPU::loadCellsDevice() GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer].get())); GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], this->mCells[iLayer].data(), this->mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); } - allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } template @@ -441,35 +433,15 @@ void TimeFrameGPU::loadTrackSeedsDevice(bounded_vector& seeds } template -void TimeFrameGPU::createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours) +void TimeFrameGPU::createNeighboursDevice(const unsigned int layer) { - GPUTimer timer(mGpuStreams[0], "reserving neighbours"); + GPUTimer timer(mGpuStreams[layer], "reserving neighbours"); + GPUChkErrS(cudaMemcpyAsync(&(this->mNNeighbours[layer]), &(mNeighboursLUTDevice[layer][this->mNCells[layer + 1] - 1]), sizeof(unsigned int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); GPULog("gpu-allocation: reserving {} neighbours (pairs), for {:.2f} MB.", nNeighbours, nNeighbours * sizeof(gpuPair) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), nNeighbours * sizeof(gpuPair), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair), 
mGpuStreams[0].get())); + allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), (this->mNNeighbours[layer]) * sizeof(gpuPair), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, (this->mNNeighbours[layer]) * sizeof(gpuPair), mGpuStreams[layer].get())); GPULog("gpu-allocation: reserving {} neighbours, for {:.2f} MB.", nNeighbours, nNeighbours * sizeof(gpuPair) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), nNeighbours * sizeof(int), mGpuStreams[0], this->getExtAllocator()); -} - -template -void TimeFrameGPU::createNeighboursDevice(const unsigned int layer, std::vector>& neighbours) -{ - GPUTimer timer(mGpuStreams[0], "reserving neighbours"); - this->mCellsNeighbours[layer].clear(); - this->mCellsNeighbours[layer].resize(neighbours.size()); - GPULog("gpu-allocation: reserving {} neighbours (pairs), for {:.2f} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair), mGpuStreams[0].get())); - GPULog("gpu-allocation: reserving {} neighbours, for {:.2f} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), mGpuStreams[0], this->getExtAllocator()); -} - -template -void TimeFrameGPU::createNeighboursDeviceArray() -{ - GPUTimer timer(mGpuStreams[0], "reserving neighbours"); - allocMemAsync(reinterpret_cast(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + 
allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), (this->mNNeighbours[layer]) * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); } template @@ -532,15 +504,6 @@ void TimeFrameGPU::downloadTrackITSExtDevice(bounded_vector& GPUChkErrS(cudaHostUnregister(seeds.data())); } -template -void TimeFrameGPU::unregisterRest() -{ - GPUTimer timer(mGpuStreams[0], "unregistering rest of the host memory"); - GPULog("unregistering rest of the host memory..."); - GPUChkErrS(cudaHostUnregister(mCellsDevice.data())); - // GPUChkErrS(cudaHostUnregister(mTrackletsDevice.data())); -} - template void TimeFrameGPU::unregisterHostMemory(const int maxLayers) { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 2ae4f0aa91819..0fa258f63fd23 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -180,7 +180,6 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) conf.nThreadsLayerCells[iteration], mTimeFrameGPU->getStreams()); } - mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed } template @@ -188,7 +187,20 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) { mTimeFrameGPU->createNeighboursIndexTablesDevice(); const auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); + + std::vector isCellStreamSynched(this->mTrkParams[iteration].TrackletsPerRoad() - 1); + auto syncOnce = [&](const int iLayer) { + if (!isCellStreamSynched[iLayer]) { + mTimeFrameGPU->syncStream(iLayer); + isCellStreamSynched[iLayer] = true; + } + }; + for (int iLayer{0}; iLayer < this->mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { + // ensure that celling is done for iLayer and iLayer+1 is done + syncOnce(iLayer); + syncOnce(iLayer + 1); + const int currentLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer])}; const int 
nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer + 1])}; if (!nextLayerCellsNum || !currentLayerCellsNum) { @@ -197,24 +209,23 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) } mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum); - unsigned int nNeigh = countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), - mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here. - mTimeFrameGPU->getDeviceArrayCellsLUT(), - mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), - mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer), - (const Tracklet**)mTimeFrameGPU->getDeviceArrayTracklets(), - this->mTrkParams[0].DeltaROF, - this->mTrkParams[0].MaxChi2ClusterAttachment, - this->mBz, - iLayer, - currentLayerCellsNum, - nextLayerCellsNum, - 1e2, - conf.nBlocksFindNeighbours[iteration], - conf.nThreadsFindNeighbours[iteration]); - - mTimeFrameGPU->createNeighboursDevice(iLayer, nNeigh); - + countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), + mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here. 
+ mTimeFrameGPU->getDeviceArrayCellsLUT(), + mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), + mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer), + (const Tracklet**)mTimeFrameGPU->getDeviceArrayTracklets(), + this->mTrkParams[0].DeltaROF, + this->mTrkParams[0].MaxChi2ClusterAttachment, + this->mBz, + iLayer, + currentLayerCellsNum, + nextLayerCellsNum, + 1e2, + conf.nBlocksFindNeighbours[iteration], + conf.nThreadsFindNeighbours[iteration], + mTimeFrameGPU->getStream(iLayer)); + mTimeFrameGPU->createNeighboursDevice(iLayer); computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -229,16 +240,15 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) nextLayerCellsNum, 1e2, conf.nBlocksFindNeighbours[iteration], - conf.nThreadsFindNeighbours[iteration]); - - nNeigh = filterCellNeighboursHandler(mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), - mTimeFrameGPU->getDeviceNeighbours(iLayer), - nNeigh, - mTimeFrameGPU->getExternalAllocator()); - mTimeFrameGPU->getArrayNNeighbours()[iLayer] = nNeigh; + conf.nThreadsFindNeighbours[iteration], + mTimeFrameGPU->getStream(iLayer)); + mTimeFrameGPU->getArrayNNeighbours()[iLayer] = filterCellNeighboursHandler(mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), + mTimeFrameGPU->getDeviceNeighbours(iLayer), + mTimeFrameGPU->getArrayNNeighbours()[iLayer], + mTimeFrameGPU->getStream(iLayer), + mTimeFrameGPU->getExternalAllocator()); } - mTimeFrameGPU->createNeighboursDeviceArray(); - mTimeFrameGPU->unregisterRest(); + mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed }; template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index d4dcda067b26f..060f150bd6f42 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -1123,23 +1123,24 @@ 
void computeCellsHandler( nSigmaCut); // const float } -unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice, - int* neighboursLUT, - int** cellsLUTs, - gpuPair* cellNeighbours, - int* neighboursIndexTable, - const Tracklet** tracklets, - const int deltaROF, - const float maxChi2ClusterAttachment, - const float bz, - const int layerIndex, - const unsigned int nCells, - const unsigned int nCellsNext, - const int maxCellNeighbours, - const int nBlocks, - const int nThreads) +void countCellNeighboursHandler(CellSeed** cellsLayersDevice, + int* neighboursLUT, + int** cellsLUTs, + gpuPair* cellNeighbours, + int* neighboursIndexTable, + const Tracklet** tracklets, + const int deltaROF, + const float maxChi2ClusterAttachment, + const float bz, + const int layerIndex, + const unsigned int nCells, + const unsigned int nCellsNext, + const int maxCellNeighbours, + const int nBlocks, + const int nThreads, + gpu::Stream& stream) { - gpu::computeLayerCellNeighboursKernel<<>>( + gpu::computeLayerCellNeighboursKernel<<>>( cellsLayersDevice, neighboursLUT, neighboursIndexTable, @@ -1152,11 +1153,8 @@ unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice, layerIndex, nCells, maxCellNeighbours); - gpu::cubInclusiveScanInPlace(neighboursLUT, nCellsNext); - gpu::cubExclusiveScanInPlace(neighboursIndexTable, nCells + 1); - unsigned int nNeighbours; - GPUChkErrS(cudaMemcpy(&nNeighbours, &neighboursLUT[nCellsNext - 1], sizeof(unsigned int), cudaMemcpyDeviceToHost)); - return nNeighbours; + gpu::cubInclusiveScanInPlace(neighboursLUT, nCellsNext, stream.get()); + gpu::cubExclusiveScanInPlace(neighboursIndexTable, nCells + 1, stream.get()); } void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, @@ -1173,10 +1171,10 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, const unsigned int nCellsNext, const int maxCellNeighbours, const int nBlocks, - const int nThreads) + const int nThreads, + gpu::Stream& stream) { - - 
gpu::computeLayerCellNeighboursKernel<<>>( + gpu::computeLayerCellNeighboursKernel<<>>( cellsLayersDevice, neighboursLUT, neighboursIndexTable, @@ -1189,21 +1187,23 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, layerIndex, nCells, maxCellNeighbours); - GPUChkErrS(cudaPeekAtLastError()); - GPUChkErrS(cudaDeviceSynchronize()); } int filterCellNeighboursHandler(gpuPair* cellNeighbourPairs, int* cellNeighbours, unsigned int nNeigh, + gpu::Stream& stream, o2::its::ExternalAllocator* allocator) { + /// Internal thrust allocation serialize this part to a degree + /// TODO switch to cub equivelent and do all work on one stream + auto nosync_policy = THRUST_NAMESPACE::par_nosync.on(stream.get()); thrust::device_ptr> neighVectorPairs(cellNeighbourPairs); thrust::device_ptr validNeighs(cellNeighbours); - auto updatedEnd = thrust::remove_if(neighVectorPairs, neighVectorPairs + nNeigh, gpu::is_invalid_pair()); + auto updatedEnd = thrust::remove_if(nosync_policy, neighVectorPairs, neighVectorPairs + nNeigh, gpu::is_invalid_pair()); size_t newSize = updatedEnd - neighVectorPairs; - thrust::stable_sort(neighVectorPairs, neighVectorPairs + newSize, gpu::sort_by_second()); - thrust::transform(neighVectorPairs, neighVectorPairs + newSize, validNeighs, gpu::pair_to_first()); + thrust::stable_sort(nosync_policy, neighVectorPairs, neighVectorPairs + newSize, gpu::sort_by_second()); + thrust::transform(nosync_policy, neighVectorPairs, neighVectorPairs + newSize, validNeighs, gpu::pair_to_first()); return newSize; } From 5fc126f4c30410de4d25b44c944a538a6151255c Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Thu, 7 Aug 2025 08:00:13 +0200 Subject: [PATCH 4/6] ITS: GPU: use external allocator for temp storage Signed-off-by: Felix Schlepper --- .../GPU/ITStrackingGPU/TrackingKernels.h | 6 +- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 6 +- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 67 ++++++++++++------- 3 files changed, 54 insertions(+), 25 deletions(-) diff 
--git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index caa7675756db6..a7bf4c70bc5c2 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -86,6 +86,7 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, bounded_vector& resolutions, std::vector& radii, bounded_vector& mulScatAng, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Streams& streams); @@ -120,6 +121,7 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, bounded_vector& resolutions, std::vector& radii, bounded_vector& mulScatAng, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Streams& streams); @@ -139,6 +141,7 @@ void countCellsHandler(const Cluster** sortedClusters, const float maxChi2ClusterAttachment, const float cellDeltaTanLambdaSigma, const float nSigmaCut, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Streams& streams); @@ -175,6 +178,7 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice, const unsigned int nCells, const unsigned int nCellsNext, const int maxCellNeighbours, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Stream& stream); @@ -213,12 +217,12 @@ void processNeighboursHandler(const int startLayer, gsl::span neighboursDeviceLUTs, const TrackingFrameInfo** foundTrackingFrameInfo, bounded_vector& seedsHost, - o2::its::ExternalAllocator*, const float bz, const float MaxChi2ClusterAttachment, const float maxChi2NDF, const o2::base::Propagator* propagator, const o2::base::PropagatorF::MatCorrType matCorrType, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 
0fa258f63fd23..e7135e55a16c8 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -80,6 +80,7 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i mTimeFrameGPU->getPositionResolutions(), this->mTrkParams[iteration].LayerRadii, mTimeFrameGPU->getMSangles(), + mTimeFrameGPU->getExternalAllocator(), conf.nBlocksLayerTracklets[iteration], conf.nThreadsLayerTracklets[iteration], mTimeFrameGPU->getStreams()); @@ -113,6 +114,7 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i mTimeFrameGPU->getPositionResolutions(), this->mTrkParams[iteration].LayerRadii, mTimeFrameGPU->getMSangles(), + mTimeFrameGPU->getExternalAllocator(), conf.nBlocksLayerTracklets[iteration], conf.nThreadsLayerTracklets[iteration], mTimeFrameGPU->getStreams()); @@ -157,6 +159,7 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) this->mTrkParams[iteration].MaxChi2ClusterAttachment, this->mTrkParams[iteration].CellDeltaTanLambdaSigma, this->mTrkParams[iteration].NSigmaCut, + mTimeFrameGPU->getExternalAllocator(), conf.nBlocksLayerCells[iteration], conf.nThreadsLayerCells[iteration], mTimeFrameGPU->getStreams()); @@ -222,6 +225,7 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) currentLayerCellsNum, nextLayerCellsNum, 1e2, + mTimeFrameGPU->getExternalAllocator(), conf.nBlocksFindNeighbours[iteration], conf.nThreadsFindNeighbours[iteration], mTimeFrameGPU->getStream(iLayer)); @@ -272,12 +276,12 @@ void TrackerTraitsGPU::findRoads(const int iteration) mTimeFrameGPU->getDeviceNeighboursLUTs(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), trackSeeds, - mTimeFrameGPU->getExternalAllocator(), this->mBz, this->mTrkParams[0].MaxChi2ClusterAttachment, this->mTrkParams[0].MaxChi2NDF, mTimeFrameGPU->getDevicePropagator(), this->mTrkParams[0].CorrType, + mTimeFrameGPU->getExternalAllocator(), conf.nBlocksProcessNeighbours[iteration], 
conf.nThreadsProcessNeighbours[iteration]); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 060f150bd6f42..b8eeae12691a3 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -871,37 +871,53 @@ GPUg() void printCellSeeds(CellSeed* seed, int nCells, const unsigned int tId = } template -GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr) +GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) { void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream)); + if (alloc) { + d_temp_storage = alloc->allocate(temp_storage_bytes); + } else { + GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream)); + } GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); + if (alloc) { + alloc->deallocate(reinterpret_cast(d_temp_storage), temp_storage_bytes); + } else { + GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); + } } template -GPUhi() void cubExclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr) +GPUhi() void cubExclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) { - cubExclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream); + cubExclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream, alloc); } template -GPUhi() void cubInclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr) +GPUhi() void 
cubInclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) { void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream)); + if (alloc) { + d_temp_storage = alloc->allocate(temp_storage_bytes); + } else { + GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream)); + } GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); + if (alloc) { + alloc->deallocate(reinterpret_cast(d_temp_storage), temp_storage_bytes); + } else { + GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); + } } template -GPUhi() void cubInclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr) +GPUhi() void cubInclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) { - cubInclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream); + cubInclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream, alloc); } } // namespace gpu @@ -932,6 +948,7 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, bounded_vector& resolutions, std::vector& radii, bounded_vector& mulScatAng, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Streams& streams) @@ -964,7 +981,7 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, resolutions[iLayer], radii[iLayer + 1] - radii[iLayer], mulScatAng[iLayer]); - gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get()); + gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get(), alloc); } } @@ -998,6 +1015,7 @@ void 
computeTrackletsInROFsHandler(const IndexTableUtils* utils, bounded_vector& resolutions, std::vector& radii, bounded_vector& mulScatAng, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Streams& streams) @@ -1043,7 +1061,7 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]); - gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get()); + gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get(), alloc); } } } @@ -1064,6 +1082,7 @@ void countCellsHandler( const float maxChi2ClusterAttachment, const float cellDeltaTanLambdaSigma, const float nSigmaCut, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Streams& streams) @@ -1083,7 +1102,7 @@ void countCellsHandler( maxChi2ClusterAttachment, // const float cellDeltaTanLambdaSigma, // const float nSigmaCut); // const float - gpu::cubExclusiveScanInPlace(cellsLUTsHost, nTracklets + 1, streams[layer].get()); + gpu::cubExclusiveScanInPlace(cellsLUTsHost, nTracklets + 1, streams[layer].get(), alloc); } void computeCellsHandler( @@ -1136,6 +1155,7 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice, const unsigned int nCells, const unsigned int nCellsNext, const int maxCellNeighbours, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Stream& stream) @@ -1153,8 +1173,8 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice, layerIndex, nCells, maxCellNeighbours); - gpu::cubInclusiveScanInPlace(neighboursLUT, nCellsNext, stream.get()); - gpu::cubExclusiveScanInPlace(neighboursIndexTable, nCells + 1, stream.get()); + gpu::cubInclusiveScanInPlace(neighboursLUT, nCellsNext, stream.get(), alloc); + gpu::cubExclusiveScanInPlace(neighboursIndexTable, nCells + 1, stream.get(), alloc); } void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, @@ 
-1219,19 +1239,18 @@ void processNeighboursHandler(const int startLayer, gsl::span neighboursDeviceLUTs, const TrackingFrameInfo** foundTrackingFrameInfo, bounded_vector& seedsHost, - o2::its::ExternalAllocator* allocator, const float bz, const float maxChi2ClusterAttachment, const float maxChi2NDF, const o2::base::Propagator* propagator, const o2::base::PropagatorF::MatCorrType matCorrType, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads) { - auto allocInt = gpu::TypedAllocator(allocator); - auto allocCellSeed = gpu::TypedAllocator(allocator); - thrust::device_vector> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt); // Shortcut: device_vector skips central memory management, we are relying on the contingency. - // TODO: fix this. + auto allocInt = gpu::TypedAllocator(alloc); + auto allocCellSeed = gpu::TypedAllocator(alloc); + thrust::device_vector> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt); gpu::processNeighboursKernel<<>>( startLayer, @@ -1251,7 +1270,7 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); - gpu::cubExclusiveScanInPlace(foundSeedsTable, nCells[startLayer] + 1); + gpu::cubExclusiveScanInPlace(foundSeedsTable, nCells[startLayer] + 1, gpu::Stream::DefaultStream, alloc); thrust::device_vector> updatedCellId(foundSeedsTable.back(), 0, allocInt); thrust::device_vector> updatedCellSeed(foundSeedsTable.back(), allocCellSeed); @@ -1306,7 +1325,7 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); - gpu::cubExclusiveScanInPlace(foundSeedsTable, foundSeedsTable.size()); + gpu::cubExclusiveScanInPlace(foundSeedsTable, foundSeedsTable.size(), gpu::Stream::DefaultStream, alloc); auto foundSeeds{foundSeedsTable.back()}; updatedCellId.resize(foundSeeds); @@ -1402,6 +1421,7 @@ template void countTrackletsInROFsHandler<7>(const IndexTableUtils* utils, bounded_vector& resolutions, std::vector& radii, bounded_vector& 
mulScatAng, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Streams& streams); @@ -1435,6 +1455,7 @@ template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, bounded_vector& resolutions, std::vector& radii, bounded_vector& mulScatAng, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Streams& streams); @@ -1449,12 +1470,12 @@ template void processNeighboursHandler<7>(const int startLayer, gsl::span neighboursDeviceLUTs, const TrackingFrameInfo** foundTrackingFrameInfo, bounded_vector& seedsHost, - o2::its::ExternalAllocator*, const float bz, const float maxChi2ClusterAttachment, const float maxChi2NDF, const o2::base::Propagator* propagator, const o2::base::PropagatorF::MatCorrType matCorrType, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads); } // namespace o2::its From a2993298ccd492f2e72fb6a9f057dde1d6cefe6d Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Thu, 7 Aug 2025 08:06:04 +0200 Subject: [PATCH 5/6] ITS: GPU: simplify stream synchronization Signed-off-by: Felix Schlepper --- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 26 ++++--------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index e7135e55a16c8..bef271a1b0129 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -126,18 +126,10 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) mTimeFrameGPU->createCellsLUTDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - std::vector isTrackletStreamSynched(this->mTrkParams[iteration].TrackletsPerRoad()); - auto syncOnce = [&](const int iLayer) { - if (!isTrackletStreamSynched[iLayer]) { - mTimeFrameGPU->syncStream(iLayer); - isTrackletStreamSynched[iLayer] = true; - } - }; - + 
mTimeFrameGPU->syncStream(0); for (int iLayer = 0; iLayer < this->mTrkParams[iteration].CellsPerRoad(); ++iLayer) { - // need to ensure that trackleting on layers iLayer and iLayer + 1 are done (only once) - syncOnce(iLayer); - syncOnce(iLayer + 1); + mTimeFrameGPU->syncStream(iLayer + 1); + // if there are no tracklets skip entirely const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getNTracklets()[iLayer])}; if (!currentLayerTrackletsNum || !mTimeFrameGPU->getNTracklets()[iLayer + 1]) { @@ -191,18 +183,10 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) mTimeFrameGPU->createNeighboursIndexTablesDevice(); const auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - std::vector isCellStreamSynched(this->mTrkParams[iteration].TrackletsPerRoad() - 1); - auto syncOnce = [&](const int iLayer) { - if (!isCellStreamSynched[iLayer]) { - mTimeFrameGPU->syncStream(iLayer); - isCellStreamSynched[iLayer] = true; - } - }; - + mTimeFrameGPU->syncStream(0); for (int iLayer{0}; iLayer < this->mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { // ensure that celling is done for iLayer and iLayer+1 is done - syncOnce(iLayer); - syncOnce(iLayer + 1); + mTimeFrameGPU->syncStream(iLayer + 1); const int currentLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer])}; const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer + 1])}; From 6e7242de0ecb985bcf969b4e6d2aef5e85751ae1 Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Fri, 8 Aug 2025 12:55:50 +0200 Subject: [PATCH 6/6] ITS: GPU: resolve added TODOs Signed-off-by: Felix Schlepper --- .../ITS/tracking/GPU/ITStrackingGPU/Utils.h | 6 +- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 119 ++++++++++++------ 2 files changed, 84 insertions(+), 41 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h index 9d25bd22484fe..cd860c47ebd9c 100644 --- 
a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h @@ -95,13 +95,11 @@ class Stream #if defined(__HIPCC__) using Handle = hipStream_t; static constexpr Handle DefaultStream = 0; - // static constexpr unsigned int DefaultFlag = hipStreamNonBlocking; TODO replace once ready - static constexpr unsigned int DefaultFlag = 0; + static constexpr unsigned int DefaultFlag = hipStreamNonBlocking; #elif defined(__CUDACC__) using Handle = cudaStream_t; static constexpr Handle DefaultStream = 0; - // static constexpr unsigned int DefaultFlag = cudaStreamNonBlocking; TODO replace once ready - static constexpr unsigned int DefaultFlag = 0; + static constexpr unsigned int DefaultFlag = cudaStreamNonBlocking; #else using Handle = void*; static constexpr Handle DefaultStream = nullptr; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index b8eeae12691a3..401d98ad63560 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -145,9 +145,7 @@ GPUd() bool fitTrack(TrackITSExt& track, if (matCorrType == o2::base::PropagatorF::MatCorrType::USEMatCorrNONE) { const float xx0 = (iLayer > 2) ? 
1.e-2f : 5.e-3f; // Rough layer thickness - constexpr float radiationLength = 9.36f; // Radiation length of Si [cm] - constexpr float density = 2.33f; // Density of Si [g/cm^3] - if (!track.correctForMaterial(xx0, xx0 * radiationLength * density, true)) { + if (!track.correctForMaterial(xx0, xx0 * constants::Radl * constants::Rho, true)) { return false; } } @@ -728,13 +726,13 @@ GPUg() void processNeighboursKernel(const int layer, if (!seed.o2::track::TrackParCov::update(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)) { continue; } - seed.getClusters()[layer - 1] = neighbourCell.getFirstClusterIndex(); - seed.setLevel(neighbourCell.getLevel()); - seed.setFirstTrackletIndex(neighbourCell.getFirstTrackletIndex()); - seed.setSecondTrackletIndex(neighbourCell.getSecondTrackletIndex()); if constexpr (dryRun) { foundSeedsTable[iCurrentCell]++; } else { + seed.getClusters()[layer - 1] = neighbourCell.getFirstClusterIndex(); + seed.setLevel(neighbourCell.getLevel()); + seed.setFirstTrackletIndex(neighbourCell.getFirstTrackletIndex()); + seed.setSecondTrackletIndex(neighbourCell.getSecondTrackletIndex()); updatedCellsIds[foundSeedsTable[iCurrentCell] + foundSeeds] = neighbourCellId; updatedCellSeeds[foundSeedsTable[iCurrentCell] + foundSeeds] = seed; } @@ -870,25 +868,35 @@ GPUg() void printCellSeeds(CellSeed* seed, int nCells, const unsigned int tId = } } -template -GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) +GPUhi() void allocateMemory(void** p, size_t bytes, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) { - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); if (alloc) { - d_temp_storage = alloc->allocate(temp_storage_bytes); + *p = alloc->allocate(bytes); } else { - GPUChkErrS(cudaMallocAsync(&d_temp_storage, 
temp_storage_bytes, stream)); + GPUChkErrS(cudaMallocAsync(p, bytes, stream)); } - GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); +} + +GPUhi() void deallocateMemory(void* p, size_t bytes, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) +{ if (alloc) { - alloc->deallocate(reinterpret_cast(d_temp_storage), temp_storage_bytes); + alloc->deallocate(reinterpret_cast(p), bytes); } else { - GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); + GPUChkErrS(cudaFreeAsync(p, stream)); } } +template +GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) +{ + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); + allocateMemory(&d_temp_storage, temp_storage_bytes, stream, alloc); + GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); + deallocateMemory(d_temp_storage, temp_storage_bytes, stream, alloc); +} + template GPUhi() void cubExclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) { @@ -901,21 +909,13 @@ GPUhi() void cubInclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stre void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - if (alloc) { - d_temp_storage = alloc->allocate(temp_storage_bytes); - } else { - GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream)); - } + allocateMemory(&d_temp_storage, temp_storage_bytes, stream, alloc); GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - if (alloc) { - alloc->deallocate(reinterpret_cast(d_temp_storage), 
temp_storage_bytes); - } else { - GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); - } + deallocateMemory(d_temp_storage, temp_storage_bytes, stream, alloc); } template -GPUhi() void cubInclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) +GPUhi() void cubInclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, o2::its::ExternalAllocator* alloc = nullptr) { cubInclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream, alloc); } @@ -1048,13 +1048,30 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, resolutions[iLayer], radii[iLayer + 1] - radii[iLayer], mulScatAng[iLayer]); - /// Internal thrust allocation serialize this part to a degree - /// TODO switch to cub equivelent and do all work on one stream - thrust::device_ptr tracklets_ptr(spanTracklets[iLayer]); - auto nosync_policy = THRUST_NAMESPACE::par_nosync.on(streams[iLayer].get()); - thrust::sort(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets()); - auto unique_end = thrust::unique(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets()); - nTracklets[iLayer] = unique_end - tracklets_ptr; + if (nTracklets[iLayer]) { + Tracklet *tracklets_in = spanTracklets[iLayer], *tracklets_out{nullptr}; + size_t n = nTracklets[iLayer]; + size_t sort_temp_bytes = 0; + GPUChkErrS(cub::DeviceMergeSort::SortKeys(nullptr, sort_temp_bytes, tracklets_in, n, gpu::sort_tracklets{}, streams[iLayer].get())); + void* sort_temp_storage = nullptr; + gpu::allocateMemory(&sort_temp_storage, sort_temp_bytes, streams[iLayer].get(), alloc); + GPUChkErrS(cub::DeviceMergeSort::SortKeys(sort_temp_storage, sort_temp_bytes, tracklets_in, n, gpu::sort_tracklets{}, streams[iLayer].get())); + gpu::allocateMemory(reinterpret_cast(&tracklets_out), n * sizeof(Tracklet), streams[iLayer].get(), alloc); + size_t unique_temp_bytes = 0; + int* num_selected 
= nullptr; + gpu::allocateMemory(reinterpret_cast(&num_selected), sizeof(int), streams[iLayer].get(), alloc); + GPUChkErrS(cub::DeviceSelect::Unique(nullptr, unique_temp_bytes, tracklets_in, tracklets_out, num_selected, n, streams[iLayer].get())); + void* unique_temp_storage = nullptr; + gpu::allocateMemory(&unique_temp_storage, unique_temp_bytes, streams[iLayer].get(), alloc); + GPUChkErrS(cub::DeviceSelect::Unique(unique_temp_storage, unique_temp_bytes, tracklets_in, tracklets_out, num_selected, n, streams[iLayer].get())); + GPUChkErrS(cudaMemcpyAsync(tracklets_in, tracklets_out, n * sizeof(Tracklet), cudaMemcpyDeviceToDevice, streams[iLayer].get())); + GPUChkErrS(cudaMemcpyAsync(&nTracklets[iLayer], num_selected, sizeof(int), cudaMemcpyDeviceToHost, streams[iLayer].get())); + streams[iLayer].sync(); + gpu::deallocateMemory(tracklets_out, n * sizeof(Tracklet), streams[iLayer].get(), alloc); + gpu::deallocateMemory(sort_temp_storage, sort_temp_bytes, streams[iLayer].get(), alloc); + gpu::deallocateMemory(unique_temp_storage, unique_temp_bytes, streams[iLayer].get(), alloc); + gpu::deallocateMemory(num_selected, sizeof(int), streams[iLayer].get(), alloc); + } if (iLayer > 0) { GPUChkErrS(cudaMemsetAsync(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int), streams[iLayer].get())); gpu::compileTrackletsLookupTableKernel<<>>( @@ -1215,8 +1232,35 @@ int filterCellNeighboursHandler(gpuPair* cellNeighbourPairs, gpu::Stream& stream, o2::its::ExternalAllocator* allocator) { - /// Internal thrust allocation serialize this part to a degree - /// TODO switch to cub equivelent and do all work on one stream +#ifndef __HIPCC__ + int* d_num_selected = nullptr; + gpu::allocateMemory(reinterpret_cast(&d_num_selected), sizeof(int), stream.get(), allocator); + size_t select_bytes = 0; + GPUChkErrS(cub::DeviceSelect::If(nullptr, select_bytes, cellNeighbourPairs, static_cast*>(nullptr), d_num_selected, nNeigh, gpu::is_valid_pair(), stream.get())); + void* select_temp = 
nullptr; + gpu::allocateMemory(&select_temp, select_bytes, stream.get(), allocator); + gpuPair* d_temp_valid = nullptr; + gpu::allocateMemory(reinterpret_cast(&d_temp_valid), nNeigh * sizeof(gpuPair), stream.get(), allocator); + GPUChkErrS(cub::DeviceSelect::If(select_temp, select_bytes, cellNeighbourPairs, d_temp_valid, d_num_selected, nNeigh, gpu::is_valid_pair(), stream.get())); + int newSize = 0; + GPUChkErrS(cudaMemcpyAsync(&newSize, d_num_selected, sizeof(int), cudaMemcpyDeviceToHost, stream.get())); + stream.sync(); // needed to get newSize + size_t sort_bytes = 0; + GPUChkErrS(cub::DeviceMergeSort::SortPairs(nullptr, sort_bytes, d_temp_valid, d_temp_valid, newSize, gpu::sort_by_second(), stream.get())); + void* sort_temp = nullptr; + gpu::allocateMemory(&sort_temp, sort_bytes, stream.get(), allocator); + GPUChkErrS(cub::DeviceMergeSort::SortPairs(sort_temp, sort_bytes, d_temp_valid, d_temp_valid, newSize, gpu::sort_by_second(), stream.get())); + size_t transform_bytes = 0; + GPUChkErrS(cub::DeviceTransform::Transform(nullptr, transform_bytes, d_temp_valid, cellNeighbours, newSize, gpu::pair_to_first(), stream.get())); + void* transform_temp = nullptr; + gpu::allocateMemory(&transform_temp, transform_bytes, stream.get(), allocator); + GPUChkErrS(cub::DeviceTransform::Transform(transform_temp, transform_bytes, d_temp_valid, cellNeighbours, newSize, gpu::pair_to_first(), stream.get())); + gpu::deallocateMemory(transform_temp, transform_bytes, stream.get(), allocator); + gpu::deallocateMemory(d_temp_valid, newSize * sizeof(gpuPair), stream.get(), allocator); + gpu::deallocateMemory(sort_temp, sort_bytes, stream.get(), allocator); + gpu::deallocateMemory(d_num_selected, sizeof(int), stream.get(), allocator); + gpu::deallocateMemory(select_temp, select_bytes, stream.get(), allocator); +#else // FIXME using thrust here since hipcub does not yet have DeviceTransform auto nosync_policy = THRUST_NAMESPACE::par_nosync.on(stream.get()); thrust::device_ptr> 
neighVectorPairs(cellNeighbourPairs); thrust::device_ptr validNeighs(cellNeighbours); @@ -1224,6 +1268,7 @@ int filterCellNeighboursHandler(gpuPair* cellNeighbourPairs, size_t newSize = updatedEnd - neighVectorPairs; thrust::stable_sort(nosync_policy, neighVectorPairs, neighVectorPairs + newSize, gpu::sort_by_second()); thrust::transform(nosync_policy, neighVectorPairs, neighVectorPairs + newSize, validNeighs, gpu::pair_to_first()); +#endif return newSize; }