From a41bfc43632b09168561faf53da93d40a5348ba2 Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Wed, 6 Aug 2025 14:20:09 +0200 Subject: [PATCH 1/6] ITS: GPU: put trackleting properly on different streams Signed-off-by: Felix Schlepper --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 9 +- .../ITS/tracking/GPU/ITStrackingGPU/Utils.h | 25 ++++-- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 82 ++++++++++++------- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 1 + .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 10 ++- 5 files changed, 80 insertions(+), 47 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 0ad08fd88ccf5..4656dd1d14846 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -79,6 +79,7 @@ class TimeFrameGPU : public TimeFrame return mGpuStreams[stream]; } auto& getStreams() { return mGpuStreams; } + void syncStreams(); virtual void wipe() final; /// interface @@ -108,7 +109,7 @@ class TimeFrameGPU : public TimeFrame std::vector getClusterSizes(); const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; } const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; } - Tracklet** getDeviceArrayTracklets() { return mTrackletsDeviceArray; } + Tracklet** getDeviceArrayTracklets() { return mTrackletsDevice.data(); } int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; } int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; } @@ -140,7 +141,8 @@ class TimeFrameGPU : public TimeFrame int getNumberOfNeighbours() const final; private: - void allocMemAsync(void**, size_t, Stream&, bool); // Abstract owned and unowned memory allocations + void allocMemAsync(void**, size_t, Stream&, bool); // 
Abstract owned and unowned memory allocations on specific stream + void allocMem(void**, size_t, bool); // Abstract owned and unowned memory allocations on default stream bool mHostRegistered = false; TimeFrameGPUParameters mGpuParams; @@ -167,7 +169,6 @@ class TimeFrameGPU : public TimeFrame const unsigned char** mUsedClustersDeviceArray; const int** mROFrameClustersDeviceArray; std::array mTrackletsDevice; - Tracklet** mTrackletsDeviceArray; std::array mTrackletsLUTDevice; std::array mCellsLUTDevice; std::array mNeighboursLUTDevice; @@ -195,8 +196,6 @@ class TimeFrameGPU : public TimeFrame // State Streams mGpuStreams; - size_t mAvailMemGB; - bool mFirstInit = true; // Temporary buffer for storing output tracks from GPU tracking bounded_vector mTrackITSExt; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h index 802973d5f4000..9d25bd22484fe 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h @@ -94,16 +94,21 @@ class Stream public: #if defined(__HIPCC__) using Handle = hipStream_t; - static constexpr Handle Default = 0; + static constexpr Handle DefaultStream = 0; + // static constexpr unsigned int DefaultFlag = hipStreamNonBlocking; TODO replace once ready + static constexpr unsigned int DefaultFlag = 0; #elif defined(__CUDACC__) using Handle = cudaStream_t; - static constexpr Handle Default = 0; + static constexpr Handle DefaultStream = 0; + // static constexpr unsigned int DefaultFlag = cudaStreamNonBlocking; TODO replace once ready + static constexpr unsigned int DefaultFlag = 0; #else using Handle = void*; - static constexpr Handle Default = nullptr; + static constexpr Handle DefaultStream = nullptr; + static constexpr unsigned int DefaultFlag = 0; #endif - Stream(unsigned int flags = 0) + Stream(unsigned int flags = DefaultFlag) { #if defined(__HIPCC__) GPUChkErrS(hipStreamCreateWithFlags(&mHandle, 
flags)); @@ -115,7 +120,7 @@ class Stream Stream(Handle h) : mHandle(h) {} ~Stream() { - if (mHandle != Default) { + if (mHandle != DefaultStream) { #if defined(__HIPCC__) GPUChkErrS(hipStreamDestroy(mHandle)); #elif defined(__CUDACC__) @@ -124,7 +129,7 @@ class Stream } } - operator bool() const { return mHandle != Default; } + operator bool() const { return mHandle != DefaultStream; } const Handle& get() { return mHandle; } void sync() const { @@ -136,7 +141,7 @@ class Stream } private: - Handle mHandle{Default}; + Handle mHandle{DefaultStream}; }; static_assert(sizeof(Stream) == sizeof(void*), "Stream type must match pointer type!"); @@ -150,6 +155,12 @@ class Streams void clear() { mStreams.clear(); } auto& operator[](size_t i) { return mStreams[i % mStreams.size()]; } void push_back(const Stream& stream) { mStreams.push_back(stream); } + void sync() + { + for (auto& s : mStreams) { + s.sync(); + } + } private: std::vector mStreams; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index f6d9157b0da68..dafd218c1e811 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -122,6 +122,17 @@ void TimeFrameGPU::allocMemAsync(void** ptr, size_t size, Stream& strea } } +template +void TimeFrameGPU::allocMem(void** ptr, size_t size, bool extAllocator) +{ + if (extAllocator) { + *ptr = this->mAllocator->allocate(size); + } else { + GPULog("Calling default CUDA allocator"); + GPUChkErrS(cudaMalloc(reinterpret_cast(ptr), size)); + } +} + template void TimeFrameGPU::setDevicePropagator(const o2::base::PropagatorImpl* propagator) { @@ -134,10 +145,10 @@ void TimeFrameGPU::loadIndexTableUtils(const int iteration) GPUTimer timer(mGpuStreams[0], "loading indextable utils"); if (!iteration) { GPULog("gpu-allocation: allocating IndexTableUtils buffer, for {:.2f} MB.", sizeof(IndexTableUtils) / constants::MB); - 
allocMemAsync(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), mGpuStreams[0], this->getExtAllocator()); + allocMem(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), this->getExtAllocator()); } GPULog("gpu-transfer: loading IndexTableUtils object, for {:.2f} MB.", sizeof(IndexTableUtils) / constants::MB); - GPUChkErrS(cudaMemcpyAsync(mIndexTableUtilsDevice, &(this->mIndexTableUtils), sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpy(mIndexTableUtilsDevice, &(this->mIndexTableUtils), sizeof(IndexTableUtils), cudaMemcpyHostToDevice)); } template @@ -151,9 +162,10 @@ void TimeFrameGPU::loadUnsortedClustersDevice(const int iteration) GPUChkErrS(cudaHostRegister(this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); } - allocMemAsync(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), mGpuStreams[0], this->getExtAllocator()); + mGpuStreams.sync(); + allocMem(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpy(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice)); } } @@ -164,13 +176,14 @@ void TimeFrameGPU::loadClustersDevice(const int iteration) GPUTimer timer(mGpuStreams[0], "loading sorted clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { GPULog("gpu-transfer: 
loading {} clusters on layer {}, for {:.2f} MB.", this->mClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(Cluster) / constants::MB); - allocMemAsync(reinterpret_cast(&mClustersDevice[iLayer]), this->mClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[0], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mClustersDevice[iLayer]), this->mClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[iLayer], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mClustersDevice[iLayer], this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mClustersDevice[iLayer], this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); } - allocMemAsync(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), mGpuStreams[0], this->getExtAllocator()); + mGpuStreams.sync(); + allocMem(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpy(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice)); } } @@ -181,11 +194,12 @@ void TimeFrameGPU::loadClustersIndexTables(const int iteration) GPUTimer timer(mGpuStreams[0], "loading sorted clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { GPULog("gpu-transfer: loading clusters indextable for layer {} with {} elements, for {:.2f} MB.", iLayer, this->mIndexTables[iLayer].size(), this->mIndexTables[iLayer].size() * 
sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[iLayer]), this->mIndexTables[iLayer].size() * sizeof(int), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], this->mIndexTables[iLayer].data(), this->mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[iLayer]), this->mIndexTables[iLayer].size() * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], this->mIndexTables[iLayer].data(), this->mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); } - allocMemAsync(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + mGpuStreams.sync(); + allocMem(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice)); } } @@ -196,10 +210,11 @@ void TimeFrameGPU::createUsedClustersDevice(const int iteration) GPUTimer timer(mGpuStreams[0], "creating used clusters flags"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { GPULog("gpu-transfer: creating {} used clusters flags on layer {}, for {:.2f} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mUsedClusters[iLayer].size() * sizeof(unsigned char) / constants::MB); - allocMemAsync(reinterpret_cast(&mUsedClustersDevice[iLayer]), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, 
this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get())); + allocMemAsync(reinterpret_cast(&mUsedClustersDevice[iLayer]), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[iLayer], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[iLayer].get())); } - allocMemAsync(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), mGpuStreams[0], this->getExtAllocator()); + mGpuStreams.sync(); + allocMem(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), this->getExtAllocator()); GPUChkErrS(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } } @@ -221,11 +236,12 @@ void TimeFrameGPU::loadROframeClustersDevice(const int iteration) GPUTimer timer(mGpuStreams[0], "loading ROframe clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { GPULog("gpu-transfer: loading {} ROframe clusters info on layer {}, for {:.2f} MB.", this->mROFramesClusters[iLayer].size(), iLayer, this->mROFramesClusters[iLayer].size() * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[iLayer]), this->mROFramesClusters[iLayer].size() * sizeof(int), mGpuStreams[0], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[iLayer]), this->mROFramesClusters[iLayer].size() * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); GPUChkErrS(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], this->mROFramesClusters[iLayer].data(), this->mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mROFrameClustersDeviceArray, 
mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + mGpuStreams.sync(); + allocMem(reinterpret_cast(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), this->getExtAllocator()); + GPUChkErrS(cudaMemcpy(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice)); } } @@ -236,10 +252,11 @@ void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) if (!iteration) { for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { GPULog("gpu-transfer: loading {} tfinfo on layer {}, for {:.2f} MB.", this->mTrackingFrameInfo[iLayer].size(), iLayer, this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / constants::MB); - allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[iLayer]), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), mGpuStreams[0], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[iLayer]), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), mGpuStreams[iLayer], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); } + mGpuStreams.sync(); allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), 
cudaHostRegisterPortable)); GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); @@ -253,9 +270,9 @@ void TimeFrameGPU::loadMultiplicityCutMask(const int iteration) GPUTimer timer(mGpuStreams[0], "loading multiplicity cut mask"); GPULog("gpu-transfer: iteration {} loading multiplicity cut mask with {} elements, for {:.2f} MB.", iteration, this->mMultiplicityCutMask.size(), this->mMultiplicityCutMask.size() * sizeof(bool) / constants::MB); if (!iteration) { // only allocate on first call - allocMemAsync(reinterpret_cast(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), mGpuStreams[0], this->getExtAllocator()); + allocMem(reinterpret_cast(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), this->getExtAllocator()); } - GPUChkErrS(cudaMemcpyAsync(mMultMaskDevice, this->mMultiplicityCutMask.data(), this->mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpy(mMultMaskDevice, this->mMultiplicityCutMask.data(), this->mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice)); } } @@ -265,11 +282,11 @@ void TimeFrameGPU::loadVertices(const int iteration) if (!iteration) { GPUTimer timer(mGpuStreams[0], "loading seeding vertices"); GPULog("gpu-transfer: loading {} ROframes vertices, for {:.2f} MB.", this->mROFramesPV.size(), this->mROFramesPV.size() * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mROFramesPVDevice, this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + allocMem(reinterpret_cast(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), this->getExtAllocator()); + GPUChkErrS(cudaMemcpy(mROFramesPVDevice, 
this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice)); GPULog("gpu-transfer: loading {} seeding vertices, for {:.2f} MB.", this->mPrimaryVertices.size(), this->mPrimaryVertices.size() * sizeof(Vertex) / constants::MB); - allocMemAsync(reinterpret_cast(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mPrimaryVerticesDevice, this->mPrimaryVertices.data(), this->mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + allocMem(reinterpret_cast(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), this->getExtAllocator()); + GPUChkErrS(cudaMemcpy(mPrimaryVerticesDevice, this->mPrimaryVertices.data(), this->mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice)); } } @@ -294,16 +311,13 @@ void TimeFrameGPU::createTrackletsLUTDevice(const int iteration) template void TimeFrameGPU::createTrackletsBuffers() { - GPUTimer timer(mGpuStreams[0], "creating tracklet buffers"); for (int iLayer{0}; iLayer < nLayers - 1; ++iLayer) { + GPUTimer timer(mGpuStreams[iLayer], "creating tracklet buffers"); mNTracklets[iLayer] = 0; GPUChkErrS(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + this->mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[iLayer].get())); GPULog("gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {:.2f} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / constants::MB); allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), mGpuStreams[iLayer], this->getExtAllocator()); } - allocMemAsync(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable)); - 
GPUChkErrS(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } template @@ -524,7 +538,7 @@ void TimeFrameGPU::unregisterRest() GPUTimer timer(mGpuStreams[0], "unregistering rest of the host memory"); GPULog("unregistering rest of the host memory..."); GPUChkErrS(cudaHostUnregister(mCellsDevice.data())); - GPUChkErrS(cudaHostUnregister(mTrackletsDevice.data())); + // GPUChkErrS(cudaHostUnregister(mTrackletsDevice.data())); } template @@ -553,6 +567,12 @@ void TimeFrameGPU::initialise(const int iteration, o2::its::TimeFrame::initialise(iteration, trkParam, maxLayers); } +template +void TimeFrameGPU::syncStreams() +{ + mGpuStreams.sync(); +} + template void TimeFrameGPU::wipe() { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index eaa6080761fec..9a6fe4e050519 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -116,6 +116,7 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i conf.nBlocksLayerTracklets[iteration], conf.nThreadsLayerTracklets[iteration], mTimeFrameGPU->getStreams()); + mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed } template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 02be19b1e3a08..6e678118686ab 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -876,9 +876,9 @@ GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stre void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - GPUChkErrS(cudaMalloc(&d_temp_storage, 
temp_storage_bytes)); + GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream)); GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - GPUChkErrS(cudaFree(d_temp_storage)); + GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); } template @@ -893,9 +893,9 @@ GPUhi() void cubInclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stre void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream)); GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - GPUChkErrS(cudaFree(d_temp_storage)); + GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); } template @@ -1030,6 +1030,8 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, resolutions[iLayer], radii[iLayer + 1] - radii[iLayer], mulScatAng[iLayer]); + /// Internal thrust allocation serializes this part to a degree + /// TODO switch to cub equivalent and do all work on one stream thrust::device_ptr tracklets_ptr(spanTracklets[iLayer]); auto nosync_policy = THRUST_NAMESPACE::par_nosync.on(streams[iLayer].get()); thrust::sort(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets()); From 313947653b3bd009c920dccaf1c8ae38c89745bc Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Wed, 6 Aug 2025 15:33:05 +0200 Subject: [PATCH 2/6] ITS: GPU: put cell finding on different streams Signed-off-by: Felix Schlepper --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 6 +--- .../GPU/ITStrackingGPU/TrackingKernels.h | 11 ++++++-- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 8 +++++- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 28 ++++++++++++++----- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 12 ++++----
5 files changed, 44 insertions(+), 21 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 4656dd1d14846..afe31b14a4a0a 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -73,12 +73,8 @@ class TimeFrameGPU : public TimeFrame void downloadCellsDevice(); void downloadCellsLUTDevice(); void unregisterRest(); - template - auto& getStream(const size_t stream) - { - return mGpuStreams[stream]; - } auto& getStreams() { return mGpuStreams; } + void syncStream(const size_t stream); void syncStreams(); virtual void wipe() final; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index b0fb443513fef..567aa07f42f7c 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -25,7 +25,10 @@ namespace gpu #ifdef GPUCA_GPUCODE // GPUg() global kernels must only when compiled by GPU compiler -GPUdi() int4 getEmptyBinsRect() { return int4{0, 0, 0, 0}; } +GPUdi() int4 getEmptyBinsRect() +{ + return int4{0, 0, 0, 0}; +} GPUd() bool fitTrack(TrackITSExt& track, int start, @@ -137,7 +140,8 @@ void countCellsHandler(const Cluster** sortedClusters, const float cellDeltaTanLambdaSigma, const float nSigmaCut, const int nBlocks, - const int nThreads); + const int nThreads, + gpu::Streams& streams); void computeCellsHandler(const Cluster** sortedClusters, const Cluster** unsortedClusters, @@ -155,7 +159,8 @@ void computeCellsHandler(const Cluster** sortedClusters, const float cellDeltaTanLambdaSigma, const float nSigmaCut, const int nBlocks, - const int nThreads); + const int nThreads, + gpu::Streams& streams); unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice, int* 
neighboursLUTs, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index dafd218c1e811..da91373611a2d 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -402,7 +402,7 @@ void TimeFrameGPU::createCellsLUTDevice() template void TimeFrameGPU::createCellsBuffers(const int layer) { - GPUTimer timer(mGpuStreams[0], "creating cells buffers"); + GPUTimer timer(mGpuStreams[layer], "creating cells buffers"); mNCells[layer] = 0; GPUChkErrS(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); GPULog("gpu-transfer: creating cell buffer for {} elements on layer {}, for {:.2f} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / constants::MB); @@ -567,6 +567,12 @@ void TimeFrameGPU::initialise(const int iteration, o2::its::TimeFrame::initialise(iteration, trkParam, maxLayers); } +template +void TimeFrameGPU::syncStream(const size_t stream) +{ + mGpuStreams[stream].sync(); +} + template void TimeFrameGPU::syncStreams() { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 9a6fe4e050519..2ae4f0aa91819 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -116,7 +116,6 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i conf.nBlocksLayerTracklets[iteration], conf.nThreadsLayerTracklets[iteration], mTimeFrameGPU->getStreams()); - mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed } template @@ -125,18 +124,30 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) mTimeFrameGPU->createCellsLUTDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); + std::vector 
isTrackletStreamSynched(this->mTrkParams[iteration].TrackletsPerRoad()); + auto syncOnce = [&](const int iLayer) { + if (!isTrackletStreamSynched[iLayer]) { + mTimeFrameGPU->syncStream(iLayer); + isTrackletStreamSynched[iLayer] = true; + } + }; + for (int iLayer = 0; iLayer < this->mTrkParams[iteration].CellsPerRoad(); ++iLayer) { - if (!mTimeFrameGPU->getNTracklets()[iLayer + 1] || !mTimeFrameGPU->getNTracklets()[iLayer]) { + // need to ensure that trackleting on layers iLayer and iLayer + 1 are done (only once) + syncOnce(iLayer); + syncOnce(iLayer + 1); + // if there are no tracklets skip entirely + const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getNTracklets()[iLayer])}; + if (!currentLayerTrackletsNum || !mTimeFrameGPU->getNTracklets()[iLayer + 1]) { mTimeFrameGPU->getNCells()[iLayer] = 0; continue; } - const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getNTracklets()[iLayer])}; countCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getDeviceArrayUnsortedClusters(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), mTimeFrameGPU->getDeviceArrayTracklets(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getNTracklets()[iLayer], + currentLayerTrackletsNum, iLayer, nullptr, mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -147,14 +158,15 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) this->mTrkParams[iteration].CellDeltaTanLambdaSigma, this->mTrkParams[iteration].NSigmaCut, conf.nBlocksLayerCells[iteration], - conf.nThreadsLayerCells[iteration]); + conf.nThreadsLayerCells[iteration], + mTimeFrameGPU->getStreams()); mTimeFrameGPU->createCellsBuffers(iLayer); computeCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getDeviceArrayUnsortedClusters(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), mTimeFrameGPU->getDeviceArrayTracklets(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getNTracklets()[iLayer], + currentLayerTrackletsNum, iLayer, 
mTimeFrameGPU->getDeviceCells()[iLayer], mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -165,8 +177,10 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) this->mTrkParams[iteration].CellDeltaTanLambdaSigma, this->mTrkParams[iteration].NSigmaCut, conf.nBlocksLayerCells[iteration], - conf.nThreadsLayerCells[iteration]); + conf.nThreadsLayerCells[iteration], + mTimeFrameGPU->getStreams()); } + mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed } template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 6e678118686ab..d4dcda067b26f 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -1065,9 +1065,10 @@ void countCellsHandler( const float cellDeltaTanLambdaSigma, const float nSigmaCut, const int nBlocks, - const int nThreads) + const int nThreads, + gpu::Streams& streams) { - gpu::computeLayerCellsKernel<<>>( + gpu::computeLayerCellsKernel<<>>( sortedClusters, // const Cluster** unsortedClusters, // const Cluster** tfInfo, // const TrackingFrameInfo** @@ -1082,7 +1083,7 @@ void countCellsHandler( maxChi2ClusterAttachment, // const float cellDeltaTanLambdaSigma, // const float nSigmaCut); // const float - gpu::cubExclusiveScanInPlace(cellsLUTsHost, nTracklets + 1); + gpu::cubExclusiveScanInPlace(cellsLUTsHost, nTracklets + 1, streams[layer].get()); } void computeCellsHandler( @@ -1102,9 +1103,10 @@ void computeCellsHandler( const float cellDeltaTanLambdaSigma, const float nSigmaCut, const int nBlocks, - const int nThreads) + const int nThreads, + gpu::Streams& streams) { - gpu::computeLayerCellsKernel<<>>( + gpu::computeLayerCellsKernel<<>>( sortedClusters, // const Cluster** unsortedClusters, // const Cluster** tfInfo, // const TrackingFrameInfo** From 85287db75519f8b1459d202a7d1df01669068619 Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Wed, 6 Aug 2025 16:28:15 
+0200 Subject: [PATCH 3/6] ITS: GPU: put cell neighbour finding on different streams Signed-off-by: Felix Schlepper --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 12 ++-- .../GPU/ITStrackingGPU/TrackingKernels.h | 35 +++++----- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 59 ++++------------- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 66 +++++++++++-------- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 58 ++++++++-------- 5 files changed, 101 insertions(+), 129 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index afe31b14a4a0a..5c10b01412b4e 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -62,17 +62,15 @@ class TimeFrameGPU : public TimeFrame void createCellsDevice(); void createCellsLUTDevice(); void createNeighboursIndexTablesDevice(); - void createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours); - void createNeighboursDevice(const unsigned int layer, std::vector>& neighbours); + void createNeighboursDevice(const unsigned int layer); void createNeighboursLUTDevice(const int, const unsigned int); - void createNeighboursDeviceArray(); void createTrackITSExtDevice(bounded_vector&); void downloadTrackITSExtDevice(bounded_vector&); void downloadCellsNeighboursDevice(std::vector>>&, const int); void downloadNeighboursLUTDevice(bounded_vector&, const int); void downloadCellsDevice(); void downloadCellsLUTDevice(); - void unregisterRest(); + auto& getStream(const size_t stream) { return mGpuStreams[stream]; } auto& getStreams() { return mGpuStreams; } void syncStream(const size_t stream); void syncStreams(); @@ -96,7 +94,7 @@ class TimeFrameGPU : public TimeFrame gpuPair* getDeviceNeighbourPairs(const int layer) { return mNeighbourPairsDevice[layer]; } std::array& getDeviceNeighboursAll() { return mNeighboursDevice; } int* 
getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; } - int** getDeviceNeighboursArray() { return mNeighboursDeviceArray; } + int** getDeviceNeighboursArray() { return mNeighboursDevice.data(); } TrackingFrameInfo* getDeviceTrackingFrameInfo(const int); const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; } const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; } @@ -109,7 +107,7 @@ class TimeFrameGPU : public TimeFrame int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; } int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; } - CellSeed** getDeviceArrayCells() const { return mCellsDeviceArray; } + CellSeed** getDeviceArrayCells() { return mCellsDevice.data(); } CellSeed* getDeviceTrackSeeds() { return mTrackSeedsDevice; } o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; } float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; } @@ -176,7 +174,6 @@ class TimeFrameGPU : public TimeFrame std::array mCellsDevice; std::array mNeighboursIndexTablesDevice; CellSeed* mTrackSeedsDevice; - CellSeed** mCellsDeviceArray; std::array mCellSeedsDevice; o2::track::TrackParCovF** mCellSeedsDeviceArray; std::array mCellSeedsChi2Device; @@ -186,7 +183,6 @@ class TimeFrameGPU : public TimeFrame TrackITSExt* mTrackITSExtDevice; std::array*, nLayers - 2> mNeighbourPairsDevice; std::array mNeighboursDevice; - int** mNeighboursDeviceArray; std::array mTrackingFrameInfoDevice; const TrackingFrameInfo** mTrackingFrameInfoDeviceArray; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 567aa07f42f7c..caa7675756db6 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ 
b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -162,21 +162,22 @@ void computeCellsHandler(const Cluster** sortedClusters, const int nThreads, gpu::Streams& streams); -unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice, - int* neighboursLUTs, - int** cellsLUTs, - gpuPair* cellNeighbours, - int* neighboursIndexTable, - const Tracklet** tracklets, - const int deltaROF, - const float maxChi2ClusterAttachment, - const float bz, - const int layerIndex, - const unsigned int nCells, - const unsigned int nCellsNext, - const int maxCellNeighbours, - const int nBlocks, - const int nThreads); +void countCellNeighboursHandler(CellSeed** cellsLayersDevice, + int* neighboursLUTs, + int** cellsLUTs, + gpuPair* cellNeighbours, + int* neighboursIndexTable, + const Tracklet** tracklets, + const int deltaROF, + const float maxChi2ClusterAttachment, + const float bz, + const int layerIndex, + const unsigned int nCells, + const unsigned int nCellsNext, + const int maxCellNeighbours, + const int nBlocks, + const int nThreads, + gpu::Stream& stream); void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, int* neighboursLUTs, @@ -192,11 +193,13 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, const unsigned int nCellsNext, const int maxCellNeighbours, const int nBlocks, - const int nThreads); + const int nThreads, + gpu::Stream& stream); int filterCellNeighboursHandler(gpuPair*, int*, unsigned int, + gpu::Stream&, o2::its::ExternalAllocator* = nullptr); template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index da91373611a2d..72a1f98d1b78b 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -349,26 +349,20 @@ void TimeFrameGPU::createNeighboursIndexTablesDevice() { GPUTimer timer(mGpuStreams[0], "creating cells neighbours"); // Here we do also the creation of 
the CellsDeviceArray, as the cells buffers are populated separately in the previous steps. - allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { GPULog("gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {:.2f} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get())); - if (iLayer < nLayers - 3) { - mNNeighbours[iLayer] = 0; - } + allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer].get())); } } template void TimeFrameGPU::createNeighboursLUTDevice(const int layer, const unsigned int nCells) { - GPUTimer timer(mGpuStreams[0], "reserving neighboursLUT"); + GPUTimer timer(mGpuStreams[layer], "reserving neighboursLUT"); GPULog("gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {:.2f} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); // We need one element more to move exc -> inc - GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * 
sizeof(int), mGpuStreams[0].get())); + allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); // We need one element more to move exc -> inc + GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[layer].get())); } template @@ -382,8 +376,6 @@ void TimeFrameGPU::loadCellsDevice() GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer].get())); GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], this->mCells[iLayer].data(), this->mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); } - allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } template @@ -441,35 +433,15 @@ void TimeFrameGPU::loadTrackSeedsDevice(bounded_vector& seeds } template -void TimeFrameGPU::createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours) +void TimeFrameGPU::createNeighboursDevice(const unsigned int layer) { - GPUTimer timer(mGpuStreams[0], "reserving neighbours"); + GPUTimer timer(mGpuStreams[layer], "reserving neighbours"); + GPUChkErrS(cudaMemcpyAsync(&(this->mNNeighbours[layer]), &(mNeighboursLUTDevice[layer][this->mNCells[layer + 1] - 1]), sizeof(unsigned int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); GPULog("gpu-allocation: reserving {} neighbours (pairs), for {:.2f} MB.", nNeighbours, nNeighbours * sizeof(gpuPair) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), nNeighbours * sizeof(gpuPair), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair), 
mGpuStreams[0].get())); + allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), (this->mNNeighbours[layer]) * sizeof(gpuPair), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, (this->mNNeighbours[layer]) * sizeof(gpuPair), mGpuStreams[layer].get())); GPULog("gpu-allocation: reserving {} neighbours, for {:.2f} MB.", nNeighbours, nNeighbours * sizeof(gpuPair) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), nNeighbours * sizeof(int), mGpuStreams[0], this->getExtAllocator()); -} - -template -void TimeFrameGPU::createNeighboursDevice(const unsigned int layer, std::vector>& neighbours) -{ - GPUTimer timer(mGpuStreams[0], "reserving neighbours"); - this->mCellsNeighbours[layer].clear(); - this->mCellsNeighbours[layer].resize(neighbours.size()); - GPULog("gpu-allocation: reserving {} neighbours (pairs), for {:.2f} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair), mGpuStreams[0].get())); - GPULog("gpu-allocation: reserving {} neighbours, for {:.2f} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), mGpuStreams[0], this->getExtAllocator()); -} - -template -void TimeFrameGPU::createNeighboursDeviceArray() -{ - GPUTimer timer(mGpuStreams[0], "reserving neighbours"); - allocMemAsync(reinterpret_cast(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + 
allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), (this->mNNeighbours[layer]) * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); } template @@ -532,15 +504,6 @@ void TimeFrameGPU::downloadTrackITSExtDevice(bounded_vector& GPUChkErrS(cudaHostUnregister(seeds.data())); } -template -void TimeFrameGPU::unregisterRest() -{ - GPUTimer timer(mGpuStreams[0], "unregistering rest of the host memory"); - GPULog("unregistering rest of the host memory..."); - GPUChkErrS(cudaHostUnregister(mCellsDevice.data())); - // GPUChkErrS(cudaHostUnregister(mTrackletsDevice.data())); -} - template void TimeFrameGPU::unregisterHostMemory(const int maxLayers) { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 2ae4f0aa91819..0fa258f63fd23 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -180,7 +180,6 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) conf.nThreadsLayerCells[iteration], mTimeFrameGPU->getStreams()); } - mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed } template @@ -188,7 +187,20 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) { mTimeFrameGPU->createNeighboursIndexTablesDevice(); const auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); + + std::vector isCellStreamSynched(this->mTrkParams[iteration].TrackletsPerRoad() - 1); + auto syncOnce = [&](const int iLayer) { + if (!isCellStreamSynched[iLayer]) { + mTimeFrameGPU->syncStream(iLayer); + isCellStreamSynched[iLayer] = true; + } + }; + for (int iLayer{0}; iLayer < this->mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { + // ensure that celling is done for iLayer and iLayer+1 is done + syncOnce(iLayer); + syncOnce(iLayer + 1); + const int currentLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer])}; const int 
nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer + 1])}; if (!nextLayerCellsNum || !currentLayerCellsNum) { @@ -197,24 +209,23 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) } mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum); - unsigned int nNeigh = countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), - mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here. - mTimeFrameGPU->getDeviceArrayCellsLUT(), - mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), - mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer), - (const Tracklet**)mTimeFrameGPU->getDeviceArrayTracklets(), - this->mTrkParams[0].DeltaROF, - this->mTrkParams[0].MaxChi2ClusterAttachment, - this->mBz, - iLayer, - currentLayerCellsNum, - nextLayerCellsNum, - 1e2, - conf.nBlocksFindNeighbours[iteration], - conf.nThreadsFindNeighbours[iteration]); - - mTimeFrameGPU->createNeighboursDevice(iLayer, nNeigh); - + countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), + mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here. 
+ mTimeFrameGPU->getDeviceArrayCellsLUT(), + mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), + mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer), + (const Tracklet**)mTimeFrameGPU->getDeviceArrayTracklets(), + this->mTrkParams[0].DeltaROF, + this->mTrkParams[0].MaxChi2ClusterAttachment, + this->mBz, + iLayer, + currentLayerCellsNum, + nextLayerCellsNum, + 1e2, + conf.nBlocksFindNeighbours[iteration], + conf.nThreadsFindNeighbours[iteration], + mTimeFrameGPU->getStream(iLayer)); + mTimeFrameGPU->createNeighboursDevice(iLayer); computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -229,16 +240,15 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) nextLayerCellsNum, 1e2, conf.nBlocksFindNeighbours[iteration], - conf.nThreadsFindNeighbours[iteration]); - - nNeigh = filterCellNeighboursHandler(mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), - mTimeFrameGPU->getDeviceNeighbours(iLayer), - nNeigh, - mTimeFrameGPU->getExternalAllocator()); - mTimeFrameGPU->getArrayNNeighbours()[iLayer] = nNeigh; + conf.nThreadsFindNeighbours[iteration], + mTimeFrameGPU->getStream(iLayer)); + mTimeFrameGPU->getArrayNNeighbours()[iLayer] = filterCellNeighboursHandler(mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), + mTimeFrameGPU->getDeviceNeighbours(iLayer), + mTimeFrameGPU->getArrayNNeighbours()[iLayer], + mTimeFrameGPU->getStream(iLayer), + mTimeFrameGPU->getExternalAllocator()); } - mTimeFrameGPU->createNeighboursDeviceArray(); - mTimeFrameGPU->unregisterRest(); + mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed }; template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index d4dcda067b26f..060f150bd6f42 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -1123,23 +1123,24 @@ 
void computeCellsHandler( nSigmaCut); // const float } -unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice, - int* neighboursLUT, - int** cellsLUTs, - gpuPair* cellNeighbours, - int* neighboursIndexTable, - const Tracklet** tracklets, - const int deltaROF, - const float maxChi2ClusterAttachment, - const float bz, - const int layerIndex, - const unsigned int nCells, - const unsigned int nCellsNext, - const int maxCellNeighbours, - const int nBlocks, - const int nThreads) +void countCellNeighboursHandler(CellSeed** cellsLayersDevice, + int* neighboursLUT, + int** cellsLUTs, + gpuPair* cellNeighbours, + int* neighboursIndexTable, + const Tracklet** tracklets, + const int deltaROF, + const float maxChi2ClusterAttachment, + const float bz, + const int layerIndex, + const unsigned int nCells, + const unsigned int nCellsNext, + const int maxCellNeighbours, + const int nBlocks, + const int nThreads, + gpu::Stream& stream) { - gpu::computeLayerCellNeighboursKernel<<>>( + gpu::computeLayerCellNeighboursKernel<<>>( cellsLayersDevice, neighboursLUT, neighboursIndexTable, @@ -1152,11 +1153,8 @@ unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice, layerIndex, nCells, maxCellNeighbours); - gpu::cubInclusiveScanInPlace(neighboursLUT, nCellsNext); - gpu::cubExclusiveScanInPlace(neighboursIndexTable, nCells + 1); - unsigned int nNeighbours; - GPUChkErrS(cudaMemcpy(&nNeighbours, &neighboursLUT[nCellsNext - 1], sizeof(unsigned int), cudaMemcpyDeviceToHost)); - return nNeighbours; + gpu::cubInclusiveScanInPlace(neighboursLUT, nCellsNext, stream.get()); + gpu::cubExclusiveScanInPlace(neighboursIndexTable, nCells + 1, stream.get()); } void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, @@ -1173,10 +1171,10 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, const unsigned int nCellsNext, const int maxCellNeighbours, const int nBlocks, - const int nThreads) + const int nThreads, + gpu::Stream& stream) { - - 
gpu::computeLayerCellNeighboursKernel<<>>( + gpu::computeLayerCellNeighboursKernel<<>>( cellsLayersDevice, neighboursLUT, neighboursIndexTable, @@ -1189,21 +1187,23 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, layerIndex, nCells, maxCellNeighbours); - GPUChkErrS(cudaPeekAtLastError()); - GPUChkErrS(cudaDeviceSynchronize()); } int filterCellNeighboursHandler(gpuPair* cellNeighbourPairs, int* cellNeighbours, unsigned int nNeigh, + gpu::Stream& stream, o2::its::ExternalAllocator* allocator) { + /// Internal thrust allocation serialize this part to a degree + /// TODO switch to cub equivelent and do all work on one stream + auto nosync_policy = THRUST_NAMESPACE::par_nosync.on(stream.get()); thrust::device_ptr> neighVectorPairs(cellNeighbourPairs); thrust::device_ptr validNeighs(cellNeighbours); - auto updatedEnd = thrust::remove_if(neighVectorPairs, neighVectorPairs + nNeigh, gpu::is_invalid_pair()); + auto updatedEnd = thrust::remove_if(nosync_policy, neighVectorPairs, neighVectorPairs + nNeigh, gpu::is_invalid_pair()); size_t newSize = updatedEnd - neighVectorPairs; - thrust::stable_sort(neighVectorPairs, neighVectorPairs + newSize, gpu::sort_by_second()); - thrust::transform(neighVectorPairs, neighVectorPairs + newSize, validNeighs, gpu::pair_to_first()); + thrust::stable_sort(nosync_policy, neighVectorPairs, neighVectorPairs + newSize, gpu::sort_by_second()); + thrust::transform(nosync_policy, neighVectorPairs, neighVectorPairs + newSize, validNeighs, gpu::pair_to_first()); return newSize; } From 5fc126f4c30410de4d25b44c944a538a6151255c Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Thu, 7 Aug 2025 08:00:13 +0200 Subject: [PATCH 4/6] ITS: GPU: use external allocator for temp storage Signed-off-by: Felix Schlepper --- .../GPU/ITStrackingGPU/TrackingKernels.h | 6 +- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 6 +- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 67 ++++++++++++------- 3 files changed, 54 insertions(+), 25 deletions(-) diff 
--git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index caa7675756db6..a7bf4c70bc5c2 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -86,6 +86,7 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, bounded_vector& resolutions, std::vector& radii, bounded_vector& mulScatAng, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Streams& streams); @@ -120,6 +121,7 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, bounded_vector& resolutions, std::vector& radii, bounded_vector& mulScatAng, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Streams& streams); @@ -139,6 +141,7 @@ void countCellsHandler(const Cluster** sortedClusters, const float maxChi2ClusterAttachment, const float cellDeltaTanLambdaSigma, const float nSigmaCut, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Streams& streams); @@ -175,6 +178,7 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice, const unsigned int nCells, const unsigned int nCellsNext, const int maxCellNeighbours, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Stream& stream); @@ -213,12 +217,12 @@ void processNeighboursHandler(const int startLayer, gsl::span neighboursDeviceLUTs, const TrackingFrameInfo** foundTrackingFrameInfo, bounded_vector& seedsHost, - o2::its::ExternalAllocator*, const float bz, const float MaxChi2ClusterAttachment, const float maxChi2NDF, const o2::base::Propagator* propagator, const o2::base::PropagatorF::MatCorrType matCorrType, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 
0fa258f63fd23..e7135e55a16c8 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -80,6 +80,7 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i mTimeFrameGPU->getPositionResolutions(), this->mTrkParams[iteration].LayerRadii, mTimeFrameGPU->getMSangles(), + mTimeFrameGPU->getExternalAllocator(), conf.nBlocksLayerTracklets[iteration], conf.nThreadsLayerTracklets[iteration], mTimeFrameGPU->getStreams()); @@ -113,6 +114,7 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i mTimeFrameGPU->getPositionResolutions(), this->mTrkParams[iteration].LayerRadii, mTimeFrameGPU->getMSangles(), + mTimeFrameGPU->getExternalAllocator(), conf.nBlocksLayerTracklets[iteration], conf.nThreadsLayerTracklets[iteration], mTimeFrameGPU->getStreams()); @@ -157,6 +159,7 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) this->mTrkParams[iteration].MaxChi2ClusterAttachment, this->mTrkParams[iteration].CellDeltaTanLambdaSigma, this->mTrkParams[iteration].NSigmaCut, + mTimeFrameGPU->getExternalAllocator(), conf.nBlocksLayerCells[iteration], conf.nThreadsLayerCells[iteration], mTimeFrameGPU->getStreams()); @@ -222,6 +225,7 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) currentLayerCellsNum, nextLayerCellsNum, 1e2, + mTimeFrameGPU->getExternalAllocator(), conf.nBlocksFindNeighbours[iteration], conf.nThreadsFindNeighbours[iteration], mTimeFrameGPU->getStream(iLayer)); @@ -272,12 +276,12 @@ void TrackerTraitsGPU::findRoads(const int iteration) mTimeFrameGPU->getDeviceNeighboursLUTs(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), trackSeeds, - mTimeFrameGPU->getExternalAllocator(), this->mBz, this->mTrkParams[0].MaxChi2ClusterAttachment, this->mTrkParams[0].MaxChi2NDF, mTimeFrameGPU->getDevicePropagator(), this->mTrkParams[0].CorrType, + mTimeFrameGPU->getExternalAllocator(), conf.nBlocksProcessNeighbours[iteration], 
conf.nThreadsProcessNeighbours[iteration]); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 060f150bd6f42..b8eeae12691a3 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -871,37 +871,53 @@ GPUg() void printCellSeeds(CellSeed* seed, int nCells, const unsigned int tId = } template -GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr) +GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) { void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream)); + if (alloc) { + d_temp_storage = alloc->allocate(temp_storage_bytes); + } else { + GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream)); + } GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); + if (alloc) { + alloc->deallocate(reinterpret_cast(d_temp_storage), temp_storage_bytes); + } else { + GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); + } } template -GPUhi() void cubExclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr) +GPUhi() void cubExclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) { - cubExclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream); + cubExclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream, alloc); } template -GPUhi() void cubInclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr) +GPUhi() void 
cubInclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) { void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream)); + if (alloc) { + d_temp_storage = alloc->allocate(temp_storage_bytes); + } else { + GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream)); + } GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); + if (alloc) { + alloc->deallocate(reinterpret_cast(d_temp_storage), temp_storage_bytes); + } else { + GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); + } } template -GPUhi() void cubInclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr) +GPUhi() void cubInclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) { - cubInclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream); + cubInclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream, alloc); } } // namespace gpu @@ -932,6 +948,7 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, bounded_vector& resolutions, std::vector& radii, bounded_vector& mulScatAng, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Streams& streams) @@ -964,7 +981,7 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, resolutions[iLayer], radii[iLayer + 1] - radii[iLayer], mulScatAng[iLayer]); - gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get()); + gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get(), alloc); } } @@ -998,6 +1015,7 @@ void 
computeTrackletsInROFsHandler(const IndexTableUtils* utils, bounded_vector& resolutions, std::vector& radii, bounded_vector& mulScatAng, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Streams& streams) @@ -1043,7 +1061,7 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]); - gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get()); + gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get(), alloc); } } } @@ -1064,6 +1082,7 @@ void countCellsHandler( const float maxChi2ClusterAttachment, const float cellDeltaTanLambdaSigma, const float nSigmaCut, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Streams& streams) @@ -1083,7 +1102,7 @@ void countCellsHandler( maxChi2ClusterAttachment, // const float cellDeltaTanLambdaSigma, // const float nSigmaCut); // const float - gpu::cubExclusiveScanInPlace(cellsLUTsHost, nTracklets + 1, streams[layer].get()); + gpu::cubExclusiveScanInPlace(cellsLUTsHost, nTracklets + 1, streams[layer].get(), alloc); } void computeCellsHandler( @@ -1136,6 +1155,7 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice, const unsigned int nCells, const unsigned int nCellsNext, const int maxCellNeighbours, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Stream& stream) @@ -1153,8 +1173,8 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice, layerIndex, nCells, maxCellNeighbours); - gpu::cubInclusiveScanInPlace(neighboursLUT, nCellsNext, stream.get()); - gpu::cubExclusiveScanInPlace(neighboursIndexTable, nCells + 1, stream.get()); + gpu::cubInclusiveScanInPlace(neighboursLUT, nCellsNext, stream.get(), alloc); + gpu::cubExclusiveScanInPlace(neighboursIndexTable, nCells + 1, stream.get(), alloc); } void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, @@ 
-1219,19 +1239,18 @@ void processNeighboursHandler(const int startLayer, gsl::span neighboursDeviceLUTs, const TrackingFrameInfo** foundTrackingFrameInfo, bounded_vector& seedsHost, - o2::its::ExternalAllocator* allocator, const float bz, const float maxChi2ClusterAttachment, const float maxChi2NDF, const o2::base::Propagator* propagator, const o2::base::PropagatorF::MatCorrType matCorrType, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads) { - auto allocInt = gpu::TypedAllocator(allocator); - auto allocCellSeed = gpu::TypedAllocator(allocator); - thrust::device_vector> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt); // Shortcut: device_vector skips central memory management, we are relying on the contingency. - // TODO: fix this. + auto allocInt = gpu::TypedAllocator(alloc); + auto allocCellSeed = gpu::TypedAllocator(alloc); + thrust::device_vector> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt); gpu::processNeighboursKernel<<>>( startLayer, @@ -1251,7 +1270,7 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); - gpu::cubExclusiveScanInPlace(foundSeedsTable, nCells[startLayer] + 1); + gpu::cubExclusiveScanInPlace(foundSeedsTable, nCells[startLayer] + 1, gpu::Stream::DefaultStream, alloc); thrust::device_vector> updatedCellId(foundSeedsTable.back(), 0, allocInt); thrust::device_vector> updatedCellSeed(foundSeedsTable.back(), allocCellSeed); @@ -1306,7 +1325,7 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); - gpu::cubExclusiveScanInPlace(foundSeedsTable, foundSeedsTable.size()); + gpu::cubExclusiveScanInPlace(foundSeedsTable, foundSeedsTable.size(), gpu::Stream::DefaultStream, alloc); auto foundSeeds{foundSeedsTable.back()}; updatedCellId.resize(foundSeeds); @@ -1402,6 +1421,7 @@ template void countTrackletsInROFsHandler<7>(const IndexTableUtils* utils, bounded_vector& resolutions, std::vector& radii, bounded_vector& 
mulScatAng, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Streams& streams); @@ -1435,6 +1455,7 @@ template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, bounded_vector& resolutions, std::vector& radii, bounded_vector& mulScatAng, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads, gpu::Streams& streams); @@ -1449,12 +1470,12 @@ template void processNeighboursHandler<7>(const int startLayer, gsl::span neighboursDeviceLUTs, const TrackingFrameInfo** foundTrackingFrameInfo, bounded_vector& seedsHost, - o2::its::ExternalAllocator*, const float bz, const float maxChi2ClusterAttachment, const float maxChi2NDF, const o2::base::Propagator* propagator, const o2::base::PropagatorF::MatCorrType matCorrType, + o2::its::ExternalAllocator* alloc, const int nBlocks, const int nThreads); } // namespace o2::its From a2993298ccd492f2e72fb6a9f057dde1d6cefe6d Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Thu, 7 Aug 2025 08:06:04 +0200 Subject: [PATCH 5/6] ITS: GPU: simplify stream synchronization Signed-off-by: Felix Schlepper --- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 26 ++++--------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index e7135e55a16c8..bef271a1b0129 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -126,18 +126,10 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) mTimeFrameGPU->createCellsLUTDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - std::vector isTrackletStreamSynched(this->mTrkParams[iteration].TrackletsPerRoad()); - auto syncOnce = [&](const int iLayer) { - if (!isTrackletStreamSynched[iLayer]) { - mTimeFrameGPU->syncStream(iLayer); - isTrackletStreamSynched[iLayer] = true; - } - }; - + 
mTimeFrameGPU->syncStream(0); for (int iLayer = 0; iLayer < this->mTrkParams[iteration].CellsPerRoad(); ++iLayer) { - // need to ensure that trackleting on layers iLayer and iLayer + 1 are done (only once) - syncOnce(iLayer); - syncOnce(iLayer + 1); + mTimeFrameGPU->syncStream(iLayer + 1); + // if there are no tracklets skip entirely const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getNTracklets()[iLayer])}; if (!currentLayerTrackletsNum || !mTimeFrameGPU->getNTracklets()[iLayer + 1]) { @@ -191,18 +183,10 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) mTimeFrameGPU->createNeighboursIndexTablesDevice(); const auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - std::vector isCellStreamSynched(this->mTrkParams[iteration].TrackletsPerRoad() - 1); - auto syncOnce = [&](const int iLayer) { - if (!isCellStreamSynched[iLayer]) { - mTimeFrameGPU->syncStream(iLayer); - isCellStreamSynched[iLayer] = true; - } - }; - + mTimeFrameGPU->syncStream(0); for (int iLayer{0}; iLayer < this->mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { // ensure that celling is done for iLayer and iLayer+1 is done - syncOnce(iLayer); - syncOnce(iLayer + 1); + mTimeFrameGPU->syncStream(iLayer + 1); const int currentLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer])}; const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer + 1])}; From 6e7242de0ecb985bcf969b4e6d2aef5e85751ae1 Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Fri, 8 Aug 2025 12:55:50 +0200 Subject: [PATCH 6/6] ITS: GPU: resolve added TODOs Signed-off-by: Felix Schlepper --- .../ITS/tracking/GPU/ITStrackingGPU/Utils.h | 6 +- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 119 ++++++++++++------ 2 files changed, 84 insertions(+), 41 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h index 9d25bd22484fe..cd860c47ebd9c 100644 --- 
a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h @@ -95,13 +95,11 @@ class Stream #if defined(__HIPCC__) using Handle = hipStream_t; static constexpr Handle DefaultStream = 0; - // static constexpr unsigned int DefaultFlag = hipStreamNonBlocking; TODO replace once ready - static constexpr unsigned int DefaultFlag = 0; + static constexpr unsigned int DefaultFlag = hipStreamNonBlocking; #elif defined(__CUDACC__) using Handle = cudaStream_t; static constexpr Handle DefaultStream = 0; - // static constexpr unsigned int DefaultFlag = cudaStreamNonBlocking; TODO replace once ready - static constexpr unsigned int DefaultFlag = 0; + static constexpr unsigned int DefaultFlag = cudaStreamNonBlocking; #else using Handle = void*; static constexpr Handle DefaultStream = nullptr; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index b8eeae12691a3..401d98ad63560 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -145,9 +145,7 @@ GPUd() bool fitTrack(TrackITSExt& track, if (matCorrType == o2::base::PropagatorF::MatCorrType::USEMatCorrNONE) { const float xx0 = (iLayer > 2) ? 
1.e-2f : 5.e-3f; // Rough layer thickness - constexpr float radiationLength = 9.36f; // Radiation length of Si [cm] - constexpr float density = 2.33f; // Density of Si [g/cm^3] - if (!track.correctForMaterial(xx0, xx0 * radiationLength * density, true)) { + if (!track.correctForMaterial(xx0, xx0 * constants::Radl * constants::Rho, true)) { return false; } } @@ -728,13 +726,13 @@ GPUg() void processNeighboursKernel(const int layer, if (!seed.o2::track::TrackParCov::update(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)) { continue; } - seed.getClusters()[layer - 1] = neighbourCell.getFirstClusterIndex(); - seed.setLevel(neighbourCell.getLevel()); - seed.setFirstTrackletIndex(neighbourCell.getFirstTrackletIndex()); - seed.setSecondTrackletIndex(neighbourCell.getSecondTrackletIndex()); if constexpr (dryRun) { foundSeedsTable[iCurrentCell]++; } else { + seed.getClusters()[layer - 1] = neighbourCell.getFirstClusterIndex(); + seed.setLevel(neighbourCell.getLevel()); + seed.setFirstTrackletIndex(neighbourCell.getFirstTrackletIndex()); + seed.setSecondTrackletIndex(neighbourCell.getSecondTrackletIndex()); updatedCellsIds[foundSeedsTable[iCurrentCell] + foundSeeds] = neighbourCellId; updatedCellSeeds[foundSeedsTable[iCurrentCell] + foundSeeds] = seed; } @@ -870,25 +868,35 @@ GPUg() void printCellSeeds(CellSeed* seed, int nCells, const unsigned int tId = } } -template -GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) +GPUhi() void allocateMemory(void** p, size_t bytes, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) { - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); if (alloc) { - d_temp_storage = alloc->allocate(temp_storage_bytes); + *p = alloc->allocate(bytes); } else { - GPUChkErrS(cudaMallocAsync(&d_temp_storage, 
temp_storage_bytes, stream)); + GPUChkErrS(cudaMallocAsync(p, bytes, stream)); } - GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); +} + +GPUhi() void deallocateMemory(void* p, size_t bytes, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) +{ if (alloc) { - alloc->deallocate(reinterpret_cast(d_temp_storage), temp_storage_bytes); + alloc->deallocate(reinterpret_cast(p), bytes); } else { - GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); + GPUChkErrS(cudaFreeAsync(p, stream)); } } +template +GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) +{ + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); + allocateMemory(&d_temp_storage, temp_storage_bytes, stream, alloc); + GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); + deallocateMemory(d_temp_storage, temp_storage_bytes, stream, alloc); +} + template GPUhi() void cubExclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) { @@ -901,21 +909,13 @@ GPUhi() void cubInclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stre void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - if (alloc) { - d_temp_storage = alloc->allocate(temp_storage_bytes); - } else { - GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream)); - } + allocateMemory(&d_temp_storage, temp_storage_bytes, stream, alloc); GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - if (alloc) { - alloc->deallocate(reinterpret_cast(d_temp_storage), 
temp_storage_bytes); - } else { - GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); - } + deallocateMemory(d_temp_storage, temp_storage_bytes, stream, alloc); } template -GPUhi() void cubInclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) +GPUhi() void cubInclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, o2::its::ExternalAllocator* alloc = nullptr) { cubInclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream, alloc); } @@ -1048,13 +1048,30 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, resolutions[iLayer], radii[iLayer + 1] - radii[iLayer], mulScatAng[iLayer]); - /// Internal thrust allocation serialize this part to a degree - /// TODO switch to cub equivelent and do all work on one stream - thrust::device_ptr tracklets_ptr(spanTracklets[iLayer]); - auto nosync_policy = THRUST_NAMESPACE::par_nosync.on(streams[iLayer].get()); - thrust::sort(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets()); - auto unique_end = thrust::unique(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets()); - nTracklets[iLayer] = unique_end - tracklets_ptr; + if (nTracklets[iLayer]) { + Tracklet *tracklets_in = spanTracklets[iLayer], *tracklets_out{nullptr}; + size_t n = nTracklets[iLayer]; + size_t sort_temp_bytes = 0; + GPUChkErrS(cub::DeviceMergeSort::SortKeys(nullptr, sort_temp_bytes, tracklets_in, n, gpu::sort_tracklets{}, streams[iLayer].get())); + void* sort_temp_storage = nullptr; + gpu::allocateMemory(&sort_temp_storage, sort_temp_bytes, streams[iLayer].get(), alloc); + GPUChkErrS(cub::DeviceMergeSort::SortKeys(sort_temp_storage, sort_temp_bytes, tracklets_in, n, gpu::sort_tracklets{}, streams[iLayer].get())); + gpu::allocateMemory(reinterpret_cast(&tracklets_out), n * sizeof(Tracklet), streams[iLayer].get(), alloc); + size_t unique_temp_bytes = 0; + int* num_selected 
= nullptr; + gpu::allocateMemory(reinterpret_cast(&num_selected), sizeof(int), streams[iLayer].get(), alloc); + GPUChkErrS(cub::DeviceSelect::Unique(nullptr, unique_temp_bytes, tracklets_in, tracklets_out, num_selected, n, streams[iLayer].get())); + void* unique_temp_storage = nullptr; + gpu::allocateMemory(&unique_temp_storage, unique_temp_bytes, streams[iLayer].get(), alloc); + GPUChkErrS(cub::DeviceSelect::Unique(unique_temp_storage, unique_temp_bytes, tracklets_in, tracklets_out, num_selected, n, streams[iLayer].get())); + GPUChkErrS(cudaMemcpyAsync(tracklets_in, tracklets_out, n * sizeof(Tracklet), cudaMemcpyDeviceToDevice, streams[iLayer].get())); + GPUChkErrS(cudaMemcpyAsync(&nTracklets[iLayer], num_selected, sizeof(int), cudaMemcpyDeviceToHost, streams[iLayer].get())); + streams[iLayer].sync(); + gpu::deallocateMemory(tracklets_out, n * sizeof(Tracklet), streams[iLayer].get(), alloc); + gpu::deallocateMemory(sort_temp_storage, sort_temp_bytes, streams[iLayer].get(), alloc); + gpu::deallocateMemory(unique_temp_storage, unique_temp_bytes, streams[iLayer].get(), alloc); + gpu::deallocateMemory(num_selected, sizeof(int), streams[iLayer].get(), alloc); + } if (iLayer > 0) { GPUChkErrS(cudaMemsetAsync(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int), streams[iLayer].get())); gpu::compileTrackletsLookupTableKernel<<>>( @@ -1215,8 +1232,35 @@ int filterCellNeighboursHandler(gpuPair* cellNeighbourPairs, gpu::Stream& stream, o2::its::ExternalAllocator* allocator) { - /// Internal thrust allocation serialize this part to a degree - /// TODO switch to cub equivelent and do all work on one stream +#ifndef __HIPCC__ + int* d_num_selected = nullptr; + gpu::allocateMemory(reinterpret_cast(&d_num_selected), sizeof(int), stream.get(), allocator); + size_t select_bytes = 0; + GPUChkErrS(cub::DeviceSelect::If(nullptr, select_bytes, cellNeighbourPairs, static_cast*>(nullptr), d_num_selected, nNeigh, gpu::is_valid_pair(), stream.get())); + void* select_temp = 
nullptr; + gpu::allocateMemory(&select_temp, select_bytes, stream.get(), allocator); + gpuPair* d_temp_valid = nullptr; + gpu::allocateMemory(reinterpret_cast(&d_temp_valid), nNeigh * sizeof(gpuPair), stream.get(), allocator); + GPUChkErrS(cub::DeviceSelect::If(select_temp, select_bytes, cellNeighbourPairs, d_temp_valid, d_num_selected, nNeigh, gpu::is_valid_pair(), stream.get())); + int newSize = 0; + GPUChkErrS(cudaMemcpyAsync(&newSize, d_num_selected, sizeof(int), cudaMemcpyDeviceToHost, stream.get())); + stream.sync(); // needed to get newSize + size_t sort_bytes = 0; + GPUChkErrS(cub::DeviceMergeSort::SortPairs(nullptr, sort_bytes, d_temp_valid, d_temp_valid, newSize, gpu::sort_by_second(), stream.get())); + void* sort_temp = nullptr; + gpu::allocateMemory(&sort_temp, sort_bytes, stream.get(), allocator); + GPUChkErrS(cub::DeviceMergeSort::SortPairs(sort_temp, sort_bytes, d_temp_valid, d_temp_valid, newSize, gpu::sort_by_second(), stream.get())); + size_t transform_bytes = 0; + GPUChkErrS(cub::DeviceTransform::Transform(nullptr, transform_bytes, d_temp_valid, cellNeighbours, newSize, gpu::pair_to_first(), stream.get())); + void* transform_temp = nullptr; + gpu::allocateMemory(&transform_temp, transform_bytes, stream.get(), allocator); + GPUChkErrS(cub::DeviceTransform::Transform(transform_temp, transform_bytes, d_temp_valid, cellNeighbours, newSize, gpu::pair_to_first(), stream.get())); + gpu::deallocateMemory(transform_temp, transform_bytes, stream.get(), allocator); + gpu::deallocateMemory(d_temp_valid, newSize * sizeof(gpuPair), stream.get(), allocator); + gpu::deallocateMemory(sort_temp, sort_bytes, stream.get(), allocator); + gpu::deallocateMemory(d_num_selected, sizeof(int), stream.get(), allocator); + gpu::deallocateMemory(select_temp, select_bytes, stream.get(), allocator); +#else // FIXME using thrust here since hipcub does not yet have DeviceTransform auto nosync_policy = THRUST_NAMESPACE::par_nosync.on(stream.get()); thrust::device_ptr> 
neighVectorPairs(cellNeighbourPairs); thrust::device_ptr validNeighs(cellNeighbours); @@ -1224,6 +1268,7 @@ int filterCellNeighboursHandler(gpuPair* cellNeighbourPairs, size_t newSize = updatedEnd - neighVectorPairs; thrust::stable_sort(nosync_policy, neighVectorPairs, neighVectorPairs + newSize, gpu::sort_by_second()); thrust::transform(nosync_policy, neighVectorPairs, neighVectorPairs + newSize, validNeighs, gpu::pair_to_first()); +#endif return newSize; }