Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 41 additions & 34 deletions cuda_core/cuda/core/_cpp/resource_handles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,15 +171,15 @@ struct ContextBox {
};
} // namespace

ContextHandle create_context_handle_ref(CUcontext ctx) noexcept {
// Wrap an existing CUcontext in a non-owning handle.
//
// The context value is copied into a heap-allocated, immutable ContextBox.
// The returned handle shares ownership of that box while pointing at its
// `resource` member, so the box lives as long as any copy of the handle.
//
// Not noexcept: std::make_shared can throw std::bad_alloc.
ContextHandle create_context_handle_ref(CUcontext ctx) {
    const auto holder = std::make_shared<const ContextBox>(ContextBox{ctx});
    auto* resource_ptr = &holder->resource;
    return ContextHandle(holder, resource_ptr);
}

// Thread-local cache of primary contexts indexed by device ID
static thread_local std::vector<ContextHandle> primary_context_cache;

ContextHandle get_primary_context(int device_id) noexcept {
ContextHandle get_primary_context(int device_id) {
// Check thread-local cache
if (static_cast<size_t>(device_id) < primary_context_cache.size()) {
if (auto cached = primary_context_cache[device_id]) {
Expand Down Expand Up @@ -212,7 +212,7 @@ ContextHandle get_primary_context(int device_id) noexcept {
return h;
}

ContextHandle get_current_context() noexcept {
ContextHandle get_current_context() {
GILReleaseGuard gil;
CUcontext ctx = nullptr;
if (CUDA_SUCCESS != (err = p_cuCtxGetCurrent(&ctx))) {
Expand All @@ -234,7 +234,7 @@ struct StreamBox {
};
} // namespace

StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) noexcept {
StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) {
GILReleaseGuard gil;
CUstream stream;
if (CUDA_SUCCESS != (err = p_cuStreamCreateWithPriority(&stream, flags, priority))) {
Expand All @@ -252,12 +252,12 @@ StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int p
return StreamHandle(box, &box->resource);
}

StreamHandle create_stream_handle_ref(CUstream stream) noexcept {
// Wrap an existing CUstream in a non-owning handle (borrowed or built-in
// streams).
//
// The stream value is copied into a heap-allocated, immutable StreamBox.
// The returned handle shares ownership of that box while pointing at its
// `resource` member.
//
// Not noexcept: std::make_shared can throw std::bad_alloc.
StreamHandle create_stream_handle_ref(CUstream stream) {
    const auto holder = std::make_shared<const StreamBox>(StreamBox{stream});
    auto* resource_ptr = &holder->resource;
    return StreamHandle(holder, resource_ptr);
}

StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) noexcept {
StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) {
if (!owner) {
return create_stream_handle_ref(stream);
}
Expand All @@ -281,12 +281,12 @@ StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) n
return StreamHandle(box, &box->resource);
}

StreamHandle get_legacy_stream() noexcept {
// Return a non-owning handle to the legacy default stream (CU_STREAM_LEGACY).
//
// The handle is built once, on the first call, and stored in a function-local
// static; every call returns a copy of that same handle.
StreamHandle get_legacy_stream() {
    static const StreamHandle legacy_handle =
        create_stream_handle_ref(CU_STREAM_LEGACY);
    return legacy_handle;
}

StreamHandle get_per_thread_stream() noexcept {
// Return a non-owning handle to the per-thread default stream
// (CU_STREAM_PER_THREAD).
//
// The handle is built once, on the first call, and stored in a function-local
// static (not thread_local), so all callers receive copies of one handle.
StreamHandle get_per_thread_stream() {
    static const StreamHandle per_thread_handle =
        create_stream_handle_ref(CU_STREAM_PER_THREAD);
    return per_thread_handle;
}
Expand All @@ -301,7 +301,7 @@ struct EventBox {
};
} // namespace

EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) noexcept {
EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) {
GILReleaseGuard gil;
CUevent event;
if (CUDA_SUCCESS != (err = p_cuEventCreate(&event, flags))) {
Expand All @@ -319,11 +319,11 @@ EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) noexcep
return EventHandle(box, &box->resource);
}

EventHandle create_event_handle_noctx(unsigned int flags) noexcept {
// Create an event handle that carries no context dependency.
//
// Forwards to create_event_handle with a default-constructed (empty)
// ContextHandle; `flags` is passed through unchanged.
EventHandle create_event_handle_noctx(unsigned int flags) {
    const ContextHandle no_context{};
    return create_event_handle(no_context, flags);
}

EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) noexcept {
EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) {
GILReleaseGuard gil;
CUevent event;
if (CUDA_SUCCESS != (err = p_cuIpcOpenEventHandle(&event, ipc_handle))) {
Expand Down Expand Up @@ -353,19 +353,24 @@ struct MemoryPoolBox {

// Helper to clear peer access before destroying a memory pool.
// Works around nvbug 5698116: recycled pool handles inherit peer access state.
static void clear_mempool_peer_access(CUmemoryPool pool) {
int device_count = 0;
if (p_cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) {
return;
}
// Must be noexcept since it's called from a shared_ptr deleter.
static void clear_mempool_peer_access(CUmemoryPool pool) noexcept {
try {
int device_count = 0;
if (p_cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) {
return;
}

std::vector<CUmemAccessDesc> clear_access(device_count);
for (int i = 0; i < device_count; ++i) {
clear_access[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
clear_access[i].location.id = i;
clear_access[i].flags = CU_MEM_ACCESS_FLAGS_PROT_NONE;
std::vector<CUmemAccessDesc> clear_access(device_count);
for (int i = 0; i < device_count; ++i) {
clear_access[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
clear_access[i].location.id = i;
clear_access[i].flags = CU_MEM_ACCESS_FLAGS_PROT_NONE;
}
p_cuMemPoolSetAccess(pool, clear_access.data(), device_count); // Best effort
} catch (...) {
// Swallow exceptions - this is best-effort cleanup in destructor context
}
p_cuMemPoolSetAccess(pool, clear_access.data(), device_count); // Best effort
}

static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) {
Expand All @@ -381,7 +386,7 @@ static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) {
return MemoryPoolHandle(box, &box->resource);
}

MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) noexcept {
MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) {
GILReleaseGuard gil;
CUmemoryPool pool;
if (CUDA_SUCCESS != (err = p_cuMemPoolCreate(&pool, &props))) {
Expand All @@ -390,12 +395,12 @@ MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) noexcept {
return wrap_mempool_owned(pool);
}

MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) noexcept {
// Wrap an existing CUmemoryPool in a non-owning handle.
//
// The pool value is copied into a heap-allocated, immutable MemoryPoolBox.
// The returned handle shares ownership of that box while pointing at its
// `resource` member.
//
// Not noexcept: std::make_shared can throw std::bad_alloc.
MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) {
    const auto holder = std::make_shared<const MemoryPoolBox>(MemoryPoolBox{pool});
    auto* resource_ptr = &holder->resource;
    return MemoryPoolHandle(holder, resource_ptr);
}

MemoryPoolHandle get_device_mempool(int device_id) noexcept {
MemoryPoolHandle get_device_mempool(int device_id) {
GILReleaseGuard gil;
CUmemoryPool pool;
if (CUDA_SUCCESS != (err = p_cuDeviceGetMemPool(&pool, device_id))) {
Expand All @@ -404,7 +409,7 @@ MemoryPoolHandle get_device_mempool(int device_id) noexcept {
return create_mempool_handle_ref(pool);
}

MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) noexcept {
MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) {
GILReleaseGuard gil;
CUmemoryPool pool;
auto handle_ptr = reinterpret_cast<void*>(static_cast<uintptr_t>(fd));
Expand Down Expand Up @@ -448,7 +453,7 @@ void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) no
get_box(h)->h_stream = std::move(h_stream);
}

DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) noexcept {
DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) {
GILReleaseGuard gil;
CUdeviceptr ptr;
if (CUDA_SUCCESS != (err = p_cuMemAllocFromPoolAsync(&ptr, size, *h_pool, as_cu(h_stream)))) {
Expand All @@ -466,7 +471,7 @@ DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool,
return DevicePtrHandle(box, &box->resource);
}

DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) noexcept {
DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) {
GILReleaseGuard gil;
CUdeviceptr ptr;
if (CUDA_SUCCESS != (err = p_cuMemAllocAsync(&ptr, size, as_cu(h_stream)))) {
Expand All @@ -484,7 +489,7 @@ DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) noexce
return DevicePtrHandle(box, &box->resource);
}

DevicePtrHandle deviceptr_alloc(size_t size) noexcept {
DevicePtrHandle deviceptr_alloc(size_t size) {
GILReleaseGuard gil;
CUdeviceptr ptr;
if (CUDA_SUCCESS != (err = p_cuMemAlloc(&ptr, size))) {
Expand All @@ -502,7 +507,7 @@ DevicePtrHandle deviceptr_alloc(size_t size) noexcept {
return DevicePtrHandle(box, &box->resource);
}

DevicePtrHandle deviceptr_alloc_host(size_t size) noexcept {
DevicePtrHandle deviceptr_alloc_host(size_t size) {
GILReleaseGuard gil;
void* ptr;
if (CUDA_SUCCESS != (err = p_cuMemAllocHost(&ptr, size))) {
Expand All @@ -520,12 +525,12 @@ DevicePtrHandle deviceptr_alloc_host(size_t size) noexcept {
return DevicePtrHandle(box, &box->resource);
}

DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) noexcept {
// Wrap an existing CUdeviceptr in a non-owning handle (e.g. foreign pointers).
//
// The pointer value is stored in a heap-allocated DevicePtrBox together with
// an empty StreamHandle. The returned handle shares ownership of that box
// while pointing at its `resource` member.
//
// Not noexcept: std::make_shared can throw std::bad_alloc.
DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) {
    const auto holder = std::make_shared<DevicePtrBox>(DevicePtrBox{ptr, StreamHandle{}});
    auto* resource_ptr = &holder->resource;
    return DevicePtrHandle(holder, resource_ptr);
}

DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) noexcept {
DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) {
if (!owner) {
return deviceptr_create_ref(ptr);
}
Expand Down Expand Up @@ -607,7 +612,7 @@ struct ExportDataKeyHash {
static std::mutex ipc_ptr_cache_mutex;
static std::unordered_map<ExportDataKey, std::weak_ptr<DevicePtrBox>, ExportDataKeyHash> ipc_ptr_cache;

DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) noexcept {
DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) {
auto data = const_cast<CUmemPoolPtrExportData*>(
reinterpret_cast<const CUmemPoolPtrExportData*>(export_data));

Expand Down Expand Up @@ -639,14 +644,16 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export
new DevicePtrBox{ptr, h_stream},
[h_pool, key](DevicePtrBox* b) {
GILReleaseGuard gil;
{
try {
std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
// Only erase if expired - avoids race where another thread
// replaced the entry with a new import before we acquired the lock.
auto it = ipc_ptr_cache.find(key);
if (it != ipc_ptr_cache.end() && it->second.expired()) {
ipc_ptr_cache.erase(it);
}
} catch (...) {
// Cache cleanup is best-effort - swallow exceptions in destructor context
}
p_cuMemFreeAsync(b->resource, as_cu(b->h_stream));
delete b;
Expand Down
44 changes: 22 additions & 22 deletions cuda_core/cuda/core/_cpp/resource_handles.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,15 +75,15 @@ using MemoryPoolHandle = std::shared_ptr<const CUmemoryPool>;
// ============================================================================

// Function to create a non-owning context handle (references existing context).
ContextHandle create_context_handle_ref(CUcontext ctx) noexcept;
ContextHandle create_context_handle_ref(CUcontext ctx);

// Get handle to the primary context for a device (with thread-local caching)
// Returns empty handle on error (caller must check)
ContextHandle get_primary_context(int device_id) noexcept;
ContextHandle get_primary_context(int device_id);

// Get handle to the current CUDA context
// Returns empty handle if no context is current (caller must check)
ContextHandle get_current_context() noexcept;
ContextHandle get_current_context();

// ============================================================================
// Stream handle functions
Expand All @@ -93,26 +93,26 @@ ContextHandle get_current_context() noexcept;
// The stream structurally depends on the provided context handle.
// When the last reference is released, cuStreamDestroy is called automatically.
// Returns empty handle on error (caller must check).
StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) noexcept;
StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority);

// Create a non-owning stream handle (references existing stream).
// Use for borrowed streams (from foreign code) or built-in streams.
// The stream will NOT be destroyed when the handle is released.
// Caller is responsible for keeping the stream's context alive.
StreamHandle create_stream_handle_ref(CUstream stream) noexcept;
StreamHandle create_stream_handle_ref(CUstream stream);

// Create a non-owning stream handle that prevents a Python owner from being GC'd.
// The owner's refcount is incremented; decremented when handle is released.
// The owner is responsible for keeping the stream's context alive.
StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) noexcept;
StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner);

// Get non-owning handle to the legacy default stream (CU_STREAM_LEGACY)
// Note: Legacy stream has no specific context dependency.
StreamHandle get_legacy_stream() noexcept;
StreamHandle get_legacy_stream();

// Get non-owning handle to the per-thread default stream (CU_STREAM_PER_THREAD)
// Note: Per-thread stream has no specific context dependency.
StreamHandle get_per_thread_stream() noexcept;
StreamHandle get_per_thread_stream();

// ============================================================================
// Event handle functions
Expand All @@ -122,19 +122,19 @@ StreamHandle get_per_thread_stream() noexcept;
// The event structurally depends on the provided context handle.
// When the last reference is released, cuEventDestroy is called automatically.
// Returns empty handle on error (caller must check).
EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) noexcept;
EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags);

// Create an owning event handle without context dependency.
// Use for temporary events that are created and destroyed in the same scope.
// When the last reference is released, cuEventDestroy is called automatically.
// Returns empty handle on error (caller must check).
EventHandle create_event_handle_noctx(unsigned int flags) noexcept;
EventHandle create_event_handle_noctx(unsigned int flags);

// Create an owning event handle from an IPC handle.
// The originating process owns the event and its context.
// When the last reference is released, cuEventDestroy is called automatically.
// Returns empty handle on error (caller must check).
EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) noexcept;
EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle);

// ============================================================================
// Memory pool handle functions
Expand All @@ -144,22 +144,22 @@ EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) noexcept
// Memory pools are device-scoped (not context-scoped).
// When the last reference is released, cuMemPoolDestroy is called automatically.
// Returns empty handle on error (caller must check).
MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) noexcept;
MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props);

// Create a non-owning memory pool handle (references existing pool).
// Use for device default/current pools that are managed by the driver.
// The pool will NOT be destroyed when the handle is released.
MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) noexcept;
MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool);

// Get non-owning handle to the current memory pool for a device.
// Returns empty handle on error (caller must check).
MemoryPoolHandle get_device_mempool(int device_id) noexcept;
MemoryPoolHandle get_device_mempool(int device_id);

// Create an owning memory pool handle from an IPC import.
// The file descriptor is NOT owned by this handle (caller manages FD separately).
// When the last reference is released, cuMemPoolDestroy is called automatically.
// Returns empty handle on error (caller must check).
MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) noexcept;
MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type);

// ============================================================================
// Device pointer handle functions
Expand All @@ -174,33 +174,33 @@ using DevicePtrHandle = std::shared_ptr<const CUdeviceptr>;
DevicePtrHandle deviceptr_alloc_from_pool(
size_t size,
MemoryPoolHandle h_pool,
StreamHandle h_stream) noexcept;
StreamHandle h_stream);

// Allocate device memory asynchronously via cuMemAllocAsync.
// When the last reference is released, cuMemFreeAsync is called on the stored stream.
// Returns empty handle on error (caller must check).
DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) noexcept;
DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream);

// Allocate device memory synchronously via cuMemAlloc.
// When the last reference is released, cuMemFree is called.
// Returns empty handle on error (caller must check).
DevicePtrHandle deviceptr_alloc(size_t size) noexcept;
DevicePtrHandle deviceptr_alloc(size_t size);

// Allocate pinned host memory via cuMemAllocHost.
// When the last reference is released, cuMemFreeHost is called.
// Returns empty handle on error (caller must check).
DevicePtrHandle deviceptr_alloc_host(size_t size) noexcept;
DevicePtrHandle deviceptr_alloc_host(size_t size);

// Create a non-owning device pointer handle (references existing pointer).
// Use for foreign pointers (e.g., from external libraries).
// The pointer will NOT be freed when the handle is released.
DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) noexcept;
DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr);

// Create a non-owning device pointer handle that prevents a Python owner from being GC'd.
// The owner's refcount is incremented; decremented when handle is released.
// The pointer will NOT be freed when the handle is released.
// If owner is nullptr, equivalent to deviceptr_create_ref.
DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) noexcept;
DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner);

// Import a device pointer from IPC via cuMemPoolImportPointer.
// When the last reference is released, cuMemFreeAsync is called on the stored stream.
Expand All @@ -209,7 +209,7 @@ DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) no
DevicePtrHandle deviceptr_import_ipc(
MemoryPoolHandle h_pool,
const void* export_data,
StreamHandle h_stream) noexcept;
StreamHandle h_stream);

// Access the deallocation stream for a device pointer handle (read-only).
// For non-owning handles, the stream is not used but can still be accessed.
Expand Down
Loading
Loading