From c761a7ffa71553bd6e89b9ea7673ec17d8aa2dc2 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 21 Jan 2026 13:39:46 -0800 Subject: [PATCH 01/11] Cythonize _module.py (Phase 2a: cdef classes) Convert Kernel, ObjectCode, and KernelOccupancy to cdef classes with proper .pxd declarations. This phase establishes the Cython structure while maintaining Python driver module usage. Changes: - Rename _module.py to _module.pyx - Create _module.pxd with cdef class declarations - Convert Kernel, ObjectCode, KernelOccupancy to cdef class - Remove _backend dict in favor of direct driver calls - Add _init_py() Python-accessible factory for ObjectCode - Update _program.py and _linker.py to use _init_py() - Fix test to handle cdef class property descriptors Phase 2b will convert driver calls to cydriver with nogil blocks. Phase 2c will add RAII handles to resource_handles. --- cuda_core/cuda/core/_linker.py | 2 +- cuda_core/cuda/core/_module.pxd | 42 ++++++ .../cuda/core/{_module.py => _module.pyx} | 130 +++++++++--------- cuda_core/cuda/core/_program.py | 6 +- cuda_core/tests/test_module.py | 7 +- 5 files changed, 114 insertions(+), 73 deletions(-) create mode 100644 cuda_core/cuda/core/_module.pxd rename cuda_core/cuda/core/{_module.py => _module.pyx} (90%) diff --git a/cuda_core/cuda/core/_linker.py b/cuda_core/cuda/core/_linker.py index df38502b93..e36dea7d15 100644 --- a/cuda_core/cuda/core/_linker.py +++ b/cuda_core/cuda/core/_linker.py @@ -529,7 +529,7 @@ def link(self, target_type) -> ObjectCode: addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) code = (ctypes.c_char * size).from_address(addr) - return ObjectCode._init(bytes(code), target_type, name=self._options.name) + return ObjectCode._init_py(bytes(code), target_type, name=self._options.name) def get_error_log(self) -> str: """Get the error log generated by the linker. 
diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd new file mode 100644 index 0000000000..4b8688bf2f --- /dev/null +++ b/cuda_core/cuda/core/_module.pxd @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +cdef class ObjectCode +cdef class Kernel +cdef class KernelOccupancy + + +cdef class Kernel: + cdef public: + object _handle # CUkernel (will become KernelHandle in phase 2c) + object _module # ObjectCode reference + object _attributes # KernelAttributes (lazy) + object _occupancy # KernelOccupancy (lazy) + + cdef object __weakref__ # Enable weak references + + @staticmethod + cdef Kernel _from_obj(object obj, ObjectCode mod) + + +cdef class ObjectCode: + cdef public: + object _handle # CUlibrary (will become LibraryHandle in phase 2c) + str _code_type + object _module # bytes/str source + dict _sym_map + str _name + + @staticmethod + cdef ObjectCode _init(object module, str code_type, str name=*, dict symbol_mapping=*) + + cdef int _lazy_load_module(self) except -1 + + +cdef class KernelOccupancy: + cdef public: + object _handle # CUkernel reference + + @staticmethod + cdef KernelOccupancy _init(object handle) diff --git a/cuda_core/cuda/core/_module.py b/cuda_core/cuda/core/_module.pyx similarity index 90% rename from cuda_core/cuda/core/_module.py rename to cuda_core/cuda/core/_module.pyx index 6abb7dfd31..9c5bc3cf47 100644 --- a/cuda_core/cuda/core/_module.py +++ b/cuda_core/cuda/core/_module.pyx @@ -17,7 +17,9 @@ assert_type_str_or_bytes_like, raise_code_path_meant_to_be_unreachable, ) -from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return, precondition +from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return + +__all__ = ["Kernel", "ObjectCode"] # Lazy initialization state and synchronization # For Python 3.13t (free-threaded builds), we use 
a lock to ensure thread-safe initialization. @@ -28,7 +30,7 @@ _py_minor_ver = None _driver_ver = None _kernel_ctypes = None -_backend = {} +_paraminfo_supported = False def _lazy_init(): @@ -54,19 +56,12 @@ def _lazy_init(): if _inited: return - global _py_major_ver, _py_minor_ver, _driver_ver, _kernel_ctypes, _backend + global _py_major_ver, _py_minor_ver, _driver_ver, _kernel_ctypes, _paraminfo_supported # binding availability depends on cuda-python version _py_major_ver, _py_minor_ver = get_binding_version() - _backend = { - "file": driver.cuLibraryLoadFromFile, - "data": driver.cuLibraryLoadData, - "kernel": driver.cuLibraryGetKernel, - "attribute": driver.cuKernelGetAttribute, - } _kernel_ctypes = (driver.CUkernel,) _driver_ver = handle_return(driver.cuDriverGetVersion()) - if _driver_ver >= 12040: - _backend["paraminfo"] = driver.cuKernelGetParamInfo + _paraminfo_supported = _driver_ver >= 12040 # Mark as initialized (must be last to ensure all state is set) _inited = True @@ -97,6 +92,12 @@ def _get_kernel_ctypes(): return _kernel_ctypes +def _is_paraminfo_supported(): + """Return True if cuKernelGetParamInfo is available (driver >= 12.4).""" + _lazy_init() + return _paraminfo_supported + + @functools.cache def _is_cukernel_get_library_supported() -> bool: """Return True when cuKernelGetLibrary is available for inverse kernel-to-library lookup. @@ -116,20 +117,19 @@ def _make_dummy_library_handle(): class KernelAttributes: - def __new__(self, *args, **kwargs): - raise RuntimeError("KernelAttributes cannot be instantiated directly. Please use Kernel APIs.") + """Provides access to kernel attributes. Uses weakref to avoid preventing Kernel GC.""" + + __slots__ = ("_kernel", "_cache") - slots = ("_kernel", "_cache", "_loader") + def __new__(cls, *args, **kwargs): + raise RuntimeError("KernelAttributes cannot be instantiated directly. 
Please use Kernel APIs.") @classmethod def _init(cls, kernel): - self = super().__new__(cls) + self = object.__new__(cls) self._kernel = weakref.ref(kernel) self._cache = {} - - # Ensure backend is initialized before setting loader _lazy_init() - self._loader = _backend return self def _get_cached_attribute(self, device_id: Device | int, attribute: driver.CUfunction_attribute) -> int: @@ -142,7 +142,7 @@ def _get_cached_attribute(self, device_id: Device | int, attribute: driver.CUfun kernel = self._kernel() if kernel is None: raise RuntimeError("Cannot access kernel attributes for expired Kernel object") - result = handle_return(self._loader["attribute"](attribute, kernel._handle, device_id)) + result = handle_return(driver.cuKernelGetAttribute(attribute, kernel._handle, device_id)) self._cache[cache_key] = result return result @@ -246,21 +246,18 @@ def cluster_scheduling_policy_preference(self, device_id: Device | int = None) - MaxPotentialBlockSizeOccupancyResult = namedtuple("MaxPotential", ("min_grid_size", "max_block_size")) -class KernelOccupancy: +cdef class KernelOccupancy: """This class offers methods to query occupancy metrics that help determine optimal launch parameters such as block size, grid size, and shared memory usage. """ - def __new__(self, *args, **kwargs): + def __init__(self, *args, **kwargs): raise RuntimeError("KernelOccupancy cannot be instantiated directly. 
Please use Kernel APIs.") - slots = ("_handle",) - - @classmethod - def _init(cls, handle): - self = super().__new__(cls) + @staticmethod + cdef KernelOccupancy _init(object handle): + cdef KernelOccupancy self = KernelOccupancy.__new__(KernelOccupancy) self._handle = handle - return self def max_active_blocks_per_multiprocessor(self, block_size: int, dynamic_shared_memory_size: int) -> int: @@ -412,7 +409,7 @@ def max_active_clusters(self, config: LaunchConfig, stream: Stream | None = None ParamInfo = namedtuple("ParamInfo", ["offset", "size"]) -class Kernel: +cdef class Kernel: """Represent a compiled kernel that had been loaded onto the device. Kernel instances can execution when passed directly into the @@ -423,16 +420,13 @@ class Kernel: """ - __slots__ = ("_handle", "_module", "_attributes", "_occupancy", "__weakref__") - - def __new__(self, *args, **kwargs): + def __init__(self, *args, **kwargs): raise RuntimeError("Kernel objects cannot be instantiated directly. Please use ObjectCode APIs.") - @classmethod - def _from_obj(cls, obj, mod): + @staticmethod + cdef Kernel _from_obj(object obj, ObjectCode mod): assert_type(obj, _get_kernel_ctypes()) - assert_type(mod, ObjectCode) - ker = super().__new__(cls) + cdef Kernel ker = Kernel.__new__(Kernel) ker._handle = obj ker._module = mod ker._attributes = None @@ -447,8 +441,7 @@ def attributes(self) -> KernelAttributes: return self._attributes def _get_arguments_info(self, param_info=False) -> tuple[int, list[ParamInfo]]: - attr_impl = self.attributes - if "paraminfo" not in attr_impl._loader: + if not _is_paraminfo_supported(): driver_ver = _get_driver_ver() raise NotImplementedError( "Driver version 12.4 or newer is required for this function. 
" @@ -457,7 +450,7 @@ def _get_arguments_info(self, param_info=False) -> tuple[int, list[ParamInfo]]: arg_pos = 0 param_info_data = [] while True: - result = attr_impl._loader["paraminfo"](self._handle, arg_pos) + result = driver.cuKernelGetParamInfo(self._handle, arg_pos) if result[0] != driver.CUresult.CUDA_SUCCESS: break if param_info: @@ -516,7 +509,7 @@ def from_handle(handle: int, mod: ObjectCode = None) -> Kernel: # For CUkernel, we can (optionally) inverse-lookup the owning CUlibrary via # cuKernelGetLibrary (added in CUDA 12.5). If the API is not available, we fall # back to a non-null dummy handle purely to disable lazy loading. - mod = ObjectCode._init(b"", "cubin") + mod = ObjectCode._init(b"", "cubin", "", None) if _is_cukernel_get_library_supported(): try: mod._handle = handle_return(driver.cuKernelGetLibrary(kernel_obj)) @@ -531,8 +524,9 @@ def from_handle(handle: int, mod: ObjectCode = None) -> Kernel: CodeTypeT = bytes | bytearray | str +_supported_code_type = ("cubin", "ptx", "ltoir", "fatbin", "object", "library") -class ObjectCode: +cdef class ObjectCode: """Represent a compiled program to be loaded onto the device. This object provides a unified interface for different types of @@ -546,26 +540,20 @@ class ObjectCode: :class:`~cuda.core.Program` """ - __slots__ = ("_handle", "_code_type", "_module", "_loader", "_sym_map", "_name") - _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin", "object", "library") - - def __new__(self, *args, **kwargs): + def __init__(self, *args, **kwargs): raise RuntimeError( "ObjectCode objects cannot be instantiated directly. " "Please use ObjectCode APIs (from_cubin, from_ptx) or Program APIs (compile)." 
) - @classmethod - def _init(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None = None): - self = super().__new__(cls) - assert code_type in self._supported_code_type, f"{code_type=} is not supported" + @staticmethod + cdef ObjectCode _init(object module, str code_type, str name = "", dict symbol_mapping = None): + assert code_type in _supported_code_type, f"{code_type=} is not supported" + cdef ObjectCode self = ObjectCode.__new__(ObjectCode) # handle is assigned during _lazy_load self._handle = None - - # Ensure backend is initialized before setting loader _lazy_init() - self._loader = _backend self._code_type = code_type self._module = module @@ -575,9 +563,14 @@ def _init(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None return self @classmethod - def _reduce_helper(self, module, code_type, name, symbol_mapping): + def _init_py(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None = None): + """Python-accessible factory method for use by _program.py and _linker.py.""" + return ObjectCode._init(module, code_type, name if name else "", symbol_mapping) + + @classmethod + def _reduce_helper(cls, module, code_type, name, symbol_mapping): # just for forwarding kwargs - return ObjectCode._init(module, code_type, name=name, symbol_mapping=symbol_mapping) + return ObjectCode._init(module, code_type, name if name else "", symbol_mapping) def __reduce__(self): return ObjectCode._reduce_helper, (self._module, self._code_type, self._name, self._sym_map) @@ -598,7 +591,7 @@ def from_cubin(module: bytes | str, *, name: str = "", symbol_mapping: dict | No should be mapped to the mangled names before trying to retrieve them (default to no mappings). 
""" - return ObjectCode._init(module, "cubin", name=name, symbol_mapping=symbol_mapping) + return ObjectCode._init(module, "cubin", name, symbol_mapping) @staticmethod def from_ptx(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: @@ -616,7 +609,7 @@ def from_ptx(module: bytes | str, *, name: str = "", symbol_mapping: dict | None should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "ptx", name=name, symbol_mapping=symbol_mapping) + return ObjectCode._init(module, "ptx", name, symbol_mapping) @staticmethod def from_ltoir(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: @@ -634,7 +627,7 @@ def from_ltoir(module: bytes | str, *, name: str = "", symbol_mapping: dict | No should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "ltoir", name=name, symbol_mapping=symbol_mapping) + return ObjectCode._init(module, "ltoir", name, symbol_mapping) @staticmethod def from_fatbin(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: @@ -652,7 +645,7 @@ def from_fatbin(module: bytes | str, *, name: str = "", symbol_mapping: dict | N should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "fatbin", name=name, symbol_mapping=symbol_mapping) + return ObjectCode._init(module, "fatbin", name, symbol_mapping) @staticmethod def from_object(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: @@ -670,7 +663,7 @@ def from_object(module: bytes | str, *, name: str = "", symbol_mapping: dict | N should be mapped to the mangled names before trying to retrieve them (default to no mappings). 
""" - return ObjectCode._init(module, "object", name=name, symbol_mapping=symbol_mapping) + return ObjectCode._init(module, "object", name, symbol_mapping) @staticmethod def from_library(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: @@ -688,24 +681,24 @@ def from_library(module: bytes | str, *, name: str = "", symbol_mapping: dict | should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "library", name=name, symbol_mapping=symbol_mapping) + return ObjectCode._init(module, "library", name, symbol_mapping) # TODO: do we want to unload in a finalizer? Probably not.. - def _lazy_load_module(self, *args, **kwargs): + cdef int _lazy_load_module(self) except -1: if self._handle is not None: - return + return 0 module = self._module assert_type_str_or_bytes_like(module) if isinstance(module, str): - self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0)) - return + self._handle = handle_return(driver.cuLibraryLoadFromFile(module.encode(), [], [], 0, [], [], 0)) + return 0 if isinstance(module, (bytes, bytearray)): - self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0)) - return + self._handle = handle_return(driver.cuLibraryLoadData(module, [], [], 0, [], [], 0)) + return 0 raise_code_path_meant_to_be_unreachable() + return -1 - @precondition(_lazy_load_module) def get_kernel(self, name) -> Kernel: """Return the :obj:`~_module.Kernel` of a specified name from this object code. @@ -720,6 +713,7 @@ def get_kernel(self, name) -> Kernel: Newly created kernel object. 
""" + self._lazy_load_module() supported_code_types = ("cubin", "ptx", "fatbin") if self._code_type not in supported_code_types: raise RuntimeError(f'Unsupported code type "{self._code_type}" ({supported_code_types=})') @@ -728,7 +722,7 @@ def get_kernel(self, name) -> Kernel: except KeyError: name = name.encode() - data = handle_return(self._loader["kernel"](self._handle, name)) + data = handle_return(driver.cuLibraryGetKernel(self._handle, name)) return Kernel._from_obj(data, self) @property @@ -747,7 +741,6 @@ def code_type(self) -> str: return self._code_type @property - @precondition(_lazy_load_module) def handle(self): """Return the underlying handle object. @@ -756,4 +749,5 @@ def handle(self): This handle is a Python object. To get the memory address of the underlying C handle, call ``int(ObjectCode.handle)``. """ + self._lazy_load_module() return self._handle diff --git a/cuda_core/cuda/core/_program.py b/cuda_core/cuda/core/_program.py index 1ef1aa51f5..b5f8fbfa0b 100644 --- a/cuda_core/cuda/core/_program.py +++ b/cuda_core/cuda/core/_program.py @@ -688,7 +688,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None): elif code_type == "ptx": assert_type(code, str) self._linker = Linker( - ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options) + ObjectCode._init_py(code.encode(), code_type), options=self._translate_program_options(options) ) self._backend = self._linker.backend @@ -806,7 +806,7 @@ def compile(self, target_type, name_expressions=(), logs=None): handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle) logs.write(log.decode("utf-8", errors="backslashreplace")) - return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping, name=self._options.name) + return ObjectCode._init_py(data, target_type, symbol_mapping=symbol_mapping, name=self._options.name) elif self._backend == "NVVM": if target_type not in ("ptx", "ltoir"): @@ -832,7 +832,7 @@ def 
compile(self, target_type, name_expressions=(), logs=None): nvvm.get_program_log(self._mnff.handle, log) logs.write(log.decode("utf-8", errors="backslashreplace")) - return ObjectCode._init(data, target_type, name=self._options.name) + return ObjectCode._init_py(data, target_type, name=self._options.name) supported_backends = ("nvJitLink", "driver") if self._backend not in supported_backends: diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index f9bbcd3e4c..843fbf677f 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -231,7 +231,12 @@ def test_saxpy_arguments(get_saxpy_kernel_cubin, cuda12_4_prerequisite_check): _ = krn.num_arguments return - assert "ParamInfo" in str(type(krn).arguments_info.fget.__annotations__) + # Check that arguments_info returns ParamInfo objects (works for both Python and Cython classes) + # For Python classes: type(krn).arguments_info.fget.__annotations__ contains ParamInfo + # For Cython cdef classes: property descriptors don't have .fget, so we check the actual values + prop = type(krn).arguments_info + if hasattr(prop, "fget") and hasattr(prop.fget, "__annotations__"): + assert "ParamInfo" in str(prop.fget.__annotations__) arg_info = krn.arguments_info n_args = len(arg_info) assert n_args == krn.num_arguments From 47e00e26f5cf5c0a6b228dbe61a09f3755c7962f Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 21 Jan 2026 14:55:17 -0800 Subject: [PATCH 02/11] Phase 2a refinements: hide private attrs, add public properties - Use strong types in .pxd (ObjectCode, KernelOccupancy) - Remove cdef public - attributes now private to C level - Add Kernel.handle property for external access - Add ObjectCode.symbol_mapping property (symmetric with input) - Update _launcher.pyx, _linker.py, tests to use public APIs --- cuda_core/cuda/core/_launcher.pyx | 3 +-- cuda_core/cuda/core/_linker.py | 10 ++++---- cuda_core/cuda/core/_module.pxd | 15 ++++++----- cuda_core/cuda/core/_module.pyx | 18 
++++++++++++- cuda_core/tests/test_module.py | 42 +++++++++++++++---------------- cuda_core/tests/test_program.py | 4 +-- 6 files changed, 52 insertions(+), 40 deletions(-) diff --git a/cuda_core/cuda/core/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx index 9559f7697a..9066e72732 100644 --- a/cuda_core/cuda/core/_launcher.pyx +++ b/cuda_core/cuda/core/_launcher.pyx @@ -77,11 +77,10 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern cdef ParamHolder ker_args = ParamHolder(kernel_args) cdef void** args_ptr = (ker_args.ptr) - # TODO: cythonize Module/Kernel/... # Note: We now use CUkernel handles exclusively (CUDA 12+), but they can be cast to # CUfunction for use with cuLaunchKernel, as both handle types are interchangeable # for kernel launch purposes. - cdef cydriver.CUfunction func_handle = ((kernel._handle)) + cdef cydriver.CUfunction func_handle = ((kernel.handle)) # Note: CUkernel can still be launched via cuLaunchKernel (not just cuLaunchKernelEx). 
# We check both binding & driver versions here mainly to see if the "Ex" API is diff --git a/cuda_core/cuda/core/_linker.py b/cuda_core/cuda/core/_linker.py index e36dea7d15..5ce6fb73ee 100644 --- a/cuda_core/cuda/core/_linker.py +++ b/cuda_core/cuda/core/_linker.py @@ -444,13 +444,13 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None): self._add_code_object(code) def _add_code_object(self, object_code: ObjectCode): - data = object_code._module + data = object_code.code with _exception_manager(self): name_str = f"{object_code.name}" if _nvjitlink and isinstance(data, bytes): _nvjitlink.add_data( self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), + self._input_type_from_code_type(object_code.code_type), data, len(data), name_str, @@ -458,7 +458,7 @@ def _add_code_object(self, object_code: ObjectCode): elif _nvjitlink and isinstance(data, str): _nvjitlink.add_file( self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), + self._input_type_from_code_type(object_code.code_type), data, ) elif (not _nvjitlink) and isinstance(data, bytes): @@ -466,7 +466,7 @@ def _add_code_object(self, object_code: ObjectCode): handle_return( _driver.cuLinkAddData( self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), + self._input_type_from_code_type(object_code.code_type), data, len(data), name_bytes, @@ -481,7 +481,7 @@ def _add_code_object(self, object_code: ObjectCode): handle_return( _driver.cuLinkAddFile( self._mnff.handle, - self._input_type_from_code_type(object_code._code_type), + self._input_type_from_code_type(object_code.code_type), data.encode(), 0, None, diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd index 4b8688bf2f..b83e08ed47 100644 --- a/cuda_core/cuda/core/_module.pxd +++ b/cuda_core/cuda/core/_module.pxd @@ -8,20 +8,19 @@ cdef class KernelOccupancy cdef class Kernel: - cdef public: + cdef: object _handle # CUkernel (will become 
KernelHandle in phase 2c) - object _module # ObjectCode reference - object _attributes # KernelAttributes (lazy) - object _occupancy # KernelOccupancy (lazy) - - cdef object __weakref__ # Enable weak references + ObjectCode _module # ObjectCode reference + object _attributes # KernelAttributes (regular Python class) + KernelOccupancy _occupancy # KernelOccupancy (lazy) + object __weakref__ # Enable weak references @staticmethod cdef Kernel _from_obj(object obj, ObjectCode mod) cdef class ObjectCode: - cdef public: + cdef: object _handle # CUlibrary (will become LibraryHandle in phase 2c) str _code_type object _module # bytes/str source @@ -35,7 +34,7 @@ cdef class ObjectCode: cdef class KernelOccupancy: - cdef public: + cdef: object _handle # CUkernel reference @staticmethod diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index 9c5bc3cf47..69d1db4272 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -142,7 +142,7 @@ class KernelAttributes: kernel = self._kernel() if kernel is None: raise RuntimeError("Cannot access kernel attributes for expired Kernel object") - result = handle_return(driver.cuKernelGetAttribute(attribute, kernel._handle, device_id)) + result = handle_return(driver.cuKernelGetAttribute(attribute, kernel.handle, device_id)) self._cache[cache_key] = result return result @@ -480,6 +480,17 @@ cdef class Kernel: self._occupancy = KernelOccupancy._init(self._handle) return self._occupancy + @property + def handle(self): + """Return the underlying kernel handle object. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Kernel.handle)``. + """ + return self._handle + @staticmethod def from_handle(handle: int, mod: ObjectCode = None) -> Kernel: """Creates a new :obj:`Kernel` object from a foreign kernel handle. 
@@ -740,6 +751,11 @@ cdef class ObjectCode: """Return the type of the underlying code object.""" return self._code_type + @property + def symbol_mapping(self) -> dict: + """Return a copy of the symbol mapping dictionary.""" + return dict(self._sym_map) + @property def handle(self): """Return the underlying handle object. diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 843fbf677f..d5a35a1ea5 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -79,7 +79,7 @@ def get_saxpy_kernel_ptx(init_cuda): "ptx", name_expressions=("saxpy", "saxpy"), ) - ptx = mod._module + ptx = mod.code return ptx, mod @@ -100,10 +100,10 @@ def test_get_kernel(init_cuda): if any("The CUDA driver version is older than the backend version" in str(warning.message) for warning in w): pytest.skip("PTX version too new for current driver") - assert object_code._handle is None + # Verify lazy loading: get_kernel triggers module loading and returns a valid kernel kernel = object_code.get_kernel("ABC") - assert object_code._handle is not None - assert kernel._handle is not None + assert object_code.handle is not None + assert kernel.handle is not None @pytest.mark.parametrize( @@ -143,7 +143,7 @@ def test_read_only_kernel_attributes(get_saxpy_kernel_cubin, attr, expected_type def test_object_code_load_ptx(get_saxpy_kernel_ptx): ptx, mod = get_saxpy_kernel_ptx - sym_map = mod._sym_map + sym_map = mod.symbol_mapping mod_obj = ObjectCode.from_ptx(ptx, symbol_mapping=sym_map) assert mod.code == ptx if not Program._can_load_generated_ptx(): @@ -153,7 +153,7 @@ def test_object_code_load_ptx(get_saxpy_kernel_ptx): def test_object_code_load_ptx_from_file(get_saxpy_kernel_ptx, tmp_path): ptx, mod = get_saxpy_kernel_ptx - sym_map = mod._sym_map + sym_map = mod.symbol_mapping assert isinstance(ptx, bytes) ptx_file = tmp_path / "test.ptx" ptx_file.write_bytes(ptx) @@ -167,8 +167,8 @@ def test_object_code_load_ptx_from_file(get_saxpy_kernel_ptx, 
tmp_path): def test_object_code_load_cubin(get_saxpy_kernel_cubin): _, mod = get_saxpy_kernel_cubin - cubin = mod._module - sym_map = mod._sym_map + cubin = mod.code + sym_map = mod.symbol_mapping assert isinstance(cubin, bytes) mod = ObjectCode.from_cubin(cubin, symbol_mapping=sym_map) assert mod.code == cubin @@ -177,8 +177,8 @@ def test_object_code_load_cubin(get_saxpy_kernel_cubin): def test_object_code_load_cubin_from_file(get_saxpy_kernel_cubin, tmp_path): _, mod = get_saxpy_kernel_cubin - cubin = mod._module - sym_map = mod._sym_map + cubin = mod.code + sym_map = mod.symbol_mapping assert isinstance(cubin, bytes) cubin_file = tmp_path / "test.cubin" cubin_file.write_bytes(cubin) @@ -194,14 +194,13 @@ def test_object_code_handle(get_saxpy_kernel_cubin): def test_object_code_load_ltoir(get_saxpy_kernel_ltoir): mod = get_saxpy_kernel_ltoir - ltoir = mod._module - sym_map = mod._sym_map + ltoir = mod.code + sym_map = mod.symbol_mapping assert isinstance(ltoir, bytes) mod_obj = ObjectCode.from_ltoir(ltoir, symbol_mapping=sym_map) assert mod_obj.code == ltoir assert mod_obj.code_type == "ltoir" # ltoir doesn't support kernel retrieval directly as it's used for linking - assert mod_obj._handle is None # Test that get_kernel fails for unsupported code type with pytest.raises(RuntimeError, match=r'Unsupported code type "ltoir"'): mod_obj.get_kernel("saxpy") @@ -209,8 +208,8 @@ def test_object_code_load_ltoir(get_saxpy_kernel_ltoir): def test_object_code_load_ltoir_from_file(get_saxpy_kernel_ltoir, tmp_path): mod = get_saxpy_kernel_ltoir - ltoir = mod._module - sym_map = mod._sym_map + ltoir = mod.code + sym_map = mod.symbol_mapping assert isinstance(ltoir, bytes) ltoir_file = tmp_path / "test.ltoir" ltoir_file.write_bytes(ltoir) @@ -218,7 +217,6 @@ def test_object_code_load_ltoir_from_file(get_saxpy_kernel_ltoir, tmp_path): assert mod_obj.code == str(ltoir_file) assert mod_obj.code_type == "ltoir" # ltoir doesn't support kernel retrieval directly as it's used for 
linking - assert mod_obj._handle is None def test_saxpy_arguments(get_saxpy_kernel_cubin, cuda12_4_prerequisite_check): @@ -423,7 +421,7 @@ def test_module_serialization_roundtrip(get_saxpy_kernel_cubin): assert isinstance(result, ObjectCode) assert objcode.code == result.code - assert objcode._sym_map == result._sym_map + assert objcode.symbol_mapping == result.symbol_mapping assert objcode.code_type == result.code_type @@ -432,7 +430,7 @@ def test_kernel_from_handle(get_saxpy_kernel_cubin): original_kernel, objcode = get_saxpy_kernel_cubin # Get the handle from the original kernel - handle = int(original_kernel._handle) + handle = int(original_kernel.handle) # Create a new Kernel from the handle kernel_from_handle = Kernel.from_handle(handle, objcode) @@ -449,7 +447,7 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin): original_kernel, _ = get_saxpy_kernel_cubin # Get the handle from the original kernel - handle = int(original_kernel._handle) + handle = int(original_kernel.handle) # Create a new Kernel from the handle without a module # This is supported on CUDA 12+ backend (CUkernel) @@ -486,7 +484,7 @@ def test_kernel_from_handle_type_validation(invalid_value): def test_kernel_from_handle_invalid_module_type(get_saxpy_kernel_cubin): """Test Kernel.from_handle() with invalid module parameter""" original_kernel, _ = get_saxpy_kernel_cubin - handle = int(original_kernel._handle) + handle = int(original_kernel.handle) # Invalid module type (should fail type assertion in _from_obj) with pytest.raises((TypeError, AssertionError)): @@ -499,7 +497,7 @@ def test_kernel_from_handle_invalid_module_type(get_saxpy_kernel_cubin): def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin): """Test creating multiple Kernel instances from the same handle""" original_kernel, objcode = get_saxpy_kernel_cubin - handle = int(original_kernel._handle) + handle = int(original_kernel.handle) # Create multiple Kernel instances from the same handle kernel1 = 
Kernel.from_handle(handle, objcode) @@ -512,4 +510,4 @@ def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin): assert isinstance(kernel3, Kernel) # All should reference the same underlying CUDA kernel handle - assert int(kernel1._handle) == int(kernel2._handle) == int(kernel3._handle) == handle + assert int(kernel1.handle) == int(kernel2.handle) == int(kernel3.handle) == handle diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 9a9e4926ae..e2b3783dd7 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -335,7 +335,7 @@ def test_cpp_program_with_pch_options(init_cuda, tmp_path): @pytest.mark.parametrize("options", options) def test_ptx_program_with_various_options(init_cuda, ptx_code_object, options): - program = Program(ptx_code_object._module.decode(), "ptx", options=options) + program = Program(ptx_code_object.code.decode(), "ptx", options=options) assert program.backend == ("driver" if is_culink_backend else "nvJitLink") program.compile("cubin") program.close() @@ -378,7 +378,7 @@ def test_program_compile_valid_target_type(init_cuda): ptx_kernel = ptx_object_code.get_kernel("my_kernel") assert isinstance(ptx_kernel, Kernel) - program = Program(ptx_object_code._module.decode(), "ptx", options={"name": "24"}) + program = Program(ptx_object_code.code.decode(), "ptx", options={"name": "24"}) cubin_object_code = program.compile("cubin") assert isinstance(cubin_object_code, ObjectCode) assert cubin_object_code.name == "24" From 3eb6b523b43555c9b05b281cdd18e3d93c7e1e22 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 21 Jan 2026 15:24:22 -0800 Subject: [PATCH 03/11] Convert module-level functions and Kernel._get_arguments_info to cdef - Module globals: _inited, _py_major_ver, _py_minor_ver, _driver_ver, _kernel_ctypes, _paraminfo_supported -> cdef typed - Module functions: _lazy_init, _get_py_major_ver, _get_py_minor_ver, _get_driver_ver, _get_kernel_ctypes, _is_paraminfo_supported, 
_make_dummy_library_handle -> cdef inline with exception specs - Module constant: _supported_code_type -> cdef tuple - Kernel._get_arguments_info -> cdef tuple Note: KernelAttributes remains a regular Python class due to segfaults when converted to cdef class (likely due to weakref interaction with cdef class properties). --- cuda_core/cuda/core/_module.pxd | 4 ++- cuda_core/cuda/core/_module.pyx | 48 +++++++++++++++++---------------- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd index b83e08ed47..5f0fb9b572 100644 --- a/cuda_core/cuda/core/_module.pxd +++ b/cuda_core/cuda/core/_module.pxd @@ -11,13 +11,15 @@ cdef class Kernel: cdef: object _handle # CUkernel (will become KernelHandle in phase 2c) ObjectCode _module # ObjectCode reference - object _attributes # KernelAttributes (regular Python class) + object _attributes # KernelAttributes (regular Python class, lazy) KernelOccupancy _occupancy # KernelOccupancy (lazy) object __weakref__ # Enable weak references @staticmethod cdef Kernel _from_obj(object obj, ObjectCode mod) + cdef tuple _get_arguments_info(self, bint param_info=*) + cdef class ObjectCode: cdef: diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index 69d1db4272..74c92993fc 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -24,16 +24,16 @@ __all__ = ["Kernel", "ObjectCode"] # Lazy initialization state and synchronization # For Python 3.13t (free-threaded builds), we use a lock to ensure thread-safe initialization. # For regular Python builds with GIL, the lock overhead is minimal and the code remains safe. 
-_init_lock = threading.Lock() -_inited = False -_py_major_ver = None -_py_minor_ver = None -_driver_ver = None -_kernel_ctypes = None -_paraminfo_supported = False +cdef object _init_lock = threading.Lock() +cdef bint _inited = False +cdef int _py_major_ver = 0 +cdef int _py_minor_ver = 0 +cdef int _driver_ver = 0 +cdef tuple _kernel_ctypes = None +cdef bint _paraminfo_supported = False -def _lazy_init(): +cdef int _lazy_init() except -1: """ Initialize module-level state in a thread-safe manner. @@ -48,13 +48,13 @@ def _lazy_init(): global _inited # Fast path: already initialized (no lock needed for read) if _inited: - return + return 0 # Slow path: acquire lock and initialize with _init_lock: # Double-check: another thread might have initialized while we waited if _inited: - return + return 0 global _py_major_ver, _py_minor_ver, _driver_ver, _kernel_ctypes, _paraminfo_supported # binding availability depends on cuda-python version @@ -66,33 +66,35 @@ def _lazy_init(): # Mark as initialized (must be last to ensure all state is set) _inited = True + return 0 + -# Auto-initializing property accessors -def _get_py_major_ver(): +# Auto-initializing accessors (cdef for internal use) +cdef inline int _get_py_major_ver() except -1: """Get the Python binding major version, initializing if needed.""" _lazy_init() return _py_major_ver -def _get_py_minor_ver(): +cdef inline int _get_py_minor_ver() except -1: """Get the Python binding minor version, initializing if needed.""" _lazy_init() return _py_minor_ver -def _get_driver_ver(): +cdef inline int _get_driver_ver() except -1: """Get the CUDA driver version, initializing if needed.""" _lazy_init() return _driver_ver -def _get_kernel_ctypes(): +cdef inline tuple _get_kernel_ctypes(): """Get the kernel ctypes tuple, initializing if needed.""" _lazy_init() return _kernel_ctypes -def _is_paraminfo_supported(): +cdef inline bint _is_paraminfo_supported() except -1: """Return True if cuKernelGetParamInfo is available (driver >= 
12.4).""" _lazy_init() return _paraminfo_supported @@ -111,7 +113,7 @@ def _is_cukernel_get_library_supported() -> bool: ) -def _make_dummy_library_handle(): +cdef inline object _make_dummy_library_handle(): """Create a non-null placeholder CUlibrary handle to disable lazy loading.""" return driver.CUlibrary(1) if hasattr(driver, "CUlibrary") else 1 @@ -132,10 +134,10 @@ class KernelAttributes: _lazy_init() return self - def _get_cached_attribute(self, device_id: Device | int, attribute: driver.CUfunction_attribute) -> int: + def _get_cached_attribute(self, device_id, attribute): """Helper function to get a cached attribute or fetch and cache it if not present.""" device_id = Device(device_id).device_id - cache_key = device_id, attribute + cache_key = (device_id, attribute) result = self._cache.get(cache_key, cache_key) if result is not cache_key: return result @@ -440,15 +442,15 @@ cdef class Kernel: self._attributes = KernelAttributes._init(self) return self._attributes - def _get_arguments_info(self, param_info=False) -> tuple[int, list[ParamInfo]]: + cdef tuple _get_arguments_info(self, bint param_info=False): if not _is_paraminfo_supported(): driver_ver = _get_driver_ver() raise NotImplementedError( "Driver version 12.4 or newer is required for this function. " f"Using driver version {driver_ver // 1000}.{(driver_ver % 1000) // 10}" ) - arg_pos = 0 - param_info_data = [] + cdef int arg_pos = 0 + cdef list param_info_data = [] while True: result = driver.cuKernelGetParamInfo(self._handle, arg_pos) if result[0] != driver.CUresult.CUDA_SUCCESS: @@ -535,7 +537,7 @@ cdef class Kernel: CodeTypeT = bytes | bytearray | str -_supported_code_type = ("cubin", "ptx", "ltoir", "fatbin", "object", "library") +cdef tuple _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin", "object", "library") cdef class ObjectCode: """Represent a compiled program to be loaded onto the device. 
From 0cbeea8174cc9bc55b4d25d99d901aec5edc9d7e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 21 Jan 2026 15:53:25 -0800 Subject: [PATCH 04/11] Convert KernelAttributes to cdef class Follow the _MemPoolAttributes pattern: - cdef class with inline cdef attributes (_kernel_weakref, _cache) - _init as @classmethod (not @staticmethod cdef) - _get_cached_attribute and _resolve_device_id use except? -1 - Explicit cast when dereferencing weakref --- cuda_core/cuda/core/_module.pyx | 75 +++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 26 deletions(-) diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index 74c92993fc..564244e521 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -118,89 +118,110 @@ cdef inline object _make_dummy_library_handle(): return driver.CUlibrary(1) if hasattr(driver, "CUlibrary") else 1 -class KernelAttributes: +cdef class KernelAttributes: """Provides access to kernel attributes. Uses weakref to avoid preventing Kernel GC.""" - __slots__ = ("_kernel", "_cache") + cdef: + object _kernel_weakref + dict _cache - def __new__(cls, *args, **kwargs): + def __init__(self, *args, **kwargs): raise RuntimeError("KernelAttributes cannot be instantiated directly. Please use Kernel APIs.") @classmethod def _init(cls, kernel): - self = object.__new__(cls) - self._kernel = weakref.ref(kernel) + cdef KernelAttributes self = KernelAttributes.__new__(cls) + self._kernel_weakref = weakref.ref(kernel) self._cache = {} _lazy_init() return self - def _get_cached_attribute(self, device_id, attribute): + cdef int _get_cached_attribute(self, int device_id, object attribute) except? 
-1:
         """Helper function to get a cached attribute or fetch and cache it if not present."""
-        device_id = Device(device_id).device_id
-        cache_key = (device_id, attribute)
+        cdef tuple cache_key = (device_id, attribute)
         result = self._cache.get(cache_key, cache_key)
         if result is not cache_key:
             return result
-        kernel = self._kernel()
+        cdef Kernel kernel = <Kernel>(self._kernel_weakref())
         if kernel is None:
             raise RuntimeError("Cannot access kernel attributes for expired Kernel object")
-        result = handle_return(driver.cuKernelGetAttribute(attribute, kernel.handle, device_id))
+        result = handle_return(driver.cuKernelGetAttribute(attribute, kernel._handle, device_id))
         self._cache[cache_key] = result
         return result
 
+    cdef inline int _resolve_device_id(self, device_id) except? -1:
+        """Convert Device or int to device_id int."""
+        return Device(device_id).device_id
+
     def max_threads_per_block(self, device_id: Device | int = None) -> int:
         """int : The maximum number of threads per block.
         This attribute is read-only."""
         return self._get_cached_attribute(
-            device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
+            self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
         )
 
     def shared_size_bytes(self, device_id: Device | int = None) -> int:
         """int : The size in bytes of statically-allocated shared memory required by this function.
         This attribute is read-only."""
-        return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)
+        return self._get_cached_attribute(
+            self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES
+        )
 
     def const_size_bytes(self, device_id: Device | int = None) -> int:
         """int : The size in bytes of user-allocated constant memory required by this function.
This attribute is read-only.""" - return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES) + return self._get_cached_attribute( + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES + ) def local_size_bytes(self, device_id: Device | int = None) -> int: """int : The size in bytes of local memory used by each thread of this function. This attribute is read-only.""" - return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES) + return self._get_cached_attribute( + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES + ) def num_regs(self, device_id: Device | int = None) -> int: """int : The number of registers used by each thread of this function. This attribute is read-only.""" - return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS) + return self._get_cached_attribute( + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS + ) def ptx_version(self, device_id: Device | int = None) -> int: """int : The PTX virtual architecture version for which the function was compiled. This attribute is read-only.""" - return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PTX_VERSION) + return self._get_cached_attribute( + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PTX_VERSION + ) def binary_version(self, device_id: Device | int = None) -> int: """int : The binary architecture version for which the function was compiled. 
This attribute is read-only.""" - return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_BINARY_VERSION) + return self._get_cached_attribute( + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_BINARY_VERSION + ) def cache_mode_ca(self, device_id: Device | int = None) -> bool: """bool : Whether the function has been compiled with user specified option "-Xptxas --dlcm=ca" set. This attribute is read-only.""" - return bool(self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CACHE_MODE_CA)) + return bool( + self._get_cached_attribute( + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CACHE_MODE_CA + ) + ) def max_dynamic_shared_size_bytes(self, device_id: Device | int = None) -> int: """int : The maximum size in bytes of dynamically-allocated shared memory that can be used by this function.""" return self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES ) def preferred_shared_memory_carveout(self, device_id: Device | int = None) -> int: """int : The shared memory carveout preference, in percent of the total shared memory.""" return self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT ) def cluster_size_must_be_set(self, device_id: Device | int = None) -> bool: @@ -208,40 +229,42 @@ class KernelAttributes: This attribute is read-only.""" return bool( self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET ) ) def 
required_cluster_width(self, device_id: Device | int = None) -> int: """int : The required cluster width in blocks.""" return self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH ) def required_cluster_height(self, device_id: Device | int = None) -> int: """int : The required cluster height in blocks.""" return self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT ) def required_cluster_depth(self, device_id: Device | int = None) -> int: """int : The required cluster depth in blocks.""" return self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH + self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH ) def non_portable_cluster_size_allowed(self, device_id: Device | int = None) -> bool: """bool : Whether the function can be launched with non-portable cluster size.""" return bool( self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED + self._resolve_device_id(device_id), + driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, ) ) def cluster_scheduling_policy_preference(self, device_id: Device | int = None) -> int: """int : The block scheduling policy of a function.""" return self._get_cached_attribute( - device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE + self._resolve_device_id(device_id), + driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE, ) From 3044b45547ffb1bb56474728c01f8d20dfa2306e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 21 Jan 2026 17:16:57 -0800 
Subject: [PATCH 05/11] Add LibraryHandle and KernelHandle to resource_handles infrastructure Extends the RAII handle system to support CUlibrary and CUkernel driver objects used in _module.pyx. This provides automatic lifetime management and proper cleanup for library and kernel handles. Changes: - Add LibraryHandle/KernelHandle types with factory functions - Update Kernel, ObjectCode, KernelOccupancy to use typed handles - Move KernelAttributes cdef block to .pxd for strong typing - Update _launcher.pyx to access kernel handle directly via cdef --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 82 +++++++++++++ cuda_core/cuda/core/_cpp/resource_handles.hpp | 66 ++++++++++ cuda_core/cuda/core/_launcher.pyx | 4 +- cuda_core/cuda/core/_module.pxd | 26 ++-- cuda_core/cuda/core/_module.pyx | 114 +++++++++++------- cuda_core/cuda/core/_resource_handles.pxd | 18 +++ cuda_core/cuda/core/_resource_handles.pyx | 28 +++++ 7 files changed, 285 insertions(+), 53 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 724ea97169..7e6e388579 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -51,6 +51,11 @@ decltype(&cuMemFreeHost) p_cuMemFreeHost = nullptr; decltype(&cuMemPoolImportPointer) p_cuMemPoolImportPointer = nullptr; +decltype(&cuLibraryLoadFromFile) p_cuLibraryLoadFromFile = nullptr; +decltype(&cuLibraryLoadData) p_cuLibraryLoadData = nullptr; +decltype(&cuLibraryUnload) p_cuLibraryUnload = nullptr; +decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel = nullptr; + // ============================================================================ // GIL management helpers // ============================================================================ @@ -682,4 +687,81 @@ DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export } } +// ============================================================================ +// 
Library Handles
+// ============================================================================
+
+namespace {
+struct LibraryBox {
+    CUlibrary resource;
+};
+}  // namespace
+
+LibraryHandle create_library_handle_from_file(const char* path) {
+    GILReleaseGuard gil;
+    CUlibrary library;
+    if (CUDA_SUCCESS != (err = p_cuLibraryLoadFromFile(&library, path, nullptr, nullptr, 0, nullptr, nullptr, 0))) {
+        return {};
+    }
+
+    auto box = std::shared_ptr<const LibraryBox>(
+        new LibraryBox{library},
+        [](const LibraryBox* b) {
+            GILReleaseGuard gil;
+            p_cuLibraryUnload(b->resource);
+            delete b;
+        }
+    );
+    return LibraryHandle(box, &box->resource);
+}
+
+LibraryHandle create_library_handle_from_data(const void* data) {
+    GILReleaseGuard gil;
+    CUlibrary library;
+    if (CUDA_SUCCESS != (err = p_cuLibraryLoadData(&library, data, nullptr, nullptr, 0, nullptr, nullptr, 0))) {
+        return {};
+    }
+
+    auto box = std::shared_ptr<const LibraryBox>(
+        new LibraryBox{library},
+        [](const LibraryBox* b) {
+            GILReleaseGuard gil;
+            p_cuLibraryUnload(b->resource);
+            delete b;
+        }
+    );
+    return LibraryHandle(box, &box->resource);
+}
+
+LibraryHandle create_library_handle_ref(CUlibrary library) {
+    auto box = std::make_shared<const LibraryBox>(LibraryBox{library});
+    return LibraryHandle(box, &box->resource);
+}
+
+// ============================================================================
+// Kernel Handles
+// ============================================================================
+
+namespace {
+struct KernelBox {
+    CUkernel resource;
+    LibraryHandle h_library;  // Keeps library alive
+};
+}  // namespace
+
+KernelHandle get_kernel_from_library(LibraryHandle h_library, const char* name) {
+    GILReleaseGuard gil;
+    CUkernel kernel;
+    if (CUDA_SUCCESS != (err = p_cuLibraryGetKernel(&kernel, *h_library, name))) {
+        return {};
+    }
+
+    return create_kernel_handle_ref(kernel, h_library);
+}
+
+KernelHandle create_kernel_handle_ref(CUkernel kernel, LibraryHandle h_library) {
+    auto box = std::make_shared<const KernelBox>(KernelBox{kernel, h_library});
+    return 
KernelHandle(box, &box->resource);
+}
+
 }  // namespace cuda_core
diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp
index 4a6d9bb241..ba76a6c054 100644
--- a/cuda_core/cuda/core/_cpp/resource_handles.hpp
+++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp
@@ -61,6 +61,12 @@ extern decltype(&cuMemFreeHost) p_cuMemFreeHost;
 
 extern decltype(&cuMemPoolImportPointer) p_cuMemPoolImportPointer;
 
+// Library
+extern decltype(&cuLibraryLoadFromFile) p_cuLibraryLoadFromFile;
+extern decltype(&cuLibraryLoadData) p_cuLibraryLoadData;
+extern decltype(&cuLibraryUnload) p_cuLibraryUnload;
+extern decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel;
+
 // ============================================================================
 // Handle type aliases - expose only the raw CUDA resource
 // ============================================================================
@@ -69,6 +75,8 @@ using ContextHandle = std::shared_ptr<const CUcontext>;
 using StreamHandle = std::shared_ptr<const CUstream>;
 using EventHandle = std::shared_ptr<const CUevent>;
 using MemoryPoolHandle = std::shared_ptr<const CUmemoryPool>;
+using LibraryHandle = std::shared_ptr<const CUlibrary>;
+using KernelHandle = std::shared_ptr<const CUkernel>;
 
 // ============================================================================
 // Context handle functions
@@ -218,6 +226,40 @@ StreamHandle deallocation_stream(const DevicePtrHandle& h) noexcept;
 // Set the deallocation stream for a device pointer handle.
 void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) noexcept;
 
+// ============================================================================
+// Library handle functions
+// ============================================================================
+
+// Create an owning library handle by loading from a file path.
+// When the last reference is released, cuLibraryUnload is called automatically.
+// Returns empty handle on error (caller must check).
+LibraryHandle create_library_handle_from_file(const char* path); + +// Create an owning library handle by loading from memory data. +// The driver makes an internal copy of the data; caller can free it after return. +// When the last reference is released, cuLibraryUnload is called automatically. +// Returns empty handle on error (caller must check). +LibraryHandle create_library_handle_from_data(const void* data); + +// Create a non-owning library handle (references existing library). +// Use for borrowed libraries (e.g., from foreign code). +// The library will NOT be unloaded when the handle is released. +LibraryHandle create_library_handle_ref(CUlibrary library); + +// ============================================================================ +// Kernel handle functions +// ============================================================================ + +// Get a kernel from a library by name. +// The kernel structurally depends on the provided library handle. +// Kernels have no explicit destroy - their lifetime is tied to the library. +// Returns empty handle on error (caller must check). +KernelHandle get_kernel_from_library(LibraryHandle h_library, const char* name); + +// Create a non-owning kernel handle with library dependency. +// Use for borrowed kernels. The library handle keeps the library alive. +KernelHandle create_kernel_handle_ref(CUkernel kernel, LibraryHandle h_library); + // ============================================================================ // Overloaded helper functions to extract raw resources from handles // ============================================================================ @@ -243,6 +285,14 @@ inline CUdeviceptr as_cu(const DevicePtrHandle& h) noexcept { return h ? *h : 0; } +inline CUlibrary as_cu(const LibraryHandle& h) noexcept { + return h ? *h : nullptr; +} + +inline CUkernel as_cu(const KernelHandle& h) noexcept { + return h ? 
*h : nullptr;
+}
+
 // as_intptr() - extract handle as intptr_t for Python interop
 // Using signed intptr_t per C standard convention and issue #1342
 inline std::intptr_t as_intptr(const ContextHandle& h) noexcept {
@@ -265,6 +315,14 @@ inline std::intptr_t as_intptr(const DevicePtrHandle& h) noexcept {
     return static_cast<std::intptr_t>(as_cu(h));
 }
 
+inline std::intptr_t as_intptr(const LibraryHandle& h) noexcept {
+    return reinterpret_cast<std::intptr_t>(as_cu(h));
+}
+
+inline std::intptr_t as_intptr(const KernelHandle& h) noexcept {
+    return reinterpret_cast<std::intptr_t>(as_cu(h));
+}
+
 // as_py() - convert handle to Python driver wrapper object (returns new reference)
 namespace detail {
 // n.b. class lookup is not cached to avoid deadlock hazard, see DESIGN.md
@@ -300,4 +358,12 @@ inline PyObject* as_py(const DevicePtrHandle& h) noexcept {
     return detail::make_py("CUdeviceptr", as_intptr(h));
 }
 
+inline PyObject* as_py(const LibraryHandle& h) noexcept {
+    return detail::make_py("CUlibrary", as_intptr(h));
+}
+
+inline PyObject* as_py(const KernelHandle& h) noexcept {
+    return detail::make_py("CUkernel", as_intptr(h));
+}
+
 }  // namespace cuda_core
diff --git a/cuda_core/cuda/core/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx
index 9066e72732..48eb2038b2 100644
--- a/cuda_core/cuda/core/_launcher.pyx
+++ b/cuda_core/cuda/core/_launcher.pyx
@@ -8,6 +8,7 @@ from cuda.bindings cimport cydriver
 
 from cuda.core._launch_config cimport LaunchConfig
 from cuda.core._kernel_arg_handler cimport ParamHolder
+from cuda.core._module cimport Kernel
 from cuda.core._resource_handles cimport as_cu
 from cuda.core._stream cimport Stream_accept, Stream
 from cuda.core._utils.cuda_utils cimport (
@@ -80,7 +81,8 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern
     # Note: We now use CUkernel handles exclusively (CUDA 12+), but they can be cast to
     # CUfunction for use with cuLaunchKernel, as both handle types are interchangeable
     # for kernel launch purposes.
-    cdef cydriver.CUfunction func_handle = <cydriver.CUfunction>(<uintptr_t>(kernel.handle))
+    cdef Kernel ker = kernel
+    cdef cydriver.CUfunction func_handle = <cydriver.CUfunction>as_cu(ker._h_kernel)
 
     # Note: CUkernel can still be launched via cuLaunchKernel (not just cuLaunchKernelEx).
     # We check both binding & driver versions here mainly to see if the "Ex" API is
diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd
index 5f0fb9b572..c17c7163bd 100644
--- a/cuda_core/cuda/core/_module.pxd
+++ b/cuda_core/cuda/core/_module.pxd
@@ -2,28 +2,31 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from cuda.core._resource_handles cimport LibraryHandle, KernelHandle
+
 cdef class ObjectCode
 cdef class Kernel
 cdef class KernelOccupancy
+cdef class KernelAttributes
 
 
 cdef class Kernel:
     cdef:
-        object _handle              # CUkernel (will become KernelHandle in phase 2c)
+        KernelHandle _h_kernel
         ObjectCode _module          # ObjectCode reference
-        object _attributes          # KernelAttributes (regular Python class, lazy)
-        KernelOccupancy _occupancy  # KernelOccupancy (lazy)
+        KernelAttributes _attributes  # lazy
+        KernelOccupancy _occupancy    # lazy
 
         object __weakref__          # Enable weak references
 
     @staticmethod
-    cdef Kernel _from_obj(object obj, ObjectCode mod)
+    cdef Kernel _from_obj(KernelHandle h_kernel, ObjectCode mod)
 
     cdef tuple _get_arguments_info(self, bint param_info=*)
 
 
 cdef class ObjectCode:
     cdef:
-        object _handle       # CUlibrary (will become LibraryHandle in phase 2c)
+        LibraryHandle _h_library
         str _code_type
         object _module       # bytes/str source
        dict _sym_map
@@ -37,7 +40,16 @@ cdef class ObjectCode:
 
 cdef class KernelOccupancy:
     cdef:
-        object _handle  # CUkernel reference
+        KernelHandle _h_kernel
 
     @staticmethod
-    cdef KernelOccupancy _init(object handle)
+    cdef KernelOccupancy _init(KernelHandle h_kernel)
+
+
+cdef class KernelAttributes:
+    cdef:
+        object _kernel_weakref
+        dict _cache
+
+    cdef int _get_cached_attribute(self, int device_id, object attribute) except? -1
+    cdef int _resolve_device_id(self, device_id) except? 
-1 diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index 564244e521..423290d477 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -11,13 +11,25 @@ from collections import namedtuple from cuda.core._device import Device from cuda.core._launch_config import LaunchConfig, _to_native_launch_config +from cuda.core._resource_handles cimport ( + LibraryHandle, + KernelHandle, + create_library_handle_from_file, + create_library_handle_from_data, + create_library_handle_ref, + get_kernel_from_library, + create_kernel_handle_ref, + get_last_error, + as_py, +) from cuda.core._stream import Stream from cuda.core._utils.clear_error_support import ( - assert_type, assert_type_str_or_bytes_like, raise_code_path_meant_to_be_unreachable, ) -from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return +from cuda.bindings cimport cydriver __all__ = ["Kernel", "ObjectCode"] @@ -113,18 +125,14 @@ def _is_cukernel_get_library_supported() -> bool: ) -cdef inline object _make_dummy_library_handle(): - """Create a non-null placeholder CUlibrary handle to disable lazy loading.""" - return driver.CUlibrary(1) if hasattr(driver, "CUlibrary") else 1 +cdef inline LibraryHandle _make_empty_library_handle(): + """Create an empty LibraryHandle to indicate no library loaded.""" + return LibraryHandle() # Empty shared_ptr cdef class KernelAttributes: """Provides access to kernel attributes. Uses weakref to avoid preventing Kernel GC.""" - cdef: - object _kernel_weakref - dict _cache - def __init__(self, *args, **kwargs): raise RuntimeError("KernelAttributes cannot be instantiated directly. 
Please use Kernel APIs.")
@@ -145,7 +153,7 @@ cdef class KernelAttributes:
         cdef Kernel kernel = <Kernel>(self._kernel_weakref())
         if kernel is None:
             raise RuntimeError("Cannot access kernel attributes for expired Kernel object")
-        result = handle_return(driver.cuKernelGetAttribute(attribute, kernel._handle, device_id))
+        result = handle_return(driver.cuKernelGetAttribute(attribute, as_py(kernel._h_kernel), device_id))
         self._cache[cache_key] = result
         return result
 
@@ -280,9 +288,9 @@ cdef class KernelOccupancy:
         raise RuntimeError("KernelOccupancy cannot be instantiated directly. Please use Kernel APIs.")
 
     @staticmethod
-    cdef KernelOccupancy _init(object handle):
+    cdef KernelOccupancy _init(KernelHandle h_kernel):
         cdef KernelOccupancy self = KernelOccupancy.__new__(KernelOccupancy)
-        self._handle = handle
+        self._h_kernel = h_kernel
         return self
 
     def max_active_blocks_per_multiprocessor(self, block_size: int, dynamic_shared_memory_size: int) -> int:
@@ -311,7 +319,9 @@ cdef class KernelOccupancy:
         """
 
         return handle_return(
-            driver.cuOccupancyMaxActiveBlocksPerMultiprocessor(self._handle, block_size, dynamic_shared_memory_size)
+            driver.cuOccupancyMaxActiveBlocksPerMultiprocessor(
+                as_py(self._h_kernel), block_size, dynamic_shared_memory_size
+            )
         )
 
     def max_potential_block_size(
@@ -346,16 +356,17 @@ cdef class KernelOccupancy:
         Interpreter Lock may lead to deadlocks.
""" + cdef object kernel_py = as_py(self._h_kernel) if isinstance(dynamic_shared_memory_needed, int): min_grid_size, max_block_size = handle_return( driver.cuOccupancyMaxPotentialBlockSize( - self._handle, None, dynamic_shared_memory_needed, block_size_limit + kernel_py, None, dynamic_shared_memory_needed, block_size_limit ) ) elif isinstance(dynamic_shared_memory_needed, driver.CUoccupancyB2DSize): min_grid_size, max_block_size = handle_return( driver.cuOccupancyMaxPotentialBlockSize( - self._handle, dynamic_shared_memory_needed.getPtr(), 0, block_size_limit + kernel_py, dynamic_shared_memory_needed.getPtr(), 0, block_size_limit ) ) else: @@ -383,7 +394,9 @@ cdef class KernelOccupancy: Dynamic shared memory available per block for given launch configuration. """ return handle_return( - driver.cuOccupancyAvailableDynamicSMemPerBlock(self._handle, num_blocks_per_multiprocessor, block_size) + driver.cuOccupancyAvailableDynamicSMemPerBlock( + as_py(self._h_kernel), num_blocks_per_multiprocessor, block_size + ) ) def max_potential_cluster_size(self, config: LaunchConfig, stream: Stream | None = None) -> int: @@ -406,7 +419,7 @@ cdef class KernelOccupancy: drv_cfg = _to_native_launch_config(config) if stream is not None: drv_cfg.hStream = stream.handle - return handle_return(driver.cuOccupancyMaxPotentialClusterSize(self._handle, drv_cfg)) + return handle_return(driver.cuOccupancyMaxPotentialClusterSize(as_py(self._h_kernel), drv_cfg)) def max_active_clusters(self, config: LaunchConfig, stream: Stream | None = None) -> int: """Maximum number of active clusters on the target device. 
@@ -428,7 +441,7 @@ cdef class KernelOccupancy: drv_cfg = _to_native_launch_config(config) if stream is not None: drv_cfg.hStream = stream.handle - return handle_return(driver.cuOccupancyMaxActiveClusters(self._handle, drv_cfg)) + return handle_return(driver.cuOccupancyMaxActiveClusters(as_py(self._h_kernel), drv_cfg)) ParamInfo = namedtuple("ParamInfo", ["offset", "size"]) @@ -449,10 +462,9 @@ cdef class Kernel: raise RuntimeError("Kernel objects cannot be instantiated directly. Please use ObjectCode APIs.") @staticmethod - cdef Kernel _from_obj(object obj, ObjectCode mod): - assert_type(obj, _get_kernel_ctypes()) + cdef Kernel _from_obj(KernelHandle h_kernel, ObjectCode mod): cdef Kernel ker = Kernel.__new__(Kernel) - ker._handle = obj + ker._h_kernel = h_kernel ker._module = mod ker._attributes = None ker._occupancy = None @@ -474,8 +486,9 @@ cdef class Kernel: ) cdef int arg_pos = 0 cdef list param_info_data = [] + cdef object kernel_py = as_py(self._h_kernel) while True: - result = driver.cuKernelGetParamInfo(self._handle, arg_pos) + result = driver.cuKernelGetParamInfo(kernel_py, arg_pos) if result[0] != driver.CUresult.CUDA_SUCCESS: break if param_info: @@ -502,7 +515,7 @@ cdef class Kernel: def occupancy(self) -> KernelOccupancy: """Get the occupancy information for launching this kernel.""" if self._occupancy is None: - self._occupancy = KernelOccupancy._init(self._handle) + self._occupancy = KernelOccupancy._init(self._h_kernel) return self._occupancy @property @@ -514,7 +527,7 @@ cdef class Kernel: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Kernel.handle)``. 
""" - return self._handle + return as_py(self._h_kernel) @staticmethod def from_handle(handle: int, mod: ObjectCode = None) -> Kernel: @@ -537,25 +550,28 @@ cdef class Kernel: if not isinstance(handle, int): raise TypeError(f"handle must be an integer, got {type(handle).__name__}") - # Convert the integer handle to CUkernel driver type - kernel_obj = driver.CUkernel(handle) + # Convert the integer handle to CUkernel + cdef cydriver.CUkernel cu_kernel = handle + cdef KernelHandle h_kernel - # If no module provided, create a placeholder + # If no module provided, create a placeholder and try to get the library if mod is None: - # For CUkernel, we can (optionally) inverse-lookup the owning CUlibrary via - # cuKernelGetLibrary (added in CUDA 12.5). If the API is not available, we fall - # back to a non-null dummy handle purely to disable lazy loading. mod = ObjectCode._init(b"", "cubin", "", None) if _is_cukernel_get_library_supported(): + # Try to get the owning library via cuKernelGetLibrary try: - mod._handle = handle_return(driver.cuKernelGetLibrary(kernel_obj)) - except (CUDAError, RuntimeError): - # Best-effort: don't fail construction if inverse lookup fails. 
- mod._handle = _make_dummy_library_handle() - else: - mod._handle = _make_dummy_library_handle() + cu_library = handle_return(driver.cuKernelGetLibrary(driver.CUkernel(handle))) + mod._h_library = create_library_handle_ref(int(cu_library)) + except Exception: + # Best-effort: don't fail construction if inverse lookup fails + pass + + # Create kernel handle with library dependency + h_kernel = create_kernel_handle_ref(cu_kernel, mod._h_library) + if not h_kernel: + HANDLE_RETURN(get_last_error()) - return Kernel._from_obj(kernel_obj, mod) + return Kernel._from_obj(h_kernel, mod) CodeTypeT = bytes | bytearray | str @@ -587,8 +603,8 @@ cdef class ObjectCode: assert code_type in _supported_code_type, f"{code_type=} is not supported" cdef ObjectCode self = ObjectCode.__new__(ObjectCode) - # handle is assigned during _lazy_load - self._handle = None + # _h_library is assigned during _lazy_load_module + self._h_library = LibraryHandle() # Empty handle _lazy_init() self._code_type = code_type @@ -722,15 +738,21 @@ cdef class ObjectCode: # TODO: do we want to unload in a finalizer? Probably not.. 
cdef int _lazy_load_module(self) except -1: - if self._handle is not None: + if self._h_library: return 0 module = self._module assert_type_str_or_bytes_like(module) + cdef bytes path_bytes if isinstance(module, str): - self._handle = handle_return(driver.cuLibraryLoadFromFile(module.encode(), [], [], 0, [], [], 0)) + path_bytes = module.encode() + self._h_library = create_library_handle_from_file(path_bytes) + if not self._h_library: + HANDLE_RETURN(get_last_error()) return 0 if isinstance(module, (bytes, bytearray)): - self._handle = handle_return(driver.cuLibraryLoadData(module, [], [], 0, [], [], 0)) + self._h_library = create_library_handle_from_data(module) + if not self._h_library: + HANDLE_RETURN(get_last_error()) return 0 raise_code_path_meant_to_be_unreachable() return -1 @@ -758,8 +780,10 @@ cdef class ObjectCode: except KeyError: name = name.encode() - data = handle_return(driver.cuLibraryGetKernel(self._handle, name)) - return Kernel._from_obj(data, self) + cdef KernelHandle h_kernel = get_kernel_from_library(self._h_library, name) + if not h_kernel: + HANDLE_RETURN(get_last_error()) + return Kernel._from_obj(h_kernel, self) @property def code(self) -> CodeTypeT: @@ -791,4 +815,4 @@ cdef class ObjectCode: handle, call ``int(ObjectCode.handle)``. 
""" self._lazy_load_module() - return self._handle + return as_py(self._h_library) diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 7a634f3a82..10816481b0 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -21,6 +21,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": ctypedef shared_ptr[const cydriver.CUevent] EventHandle ctypedef shared_ptr[const cydriver.CUmemoryPool] MemoryPoolHandle ctypedef shared_ptr[const cydriver.CUdeviceptr] DevicePtrHandle + ctypedef shared_ptr[const cydriver.CUlibrary] LibraryHandle + ctypedef shared_ptr[const cydriver.CUkernel] KernelHandle # as_cu() - extract the raw CUDA handle (inline C++) cydriver.CUcontext as_cu(ContextHandle h) noexcept nogil @@ -28,6 +30,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": cydriver.CUevent as_cu(EventHandle h) noexcept nogil cydriver.CUmemoryPool as_cu(MemoryPoolHandle h) noexcept nogil cydriver.CUdeviceptr as_cu(DevicePtrHandle h) noexcept nogil + cydriver.CUlibrary as_cu(LibraryHandle h) noexcept nogil + cydriver.CUkernel as_cu(KernelHandle h) noexcept nogil # as_intptr() - extract handle as intptr_t for Python interop (inline C++) intptr_t as_intptr(ContextHandle h) noexcept nogil @@ -35,6 +39,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": intptr_t as_intptr(EventHandle h) noexcept nogil intptr_t as_intptr(MemoryPoolHandle h) noexcept nogil intptr_t as_intptr(DevicePtrHandle h) noexcept nogil + intptr_t as_intptr(LibraryHandle h) noexcept nogil + intptr_t as_intptr(KernelHandle h) noexcept nogil # as_py() - convert handle to Python driver wrapper object (inline C++; requires GIL) object as_py(ContextHandle h) @@ -42,6 +48,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": object as_py(EventHandle h) object as_py(MemoryPoolHandle h) object as_py(DevicePtrHandle h) + object as_py(LibraryHandle h) + 
object as_py(KernelHandle h) # ============================================================================= @@ -94,3 +102,13 @@ cdef DevicePtrHandle deviceptr_import_ipc( MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) nogil except+ cdef StreamHandle deallocation_stream(const DevicePtrHandle& h) noexcept nogil cdef void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) noexcept nogil + +# Library handles +cdef LibraryHandle create_library_handle_from_file(const char* path) nogil except+ +cdef LibraryHandle create_library_handle_from_data(const void* data) nogil except+ +cdef LibraryHandle create_library_handle_ref(cydriver.CUlibrary library) nogil except+ + +# Kernel handles +cdef KernelHandle get_kernel_from_library(LibraryHandle h_library, const char* name) nogil except+ +cdef KernelHandle create_kernel_handle_ref( + cydriver.CUkernel kernel, LibraryHandle h_library) nogil except+ diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 7989cd1bb0..a65836b864 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -21,6 +21,8 @@ from ._resource_handles cimport ( EventHandle, MemoryPoolHandle, DevicePtrHandle, + LibraryHandle, + KernelHandle, ) import cuda.bindings.cydriver as cydriver @@ -91,6 +93,20 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": void set_deallocation_stream "cuda_core::set_deallocation_stream" ( const DevicePtrHandle& h, StreamHandle h_stream) noexcept nogil + # Library handles + LibraryHandle create_library_handle_from_file "cuda_core::create_library_handle_from_file" ( + const char* path) nogil except+ + LibraryHandle create_library_handle_from_data "cuda_core::create_library_handle_from_data" ( + const void* data) nogil except+ + LibraryHandle create_library_handle_ref "cuda_core::create_library_handle_ref" ( + cydriver.CUlibrary library) nogil except+ + + # Kernel handles + 
KernelHandle get_kernel_from_library "cuda_core::get_kernel_from_library" ( + LibraryHandle h_library, const char* name) nogil except+ + KernelHandle create_kernel_handle_ref "cuda_core::create_kernel_handle_ref" ( + cydriver.CUkernel kernel, LibraryHandle h_library) nogil except+ + # ============================================================================= # CUDA Driver API capsule @@ -152,6 +168,12 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # IPC void* p_cuMemPoolImportPointer "reinterpret_cast(cuda_core::p_cuMemPoolImportPointer)" + # Library + void* p_cuLibraryLoadFromFile "reinterpret_cast(cuda_core::p_cuLibraryLoadFromFile)" + void* p_cuLibraryLoadData "reinterpret_cast(cuda_core::p_cuLibraryLoadData)" + void* p_cuLibraryUnload "reinterpret_cast(cuda_core::p_cuLibraryUnload)" + void* p_cuLibraryGetKernel "reinterpret_cast(cuda_core::p_cuLibraryGetKernel)" + # Initialize driver function pointers from cydriver.__pyx_capi__ at module load cdef void* _get_driver_fn(str name): @@ -195,3 +217,9 @@ p_cuMemFreeHost = _get_driver_fn("cuMemFreeHost") # IPC p_cuMemPoolImportPointer = _get_driver_fn("cuMemPoolImportPointer") + +# Library +p_cuLibraryLoadFromFile = _get_driver_fn("cuLibraryLoadFromFile") +p_cuLibraryLoadData = _get_driver_fn("cuLibraryLoadData") +p_cuLibraryUnload = _get_driver_fn("cuLibraryUnload") +p_cuLibraryGetKernel = _get_driver_fn("cuLibraryGetKernel") From 43623aab7e707aaffbb131bc1078c39d3e78aaf9 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 21 Jan 2026 17:36:04 -0800 Subject: [PATCH 06/11] Convert _module.pyx driver calls to cydriver with nogil Replaces Python-level driver API calls with low-level cydriver calls wrapped in nogil blocks for improved performance. This allows the GIL to be released during CUDA driver operations. 
Changes: - cuDriverGetVersion, cuKernelGetAttribute, cuKernelGetParamInfo - cuOccupancy* functions (with appropriate GIL handling for callbacks) - cuKernelGetLibrary - Update KernelAttributes._get_cached_attribute to use cydriver types --- cuda_core/cuda/core/_module.pxd | 3 +- cuda_core/cuda/core/_module.pyx | 168 +++++++++++++++++++------------- 2 files changed, 104 insertions(+), 67 deletions(-) diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd index c17c7163bd..b49f064d52 100644 --- a/cuda_core/cuda/core/_module.pxd +++ b/cuda_core/cuda/core/_module.pxd @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport LibraryHandle, KernelHandle cdef class ObjectCode @@ -51,5 +52,5 @@ cdef class KernelAttributes: object _kernel_weakref dict _cache - cdef int _get_cached_attribute(self, int device_id, object attribute) except? -1 + cdef int _get_cached_attribute(self, int device_id, cydriver.CUfunction_attribute attribute) except? -1 cdef int _resolve_device_id(self, device_id) except? 
-1 diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index 423290d477..d3631c5661 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -4,13 +4,17 @@ from __future__ import annotations +from libc.stddef cimport size_t + import functools import threading import weakref from collections import namedtuple from cuda.core._device import Device -from cuda.core._launch_config import LaunchConfig, _to_native_launch_config +from cuda.core._launch_config cimport LaunchConfig +from cuda.core._launch_config import LaunchConfig +from cuda.core._stream cimport Stream from cuda.core._resource_handles cimport ( LibraryHandle, KernelHandle, @@ -20,6 +24,7 @@ from cuda.core._resource_handles cimport ( get_kernel_from_library, create_kernel_handle_ref, get_last_error, + as_cu, as_py, ) from cuda.core._stream import Stream @@ -28,7 +33,7 @@ from cuda.core._utils.clear_error_support import ( raise_code_path_meant_to_be_unreachable, ) from cuda.core._utils.cuda_utils cimport HANDLE_RETURN -from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return +from cuda.core._utils.cuda_utils import driver, get_binding_version from cuda.bindings cimport cydriver __all__ = ["Kernel", "ObjectCode"] @@ -62,6 +67,7 @@ cdef int _lazy_init() except -1: if _inited: return 0 + cdef int drv_ver # Slow path: acquire lock and initialize with _init_lock: # Double-check: another thread might have initialized while we waited @@ -72,7 +78,9 @@ cdef int _lazy_init() except -1: # binding availability depends on cuda-python version _py_major_ver, _py_minor_ver = get_binding_version() _kernel_ctypes = (driver.CUkernel,) - _driver_ver = handle_return(driver.cuDriverGetVersion()) + with nogil: + HANDLE_RETURN(cydriver.cuDriverGetVersion(&drv_ver)) + _driver_ver = drv_ver _paraminfo_supported = _driver_ver >= 12040 # Mark as initialized (must be last to ensure all state is set) @@ -144,16 +152,18 @@ cdef class KernelAttributes: 
_lazy_init() return self - cdef int _get_cached_attribute(self, int device_id, object attribute) except? -1: + cdef int _get_cached_attribute(self, int device_id, cydriver.CUfunction_attribute attribute) except? -1: """Helper function to get a cached attribute or fetch and cache it if not present.""" - cdef tuple cache_key = (device_id, attribute) - result = self._cache.get(cache_key, cache_key) - if result is not cache_key: - return result + cdef tuple cache_key = (device_id, attribute) + cached = self._cache.get(cache_key, cache_key) + if cached is not cache_key: + return cached cdef Kernel kernel = (self._kernel_weakref()) if kernel is None: raise RuntimeError("Cannot access kernel attributes for expired Kernel object") - result = handle_return(driver.cuKernelGetAttribute(attribute, as_py(kernel._h_kernel), device_id)) + cdef int result + with nogil: + HANDLE_RETURN(cydriver.cuKernelGetAttribute(&result, attribute, as_cu(kernel._h_kernel), device_id)) self._cache[cache_key] = result return result @@ -165,49 +175,49 @@ cdef class KernelAttributes: """int : The maximum number of threads per block. This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK ) def shared_size_bytes(self, device_id: Device | int = None) -> int: """int : The size in bytes of statically-allocated shared memory required by this function. This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES ) def const_size_bytes(self, device_id: Device | int = None) -> int: """int : The size in bytes of user-allocated constant memory required by this function. 
This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES ) def local_size_bytes(self, device_id: Device | int = None) -> int: """int : The size in bytes of local memory used by each thread of this function. This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES ) def num_regs(self, device_id: Device | int = None) -> int: """int : The number of registers used by each thread of this function. This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_NUM_REGS ) def ptx_version(self, device_id: Device | int = None) -> int: """int : The PTX virtual architecture version for which the function was compiled. This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PTX_VERSION + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_PTX_VERSION ) def binary_version(self, device_id: Device | int = None) -> int: """int : The binary architecture version for which the function was compiled. 
This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_BINARY_VERSION + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_BINARY_VERSION ) def cache_mode_ca(self, device_id: Device | int = None) -> bool: @@ -215,7 +225,7 @@ cdef class KernelAttributes: This attribute is read-only.""" return bool( self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CACHE_MODE_CA + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_CACHE_MODE_CA ) ) @@ -223,13 +233,13 @@ cdef class KernelAttributes: """int : The maximum size in bytes of dynamically-allocated shared memory that can be used by this function.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES ) def preferred_shared_memory_carveout(self, device_id: Device | int = None) -> int: """int : The shared memory carveout preference, in percent of the total shared memory.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT ) def cluster_size_must_be_set(self, device_id: Device | int = None) -> bool: @@ -237,26 +247,26 @@ cdef class KernelAttributes: This attribute is read-only.""" return bool( self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET ) ) def required_cluster_width(self, device_id: Device | int = None) -> int: """int : The required cluster width in blocks.""" return 
self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH ) def required_cluster_height(self, device_id: Device | int = None) -> int: """int : The required cluster height in blocks.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT ) def required_cluster_depth(self, device_id: Device | int = None) -> int: """int : The required cluster depth in blocks.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH + self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH ) def non_portable_cluster_size_allowed(self, device_id: Device | int = None) -> bool: @@ -264,7 +274,7 @@ cdef class KernelAttributes: return bool( self._get_cached_attribute( self._resolve_device_id(device_id), - driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, + cydriver.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, ) ) @@ -272,7 +282,7 @@ cdef class KernelAttributes: """int : The block scheduling policy of a function.""" return self._get_cached_attribute( self._resolve_device_id(device_id), - driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE, + cydriver.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE, ) @@ -318,11 +328,15 @@ cdef class KernelOccupancy: theoretical multiprocessor utilization (occupancy). 
""" - return handle_return( - driver.cuOccupancyMaxActiveBlocksPerMultiprocessor( - as_py(self._h_kernel), block_size, dynamic_shared_memory_size - ) - ) + cdef int num_blocks + cdef int c_block_size = block_size + cdef size_t c_shmem_size = dynamic_shared_memory_size + cdef cydriver.CUfunction func = as_cu(self._h_kernel) + with nogil: + HANDLE_RETURN(cydriver.cuOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks, func, c_block_size, c_shmem_size + )) + return num_blocks def max_potential_block_size( self, dynamic_shared_memory_needed: int | driver.CUoccupancyB2DSize, block_size_limit: int @@ -356,19 +370,23 @@ cdef class KernelOccupancy: Interpreter Lock may lead to deadlocks. """ - cdef object kernel_py = as_py(self._h_kernel) + cdef int min_grid_size, max_block_size + cdef cydriver.CUfunction func = as_cu(self._h_kernel) + cdef cydriver.CUoccupancyB2DSize callback + cdef size_t c_shmem_size + cdef int c_block_size_limit = block_size_limit if isinstance(dynamic_shared_memory_needed, int): - min_grid_size, max_block_size = handle_return( - driver.cuOccupancyMaxPotentialBlockSize( - kernel_py, None, dynamic_shared_memory_needed, block_size_limit - ) - ) + c_shmem_size = dynamic_shared_memory_needed + with nogil: + HANDLE_RETURN(cydriver.cuOccupancyMaxPotentialBlockSize( + &min_grid_size, &max_block_size, func, NULL, c_shmem_size, c_block_size_limit + )) elif isinstance(dynamic_shared_memory_needed, driver.CUoccupancyB2DSize): - min_grid_size, max_block_size = handle_return( - driver.cuOccupancyMaxPotentialBlockSize( - kernel_py, dynamic_shared_memory_needed.getPtr(), 0, block_size_limit - ) - ) + # Callback may require GIL, so don't use nogil here + callback = dynamic_shared_memory_needed.getPtr() + HANDLE_RETURN(cydriver.cuOccupancyMaxPotentialBlockSize( + &min_grid_size, &max_block_size, func, callback, 0, c_block_size_limit + )) else: raise TypeError( "dynamic_shared_memory_needed expected to have type int, or CUoccupancyB2DSize, " @@ -393,11 +411,15 @@ 
cdef class KernelOccupancy: int Dynamic shared memory available per block for given launch configuration. """ - return handle_return( - driver.cuOccupancyAvailableDynamicSMemPerBlock( - as_py(self._h_kernel), num_blocks_per_multiprocessor, block_size - ) - ) + cdef size_t dynamic_smem_size + cdef int c_num_blocks = num_blocks_per_multiprocessor + cdef int c_block_size = block_size + cdef cydriver.CUfunction func = as_cu(self._h_kernel) + with nogil: + HANDLE_RETURN(cydriver.cuOccupancyAvailableDynamicSMemPerBlock( + &dynamic_smem_size, func, c_num_blocks, c_block_size + )) + return dynamic_smem_size def max_potential_cluster_size(self, config: LaunchConfig, stream: Stream | None = None) -> int: """Maximum potential cluster size. @@ -416,10 +438,16 @@ cdef class KernelOccupancy: int The maximum cluster size that can be launched for this kernel and launch configuration. """ - drv_cfg = _to_native_launch_config(config) + cdef cydriver.CUlaunchConfig drv_cfg = (config)._to_native_launch_config() + cdef Stream s if stream is not None: - drv_cfg.hStream = stream.handle - return handle_return(driver.cuOccupancyMaxPotentialClusterSize(as_py(self._h_kernel), drv_cfg)) + s = stream + drv_cfg.hStream = as_cu(s._h_stream) + cdef int cluster_size + cdef cydriver.CUfunction func = as_cu(self._h_kernel) + with nogil: + HANDLE_RETURN(cydriver.cuOccupancyMaxPotentialClusterSize(&cluster_size, func, &drv_cfg)) + return cluster_size def max_active_clusters(self, config: LaunchConfig, stream: Stream | None = None) -> int: """Maximum number of active clusters on the target device. @@ -438,10 +466,16 @@ cdef class KernelOccupancy: int The maximum number of clusters that could co-exist on the target device. 
""" - drv_cfg = _to_native_launch_config(config) + cdef cydriver.CUlaunchConfig drv_cfg = (config)._to_native_launch_config() + cdef Stream s if stream is not None: - drv_cfg.hStream = stream.handle - return handle_return(driver.cuOccupancyMaxActiveClusters(as_py(self._h_kernel), drv_cfg)) + s = stream + drv_cfg.hStream = as_cu(s._h_stream) + cdef int num_clusters + cdef cydriver.CUfunction func = as_cu(self._h_kernel) + with nogil: + HANDLE_RETURN(cydriver.cuOccupancyMaxActiveClusters(&num_clusters, func, &drv_cfg)) + return num_clusters ParamInfo = namedtuple("ParamInfo", ["offset", "size"]) @@ -484,19 +518,21 @@ cdef class Kernel: "Driver version 12.4 or newer is required for this function. " f"Using driver version {driver_ver // 1000}.{(driver_ver % 1000) // 10}" ) - cdef int arg_pos = 0 + cdef size_t arg_pos = 0 cdef list param_info_data = [] - cdef object kernel_py = as_py(self._h_kernel) + cdef cydriver.CUkernel cu_kernel = as_cu(self._h_kernel) + cdef size_t param_offset, param_size + cdef cydriver.CUresult err while True: - result = driver.cuKernelGetParamInfo(kernel_py, arg_pos) - if result[0] != driver.CUresult.CUDA_SUCCESS: + with nogil: + err = cydriver.cuKernelGetParamInfo(cu_kernel, arg_pos, ¶m_offset, ¶m_size) + if err != cydriver.CUDA_SUCCESS: break if param_info: - p_info = ParamInfo(offset=result[1], size=result[2]) - param_info_data.append(p_info) + param_info_data.append(ParamInfo(offset=param_offset, size=param_size)) arg_pos = arg_pos + 1 - if result[0] != driver.CUresult.CUDA_ERROR_INVALID_VALUE: - handle_return(result) + if err != cydriver.CUDA_ERROR_INVALID_VALUE: + HANDLE_RETURN(err) return arg_pos, param_info_data @property @@ -553,18 +589,18 @@ cdef class Kernel: # Convert the integer handle to CUkernel cdef cydriver.CUkernel cu_kernel = handle cdef KernelHandle h_kernel + cdef cydriver.CUlibrary cu_library + cdef cydriver.CUresult err # If no module provided, create a placeholder and try to get the library if mod is None: mod = 
ObjectCode._init(b"", "cubin", "", None) if _is_cukernel_get_library_supported(): # Try to get the owning library via cuKernelGetLibrary - try: - cu_library = handle_return(driver.cuKernelGetLibrary(driver.CUkernel(handle))) - mod._h_library = create_library_handle_ref(int(cu_library)) - except Exception: - # Best-effort: don't fail construction if inverse lookup fails - pass + with nogil: + err = cydriver.cuKernelGetLibrary(&cu_library, cu_kernel) + if err == cydriver.CUDA_SUCCESS: + mod._h_library = create_library_handle_ref(cu_library) # Create kernel handle with library dependency h_kernel = create_kernel_handle_ref(cu_kernel, mod._h_library) From f5ad78791abbec65da2fa0aefc3655ed222aefc4 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 22 Jan 2026 07:57:46 -0800 Subject: [PATCH 07/11] Fix SEGV in Kernel.from_handle with non-int types Remove type annotation from handle parameter to prevent Cython's automatic float-to-int coercion, which caused a segmentation fault. The manual isinstance check properly validates all non-int types. --- cuda_core/cuda/core/_module.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index d3631c5661..d5a50589a6 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -566,7 +566,7 @@ cdef class Kernel: return as_py(self._h_kernel) @staticmethod - def from_handle(handle: int, mod: ObjectCode = None) -> Kernel: + def from_handle(handle, mod: ObjectCode = None) -> Kernel: """Creates a new :obj:`Kernel` object from a foreign kernel handle. Uses a CUkernel pointer address to create a new :obj:`Kernel` object. 
From a3b32a4b92bca9bb052d033d6fbe52a645606013 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 22 Jan 2026 10:21:02 -0800 Subject: [PATCH 08/11] Refactor ObjectCode._init and add kernel lifetime test - Change ObjectCode._init from cdef to @classmethod def, matching the pattern used by Buffer, Stream, Graph, and other classes - Remove _init_py wrapper (no longer needed) - Update callers in _program.py and _linker.py - Add test_kernel_keeps_library_alive to verify that a Kernel keeps its underlying library alive after ObjectCode goes out of scope --- cuda_core/cuda/core/_linker.py | 2 +- cuda_core/cuda/core/_module.pxd | 3 -- cuda_core/cuda/core/_module.pyx | 27 +++++++---------- cuda_core/cuda/core/_program.py | 6 ++-- cuda_core/tests/test_module.py | 53 +++++++++++++++++++++++++++++++++ 5 files changed, 68 insertions(+), 23 deletions(-) diff --git a/cuda_core/cuda/core/_linker.py b/cuda_core/cuda/core/_linker.py index 5ce6fb73ee..ab08709690 100644 --- a/cuda_core/cuda/core/_linker.py +++ b/cuda_core/cuda/core/_linker.py @@ -529,7 +529,7 @@ def link(self, target_type) -> ObjectCode: addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle)) code = (ctypes.c_char * size).from_address(addr) - return ObjectCode._init_py(bytes(code), target_type, name=self._options.name) + return ObjectCode._init(bytes(code), target_type, name=self._options.name) def get_error_log(self) -> str: """Get the error log generated by the linker. 
diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd index b49f064d52..ac512b39c8 100644 --- a/cuda_core/cuda/core/_module.pxd +++ b/cuda_core/cuda/core/_module.pxd @@ -33,9 +33,6 @@ cdef class ObjectCode: dict _sym_map str _name - @staticmethod - cdef ObjectCode _init(object module, str code_type, str name=*, dict symbol_mapping=*) - cdef int _lazy_load_module(self) except -1 diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index d5a50589a6..a14d7888b0 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -594,7 +594,7 @@ cdef class Kernel: # If no module provided, create a placeholder and try to get the library if mod is None: - mod = ObjectCode._init(b"", "cubin", "", None) + mod = ObjectCode._init(b"", "cubin") if _is_cukernel_get_library_supported(): # Try to get the owning library via cuKernelGetLibrary with nogil: @@ -634,8 +634,8 @@ cdef class ObjectCode: "Please use ObjectCode APIs (from_cubin, from_ptx) or Program APIs (compile)." 
) - @staticmethod - cdef ObjectCode _init(object module, str code_type, str name = "", dict symbol_mapping = None): + @classmethod + def _init(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None = None): assert code_type in _supported_code_type, f"{code_type=} is not supported" cdef ObjectCode self = ObjectCode.__new__(ObjectCode) @@ -646,19 +646,14 @@ cdef class ObjectCode: self._code_type = code_type self._module = module self._sym_map = {} if symbol_mapping is None else symbol_mapping - self._name = name + self._name = name if name else "" return self - @classmethod - def _init_py(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None = None): - """Python-accessible factory method for use by _program.py and _linker.py.""" - return ObjectCode._init(module, code_type, name if name else "", symbol_mapping) - @classmethod def _reduce_helper(cls, module, code_type, name, symbol_mapping): # just for forwarding kwargs - return ObjectCode._init(module, code_type, name if name else "", symbol_mapping) + return cls._init(module, code_type, name=name if name else "", symbol_mapping=symbol_mapping) def __reduce__(self): return ObjectCode._reduce_helper, (self._module, self._code_type, self._name, self._sym_map) @@ -679,7 +674,7 @@ cdef class ObjectCode: should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "cubin", name, symbol_mapping) + return ObjectCode._init(module, "cubin", name=name, symbol_mapping=symbol_mapping) @staticmethod def from_ptx(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: @@ -697,7 +692,7 @@ cdef class ObjectCode: should be mapped to the mangled names before trying to retrieve them (default to no mappings). 
""" - return ObjectCode._init(module, "ptx", name, symbol_mapping) + return ObjectCode._init(module, "ptx", name=name, symbol_mapping=symbol_mapping) @staticmethod def from_ltoir(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: @@ -715,7 +710,7 @@ cdef class ObjectCode: should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "ltoir", name, symbol_mapping) + return ObjectCode._init(module, "ltoir", name=name, symbol_mapping=symbol_mapping) @staticmethod def from_fatbin(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: @@ -733,7 +728,7 @@ cdef class ObjectCode: should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "fatbin", name, symbol_mapping) + return ObjectCode._init(module, "fatbin", name=name, symbol_mapping=symbol_mapping) @staticmethod def from_object(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: @@ -751,7 +746,7 @@ cdef class ObjectCode: should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "object", name, symbol_mapping) + return ObjectCode._init(module, "object", name=name, symbol_mapping=symbol_mapping) @staticmethod def from_library(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode: @@ -769,7 +764,7 @@ cdef class ObjectCode: should be mapped to the mangled names before trying to retrieve them (default to no mappings). """ - return ObjectCode._init(module, "library", name, symbol_mapping) + return ObjectCode._init(module, "library", name=name, symbol_mapping=symbol_mapping) # TODO: do we want to unload in a finalizer? Probably not.. 
diff --git a/cuda_core/cuda/core/_program.py b/cuda_core/cuda/core/_program.py index b5f8fbfa0b..1ef1aa51f5 100644 --- a/cuda_core/cuda/core/_program.py +++ b/cuda_core/cuda/core/_program.py @@ -688,7 +688,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None): elif code_type == "ptx": assert_type(code, str) self._linker = Linker( - ObjectCode._init_py(code.encode(), code_type), options=self._translate_program_options(options) + ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options) ) self._backend = self._linker.backend @@ -806,7 +806,7 @@ def compile(self, target_type, name_expressions=(), logs=None): handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle) logs.write(log.decode("utf-8", errors="backslashreplace")) - return ObjectCode._init_py(data, target_type, symbol_mapping=symbol_mapping, name=self._options.name) + return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping, name=self._options.name) elif self._backend == "NVVM": if target_type not in ("ptx", "ltoir"): @@ -832,7 +832,7 @@ def compile(self, target_type, name_expressions=(), logs=None): nvvm.get_program_log(self._mnff.handle, log) logs.write(log.decode("utf-8", errors="backslashreplace")) - return ObjectCode._init_py(data, target_type, name=self._options.name) + return ObjectCode._init(data, target_type, name=self._options.name) supported_backends = ("nvJitLink", "driver") if self._backend not in supported_backends: diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index d5a35a1ea5..8b1667d2f7 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -511,3 +511,56 @@ def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin): # All should reference the same underlying CUDA kernel handle assert int(kernel1.handle) == int(kernel2.handle) == int(kernel3.handle) == handle + + +def test_kernel_keeps_library_alive(init_cuda): + """Test 
that a Kernel keeps its underlying library alive after ObjectCode goes out of scope.""" + import gc + + import numpy as np + + def get_kernel_only(): + """Return a kernel, letting ObjectCode go out of scope.""" + code = """ + extern "C" __global__ void write_value(int* out) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + *out = 42; + } + } + """ + program = Program(code, "c++") + object_code = program.compile("cubin") + kernel = object_code.get_kernel("write_value") + # ObjectCode goes out of scope here + return kernel + + kernel = get_kernel_only() + + # Force GC to ensure ObjectCode destructor runs + gc.collect() + + # Kernel should still be valid + assert kernel.handle is not None + assert kernel.num_arguments == 1 + + # Actually launch the kernel to prove library is still loaded + device = Device() + stream = device.create_stream() + + # Allocate pinned host buffer and device buffer + pinned_mr = cuda.core.LegacyPinnedMemoryResource() + host_buf = pinned_mr.allocate(4) # sizeof(int) + result = np.from_dlpack(host_buf).view(np.int32) + result[:] = 0 + + dev_buf = device.memory_resource.allocate(4) + + # Launch kernel + config = cuda.core.LaunchConfig(grid=1, block=1) + cuda.core.launch(stream, config, kernel, dev_buf) + + # Copy result back to host + dev_buf.copy_to(host_buf, stream=stream) + stream.sync() + + assert result[0] == 42, f"Expected 42, got {result[0]}" From 915f061ce706a5ebd4df520a342fb6cb25889232 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 22 Jan 2026 11:20:37 -0800 Subject: [PATCH 09/11] Simplify resource handle patterns and clean up tests - Remove Kernel._module (ObjectCode reference no longer needed since KernelHandle keeps library alive via LibraryHandle dependency) - Simplify Kernel._from_obj signature (remove unused ObjectCode param) - KernelAttributes: store KernelHandle instead of weakref to Kernel - Rename get_kernel_from_library to create_kernel_handle for consistency - Remove fragile annotation introspection from 
test_saxpy_arguments --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 2 +- cuda_core/cuda/core/_cpp/resource_handles.hpp | 2 +- cuda_core/cuda/core/_module.pxd | 9 +++--- cuda_core/cuda/core/_module.pyx | 29 ++++++++----------- cuda_core/cuda/core/_resource_handles.pxd | 2 +- cuda_core/cuda/core/_resource_handles.pyx | 2 +- cuda_core/tests/test_module.py | 6 ---- 7 files changed, 21 insertions(+), 31 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 7e6e388579..c4bb47261a 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -749,7 +749,7 @@ struct KernelBox { }; } // namespace -KernelHandle get_kernel_from_library(LibraryHandle h_library, const char* name) { +KernelHandle create_kernel_handle(LibraryHandle h_library, const char* name) { GILReleaseGuard gil; CUkernel kernel; if (CUDA_SUCCESS != (err = p_cuLibraryGetKernel(&kernel, *h_library, name))) { diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index ba76a6c054..1df181ee56 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -254,7 +254,7 @@ LibraryHandle create_library_handle_ref(CUlibrary library); // The kernel structurally depends on the provided library handle. // Kernels have no explicit destroy - their lifetime is tied to the library. // Returns empty handle on error (caller must check). -KernelHandle get_kernel_from_library(LibraryHandle h_library, const char* name); +KernelHandle create_kernel_handle(LibraryHandle h_library, const char* name); // Create a non-owning kernel handle with library dependency. // Use for borrowed kernels. The library handle keeps the library alive. 
diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd index ac512b39c8..9333703175 100644 --- a/cuda_core/cuda/core/_module.pxd +++ b/cuda_core/cuda/core/_module.pxd @@ -14,13 +14,11 @@ cdef class KernelAttributes cdef class Kernel: cdef: KernelHandle _h_kernel - ObjectCode _module # ObjectCode reference KernelAttributes _attributes # lazy KernelOccupancy _occupancy # lazy - object __weakref__ # Enable weak references @staticmethod - cdef Kernel _from_obj(KernelHandle h_kernel, ObjectCode mod) + cdef Kernel _from_obj(KernelHandle h_kernel) cdef tuple _get_arguments_info(self, bint param_info=*) @@ -46,8 +44,11 @@ cdef class KernelOccupancy: cdef class KernelAttributes: cdef: - object _kernel_weakref + KernelHandle _h_kernel dict _cache + @staticmethod + cdef KernelAttributes _init(KernelHandle h_kernel) + cdef int _get_cached_attribute(self, int device_id, cydriver.CUfunction_attribute attribute) except? -1 cdef int _resolve_device_id(self, device_id) except? -1 diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index a14d7888b0..a1a7dc46e2 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -8,7 +8,6 @@ from libc.stddef cimport size_t import functools import threading -import weakref from collections import namedtuple from cuda.core._device import Device @@ -21,7 +20,7 @@ from cuda.core._resource_handles cimport ( create_library_handle_from_file, create_library_handle_from_data, create_library_handle_ref, - get_kernel_from_library, + create_kernel_handle, create_kernel_handle_ref, get_last_error, as_cu, @@ -139,15 +138,15 @@ cdef inline LibraryHandle _make_empty_library_handle(): cdef class KernelAttributes: - """Provides access to kernel attributes. Uses weakref to avoid preventing Kernel GC.""" + """Provides access to kernel attributes.""" def __init__(self, *args, **kwargs): raise RuntimeError("KernelAttributes cannot be instantiated directly. 
Please use Kernel APIs.") - @classmethod - def _init(cls, kernel): - cdef KernelAttributes self = KernelAttributes.__new__(cls) - self._kernel_weakref = weakref.ref(kernel) + @staticmethod + cdef KernelAttributes _init(KernelHandle h_kernel): + cdef KernelAttributes self = KernelAttributes.__new__(KernelAttributes) + self._h_kernel = h_kernel self._cache = {} _lazy_init() return self @@ -158,12 +157,9 @@ cdef class KernelAttributes: cached = self._cache.get(cache_key, cache_key) if cached is not cache_key: return cached - cdef Kernel kernel = (self._kernel_weakref()) - if kernel is None: - raise RuntimeError("Cannot access kernel attributes for expired Kernel object") cdef int result with nogil: - HANDLE_RETURN(cydriver.cuKernelGetAttribute(&result, attribute, as_cu(kernel._h_kernel), device_id)) + HANDLE_RETURN(cydriver.cuKernelGetAttribute(&result, attribute, as_cu(self._h_kernel), device_id)) self._cache[cache_key] = result return result @@ -496,10 +492,9 @@ cdef class Kernel: raise RuntimeError("Kernel objects cannot be instantiated directly. 
Please use ObjectCode APIs.") @staticmethod - cdef Kernel _from_obj(KernelHandle h_kernel, ObjectCode mod): + cdef Kernel _from_obj(KernelHandle h_kernel): cdef Kernel ker = Kernel.__new__(Kernel) ker._h_kernel = h_kernel - ker._module = mod ker._attributes = None ker._occupancy = None return ker @@ -508,7 +503,7 @@ cdef class Kernel: def attributes(self) -> KernelAttributes: """Get the read-only attributes of this kernel.""" if self._attributes is None: - self._attributes = KernelAttributes._init(self) + self._attributes = KernelAttributes._init(self._h_kernel) return self._attributes cdef tuple _get_arguments_info(self, bint param_info=False): @@ -607,7 +602,7 @@ cdef class Kernel: if not h_kernel: HANDLE_RETURN(get_last_error()) - return Kernel._from_obj(h_kernel, mod) + return Kernel._from_obj(h_kernel) CodeTypeT = bytes | bytearray | str @@ -811,10 +806,10 @@ cdef class ObjectCode: except KeyError: name = name.encode() - cdef KernelHandle h_kernel = get_kernel_from_library(self._h_library, name) + cdef KernelHandle h_kernel = create_kernel_handle(self._h_library, name) if not h_kernel: HANDLE_RETURN(get_last_error()) - return Kernel._from_obj(h_kernel, self) + return Kernel._from_obj(h_kernel) @property def code(self) -> CodeTypeT: diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 10816481b0..b146d93aa7 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -109,6 +109,6 @@ cdef LibraryHandle create_library_handle_from_data(const void* data) nogil excep cdef LibraryHandle create_library_handle_ref(cydriver.CUlibrary library) nogil except+ # Kernel handles -cdef KernelHandle get_kernel_from_library(LibraryHandle h_library, const char* name) nogil except+ +cdef KernelHandle create_kernel_handle(LibraryHandle h_library, const char* name) nogil except+ cdef KernelHandle create_kernel_handle_ref( cydriver.CUkernel kernel, LibraryHandle h_library) nogil except+ 
diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index a65836b864..022929f7e3 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -102,7 +102,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": cydriver.CUlibrary library) nogil except+ # Kernel handles - KernelHandle get_kernel_from_library "cuda_core::get_kernel_from_library" ( + KernelHandle create_kernel_handle "cuda_core::create_kernel_handle" ( LibraryHandle h_library, const char* name) nogil except+ KernelHandle create_kernel_handle_ref "cuda_core::create_kernel_handle_ref" ( cydriver.CUkernel kernel, LibraryHandle h_library) nogil except+ diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 8b1667d2f7..72591b54d5 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -229,12 +229,6 @@ def test_saxpy_arguments(get_saxpy_kernel_cubin, cuda12_4_prerequisite_check): _ = krn.num_arguments return - # Check that arguments_info returns ParamInfo objects (works for both Python and Cython classes) - # For Python classes: type(krn).arguments_info.fget.__annotations__ contains ParamInfo - # For Cython cdef classes: property descriptors don't have .fget, so we check the actual values - prop = type(krn).arguments_info - if hasattr(prop, "fget") and hasattr(prop.fget, "__annotations__"): - assert "ParamInfo" in str(prop.fget.__annotations__) arg_info = krn.arguments_info n_args = len(arg_info) assert n_args == krn.num_arguments From f569e6ce47e813f0beea75d0e13dfeef45266a4a Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 23 Jan 2026 15:07:16 -0800 Subject: [PATCH 10/11] Simplify _MemPoolAttributes to use direct MemoryPoolHandle Replace weakref pattern with direct MemoryPoolHandle storage in _MemPoolAttributes. The handle's shared_ptr keeps the underlying pool alive, so attributes remain accessible after the MR is deleted. 
Note: _MemPool retains __weakref__ because the IPC subsystem uses WeakValueDictionary to track memory resources across processes. --- cuda_core/cuda/core/_memory/_memory_pool.pxd | 10 ++++++++++ cuda_core/cuda/core/_memory/_memory_pool.pyx | 21 +++++++------------- cuda_core/tests/test_memory.py | 20 ++++--------------- 3 files changed, 21 insertions(+), 30 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pxd b/cuda_core/cuda/core/_memory/_memory_pool.pxd index eaff8e4bab..bb192110c6 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pxd +++ b/cuda_core/cuda/core/_memory/_memory_pool.pxd @@ -19,6 +19,16 @@ cdef class _MemPool(MemoryResource): object __weakref__ +cdef class _MemPoolAttributes: + cdef: + MemoryPoolHandle _h_pool + + @staticmethod + cdef _MemPoolAttributes _init(MemoryPoolHandle h_pool) + + cdef int _getattribute(self, cydriver.CUmemPool_attribute attr_enum, void* value) except? -1 + + cdef class _MemPoolOptions: cdef: diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx index 4fc5bdc04b..f33a5dc077 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pyx +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx @@ -29,7 +29,6 @@ from cuda.core._utils.cuda_utils cimport ( ) import platform # no-cython-lint -import weakref from cuda.core._utils.cuda_utils import driver @@ -45,16 +44,15 @@ cdef class _MemPoolOptions: cdef class _MemPoolAttributes: - cdef: - object _mr_weakref + """Provides access to memory pool attributes.""" def __init__(self, *args, **kwargs): raise RuntimeError("_MemPoolAttributes cannot be instantiated directly. 
Please use MemoryResource APIs.") - @classmethod - def _init(cls, mr): - cdef _MemPoolAttributes self = _MemPoolAttributes.__new__(cls) - self._mr_weakref = mr + @staticmethod + cdef _MemPoolAttributes _init(MemoryPoolHandle h_pool): + cdef _MemPoolAttributes self = _MemPoolAttributes.__new__(_MemPoolAttributes) + self._h_pool = h_pool return self def __repr__(self): @@ -64,12 +62,8 @@ cdef class _MemPoolAttributes: ) cdef int _getattribute(self, cydriver.CUmemPool_attribute attr_enum, void* value) except?-1: - cdef _MemPool mr = <_MemPool>(self._mr_weakref()) - if mr is None: - raise RuntimeError("_MemPool is expired") - cdef cydriver.CUmemoryPool pool_handle = as_cu(mr._h_pool) with nogil: - HANDLE_RETURN(cydriver.cuMemPoolGetAttribute(pool_handle, attr_enum, value)) + HANDLE_RETURN(cydriver.cuMemPoolGetAttribute(as_cu(self._h_pool), attr_enum, value)) return 0 @property @@ -197,8 +191,7 @@ cdef class _MemPool(MemoryResource): def attributes(self) -> _MemPoolAttributes: """Memory pool attributes.""" if self._attributes is None: - ref = weakref.ref(self) - self._attributes = _MemPoolAttributes._init(ref) + self._attributes = _MemPoolAttributes._init(self._h_pool) return self._attributes @property diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 8851a4600a..47091995e7 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1162,7 +1162,7 @@ def test_mempool_attributes_repr(memory_resource_factory): def test_mempool_attributes_ownership(memory_resource_factory): - """Ensure the attributes bundle handles references correctly for all memory resource types.""" + """Ensure the attributes bundle keeps the pool alive via the handle.""" MR, MRops = memory_resource_factory device = Device() @@ -1190,21 +1190,9 @@ def test_mempool_attributes_ownership(memory_resource_factory): mr.close() del mr - # After deleting the memory resource, the attributes suite is disconnected. 
- with pytest.raises(RuntimeError, match="is expired"): - _ = attributes.used_mem_high - - # Even when a new object is created (we found a case where the same - # mempool handle was really reused). - if MR is DeviceMemoryResource: - mr = MR(device, dict(max_size=POOL_SIZE)) # noqa: F841 - elif MR is PinnedMemoryResource: - mr = MR(dict(max_size=POOL_SIZE)) # noqa: F841 - elif MR is ManagedMemoryResource: - mr = create_managed_memory_resource_or_skip(dict()) # noqa: F841 - - with pytest.raises(RuntimeError, match="is expired"): - _ = attributes.used_mem_high + # The attributes bundle keeps the pool alive via MemoryPoolHandle, + # so accessing attributes still works even after the MR is deleted. + _ = attributes.used_mem_high # Should not raise # Ensure that memory views dellocate their reference to dlpack tensors From 3e6c0d27ee944245c7a1ad4c685ca1ecc950bfca Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 23 Jan 2026 15:18:09 -0800 Subject: [PATCH 11/11] Fix access violation in occupancy queries with uninitialized hStream Zero-initialize CUlaunchConfig struct to prevent garbage values in hStream field when no stream is provided. The driver dereferences hStream even when querying occupancy, causing access violations on some platforms (observed on Windows with RTX Pro 6000). --- cuda_core/cuda/core/_launch_config.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/_launch_config.pyx b/cuda_core/cuda/core/_launch_config.pyx index 032c40bd78..9093a01425 100644 --- a/cuda_core/cuda/core/_launch_config.pyx +++ b/cuda_core/cuda/core/_launch_config.pyx @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 +from libc.string cimport memset + from cuda.core._utils.cuda_utils cimport ( HANDLE_RETURN, ) @@ -139,9 +141,9 @@ cdef class LaunchConfig: cdef cydriver.CUlaunchConfig _to_native_launch_config(self): _lazy_init() - # TODO: memset to zero? 
cdef cydriver.CUlaunchConfig drv_cfg cdef cydriver.CUlaunchAttribute attr + memset(&drv_cfg, 0, sizeof(drv_cfg)) self._attrs.resize(0) # Handle grid dimensions and cluster configuration