Implement Buffer.fill() redesign

Andy-Jost · Andy-Jost · commit 030e42392aae · 2025-12-11T15:07:48.000-08:00
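Simplify the API by removing the explicit width parameter and inferring
the fill width from the value. Accepts an int in [0, 256) for 1-byte fills,
or any object supporting the buffer protocol (collections.abc.Buffer) whose
length is 1, 2, or 4 bytes for fills of that width. Out-of-range ints now
raise OverflowError instead of ValueError.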
diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
@@ -15,8 +15,14 @@ from cuda.core.experimental._stream cimport Stream_accept, Stream
 from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
 
 import abc
+import sys
 from typing import TypeVar, Union
 
+if sys.version_info >= (3, 12):
+    from collections.abc import Buffer as BufferProtocol
+else:
+    BufferProtocol = object
+
 from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
 from cuda.core.experimental._utils.cuda_utils import driver
 from cuda.core.experimental._device import Device
@@ -203,58 +209,75 @@ cdef class Buffer:
                 s
             ))
 
-    def fill(self, value: int, width: int, *, stream: Stream | GraphBuilder):
-        """Fill this buffer with a value pattern asynchronously on the given stream.
+    def fill(self, value: int | BufferProtocol, *, stream: Stream | GraphBuilder):
+        """Fill this buffer with a repeating byte pattern.
 
         Parameters
         ----------
-        value : int
-            Integer value to fill the buffer with
-        width : int
-            Width in bytes for each element (must be 1, 2, or 4)
+        value : int | :obj:`collections.abc.Buffer`
+            - int: Must be in range [0, 256). Converted to 1 byte.
+            - :obj:`collections.abc.Buffer`: Must be 1, 2, or 4 bytes.
         stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
-            Keyword argument specifying the stream for the asynchronous fill
+            Stream for the asynchronous fill operation.
 
         Raises
         ------
+        TypeError
+            If value is not an int and does not support the buffer protocol.
         ValueError
-            If width is not 1, 2, or 4, if value is out of range for the width,
-            or if buffer size is not divisible by width
+            If value byte length is not 1, 2, or 4.
+            If buffer size is not divisible by value byte length.
+        OverflowError
+            If int value is outside [0, 256).
 
         """
         cdef Stream s_stream = Stream_accept(stream)
         cdef unsigned char c_value8
         cdef unsigned short c_value16
         cdef unsigned int c_value32
         cdef size_t N
+        cdef size_t width
+        cdef bytes pattern
+
+        # Get fill pattern from value
+        if isinstance(value, int):
+            # int.to_bytes raises OverflowError if not in [0, 256)
+            pattern = value.to_bytes(1, "little")
+        else:
+            try:
+                mv = memoryview(value)
+            except TypeError:
+                raise TypeError(
+                    f"value must be an int or support the buffer protocol, got {type(value).__name__}"
+                ) from None
+            pattern = mv.tobytes()
+
+        width = len(pattern)
 
         # Validate width
         if width not in (1, 2, 4):
-            raise ValueError(f"width must be 1, 2, or 4, got {width}")
+            raise ValueError(f"value must be 1, 2, or 4 bytes, got {width}")
 
         # Validate buffer size modulus.
         cdef size_t buffer_size = self._size
         if buffer_size % width != 0:
-            raise ValueError(f"buffer size ({buffer_size}) must be divisible by width ({width})")
-
-        # Map width (bytes) to bitwidth and validate value
-        cdef int bitwidth = width * 8
-        _validate_value_against_bitwidth(bitwidth, value, is_signed=False)
+            raise ValueError(f"buffer size ({buffer_size}) must be divisible by {width}")
 
-        # Validate value fits in width and perform fill
+        # Perform fill based on width
         cdef cydriver.CUstream s = s_stream._handle
+        int_value = int.from_bytes(pattern, "little")
         if width == 1:
-            c_value8 = <unsigned char>value
+            c_value8 = int_value
             N = buffer_size
             with nogil:
                 HANDLE_RETURN(cydriver.cuMemsetD8Async(<cydriver.CUdeviceptr>self._ptr, c_value8, N, s))
         elif width == 2:
-            c_value16 = <unsigned short>value
+            c_value16 = int_value
             N = buffer_size // 2
             with nogil:
                 HANDLE_RETURN(cydriver.cuMemsetD16Async(<cydriver.CUdeviceptr>self._ptr, c_value16, N, s))
         else:  # width == 4
-            c_value32 = <unsigned int>value
+            c_value32 = int_value
             N = buffer_size // 4
             with nogil:
                 HANDLE_RETURN(cydriver.cuMemsetD32Async(<cydriver.CUdeviceptr>self._ptr, c_value32, N, s))
diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py
@@ -112,7 +112,7 @@ def apply_kernels(mr, stream, out):
         # Fills out with 3
         def apply_kernels(mr, stream, out):
             buffer = mr.allocate(NBYTES, stream=stream)
-            buffer.fill(3, width=1, stream=stream)
+            buffer.fill(3, stream=stream)
             out.copy_from(buffer, stream=stream)
             buffer.close()
 
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
@@ -226,75 +226,108 @@ def test_buffer_copy_from():
 def buffer_fill(dummy_mr: MemoryResource, device: Device, check=False):
     stream = device.create_stream()
 
-    # Test width=1 (byte fill)
+    # Test 1-byte fill (int in [0, 256))
     buffer1 = dummy_mr.allocate(size=1024)
-    buffer1.fill(0x42, width=1, stream=stream)
+    buffer1.fill(0x42, stream=stream)
     device.sync()
 
     if check:
         ptr = ctypes.cast(buffer1.handle, ctypes.POINTER(ctypes.c_byte))
         for i in range(10):
             assert ptr[i] == 0x42
 
-    # Test error: invalid width
-    for bad_width in [w for w in range(-10, 10) if w not in (1, 2, 4)]:
-        with pytest.raises(ValueError, match="width must be 1, 2, or 4"):
-            buffer1.fill(0x42, width=bad_width, stream=stream)
+    # Test error: int value out of range (OverflowError)
+    for bad_value in [-42, -1, 256, 1000]:
+        with pytest.raises(OverflowError):
+            buffer1.fill(bad_value, stream=stream)
 
-    # Test error: value out of range for width=1
-    for bad_value in [-42, -1, 256]:
-        with pytest.raises(ValueError, match="value must be in range \\[0, 255\\]"):
-            buffer1.fill(bad_value, width=1, stream=stream)
-
-    # Test error: buffer size not divisible by width
-    for bad_size in [1025, 1027, 1029, 1031]:  # Not divisible by 2
-        buffer_err = dummy_mr.allocate(size=1025)
-        with pytest.raises(ValueError, match="must be divisible"):
-            buffer_err.fill(0x1234, width=2, stream=stream)
-        buffer_err.close()
+    # Test error: invalid type (not int and not buffer-protocol)
+    with pytest.raises(TypeError, match="must be an int or support the buffer protocol"):
+        buffer1.fill("invalid", stream=stream)
 
     buffer1.close()
 
-    # Test width=2 (16-bit fill)
-    buffer2 = dummy_mr.allocate(size=1024)  # Divisible by 2
-    buffer2.fill(0x1234, width=2, stream=stream)
+    # Test 2-byte fill via numpy uint16
+    if np is not None:
+        buffer2 = dummy_mr.allocate(size=1024)  # Divisible by 2
+        buffer2.fill(np.uint16(0x1234), stream=stream)
+        device.sync()
+
+        if check:
+            ptr = ctypes.cast(buffer2.handle, ctypes.POINTER(ctypes.c_uint16))
+            for i in range(5):
+                assert ptr[i] == 0x1234
+
+        buffer2.close()
+
+    # Test 2-byte fill via raw bytes
+    buffer2b = dummy_mr.allocate(size=1024)
+    buffer2b.fill(b"\x34\x12", stream=stream)  # 0x1234 in little-endian
     device.sync()
 
     if check:
-        ptr = ctypes.cast(buffer2.handle, ctypes.POINTER(ctypes.c_uint16))
+        ptr = ctypes.cast(buffer2b.handle, ctypes.POINTER(ctypes.c_uint16))
         for i in range(5):
             assert ptr[i] == 0x1234
 
-    # Test error: value out of range for width=2
-    for bad_value in [-42, -1, 65536, 65537, 100000]:
-        with pytest.raises(ValueError, match="value must be in range \\[0, 65535\\]"):
-            buffer2.fill(bad_value, width=2, stream=stream)
+    # Test error: buffer size not divisible by 2
+    buffer_err = dummy_mr.allocate(size=1025)
+    with pytest.raises(ValueError, match="must be divisible by 2"):
+        buffer_err.fill(b"\x12\x34", stream=stream)
+    buffer_err.close()
 
-    buffer2.close()
+    buffer2b.close()
+
+    # Test 4-byte fill via numpy uint32
+    if np is not None:
+        buffer4 = dummy_mr.allocate(size=1024)  # Divisible by 4
+        buffer4.fill(np.uint32(0xDEADBEEF), stream=stream)
+        device.sync()
+
+        if check:
+            ptr = ctypes.cast(buffer4.handle, ctypes.POINTER(ctypes.c_uint32))
+            for i in range(5):
+                assert ptr[i] == 0xDEADBEEF
 
-    # Test width=4 (32-bit fill)
-    buffer4 = dummy_mr.allocate(size=1024)  # Divisible by 4
-    buffer4.fill(0xDEADBEEF, width=4, stream=stream)
+        buffer4.close()
+
+    # Test 4-byte fill via raw bytes
+    buffer4b = dummy_mr.allocate(size=1024)
+    buffer4b.fill(b"\xef\xbe\xad\xde", stream=stream)  # 0xDEADBEEF in little-endian
     device.sync()
 
     if check:
-        ptr = ctypes.cast(buffer4.handle, ctypes.POINTER(ctypes.c_uint32))
+        ptr = ctypes.cast(buffer4b.handle, ctypes.POINTER(ctypes.c_uint32))
         for i in range(5):
             assert ptr[i] == 0xDEADBEEF
 
-    # Test error: value out of range for width=4
-    for bad_value in [-42, -1, 4294967296, 4294967297, 5000000000]:
-        with pytest.raises(ValueError, match="value must be in range \\[0, 4294967295\\]"):
-            buffer4.fill(bad_value, width=4, stream=stream)
-
-    # Test error: buffer size not divisible by width
-    for bad_size in [1025, 1026, 1027, 1029, 1030, 1031]:  # Not divisible by 4
+    # Test error: buffer size not divisible by 4
+    for bad_size in [1025, 1026, 1027]:
         buffer_err2 = dummy_mr.allocate(size=bad_size)
-        with pytest.raises(ValueError, match="must be divisible"):
-            buffer_err2.fill(0xDEADBEEF, width=4, stream=stream)
+        with pytest.raises(ValueError, match="must be divisible by 4"):
+            buffer_err2.fill(b"\xde\xad\xbe\xef", stream=stream)
         buffer_err2.close()
 
-    buffer4.close()
+    buffer4b.close()
+
+    # Test error: invalid byte length (not 1, 2, or 4)
+    buffer_err3 = dummy_mr.allocate(size=1024)
+    with pytest.raises(ValueError, match="value must be 1, 2, or 4 bytes, got 3"):
+        buffer_err3.fill(b"\x01\x02\x03", stream=stream)
+    buffer_err3.close()
+
+    # Test float32 fill via numpy
+    if np is not None:
+        buffer_float = dummy_mr.allocate(size=1024)
+        buffer_float.fill(np.float32(1.0), stream=stream)
+        device.sync()
+
+        if check:
+            ptr = ctypes.cast(buffer_float.handle, ctypes.POINTER(ctypes.c_float))
+            for i in range(5):
+                assert ptr[i] == 1.0
+
+        buffer_float.close()
 
 
 def test_buffer_fill():