From 7d80728540b1a90ed62ac7cc9c7d8daa409c6ab8 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 3 Mar 2025 16:55:19 +0100 Subject: [PATCH 1/3] BUG: Fix searchsorted and CheckFromAny byte-swapping logic This closes gh-28190 and fixes another issue in the initial code that triggered the regression. Note that we may still want to avoid this, since this does lead to constructing (view compatible) structured dtypes unnecessarily here. It would also compactify the dtype. For building unnecessary dtypes, the better solution may be to just introduce a "canonical" flag to the dtypes (now that we have the space). --- numpy/_core/src/multiarray/ctors.c | 9 +++++---- numpy/_core/src/multiarray/item_selection.c | 15 ++++----------- numpy/_core/tests/test_regression.py | 6 ++++++ 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/numpy/_core/src/multiarray/ctors.c b/numpy/_core/src/multiarray/ctors.c index 383aa200e21d..fe0a82ac8559 100644 --- a/numpy/_core/src/multiarray/ctors.c +++ b/numpy/_core/src/multiarray/ctors.c @@ -1806,26 +1806,27 @@ PyArray_CheckFromAny(PyObject *op, PyArray_Descr *descr, int min_depth, * Internal version of PyArray_CheckFromAny that accepts a dtypemeta. Borrows * references to the descriptor and dtype. */ - NPY_NO_EXPORT PyObject * PyArray_CheckFromAny_int(PyObject *op, PyArray_Descr *in_descr, PyArray_DTypeMeta *in_DType, int min_depth, int max_depth, int requires, PyObject *context) { PyObject *obj; + Py_XINCREF(in_descr); /* take ownership as we may replace it */ if (requires & NPY_ARRAY_NOTSWAPPED) { if (!in_descr && PyArray_Check(op)) { in_descr = PyArray_DESCR((PyArrayObject *)op); - Py_INCREF(in_descr); } - if (in_descr) { - PyArray_DESCR_REPLACE_CANONICAL(in_descr); + PyArray_DESCR_REPLACE_CANONICAL(in_descr); + if (in_descr == NULL) { + return NULL; } } int was_scalar; obj = PyArray_FromAny_int(op, in_descr, in_DType, min_depth, max_depth, requires, context, &was_scalar); + Py_XDECREF(in_descr); if (obj == NULL) { return NULL; } diff --git a/numpy/_core/src/multiarray/item_selection.c b/numpy/_core/src/multiarray/item_selection.c index d02e420c36a1..205df1412581 100644 --- a/numpy/_core/src/multiarray/item_selection.c +++ b/numpy/_core/src/multiarray/item_selection.c @@ -2127,7 +2127,6 @@ PyArray_SearchSorted(PyArrayObject *op1, PyObject *op2, if (dtype == NULL) { return NULL; } - /* refs to dtype we own = 1 */ /* Look for binary search function */ if (perm) { @@ -2138,26 +2137,20 @@ PyArray_SearchSorted(PyArrayObject *op1, PyObject *op2, } if (binsearch == NULL && argbinsearch == NULL) { PyErr_SetString(PyExc_TypeError, "compare not supported for type"); - /* refs to dtype we own = 1 */ Py_DECREF(dtype); - /* refs to dtype we own = 0 */ return NULL; } - /* need ap2 as contiguous array and of right type */ - /* refs to dtype we own = 1 */ - Py_INCREF(dtype); - /* refs to dtype we own = 2 */ + /* need ap2 as contiguous array and of right dtype (steals and may be replace it) */ ap2 = (PyArrayObject *)PyArray_CheckFromAny(op2, dtype, 0, 0, NPY_ARRAY_CARRAY_RO | NPY_ARRAY_NOTSWAPPED, NULL); - /* refs to dtype we own = 1, array creation steals one even on failure */ if (ap2 == NULL) { - Py_DECREF(dtype); - /* refs to dtype we own = 0 */ return NULL; } + /* dtype was stolen, replace it in case the array creation replaced it. */ + dtype = (PyArray_Descr *)Py_NewRef(PyArray_DESCR(ap2)); /* * If the needle (ap2) is larger than the haystack (op1) we copy the @@ -2166,9 +2159,9 @@ PyArray_SearchSorted(PyArrayObject *op1, PyObject *op2, if (PyArray_SIZE(ap2) > PyArray_SIZE(op1)) { ap1_flags |= NPY_ARRAY_CARRAY_RO; } + /* dtype is stolen, after this we have no reference */ ap1 = (PyArrayObject *)PyArray_CheckFromAny((PyObject *)op1, dtype, 1, 1, ap1_flags, NULL); - /* refs to dtype we own = 0, array creation steals one even on failure */ if (ap1 == NULL) { goto fail; } diff --git a/numpy/_core/tests/test_regression.py b/numpy/_core/tests/test_regression.py index 826bc60a4735..e1de6aca1899 100644 --- a/numpy/_core/tests/test_regression.py +++ b/numpy/_core/tests/test_regression.py @@ -2652,3 +2652,9 @@ def test_sort_overlap(self): inp = np.linspace(0, size, num=size, dtype=np.intc) out = np.sort(inp) assert_equal(inp, out) + + def test_searchsorted_structured(self): + # gh-28190 + x = np.array([(0, 1.)], dtype=[('time', ' Date: Tue, 4 Mar 2025 21:29:50 +0100 Subject: [PATCH 2/3] PoC,WIP: Proof of concept for extending buffer protocol string This is a proof of concept, that allows roundtripping code such as: ``` a = np.arange(100000).astype("T") print(memoryview(a).format) # just to see it via_memview_string = np.asarray(memoryview(a)) a = np.arange(1000).astype("m8[ms]") # or even struct "m8[...],i" print(memoryview(a).format) # just to see it via_memview_timedelta = np.asarray(memoryview(a)) ``` I opted to simply store the `descr` pointer directly. This seems just as well because the descr owns the side-car buffer. One could still export it to non-Python (somewhat) since accessing that side-car buffer _doesn't_ need Python API in the end (just struct layout information). --- numpy/_core/_internal.py | 29 +++++++++++++++++++++++++++++ numpy/_core/src/multiarray/buffer.c | 17 +++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/numpy/_core/_internal.py b/numpy/_core/_internal.py index 45d0a07855a1..e9112aeeda7a 100644 --- a/numpy/_core/_internal.py +++ b/numpy/_core/_internal.py @@ -627,6 +627,10 @@ def _view_is_safe(oldtype, newtype): 'X': 'function pointers', } +_time_unit_map = ("Y", "M", "W", "", "D", "h", "m", "s", "ms", + "us", "ns", "ps", "fs", "as", "generic") + + class _Stream: def __init__(self, s): self.s = s @@ -721,6 +725,31 @@ def __dtype_from_pep3118(stream, is_subdtype): if stream.consume('T{'): value, align = __dtype_from_pep3118( stream, is_subdtype=True) + elif stream.consume("[numpy$"): + # TODO: Clearly, we would need a registration and a C callback + # probably, i.e. a slot that is called based on the name. + dtype_str = stream.consume_until("]") + module, name, *params = dtype_str.split(":", 2) + numpy_byteorder = {'@': '=', '^': '='}.get( + stream.byteorder, stream.byteorder) + if module == "numpy.dtypes": + if name == "TimeDelta64DType": + unit, num = [int(i, 16) for i in params[0].split(":")] + unit = _time_unit_map[unit] + value = dtype(f"{numpy_byteorder}m8[{num}{unit}]") + elif name == "StringDType": + # Just stores the Python object (ignores any other state). + # (We could always do this, but I think it is nice to avoid + # objects when possible..) + import ctypes + value = ctypes.cast(int(params[0], 16), ctypes.py_object).value + if type(value) != StringDType: + raise SystemError("Critical error, dtype not a StringDType.") + else: + raise NotImplementedError(f"Unknown NumPy dtype: {module}:{name}") + align = value.alignment + else: + raise NotImplementedError(f"Unknown NumPy dtype: {module}:{name}") elif stream.next in type_map_chars: if stream.next == 'Z': typechar = stream.advance(2) diff --git a/numpy/_core/src/multiarray/buffer.c b/numpy/_core/src/multiarray/buffer.c index fcff3ad6ca74..7e6079daf97d 100644 --- a/numpy/_core/src/multiarray/buffer.c +++ b/numpy/_core/src/multiarray/buffer.c @@ -18,6 +18,8 @@ #include "arrayobject.h" #include "scalartypes.h" #include "dtypemeta.h" +#include "descriptor.h" +#include "_datetime.h" /************************************************************************* **************** Implement Buffer Protocol **************************** @@ -420,6 +422,21 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str, if (_append_str(str, buf) < 0) return -1; break; } + case NPY_TIMEDELTA: { + char buf[128]; + PyArray_DatetimeMetaData *meta = get_datetime_metadata_from_dtype(descr); + PyOS_snprintf(buf, sizeof(buf), "[numpy$numpy.dtypes:TimeDelta64DType:%x:%x]", + meta->base, meta->num); + if (_append_str(str, buf) < 0) return -1; + break; + } + case NPY_VSTRING: { + char buf[128]; + PyOS_snprintf(buf, sizeof(buf), "[numpy$numpy.dtypes:StringDType:%zx]", + (Py_uintptr_t)descr); + if (_append_str(str, buf) < 0) return -1; + break; + } default: if (PyDataType_ISLEGACY(descr)) { PyErr_Format(PyExc_ValueError, From 55dfd59a327af60c0399bce54d80ce48b376857e Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Sun, 16 Mar 2025 15:57:26 +0100 Subject: [PATCH 3/3] Add basic PoC for string dtype and typed memoryview --- numpy/__init__.cython-30.pxd | 3 ++ numpy/_core/tests/examples/cython/checks.pyx | 35 ++++++++++++++++++++ numpy/_core/tests/test_cython.py | 8 +++++ 3 files changed, 46 insertions(+) diff --git a/numpy/__init__.cython-30.pxd b/numpy/__init__.cython-30.pxd index 1c557b1ebb9f..b6b16cac2e13 100644 --- a/numpy/__init__.cython-30.pxd +++ b/numpy/__init__.cython-30.pxd @@ -7,6 +7,8 @@ # Author: Dag Sverre Seljebotn # +cimport cython + from cpython.ref cimport Py_INCREF from cpython.object cimport PyObject, PyTypeObject, PyObject_TypeCheck cimport libc.stdio as stdio @@ -1221,6 +1223,7 @@ cdef extern from "numpy/ndarraytypes.h": ctypedef struct npy_string_allocator: pass + @cython.extended_buffer_regex("numpy", r"numpy.dtypes.StringDType:[\da-f]+") ctypedef struct npy_packed_static_string: pass diff --git a/numpy/_core/tests/examples/cython/checks.pyx b/numpy/_core/tests/examples/cython/checks.pyx index 6cdfd787c838..879b9ac1e5a2 100644 --- a/numpy/_core/tests/examples/cython/checks.pyx +++ b/numpy/_core/tests/examples/cython/checks.pyx @@ -4,6 +4,9 @@ Functions in this module give python-space wrappers for cython functions exposed in numpy/__init__.pxd, so they can be tested in test_cython.py """ +from libc.stdint cimport uintptr_t +from cpython.ref cimport Py_INCREF + cimport numpy as cnp cnp.import_array() @@ -358,6 +361,38 @@ def npystring_allocators_other_types(arr1, arr2): return ret +cdef cnp.PyArray_StringDTypeObject *string_dtype_from_format(char *fmt): + # This helper should be part of the numpy.pyd of course (but needs to be + # in it's own `.pyd` for NumPy backwards compatibility). + cdef cnp.PyArray_StringDTypeObject descr + cdef uintptr_t ptr + if not fmt.startswith(b"[numpy$numpy.dtypes:StringDType:"): + raise NotImplementedError("not able to parse this format (yet)") + fmt = fmt + len(b"[numpy$numpy.dtypes:StringDType:") + len_fmt = len(fmt) + assert fmt[len_fmt-1] == "]" + ptr = int(fmt[:len_fmt-1], 16) + return ptr + + +def npystring_write_memview(cnp.npy_packed_static_string[:] mview): + cdef char *string = "Hello world, hello Pythonistas" + cdef size_t size = len(string) + cdef size_t i + + cdef cnp.PyArray_StringDTypeObject *descr = string_dtype_from_format(mview.format) + allocator = cnp.NpyString_acquire_allocator(descr) + + # copy string->packed_string, the pointer to the underlying array buffer + for i in range(mview.shape[0]): + ret = cnp.NpyString_pack(allocator, &mview[i], string, size) + if ret < 0: + break + + cnp.NpyString_release_allocator(allocator) + return 1 + + def check_npy_uintp_type_enum(): # Regression test for gh-27890: cnp.NPY_UINTP was not defined. # Cython would fail to compile this before gh-27890 was fixed. diff --git a/numpy/_core/tests/test_cython.py b/numpy/_core/tests/test_cython.py index 53c3d92f8a14..4b3d0999165f 100644 --- a/numpy/_core/tests/test_cython.py +++ b/numpy/_core/tests/test_cython.py @@ -329,6 +329,14 @@ def test_npystring_multiple_allocators(install_temp): assert arr1[-1] is None assert arr2[0] == "test this" +def test_pystring_pack_mview(install_temp): + """Test basic memoryview interface""" + import checks + + dt = np.dtypes.StringDType(na_object=None) + arr = np.array(['abcd', 'b', 'c'], dtype=dt) + checks.npystring_write_memview(arr) + assert (arr == "Hello world, hello Pythonistas").all() def test_npystring_allocators_other_dtype(install_temp): """Check that allocators for non-StringDType arrays is NULL."""