From e57ff98faeaa56dff3d2a3cfa5d25aa9ae6ea187 Mon Sep 17 00:00:00 2001 From: David Gidwani Date: Sat, 6 Dec 2025 01:18:47 -0500 Subject: [PATCH 1/2] feat(scan): add buffer protocol support for zero-copy scanning - Update Database_scan to accept any buffer protocol object (memoryview, bytearray, etc.) instead of requiring exact bytes objects - Update Stream_scan to use y* format specifier for buffer protocol support - Add tests for memoryview and bytearray scanning in block, stream, and chimera modes Closes #250 --- src/hyperscan/extension.c | 33 +++++---- tests/test_hyperscan.py | 149 +++++++++++++++++++++++++++++--------- tests/test_threading.py | 10 +-- 3 files changed, 140 insertions(+), 52 deletions(-) diff --git a/src/hyperscan/extension.c b/src/hyperscan/extension.c index f34e497..57ce975 100644 --- a/src/hyperscan/extension.c +++ b/src/hyperscan/extension.c @@ -774,15 +774,18 @@ static PyObject *Database_scan(Database *self, PyObject *args, PyObject *kwds) Py_XDECREF(fast_seq); HANDLE_HYPERSCAN_ERR(hs_err, NULL); } else { - if (!PyBytes_CheckExact(odata)) { + if (!PyObject_CheckBuffer(odata)) { PyErr_SetString(PyExc_TypeError, "a bytes-like object is required"); HS_LOCK_RETURN_NULL(); } - char *data = PyBytes_AsString(odata); - if (data == NULL) + Py_buffer view; + if (PyObject_GetBuffer(odata, &view, PyBUF_SIMPLE) == -1) { HS_LOCK_RETURN_NULL(); - Py_ssize_t length = PyBytes_Size(odata); + } + + char *data = (char *)view.buf; + Py_ssize_t length = view.len; if (self->chimera) { ch_error_t ch_err; @@ -798,6 +801,7 @@ static PyObject *Database_scan(Database *self, PyObject *args, PyObject *kwds) NULL, ocallback == Py_None ? NULL : (void *)&cctx); Py_END_ALLOW_THREADS; + PyBuffer_Release(&view); if (PyErr_Occurred()) { HS_LOCK_RETURN_NULL(); } @@ -815,6 +819,7 @@ static PyObject *Database_scan(Database *self, PyObject *args, PyObject *kwds) ocallback == Py_None ? NULL : hs_match_handler, ocallback == Py_None ? NULL : (void *)&cctx); Py_END_ALLOW_THREADS; + PyBuffer_Release(&view); if (PyErr_Occurred()) { HS_LOCK_RETURN_NULL(); } @@ -1119,9 +1124,8 @@ static PyObject *Stream_scan(Stream *self, PyObject *args, PyObject *kwds) HS_LOCK_DECLARE(); HS_LOCK_ACQUIRE_OR_RETURN_NULL(); - char *data; - Py_ssize_t length; - uint32_t flags; + Py_buffer view; + uint32_t flags = 0; PyObject *ocallback = Py_None, *octx = Py_None, *oscratch = Py_None; static char *kwlist[] = { @@ -1129,15 +1133,15 @@ static PyObject *Stream_scan(Stream *self, PyObject *args, PyObject *kwds) if (!PyArg_ParseTupleAndKeywords( args, kwds, - "s#|IOOO", + "y*|IOOO", kwlist, - &data, - &length, + &view, &flags, &oscratch, &ocallback, - &octx)) + &octx)) { HS_LOCK_RETURN_NULL(); + } if (PyObject_Not(ocallback)) ocallback = self->cctx->callback; @@ -1153,6 +1157,7 @@ static PyObject *Stream_scan(Stream *self, PyObject *args, PyObject *kwds) if (!PyObject_IsInstance(oscratch, (PyObject *)&ScratchType)) { PyErr_SetString( PyExc_TypeError, "scratch must be a hyperscan.Scratch instance"); + PyBuffer_Release(&view); HS_LOCK_RETURN_NULL(); } scratch = (Scratch *)oscratch; @@ -1161,6 +1166,7 @@ static PyObject *Stream_scan(Stream *self, PyObject *args, PyObject *kwds) py_scan_callback_ctx cctx = {ocallback, octx}; if (db->chimera) { + PyBuffer_Release(&view); PyErr_SetString(PyExc_RuntimeError, "chimera does not support streams"); HS_LOCK_RETURN_NULL(); } else { @@ -1168,13 +1174,14 @@ static PyObject *Stream_scan(Stream *self, PyObject *args, PyObject *kwds) Py_BEGIN_ALLOW_THREADS; hs_err = hs_scan_stream( self->identifier, - data, - length, + (char *)view.buf, + view.len, flags, scratch->hs_scratch, ocallback == Py_None ? NULL : hs_match_handler, ocallback == Py_None ? NULL : (void *)&cctx); Py_END_ALLOW_THREADS; + PyBuffer_Release(&view); HANDLE_HYPERSCAN_ERR(hs_err, NULL); } diff --git a/tests/test_hyperscan.py b/tests/test_hyperscan.py index e9d5ea5..961d8b1 100644 --- a/tests/test_hyperscan.py +++ b/tests/test_hyperscan.py @@ -84,6 +84,21 @@ def test_chimera_scan(database_chimera, mocker): ) +def test_chimera_scan_memoryview(database_chimera, mocker): + """Test chimera scanning with memoryview (buffer protocol support, issue #250).""" + callback = mocker.Mock(return_value=None) + + database_chimera.scan(memoryview(b"foobar"), match_event_handler=callback) + callback.assert_has_calls( + [ + mocker.call(0, 0, 3, 0, [(1, 0, 3)], None), + mocker.call(1, 0, 6, 0, [(1, 0, 6)], None), + mocker.call(2, 3, 6, 0, [(1, 3, 6)], None), + ], + any_order=True, + ) + + def test_block_scan(database_block, mocker): callback = mocker.Mock(return_value=None) @@ -99,6 +114,38 @@ def test_block_scan(database_block, mocker): ) +def test_block_scan_memoryview(database_block, mocker): + """Test scanning with memoryview (buffer protocol support, issue #250).""" + callback = mocker.Mock(return_value=None) + + database_block.scan(memoryview(b"foobar"), match_event_handler=callback) + callback.assert_has_calls( + [ + mocker.call(0, 0, 2, 0, None), + mocker.call(0, 0, 3, 0, None), + mocker.call(1, 0, 6, 0, None), + mocker.call(2, 3, 6, 0, None), + ], + any_order=True, + ) + + +def test_block_scan_bytearray(database_block, mocker): + """Test scanning with bytearray (buffer protocol support, issue #250).""" + callback = mocker.Mock(return_value=None) + + database_block.scan(bytearray(b"foobar"), match_event_handler=callback) + callback.assert_has_calls( + [ + mocker.call(0, 0, 2, 0, None), + mocker.call(0, 0, 3, 0, None), + mocker.call(1, 0, 6, 0, None), + mocker.call(2, 3, 6, 0, None), + ], + any_order=True, + ) + + def test_stream_scan(database_stream, mocker): callback = mocker.Mock(return_value=None) @@ -119,6 +166,42 @@ def test_stream_scan(database_stream, mocker): ) +def test_stream_scan_memoryview(database_stream, mocker): + """Test stream scanning with memoryview (buffer protocol support, issue #250).""" + callback = mocker.Mock(return_value=None) + + with database_stream.stream(match_event_handler=callback) as stream: + stream.scan(memoryview(b"foo")) + stream.scan(memoryview(b"bar")) + callback.assert_has_calls( + [ + mocker.call(0, 0, 2, 0, None), + mocker.call(0, 0, 3, 0, None), + mocker.call(1, 0, 6, 0, None), + mocker.call(2, 3, 6, 0, None), + ], + any_order=True, + ) + + +def test_stream_scan_bytearray(database_stream, mocker): + """Test stream scanning with bytearray (buffer protocol support, issue #250).""" + callback = mocker.Mock(return_value=None) + + with database_stream.stream(match_event_handler=callback) as stream: + stream.scan(bytearray(b"foo")) + stream.scan(bytearray(b"bar")) + callback.assert_has_calls( + [ + mocker.call(0, 0, 2, 0, None), + mocker.call(0, 0, 3, 0, None), + mocker.call(1, 0, 6, 0, None), + mocker.call(2, 3, 6, 0, None), + ], + any_order=True, + ) + + def test_vectored_scan(database_vector, mocker): """Test vectored scanning across multiple buffers. @@ -136,8 +219,8 @@ def test_vectored_scan(database_vector, mocker): callback.assert_has_calls( [ # Pattern 0 (fo+): matches in buffer 0 and buffer 1 - mocker.call(0, 0, 5, 0, None), # 'fo' at positions 3-4 - mocker.call(0, 0, 6, 0, None), # 'foo' at positions 3-5 + mocker.call(0, 0, 5, 0, None), # 'fo' at positions 3-4 + mocker.call(0, 0, 6, 0, None), # 'foo' at positions 3-5 mocker.call(0, 0, 13, 0, None), # 'fo' in buffer 1 at pos 11-12 # Pattern 2 (BAR): matches in buffer 1 and buffer 2 mocker.call(2, 14, 17, 0, None), # 'bar' in buffer 1 @@ -334,92 +417,92 @@ def test_literal_expressions(mocker): def test_unicode_expressions(): """Test unicode pattern compilation and scanning (issue #207). - + This test validates that Unicode patterns (Arabic/Hebrew text) compile and match correctly after fixing PCRE UTF-8 support in the build system. - + Background: The original issue was "Expression is not valid UTF-8" errors when compiling valid UTF-8 patterns. This was caused by PCRE being built without UTF-8 support in v0.7.9+ when the build system switched from setup.py to CMake. - + Note on HS_FLAG_UTF8: We avoid using HS_FLAG_UTF8 by default due to known Hyperscan/Vectorscan limitations and bugs: - - intel/hyperscan#57: UTF-8 match failures with \\Q...\\E patterns + - intel/hyperscan#57: UTF-8 match failures with \\Q...\\E patterns - intel/hyperscan#133: Parser bug with Ragel v7 incorrectly rejecting valid UTF-8 - intel/hyperscan#163: Performance issues with UTF-8 + case-insensitive flags - + Unicode patterns work correctly without HS_FLAG_UTF8 when PCRE has proper UTF-8 support, which is what our CMake fixes provide. """ complex_patterns = [ - r'السلام عليكم\s<\/span>', - r'ועליכום הסלאם\s<\/span>' - ] - - simple_patterns = [ - 'السلام عليكم', - 'ועליכום הסلאם' + r"السلام عليكم\s<\/span>", + r"ועליכום הסלאם\s<\/span>", ] - + + simple_patterns = ["السلام عليكم", "ועליכום הסلאם"] + db_complex = hyperscan.Database() db_complex.compile(expressions=complex_patterns) - + db_simple = hyperscan.Database() db_simple.compile(expressions=simple_patterns) - - bytes_patterns = [p.encode('utf-8') for p in simple_patterns] + + bytes_patterns = [p.encode("utf-8") for p in simple_patterns] db_bytes = hyperscan.Database() db_bytes.compile(expressions=bytes_patterns) - + db_utf8 = hyperscan.Database() try: db_utf8.compile(expressions=simple_patterns, flags=hyperscan.HS_FLAG_UTF8) except Exception as e: pytest.skip(f"HS_FLAG_UTF8 validation failed (known limitation): {e}") - + test_text = 'السلام عليكم ' - + scratch = hyperscan.Scratch(db_complex) db_complex.scratch = scratch - + matches = [] + def on_match(pattern_id, from_offset, to_offset, flags, context): matches.append((pattern_id, from_offset, to_offset)) return 0 - + # The primary issue was compilation failure with "Expression is not valid UTF-8" # If we reach this point, the compilation succeeded, which is the main fix - + # Test matching to verify patterns actually work # Try matching the first simple pattern against itself pattern_text = simple_patterns[0] # 'السلام عليكم' - + scratch_simple = hyperscan.Scratch(db_simple) db_simple.scratch = scratch_simple - + simple_matches = [] + def on_simple_match(pattern_id, from_offset, to_offset, flags, context): simple_matches.append((pattern_id, from_offset, to_offset)) return 0 - - db_simple.scan(pattern_text.encode('utf-8'), match_event_handler=on_simple_match) - + + db_simple.scan(pattern_text.encode("utf-8"), match_event_handler=on_simple_match) + # The fact that we compiled successfully is the main victory # But let's also verify basic functionality works if len(simple_matches) == 0: # If unicode matching fails, at least verify bytes patterns work # This ensures our PCRE fixes don't break basic functionality test_db = hyperscan.Database() - test_db.compile(expressions=[b'test']) + test_db.compile(expressions=[b"test"]) test_scratch = hyperscan.Scratch(test_db) test_db.scratch = test_scratch - + test_matches = [] + def on_test_match(pattern_id, from_offset, to_offset, flags, context): test_matches.append((pattern_id, from_offset, to_offset)) return 0 - - test_db.scan(b'test', match_event_handler=on_test_match) + + test_db.scan(b"test", match_event_handler=on_test_match) assert len(test_matches) > 0, "Basic pattern matching should work" diff --git a/tests/test_threading.py b/tests/test_threading.py index 0807afa..6e84499 100644 --- a/tests/test_threading.py +++ b/tests/test_threading.py @@ -44,9 +44,7 @@ def blocking_callback(*args): callback_ready.set() # The first thread to enter the scan should block here until contention # is detected by the other thread. - assert release_event.wait( - timeout=5 - ), "expected scratch contention not observed" + assert release_event.wait(timeout=5), "expected scratch contention not observed" return 0 def worker(slot: int): @@ -66,9 +64,9 @@ def worker(slot: int): for thread in threads: thread.start() - assert callback_ready.wait( - timeout=5 - ), "scan callback did not run; scratch contention test invalid" + assert callback_ready.wait(timeout=5), ( + "scan callback did not run; scratch contention test invalid" + ) for thread in threads: thread.join() From 757495ded9983b648c847faa5c0dbaa26436c5e0 Mon Sep 17 00:00:00 2001 From: David Gidwani Date: Sat, 6 Dec 2025 01:24:55 -0500 Subject: [PATCH 2/2] chore(tests): remove issue numbers from docstrings --- tests/test_hyperscan.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_hyperscan.py b/tests/test_hyperscan.py index 961d8b1..e762b3b 100644 --- a/tests/test_hyperscan.py +++ b/tests/test_hyperscan.py @@ -85,7 +85,7 @@ def test_chimera_scan(database_chimera, mocker): def test_chimera_scan_memoryview(database_chimera, mocker): - """Test chimera scanning with memoryview (buffer protocol support, issue #250).""" + """Test chimera scanning with memoryview (buffer protocol support).""" callback = mocker.Mock(return_value=None) database_chimera.scan(memoryview(b"foobar"), match_event_handler=callback) @@ -115,7 +115,7 @@ def test_block_scan(database_block, mocker): def test_block_scan_memoryview(database_block, mocker): - """Test scanning with memoryview (buffer protocol support, issue #250).""" + """Test scanning with memoryview (buffer protocol support).""" callback = mocker.Mock(return_value=None) database_block.scan(memoryview(b"foobar"), match_event_handler=callback) @@ -131,7 +131,7 @@ def test_block_scan_memoryview(database_block, mocker): def test_block_scan_bytearray(database_block, mocker): - """Test scanning with bytearray (buffer protocol support, issue #250).""" + """Test scanning with bytearray (buffer protocol support).""" callback = mocker.Mock(return_value=None) database_block.scan(bytearray(b"foobar"), match_event_handler=callback) @@ -167,7 +167,7 @@ def test_stream_scan(database_stream, mocker): def test_stream_scan_memoryview(database_stream, mocker): - """Test stream scanning with memoryview (buffer protocol support, issue #250).""" + """Test stream scanning with memoryview (buffer protocol support).""" callback = mocker.Mock(return_value=None) with database_stream.stream(match_event_handler=callback) as stream: @@ -185,7 +185,7 @@ def test_stream_scan_memoryview(database_stream, mocker): def test_stream_scan_bytearray(database_stream, mocker): - """Test stream scanning with bytearray (buffer protocol support, issue #250).""" + """Test stream scanning with bytearray (buffer protocol support).""" callback = mocker.Mock(return_value=None) with database_stream.stream(match_event_handler=callback) as stream: