Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 20 additions & 13 deletions src/hyperscan/extension.c
Original file line number Diff line number Diff line change
Expand Up @@ -774,15 +774,18 @@ static PyObject *Database_scan(Database *self, PyObject *args, PyObject *kwds)
Py_XDECREF(fast_seq);
HANDLE_HYPERSCAN_ERR(hs_err, NULL);
} else {
if (!PyBytes_CheckExact(odata)) {
if (!PyObject_CheckBuffer(odata)) {
PyErr_SetString(PyExc_TypeError, "a bytes-like object is required");
HS_LOCK_RETURN_NULL();
}

char *data = PyBytes_AsString(odata);
if (data == NULL)
Py_buffer view;
if (PyObject_GetBuffer(odata, &view, PyBUF_SIMPLE) == -1) {
HS_LOCK_RETURN_NULL();
Py_ssize_t length = PyBytes_Size(odata);
}

char *data = (char *)view.buf;
Py_ssize_t length = view.len;

if (self->chimera) {
ch_error_t ch_err;
Expand All @@ -798,6 +801,7 @@ static PyObject *Database_scan(Database *self, PyObject *args, PyObject *kwds)
NULL,
ocallback == Py_None ? NULL : (void *)&cctx);
Py_END_ALLOW_THREADS;
PyBuffer_Release(&view);
if (PyErr_Occurred()) {
HS_LOCK_RETURN_NULL();
}
Expand All @@ -815,6 +819,7 @@ static PyObject *Database_scan(Database *self, PyObject *args, PyObject *kwds)
ocallback == Py_None ? NULL : hs_match_handler,
ocallback == Py_None ? NULL : (void *)&cctx);
Py_END_ALLOW_THREADS;
PyBuffer_Release(&view);
if (PyErr_Occurred()) {
HS_LOCK_RETURN_NULL();
}
Expand Down Expand Up @@ -1119,25 +1124,24 @@ static PyObject *Stream_scan(Stream *self, PyObject *args, PyObject *kwds)
HS_LOCK_DECLARE();
HS_LOCK_ACQUIRE_OR_RETURN_NULL();

char *data;
Py_ssize_t length;
uint32_t flags;
Py_buffer view;
uint32_t flags = 0;
PyObject *ocallback = Py_None, *octx = Py_None, *oscratch = Py_None;

static char *kwlist[] = {
"data", "flags", "scratch", "match_event_handler", "context", NULL};
if (!PyArg_ParseTupleAndKeywords(
args,
kwds,
"s#|IOOO",
"y*|IOOO",
kwlist,
&data,
&length,
&view,
&flags,
&oscratch,
&ocallback,
&octx))
&octx)) {
HS_LOCK_RETURN_NULL();
}

if (PyObject_Not(ocallback))
ocallback = self->cctx->callback;
Expand All @@ -1153,6 +1157,7 @@ static PyObject *Stream_scan(Stream *self, PyObject *args, PyObject *kwds)
if (!PyObject_IsInstance(oscratch, (PyObject *)&ScratchType)) {
PyErr_SetString(
PyExc_TypeError, "scratch must be a hyperscan.Scratch instance");
PyBuffer_Release(&view);
HS_LOCK_RETURN_NULL();
}
scratch = (Scratch *)oscratch;
Expand All @@ -1161,20 +1166,22 @@ static PyObject *Stream_scan(Stream *self, PyObject *args, PyObject *kwds)
py_scan_callback_ctx cctx = {ocallback, octx};

if (db->chimera) {
PyBuffer_Release(&view);
PyErr_SetString(PyExc_RuntimeError, "chimera does not support streams");
HS_LOCK_RETURN_NULL();
} else {
hs_error_t hs_err;
Py_BEGIN_ALLOW_THREADS;
hs_err = hs_scan_stream(
self->identifier,
data,
length,
(char *)view.buf,
view.len,
flags,
scratch->hs_scratch,
ocallback == Py_None ? NULL : hs_match_handler,
ocallback == Py_None ? NULL : (void *)&cctx);
Py_END_ALLOW_THREADS;
PyBuffer_Release(&view);
HANDLE_HYPERSCAN_ERR(hs_err, NULL);
}

Expand Down
149 changes: 116 additions & 33 deletions tests/test_hyperscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,21 @@ def test_chimera_scan(database_chimera, mocker):
)


def test_chimera_scan_memoryview(database_chimera, mocker):
    """Chimera scan accepts a ``memoryview`` input (buffer protocol).

    Scans ``memoryview(b"foobar")`` with a mock match handler and checks that
    the handler received every expected call, in any order.
    """
    handler = mocker.Mock(return_value=None)
    payload = memoryview(b"foobar")

    database_chimera.scan(payload, match_event_handler=handler)

    # Positional arguments each handler invocation is expected to carry.
    expected_args = (
        (0, 0, 3, 0, [(1, 0, 3)], None),
        (1, 0, 6, 0, [(1, 0, 6)], None),
        (2, 3, 6, 0, [(1, 3, 6)], None),
    )
    handler.assert_has_calls(
        [mocker.call(*args) for args in expected_args],
        any_order=True,
    )


def test_block_scan(database_block, mocker):
callback = mocker.Mock(return_value=None)

Expand All @@ -99,6 +114,38 @@ def test_block_scan(database_block, mocker):
)


def test_block_scan_memoryview(database_block, mocker):
    """Block-mode scan accepts a ``memoryview`` input (buffer protocol).

    Scans ``memoryview(b"foobar")`` with a mock match handler and checks that
    the handler received every expected call, in any order.
    """
    handler = mocker.Mock(return_value=None)
    payload = memoryview(b"foobar")

    database_block.scan(payload, match_event_handler=handler)

    # Positional handler arguments:
    # (pattern_id, from_offset, to_offset, flags, context)
    expected_args = (
        (0, 0, 2, 0, None),
        (0, 0, 3, 0, None),
        (1, 0, 6, 0, None),
        (2, 3, 6, 0, None),
    )
    handler.assert_has_calls(
        [mocker.call(*args) for args in expected_args],
        any_order=True,
    )


def test_block_scan_bytearray(database_block, mocker):
    """Block-mode scan accepts a ``bytearray`` input (buffer protocol).

    Scans ``bytearray(b"foobar")`` with a mock match handler and checks that
    the handler received every expected call, in any order.
    """
    handler = mocker.Mock(return_value=None)
    payload = bytearray(b"foobar")

    database_block.scan(payload, match_event_handler=handler)

    # Positional handler arguments:
    # (pattern_id, from_offset, to_offset, flags, context)
    expected_args = (
        (0, 0, 2, 0, None),
        (0, 0, 3, 0, None),
        (1, 0, 6, 0, None),
        (2, 3, 6, 0, None),
    )
    handler.assert_has_calls(
        [mocker.call(*args) for args in expected_args],
        any_order=True,
    )


def test_stream_scan(database_stream, mocker):
callback = mocker.Mock(return_value=None)

Expand All @@ -119,6 +166,42 @@ def test_stream_scan(database_stream, mocker):
)


def test_stream_scan_memoryview(database_stream, mocker):
    """Stream scan accepts ``memoryview`` chunks (buffer protocol).

    Feeds ``b"foo"`` then ``b"bar"`` as memoryviews into one stream and, once
    the stream context has exited, checks that the mock match handler
    received every expected call, in any order.
    """
    handler = mocker.Mock(return_value=None)

    with database_stream.stream(match_event_handler=handler) as stream:
        for chunk in (b"foo", b"bar"):
            stream.scan(memoryview(chunk))

    # Positional handler arguments:
    # (pattern_id, from_offset, to_offset, flags, context)
    expected_args = (
        (0, 0, 2, 0, None),
        (0, 0, 3, 0, None),
        (1, 0, 6, 0, None),
        (2, 3, 6, 0, None),
    )
    handler.assert_has_calls(
        [mocker.call(*args) for args in expected_args],
        any_order=True,
    )


def test_stream_scan_bytearray(database_stream, mocker):
    """Stream scan accepts ``bytearray`` chunks (buffer protocol).

    Feeds ``b"foo"`` then ``b"bar"`` as bytearrays into one stream and, once
    the stream context has exited, checks that the mock match handler
    received every expected call, in any order.
    """
    handler = mocker.Mock(return_value=None)

    with database_stream.stream(match_event_handler=handler) as stream:
        for chunk in (b"foo", b"bar"):
            stream.scan(bytearray(chunk))

    # Positional handler arguments:
    # (pattern_id, from_offset, to_offset, flags, context)
    expected_args = (
        (0, 0, 2, 0, None),
        (0, 0, 3, 0, None),
        (1, 0, 6, 0, None),
        (2, 3, 6, 0, None),
    )
    handler.assert_has_calls(
        [mocker.call(*args) for args in expected_args],
        any_order=True,
    )


def test_vectored_scan(database_vector, mocker):
"""Test vectored scanning across multiple buffers.

Expand All @@ -136,8 +219,8 @@ def test_vectored_scan(database_vector, mocker):
callback.assert_has_calls(
[
# Pattern 0 (fo+): matches in buffer 0 and buffer 1
mocker.call(0, 0, 5, 0, None), # 'fo' at positions 3-4
mocker.call(0, 0, 6, 0, None), # 'foo' at positions 3-5
mocker.call(0, 0, 5, 0, None), # 'fo' at positions 3-4
mocker.call(0, 0, 6, 0, None), # 'foo' at positions 3-5
mocker.call(0, 0, 13, 0, None), # 'fo' in buffer 1 at pos 11-12
# Pattern 2 (BAR): matches in buffer 1 and buffer 2
mocker.call(2, 14, 17, 0, None), # 'bar' in buffer 1
Expand Down Expand Up @@ -334,92 +417,92 @@ def test_literal_expressions(mocker):

def test_unicode_expressions():
    """Test unicode pattern compilation and scanning (issue #207).

    This test validates that Unicode patterns (Arabic/Hebrew text) compile and match
    correctly after fixing PCRE UTF-8 support in the build system.

    Background:
    The original issue was "Expression is not valid UTF-8" errors when compiling
    valid UTF-8 patterns. This was caused by PCRE being built without UTF-8 support
    in v0.7.9+ when the build system switched from setup.py to CMake.

    Note on HS_FLAG_UTF8:
    We avoid using HS_FLAG_UTF8 by default due to known Hyperscan/Vectorscan
    limitations and bugs:
    - intel/hyperscan#57: UTF-8 match failures with \\Q...\\E patterns
    - intel/hyperscan#133: Parser bug with Ragel v7 incorrectly rejecting valid UTF-8
    - intel/hyperscan#163: Performance issues with UTF-8 + case-insensitive flags

    Unicode patterns work correctly without HS_FLAG_UTF8 when PCRE has proper
    UTF-8 support, which is what our CMake fixes provide.

    The test is skipped (not failed) if compiling with HS_FLAG_UTF8 raises.
    """
    complex_patterns = [
        r"<span\s+.*>السلام عليكم\s<\/span>",
        r"<span\s+.*>ועליכום הסלאם\s<\/span>",
    ]

    # The Hebrew phrase is spelled with Hebrew lamed throughout so that it
    # agrees with the same phrase in complex_patterns (no mixed-script text).
    simple_patterns = ["السلام عليكم", "ועליכום הסלאם"]

    # Compilation is the primary check: the original failure was
    # "Expression is not valid UTF-8" raised from compile().
    db_complex = hyperscan.Database()
    db_complex.compile(expressions=complex_patterns)

    db_simple = hyperscan.Database()
    db_simple.compile(expressions=simple_patterns)

    # The same patterns must also compile when passed as UTF-8 bytes.
    bytes_patterns = [p.encode("utf-8") for p in simple_patterns]
    db_bytes = hyperscan.Database()
    db_bytes.compile(expressions=bytes_patterns)

    db_utf8 = hyperscan.Database()
    try:
        db_utf8.compile(expressions=simple_patterns, flags=hyperscan.HS_FLAG_UTF8)
    except Exception as e:
        pytest.skip(f"HS_FLAG_UTF8 validation failed (known limitation): {e}")

    # Scratch allocation for the complex database is exercised as well.
    scratch = hyperscan.Scratch(db_complex)
    db_complex.scratch = scratch

    # Test matching to verify patterns actually work:
    # scan the first simple pattern against its own text.
    pattern_text = simple_patterns[0]

    scratch_simple = hyperscan.Scratch(db_simple)
    db_simple.scratch = scratch_simple

    simple_matches = []

    def on_simple_match(pattern_id, from_offset, to_offset, flags, context):
        simple_matches.append((pattern_id, from_offset, to_offset))
        return 0

    db_simple.scan(pattern_text.encode("utf-8"), match_event_handler=on_simple_match)

    # The fact that we compiled successfully is the main victory,
    # but basic functionality is verified too.
    if not simple_matches:
        # If unicode matching fails, at least verify bytes patterns work.
        # This ensures the PCRE fixes don't break basic functionality.
        test_db = hyperscan.Database()
        test_db.compile(expressions=[b"test"])
        test_scratch = hyperscan.Scratch(test_db)
        test_db.scratch = test_scratch

        test_matches = []

        def on_test_match(pattern_id, from_offset, to_offset, flags, context):
            test_matches.append((pattern_id, from_offset, to_offset))
            return 0

        test_db.scan(b"test", match_event_handler=on_test_match)
        assert len(test_matches) > 0, "Basic pattern matching should work"
10 changes: 4 additions & 6 deletions tests/test_threading.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,7 @@ def blocking_callback(*args):
callback_ready.set()
# The first thread to enter the scan should block here until contention
# is detected by the other thread.
assert release_event.wait(
timeout=5
), "expected scratch contention not observed"
assert release_event.wait(timeout=5), "expected scratch contention not observed"
return 0

def worker(slot: int):
Expand All @@ -66,9 +64,9 @@ def worker(slot: int):
for thread in threads:
thread.start()

assert callback_ready.wait(
timeout=5
), "scan callback did not run; scratch contention test invalid"
assert callback_ready.wait(timeout=5), (
"scan callback did not run; scratch contention test invalid"
)

for thread in threads:
thread.join()
Expand Down
Loading