From d75fdf8a437074676840a96272ba5a4e735add93 Mon Sep 17 00:00:00 2001 From: Radu Berinde Date: Wed, 14 Jan 2026 13:14:01 -0800 Subject: [PATCH] Minor performance improvement in binary fuse build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During peeling, we can calculate the third index by XORing all known indexes instead of looking it up in the little table. This yields a modest improvement. On an Apple M1: ``` ❯ benchstat /tmp/before.txt /tmp/after.txt name old time/op new time/op delta BinaryFusePopulate/8/n=1000000-10 29.0ms ± 5% 27.3ms ± 1% -5.77% (p=0.000 n=10+9) name old MKeys/s new MKeys/s delta BinaryFusePopulate/8/n=1000000-10 34.5 ± 5% 36.6 ± 1% +6.06% (p=0.000 n=10+9) ``` On an n4d-standard-8 (AMD Turin): ``` name old time/op new time/op delta BinaryFusePopulate/8/n=1000000-8 34.9ms ± 1% 34.4ms ± 1% -1.44% (p=0.000 n=10+9) name old MKeys/s new MKeys/s delta BinaryFusePopulate/8/n=1000000-8 28.7 ± 1% 29.1 ± 1% +1.45% (p=0.000 n=10+9) ``` --- binaryfusefilter.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/binaryfusefilter.go b/binaryfusefilter.go index f7093b0..9ddd165 100644 --- a/binaryfusefilter.go +++ b/binaryfusefilter.go @@ -206,12 +206,12 @@ func buildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (_ BinaryF index1, index2, index3 := filter.getHashFromHash(hash) - h012[1] = index2 - h012[2] = index3 - h012[3] = index1 - h012[4] = h012[1] + // We set h012[i] = []uint32{index1, index2, index3}[(i+1)%3] + h012[0] = index2 + h012[1] = index3 + h012[2] = index1 - other_index1 := h012[found+1] + other_index1 := h012[found] alone[Qsize] = other_index1 if (t2count[other_index1] >> 2) == 2 { Qsize++ @@ -220,7 +220,7 @@ func buildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (_ BinaryF t2count[other_index1] ^= filter.mod3(found + 1) // could use this instead: tabmod3[found+1] t2hash[other_index1] ^= hash - other_index2 := h012[found+2] + 
other_index2 := index1 ^ index2 ^ index3 ^ index ^ other_index1 alone[Qsize] = other_index2 if (t2count[other_index2] >> 2) == 2 { Qsize++