From 7101ba040e4b7501a877f893009f1e705b32b591 Mon Sep 17 00:00:00 2001 From: hhwyt Date: Mon, 17 Nov 2025 14:34:44 +0800 Subject: [PATCH 1/2] Fix data race in MemTable::GetBloomFilter() on ARM weak memory model Use release-acquire memory ordering instead of relaxed to ensure DynamicBloom object is fully initialized before pointer publication. Problem: - On ARM (weak memory model), using relaxed ordering allows the pointer to be published before DynamicBloom::data_ is initialized - Readers may see non-null pointer but access uninitialized data_, causing crashes Solution: - Use memory_order_acquire for load and memory_order_release for store - This establishes synchronizes-with relationship, guaranteeing readers only see fully initialized objects Performance impact: - The acquire load executes on every call (hot path in Get/Add operations) - Overhead is minimal: typically 0-2 cycles on x86 (same as relaxed), ~1-3 cycles on ARM (may add memory barrier) - This overhead is <1% of total latency compared to subsequent bloom filter operations (hash computation + memory access) Signed-off-by: hhwyt --- db/memtable.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/db/memtable.h b/db/memtable.h index b2df0df816d6..e024ab8820a2 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -719,7 +719,23 @@ class MemTable { inline DynamicBloom* GetBloomFilter() { if (needs_bloom_filter_) { - auto ptr = bloom_filter_ptr_.load(std::memory_order_relaxed); + // Uses release-acquire to prevent data race on ARM weak memory model. + // Without it: Thread1 may publish ptr before DynamicBloom::data_ is + // initialized, Thread2 may see non-null ptr but access uninitialized + // data_ (crash). With release-acquire: store(release) ensures all prior + // writes are visible, load(acquire) ensures we see them, guaranteeing + // fully initialized object. + // Performance: This load() executes on every read/write call (hot path). + // Acquire vs relaxed overhead: On x86, both compile to the same + // instruction (MOV), so cost is identical. On ARM, acquire may add a + // memory barrier (~1-3 cycles). In practice, since bloom_filter_ptr_ is + // written once (during initialization with release) and then only read, + // the cache line is typically in Shared state at runtime, meaning both + // acquire and relaxed loads read from local cache with similar + // performance. The acquire overhead is typically negligible compared to + // subsequent bloom filter operations (hash computation and memory + // access). + auto ptr = bloom_filter_ptr_.load(std::memory_order_acquire); if (UNLIKELY(ptr == nullptr)) { std::lock_guard guard(bloom_filter_mutex_); if (bloom_filter_ == nullptr) { @@ -729,7 +745,7 @@ class MemTable { moptions_.memtable_huge_page_size, logger_)); } ptr = bloom_filter_.get(); - bloom_filter_ptr_.store(ptr, std::memory_order_relaxed); + bloom_filter_ptr_.store(ptr, std::memory_order_release); } return ptr; } From 7af9a505e0a1f2bbdb3509cb8ffea7fa6859efe4 Mon Sep 17 00:00:00 2001 From: hhwyt Date: Mon, 17 Nov 2025 16:52:59 +0800 Subject: [PATCH 2/2] polish comment Signed-off-by: hhwyt --- db/memtable.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/db/memtable.h b/db/memtable.h index e024ab8820a2..33ba42168f23 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -719,12 +719,15 @@ class MemTable { inline DynamicBloom* GetBloomFilter() { if (needs_bloom_filter_) { - // Uses release-acquire to prevent data race on ARM weak memory model. - // Without it: Thread1 may publish ptr before DynamicBloom::data_ is - // initialized, Thread2 may see non-null ptr but access uninitialized - // data_ (crash). With release-acquire: store(release) ensures all prior - // writes are visible, load(acquire) ensures we see them, guaranteeing - // fully initialized object. + // Uses release-acquire to prevent data race on weak memory models (e.g., + // ARM). Without it: Thread1 in DynamicBloom::new() first initializes + // data_, then stores bloom_filter_ptr_. With relaxed ordering, these two + // writes may reach another CPU's cache in reversed order, causing Thread2 + // to see non-null bloom_filter_ptr_ but uninitialized data_ (crash). With + // release-acquire: store(release) ensures all prior writes (including + // data_ initialization) are visible before bloom_filter_ptr_ update, + // load(acquire) ensures we see them in order, guaranteeing fully + // initialized object. // Performance: This load() executes on every read/write call (hot path). // Acquire vs relaxed overhead: On x86, both compile to the same // instruction (MOV), so cost is identical. On ARM, acquire may add a