diff --git a/db/memtable.h b/db/memtable.h index b2df0df816d6..33ba42168f23 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -719,7 +719,26 @@ class MemTable { inline DynamicBloom* GetBloomFilter() { if (needs_bloom_filter_) { - auto ptr = bloom_filter_ptr_.load(std::memory_order_relaxed); + // Uses release-acquire to prevent a data race on weak memory models (e.g., + // ARM). Without it: Thread1 first constructs the DynamicBloom (initializing + // data_), then stores bloom_filter_ptr_. With relaxed ordering, these two + // writes may reach another CPU's cache in reversed order, causing Thread2 + // to see non-null bloom_filter_ptr_ but uninitialized data_ (crash). With + // release-acquire: store(release) ensures all prior writes (including + // data_ initialization) are visible before the bloom_filter_ptr_ update; + // load(acquire) ensures we see them in order, guaranteeing a fully + // initialized object. + // Performance: This load() executes on every read/write call (hot path). + // Acquire vs relaxed overhead: On x86, both compile to the same + // instruction (MOV), so the cost is identical. On ARM, acquire may add a + // memory barrier (~1-3 cycles). In practice, since bloom_filter_ptr_ is + // written once (during initialization with release) and then only read, + // the cache line is typically in Shared state at runtime, meaning both + // acquire and relaxed loads read from local cache with similar + // performance. The acquire overhead is typically negligible compared to + // subsequent bloom filter operations (hash computation and memory + // access). + auto ptr = bloom_filter_ptr_.load(std::memory_order_acquire); if (UNLIKELY(ptr == nullptr)) { std::lock_guard guard(bloom_filter_mutex_); if (bloom_filter_ == nullptr) { @@ -729,7 +748,7 @@ class MemTable { moptions_.memtable_huge_page_size, logger_)); } ptr = bloom_filter_.get(); - bloom_filter_ptr_.store(ptr, std::memory_order_relaxed); + bloom_filter_ptr_.store(ptr, std::memory_order_release); } return ptr; }