From 7101ba040e4b7501a877f893009f1e705b32b591 Mon Sep 17 00:00:00 2001
From: hhwyt <hhwyt1@gmail.com>
Date: Mon, 17 Nov 2025 14:34:44 +0800
Subject: [PATCH 1/2] Fix data race in MemTable::GetBloomFilter() on ARM weak
 memory model

Use release-acquire memory ordering instead of relaxed to ensure
DynamicBloom object is fully initialized before pointer publication.

Problem:
- On ARM (weak memory model), using relaxed ordering allows the pointer
  to be published before DynamicBloom::data_ is initialized
- Readers may see non-null pointer but access uninitialized data_,
  causing crashes

Solution:
- Use memory_order_acquire for load and memory_order_release for store
- This establishes synchronizes-with relationship, guaranteeing readers
  only see fully initialized objects

Performance impact:
- The acquire load executes on every call (hot path in Get/Add operations)
- Overhead is minimal: typically 0-2 cycles on x86 (same as relaxed),
  ~1-3 cycles on ARM (may add memory barrier)
- This overhead is <1% of total latency compared to subsequent bloom
  filter operations (hash computation + memory access)

Signed-off-by: hhwyt <hhwyt1@gmail.com>
---
 db/memtable.h | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/db/memtable.h b/db/memtable.h
index b2df0df816d6..e024ab8820a2 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -719,7 +719,23 @@ class MemTable {
 
   inline DynamicBloom* GetBloomFilter() {
     if (needs_bloom_filter_) {
-      auto ptr = bloom_filter_ptr_.load(std::memory_order_relaxed);
+      // Uses release-acquire to prevent data race on ARM weak memory model.
+      // Without it: Thread1 may publish ptr before DynamicBloom::data_ is
+      // initialized, Thread2 may see non-null ptr but access uninitialized
+      // data_ (crash). With release-acquire: store(release) ensures all prior
+      // writes are visible, load(acquire) ensures we see them, guaranteeing
+      // fully initialized object.
+      // Performance: This load() executes on every read/write call (hot path).
+      // Acquire vs relaxed overhead: On x86, both compile to the same
+      // instruction (MOV), so cost is identical. On ARM, acquire may add a
+      // memory barrier (~1-3 cycles). In practice, since bloom_filter_ptr_ is
+      // written once (during initialization with release) and then only read,
+      // the cache line is typically in Shared state at runtime, meaning both
+      // acquire and relaxed loads read from local cache with similar
+      // performance. The acquire overhead is typically negligible compared to
+      // subsequent bloom filter operations (hash computation and memory
+      // access).
+      auto ptr = bloom_filter_ptr_.load(std::memory_order_acquire);
       if (UNLIKELY(ptr == nullptr)) {
         std::lock_guard<SpinMutex> guard(bloom_filter_mutex_);
         if (bloom_filter_ == nullptr) {
@@ -729,7 +745,7 @@ class MemTable {
                                moptions_.memtable_huge_page_size, logger_));
         }
         ptr = bloom_filter_.get();
-        bloom_filter_ptr_.store(ptr, std::memory_order_relaxed);
+        bloom_filter_ptr_.store(ptr, std::memory_order_release);
       }
       return ptr;
     }

From 7af9a505e0a1f2bbdb3509cb8ffea7fa6859efe4 Mon Sep 17 00:00:00 2001
From: hhwyt <hhwyt1@gmail.com>
Date: Mon, 17 Nov 2025 16:52:59 +0800
Subject: [PATCH 2/2] polish comment

Signed-off-by: hhwyt <hhwyt1@gmail.com>
---
 db/memtable.h | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/db/memtable.h b/db/memtable.h
index e024ab8820a2..33ba42168f23 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -719,12 +719,15 @@ class MemTable {
 
   inline DynamicBloom* GetBloomFilter() {
     if (needs_bloom_filter_) {
-      // Uses release-acquire to prevent data race on ARM weak memory model.
-      // Without it: Thread1 may publish ptr before DynamicBloom::data_ is
-      // initialized, Thread2 may see non-null ptr but access uninitialized
-      // data_ (crash). With release-acquire: store(release) ensures all prior
-      // writes are visible, load(acquire) ensures we see them, guaranteeing
-      // fully initialized object.
+      // Uses release-acquire to prevent data race on weak memory models (e.g.,
+      // ARM). Without it: Thread1 in DynamicBloom::new() first initializes
+      // data_, then stores bloom_filter_ptr_. With relaxed ordering, these two
+      // writes may reach another CPU's cache in reversed order, causing Thread2
+      // to see non-null bloom_filter_ptr_ but uninitialized data_ (crash). With
+      // release-acquire: store(release) ensures all prior writes (including
+      // data_ initialization) are visible before bloom_filter_ptr_ update,
+      // load(acquire) ensures we see them in order, guaranteeing fully
+      // initialized object.
       // Performance: This load() executes on every read/write call (hot path).
       // Acquire vs relaxed overhead: On x86, both compile to the same
       // instruction (MOV), so cost is identical. On ARM, acquire may add a