chore: change how we track memory_budget during evictions (#3457)

* chore: change how we track memory_budget during evictions

We used to compare memory_budget against 0 before inserting a new item into DbSlice,
and retired cool pages if we were low on memory.

The problem: when we decide whether to allow a table to grow, we estimate the potential increase in object memory due to the future table growth,
so the memory check described above was not consistent with the actual logic that rejects the insertion.
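
To make the mismatch concrete, here is a toy comparison. It is only a sketch: kSegBytes, bytes_per_object, free_slots and the margin are made-up numbers, not Dragonfly's actual values.

    #include <cstdint>
    #include <cstdio>

    int main() {
      int64_t memory_budget = 100 * 1024;  // 100KB left in the budget
      int64_t kSegBytes = 4096;            // assumed cost of one table segment
      int64_t bytes_per_object = 256;      // assumed average object footprint
      int64_t free_slots = 1000;           // unused capacity left in the table
      double growth_margin = 1.1;          // assumed safety margin

      // What the old pre-insert check effectively asked:
      bool naive_check = memory_budget > 0;

      // What the growth logic actually requires before expanding the table:
      bool grow_check = memory_budget >
                        kSegBytes + int64_t(bytes_per_object * free_slots * growth_margin);

      // Prints naive=1 grow=0: the pre-check passes, the real logic rejects.
      std::printf("naive=%d grow=%d\n", naive_check, grow_check);
      return 0;
    }

With these numbers the naive pre-check passes while the growth check fails, which is exactly the inconsistency described above.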

Moreover, the interaction between memory_budget tracking and PrimeEvictionPolicy was over-complicated: we passed the memory_budget counter into the evp object
and then read it back, even though evp did not track the memory impact of object deletions during evictions.

Now we remove from evp the responsibility of updating memory_budget_, so it is updated solely by DbSlice.
We also update memory_budget_ during deletions, and when we pass the budget to evp we add the cool memory size as a potential memory resource, to avoid
rejections when lots of cool memory is available.
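
As a rough illustration of the new ownership (a sketch only: EvictionPolicySketch, its constants and the toy numbers are invented here and do not mirror the real classes one-to-one):

    #include <cstdint>
    #include <cstdio>

    struct EvictionPolicySketch {
      // The policy no longer owns a budget counter; it only carries an offset
      // relative to the single source of truth, DbSlice::memory_budget_.
      int64_t mem_offset;

      bool CanGrow(int64_t memory_budget, int64_t table_free_items,
                   int64_t bytes_per_object, double margin) const {
        const int64_t kSegBytes = 4096;  // assumed segment size
        int64_t mem_available = memory_budget + mem_offset;
        return mem_available >
               kSegBytes + int64_t(bytes_per_object * table_free_items * margin);
      }
    };

    int main() {
      int64_t memory_budget = 64 * 1024;  // written only by "DbSlice"

      // Cool memory is reclaimable on demand, so it is exposed to the policy
      // as a positive offset instead of being folded into the budget itself.
      int64_t cool_memory = 512 * 1024;
      int64_t key_size = 16;
      EvictionPolicySketch evp{cool_memory - key_size};

      std::printf("can_grow=%d\n",
                  evp.CanGrow(memory_budget, /*table_free_items=*/1000,
                              /*bytes_per_object=*/256, /*margin=*/1.1));

      // Deletions now credit the budget directly in DbSlice instead of being
      // reconciled through the eviction policy after the fact.
      int64_t value_bytes = 512, key_bytes = 16;
      memory_budget += value_bytes + key_bytes;
      return 0;
    }

With lots of cool memory the offset keeps CanGrow permissive even when the raw budget alone would be too small, which matches the intent above: one writer for memory_budget_, and the policy reasoning over a read-only view shifted by mem_offset.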

Fixes #3456
---------

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
Roman Gershman 2024-08-07 15:43:45 +03:00 committed by GitHub
parent 41be819dbb
commit 7c84b8e524
4 changed files with 51 additions and 42 deletions


@@ -118,6 +118,7 @@ jobs:
df -h
echo "-----------------------------"
ninja src/all
- name: PostFail
if: failure()
run: |
@@ -155,7 +156,7 @@ jobs:
timeout 5m ./multi_test --multi_exec_mode=1
timeout 5m ./multi_test --multi_exec_mode=3
timeout 5m ./json_family_test --jsonpathv2=false
timeout 5m ./tiered_storage_test --vmodule=db_slice=2 --logtostderr
- name: Upload unit logs on failure
if: failure()
uses: actions/upload-artifact@v4


@@ -73,10 +73,12 @@ class PrimeEvictionPolicy {
static constexpr bool can_evict = true; // we implement eviction functionality.
static constexpr bool can_gc = true;
PrimeEvictionPolicy(const DbContext& cntx, bool can_evict, ssize_t mem_budget, ssize_t soft_limit,
// mem_offset - a memory offset that we should account for in addition to DbSlice::memory_budget.
// May be negative.
PrimeEvictionPolicy(const DbContext& cntx, bool can_evict, ssize_t mem_offset, ssize_t soft_limit,
DbSlice* db_slice, bool apply_memory_limit)
: db_slice_(db_slice),
mem_budget_(mem_budget),
mem_offset_(mem_offset),
soft_limit_(soft_limit),
cntx_(cntx),
can_evict_(can_evict),
@@ -85,7 +87,6 @@ class PrimeEvictionPolicy {
// A hook function that is called every time a segment is full and requires splitting.
void RecordSplit(PrimeTable::Segment_t* segment) {
mem_budget_ -= PrimeTable::kSegBytes;
DVLOG(2) << "split: " << segment->SlowSize() << "/" << segment->capacity();
}
@@ -94,10 +95,6 @@ class PrimeEvictionPolicy {
unsigned GarbageCollect(const PrimeTable::HotspotBuckets& eb, PrimeTable* me);
unsigned Evict(const PrimeTable::HotspotBuckets& eb, PrimeTable* me);
ssize_t mem_budget() const {
return mem_budget_;
}
unsigned evicted() const {
return evicted_;
}
@@ -108,7 +105,7 @@ class PrimeEvictionPolicy {
private:
DbSlice* db_slice_;
ssize_t mem_budget_;
ssize_t mem_offset_;
ssize_t soft_limit_ = 0;
const DbContext cntx_;
@@ -136,7 +133,8 @@ class PrimeBumpPolicy {
};
bool PrimeEvictionPolicy::CanGrow(const PrimeTable& tbl) const {
if (!apply_memory_limit_ || mem_budget_ > soft_limit_)
ssize_t mem_available = db_slice_->memory_budget() + mem_offset_;
if (!apply_memory_limit_ || mem_available > soft_limit_)
return true;
DCHECK_LE(tbl.size(), tbl.capacity());
@@ -145,11 +143,11 @@ bool PrimeEvictionPolicy::CanGrow(const PrimeTable& tbl) const {
// we estimate how much memory we will take with the current capacity
// even though we may currently use less memory.
// see https://github.com/dragonflydb/dragonfly/issues/256#issuecomment-1227095503
size_t new_available = (tbl.capacity() - tbl.size()) + PrimeTable::kSegCapacity;
bool res =
mem_budget_ > int64_t(PrimeTable::kSegBytes + db_slice_->bytes_per_object() * new_available *
GetFlag(FLAGS_table_growth_margin));
VLOG(2) << "available: " << new_available << ", res: " << res;
size_t table_free_items = (tbl.capacity() - tbl.size()) + PrimeTable::kSegCapacity;
size_t obj_bytes_estimation =
db_slice_->bytes_per_object() * table_free_items * GetFlag(FLAGS_table_growth_margin);
bool res = mem_available > int64_t(PrimeTable::kSegBytes + obj_bytes_estimation);
VLOG(2) << "available: " << table_free_items << ", res: " << res;
return res;
}
@@ -575,12 +573,21 @@ OpResult<DbSlice::AddOrFindResult> DbSlice::AddOrFindInternal(const Context& cnt
// It's a new entry.
CallChangeCallbacks(cntx.db_index, key, {key});
ssize_t memory_offset = -key.size();
// If we are low on memory due to cold storage, free some memory.
if (memory_budget_ < ssize_t(key.size()) && owner_->tiered_storage()) {
if (owner_->tiered_storage()) {
// At least 40KB to cover a potential segment split.
size_t goal = std::max<size_t>(key.size() * 2 - memory_budget_, 40_KB);
size_t reclaimed = owner_->tiered_storage()->ReclaimMemory(goal);
memory_budget_ += reclaimed;
ssize_t red_line = std::max<size_t>(key.size() * 2, 40_KB);
if (memory_budget_ < red_line) {
size_t goal = red_line - memory_budget_;
size_t reclaimed = owner_->tiered_storage()->ReclaimMemory(goal);
memory_budget_ += reclaimed;
}
// CoolMemoryUsage is the memory that we can always reclaim, like in the block above,
// therefore we include it for PrimeEvictionPolicy considerations.
memory_offset += owner_->tiered_storage()->CoolMemoryUsage();
}
// In case we are loading from rdb file or replicating we want to disable conservative memory
@@ -594,37 +601,36 @@ OpResult<DbSlice::AddOrFindResult> DbSlice::AddOrFindInternal(const Context& cnt
bool apply_memory_limit =
!owner_->IsReplica() && !(ServerState::tlocal()->gstate() == GlobalState::LOADING);
PrimeEvictionPolicy evp{cntx,
(bool(caching_mode_) && !owner_->IsReplica()),
int64_t(memory_budget_ - key.size()),
ssize_t(soft_budget_limit_),
this,
apply_memory_limit};
// If we are over the limit in a non-cache scenario, just be conservative and reject.
if (apply_memory_limit && !caching_mode_ && evp.mem_budget() < 0) {
VLOG(2) << "AddOrFind: over limit, budget: " << evp.mem_budget();
if (apply_memory_limit && !caching_mode_ && memory_budget_ + memory_offset < 0) {
VLOG(2) << "AddOrFind: over limit, budget: " << memory_budget_;
events_.insertion_rejections++;
return OpStatus::OUT_OF_MEMORY;
}
PrimeEvictionPolicy evp{cntx, (bool(caching_mode_) && !owner_->IsReplica()),
memory_offset, ssize_t(soft_budget_limit_),
this, apply_memory_limit};
// Fast-path if change_cb_ is empty so we Find or Add using
// the insert operation: twice as efficient.
CompactObj co_key{key};
PrimeIterator it;
size_t table_before = db.table_memory();
ssize_t table_before = db.prime.mem_usage();
try {
it = db.prime.InsertNew(std::move(co_key), PrimeValue{}, evp);
} catch (bad_alloc& e) {
VLOG(2) << "AddOrFind2: bad alloc exception, budget: " << evp.mem_budget();
VLOG(2) << "AddOrFind: bad alloc exception, budget: " << memory_budget_ + memory_offset;
events_.insertion_rejections++;
return OpStatus::OUT_OF_MEMORY;
}
size_t evicted_obj_bytes = 0;
if (evp.mem_budget() < 0 && apply_memory_limit) {
ssize_t table_increase = db.prime.mem_usage() - table_before;
memory_budget_ -= table_increase;
if (memory_budget_ < 0 && apply_memory_limit) {
// We may reach a state where our memory usage is above the limit even if we
// do not add new segments: for example, when segments are half full and we
// add new objects or update existing ones, our memory usage still grows.
@@ -636,12 +642,11 @@ OpResult<DbSlice::AddOrFindResult> DbSlice::AddOrFindInternal(const Context& cnt
#if 0
size_t evict_goal = std::max<size_t>(512, (-evp.mem_budget()) / 32);
auto [items, bytes] = FreeMemWithEvictionStep(cntx.db_index, it.segment_id(), evict_goal);
evicted_obj_bytes = bytes;
events_.hard_evictions += items;
#endif
}
table_memory_ += (db.table_memory() - table_before);
table_memory_ += table_increase;
entries_count_++;
db.stats.inline_keys += it->first.IsInline();
@@ -654,8 +659,6 @@ OpResult<DbSlice::AddOrFindResult> DbSlice::AddOrFindInternal(const Context& cnt
events_.stash_unloaded = db.prime.stash_unloaded();
events_.evicted_keys += evp.evicted();
events_.garbage_checked += evp.checked();
memory_budget_ = evp.mem_budget() + evicted_obj_bytes;
if (cluster::IsClusterEnabled()) {
cluster::SlotId sid = cluster::KeySlot(key);
db.slots_stats[sid].key_count += 1;
@@ -1470,9 +1473,9 @@ void DbSlice::PerformDeletion(Iterator del_it, ExpIterator exp_it, DbTable* tabl
shard_owner()->tiered_storage()->Delete(table->index, &del_it->second);
}
size_t value_heap_size = pv.MallocUsed();
ssize_t value_heap_size = pv.MallocUsed(), key_size_used = del_it->first.MallocUsed();
stats.inline_keys -= del_it->first.IsInline();
AccountObjectMemory(del_it.key(), del_it->first.ObjType(), -del_it->first.MallocUsed(),
AccountObjectMemory(del_it.key(), del_it->first.ObjType(), -key_size_used,
table); // Key
AccountObjectMemory(del_it.key(), pv.ObjType(), -value_heap_size, table); // Value
if (pv.ObjType() == OBJ_HASH && pv.Encoding() == kEncodingListPack) {
@@ -1487,8 +1490,14 @@ void DbSlice::PerformDeletion(Iterator del_it, ExpIterator exp_it, DbTable* tabl
}
table->prime.Erase(del_it.GetInnerIt());
table_memory_ += (table->table_memory() - table_before);
// Note, currently we do not shrink our tables upon deletion.
// This DCHECK ensures that if we decide to do so, we will have to update table_memory_
// accordingly.
DCHECK_EQ(table->table_memory(), table_before);
--entries_count_;
memory_budget_ += (value_heap_size + key_size_used);
SendInvalidationTrackingMessage(del_it.key());
}


@@ -575,7 +575,7 @@ class DbSlice {
bool expire_allowed_ = true;
uint64_t version_ = 1; // Used to version entries in the PrimeTable.
ssize_t memory_budget_ = SSIZE_MAX;
ssize_t memory_budget_ = SSIZE_MAX / 2;
size_t bytes_per_object_ = 0;
size_t soft_budget_limit_ = 0;
size_t table_memory_ = 0;


@@ -310,8 +310,7 @@ TEST_F(TieredStorageTest, MemoryPressure) {
resp = Run({"INFO", "ALL"});
ASSERT_FALSE(true) << i << "\nInfo ALL:\n" << resp.GetString();
}
// TODO: to remove it once used_mem_current is updated frequently.
ThisFiber::SleepFor(300us);
ThisFiber::SleepFor(100us);
}
EXPECT_LT(used_mem_peak.load(), 20_MB);