chore: transaction simplification (#2347)

chore: simplify transaction multi-locking

Also, add an analysis routine that determines whether a scheduled transaction is contended with other
transactions in a shard thread.

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
Roman Gershman 2023-12-31 17:02:12 +02:00 committed by GitHub
parent ddbdf63470
commit 1fb0a486ac
6 changed files with 87 additions and 131 deletions
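
Conceptually, the new analysis pass walks the shard's transaction queue and classifies each scheduled transaction as runnable or blocked on a contended lock. A minimal sketch of that shape; QueueStats, AnalyzeQueueSketch, and IsGlobal are illustrative stand-ins, not the Dragonfly API:

```cpp
// Sketch only: stand-in types, not the real engine types.
struct QueueStats {
  unsigned total = 0;     // transactions inspected
  unsigned global = 0;    // global transactions (lock the whole shard)
  unsigned runnable = 0;  // transactions with no contended locks
};

// Walk a queue of transaction pointers and classify each entry.
template <typename Queue, typename ContentionFn>
QueueStats AnalyzeQueueSketch(const Queue& txq, ContentionFn has_contended_locks) {
  QueueStats stats;
  for (const auto* trx : txq) {
    ++stats.total;
    if (trx->IsGlobal())
      ++stats.global;
    else if (!has_contended_locks(trx))
      ++stats.runnable;
  }
  return stats;
}
```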

@@ -941,11 +941,6 @@ bool DbSlice::Acquire(IntentLock::Mode mode, const KeyLockArgs& lock_args) {
return lock_acquired;
}
void DbSlice::Release(IntentLock::Mode mode, DbIndex db_index, std::string_view key,
unsigned count) {
return ReleaseNormalized(mode, db_index, KeyLockArgs::GetLockKey(key), count);
}
void DbSlice::ReleaseNormalized(IntentLock::Mode mode, DbIndex db_index, std::string_view key,
unsigned count) {
DCHECK_EQ(key, KeyLockArgs::GetLockKey(key));
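
The new Release overload funnels through ReleaseNormalized so that key normalization happens in exactly one place before the lock table is touched. The diff does not show what KeyLockArgs::GetLockKey does; as an assumption, a hashtag-style normalization could look like the sketch below (GetLockKeySketch is a hypothetical stand-in):

```cpp
#include <string_view>

// Hypothetical normalization sketch: if the key carries a {hashtag}, lock on
// the tag alone so keys in one slot share a lock entry; otherwise lock on the
// whole key. The real KeyLockArgs::GetLockKey may behave differently.
std::string_view GetLockKeySketch(std::string_view key) {
  auto open = key.find('{');
  if (open == std::string_view::npos)
    return key;
  auto close = key.find('}', open + 1);
  if (close == std::string_view::npos || close == open + 1)
    return key;  // no tag or empty tag: lock on the full key
  return key.substr(open + 1, close - open - 1);
}
```

Concentrating normalization in the public entry point lets ReleaseNormalized assert its precondition with the DCHECK_EQ above instead of re-normalizing.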

@@ -283,8 +283,6 @@ class DbSlice {
void Release(IntentLock::Mode m, const KeyLockArgs& lock_args);
void Release(IntentLock::Mode m, DbIndex db_index, std::string_view key, unsigned count);
// Returns true if the key can be locked under m. Does not lock.
bool CheckLock(IntentLock::Mode m, DbIndex dbid, std::string_view key) const;
@@ -391,14 +389,14 @@ class DbSlice {
// Delete a key referred by its iterator.
void PerformDeletion(PrimeIterator del_it, EngineShard* shard, DbTable* table);
private:
void PreUpdate(DbIndex db_ind, PrimeIterator it);
void PostUpdate(DbIndex db_ind, PrimeIterator it, std::string_view key, size_t orig_size);
// Releases a single key. `key` must have been normalized by GetLockKey().
void ReleaseNormalized(IntentLock::Mode m, DbIndex db_index, std::string_view key,
unsigned count);
private:
void PreUpdate(DbIndex db_ind, PrimeIterator it);
void PostUpdate(DbIndex db_ind, PrimeIterator it, std::string_view key, size_t orig_size);
AddOrFindResult AddOrUpdateInternal(const Context& cntx, std::string_view key, PrimeValue obj,
uint64_t expire_at_ms, bool force_update) noexcept(false);

@@ -158,6 +158,34 @@ class RoundRobinSharder {
static Mutex mutex_;
};
bool HasContendedLocks(unsigned shard_id, Transaction* trx, DbTable* table) {
bool has_contended_locks = false;
if (trx->IsMulti()) {
trx->IterateMultiLocks(shard_id, [&](const string& key) {
auto it = table->trans_locks.find(key);
DCHECK(it != table->trans_locks.end());
if (it->second.IsContended()) {
has_contended_locks = true;
}
});
} else {
KeyLockArgs lock_args = trx->GetLockArgs(shard_id);
for (size_t i = 0; i < lock_args.args.size(); i += lock_args.key_step) {
string_view s = KeyLockArgs::GetLockKey(lock_args.args[i]);
auto it = table->trans_locks.find(s);
DCHECK(it != table->trans_locks.end());
if (it != table->trans_locks.end()) {
if (it->second.IsContended()) {
has_contended_locks = true;
break;
}
}
}
}
return has_contended_locks;
}
thread_local string RoundRobinSharder::round_robin_prefix_;
thread_local vector<ShardId> RoundRobinSharder::round_robin_shards_tl_cache_;
vector<ShardId> RoundRobinSharder::round_robin_shards_;
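
HasContendedLocks reduces both the multi and the non-multi path to the same probe: look up each lock key in table->trans_locks and ask IntentLock::IsContended(). The implementation of IsContended() is outside this diff; a stand-in with the semantics the routine needs ("another transaction is queued behind the current holder") might look like:

```cpp
// Hypothetical stand-in for IntentLock, not the real class: a lock is
// "contended" when at least one transaction waits behind the current holder.
struct IntentLockSketch {
  unsigned holders = 0;  // acquisitions currently granted
  unsigned waiters = 0;  // transactions blocked on this key
  bool IsContended() const { return waiters > 0; }
};
```
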
@@ -533,44 +561,6 @@ void EngineShard::RemoveContTx(Transaction* tx) {
}
}
#if 0
// There are several cases that contain proof of convergence for this shard:
// 1. txq_ empty - it means that anything scheduled from now on will be scheduled
// with txid > notifyid.
// 2. committed_txid_ > notifyid - similarly, this shard can not affect the result with timestamp
// notifyid.
// 3. committed_txid_ == notifyid: if a transaction is in progress (continuation_trans_ != NULL),
// then it can still affect the result, hence we require that continuation_trans_ is null,
// which will point to converged result @notifyid. However, we never awake a transaction
// when there is a multi-hop transaction in progress to avoid false positives.
// Therefore, continuation_trans_ must always be null when calling this function.
// 4. Finally, with committed_txid_ < notifyid,
// we can check whether the next in line (HeadScore) is after notifyid; in that case we can also
// conclude regarding the result convergence for this shard.
//
bool EngineShard::HasResultConverged(TxId notifyid) const {
CHECK(continuation_trans_ == nullptr);
if (committed_txid_ >= notifyid)
return true;
// This could happen if a single lpush (not in transaction) woke multi-shard blpop.
DVLOG(1) << "HasResultConverged: cmtxid - " << committed_txid_ << " vs " << notifyid;
// We must check for txq head - it's not an optimization - we need it for correctness.
// If a multi-transaction has been scheduled, has no presence in this shard
// (no actual keys), and we do not check for it, HasResultConverged will
// return false. The blocked transaction will wait for this shard to progress and
// will also block other shards from progressing (where it has been notified).
// If this multi-transaction has presence in those shards, it won't progress there as well.
// Therefore, we will get a deadlock. By checking txid of the head we will avoid this situation:
// if the head.txid is after notifyid then this shard obviously converged.
// if the head.txid <= notifyid that transaction will be able to progress in other shards.
// and we must wait for it to finish.
return txq_.Empty() || txq_.HeadScore() > notifyid;
}
#endif
void EngineShard::Heartbeat() {
CacheStats();
@@ -736,22 +726,7 @@ auto EngineShard::AnalyzeTxQueue() -> TxQueueInfo {
info.tx_global++;
} else {
DbTable* table = db_slice().GetDBTable(trx->GetDbIndex());
bool can_run = true;
if (!trx->IsMulti()) {
KeyLockArgs lock_args = trx->GetLockArgs(sid);
for (size_t i = 0; i < lock_args.args.size(); i += lock_args.key_step) {
string_view s = KeyLockArgs::GetLockKey(lock_args.args[i]);
auto it = table->trans_locks.find(s);
DCHECK(it != table->trans_locks.end());
if (it != table->trans_locks.end()) {
if (it->second.IsContended()) {
can_run = false;
break;
}
}
}
}
bool can_run = !HasContendedLocks(sid, trx, table);
if (can_run) {
info.tx_runnable++;
}
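
AnalyzeTxQueue produces a per-shard snapshot, so a caller wanting a process-wide view has to fold the shard results together. A hypothetical aggregation over pre-collected snapshots; tx_global and tx_runnable mirror the counters updated above, while the collection step across shard threads is assumed:

```cpp
#include <vector>

// Subset of the counters seen in the hunk above.
struct TxQueueInfo {
  unsigned tx_global = 0, tx_runnable = 0;
};

// Fold per-shard snapshots into one process-wide view.
TxQueueInfo Aggregate(const std::vector<TxQueueInfo>& per_shard) {
  TxQueueInfo total;
  for (const auto& info : per_shard) {
    total.tx_global += info.tx_global;
    total.tx_runnable += info.tx_runnable;
  }
  return total;
}
```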

@@ -904,7 +904,8 @@ TEST_F(MultiTest, TestLockedKeys) {
EXPECT_EQ(Run({"multi"}), "OK");
EXPECT_EQ(Run({"set", "key1", "val1"}), "QUEUED");
EXPECT_EQ(Run({"set", "key2", "val2"}), "QUEUED");
EXPECT_THAT(Run({"exec"}), RespArray(ElementsAre("OK", "OK")));
EXPECT_EQ(Run({"mset", "key1", "val3", "key1", "val4"}), "QUEUED");
EXPECT_THAT(Run({"exec"}), RespArray(ElementsAre("OK", "OK", "OK")));
fb.Join();
EXPECT_FALSE(service_->IsLocked(0, "key1"));
EXPECT_FALSE(service_->IsLocked(0, "key2"));
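
The new MSET line queues the same key twice inside one transaction. With lock bookkeeping moving from per-key counts to a set of normalized keys (see the transaction.cc and transaction.h hunks below), duplicate keys must be idempotent to record and release, which is exactly what a set gives for free. A small self-contained illustration:

```cpp
#include <string>

#include "absl/container/flat_hash_set.h"

int main() {
  absl::flat_hash_set<std::string> locks;
  // Recording key1 twice (as "mset key1 val3 key1 val4" does above) yields a
  // single entry, so one release per distinct key fully unlocks the multi.
  locks.emplace("key1");
  locks.emplace("key1");
  return locks.size() == 1 ? 0 : 1;  // exits 0: duplicates collapse
}
```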

@@ -90,7 +90,7 @@ void Transaction::InitGlobal() {
EnableAllShards();
}
void Transaction::BuildShardIndex(KeyIndex key_index, bool rev_mapping,
void Transaction::BuildShardIndex(const KeyIndex& key_index, bool rev_mapping,
std::vector<PerShardCache>* out) {
auto args = full_args_;
@@ -157,38 +157,23 @@ void Transaction::InitShardData(absl::Span<const PerShardCache> shard_index, siz
CHECK_EQ(args_.size(), num_args);
}
void Transaction::InitMultiData(KeyIndex key_index) {
void Transaction::RecordMultiLocks(const KeyIndex& key_index) {
DCHECK(multi_);
DCHECK(!multi_->lock_mode);
if (multi_->mode == NON_ATOMIC)
return;
IntentLock::Mode mode = Mode();
auto& tmp_uniques = tmp_space.uniq_keys;
auto lock_key = [this, mode, &tmp_uniques](string_view key) {
if (auto [_, inserted] = tmp_uniques.insert(KeyLockArgs::GetLockKey(key)); !inserted)
return;
multi_->lock_counts[key][mode]++;
};
// With EVAL, we call this function for EVAL itself as well as for each command
// inside it. Currently, we lock everything only during the EVAL call.
if (!multi_->locks_recorded) {
tmp_uniques.clear();
auto lock_key = [this](string_view key) { multi_->locks.emplace(KeyLockArgs::GetLockKey(key)); };
multi_->lock_mode.emplace(Mode());
for (size_t i = key_index.start; i < key_index.end; i += key_index.step)
lock_key(ArgS(full_args_, i));
if (key_index.bonus)
lock_key(ArgS(full_args_, *key_index.bonus));
multi_->locks_recorded = true;
}
DCHECK(IsAtomicMulti());
DCHECK(multi_->mode == GLOBAL || !multi_->lock_counts.empty());
DCHECK(multi_->mode == GLOBAL || !multi_->locks.empty());
}
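
Note how the optional lock_mode replaces the locks_recorded bool: an engaged optional simultaneously says "locks were recorded" and "in this mode". A sketch of the idea in isolation; MultiDataSketch and Mode are stand-ins for the real MultiData and IntentLock::Mode:

```cpp
#include <optional>

enum class Mode { SHARED, EXCLUSIVE };  // stand-in for IntentLock::Mode

struct MultiDataSketch {
  std::optional<Mode> lock_mode;
  // Engaged means the keys for this multi transaction were already recorded,
  // which is what the removed `locks_recorded` flag used to track.
  bool LocksRecorded() const { return lock_mode.has_value(); }
};
```

This is why InitByKeys below guards the call with `multi_ && !multi_->lock_mode`, and why StartMultiGlobal can mark recording done by simply assigning IntentLock::EXCLUSIVE.
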
void Transaction::StoreKeysInArgs(KeyIndex key_index, bool rev_mapping) {
@@ -230,15 +215,13 @@ void Transaction::StoreKeysInArgs(KeyIndex key_index, bool rev_mapping) {
*
**/
void Transaction::InitByKeys(KeyIndex key_index) {
auto args = full_args_;
if (key_index.start == args.size()) { // eval with 0 keys.
void Transaction::InitByKeys(const KeyIndex& key_index) {
if (key_index.start == full_args_.size()) { // eval with 0 keys.
CHECK(absl::StartsWith(cid_->name(), "EVAL")) << cid_->name();
return;
}
DCHECK_LT(key_index.start, args.size());
DCHECK_LT(key_index.start, full_args_.size());
bool needs_reverse_mapping = cid_->opt_mask() & CO::REVERSE_MAPPING;
@@ -265,7 +248,7 @@ void Transaction::InitByKeys(KeyIndex key_index) {
shard_data_.resize(shard_set->size()); // shard_data isn't sparse, so we must allocate for all :(
DCHECK(key_index.step == 1 || key_index.step == 2);
DCHECK(key_index.step != 2 || (args.size() % 2) == 0);
DCHECK(key_index.step != 2 || (full_args_.size() % 2) == 0);
// Safe, because flow below is not preemptive.
auto& shard_index = tmp_space.GetShardIndex(shard_data_.size());
@@ -276,8 +259,8 @@ void Transaction::InitByKeys(KeyIndex key_index) {
// Initialize shard data based on distributed arguments.
InitShardData(shard_index, key_index.num_args(), needs_reverse_mapping);
if (multi_)
InitMultiData(key_index);
if (multi_ && !multi_->lock_mode)
RecordMultiLocks(key_index);
DVLOG(1) << "InitByArgs " << DebugId() << " " << args_.front();
@@ -298,7 +281,7 @@ void Transaction::InitByKeys(KeyIndex key_index) {
// Validation. Check reverse mapping was built correctly.
if (needs_reverse_mapping) {
for (size_t i = 0; i < args_.size(); ++i) {
DCHECK_EQ(args_[i], ArgS(args, reverse_index_[i])) << args;
DCHECK_EQ(args_[i], ArgS(full_args_, reverse_index_[i])) << full_args_;
}
}
@@ -373,7 +356,7 @@ void Transaction::StartMultiGlobal(DbIndex dbid) {
multi_->mode = GLOBAL;
InitBase(dbid, {});
InitGlobal();
multi_->locks_recorded = true;
multi_->lock_mode = IntentLock::EXCLUSIVE;
ScheduleInternal();
}
@@ -782,10 +765,10 @@ void Transaction::UnlockMulti() {
return;
auto sharded_keys = make_shared<vector<KeyList>>(shard_set->size());
while (!multi_->lock_counts.empty()) {
auto entry = multi_->lock_counts.extract(multi_->lock_counts.begin());
ShardId sid = Shard(entry.key(), sharded_keys->size());
(*sharded_keys)[sid].emplace_back(std::move(entry.key()), entry.mapped());
while (!multi_->locks.empty()) {
auto entry = multi_->locks.extract(multi_->locks.begin());
ShardId sid = Shard(entry.value(), sharded_keys->size());
(*sharded_keys)[sid].emplace_back(std::move(entry.value()));
}
unsigned shard_journals_cnt =
@@ -796,8 +779,8 @@ void Transaction::UnlockMulti() {
use_count_.fetch_add(shard_data_.size(), std::memory_order_relaxed);
for (ShardId i = 0; i < shard_data_.size(); ++i) {
shard_set->Add(i, [this, sharded_keys, shard_journals_cnt]() {
this->UnlockMultiShardCb(*sharded_keys, EngineShard::tlocal(), shard_journals_cnt);
shard_set->Add(i, [this, sharded_keys, i, shard_journals_cnt]() {
this->UnlockMultiShardCb((*sharded_keys)[i], EngineShard::tlocal(), shard_journals_cnt);
intrusive_ptr_release(this);
});
}
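
UnlockMulti drains the flat lock set once and buckets each key by its owning shard before dispatching per-shard unlock callbacks. A self-contained sketch of that partitioning step, with a plain std::hash router standing in for Dragonfly's Shard(key, shard_num):

```cpp
#include <functional>
#include <string>
#include <vector>

#include "absl/container/flat_hash_set.h"

using KeyList = std::vector<std::string>;

// Drain the lock set and bucket each key by its owning shard. The hash-based
// router here is a stand-in; the real code calls Shard(key, shard_count).
std::vector<KeyList> PartitionKeys(absl::flat_hash_set<std::string> locks,
                                   size_t shard_count) {
  std::vector<KeyList> sharded(shard_count);
  while (!locks.empty()) {
    auto entry = locks.extract(locks.begin());
    size_t sid = std::hash<std::string>{}(entry.value()) % shard_count;
    sharded[sid].emplace_back(std::move(entry.value()));
  }
  return sharded;
}
```
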
@@ -857,7 +840,7 @@ void Transaction::ExecuteAsync() {
DCHECK_GT(unique_shard_cnt_, 0u);
DCHECK_GT(use_count_.load(memory_order_relaxed), 0u);
DCHECK(!IsAtomicMulti() || multi_->locks_recorded);
DCHECK(!IsAtomicMulti() || multi_->lock_mode.has_value());
// We do not necessarily Execute this transaction in 'cb' below. It may well be that it will be
// executed by the engine shard once it has been armed and the coordinator thread will finish the
@@ -941,6 +924,16 @@ void Transaction::Refurbish() {
cb_ptr_ = nullptr;
}
void Transaction::IterateMultiLocks(ShardId sid, std::function<void(const std::string&)> cb) const {
unsigned shard_num = shard_set->size();
for (const auto& key : multi_->locks) {
ShardId key_sid = Shard(key, shard_num);
if (key_sid == sid) {
cb(key);
}
}
}
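
IterateMultiLocks filters the single flat set by shard on demand instead of keeping per-shard key lists. Each call scans all recorded locks, so an analysis across S shards costs O(S * |locks|); presumably acceptable for a diagnostics path, and it keeps the transaction hot path free of extra bookkeeping. A self-contained restatement, with ShardOfSketch standing in for the real Shard():

```cpp
#include <functional>
#include <string>

#include "absl/container/flat_hash_set.h"

using ShardId = unsigned;

// Stand-in router; the real code uses Shard(key, shard_set->size()).
ShardId ShardOfSketch(const std::string& key, ShardId shard_num) {
  return std::hash<std::string>{}(key) % shard_num;
}

// Visit only the keys owned by shard `sid`.
void ForEachShardKey(const absl::flat_hash_set<std::string>& locks, ShardId sid,
                     ShardId shard_num,
                     const std::function<void(const std::string&)>& cb) {
  for (const auto& key : locks) {
    if (ShardOfSketch(key, shard_num) == sid)
      cb(key);
  }
}
```
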
void Transaction::EnableShard(ShardId sid) {
unique_shard_cnt_ = 1;
unique_shard_id_ = sid;
@@ -1290,8 +1283,10 @@ OpStatus Transaction::RunSquashedMultiCb(RunnableType cb) {
return status;
}
void Transaction::UnlockMultiShardCb(const std::vector<KeyList>& sharded_keys, EngineShard* shard,
void Transaction::UnlockMultiShardCb(const KeyList& sharded_keys, EngineShard* shard,
uint32_t shard_journals_cnt) {
DCHECK(multi_ && multi_->lock_mode);
auto journal = shard->journal();
if (journal != nullptr && multi_->shard_journal_write[shard->shard_id()]) {
@@ -1301,20 +1296,13 @@ void Transaction::UnlockMultiShardCb(const std::vector<KeyList>& sharded_keys, E
if (multi_->mode == GLOBAL) {
shard->shard_lock()->Release(IntentLock::EXCLUSIVE);
} else {
for (const auto& key : sharded_keys) {
shard->db_slice().ReleaseNormalized(*multi_->lock_mode, db_index_, key, 1);
}
}
ShardId sid = shard->shard_id();
for (const auto& k_v : sharded_keys[sid]) {
auto release = [&](IntentLock::Mode mode) {
if (k_v.second[mode]) {
shard->db_slice().Release(mode, db_index_, k_v.first, k_v.second[mode]);
}
};
release(IntentLock::SHARED);
release(IntentLock::EXCLUSIVE);
}
}
auto& sd = shard_data_[SidToId(shard->shard_id())];
auto& sd = shard_data_[SidToId(sid)];
sd.local_mask |= UNLOCK_MULTI;
// It is not necessarily the case that all shards in a multi transaction execute this tx.

@@ -325,6 +325,8 @@ class Transaction {
void Refurbish();
void IterateMultiLocks(ShardId sid, std::function<void(const std::string&)> cb) const;
private:
// Holds the number of locks for each IntentLock::Mode: shared and exclusive.
struct LockCnt {
@@ -341,7 +343,7 @@
};
// Owned std::string because the callbacks it is used in run fully async and can outlive the entries.
using KeyList = std::vector<std::pair<std::string, LockCnt>>;
using KeyList = std::vector<std::string>;
struct alignas(64) PerShardData {
PerShardData(PerShardData&&) noexcept {
@@ -377,15 +379,14 @@
MultiRole role;
MultiMode mode;
absl::flat_hash_map<std::string, LockCnt> lock_counts;
std::optional<IntentLock::Mode> lock_mode;
absl::flat_hash_set<std::string> locks;
// The shard_journal_write vector is used to determine the number of shards
// involved in a multi-command transaction. This information is utilized by replicas when
// executing the multi-command transaction. For every write to a shard journal, the
// corresponding index in the vector is marked as true.
absl::InlinedVector<bool, 4> shard_journal_write;
bool locks_recorded = false;
};
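
Per the comment above, shard_journal_write holds one flag per shard, flipped on the first journal write; the number of shards involved in the multi transaction is then just the count of true entries. A sketch of that counting step (CountShardJournals is illustrative; the real counterpart is the shard_journals_cnt computed in UnlockMulti):

```cpp
#include <algorithm>

#include "absl/container/inlined_vector.h"

// Count how many shards received at least one journal write.
unsigned CountShardJournals(const absl::InlinedVector<bool, 4>& shard_journal_write) {
  return static_cast<unsigned>(
      std::count(shard_journal_write.begin(), shard_journal_write.end(), true));
}
```
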
enum CoordinatorState : uint8_t {
@@ -416,20 +417,20 @@
void InitGlobal();
// Init with a set of keys.
void InitByKeys(KeyIndex keys);
void InitByKeys(const KeyIndex& keys);
void EnableShard(ShardId sid);
void EnableAllShards();
// Build shard index by distributing the arguments by shards based on the key index.
void BuildShardIndex(KeyIndex keys, bool rev_mapping, std::vector<PerShardCache>* out);
void BuildShardIndex(const KeyIndex& keys, bool rev_mapping, std::vector<PerShardCache>* out);
// Init shard data from shard index.
void InitShardData(absl::Span<const PerShardCache> shard_index, size_t num_args,
bool rev_mapping);
// Init multi. Record locks if needed.
void InitMultiData(KeyIndex keys);
void RecordMultiLocks(const KeyIndex& keys);
// Store all key index keys in args_. Used only for single shard initialization.
void StoreKeysInArgs(KeyIndex keys, bool rev_mapping);
@@ -467,7 +468,7 @@
// Run callback inline as part of multi stub.
OpStatus RunSquashedMultiCb(RunnableType cb);
void UnlockMultiShardCb(const std::vector<KeyList>& sharded_keys, EngineShard* shard,
void UnlockMultiShardCb(const KeyList& sharded_keys, EngineShard* shard,
uint32_t shard_journals_cnt);
// In a multi-command transaction, we determine the number of shard journals that we wrote entries
@@ -585,8 +586,6 @@
private:
struct TLTmpSpace {
absl::flat_hash_set<std::string_view> uniq_keys;
std::vector<PerShardCache>& GetShardIndex(unsigned size);
private: