chore: transaction simplification (#2347)

chore: simplify transaction multi-locking

Also, add the ananlysis routine that determines whether the schewduled transaction is contended with other transaction in a
shard thread.

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
Roman Gershman 2023-12-31 17:02:12 +02:00 committed by GitHub
parent ddbdf63470
commit 1fb0a486ac
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 87 additions and 131 deletions

View file

@ -941,11 +941,6 @@ bool DbSlice::Acquire(IntentLock::Mode mode, const KeyLockArgs& lock_args) {
return lock_acquired; return lock_acquired;
} }
void DbSlice::Release(IntentLock::Mode mode, DbIndex db_index, std::string_view key,
unsigned count) {
return ReleaseNormalized(mode, db_index, KeyLockArgs::GetLockKey(key), count);
}
void DbSlice::ReleaseNormalized(IntentLock::Mode mode, DbIndex db_index, std::string_view key, void DbSlice::ReleaseNormalized(IntentLock::Mode mode, DbIndex db_index, std::string_view key,
unsigned count) { unsigned count) {
DCHECK_EQ(key, KeyLockArgs::GetLockKey(key)); DCHECK_EQ(key, KeyLockArgs::GetLockKey(key));

View file

@ -283,8 +283,6 @@ class DbSlice {
void Release(IntentLock::Mode m, const KeyLockArgs& lock_args); void Release(IntentLock::Mode m, const KeyLockArgs& lock_args);
void Release(IntentLock::Mode m, DbIndex db_index, std::string_view key, unsigned count);
// Returns true if the key can be locked under m. Does not lock. // Returns true if the key can be locked under m. Does not lock.
bool CheckLock(IntentLock::Mode m, DbIndex dbid, std::string_view key) const; bool CheckLock(IntentLock::Mode m, DbIndex dbid, std::string_view key) const;
@ -391,14 +389,14 @@ class DbSlice {
// Delete a key referred by its iterator. // Delete a key referred by its iterator.
void PerformDeletion(PrimeIterator del_it, EngineShard* shard, DbTable* table); void PerformDeletion(PrimeIterator del_it, EngineShard* shard, DbTable* table);
private:
void PreUpdate(DbIndex db_ind, PrimeIterator it);
void PostUpdate(DbIndex db_ind, PrimeIterator it, std::string_view key, size_t orig_size);
// Releases a single key. `key` must have been normalized by GetLockKey(). // Releases a single key. `key` must have been normalized by GetLockKey().
void ReleaseNormalized(IntentLock::Mode m, DbIndex db_index, std::string_view key, void ReleaseNormalized(IntentLock::Mode m, DbIndex db_index, std::string_view key,
unsigned count); unsigned count);
private:
void PreUpdate(DbIndex db_ind, PrimeIterator it);
void PostUpdate(DbIndex db_ind, PrimeIterator it, std::string_view key, size_t orig_size);
AddOrFindResult AddOrUpdateInternal(const Context& cntx, std::string_view key, PrimeValue obj, AddOrFindResult AddOrUpdateInternal(const Context& cntx, std::string_view key, PrimeValue obj,
uint64_t expire_at_ms, bool force_update) noexcept(false); uint64_t expire_at_ms, bool force_update) noexcept(false);

View file

@ -158,6 +158,34 @@ class RoundRobinSharder {
static Mutex mutex_; static Mutex mutex_;
}; };
bool HasContendedLocks(unsigned shard_id, Transaction* trx, DbTable* table) {
bool has_contended_locks = false;
if (trx->IsMulti()) {
trx->IterateMultiLocks(shard_id, [&](const string& key) {
auto it = table->trans_locks.find(key);
DCHECK(it != table->trans_locks.end());
if (it->second.IsContended()) {
has_contended_locks = true;
}
});
} else {
KeyLockArgs lock_args = trx->GetLockArgs(shard_id);
for (size_t i = 0; i < lock_args.args.size(); i += lock_args.key_step) {
string_view s = KeyLockArgs::GetLockKey(lock_args.args[i]);
auto it = table->trans_locks.find(s);
DCHECK(it != table->trans_locks.end());
if (it != table->trans_locks.end()) {
if (it->second.IsContended()) {
has_contended_locks = true;
break;
}
}
}
}
return has_contended_locks;
}
thread_local string RoundRobinSharder::round_robin_prefix_; thread_local string RoundRobinSharder::round_robin_prefix_;
thread_local vector<ShardId> RoundRobinSharder::round_robin_shards_tl_cache_; thread_local vector<ShardId> RoundRobinSharder::round_robin_shards_tl_cache_;
vector<ShardId> RoundRobinSharder::round_robin_shards_; vector<ShardId> RoundRobinSharder::round_robin_shards_;
@ -533,44 +561,6 @@ void EngineShard::RemoveContTx(Transaction* tx) {
} }
} }
#if 0
// There are several cases that contain proof of convergence for this shard:
// 1. txq_ empty - it means that anything that is goonna be scheduled will already be scheduled
// with txid > notifyid.
// 2. committed_txid_ > notifyid - similarly, this shard can not affect the result with timestamp
// notifyid.
// 3. committed_txid_ == notifyid, then if a transaction in progress (continuation_trans_ != NULL)
// the this transaction can still affect the result, hence we require continuation_trans_ is null
// which will point to converged result @notifyid. However, we never awake a transaction
// when there is a multi-hop transaction in progress to avoid false positives.
// Therefore, continuation_trans_ must always be null when calling this function.
// 4. Finally with committed_txid_ < notifyid.
// we can check if the next in line (HeadScore) is after notifyid in that case we can also
// conclude regarding the result convergence for this shard.
//
bool EngineShard::HasResultConverged(TxId notifyid) const {
CHECK(continuation_trans_ == nullptr);
if (committed_txid_ >= notifyid)
return true;
// This could happen if a single lpush (not in transaction) woke multi-shard blpop.
DVLOG(1) << "HasResultConverged: cmtxid - " << committed_txid_ << " vs " << notifyid;
// We must check for txq head - it's not an optimization - we need it for correctness.
// If a multi-transaction has been scheduled and it does not have any presence in
// this shard (no actual keys) and we won't check for it HasResultConverged will
// return false. The blocked transaction will wait for this shard to progress and
// will also block other shards from progressing (where it has been notified).
// If this multi-transaction has presence in those shards, it won't progress there as well.
// Therefore, we will get a deadlock. By checking txid of the head we will avoid this situation:
// if the head.txid is after notifyid then this shard obviously converged.
// if the head.txid <= notifyid that transaction will be able to progress in other shards.
// and we must wait for it to finish.
return txq_.Empty() || txq_.HeadScore() > notifyid;
}
#endif
void EngineShard::Heartbeat() { void EngineShard::Heartbeat() {
CacheStats(); CacheStats();
@ -736,22 +726,7 @@ auto EngineShard::AnalyzeTxQueue() -> TxQueueInfo {
info.tx_global++; info.tx_global++;
} else { } else {
DbTable* table = db_slice().GetDBTable(trx->GetDbIndex()); DbTable* table = db_slice().GetDBTable(trx->GetDbIndex());
bool can_run = true; bool can_run = !HasContendedLocks(sid, trx, table);
if (!trx->IsMulti()) {
KeyLockArgs lock_args = trx->GetLockArgs(sid);
for (size_t i = 0; i < lock_args.args.size(); i += lock_args.key_step) {
string_view s = KeyLockArgs::GetLockKey(lock_args.args[i]);
auto it = table->trans_locks.find(s);
DCHECK(it != table->trans_locks.end());
if (it != table->trans_locks.end()) {
if (it->second.IsContended()) {
can_run = false;
break;
}
}
}
}
if (can_run) { if (can_run) {
info.tx_runnable++; info.tx_runnable++;
} }

View file

@ -904,7 +904,8 @@ TEST_F(MultiTest, TestLockedKeys) {
EXPECT_EQ(Run({"multi"}), "OK"); EXPECT_EQ(Run({"multi"}), "OK");
EXPECT_EQ(Run({"set", "key1", "val1"}), "QUEUED"); EXPECT_EQ(Run({"set", "key1", "val1"}), "QUEUED");
EXPECT_EQ(Run({"set", "key2", "val2"}), "QUEUED"); EXPECT_EQ(Run({"set", "key2", "val2"}), "QUEUED");
EXPECT_THAT(Run({"exec"}), RespArray(ElementsAre("OK", "OK"))); EXPECT_EQ(Run({"mset", "key1", "val3", "key1", "val4"}), "QUEUED");
EXPECT_THAT(Run({"exec"}), RespArray(ElementsAre("OK", "OK", "OK")));
fb.Join(); fb.Join();
EXPECT_FALSE(service_->IsLocked(0, "key1")); EXPECT_FALSE(service_->IsLocked(0, "key1"));
EXPECT_FALSE(service_->IsLocked(0, "key2")); EXPECT_FALSE(service_->IsLocked(0, "key2"));

View file

@ -90,7 +90,7 @@ void Transaction::InitGlobal() {
EnableAllShards(); EnableAllShards();
} }
void Transaction::BuildShardIndex(KeyIndex key_index, bool rev_mapping, void Transaction::BuildShardIndex(const KeyIndex& key_index, bool rev_mapping,
std::vector<PerShardCache>* out) { std::vector<PerShardCache>* out) {
auto args = full_args_; auto args = full_args_;
@ -157,38 +157,23 @@ void Transaction::InitShardData(absl::Span<const PerShardCache> shard_index, siz
CHECK_EQ(args_.size(), num_args); CHECK_EQ(args_.size(), num_args);
} }
void Transaction::InitMultiData(KeyIndex key_index) { void Transaction::RecordMultiLocks(const KeyIndex& key_index) {
DCHECK(multi_); DCHECK(multi_);
DCHECK(!multi_->lock_mode);
if (multi_->mode == NON_ATOMIC) if (multi_->mode == NON_ATOMIC)
return; return;
IntentLock::Mode mode = Mode(); auto lock_key = [this](string_view key) { multi_->locks.emplace(KeyLockArgs::GetLockKey(key)); };
auto& tmp_uniques = tmp_space.uniq_keys; multi_->lock_mode.emplace(Mode());
for (size_t i = key_index.start; i < key_index.end; i += key_index.step)
auto lock_key = [this, mode, &tmp_uniques](string_view key) { lock_key(ArgS(full_args_, i));
if (auto [_, inserted] = tmp_uniques.insert(KeyLockArgs::GetLockKey(key)); !inserted) if (key_index.bonus)
return; lock_key(ArgS(full_args_, *key_index.bonus));
multi_->lock_counts[key][mode]++;
};
// With EVAL, we call this function for EVAL itself as well as for each command
// for eval. currently, we lock everything only during the eval call.
if (!multi_->locks_recorded) {
tmp_uniques.clear();
for (size_t i = key_index.start; i < key_index.end; i += key_index.step)
lock_key(ArgS(full_args_, i));
if (key_index.bonus)
lock_key(ArgS(full_args_, *key_index.bonus));
multi_->locks_recorded = true;
}
DCHECK(IsAtomicMulti()); DCHECK(IsAtomicMulti());
DCHECK(multi_->mode == GLOBAL || !multi_->lock_counts.empty()); DCHECK(multi_->mode == GLOBAL || !multi_->locks.empty());
} }
void Transaction::StoreKeysInArgs(KeyIndex key_index, bool rev_mapping) { void Transaction::StoreKeysInArgs(KeyIndex key_index, bool rev_mapping) {
@ -230,15 +215,13 @@ void Transaction::StoreKeysInArgs(KeyIndex key_index, bool rev_mapping) {
* *
**/ **/
void Transaction::InitByKeys(KeyIndex key_index) { void Transaction::InitByKeys(const KeyIndex& key_index) {
auto args = full_args_; if (key_index.start == full_args_.size()) { // eval with 0 keys.
if (key_index.start == args.size()) { // eval with 0 keys.
CHECK(absl::StartsWith(cid_->name(), "EVAL")) << cid_->name(); CHECK(absl::StartsWith(cid_->name(), "EVAL")) << cid_->name();
return; return;
} }
DCHECK_LT(key_index.start, args.size()); DCHECK_LT(key_index.start, full_args_.size());
bool needs_reverse_mapping = cid_->opt_mask() & CO::REVERSE_MAPPING; bool needs_reverse_mapping = cid_->opt_mask() & CO::REVERSE_MAPPING;
@ -265,7 +248,7 @@ void Transaction::InitByKeys(KeyIndex key_index) {
shard_data_.resize(shard_set->size()); // shard_data isn't sparse, so we must allocate for all :( shard_data_.resize(shard_set->size()); // shard_data isn't sparse, so we must allocate for all :(
DCHECK(key_index.step == 1 || key_index.step == 2); DCHECK(key_index.step == 1 || key_index.step == 2);
DCHECK(key_index.step != 2 || (args.size() % 2) == 0); DCHECK(key_index.step != 2 || (full_args_.size() % 2) == 0);
// Safe, because flow below is not preemptive. // Safe, because flow below is not preemptive.
auto& shard_index = tmp_space.GetShardIndex(shard_data_.size()); auto& shard_index = tmp_space.GetShardIndex(shard_data_.size());
@ -276,8 +259,8 @@ void Transaction::InitByKeys(KeyIndex key_index) {
// Initialize shard data based on distributed arguments. // Initialize shard data based on distributed arguments.
InitShardData(shard_index, key_index.num_args(), needs_reverse_mapping); InitShardData(shard_index, key_index.num_args(), needs_reverse_mapping);
if (multi_) if (multi_ && !multi_->lock_mode)
InitMultiData(key_index); RecordMultiLocks(key_index);
DVLOG(1) << "InitByArgs " << DebugId() << " " << args_.front(); DVLOG(1) << "InitByArgs " << DebugId() << " " << args_.front();
@ -298,7 +281,7 @@ void Transaction::InitByKeys(KeyIndex key_index) {
// Validation. Check reverse mapping was built correctly. // Validation. Check reverse mapping was built correctly.
if (needs_reverse_mapping) { if (needs_reverse_mapping) {
for (size_t i = 0; i < args_.size(); ++i) { for (size_t i = 0; i < args_.size(); ++i) {
DCHECK_EQ(args_[i], ArgS(args, reverse_index_[i])) << args; DCHECK_EQ(args_[i], ArgS(full_args_, reverse_index_[i])) << full_args_;
} }
} }
@ -373,7 +356,7 @@ void Transaction::StartMultiGlobal(DbIndex dbid) {
multi_->mode = GLOBAL; multi_->mode = GLOBAL;
InitBase(dbid, {}); InitBase(dbid, {});
InitGlobal(); InitGlobal();
multi_->locks_recorded = true; multi_->lock_mode = IntentLock::EXCLUSIVE;
ScheduleInternal(); ScheduleInternal();
} }
@ -519,7 +502,7 @@ bool Transaction::RunInShard(EngineShard* shard, bool txq_ooo) {
// is concluding. // is concluding.
bool remove_txq = tx_stop_runnig || !txq_ooo; bool remove_txq = tx_stop_runnig || !txq_ooo;
if (remove_txq && sd.pq_pos != TxQueue::kEnd) { if (remove_txq && sd.pq_pos != TxQueue::kEnd) {
VLOG(2) << "Remove from txq" << this->DebugId(); VLOG(2) << "Remove from txq " << this->DebugId();
shard->txq()->Remove(sd.pq_pos); shard->txq()->Remove(sd.pq_pos);
sd.pq_pos = TxQueue::kEnd; sd.pq_pos = TxQueue::kEnd;
} }
@ -782,10 +765,10 @@ void Transaction::UnlockMulti() {
return; return;
auto sharded_keys = make_shared<vector<KeyList>>(shard_set->size()); auto sharded_keys = make_shared<vector<KeyList>>(shard_set->size());
while (!multi_->lock_counts.empty()) { while (!multi_->locks.empty()) {
auto entry = multi_->lock_counts.extract(multi_->lock_counts.begin()); auto entry = multi_->locks.extract(multi_->locks.begin());
ShardId sid = Shard(entry.key(), sharded_keys->size()); ShardId sid = Shard(entry.value(), sharded_keys->size());
(*sharded_keys)[sid].emplace_back(std::move(entry.key()), entry.mapped()); (*sharded_keys)[sid].emplace_back(std::move(entry.value()));
} }
unsigned shard_journals_cnt = unsigned shard_journals_cnt =
@ -796,8 +779,8 @@ void Transaction::UnlockMulti() {
use_count_.fetch_add(shard_data_.size(), std::memory_order_relaxed); use_count_.fetch_add(shard_data_.size(), std::memory_order_relaxed);
for (ShardId i = 0; i < shard_data_.size(); ++i) { for (ShardId i = 0; i < shard_data_.size(); ++i) {
shard_set->Add(i, [this, sharded_keys, shard_journals_cnt]() { shard_set->Add(i, [this, sharded_keys, i, shard_journals_cnt]() {
this->UnlockMultiShardCb(*sharded_keys, EngineShard::tlocal(), shard_journals_cnt); this->UnlockMultiShardCb((*sharded_keys)[i], EngineShard::tlocal(), shard_journals_cnt);
intrusive_ptr_release(this); intrusive_ptr_release(this);
}); });
} }
@ -857,7 +840,7 @@ void Transaction::ExecuteAsync() {
DCHECK_GT(unique_shard_cnt_, 0u); DCHECK_GT(unique_shard_cnt_, 0u);
DCHECK_GT(use_count_.load(memory_order_relaxed), 0u); DCHECK_GT(use_count_.load(memory_order_relaxed), 0u);
DCHECK(!IsAtomicMulti() || multi_->locks_recorded); DCHECK(!IsAtomicMulti() || multi_->lock_mode.has_value());
// We do not necessarily Execute this transaction in 'cb' below. It well may be that it will be // We do not necessarily Execute this transaction in 'cb' below. It well may be that it will be
// executed by the engine shard once it has been armed and coordinator thread will finish the // executed by the engine shard once it has been armed and coordinator thread will finish the
@ -941,6 +924,16 @@ void Transaction::Refurbish() {
cb_ptr_ = nullptr; cb_ptr_ = nullptr;
} }
void Transaction::IterateMultiLocks(ShardId sid, std::function<void(const std::string&)> cb) const {
unsigned shard_num = shard_set->size();
for (const auto& key : multi_->locks) {
ShardId key_sid = Shard(key, shard_num);
if (key_sid == sid) {
cb(key);
}
}
}
void Transaction::EnableShard(ShardId sid) { void Transaction::EnableShard(ShardId sid) {
unique_shard_cnt_ = 1; unique_shard_cnt_ = 1;
unique_shard_id_ = sid; unique_shard_id_ = sid;
@ -1290,8 +1283,10 @@ OpStatus Transaction::RunSquashedMultiCb(RunnableType cb) {
return status; return status;
} }
void Transaction::UnlockMultiShardCb(const std::vector<KeyList>& sharded_keys, EngineShard* shard, void Transaction::UnlockMultiShardCb(const KeyList& sharded_keys, EngineShard* shard,
uint32_t shard_journals_cnt) { uint32_t shard_journals_cnt) {
DCHECK(multi_ && multi_->lock_mode);
auto journal = shard->journal(); auto journal = shard->journal();
if (journal != nullptr && multi_->shard_journal_write[shard->shard_id()]) { if (journal != nullptr && multi_->shard_journal_write[shard->shard_id()]) {
@ -1301,20 +1296,13 @@ void Transaction::UnlockMultiShardCb(const std::vector<KeyList>& sharded_keys, E
if (multi_->mode == GLOBAL) { if (multi_->mode == GLOBAL) {
shard->shard_lock()->Release(IntentLock::EXCLUSIVE); shard->shard_lock()->Release(IntentLock::EXCLUSIVE);
} else { } else {
ShardId sid = shard->shard_id(); for (const auto& key : sharded_keys) {
for (const auto& k_v : sharded_keys[sid]) { shard->db_slice().ReleaseNormalized(*multi_->lock_mode, db_index_, key, 1);
auto release = [&](IntentLock::Mode mode) {
if (k_v.second[mode]) {
shard->db_slice().Release(mode, db_index_, k_v.first, k_v.second[mode]);
}
};
release(IntentLock::SHARED);
release(IntentLock::EXCLUSIVE);
} }
} }
auto& sd = shard_data_[SidToId(shard->shard_id())]; ShardId sid = shard->shard_id();
auto& sd = shard_data_[SidToId(sid)];
sd.local_mask |= UNLOCK_MULTI; sd.local_mask |= UNLOCK_MULTI;
// It does not have to be that all shards in multi transaction execute this tx. // It does not have to be that all shards in multi transaction execute this tx.

View file

@ -325,6 +325,8 @@ class Transaction {
void Refurbish(); void Refurbish();
void IterateMultiLocks(ShardId sid, std::function<void(const std::string&)> cb) const;
private: private:
// Holds number of locks for each IntentLock::Mode: shared and exlusive. // Holds number of locks for each IntentLock::Mode: shared and exlusive.
struct LockCnt { struct LockCnt {
@ -341,7 +343,7 @@ class Transaction {
}; };
// owned std::string because callbacks its used in run fully async and can outlive the entries. // owned std::string because callbacks its used in run fully async and can outlive the entries.
using KeyList = std::vector<std::pair<std::string, LockCnt>>; using KeyList = std::vector<std::string>;
struct alignas(64) PerShardData { struct alignas(64) PerShardData {
PerShardData(PerShardData&&) noexcept { PerShardData(PerShardData&&) noexcept {
@ -377,15 +379,14 @@ class Transaction {
MultiRole role; MultiRole role;
MultiMode mode; MultiMode mode;
absl::flat_hash_map<std::string, LockCnt> lock_counts; std::optional<IntentLock::Mode> lock_mode;
absl::flat_hash_set<std::string> locks;
// The shard_journal_write vector variable is used to determine the number of shards // The shard_journal_write vector variable is used to determine the number of shards
// involved in a multi-command transaction. This information is utilized by replicas when // involved in a multi-command transaction. This information is utilized by replicas when
// executing multi-command. For every write to a shard journal, the corresponding index in the // executing multi-command. For every write to a shard journal, the corresponding index in the
// vector is marked as true. // vector is marked as true.
absl::InlinedVector<bool, 4> shard_journal_write; absl::InlinedVector<bool, 4> shard_journal_write;
bool locks_recorded = false;
}; };
enum CoordinatorState : uint8_t { enum CoordinatorState : uint8_t {
@ -416,20 +417,20 @@ class Transaction {
void InitGlobal(); void InitGlobal();
// Init with a set of keys. // Init with a set of keys.
void InitByKeys(KeyIndex keys); void InitByKeys(const KeyIndex& keys);
void EnableShard(ShardId sid); void EnableShard(ShardId sid);
void EnableAllShards(); void EnableAllShards();
// Build shard index by distributing the arguments by shards based on the key index. // Build shard index by distributing the arguments by shards based on the key index.
void BuildShardIndex(KeyIndex keys, bool rev_mapping, std::vector<PerShardCache>* out); void BuildShardIndex(const KeyIndex& keys, bool rev_mapping, std::vector<PerShardCache>* out);
// Init shard data from shard index. // Init shard data from shard index.
void InitShardData(absl::Span<const PerShardCache> shard_index, size_t num_args, void InitShardData(absl::Span<const PerShardCache> shard_index, size_t num_args,
bool rev_mapping); bool rev_mapping);
// Init multi. Record locks if needed. // Init multi. Record locks if needed.
void InitMultiData(KeyIndex keys); void RecordMultiLocks(const KeyIndex& keys);
// Store all key index keys in args_. Used only for single shard initialization. // Store all key index keys in args_. Used only for single shard initialization.
void StoreKeysInArgs(KeyIndex keys, bool rev_mapping); void StoreKeysInArgs(KeyIndex keys, bool rev_mapping);
@ -467,7 +468,7 @@ class Transaction {
// Run callback inline as part of multi stub. // Run callback inline as part of multi stub.
OpStatus RunSquashedMultiCb(RunnableType cb); OpStatus RunSquashedMultiCb(RunnableType cb);
void UnlockMultiShardCb(const std::vector<KeyList>& sharded_keys, EngineShard* shard, void UnlockMultiShardCb(const KeyList& sharded_keys, EngineShard* shard,
uint32_t shard_journals_cnt); uint32_t shard_journals_cnt);
// In a multi-command transaction, we determine the number of shard journals that we wrote entries // In a multi-command transaction, we determine the number of shard journals that we wrote entries
@ -585,8 +586,6 @@ class Transaction {
private: private:
struct TLTmpSpace { struct TLTmpSpace {
absl::flat_hash_set<std::string_view> uniq_keys;
std::vector<PerShardCache>& GetShardIndex(unsigned size); std::vector<PerShardCache>& GetShardIndex(unsigned size);
private: private: