dragonfly/src/server/transaction.h
// Copyright 2022, DragonflyDB authors. All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once
#include <absl/container/flat_hash_map.h>
#include <absl/container/flat_hash_set.h>
#include <absl/container/inlined_vector.h>
#include <absl/functional/function_ref.h>
#include <string_view>
#include <variant>
#include <vector>
#include "core/intent_lock.h"
#include "core/tx_queue.h"
#include "facade/op_status.h"
#include "server/common.h"
#include "server/journal/types.h"
#include "server/table.h"
namespace dfly {
class EngineShard;
class BlockingController;
using facade::OpResult;
using facade::OpStatus;
// Central building block of the transactional framework.
//
// Use it to run callbacks on the shard threads - such dispatches are called hops.
// The shards to run on are determined by the keys of the underlying command.
// Global transactions run on all shards.
//
// Use ScheduleSingleHop() if only a single hop is needed.
// Otherwise, schedule the transaction with Schedule() and run successive hops
// with Execute().
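//
// A rough usage sketch (callback names here are hypothetical, not actual code):
//
// ```
// // single hop, e.g. GET:
// OpStatus st = trans->ScheduleSingleHop(get_cb);
//
// // multiple hops, e.g. RENAME:
// trans->Schedule();
// trans->Execute(fetch_cb, false); // hop 1: e.g. read the source key
// trans->Execute(store_cb, true); // hop 2: e.g. write the destination key, conclude
// ```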
//
// 1. Multi transactions
//
// Multi transactions are handled by a single transaction, which exposes the same interface for
// commands as regular transactions, but internally avoids rescheduling. There are multiple modes in
// which a multi-transaction can run; those are documented in the MultiMode enum.
//
// The flow of EXEC and EVAL is as follows:
//
// ```
// trans->StartMulti_MultiMode_() // e.g. StartMultiGlobal / StartMultiLockedAhead / StartMultiNonAtomic
// for ([cmd, args]) {
// trans->MultiSwitchCmd(cmd) // 1. Set new command
// trans->InitByArgs(args) // 2. Re-initialize with arguments
// cmd->Invoke(trans) // 3. Run
// }
// trans->UnlockMulti()
// ```
//
// 2. Multi squashing
//
// An important optimization for multi transactions is executing multiple single shard commands in
// parallel. Because multiple commands are "squashed" into a single hop, it's called multi squashing.
// To mock the interface for commands, special "stub" transactions are created for each shard that
// directly execute hop callbacks without any scheduling. Transaction roles are represented by the
// MultiRole enum. See MultiCommandSquasher for the detailed squashing approach.
//
// The flow is as follows:
//
// ```
// for (cmd in single_shard_sequence)
// sharded[shard].push_back(cmd)
//
// tx->PrepareSquashedMultiHop()
// tx->ScheduleSingleHop({
// Transaction stub_tx {tx}
// for (cmd)
// // use stub_tx as regular multi tx, see 1. above
// })
//
// ```
class Transaction {
friend class BlockingController;
Transaction(const Transaction&);
void operator=(const Transaction&) = delete;
~Transaction(); // Transactions are reference counted with intrusive_ptr.
friend void intrusive_ptr_add_ref(Transaction* trans) noexcept {
trans->use_count_.fetch_add(1, std::memory_order_relaxed);
}
friend void intrusive_ptr_release(Transaction* trans) noexcept {
if (1 == trans->use_count_.fetch_sub(1, std::memory_order_release)) {
std::atomic_thread_fence(std::memory_order_acquire);
delete trans;
}
}
public:
using time_point = ::std::chrono::steady_clock::time_point;
// Runnable that is run on shards during hop executions (often named callback).
using RunnableType = absl::FunctionRef<OpStatus(Transaction* t, EngineShard*)>;
// Provides keys to block on for specific shard.
using WaitKeysProvider = std::function<ArgSlice(Transaction*, EngineShard* shard)>;
// Modes in which a multi transaction can run.
enum MultiMode {
// Invalid state.
NOT_DETERMINED = 0,
// Global transaction.
GLOBAL = 1,
// Keys are locked ahead during Schedule.
LOCK_AHEAD = 2,
// Each command is executed separately. Equivalent to a pipeline.
NON_ATOMIC = 3,
};
// Squashed parallel execution requires a separate transaction for each shard. Those "stubs"
// perform no scheduling or real hops, but instead execute the handlers directly inline.
enum MultiRole {
DEFAULT = 0, // Regular multi transaction
SQUASHER = 1, // Owner of stub transactions
SQUASHED_STUB = 2, // Stub transaction
};
// State on specific shard.
enum LocalMask : uint16_t {
ACTIVE = 1, // Set on all active shards.
// UNUSED = 1 << 1,
OUT_OF_ORDER = 1 << 2, // Whether it can run out of order
KEYLOCK_ACQUIRED = 1 << 3, // Whether its key locks are acquired
SUSPENDED_Q = 1 << 4, // Whether it is suspended (by WatchInShard())
AWAKED_Q = 1 << 5, // Whether it was awakened (by NotifySuspended())
EXPIRED_Q = 1 << 6, // Whether it timed out and should be dropped
UNLOCK_MULTI = 1 << 7, // Whether this shard executed UnlockMultiShardCb
};
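// For example, a shard-side check of these flags might look like (sketch):
//
// ```
// if (t->GetLocalMask(shard->shard_id()) & (SUSPENDED_Q | AWAKED_Q)) {
//   // parked in the blocking queue or just woken up
// }
// ```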
public:
explicit Transaction(const CommandId* cid);
explicit Transaction(const Transaction* parent);
// Initialize from command (args) on specific db.
OpStatus InitByArgs(DbIndex index, CmdArgList args);
// Get command arguments for specific shard. Called from shard thread.
ArgSlice GetShardArgs(ShardId sid) const;
// Map arg_index from GetShardArgs slice to index in original command slice from InitByArgs.
size_t ReverseArgIndex(ShardId shard_id, size_t arg_index) const;
// Schedule transaction.
// Usually used for multi hop transactions like RENAME or BLPOP.
// For single hop transactions use ScheduleSingleHop instead.
void Schedule();
// Execute transaction hop. If conclude is true, it is removed from the pending queue.
void Execute(RunnableType cb, bool conclude);
// Execute single hop and conclude.
// Callback should return OK for multi-key invocations; otherwise the return value is ill-defined.
OpStatus ScheduleSingleHop(RunnableType cb);
// Execute single hop with return value and conclude.
// Can be used only for single key invocations, because it writes into a shared variable.
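//
// A hedged usage sketch (OpGet here is a hypothetical shard-local helper returning
// OpResult<std::string>):
//
// ```
// OpResult<std::string> res = trans->ScheduleSingleHopT(
//     [&](Transaction* t, EngineShard* shard) {
//       return OpGet(t->GetOpArgs(shard), key); // runs on the single active shard
//     });
// ```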
template <typename F> auto ScheduleSingleHopT(F&& f) -> decltype(f(this, nullptr));
// Conclude transaction
void Conclude();
// Called by the engine shard to execute a transaction hop.
// txq_ooo is set to true if the transaction is running out of order,
// i.e. not as the tx queue head.
// Returns true if the transaction continues running in the thread.
bool RunInShard(EngineShard* shard, bool txq_ooo);
// Registers the transaction in the watched queue and blocks until either a) a notification is
// received, or b) tp is reached. If tp is time_point::max(), it waits indefinitely.
// Expects that the transaction has been scheduled before, and uses Execute(.., true) to register.
// Returns false if a timeout occurred, true if it was notified by one of the keys.
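//
// A rough sketch of the blocking flow (callback names are hypothetical):
//
// ```
// trans->Schedule();
// trans->Execute(try_pop_cb, false); // first attempt; tx stays scheduled
// auto wcb = [](Transaction* t, EngineShard* shard) {
//   return t->GetShardArgs(shard->shard_id()); // keys to block on in this shard
// };
// if (!trans->WaitOnWatch(deadline, wcb))
//   return; // timed out
// // awakened: run a final hop to consume the value
// ```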
bool WaitOnWatch(const time_point& tp, WaitKeysProvider cb);
// Returns true if the transaction was awakened, false if it timed out and can be removed from the
// blocking queue.
bool NotifySuspended(TxId committed_ts, ShardId sid, std::string_view key);
// Cancel all blocking watches. Set COORD_CANCELLED.
// Must be called from coordinator thread.
void CancelBlocking();
// In some cases, for non-auto-journaling commands, we want to enable the auto-journal flow.
void RenableAutoJournal() {
renabled_auto_journal_.store(true, std::memory_order_relaxed);
}
// Prepare a squashed hop on given shards.
// Only compatible with multi modes that acquire all locks ahead - global and lock_ahead.
void PrepareSquashedMultiHop(const CommandId* cid, absl::FunctionRef<bool(ShardId)> enabled);
// Start multi in GLOBAL mode.
void StartMultiGlobal(DbIndex dbid);
// Start multi in LOCK_AHEAD mode with given keys.
void StartMultiLockedAhead(DbIndex dbid, CmdArgList keys);
// Start multi in NON_ATOMIC mode.
void StartMultiNonAtomic();
// Report which shards had write commands that executed on stub transactions
// and thus did not mark themselves in MultiData::shard_journal_write.
void ReportWritesSquashedMulti(absl::FunctionRef<bool(ShardId)> had_write);
// Unlock key locks of a multi transaction.
void UnlockMulti();
// Set new command for multi transaction.
void MultiSwitchCmd(const CommandId* cid);
// Returns locking arguments needed for DbSlice to Acquire/Release transactional locks.
// Runs in the shard thread.
KeyLockArgs GetLockArgs(ShardId sid) const;
//! Returns true if the transaction spans this shard_id.
//! Runs from the coordinator thread.
bool IsActive(ShardId shard_id) const {
return unique_shard_cnt_ == 1 ? unique_shard_id_ == shard_id
: shard_data_[shard_id].arg_count > 0;
}
//! Returns true if the transaction is armed for execution on this sid (used to avoid
//! duplicate runs). Supports local transactions under multi as well.
//! Can be used in contexts that wait for an event to happen.
bool IsArmedInShard(ShardId sid) const {
// For multi transactions shard_data_ spans all shards.
if (sid >= shard_data_.size())
sid = 0;
// We use acquire so that no reordering will move before this load.
return run_count_.load(std::memory_order_acquire) > 0 &&
shard_data_[sid].is_armed.load(std::memory_order_relaxed);
}
// Called from engine shard threads.
uint16_t GetLocalMask(ShardId sid) const {
return shard_data_[SidToId(sid)].local_mask;
}
uint32_t GetLocalTxqPos(ShardId sid) const {
return shard_data_[SidToId(sid)].pq_pos;
}
TxId txid() const {
return txid_;
}
IntentLock::Mode Mode() const; // Based on command mask
std::string_view Name() const; // Based on command name
uint32_t GetUniqueShardCnt() const {
return unique_shard_cnt_;
}
// This method is meaningless if GetUniqueShardCnt() != 1.
ShardId GetUniqueShard() const;
bool IsMulti() const {
return bool(multi_);
}
MultiMode GetMultiMode() const {
return multi_->mode;
}
bool IsGlobal() const;
bool IsOOO() const {
return coordinator_state_ & COORD_OOO;
}
// If blocking tx was woken up on this shard, get wake key.
std::optional<std::string_view> GetWakeKey(ShardId sid) const;
OpArgs GetOpArgs(EngineShard* shard) const {
return OpArgs{shard, this, GetDbContext()};
}
DbContext GetDbContext() const {
return DbContext{.db_index = db_index_, .time_now_ms = time_now_ms_};
}
DbIndex GetDbIndex() const {
return db_index_;
}
const CommandId* GetCId() const {
return cid_;
}
std::string DebugId() const;
// Write a journal entry to a shard journal with the given payload. When logging a non-automatic
// journal command, multiple journal entries may be necessary. In this case, call with
// multi_commands set to true and call the FinishLogJournalOnShard function after logging the
// final entry.
void LogJournalOnShard(EngineShard* shard, journal::Entry::Payload&& payload, uint32_t shard_cnt,
bool multi_commands, bool allow_await) const;
void FinishLogJournalOnShard(EngineShard* shard, uint32_t shard_cnt) const;
private:
// Holds the number of locks for each IntentLock::Mode: shared and exclusive.
struct LockCnt {
unsigned& operator[](IntentLock::Mode mode) {
return cnt[int(mode)];
}
unsigned operator[](IntentLock::Mode mode) const {
return cnt[int(mode)];
}
private:
unsigned cnt[2] = {0, 0};
};
// Holds owned std::strings because the callbacks it is used in run fully async and can outlive
// the entries.
using KeyList = std::vector<std::pair<std::string, LockCnt>>;
struct PerShardData {
PerShardData(PerShardData&&) noexcept {
}
PerShardData() = default;
// this is the only variable that is accessed by both shard and coordinator threads.
std::atomic_bool is_armed{false};
// We pad with some memory so that atomic loads won't cause false sharing between threads.
char pad[46]; // to make sure PerShardData is 64 bytes and takes full cacheline.
uint32_t arg_start = 0; // Indices into args_ array.
uint32_t arg_count = 0;
// Needed to rollback inconsistent schedulings or remove OOO transactions from
// tx queue.
uint32_t pq_pos = TxQueue::kEnd;
// Accessed within shard thread.
// Bitmask of LocalMask enums.
uint16_t local_mask = 0;
// Index (relative to the shard's args) of the key that woke up the shard after a blocking wait.
uint16_t wake_key_pos = UINT16_MAX;
};
static_assert(sizeof(PerShardData) == 64); // cacheline
// State of a multi transaction.
struct MultiData {
MultiRole role;
MultiMode mode;
absl::flat_hash_map<std::string, LockCnt> lock_counts;
// The shard_journal_write vector variable is used to determine the number of shards
// involved in a multi-command transaction. This information is utilized by replicas when
// executing a multi-command transaction. For every write to a shard journal, the corresponding
// index in the vector is marked as true.
absl::InlinedVector<bool, 4> shard_journal_write;
bool locks_recorded = false;
};
enum CoordinatorState : uint8_t {
COORD_SCHED = 1,
COORD_EXEC = 2,
COORD_EXEC_CONCLUDING = 1 << 2, // Whether it's the last hop of a transaction
COORD_BLOCKED = 1 << 3,
COORD_CANCELLED = 1 << 4,
COORD_OOO = 1 << 5,
};
struct PerShardCache {
std::vector<std::string_view> args;
std::vector<uint32_t> original_index;
void Clear() {
args.clear();
original_index.clear();
}
};
private:
// Init basic fields and reset re-usable state.
void InitBase(DbIndex dbid, CmdArgList args);
// Init as a global transaction.
void InitGlobal();
// Init when the command has no keys but still needs to use the transaction framework.
void InitNoKey();
// Init with a set of keys.
void InitByKeys(KeyIndex keys);
// Build shard index by distributing the arguments by shards based on the key index.
void BuildShardIndex(KeyIndex keys, bool rev_mapping, std::vector<PerShardCache>* out);
// Init shard data from shard index.
void InitShardData(absl::Span<const PerShardCache> shard_index, size_t num_args,
bool rev_mapping);
// Init multi. Record locks if needed.
void InitMultiData(KeyIndex keys);
// Store all key index keys in args_. Used only for single shard initialization.
void StoreKeysInArgs(KeyIndex keys, bool rev_mapping);
// Generic schedule used from Schedule() and ScheduleSingleHop() on slow path.
void ScheduleInternal();
// Schedule if only one shard is active.
// Returns true if transaction ran out-of-order during the scheduling phase.
bool ScheduleUniqueShard(EngineShard* shard);
// Schedule on shards transaction queue.
// Returns pair(schedule_success, lock_granted)
// schedule_success is true if transaction was scheduled on db_slice.
// lock_granted is true if lock was granted for all the keys on this shard.
// Runs in the shard thread.
std::pair<bool, bool> ScheduleInShard(EngineShard* shard);
// Optimized version of RunInShard for single shard uncontended cases.
void RunQuickie(EngineShard* shard);
void ExecuteAsync();
// Adds itself to watched queue in the shard. Must run in that shard thread.
OpStatus WatchInShard(ArgSlice keys, EngineShard* shard);
// Expire blocking transaction, unlock keys and unregister it from the blocking controller
void ExpireBlocking(WaitKeysProvider wcb);
void ExpireShardCb(ArgSlice wkeys, EngineShard* shard);
// Returns true if we need to follow up with PollExecution on this shard.
bool CancelShardCb(EngineShard* shard);
// Run callback inline as part of multi stub.
OpStatus RunSquashedMultiCb(RunnableType cb);
void UnlockMultiShardCb(const std::vector<KeyList>& sharded_keys, EngineShard* shard,
uint32_t shard_journals_cnt);
// In a multi-command transaction, we determine the number of shard journals that we wrote entries
// to by updating the shard_journal_write vector during command execution. The total number of
// shard journals written to can be found by summing the true values in the vector. This value is
// then written to each shard journal with the journal EXEC op, enabling replication to
// synchronize the multi-shard transaction.
uint32_t CalcMultiNumOfShardJournals() const;
void WaitForShardCallbacks() {
run_ec_.await([this] { return 0 == run_count_.load(std::memory_order_relaxed); });
// No reads after this fence will be reordered before it; and if a store operation is sequenced
// before some release operation that happened-before the fence in another thread, that store
// will be visible after the fence.
// In this specific case we synchronize with DecreaseRunCnt that releases run_count_.
// See #997 before changing it.
std::atomic_thread_fence(std::memory_order_acquire);
seqlock_.fetch_add(1, std::memory_order_relaxed);
}
// Log command in shard's journal, if this is a write command with auto-journaling enabled.
// Should be called immediately after the last phase (hop).
void LogAutoJournalOnShard(EngineShard* shard);
// Returns the previous value of run count.
uint32_t DecreaseRunCnt();
uint32_t GetUseCount() const {
return use_count_.load(std::memory_order_relaxed);
}
// Whether the transaction is multi and runs in an atomic mode.
// This, instead of just IsMulti(), should be used to check for the possibility of
// different optimizations, because they can safely be applied to non-atomic multi
// transactions as well.
bool IsAtomicMulti() const {
return multi_ && multi_->mode != NON_ATOMIC;
}
bool IsActiveMulti() const {
return multi_ && multi_->role != SQUASHED_STUB;
}
unsigned SidToId(ShardId sid) const {
return sid < shard_data_.size() ? sid : 0;
}
// Iterate over shards and run function accepting (PerShardData&, ShardId) on all active ones.
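//
// A usage sketch: collect the ids of all active shards.
//
// ```
// std::vector<ShardId> ids;
// IterateActiveShards([&](PerShardData&, ShardId sid) { ids.push_back(sid); });
// ```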
template <typename F> void IterateActiveShards(F&& f) {
if (!global_ && unique_shard_cnt_ == 1) { // unique_shard_id_ is set only for non-global.
auto i = unique_shard_id_;
f(shard_data_[SidToId(i)], i);
} else {
for (ShardId i = 0; i < shard_data_.size(); ++i) {
if (auto& sd = shard_data_[i]; global_ || (sd.local_mask & ACTIVE)) {
f(sd, i);
}
}
}
}
private:
// shard_data spans all the shards in ess_.
// I wish we could use a dense array of size [0..uniq_shards] but since
// multiple threads access this array to synchronize between themselves using
// PerShardData.state, it can be tricky. The complication comes from multi_ transactions where
// scheduled transaction is accessed between operations as well.
// Stores per-shard data.
// For non-multi transactions, it can be of size one in case only one shard is active
// (unique_shard_cnt_ = 1).
// Never access directly with index, always use SidToId.
absl::InlinedVector<PerShardData, 4> shard_data_; // length = shard_count
// Stores arguments of the transaction (i.e. keys + values) partitioned by shards.
absl::InlinedVector<std::string_view, 4> args_;
// Stores the full undivided command.
CmdArgList full_args_;
// True if a NO_AUTOJOURNAL command asked to enable auto journaling.
std::atomic<bool> renabled_auto_journal_ = false;
// Reverse argument mapping for ReverseArgIndex to convert from shard index to original index.
std::vector<uint32_t> reverse_index_;
RunnableType* cb_ptr_ = nullptr; // Run on shard threads
const CommandId* cid_; // Underlying command
std::unique_ptr<MultiData> multi_; // Initialized when the transaction is multi/exec.
TxId txid_{0};
bool global_{false};
DbIndex db_index_{0};
uint64_t time_now_ms_{0};
std::atomic<uint32_t> wakeup_requested_{0}; // whether tx was woken up
std::atomic_uint32_t use_count_{0}, run_count_{0}, seqlock_{0};
// unique_shard_cnt_ and unique_shard_id_ are accessed only by coordinator thread.
uint32_t unique_shard_cnt_{0}; // Number of unique shards active
ShardId unique_shard_id_{kInvalidSid}; // Set if unique_shard_cnt_ = 1
EventCount blocking_ec_; // Used to wake blocking transactions.
EventCount run_ec_; // Used to wait for shard callbacks
// Transaction coordinator state, written and read by coordinator thread.
// Can be read by shard threads as long as we respect ordering rules, i.e. when
// they read this variable the coordinator thread is stalled and can not cause data races.
// If COORD_XXX has been set, it means we passed or crossed stage XXX.
uint8_t coordinator_state_ = 0;
// Used for single-hop transactions with unique_shard_cnt_ == 1, hence no data race.
OpStatus local_result_ = OpStatus::OK;
private:
struct TLTmpSpace {
absl::flat_hash_set<std::string_view> uniq_keys;
std::vector<PerShardCache>& GetShardIndex(unsigned size);
private:
std::vector<PerShardCache> shard_cache;
};
static thread_local TLTmpSpace tmp_space;
};
template <typename F> auto Transaction::ScheduleSingleHopT(F&& f) -> decltype(f(this, nullptr)) {
decltype(f(this, nullptr)) res;
ScheduleSingleHop([&res, f = std::forward<F>(f)](Transaction* t, EngineShard* shard) {
res = f(t, shard);
return res.status();
});
return res;
}
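// Derives a compact 16-bit id from the transaction's address, useful for logging and debugging.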
inline uint16_t trans_id(const Transaction* ptr) {
return (intptr_t(ptr) >> 8) & 0xFFFF;
}
OpResult<KeyIndex> DetermineKeys(const CommandId* cid, CmdArgList args);
} // namespace dfly