feat(transaction): Single hop blocking, callback flags (#2393)

* feat(transaction): Single hop blocking, callback flags
Vladislav 2024-01-15 21:13:22 +03:00 committed by GitHub
parent b6f4370ae7
commit de817098a7
4 changed files with 211 additions and 116 deletions


@@ -3,6 +3,7 @@
//
#include "server/container_utils.h"
#include "base/flags.h"
#include "base/logging.h"
#include "core/sorted_map.h"
#include "core/string_map.h"
@@ -21,8 +22,101 @@ extern "C" {
#include "redis/zset.h"
}
ABSL_FLAG(bool, singlehop_blocking, true, "Use single hop optimization for blocking commands");
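Since this is a standard Abseil flag, the optimization can be toggled at startup. A minimal invocation sketch (binary name assumed):

  ./dragonfly --singlehop_blocking=false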
namespace dfly::container_utils {
namespace {
struct ShardFFResult {
PrimeKey key;
ShardId sid = kInvalidSid;
};
// Finds the first non-empty key of a single-shard transaction, passes it to `func` and returns it.
// If no such key exists or a wrong type is found, the appropriate status is returned.
// Optimized version of `FindFirstNonEmpty` below.
OpResult<std::string> FindFirstNonEmptySingleShard(Transaction* trans, int req_obj_type,
BlockingResultCb func) {
DCHECK_EQ(trans->GetUniqueShardCnt(), 1u);
std::string key;
auto cb = [&](Transaction* t, EngineShard* shard) -> Transaction::RunnableResult {
auto args = t->GetShardArgs(shard->shard_id());
auto ff_res = shard->db_slice().FindFirstReadOnly(t->GetDbContext(), args, req_obj_type);
if (ff_res == OpStatus::WRONG_TYPE)
return OpStatus::WRONG_TYPE;
if (ff_res == OpStatus::KEY_NOTFOUND)
return {OpStatus::KEY_NOTFOUND, Transaction::RunnableResult::AVOID_CONCLUDING};
CHECK(ff_res.ok()); // No other errors possible
ff_res->first->first.GetString(&key);
func(t, shard, key);
return OpStatus::OK;
};
// Schedule a single hop and hopefully find a key; otherwise avoid concluding
OpStatus status = trans->ScheduleSingleHop(cb);
if (status == OpStatus::OK)
return key;
return status;
}
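For orientation, here is a hedged sketch of how a blocking command might drive this path via `RunCbOnFirstNonEmptyBlocking` below; `OpPop` is a hypothetical stand-in for the per-shard pop logic, and `OBJ_LIST` is assumed to be the list object type constant:

  // Hypothetical caller: pop from the first non-empty list key, blocking up to
  // limit_ms milliseconds if no such key exists yet.
  std::string popped;
  auto cb = [&](Transaction* t, EngineShard* shard, std::string_view key) {
    popped = OpPop(t, shard, key);  // placeholder for the actual pop operation
  };
  OpResult<std::string> key =
      RunCbOnFirstNonEmptyBlocking(trans, OBJ_LIST, std::move(cb), limit_ms, &block_flag);
  // On success, `key` names the chosen key and `popped` holds the removed value.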
// Finds the first non-empty key (ordered by position in the command arguments) and returns it;
// otherwise returns a not-found or wrong-type error.
OpResult<ShardFFResult> FindFirstNonEmpty(Transaction* trans, int req_obj_type) {
DCHECK_GT(trans->GetUniqueShardCnt(), 1u);
using FFResult = std::tuple<PrimeKey, unsigned, ShardId>; // key, argument index, sid
VLOG(2) << "FindFirst::Find " << trans->DebugId();
// Holds Find results: (iterator to a found key, and its index in the passed arguments).
// See DbSlice::FindFirst for more details.
std::vector<OpResult<FFResult>> find_res(shard_set->size());
std::fill(find_res.begin(), find_res.end(), OpStatus::KEY_NOTFOUND);
auto cb = [&](Transaction* t, EngineShard* shard) {
auto args = t->GetShardArgs(shard->shard_id());
auto ff_res = shard->db_slice().FindFirstReadOnly(t->GetDbContext(), args, req_obj_type);
if (ff_res) {
find_res[shard->shard_id()] =
FFResult{ff_res->first->first.AsRef(), ff_res->second, shard->shard_id()};
} else {
find_res[shard->shard_id()] = ff_res.status();
}
return OpStatus::OK;
};
trans->Execute(std::move(cb), false);
// If any key is of the wrong type, report it immediately
if (std::find(find_res.begin(), find_res.end(), OpStatus::WRONG_TYPE) != find_res.end())
return OpStatus::WRONG_TYPE;
// Order results by their keys' positions in the command arguments; push errors to the back
auto comp = [trans](const OpResult<FFResult>& lhs, const OpResult<FFResult>& rhs) {
if (!lhs || !rhs)
return lhs.ok();
size_t i1 = trans->ReverseArgIndex(std::get<ShardId>(*lhs), std::get<unsigned>(*lhs));
size_t i2 = trans->ReverseArgIndex(std::get<ShardId>(*rhs), std::get<unsigned>(*rhs));
return i1 < i2;
};
// Find the first element by the order above, i.e. the first key. Returns an error only if all entries are errors
auto it = std::min_element(find_res.begin(), find_res.end(), comp);
DCHECK(it != find_res.end());
if (*it == OpStatus::KEY_NOTFOUND)
return OpStatus::KEY_NOTFOUND;
CHECK(it->ok()); // No other errors than WRONG_TYPE and KEY_NOTFOUND
FFResult& res = **it;
return ShardFFResult{std::get<PrimeKey>(res).AsRef(), std::get<ShardId>(res)};
}
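The errors-last comparator above is subtle, so here is a self-contained illustration of the same selection rule with `std::optional<int>` standing in for `OpResult<FFResult>` (purely illustrative, not part of the patch):

  #include <algorithm>
  #include <cassert>
  #include <optional>
  #include <vector>

  int main() {
    // Empty optionals play the role of KEY_NOTFOUND results.
    std::vector<std::optional<int>> res = {std::nullopt, 3, 1};
    auto comp = [](const auto& l, const auto& r) {
      if (!l || !r)
        return l.has_value();  // any valid entry orders before an error
      return *l < *r;          // valid entries order by argument index
    };
    auto it = std::min_element(res.begin(), res.end(), comp);
    assert(it->has_value() && **it == 1);  // the error entry is never selected
  }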
} // namespace
using namespace std;
quicklistEntry QLEntry() {
@@ -174,76 +268,25 @@ string_view LpGetView(uint8_t* lp_it, uint8_t int_buf[]) {
return std::string_view{reinterpret_cast<char*>(elem), size_t(ele_len)};
}
OpResult<ShardFFResult> FindFirstNonEmptyKey(Transaction* trans, int req_obj_type) {
using FFResult = std::pair<PrimeKey, unsigned>; // key, argument index.
VLOG(2) << "FindFirst::Find " << trans->DebugId();
// Holds Find results: (iterator to a found key, and its index in the passed arguments).
// See DbSlice::FindFirst for more details.
// spans all the shards for now.
std::vector<OpResult<FFResult>> find_res(shard_set->size());
std::fill(find_res.begin(), find_res.end(), OpStatus::KEY_NOTFOUND);
auto cb = [&](Transaction* t, EngineShard* shard) {
auto args = t->GetShardArgs(shard->shard_id());
OpResult<std::pair<PrimeConstIterator, unsigned>> ff_res =
shard->db_slice().FindFirstReadOnly(t->GetDbContext(), args, req_obj_type);
if (ff_res) {
FFResult ff_result(ff_res->first->first.AsRef(), ff_res->second);
find_res[shard->shard_id()] = std::move(ff_result);
} else {
find_res[shard->shard_id()] = ff_res.status();
}
return OpStatus::OK;
};
trans->Execute(std::move(cb), false);
uint32_t min_arg_indx = UINT32_MAX;
ShardFFResult shard_result;
// We iterate over all results to find the key with the minimal arg_index
// after reversing the arg indexing permutation.
for (size_t sid = 0; sid < find_res.size(); ++sid) {
const auto& fr = find_res[sid];
auto status = fr.status();
if (status == OpStatus::KEY_NOTFOUND)
continue;
if (status == OpStatus::WRONG_TYPE) {
return status;
}
CHECK(fr);
const auto& it_pos = fr.value();
size_t arg_indx = trans->ReverseArgIndex(sid, it_pos.second);
if (arg_indx < min_arg_indx) {
min_arg_indx = arg_indx;
shard_result.sid = sid;
// we do not dereference the key and do not extract the string value, so it is
// ok to just move it. We cannot dereference it due to limitations of SmallString,
// which relies on a thread-local data structure for pointer translation.
shard_result.key = it_pos.first.AsRef();
}
}
if (shard_result.sid == kInvalidSid) {
return OpStatus::KEY_NOTFOUND;
}
return OpResult<ShardFFResult>{std::move(shard_result)};
}
OpResult<string> RunCbOnFirstNonEmptyBlocking(Transaction* trans, int req_obj_type,
BlockingResultCb func, unsigned limit_ms,
bool* block_flag) {
trans->Schedule();
string result_key;
OpResult<ShardFFResult> result = FindFirstNonEmptyKey(trans, req_obj_type);
// Fast path. If we have only a single shard, we can run opportunistically with a single hop.
// If we don't find anything, we avoid concluding and stay scheduled.
// Slow path: schedule, collect results from shards, execute the action if a key was found.
OpResult<ShardFFResult> result;
if (trans->GetUniqueShardCnt() == 1 && absl::GetFlag(FLAGS_singlehop_blocking)) {
auto res = FindFirstNonEmptySingleShard(trans, req_obj_type, func);
if (res.ok())
return res;
else
result = res.status();
} else {
trans->Schedule();
result = FindFirstNonEmpty(trans, req_obj_type);
}
// If a non-empty key exists, execute the callback immediately
if (result.ok()) {
@@ -255,7 +298,6 @@ OpResult<string> RunCbOnFirstNonEmptyBlocking(Transaction* trans, int req_obj_ty
return OpStatus::OK;
};
trans->Execute(std::move(cb), true);
return result_key;
}
@@ -271,6 +313,7 @@ OpResult<string> RunCbOnFirstNonEmptyBlocking(Transaction* trans, int req_obj_ty
return OpStatus::TIMED_OUT;
}
DCHECK(trans->IsScheduled()); // single shard optimization didn't forget to schedule
VLOG(1) << "Blocking " << trans->DebugId();
// If timeout (limit_ms) is zero, block indefinitely


@@ -82,13 +82,6 @@ std::string_view LpGetView(uint8_t* lp_it, uint8_t int_buf[]);
// Find value by key and return stringview to it, otherwise nullopt.
std::optional<std::string_view> LpFind(uint8_t* lp, std::string_view key, uint8_t int_buf[]);
struct ShardFFResult {
PrimeKey key;
ShardId sid = kInvalidSid;
};
OpResult<ShardFFResult> FindFirstNonEmptyKey(Transaction* trans, int req_obj_type);
using BlockingResultCb =
std::function<void(Transaction*, EngineShard*, std::string_view /* key */)>;


@@ -472,23 +472,22 @@ bool Transaction::RunInShard(EngineShard* shard, bool txq_ooo) {
/*************************************************************************/
// Actually running the callback.
// If you change the logic here, also please change the logic
RunnableResult result;
try {
OpStatus status = OpStatus::OK;
// if a transaction is suspended, we still run it because of the brpoplpush/blmove case
// that needs to run lpush on its suspended shard.
status = (*cb_ptr_)(this, shard);
result = (*cb_ptr_)(this, shard);
if (unique_shard_cnt_ == 1) {
cb_ptr_ = nullptr; // We can do it because only a single thread runs the callback.
local_result_ = status;
local_result_ = result;
} else {
if (status == OpStatus::OUT_OF_MEMORY) {
if (result == OpStatus::OUT_OF_MEMORY) {
absl::base_internal::SpinLockHolder lk{&local_result_mu_};
CHECK(local_result_ == OpStatus::OK || local_result_ == OpStatus::OUT_OF_MEMORY);
local_result_ = status;
local_result_ = result;
} else {
CHECK_EQ(OpStatus::OK, status);
CHECK_EQ(OpStatus::OK, result);
}
}
} catch (std::bad_alloc&) {
@@ -500,15 +499,25 @@ bool Transaction::RunInShard(EngineShard* shard, bool txq_ooo) {
}
/*************************************************************************/
// at least the coordinator thread owns the reference.
DCHECK_GE(GetUseCount(), 1u);
shard->db_slice().OnCbFinish();
// Handle result flags to alter behaviour.
if (result.flags & RunnableResult::AVOID_CONCLUDING) {
// Multi-shard callbacks must either all conclude or all avoid concluding. Because they can't
// communicate, they must know their decision ahead of time, so there is no point in using this
// flag.
CHECK_EQ(unique_shard_cnt_, 1u);
DCHECK(is_concluding || multi_->concluding);
is_concluding = false;
}
// Log to journal only once the command has finished running
if (is_concluding || (multi_ && multi_->concluding))
LogAutoJournalOnShard(shard);
shard->db_slice().OnCbFinish();
// at least the coordinator thread owns the reference.
DCHECK_GE(GetUseCount(), 1u);
// If we're the head of tx queue (txq_ooo is false), we remove ourselves upon first invocation
// and successive hops are run by continuation_trans_ in engine shard.
// Otherwise we can remove ourselves only when we're concluding (so no more hops will follow).
@@ -928,6 +937,8 @@ void Transaction::ExecuteAsync() {
}
void Transaction::Conclude() {
if (!IsScheduled())
return;
auto cb = [](Transaction* t, EngineShard* shard) { return OpStatus::OK; };
Execute(std::move(cb), true);
}
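A rough coordinator-side sketch of how this pairs with `AVOID_CONCLUDING` (control flow assumed; whether an earlier hop already concluded is detected via `IsScheduled()`):

  // A hop that returned AVOID_CONCLUDING leaves the transaction scheduled,
  // so it must be finished explicitly once the blocking wait is over.
  OpStatus status = trans->ScheduleSingleHop(cb);  // cb may avoid concluding
  if (status == OpStatus::KEY_NOTFOUND) {
    // ... block until a key appears or the timeout expires ...
    trans->Conclude();  // returns early if the transaction is no longer scheduled
  }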
@@ -963,7 +974,7 @@ void Transaction::EnableAllShards() {
sd.local_mask |= ACTIVE;
}
void Transaction::RunQuickie(EngineShard* shard) {
Transaction::RunnableResult Transaction::RunQuickie(EngineShard* shard) {
DCHECK(!IsAtomicMulti());
DCHECK(shard_data_.size() == 1u || multi_->mode == NON_ATOMIC);
DCHECK_NE(unique_shard_id_, kInvalidSid);
@@ -976,19 +987,23 @@
DCHECK(cb_ptr_) << DebugId() << " " << shard->shard_id();
// Calling the callback in a somewhat safe way
RunnableResult result;
try {
local_result_ = (*cb_ptr_)(this, shard);
result = (*cb_ptr_)(this, shard);
} catch (std::bad_alloc&) {
LOG_FIRST_N(ERROR, 16) << " out of memory";
local_result_ = OpStatus::OUT_OF_MEMORY;
result = OpStatus::OUT_OF_MEMORY;
} catch (std::exception& e) {
LOG(FATAL) << "Unexpected exception " << e.what();
}
shard->db_slice().OnCbFinish();
LogAutoJournalOnShard(shard);
// Handling the result, along with conclusion and journaling, is done by the caller
sd.is_armed.store(false, memory_order_relaxed);
cb_ptr_ = nullptr; // We can do it because only a single shard runs the callback.
return result;
}
// runs in coordinator thread.
@@ -1030,10 +1045,11 @@ KeyLockArgs Transaction::GetLockArgs(ShardId sid) const {
// Runs within an engine shard thread.
// Optimized path that schedules and runs transactions out of order if possible.
// Returns true if it was eagerly executed, false if it was scheduled into the queue.
// Returns true if eagerly executed, false if the callback will be handled by the transaction
// queue.
bool Transaction::ScheduleUniqueShard(EngineShard* shard) {
DCHECK(!IsAtomicMulti());
DCHECK_EQ(0u, txid_);
DCHECK_EQ(txid_, 0u);
DCHECK(shard_data_.size() == 1u || multi_->mode == NON_ATOMIC);
DCHECK_NE(unique_shard_id_, kInvalidSid);
@@ -1043,31 +1059,45 @@ bool Transaction::ScheduleUniqueShard(EngineShard* shard) {
auto& sd = shard_data_[SidToId(unique_shard_id_)];
DCHECK_EQ(TxQueue::kEnd, sd.pq_pos);
// Fast path - for uncontended keys, just run the callback.
// That applies for single key operations like set, get, lpush etc.
if (shard->db_slice().CheckLock(mode, lock_args) && shard->shard_lock()->Check(mode)) {
RunQuickie(shard);
return true;
bool unlocked_keys =
shard->db_slice().CheckLock(mode, lock_args) && shard->shard_lock()->Check(mode);
bool quick_run = unlocked_keys;
// Fast path. If none of the keys are locked, we can run briefly atomically on the thread
// without acquiring them at all.
if (quick_run) {
auto result = RunQuickie(shard);
local_result_ = result.status;
if (result.flags & RunnableResult::AVOID_CONCLUDING) {
// If we want to run again, we have to actually acquire keys, but keep ourselves disarmed
DCHECK_EQ(sd.is_armed, false);
unlocked_keys = false;
} else {
LogAutoJournalOnShard(shard);
}
}
// we can do it because only a single thread writes into txid_ and sd.
txid_ = op_seq.fetch_add(1, memory_order_relaxed);
sd.pq_pos = shard->txq()->Insert(this);
// Slow path. Some of the keys are locked, so we schedule on the transaction queue.
if (!unlocked_keys) {
coordinator_state_ |= COORD_SCHED; // safe because single shard
txid_ = op_seq.fetch_add(1, memory_order_relaxed);
sd.pq_pos = shard->txq()->Insert(this);
DCHECK_EQ(0, sd.local_mask & KEYLOCK_ACQUIRED);
DCHECK_EQ(sd.local_mask & KEYLOCK_ACQUIRED, 0);
shard->db_slice().Acquire(mode, lock_args);
sd.local_mask |= KEYLOCK_ACQUIRED;
shard->db_slice().Acquire(mode, lock_args);
sd.local_mask |= KEYLOCK_ACQUIRED;
DVLOG(1) << "Rescheduling " << DebugId() << " into TxQueue of size " << shard->txq()->size();
DVLOG(1) << "Rescheduling " << DebugId() << " into TxQueue of size " << shard->txq()->size();
// If there are blocked transactions waiting for this tx's keys, we will add this transaction
// to the tx-queue (these keys will be contended). This will happen even if the queue was empty.
// In that case we must poll the queue, because there will be no other callback triggering the
// queue before us.
shard->PollExecution("schedule_unique", nullptr);
}
// If there are blocked transactions waiting for this tx's keys, we will add this transaction
// to the tx-queue (these keys will be contended). This will happen even if the queue was empty.
// In that case we must poll the queue, because there will be no other callback triggering the
// queue before us.
shard->PollExecution("schedule_unique", nullptr);
return false;
return quick_run;
}
// This function should not block since it's run via RunBriefInParallel.
@@ -1303,11 +1333,14 @@ void Transaction::ExpireShardCb(ArgSlice wkeys, EngineShard* shard) {
OpStatus Transaction::RunSquashedMultiCb(RunnableType cb) {
DCHECK(multi_ && multi_->role == SQUASHED_STUB);
DCHECK_EQ(unique_shard_cnt_, 1u);
auto* shard = EngineShard::tlocal();
auto status = cb(this, shard);
auto result = cb(this, shard);
shard->db_slice().OnCbFinish();
LogAutoJournalOnShard(shard);
return status;
DCHECK_EQ(result.flags, 0);  // callbacks that use result flags should not be squashed
return result;
}
void Transaction::UnlockMultiShardCb(const KeyList& sharded_keys, EngineShard* shard,


@@ -85,7 +85,6 @@ using facade::OpStatus;
// })
//
// ```
class Transaction {
friend class BlockingController;
@@ -106,9 +105,32 @@ class Transaction {
}
public:
// Result returned by callbacks. Most should use the implicit conversion from OpStatus.
struct RunnableResult {
enum Flag : uint16_t {
// Can be issued by a **single** shard callback to avoid concluding, i.e. perform one more hop
// even if one was not requested ahead of time. Used for the blocking command fallback.
AVOID_CONCLUDING = 1,
};
RunnableResult(OpStatus status = OpStatus::OK, uint16_t flags = 0)
: status(status), flags(flags) {
}
operator OpStatus() const {
return status;
}
OpStatus status;
uint16_t flags;
};
static_assert(sizeof(RunnableResult) == 4);
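A minimal sketch of a single-shard callback using the new flag, modeled on the blocking lookup in container_utils (the key check is a placeholder):

  auto cb = [](Transaction* t, EngineShard* shard) -> Transaction::RunnableResult {
    bool found = false;  // placeholder: look up the key on this shard
    if (!found)          // keep the transaction scheduled for a retry hop
      return {OpStatus::KEY_NOTFOUND, Transaction::RunnableResult::AVOID_CONCLUDING};
    return OpStatus::OK;  // most callbacks rely on this implicit conversion
  };
  OpStatus status = trans->ScheduleSingleHop(cb);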
using time_point = ::std::chrono::steady_clock::time_point;
// Runnable that is run on shards during hop executions (often named callback).
using RunnableType = absl::FunctionRef<OpStatus(Transaction* t, EngineShard*)>;
// Callbacks should return `OpStatus`, which is implicitly convertible to `RunnableResult`!
using RunnableType = absl::FunctionRef<RunnableResult(Transaction* t, EngineShard*)>;
// Provides keys to block on for specific shard.
using WaitKeysProvider = std::function<ArgSlice(Transaction*, EngineShard* shard)>;
@@ -175,7 +197,7 @@ class Transaction {
// Can be used only for single key invocations, because it writes into a shared variable.
template <typename F> auto ScheduleSingleHopT(F&& f) -> decltype(f(this, nullptr));
// Conclude transaction
// Conclude transaction. Ignored if not scheduled
void Conclude();
// Called by engine shard to execute a transaction hop.
@@ -278,6 +300,10 @@
return bool(multi_);
}
bool IsScheduled() const {
return coordinator_state_ & COORD_SCHED;
}
MultiMode GetMultiMode() const {
return multi_->mode;
}
@@ -455,7 +481,7 @@
std::pair<bool, bool> ScheduleInShard(EngineShard* shard);
// Optimized version of RunInShard for single shard uncontended cases.
void RunQuickie(EngineShard* shard);
RunnableResult RunQuickie(EngineShard* shard);
void ExecuteAsync();