feat(server): refactor allow preempt on journal record (#4393)

* feat server: refactor allow preempt on journal record

Signed-off-by: adi_holden <adi@dragonflydb.io>
adiholden committed 2025-01-02 12:16:21 +02:00 (via GitHub)
parent 7a68528022
commit a3ef239ac7
15 changed files with 152 additions and 122 deletions

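In short: Journal::RecordEntry, JournalSlice::AddLogRecord, Transaction::LogJournalOnShard and RecordExpiry lose their per-call await/preempts flags. Code that must not be preempted now scopes a journal::JournalFlushGuard, which disables journal flushing and enters a fiber-atomic section for its lifetime; records issued under the guard are aggregated and flushed when the guard is destroyed, which also makes the explicit TriggerJournalWriteToSink helper unnecessary. A condensed sketch of the new pattern, assembled from the hunks below (the shard/journal pointers stand in for the real call sites):

  {
    // Forbid preemption and buffer journal records while mutating tables.
    journal::JournalFlushGuard journal_flush_guard(shard->journal());
    RecordExpiry(db_index, key);  // buffered; no flush, no preemption here
  }
  // ~JournalFlushGuard(): leaves the atomic section, then SetFlushMode(true)
  // flushes the aggregated records (this flush may preempt).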

@@ -543,7 +543,7 @@ void WriteFlushSlotsToJournal(const SlotRanges& slot_ranges) {
// TODO: Break slot migration upon FLUSHSLOTS
journal->RecordEntry(/* txid= */ 0, journal::Op::COMMAND, /* dbid= */ 0,
/* shard_cnt= */ shard_set->size(), nullopt,
-                    Payload("DFLYCLUSTER", args_view), false);
+                    Payload("DFLYCLUSTER", args_view));
};
shard_set->pool()->AwaitFiberOnAll(std::move(cb));
}


@@ -144,6 +144,8 @@ unsigned PrimeEvictionPolicy::GarbageCollect(const PrimeTable::HotspotBuckets& e
if (db_slice_->WillBlockOnJournalWrite()) {
return res;
}
+ // Disable flushing journal changes to prevent preemption in GarbageCollect.
+ journal::JournalFlushGuard journal_flush_guard(db_slice_->shard_owner()->journal());
// bool should_print = (eb.key_hash % 128) == 0;
@@ -172,6 +174,8 @@ unsigned PrimeEvictionPolicy::GarbageCollect(const PrimeTable::HotspotBuckets& e
unsigned PrimeEvictionPolicy::Evict(const PrimeTable::HotspotBuckets& eb, PrimeTable* me) {
if (!can_evict_ || db_slice_->WillBlockOnJournalWrite())
return 0;
+ // Disable flushing journal changes to prevent preemption in Evict.
+ journal::JournalFlushGuard journal_flush_guard(db_slice_->shard_owner()->journal());
constexpr size_t kNumStashBuckets = ABSL_ARRAYSIZE(eb.probes.by_type.stash_buckets);
@@ -195,7 +199,7 @@ unsigned PrimeEvictionPolicy::Evict(const PrimeTable::HotspotBuckets& eb, PrimeT
// log the evicted keys to journal.
if (auto journal = db_slice_->shard_owner()->journal(); journal) {
- RecordExpiry(cntx_.db_index, key, false);
+ RecordExpiry(cntx_.db_index, key);
}
db_slice_->PerformDeletion(DbSlice::Iterator(last_slot_it, StringOrView::FromView(key)), table);
@@ -453,7 +457,7 @@ OpResult<DbSlice::PrimeItAndExp> DbSlice::FindInternal(const Context& cntx, std:
}
if (res.it->second.HasExpire()) { // check expiry state
- res = ExpireIfNeeded(cntx, res.it, true);
+ res = ExpireIfNeeded(cntx, res.it);
if (!IsValid(res.it)) {
return OpStatus::KEY_NOTFOUND;
}
@@ -1077,12 +1081,11 @@ void DbSlice::PostUpdate(DbIndex db_ind, Iterator it, std::string_view key, size
}
DbSlice::ItAndExp DbSlice::ExpireIfNeeded(const Context& cntx, Iterator it) const {
- auto res = ExpireIfNeeded(cntx, it.GetInnerIt(), false);
+ auto res = ExpireIfNeeded(cntx, it.GetInnerIt());
return {.it = Iterator::FromPrime(res.it), .exp_it = ExpIterator::FromPrime(res.exp_it)};
}
- DbSlice::PrimeItAndExp DbSlice::ExpireIfNeeded(const Context& cntx, PrimeIterator it,
-                                                bool preempts) const {
+ DbSlice::PrimeItAndExp DbSlice::ExpireIfNeeded(const Context& cntx, PrimeIterator it) const {
if (!it->second.HasExpire()) {
LOG(ERROR) << "Invalid call to ExpireIfNeeded";
return {it, ExpireIterator{}};
@@ -1112,7 +1115,7 @@ DbSlice::PrimeItAndExp DbSlice::ExpireIfNeeded(const Context& cntx, PrimeIterato
// Replicate expiry
if (auto journal = owner_->journal(); journal) {
- RecordExpiry(cntx.db_index, key, preempts);
+ RecordExpiry(cntx.db_index, key);
}
if (expired_keys_events_recording_)
@@ -1136,6 +1139,8 @@ void DbSlice::ExpireAllIfNeeded() {
// We hold no locks to any of the keys so we should Wait() here such that
// we don't preempt in ExpireIfNeeded
block_counter_.Wait();
+ // Disable flushing journal changes to prevent preemption in traverse.
+ journal::JournalFlushGuard journal_flush_guard(owner_->journal());
for (DbIndex db_index = 0; db_index < db_arr_.size(); db_index++) {
if (!db_arr_[db_index])
@@ -1148,7 +1153,7 @@ void DbSlice::ExpireAllIfNeeded() {
LOG(ERROR) << "Expire entry " << exp_it->first.ToString() << " not found in prime table";
return;
}
- ExpireIfNeeded(Context{nullptr, db_index, GetCurrentTimeMs()}, prime_it, false);
+ ExpireIfNeeded(Context{nullptr, db_index, GetCurrentTimeMs()}, prime_it);
};
ExpireTable::Cursor cursor;
@@ -1212,7 +1217,7 @@ auto DbSlice::DeleteExpiredStep(const Context& cntx, unsigned count) -> DeleteEx
auto prime_it = db.prime.Find(it->first);
CHECK(!prime_it.is_done());
result.deleted_bytes += prime_it->first.MallocUsed() + prime_it->second.MallocUsed();
- ExpireIfNeeded(cntx, prime_it, false);
+ ExpireIfNeeded(cntx, prime_it);
++result.deleted;
} else {
result.survivor_ttl_sum += ttl;
@@ -1280,7 +1285,7 @@ pair<uint64_t, size_t> DbSlice::FreeMemWithEvictionStep(DbIndex db_ind, size_t s
// fiber preemption could happen in this phase.
for (string_view key : keys_to_journal) {
if (auto journal = owner_->journal(); journal)
- RecordExpiry(db_ind, key, false);
+ RecordExpiry(db_ind, key);
if (expired_keys_events_recording_)
db_table->expired_keys_events_.emplace_back(key);


@@ -553,7 +553,7 @@ class DbSlice {
ExpireIterator exp_it;
};
- PrimeItAndExp ExpireIfNeeded(const Context& cntx, PrimeIterator it, bool preempts = false) const;
+ PrimeItAndExp ExpireIfNeeded(const Context& cntx, PrimeIterator it) const;
OpResult<AddOrFindResult> AddOrFindInternal(const Context& cntx, std::string_view key);


@@ -78,7 +78,7 @@ bool WaitReplicaFlowToCatchup(absl::Time end_time, const DflyCmd::ReplicaInfo* r
// We don't want any writes to the journal after we send the `PING`,
// and expirations could ruin that.
namespaces->GetDefaultNamespace().GetDbSlice(shard->shard_id()).SetExpireAllowed(false);
- shard->journal()->RecordEntry(0, journal::Op::PING, 0, 0, nullopt, {}, true);
+ shard->journal()->RecordEntry(0, journal::Op::PING, 0, 0, nullopt, {});
const FlowInfo* flow = &replica->flows[shard->shard_id()];
while (flow->last_acked_lsn < shard->journal()->GetLsn()) {


@@ -12,8 +12,8 @@
extern "C" {
#include "redis/zmalloc.h"
}
#include "server/engine_shard_set.h"
+ #include "server/journal/journal.h"
#include "server/namespaces.h"
#include "server/search/doc_index.h"
#include "server/server_state.h"
@@ -756,8 +756,9 @@ void EngineShard::Heartbeat() {
}
void EngineShard::RetireExpiredAndEvict() {
{
- FiberAtomicGuard guard;
+ // Disable flushing journal changes to prevent preemption.
+ journal::JournalFlushGuard journal_flush_guard(journal_);
// TODO: iterate over all namespaces
DbSlice& db_slice = namespaces->GetDefaultNamespace().GetDbSlice(shard_id());
constexpr double kTtlDeleteLimit = 200;
@@ -806,13 +807,6 @@ void EngineShard::RetireExpiredAndEvict() {
eviction_goal -= std::min(eviction_goal, evicted_bytes);
}
}
}
- // Journal entries for expired entries are not writen to socket in the loop above.
- // Trigger write to socket when loop finishes.
- if (auto journal = EngineShard::tlocal()->journal(); journal) {
-   TriggerJournalWriteToSink();
- }
}
void EngineShard::CacheStats() {


@@ -616,7 +616,8 @@ void OpScan(const OpArgs& op_args, const ScanOpts& scan_opts, uint64_t* cursor,
// we enter the callback in a timing when journaling will not cause preemptions. Otherwise,
// the bucket might change as we Traverse and yield.
db_slice.BlockingCounter()->Wait();
- util::FiberAtomicGuard guard;
+ // Disable flushing journal changes to prevent preemption in traverse.
+ journal::JournalFlushGuard journal_flush_guard(op_args.shard->journal());
unsigned cnt = 0;
@@ -630,6 +631,7 @@ void OpScan(const OpArgs& op_args, const ScanOpts& scan_opts, uint64_t* cursor,
cur = prime_table->Traverse(
cur, [&](PrimeIterator it) { cnt += ScanCb(op_args, it, scan_opts, &scratch, vec); });
} while (cur && cnt < scan_opts.limit);
+ VLOG(1) << "OpScan " << db_slice.shard_id() << " cursor: " << cur.value();
*cursor = cur.value();
}

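As in RetireExpiredAndEvict above, the plain fiber-atomic guard is replaced rather than supplemented: JournalFlushGuard's constructor enters the fiber-atomic section itself (see journal.h below), so the rough equivalence is:

  util::FiberAtomicGuard fg;                  // old: atomicity only
  journal::JournalFlushGuard guard(journal);  // new: atomicity + deferred journal flush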

@@ -84,8 +84,12 @@ LSN Journal::GetLsn() const {
}
void Journal::RecordEntry(TxId txid, Op opcode, DbIndex dbid, unsigned shard_cnt,
-                          std::optional<cluster::SlotId> slot, Entry::Payload payload, bool await) {
-  journal_slice.AddLogRecord(Entry{txid, opcode, dbid, shard_cnt, slot, std::move(payload)}, await);
+                          std::optional<cluster::SlotId> slot, Entry::Payload payload) {
+  journal_slice.AddLogRecord(Entry{txid, opcode, dbid, shard_cnt, slot, std::move(payload)});
}
+ void Journal::SetFlushMode(bool allow_flush) {
+   journal_slice.SetFlushMode(allow_flush);
+ }
} // namespace journal


@@ -3,8 +3,8 @@
//
#pragma once
#include "server/journal/types.h"
+ #include "util/fibers/detail/fiber_interface.h"
#include "util/proactor_pool.h"
namespace dfly {
@@ -35,11 +35,36 @@ class Journal {
LSN GetLsn() const;
void RecordEntry(TxId txid, Op opcode, DbIndex dbid, unsigned shard_cnt,
-                 std::optional<cluster::SlotId> slot, Entry::Payload payload, bool await);
+                 std::optional<cluster::SlotId> slot, Entry::Payload payload);
+ void SetFlushMode(bool allow_flush);
private:
mutable util::fb2::Mutex state_mu_;
};
+ class JournalFlushGuard {
+  public:
+   explicit JournalFlushGuard(Journal* journal) : journal_(journal) {
+     if (journal_) {
+       journal_->SetFlushMode(false);
+     }
+     util::fb2::detail::EnterFiberAtomicSection();
+   }
+   ~JournalFlushGuard() {
+     util::fb2::detail::LeaveFiberAtomicSection();
+     if (journal_) {
+       journal_->SetFlushMode(true);  // Restore the state on destruction
+     }
+   }
+   JournalFlushGuard(const JournalFlushGuard&) = delete;
+   JournalFlushGuard& operator=(const JournalFlushGuard&) = delete;
+  private:
+   Journal* journal_;
+ };
} // namespace journal
} // namespace dfly

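Two properties of the guard are worth calling out. The destructor leaves the fiber-atomic section before re-enabling flush mode, because the flush triggered by SetFlushMode(true) may itself preempt. And since JournalSlice::SetFlushMode (next file) DCHECKs that the mode actually changes, guards are not meant to nest. A minimal hypothetical caller, for illustration only:

  void RetireSketch(journal::Journal* journal) {  // hypothetical function
    journal::JournalFlushGuard guard(journal);    // SetFlushMode(false) + enter atomic section
    // RecordEntry()/RecordExpiry() calls here only buffer records.
  }  // leave atomic section, then SetFlushMode(true) flushes the buffer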

@@ -142,34 +142,37 @@ std::string_view JournalSlice::GetEntry(LSN lsn) const {
return (*ring_buffer_)[lsn - start].data;
}
- void JournalSlice::AddLogRecord(const Entry& entry, bool await) {
+ void JournalSlice::SetFlushMode(bool allow_flush) {
+   DCHECK(allow_flush != enable_journal_flush_);
+   enable_journal_flush_ = allow_flush;
+   if (allow_flush) {
+     JournalItem item;
+     item.lsn = -1;
+     item.opcode = Op::NOOP;
+     item.data = "";
+     item.slot = {};
+     CallOnChange(item);
+   }
+ }
+ void JournalSlice::AddLogRecord(const Entry& entry) {
DCHECK(ring_buffer_);
- JournalItem dummy;
- JournalItem* item;
- if (entry.opcode == Op::NOOP) {
-   item = &dummy;
-   item->lsn = -1;
-   item->opcode = entry.opcode;
-   item->data = "";
-   item->slot = entry.slot;
- } else {
+ JournalItem item;
+ {
FiberAtomicGuard fg;
// GetTail gives a pointer to a new tail entry in the buffer, possibly overriding the last entry
// if the buffer is full.
- item = &dummy;
- item->opcode = entry.opcode;
- item->lsn = lsn_++;
- item->cmd = entry.payload.cmd;
- item->slot = entry.slot;
+ item.opcode = entry.opcode;
+ item.lsn = lsn_++;
+ item.cmd = entry.payload.cmd;
+ item.slot = entry.slot;
io::BufSink buf_sink{&ring_serialize_buf_};
JournalWriter writer{&buf_sink};
writer.Write(entry);
- item->data = io::View(ring_serialize_buf_.InputBuffer());
+ item.data = io::View(ring_serialize_buf_.InputBuffer());
ring_serialize_buf_.Clear();
- VLOG(2) << "Writing item [" << item->lsn << "]: " << entry.ToString();
+ VLOG(2) << "Writing item [" << item.lsn << "]: " << entry.ToString();
}
#if 0
@@ -180,20 +183,18 @@ void JournalSlice::AddLogRecord(const Entry& entry, bool await) {
file_offset_ += line.size();
}
#endif
- // TODO: Remove the callbacks, replace with notifiers
- {
+ CallOnChange(item);
+ }
+ void JournalSlice::CallOnChange(const JournalItem& item) {
std::shared_lock lk(cb_mu_);
- DVLOG(2) << "AddLogRecord: run callbacks for " << entry.ToString()
-          << " num callbacks: " << change_cb_arr_.size();
const size_t size = change_cb_arr_.size();
auto k_v = change_cb_arr_.begin();
for (size_t i = 0; i < size; ++i) {
- k_v->second(*item, await);
+ k_v->second(item, enable_journal_flush_);
++k_v;
}
}
}
uint32_t JournalSlice::RegisterOnChange(ChangeCallback cb) {

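On the consumer side, every registered change callback now receives enable_journal_flush_ as its allow_flush argument. While a guard is active the callbacks see allow_flush=false and are expected to only buffer; SetFlushMode(true) then synthesizes a NOOP JournalItem (lsn -1, empty data) and runs the callbacks once more with allow_flush=true so they can drain what they buffered. A sketch of a conforming callback, assuming a hypothetical pending buffer and WriteToSink helper:

  slice.RegisterOnChange([&](const journal::JournalItem& item, bool allow_flush) {
    pending.append(item.data);  // empty for the synthetic NOOP item
    if (allow_flush && !pending.empty()) {
      WriteToSink(pending);     // hypothetical: push the buffered bytes to the socket
      pending.clear();
    }
  });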

@@ -37,7 +37,7 @@ class JournalSlice {
return slice_index_ != UINT32_MAX;
}
- void AddLogRecord(const Entry& entry, bool await);
+ void AddLogRecord(const Entry& entry);
// Register a callback that will be called every time a new entry is
// added to the journal.
@@ -54,8 +54,16 @@ class JournalSlice {
/// from the buffer.
bool IsLSNInBuffer(LSN lsn) const;
std::string_view GetEntry(LSN lsn) const;
+ // SetFlushMode with allow_flush=false is used to disable preemptions during
+ // subsequent calls to AddLogRecord.
+ // SetFlushMode with allow_flush=true flushes all log records aggregated
+ // since the last call with allow_flush=false. This call may preempt.
+ // The caller must ensure that no preemptions occur between the initial call
+ // with allow_flush=false and the subsequent call with allow_flush=true.
+ void SetFlushMode(bool allow_flush);
private:
+ void CallOnChange(const JournalItem& item);
// std::string shard_path_;
// std::unique_ptr<LinuxFile> shard_file_;
std::optional<base::RingBuffer<JournalItem>> ring_buffer_;
@@ -69,6 +77,7 @@ class JournalSlice {
uint32_t slice_index_ = UINT32_MAX;
uint32_t next_cb_id_ = 1;
std::error_code status_ec_;
+ bool enable_journal_flush_ = true;
};
} // namespace journal


@@ -56,6 +56,7 @@ void JournalStreamer::Start(util::FiberSocketBase* dest, bool send_lsn) {
if (allow_await) {
ThrottleIfNeeded();
// No record to write, just await if data was written so consumer will read the data.
+ // TODO: shouldn't we trigger an async write on NOOP?
if (item.opcode == Op::NOOP)
return;
}


@@ -1474,15 +1474,15 @@ void Transaction::LogAutoJournalOnShard(EngineShard* shard, RunnableResult resul
}
// Record to journal autojournal commands, here we allow await which enables writing to sync
// the journal change.
- LogJournalOnShard(shard, std::move(entry_payload), unique_shard_cnt_, true);
+ LogJournalOnShard(shard, std::move(entry_payload), unique_shard_cnt_);
}
void Transaction::LogJournalOnShard(EngineShard* shard, journal::Entry::Payload&& payload,
-                                    uint32_t shard_cnt, bool allow_await) const {
+                                    uint32_t shard_cnt) const {
auto journal = shard->journal();
CHECK(journal);
journal->RecordEntry(txid_, journal::Op::COMMAND, db_index_, shard_cnt,
-                     unique_slot_checker_.GetUniqueSlotId(), std::move(payload), allow_await);
+                     unique_slot_checker_.GetUniqueSlotId(), std::move(payload));
}
void Transaction::ReviveAutoJournal() {


@@ -341,8 +341,8 @@ class Transaction {
void PrepareMultiForScheduleSingleHop(Namespace* ns, ShardId sid, DbIndex db, CmdArgList args);
// Write a journal entry to a shard journal with the given payload.
- void LogJournalOnShard(EngineShard* shard, journal::Entry::Payload&& payload, uint32_t shard_cnt,
-                        bool allow_await) const;
+ void LogJournalOnShard(EngineShard* shard, journal::Entry::Payload&& payload,
+                        uint32_t shard_cnt) const;
// Re-enable auto journal for commands marked as NO_AUTOJOURNAL. Call during setup.
void ReviveAutoJournal();


@@ -53,32 +53,21 @@ size_t ShardArgs::Size() const {
void RecordJournal(const OpArgs& op_args, string_view cmd, const ShardArgs& args,
                   uint32_t shard_cnt) {
VLOG(2) << "Logging command " << cmd << " from txn " << op_args.tx->txid();
- op_args.tx->LogJournalOnShard(op_args.shard, Payload(cmd, args), shard_cnt, true);
+ op_args.tx->LogJournalOnShard(op_args.shard, Payload(cmd, args), shard_cnt);
}
void RecordJournal(const OpArgs& op_args, std::string_view cmd, facade::ArgSlice args,
uint32_t shard_cnt) {
VLOG(2) << "Logging command " << cmd << " from txn " << op_args.tx->txid();
- op_args.tx->LogJournalOnShard(op_args.shard, Payload(cmd, args), shard_cnt, true);
+ op_args.tx->LogJournalOnShard(op_args.shard, Payload(cmd, args), shard_cnt);
}
- void RecordExpiry(DbIndex dbid, string_view key, bool preempts) {
+ void RecordExpiry(DbIndex dbid, string_view key) {
auto journal = EngineShard::tlocal()->journal();
CHECK(journal);
- if (!preempts) {
-   util::FiberAtomicGuard guard;
-   journal->RecordEntry(0, journal::Op::EXPIRED, dbid, 1, cluster::KeySlot(key),
-                        Payload("DEL", ArgSlice{key}), preempts);
-   return;
- }
- journal->RecordEntry(0, journal::Op::EXPIRED, dbid, 1, cluster::KeySlot(key),
-                      Payload("DEL", ArgSlice{key}), preempts);
+ journal->RecordEntry(0, journal::Op::EXPIRED, dbid, 1, cluster::KeySlot(key),
+                      Payload("DEL", ArgSlice{key}));
}
- void TriggerJournalWriteToSink() {
-   auto journal = EngineShard::tlocal()->journal();
-   CHECK(journal);
-   journal->RecordEntry(0, journal::Op::NOOP, 0, 0, nullopt, {}, true);
- }
LockTag::LockTag(std::string_view key) {


@@ -224,7 +224,7 @@ void RecordJournal(const OpArgs& op_args, std::string_view cmd, ArgSlice args,
// Record expiry in journal with independent transaction. Must be called from shard thread holding
// key.
- void RecordExpiry(DbIndex dbid, std::string_view key, bool preempts = false);
+ void RecordExpiry(DbIndex dbid, std::string_view key);
// Trigger journal write to sink, no journal record will be added to journal.
// Must be called from shard thread of journal to sink.