feat(server): refactor allow preempt on journal record (#4393)

* feat server: refactor allow preempt on journal record

Signed-off-by: adi_holden <adi@dragonflydb.io>
adiholden 2025-01-02 12:16:21 +02:00 committed by GitHub
parent 7a68528022
commit a3ef239ac7
15 changed files with 152 additions and 122 deletions
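
At a glance: journal record sites no longer pass a per-call await/allow_await/preempts flag. Callers that must not be preempted while journaling now scope a RAII journal::JournalFlushGuard (added below in journal.h) around the critical section. A minimal sketch of the calling convention before and after; the local variables here are illustrative, not taken from the diff:

    // Before: each call site chose whether the journal write may preempt.
    journal->RecordEntry(txid, journal::Op::COMMAND, dbid, shard_cnt, slot,
                         std::move(payload), /* await= */ false);

    // After: RecordEntry takes no flag. Preemption is controlled by a guard
    // that disables journal flushing (and enters a fiber-atomic section)
    // for the whole scope.
    {
      journal::JournalFlushGuard guard(journal);
      journal->RecordEntry(txid, journal::Op::COMMAND, dbid, shard_cnt, slot,
                           std::move(payload));
      // ... more records; none of these writes can preempt ...
    }  // guard dtor: flush mode restored, aggregated records are flushed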


@@ -543,7 +543,7 @@ void WriteFlushSlotsToJournal(const SlotRanges& slot_ranges) {
       // TODO: Break slot migration upon FLUSHSLOTS
       journal->RecordEntry(/* txid= */ 0, journal::Op::COMMAND, /* dbid= */ 0,
                            /* shard_cnt= */ shard_set->size(), nullopt,
-                           Payload("DFLYCLUSTER", args_view), false);
+                           Payload("DFLYCLUSTER", args_view));
     };
     shard_set->pool()->AwaitFiberOnAll(std::move(cb));
   }


@@ -144,6 +144,8 @@ unsigned PrimeEvictionPolicy::GarbageCollect(const PrimeTable::HotspotBuckets& e
   if (db_slice_->WillBlockOnJournalWrite()) {
     return res;
   }
+  // Disable flush journal changes to prevent preemption in GarbageCollect.
+  journal::JournalFlushGuard journal_flush_guard(db_slice_->shard_owner()->journal());

   // bool should_print = (eb.key_hash % 128) == 0;
@@ -172,6 +174,8 @@ unsigned PrimeEvictionPolicy::GarbageCollect(const PrimeTable::HotspotBuckets& e
 unsigned PrimeEvictionPolicy::Evict(const PrimeTable::HotspotBuckets& eb, PrimeTable* me) {
   if (!can_evict_ || db_slice_->WillBlockOnJournalWrite())
     return 0;
+  // Disable flush journal changes to prevent preemption in Evict.
+  journal::JournalFlushGuard journal_flush_guard(db_slice_->shard_owner()->journal());

   constexpr size_t kNumStashBuckets = ABSL_ARRAYSIZE(eb.probes.by_type.stash_buckets);
@@ -195,7 +199,7 @@ unsigned PrimeEvictionPolicy::Evict(const PrimeTable::HotspotBuckets& eb, PrimeT
     // log the evicted keys to journal.
     if (auto journal = db_slice_->shard_owner()->journal(); journal) {
-      RecordExpiry(cntx_.db_index, key, false);
+      RecordExpiry(cntx_.db_index, key);
     }

     db_slice_->PerformDeletion(DbSlice::Iterator(last_slot_it, StringOrView::FromView(key)), table);
@@ -453,7 +457,7 @@ OpResult<DbSlice::PrimeItAndExp> DbSlice::FindInternal(const Context& cntx, std:
   }

   if (res.it->second.HasExpire()) {  // check expiry state
-    res = ExpireIfNeeded(cntx, res.it, true);
+    res = ExpireIfNeeded(cntx, res.it);
     if (!IsValid(res.it)) {
       return OpStatus::KEY_NOTFOUND;
     }
@@ -1077,12 +1081,11 @@ void DbSlice::PostUpdate(DbIndex db_ind, Iterator it, std::string_view key, size
 }

 DbSlice::ItAndExp DbSlice::ExpireIfNeeded(const Context& cntx, Iterator it) const {
-  auto res = ExpireIfNeeded(cntx, it.GetInnerIt(), false);
+  auto res = ExpireIfNeeded(cntx, it.GetInnerIt());
   return {.it = Iterator::FromPrime(res.it), .exp_it = ExpIterator::FromPrime(res.exp_it)};
 }

-DbSlice::PrimeItAndExp DbSlice::ExpireIfNeeded(const Context& cntx, PrimeIterator it,
-                                               bool preempts) const {
+DbSlice::PrimeItAndExp DbSlice::ExpireIfNeeded(const Context& cntx, PrimeIterator it) const {
   if (!it->second.HasExpire()) {
     LOG(ERROR) << "Invalid call to ExpireIfNeeded";
     return {it, ExpireIterator{}};
@@ -1112,7 +1115,7 @@ DbSlice::PrimeItAndExp DbSlice::ExpireIfNeeded(const Context& cntx, PrimeIterato

   // Replicate expiry
   if (auto journal = owner_->journal(); journal) {
-    RecordExpiry(cntx.db_index, key, preempts);
+    RecordExpiry(cntx.db_index, key);
   }

   if (expired_keys_events_recording_)
@@ -1136,6 +1139,8 @@ void DbSlice::ExpireAllIfNeeded() {
   // We hold no locks to any of the keys so we should Wait() here such that
   // we don't preempt in ExpireIfNeeded
   block_counter_.Wait();
+  // Disable flush journal changes to prevent preemption in traverse.
+  journal::JournalFlushGuard journal_flush_guard(owner_->journal());

   for (DbIndex db_index = 0; db_index < db_arr_.size(); db_index++) {
     if (!db_arr_[db_index])
@@ -1148,7 +1153,7 @@ void DbSlice::ExpireAllIfNeeded() {
       LOG(ERROR) << "Expire entry " << exp_it->first.ToString() << " not found in prime table";
       return;
     }
-    ExpireIfNeeded(Context{nullptr, db_index, GetCurrentTimeMs()}, prime_it, false);
+    ExpireIfNeeded(Context{nullptr, db_index, GetCurrentTimeMs()}, prime_it);
   };

   ExpireTable::Cursor cursor;
@@ -1212,7 +1217,7 @@ auto DbSlice::DeleteExpiredStep(const Context& cntx, unsigned count) -> DeleteEx
       auto prime_it = db.prime.Find(it->first);
       CHECK(!prime_it.is_done());
       result.deleted_bytes += prime_it->first.MallocUsed() + prime_it->second.MallocUsed();
-      ExpireIfNeeded(cntx, prime_it, false);
+      ExpireIfNeeded(cntx, prime_it);
       ++result.deleted;
     } else {
       result.survivor_ttl_sum += ttl;
@@ -1280,7 +1285,7 @@ pair<uint64_t, size_t> DbSlice::FreeMemWithEvictionStep(DbIndex db_ind, size_t s
   // fiber preemption could happen in this phase.
   for (string_view key : keys_to_journal) {
     if (auto journal = owner_->journal(); journal)
-      RecordExpiry(db_ind, key, false);
+      RecordExpiry(db_ind, key);

     if (expired_keys_events_recording_)
       db_table->expired_keys_events_.emplace_back(key);


@@ -553,7 +553,7 @@ class DbSlice {
     ExpireIterator exp_it;
   };

-  PrimeItAndExp ExpireIfNeeded(const Context& cntx, PrimeIterator it, bool preempts = false) const;
+  PrimeItAndExp ExpireIfNeeded(const Context& cntx, PrimeIterator it) const;

   OpResult<AddOrFindResult> AddOrFindInternal(const Context& cntx, std::string_view key);


@@ -78,7 +78,7 @@ bool WaitReplicaFlowToCatchup(absl::Time end_time, const DflyCmd::ReplicaInfo* r
   // We don't want any writes to the journal after we send the `PING`,
   // and expirations could ruin that.
   namespaces->GetDefaultNamespace().GetDbSlice(shard->shard_id()).SetExpireAllowed(false);
-  shard->journal()->RecordEntry(0, journal::Op::PING, 0, 0, nullopt, {}, true);
+  shard->journal()->RecordEntry(0, journal::Op::PING, 0, 0, nullopt, {});

   const FlowInfo* flow = &replica->flows[shard->shard_id()];
   while (flow->last_acked_lsn < shard->journal()->GetLsn()) {


@@ -12,8 +12,8 @@
 extern "C" {
 #include "redis/zmalloc.h"
 }

 #include "server/engine_shard_set.h"
+#include "server/journal/journal.h"
 #include "server/namespaces.h"
 #include "server/search/doc_index.h"
 #include "server/server_state.h"
@@ -756,62 +756,56 @@ void EngineShard::Heartbeat() {
 }

 void EngineShard::RetireExpiredAndEvict() {
-  {
-    FiberAtomicGuard guard;
+  // Disable flush journal changes to prevent preemption
+  journal::JournalFlushGuard journal_flush_guard(journal_);
   // TODO: iterate over all namespaces
   DbSlice& db_slice = namespaces->GetDefaultNamespace().GetDbSlice(shard_id());
   constexpr double kTtlDeleteLimit = 200;

   uint32_t traversed = GetMovingSum6(TTL_TRAVERSE);
   uint32_t deleted = GetMovingSum6(TTL_DELETE);
   unsigned ttl_delete_target = 5;

   if (deleted > 10) {
     // deleted should be <= traversed.
     // hence we map our delete/traversed ratio into a range [0, kTtlDeleteLimit).
     // The higher ttl_delete_target the more likely we have lots of expired items that need
     // to be deleted.
     ttl_delete_target = kTtlDeleteLimit * double(deleted) / (double(traversed) + 10);
   }

   DbContext db_cntx;
   db_cntx.time_now_ms = GetCurrentTimeMs();

   size_t eviction_goal = GetFlag(FLAGS_enable_heartbeat_eviction) ? CalculateEvictionBytes() : 0;

   for (unsigned i = 0; i < db_slice.db_array_size(); ++i) {
     if (!db_slice.IsDbValid(i))
       continue;

     db_cntx.db_index = i;
     auto [pt, expt] = db_slice.GetTables(i);
     if (expt->size() > pt->size() / 4) {
       DbSlice::DeleteExpiredStats stats = db_slice.DeleteExpiredStep(db_cntx, ttl_delete_target);

       eviction_goal -= std::min(eviction_goal, size_t(stats.deleted_bytes));
       counter_[TTL_TRAVERSE].IncBy(stats.traversed);
       counter_[TTL_DELETE].IncBy(stats.deleted);
     }

     if (eviction_goal) {
       uint32_t starting_segment_id = rand() % pt->GetSegmentCount();
       auto [evicted_items, evicted_bytes] =
           db_slice.FreeMemWithEvictionStep(i, starting_segment_id, eviction_goal);

       DVLOG(2) << "Heartbeat eviction: Expected to evict " << eviction_goal
                << " bytes. Actually evicted " << evicted_items << " items, " << evicted_bytes
                << " bytes. Max eviction per heartbeat: "
                << GetFlag(FLAGS_max_eviction_per_heartbeat);

       eviction_goal -= std::min(eviction_goal, evicted_bytes);
     }
   }
-
-  // Journal entries for expired entries are not written to socket in the loop above.
-  // Trigger write to socket when loop finishes.
-  if (auto journal = EngineShard::tlocal()->journal(); journal) {
-    TriggerJournalWriteToSink();
-  }
 }
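
The delete-target heuristic above maps the observed delete/traverse ratio into [0, kTtlDeleteLimit). A quick worked example with assumed counter values:

    // Suppose the moving sums over the last window are:
    //   traversed = 190, deleted = 100     // deleted > 10, so the ratio path runs
    //   ttl_delete_target = 200 * 100 / (190 + 10) = 100
    // With deleted <= 10, the target stays at its default of 5.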


@@ -616,7 +616,8 @@ void OpScan(const OpArgs& op_args, const ScanOpts& scan_opts, uint64_t* cursor,
   // we enter the callback in a timing when journaling will not cause preemptions. Otherwise,
   // the bucket might change as we Traverse and yield.
   db_slice.BlockingCounter()->Wait();
+  // Disable flush journal changes to prevent preemption in traverse.
+  journal::JournalFlushGuard journal_flush_guard(op_args.shard->journal());
   util::FiberAtomicGuard guard;

   unsigned cnt = 0;
@@ -630,6 +631,7 @@ void OpScan(const OpArgs& op_args, const ScanOpts& scan_opts, uint64_t* cursor,
     cur = prime_table->Traverse(
         cur, [&](PrimeIterator it) { cnt += ScanCb(op_args, it, scan_opts, &scratch, vec); });
   } while (cur && cnt < scan_opts.limit);

   VLOG(1) << "OpScan " << db_slice.shard_id() << " cursor: " << cur.value();
   *cursor = cur.value();
 }


@@ -84,8 +84,12 @@ LSN Journal::GetLsn() const {
 }

 void Journal::RecordEntry(TxId txid, Op opcode, DbIndex dbid, unsigned shard_cnt,
-                          std::optional<cluster::SlotId> slot, Entry::Payload payload, bool await) {
-  journal_slice.AddLogRecord(Entry{txid, opcode, dbid, shard_cnt, slot, std::move(payload)}, await);
+                          std::optional<cluster::SlotId> slot, Entry::Payload payload) {
+  journal_slice.AddLogRecord(Entry{txid, opcode, dbid, shard_cnt, slot, std::move(payload)});
+}
+
+void Journal::SetFlushMode(bool allow_flush) {
+  journal_slice.SetFlushMode(allow_flush);
 }

 }  // namespace journal


@@ -3,8 +3,8 @@
 //
 #pragma once

 #include "server/journal/types.h"
+#include "util/fibers/detail/fiber_interface.h"
 #include "util/proactor_pool.h"

 namespace dfly {
@@ -35,11 +35,36 @@ class Journal {
   LSN GetLsn() const;

   void RecordEntry(TxId txid, Op opcode, DbIndex dbid, unsigned shard_cnt,
-                   std::optional<cluster::SlotId> slot, Entry::Payload payload, bool await);
+                   std::optional<cluster::SlotId> slot, Entry::Payload payload);
+
+  void SetFlushMode(bool allow_flush);

  private:
   mutable util::fb2::Mutex state_mu_;
 };

+class JournalFlushGuard {
+ public:
+  explicit JournalFlushGuard(Journal* journal) : journal_(journal) {
+    if (journal_) {
+      journal_->SetFlushMode(false);
+    }
+    util::fb2::detail::EnterFiberAtomicSection();
+  }
+
+  ~JournalFlushGuard() {
+    util::fb2::detail::LeaveFiberAtomicSection();
+    if (journal_) {
+      journal_->SetFlushMode(true);  // Restore the state on destruction
+    }
+  }
+
+  JournalFlushGuard(const JournalFlushGuard&) = delete;
+  JournalFlushGuard& operator=(const JournalFlushGuard&) = delete;
+
+ private:
+  Journal* journal_;
+};
+
 }  // namespace journal
 }  // namespace dfly
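
The guard pairs SetFlushMode(false)/SetFlushMode(true) with a fiber-atomic section, so a typical caller looks like the sketch below. RetireExpiredOnShard and expired_keys are hypothetical; the real call sites are the expiry and eviction paths shown elsewhere in this diff:

    void RetireExpiredOnShard(EngineShard* shard, DbIndex db_index,
                              const std::vector<std::string_view>& expired_keys) {
      // Ctor: SetFlushMode(false) on the journal (a null journal is tolerated)
      // and EnterFiberAtomicSection(), so nothing below can preempt.
      journal::JournalFlushGuard guard(shard->journal());

      for (std::string_view key : expired_keys) {
        if (shard->journal())
          RecordExpiry(db_index, key);  // aggregated, not yet flushed to sinks
        // ... erase the key from the prime/expire tables ...
      }
    }  // Dtor: leave the atomic section; SetFlushMode(true) flushes the batch.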


@@ -142,34 +142,37 @@ std::string_view JournalSlice::GetEntry(LSN lsn) const {
   return (*ring_buffer_)[lsn - start].data;
 }

-void JournalSlice::AddLogRecord(const Entry& entry, bool await) {
+void JournalSlice::SetFlushMode(bool allow_flush) {
+  DCHECK(allow_flush != enable_journal_flush_);
+  enable_journal_flush_ = allow_flush;
+  if (allow_flush) {
+    JournalItem item;
+    item.lsn = -1;
+    item.opcode = Op::NOOP;
+    item.data = "";
+    item.slot = {};
+    CallOnChange(item);
+  }
+}
+
+void JournalSlice::AddLogRecord(const Entry& entry) {
   DCHECK(ring_buffer_);

-  JournalItem dummy;
-  JournalItem* item;
-  if (entry.opcode == Op::NOOP) {
-    item = &dummy;
-    item->lsn = -1;
-    item->opcode = entry.opcode;
-    item->data = "";
-    item->slot = entry.slot;
-  } else {
+  JournalItem item;
+  {
     FiberAtomicGuard fg;
-    // GetTail gives a pointer to a new tail entry in the buffer, possibly overriding the last entry
-    // if the buffer is full.
-    item = &dummy;
-    item->opcode = entry.opcode;
-    item->lsn = lsn_++;
-    item->cmd = entry.payload.cmd;
-    item->slot = entry.slot;
+    item.opcode = entry.opcode;
+    item.lsn = lsn_++;
+    item.cmd = entry.payload.cmd;
+    item.slot = entry.slot;

     io::BufSink buf_sink{&ring_serialize_buf_};
     JournalWriter writer{&buf_sink};
     writer.Write(entry);

-    item->data = io::View(ring_serialize_buf_.InputBuffer());
+    item.data = io::View(ring_serialize_buf_.InputBuffer());
     ring_serialize_buf_.Clear();
-    VLOG(2) << "Writing item [" << item->lsn << "]: " << entry.ToString();
+    VLOG(2) << "Writing item [" << item.lsn << "]: " << entry.ToString();
   }

 #if 0
@@ -180,19 +183,17 @@ void JournalSlice::AddLogRecord(const Entry& entry, bool await) {
       file_offset_ += line.size();
   }
 #endif

-  // TODO: Remove the callbacks, replace with notifiers
-  {
-    std::shared_lock lk(cb_mu_);
-    DVLOG(2) << "AddLogRecord: run callbacks for " << entry.ToString()
-             << " num callbacks: " << change_cb_arr_.size();
+  CallOnChange(item);
+}
+
+void JournalSlice::CallOnChange(const JournalItem& item) {
+  std::shared_lock lk(cb_mu_);

   const size_t size = change_cb_arr_.size();
   auto k_v = change_cb_arr_.begin();
   for (size_t i = 0; i < size; ++i) {
-    k_v->second(*item, await);
+    k_v->second(item, enable_journal_flush_);
     ++k_v;
   }
 }
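
On the consumer side, the registered change callbacks now receive enable_journal_flush_ as their second argument instead of the old per-record await flag, and SetFlushMode(true) pushes a NOOP item through CallOnChange so consumers know to flush whatever they aggregated. A sketch of a conforming callback; Buffer and WriteBufferToSocket are hypothetical stand-ins for what JournalStreamer actually does:

    void OnJournalItem(const journal::JournalItem& item, bool allow_flush) {
      if (item.opcode != journal::Op::NOOP)
        Buffer(item.data);        // cheap append, must not preempt
      if (allow_flush)
        WriteBufferToSocket();    // only here may the fiber block/preempt
    }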


@@ -37,7 +37,7 @@ class JournalSlice {
     return slice_index_ != UINT32_MAX;
   }

-  void AddLogRecord(const Entry& entry, bool await);
+  void AddLogRecord(const Entry& entry);

   // Register a callback that will be called every time a new entry is
   // added to the journal.
@@ -54,8 +54,16 @@ class JournalSlice {
   /// from the buffer.
   bool IsLSNInBuffer(LSN lsn) const;
   std::string_view GetEntry(LSN lsn) const;

+  // SetFlushMode with allow_flush=false is used to disable preemptions during
+  // subsequent calls to AddLogRecord.
+  // SetFlushMode with allow_flush=true flushes all log records aggregated
+  // since the last call with allow_flush=false. This call may preempt.
+  // The caller must ensure that no preemptions occur between the initial call
+  // with allow_flush=false and the subsequent call with allow_flush=true.
+  void SetFlushMode(bool allow_flush);
+
  private:
+  void CallOnChange(const JournalItem& item);
   // std::string shard_path_;
   // std::unique_ptr<LinuxFile> shard_file_;
   std::optional<base::RingBuffer<JournalItem>> ring_buffer_;
@@ -69,6 +77,7 @@ class JournalSlice {
   uint32_t slice_index_ = UINT32_MAX;
   uint32_t next_cb_id_ = 1;
   std::error_code status_ec_;
+  bool enable_journal_flush_ = true;
 };

 }  // namespace journal
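
Note the DCHECK(allow_flush != enable_journal_flush_) in SetFlushMode: calls must strictly alternate, which implies JournalFlushGuard scopes must not nest on the same journal. A sketch of the constraint, assuming a non-null journal pointer:

    journal::JournalFlushGuard outer(journal);  // SetFlushMode(false): OK
    {
      // journal::JournalFlushGuard inner(journal);
      // Would call SetFlushMode(false) while flushing is already disabled
      // and trip the DCHECK in debug builds; keep one guard per scope.
    }
    // ~outer: SetFlushMode(true), flushing everything aggregated above.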


@@ -56,6 +56,7 @@ void JournalStreamer::Start(util::FiberSocketBase* dest, bool send_lsn) {
     if (allow_await) {
       ThrottleIfNeeded();
       // No record to write, just await if data was written so consumer will read the data.
+      // TODO: shouldn't we trigger an async write on NOOP?
       if (item.opcode == Op::NOOP)
         return;
     }


@@ -1474,15 +1474,15 @@ void Transaction::LogAutoJournalOnShard(EngineShard* shard, RunnableResult resul
   }

   // Record to journal autojournal commands, here we allow await which enables writing to sync
   // the journal change.
-  LogJournalOnShard(shard, std::move(entry_payload), unique_shard_cnt_, true);
+  LogJournalOnShard(shard, std::move(entry_payload), unique_shard_cnt_);
 }

 void Transaction::LogJournalOnShard(EngineShard* shard, journal::Entry::Payload&& payload,
-                                    uint32_t shard_cnt, bool allow_await) const {
+                                    uint32_t shard_cnt) const {
   auto journal = shard->journal();
   CHECK(journal);
   journal->RecordEntry(txid_, journal::Op::COMMAND, db_index_, shard_cnt,
-                       unique_slot_checker_.GetUniqueSlotId(), std::move(payload), allow_await);
+                       unique_slot_checker_.GetUniqueSlotId(), std::move(payload));
 }

 void Transaction::ReviveAutoJournal() {


@@ -341,8 +341,8 @@ class Transaction {
   void PrepareMultiForScheduleSingleHop(Namespace* ns, ShardId sid, DbIndex db, CmdArgList args);

   // Write a journal entry to a shard journal with the given payload.
-  void LogJournalOnShard(EngineShard* shard, journal::Entry::Payload&& payload, uint32_t shard_cnt,
-                         bool allow_await) const;
+  void LogJournalOnShard(EngineShard* shard, journal::Entry::Payload&& payload,
+                         uint32_t shard_cnt) const;

   // Re-enable auto journal for commands marked as NO_AUTOJOURNAL. Call during setup.
   void ReviveAutoJournal();


@@ -53,32 +53,21 @@ size_t ShardArgs::Size() const {
 void RecordJournal(const OpArgs& op_args, string_view cmd, const ShardArgs& args,
                    uint32_t shard_cnt) {
   VLOG(2) << "Logging command " << cmd << " from txn " << op_args.tx->txid();
-  op_args.tx->LogJournalOnShard(op_args.shard, Payload(cmd, args), shard_cnt, true);
+  op_args.tx->LogJournalOnShard(op_args.shard, Payload(cmd, args), shard_cnt);
 }

 void RecordJournal(const OpArgs& op_args, std::string_view cmd, facade::ArgSlice args,
                    uint32_t shard_cnt) {
   VLOG(2) << "Logging command " << cmd << " from txn " << op_args.tx->txid();
-  op_args.tx->LogJournalOnShard(op_args.shard, Payload(cmd, args), shard_cnt, true);
+  op_args.tx->LogJournalOnShard(op_args.shard, Payload(cmd, args), shard_cnt);
 }

-void RecordExpiry(DbIndex dbid, string_view key, bool preempts) {
+void RecordExpiry(DbIndex dbid, string_view key) {
   auto journal = EngineShard::tlocal()->journal();
   CHECK(journal);
-  if (!preempts) {
-    util::FiberAtomicGuard guard;
-    journal->RecordEntry(0, journal::Op::EXPIRED, dbid, 1, cluster::KeySlot(key),
-                         Payload("DEL", ArgSlice{key}), preempts);
-    return;
-  }
   journal->RecordEntry(0, journal::Op::EXPIRED, dbid, 1, cluster::KeySlot(key),
-                       Payload("DEL", ArgSlice{key}), preempts);
-}
-
-void TriggerJournalWriteToSink() {
-  auto journal = EngineShard::tlocal()->journal();
-  CHECK(journal);
-  journal->RecordEntry(0, journal::Op::NOOP, 0, 0, nullopt, {}, true);
+                       Payload("DEL", ArgSlice{key}));
 }

 LockTag::LockTag(std::string_view key) {
LockTag::LockTag(std::string_view key) { LockTag::LockTag(std::string_view key) {


@@ -224,7 +224,7 @@ void RecordJournal(const OpArgs& op_args, std::string_view cmd, ArgSlice args,

 // Record expiry in journal with independent transaction. Must be called from shard thread holding
 // key.
-void RecordExpiry(DbIndex dbid, std::string_view key, bool preempts = false);
+void RecordExpiry(DbIndex dbid, std::string_view key);

 // Trigger journal write to sink, no journal record will be added to journal.
 // Must be called from shard thread of journal to sink.