dragonfly/src/server/snapshot.cc

// Copyright 2024, DragonflyDB authors. All rights reserved.
// See LICENSE for licensing terms.
//
#include "server/snapshot.h"
#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>
#include <mutex>
#include "base/flags.h"
#include "base/logging.h"
#include "core/heap_size.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/journal/journal.h"
#include "server/rdb_extensions.h"
#include "server/rdb_save.h"
#include "server/server_state.h"
#include "server/tiered_storage.h"
#include "util/fibers/synchronization.h"
namespace dfly {
using namespace std;
using namespace util;
using namespace chrono_literals;
using facade::operator""_MB;
using facade::operator""_KB;
namespace {
thread_local absl::flat_hash_set<SliceSnapshot*> tl_slice_snapshots;
constexpr size_t kMinBlobSize = 32_KB;
} // namespace
SliceSnapshot::SliceSnapshot(CompressionMode compression_mode, DbSlice* slice,
SnapshotDataConsumerInterface* consumer, Context* cntx)
: db_slice_(slice),
db_array_(slice->databases()),
compression_mode_(compression_mode),
consumer_(consumer),
cntx_(cntx) {
tl_slice_snapshots.insert(this);
}
SliceSnapshot::~SliceSnapshot() {
DCHECK(db_slice_->shard_owner()->IsMyThread());
tl_slice_snapshots.erase(this);
}
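// Sums the serialization buffer capacities of all snapshots registered on this thread.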
size_t SliceSnapshot::GetThreadLocalMemoryUsage() {
size_t mem = 0;
for (SliceSnapshot* snapshot : tl_slice_snapshots) {
mem += snapshot->GetBufferCapacity();
}
return mem;
}
bool SliceSnapshot::IsSnaphotInProgress() {
  return !tl_slice_snapshots.empty();
}
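// Starts the snapshot fiber. Registers an OnDbChange callback so buckets touched by concurrent
// writes are serialized before the change lands, optionally subscribes to the journal (for
// replication), and installs a flush callback that preempts on big values when
// serialization_max_chunk_size is configured.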
void SliceSnapshot::Start(bool stream_journal, SnapshotFlush allow_flush) {
DCHECK(!snapshot_fb_.IsJoinable());
auto db_cb = [this](DbIndex db_index, const DbSlice::ChangeReq& req) {
OnDbChange(db_index, req);
};
snapshot_version_ = db_slice_->RegisterOnChange(std::move(db_cb));
if (stream_journal) {
auto* journal = db_slice_->shard_owner()->journal();
DCHECK(journal);
auto journal_cb = [this](const journal::JournalItem& item, bool await) {
OnJournalEntry(item, await);
};
journal_cb_id_ = journal->RegisterOnChange(std::move(journal_cb));
}
const auto flush_threshold = ServerState::tlocal()->serialization_max_chunk_size;
std::function<void(size_t, RdbSerializer::FlushState)> flush_fun;
if (flush_threshold != 0 && allow_flush == SnapshotFlush::kAllow) {
flush_fun = [this, flush_threshold](size_t bytes_serialized,
RdbSerializer::FlushState flush_state) {
if (bytes_serialized > flush_threshold) {
size_t serialized = FlushSerialized(flush_state);
VLOG(2) << "FlushSerialized " << serialized << " bytes";
auto& stats = ServerState::tlocal()->stats;
++stats.big_value_preemptions;
}
};
}
serializer_ = std::make_unique<RdbSerializer>(compression_mode_, flush_fun);
VLOG(1) << "DbSaver::Start - saving entries with version less than " << snapshot_version_;
snapshot_fb_ = fb2::Fiber("snapshot", [this, stream_journal] {
this->IterateBucketsFb(stream_journal);
db_slice_->UnregisterOnChange(snapshot_version_);
consumer_->Finalize();
});
}
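// Starts an incremental snapshot that streams journal entries from start_lsn instead of
// traversing the prime tables.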
void SliceSnapshot::StartIncremental(LSN start_lsn) {
serializer_ = std::make_unique<RdbSerializer>(compression_mode_);
snapshot_fb_ = fb2::Fiber("incremental_snapshot",
[start_lsn, this] { this->SwitchIncrementalFb(start_lsn); });
}
// Called only for the replication use case.
void SliceSnapshot::FinalizeJournalStream(bool cancel) {
DVLOG(1) << "Finalize Snapshot";
DCHECK(db_slice_->shard_owner()->IsMyThread());
if (!journal_cb_id_) { // Finalize only once.
return;
}
uint32_t cb_id = journal_cb_id_;
journal_cb_id_ = 0;
// Wait for serialization to finish in any case.
snapshot_fb_.JoinIfNeeded();
auto* journal = db_slice_->shard_owner()->journal();
journal->UnregisterOnChange(cb_id);
if (!cancel) {
serializer_->SendJournalOffset(journal->GetLsn());
PushSerialized(true);
}
}
// The algorithm is to go over all the buckets and serialize those with
// version < snapshot_version_. To serialize each physical bucket exactly once, we update
// the bucket version to snapshot_version_ once it has been serialized.
// We handle serialization at physical-bucket granularity.
// To further complicate things, Table::Traverse covers a logical bucket that may comprise
// several physical buckets in the dash table. For example, items belonging to logical bucket 0
// can reside in buckets 0, 1 and stash buckets 56-59.
// PrimeTable::Traverse guarantees an atomic traversal of a single logical bucket.
// It also guarantees 100% coverage of all items that existed when the traversal started
// and survived until it finished.
// Serializes all the entries with version less than snapshot_version_.
void SliceSnapshot::IterateBucketsFb(bool send_full_sync_cut) {
{
auto fiber_name = absl::StrCat("SliceSnapshot-", ProactorBase::me()->GetPoolIndex());
ThisFiber::SetName(std::move(fiber_name));
}
PrimeTable::Cursor cursor;
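  // Precompute the total number of keys so GetCurrentSnapshotProgress() can report progress.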
for (DbIndex db_indx = 0; db_indx < db_array_.size(); ++db_indx) {
stats_.keys_total += db_slice_->DbSize(db_indx);
}
for (DbIndex db_indx = 0; db_indx < db_array_.size(); ++db_indx) {
if (cntx_->IsCancelled())
return;
if (!db_array_[db_indx])
continue;
uint64_t last_yield = 0;
PrimeTable* pt = &db_array_[db_indx]->prime;
VLOG(1) << "Start traversing " << pt->size() << " items for index " << db_indx;
do {
if (cntx_->IsCancelled()) {
return;
}
PrimeTable::Cursor next = pt->TraverseBuckets(
cursor, [this, &db_indx](auto it) { return BucketSaveCb(db_indx, it); });
cursor = next;
PushSerialized(false);
if (stats_.loop_serialized >= last_yield + 100) {
DVLOG(2) << "Before sleep " << ThisFiber::GetName();
ThisFiber::Yield();
DVLOG(2) << "After sleep";
last_yield = stats_.loop_serialized;
        // Push in case other fibers (write commands that pushed previous values)
        // have filled the buffer.
PushSerialized(false);
}
} while (cursor);
DVLOG(2) << "after loop " << ThisFiber::GetName();
PushSerialized(true);
} // for (dbindex)
CHECK(!serialize_bucket_running_);
if (send_full_sync_cut) {
CHECK(!serializer_->SendFullSyncCut());
PushSerialized(true);
}
// serialized + side_saved must be equal to the total saved.
VLOG(1) << "Exit SnapshotSerializer (loop_serialized/side_saved/cbcalls): "
<< stats_.loop_serialized << "/" << stats_.side_saved << "/" << stats_.savecb_calls;
}
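// Streams journal entries starting from lsn. If the requested tail is still in the journal
// buffer, finishes with a full-sync cut and switches to the OnJournalEntry callback; otherwise
// reports that a partial sync is not possible.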
void SliceSnapshot::SwitchIncrementalFb(LSN lsn) {
auto* journal = db_slice_->shard_owner()->journal();
DCHECK(journal);
DCHECK_LE(lsn, journal->GetLsn()) << "The replica tried to sync from the future.";
VLOG(1) << "Starting incremental snapshot from lsn=" << lsn;
  // The replica sends the LSN of the next entry it wants to receive.
while (!cntx_->IsCancelled() && journal->IsLSNInBuffer(lsn)) {
serializer_->WriteJournalEntry(journal->GetEntry(lsn));
PushSerialized(false);
lsn++;
}
VLOG(1) << "Last LSN sent in incremental snapshot was " << (lsn - 1);
  // This check is safe, but it is not trivially safe.
  // We rely here on the fact that JournalSlice::AddLogRecord can
  // only preempt while holding the callback lock.
  // That guarantees that, if we have processed the last LSN, the callback
  // will only be added after JournalSlice::AddLogRecord has finished
  // iterating its callbacks, so we won't process the record twice.
  // We have to make sure we don't preempt ourselves before registering the callback!
// GetLsn() is always the next lsn that we expect to create.
if (journal->GetLsn() == lsn) {
{
FiberAtomicGuard fg;
serializer_->SendFullSyncCut();
}
auto journal_cb = [this](const journal::JournalItem& item, bool await) {
OnJournalEntry(item, await);
};
journal_cb_id_ = journal->RegisterOnChange(std::move(journal_cb));
PushSerialized(true);
} else {
// We stopped but we didn't manage to send the whole stream.
cntx_->ReportError(
std::make_error_code(errc::state_not_recoverable),
absl::StrCat("Partial sync was unsuccessful because entry #", lsn,
" was dropped from the buffer. Current lsn=", journal->GetLsn()));
FinalizeJournalStream(true);
}
}
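// Traversal callback. Serializes the bucket if its version is older than snapshot_version_;
// buckets with a newer version were either serialized already or created after the snapshot
// started.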
bool SliceSnapshot::BucketSaveCb(DbIndex db_index, PrimeTable::bucket_iterator it) {
std::lock_guard guard(big_value_mu_);
++stats_.savecb_calls;
auto check = [&](auto v) {
if (v >= snapshot_version_) {
      // The bucket has either been serialized already or was added after snapshotting started.
DVLOG(3) << "Skipped " << it.segment_id() << ":" << it.bucket_id() << " at " << v;
++stats_.skipped;
return false;
}
return true;
};
if (!check(it.GetVersion())) {
return false;
}
db_slice_->FlushChangeToEarlierCallbacks(db_index, DbSlice::Iterator::FromPrime(it),
snapshot_version_);
auto* blocking_counter = db_slice_->BlockingCounter();
  // Locking this never preempts. We merely increment the underlying counter so that,
  // if SerializeBucket preempts, Heartbeat() won't run because the blocking counter is
  // not zero.
std::lock_guard blocking_counter_guard(*blocking_counter);
stats_.loop_serialized += SerializeBucket(db_index, it);
return false;
}
unsigned SliceSnapshot::SerializeBucket(DbIndex db_index, PrimeTable::bucket_iterator it) {
DCHECK_LT(it.GetVersion(), snapshot_version_);
  // Traverse the physical bucket and serialize all of its entries.
serialize_bucket_running_ = true;
it.SetVersion(snapshot_version_);
unsigned result = 0;
while (!it.is_done()) {
++result;
// might preempt due to big value serialization.
SerializeEntry(db_index, it->first, it->second);
++it;
}
serialize_bucket_running_ = false;
return result;
}
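// Serializes a single key/value pair. External (tiered) values cannot be awaited here, so a
// tiered read is scheduled and its result is collected in delayed_entries_, to be serialized
// later by PushSerialized.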
void SliceSnapshot::SerializeEntry(DbIndex db_indx, const PrimeKey& pk, const PrimeValue& pv) {
if (pv.IsExternal() && pv.IsCool())
return SerializeEntry(db_indx, pk, pv.GetCool().record->value);
time_t expire_time = 0;
if (pv.HasExpire()) {
auto eit = db_array_[db_indx]->expire.Find(pk);
expire_time = db_slice_->ExpireTime(eit);
}
uint32_t mc_flags = pv.HasFlag() ? db_slice_->GetMCFlag(db_indx, pk) : 0;
if (pv.IsExternal()) {
    // We can't block here, so we schedule a tiered read and append the result to
    // the delayed entries.
util::fb2::Future<PrimeValue> future;
EngineShard::tlocal()->tiered_storage()->Read(
db_indx, pk.ToString(), pv,
[future](const std::string& v) mutable { future.Resolve(PrimeValue(v)); });
delayed_entries_.push_back(
{db_indx, PrimeKey(pk.ToString()), std::move(future), expire_time, mc_flags});
++type_freq_map_[RDB_TYPE_STRING];
} else {
io::Result<uint8_t> res = serializer_->SaveEntry(pk, pv, expire_time, mc_flags, db_indx);
CHECK(res);
++type_freq_map_[*res];
}
}
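// Flushes the serializer into an in-memory blob and pushes it to the consumer. The condition
// variable below guarantees that blobs are pushed in strictly increasing record-id order, even
// when several fibers flush concurrently. May preempt.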
size_t SliceSnapshot::FlushSerialized(SerializerBase::FlushState flush_state) {
io::StringFile sfile;
serializer_->FlushToSink(&sfile, flush_state);
size_t serialized = sfile.val.size();
if (serialized == 0)
return 0;
uint64_t id = rec_id_++;
DVLOG(2) << "Pushing " << id;
fb2::NoOpLock lk;
  // We create a critical section here that ensures that records are pushed in sequential order.
  // As a result, it is not possible for two fiber producers to push concurrently.
  // If A.id = 5 and B.id = 6 are both blocked here, it means that last_pushed_id_ < 4.
  // Once last_pushed_id_ becomes 4, A is unblocked, while B waits until A finishes pushing and
  // updates last_pushed_id_ to 5.
seq_cond_.wait(lk, [&] { return id == this->last_pushed_id_ + 1; });
// Blocking point.
consumer_->ConsumeData(std::move(sfile.val), cntx_);
DCHECK_EQ(last_pushed_id_ + 1, id);
last_pushed_id_ = id;
seq_cond_.notify_all();
VLOG(2) << "Pushed with Serialize() " << serialized;
return serialized;
}
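// Pushes accumulated serialized data to the consumer. Unless force is set, does nothing until
// at least kMinBlobSize bytes have been serialized. Also awaits and serializes any delayed
// (tiered) entries, since this function is allowed to block.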
bool SliceSnapshot::PushSerialized(bool force) {
if (!force && serializer_->SerializedLen() < kMinBlobSize)
return false;
  // Flush any leftovers first to avoid interleaving them with the delayed entries below.
size_t serialized = FlushSerialized(FlushState::kFlushEndEntry);
if (!delayed_entries_.empty()) {
    // Async bucket serialization might have accumulated some delayed values.
    // Because we are finally allowed to block in this function, we await and serialize them.
do {
auto& entry = delayed_entries_.back();
      serializer_->SaveEntry(entry.key, entry.value.Get(), entry.expire, entry.mc_flags,
                             entry.dbid);
delayed_entries_.pop_back();
} while (!delayed_entries_.empty());
// blocking point.
serialized += FlushSerialized(FlushState::kFlushEndEntry);
}
return serialized > 0;
}
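// Change callback registered with DbSlice. Invoked before a write modifies a bucket, so the
// bucket's pre-change contents are serialized first if they were not serialized yet.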
void SliceSnapshot::OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req) {
std::lock_guard guard(big_value_mu_);
PrimeTable* table = db_slice_->GetTables(db_index).first;
const PrimeTable::bucket_iterator* bit = req.update();
if (bit) {
if (bit->GetVersion() < snapshot_version_) {
stats_.side_saved += SerializeBucket(db_index, *bit);
}
} else {
string_view key = get<string_view>(req.change);
table->CVCUponInsert(snapshot_version_, key, [this, db_index](PrimeTable::bucket_iterator it) {
DCHECK_LT(it.GetVersion(), snapshot_version_);
stats_.side_saved += SerializeBucket(db_index, it);
});
}
}
// For any key, any journal entry must arrive at the replica strictly after its first original
// RDB value. This is guaranteed by the fact that OnJournalEntry always runs after OnDbChange,
// and no database switch can be performed between those two calls, because they are part of
// one transaction.
void SliceSnapshot::OnJournalEntry(const journal::JournalItem& item, bool await) {
  // To let journal flushing sync after a non-auto-journal command is executed, we call
  // TriggerJournalWriteToSink. This call uses the NOOP opcode with await=true. Since there is
  // no additional journal change to serialize, it simply invokes PushSerialized.
std::lock_guard guard(big_value_mu_);
if (item.opcode != journal::Op::NOOP) {
serializer_->WriteJournalEntry(item.data);
}
if (await) {
    // This is the only place that flushes in streaming mode
    // once the iterate-buckets fiber has finished.
PushSerialized(false);
}
}
size_t SliceSnapshot::GetBufferCapacity() const {
if (serializer_ == nullptr) {
return 0;
}
return serializer_->GetBufferCapacity();
}
size_t SliceSnapshot::GetTempBuffersSize() const {
if (serializer_ == nullptr) {
return 0;
}
return serializer_->GetTempBufferSize();
}
RdbSaver::SnapshotStats SliceSnapshot::GetCurrentSnapshotProgress() const {
return {stats_.loop_serialized + stats_.side_saved, stats_.keys_total};
}
} // namespace dfly