From 843a40dba91495301ce28e92689f84bf09644ca3 Mon Sep 17 00:00:00 2001 From: Kostas Kyrimis Date: Wed, 7 May 2025 10:15:14 +0300 Subject: [PATCH] chore(metrics): add rdb_bgsave_in_progress and rdb_last_bgsave_status (#5061) * add metric rdb_bgsave_in_progress * add metric rdb_last_bgsave_status --- src/server/detail/save_stages_controller.h | 10 +++++++-- src/server/server_family.cc | 26 ++++++++++++++++++---- src/server/server_family.h | 10 ++++++++- 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/src/server/detail/save_stages_controller.h b/src/server/detail/save_stages_controller.h index 564a672ac..0cb17f7a9 100644 --- a/src/server/detail/save_stages_controller.h +++ b/src/server/detail/save_stages_controller.h @@ -35,6 +35,8 @@ struct SaveStagesInputs { Service* service_; util::fb2::FiberQueueThreadPool* fq_threadpool_; std::shared_ptr snapshot_storage_; + // true if the command that triggered this flow is bgsave. false otherwise. + bool is_bg_save_; }; class RdbSnapshot { @@ -77,7 +79,7 @@ class RdbSnapshot { }; struct SaveStagesController : public SaveStagesInputs { - SaveStagesController(SaveStagesInputs&& input); + explicit SaveStagesController(SaveStagesInputs&& input); // Objects of this class are used concurrently. Call this function // in a mutually exlusive context to avoid data races. // Also call this function before any call to `WaitAllSnapshots` @@ -97,6 +99,10 @@ struct SaveStagesController : public SaveStagesInputs { uint32_t GetCurrentSaveDuration(); RdbSaver::SnapshotStats GetCurrentSnapshotProgress() const; + bool IsBgSave() const { + return is_bg_save_; + } + private: // In the new version (.dfs) we store a file for every shard and one more summary file. // Summary file is always last in snapshots array. 
@@ -126,7 +132,6 @@ struct SaveStagesController : public SaveStagesInputs { void RunStage(void (SaveStagesController::*cb)(unsigned)); - private: time_t start_time_; std::filesystem::path full_path_; @@ -135,6 +140,7 @@ struct SaveStagesController : public SaveStagesInputs { absl::flat_hash_map rdb_name_map_; util::fb2::Mutex rdb_name_map_mu_; + // is_bg_save_ is inherited from SaveStagesInputs; redeclaring it here would shadow it and make IsBgSave() always return false. }; GenericError ValidateFilename(const std::filesystem::path& filename, bool new_version); diff --git a/src/server/server_family.cc b/src/server/server_family.cc index 9c7129cdf..967d15db0 100644 --- a/src/server/server_family.cc +++ b/src/server/server_family.cc @@ -1707,7 +1707,8 @@ GenericError ServerFamily::DoSave(bool ignore_state) { } GenericError ServerFamily::DoSaveCheckAndStart(const SaveCmdOptions& save_cmd_opts, - Transaction* trans, bool ignore_state) { + Transaction* trans, DoSaveCheckAndStartOpts opts) { + auto [ignore_state, bg_save] = opts; auto state = ServerState::tlocal()->gstate(); // In some cases we want to create a snapshot even if server is not active, f.e in takeover @@ -1728,7 +1729,7 @@ GenericError ServerFamily::DoSaveCheckAndStart(const SaveCmdOptions& save_cmd_op save_controller_ = make_unique(detail::SaveStagesInputs{ save_cmd_opts.new_version, save_cmd_opts.cloud_uri, save_cmd_opts.basename, trans, - &service_, fq_threadpool_.get(), snapshot_storage}); + &service_, fq_threadpool_.get(), snapshot_storage, opts.bg_save}); auto res = save_controller_->InitResourcesAndStart(); @@ -1736,8 +1737,13 @@ GenericError ServerFamily::DoSaveCheckAndStart(const SaveCmdOptions& save_cmd_op DCHECK_EQ(res->error, true); last_save_info_.SetLastSaveError(*res); save_controller_.reset(); + if (bg_save) { + last_save_info_.last_bgsave_status = false; + } return res->error; } + + last_save_info_.bgsave_in_progress = bg_save; } return {}; } @@ -1751,6 +1757,11 @@ GenericError ServerFamily::WaitUntilSaveFinished(Transaction* trans, bool ignore util::fb2::LockGuard lk(save_mu_); save_info =
save_controller_->Finalize(); + if (save_controller_->IsBgSave()) { + last_save_info_.bgsave_in_progress = false; + last_save_info_.last_bgsave_status = !save_info.error; + } + if (save_info.error) { last_save_info_.SetLastSaveError(save_info); } else { @@ -1767,7 +1778,8 @@ GenericError ServerFamily::WaitUntilSaveFinished(Transaction* trans, bool ignore GenericError ServerFamily::DoSave(const SaveCmdOptions& save_cmd_opts, Transaction* trans, bool ignore_state) { - if (auto ec = DoSaveCheckAndStart(save_cmd_opts, trans, ignore_state); ec) { + DoSaveCheckAndStartOpts opts{.ignore_state = ignore_state}; + if (auto ec = DoSaveCheckAndStart(save_cmd_opts, trans, opts); ec) { return ec; } @@ -2182,7 +2194,8 @@ void ServerFamily::BgSave(CmdArgList args, const CommandContext& cmd_cntx) { return; } - if (auto ec = DoSaveCheckAndStart(*maybe_res, cmd_cntx.tx); ec) { + DoSaveCheckAndStartOpts opts{.bg_save = true}; + if (auto ec = DoSaveCheckAndStart(*maybe_res, cmd_cntx.tx, opts); ec) { cmd_cntx.rb->SendError(ec.Format()); return; } @@ -2621,6 +2634,11 @@ string ServerFamily::FormatInfoMetrics(const Metrics& m, std::string_view sectio } append("rdb_changes_since_last_success_save", m.events.update); + auto save = GetLastSaveInfo(); + append("rdb_bgsave_in_progress", static_cast(save.bgsave_in_progress)); + std::string val = save.last_bgsave_status ? "ok" : "err"; + append("rdb_last_bgsave_status", val); + // when last failed save append("last_failed_save", save_info.last_error_time); append("last_error", save_info.last_error.Format()); diff --git a/src/server/server_family.h b/src/server/server_family.h index 013a75d2c..66ca716a5 100644 --- a/src/server/server_family.h +++ b/src/server/server_family.h @@ -146,6 +146,9 @@ struct LastSaveInfo { GenericError last_error; time_t last_error_time = 0; // epoch time in seconds. time_t failed_duration_sec = 0; // epoch time in seconds. 
+ // Outcome of the most recent BGSAVE: false if it failed, true otherwise (including before any BGSAVE has run). + bool last_bgsave_status = true; + bool bgsave_in_progress = false; }; struct SnapshotSpec { @@ -340,8 +343,13 @@ class ServerFamily { void BgSaveFb(boost::intrusive_ptr trans); + struct DoSaveCheckAndStartOpts { + bool ignore_state = false; + bool bg_save = false; + }; + GenericError DoSaveCheckAndStart(const SaveCmdOptions& save_cmd_opts, Transaction* trans, - bool ignore_state = false) ABSL_LOCKS_EXCLUDED(save_mu_); + DoSaveCheckAndStartOpts opts) ABSL_LOCKS_EXCLUDED(save_mu_); GenericError WaitUntilSaveFinished(Transaction* trans, bool ignore_state = false) ABSL_NO_THREAD_SAFETY_ANALYSIS;