chore: add oom stats to /metrics (#2680)

* chore: add oom stats to /metrics. Expose oom/cmd errors when we reject executing a command because we have reached the OOM state (controlled by the oom_deny_ratio flag). Expose oom/insert errors when we do not insert a new key or do not grow a dashtable (controlled by table_growth_margin). Move the OOM command check to a place that covers all types of transactions, including multi and squashing transactions. --------- Signed-off-by: Roman Gershman <roman@dragonflydb.io>
2025-05-11 10:25:47 +02:00 · 2024-03-03 20:01:21 +02:00 · 2024-03-03 20:01:21 +02:00 · 0c1150956b
commit 0c1150956b
parent 7c443f3a15
4 changed files with 25 additions and 12 deletions
--- a/src/server/main_service.cc
+++ b/src/server/main_service.cc
@ -968,7 +968,19 @@ static optional<ErrorReply> VerifyConnectionAclStatus(const CommandId* cid,
 optional<ErrorReply> Service::VerifyCommandExecution(const CommandId* cid,
                                                     const ConnectionContext* cntx,
                                                     CmdArgList tail_args) {
-  // TODO: Move OOM check here
+  ServerState& etl = *ServerState::tlocal();
  if ((cid->opt_mask() & CO::DENYOOM) && etl.is_master) {
    uint64_t start_ns = absl::GetCurrentTimeNanos();
    uint64_t used_memory = etl.GetUsedMemory(start_ns);
    double oom_deny_ratio = GetFlag(FLAGS_oom_deny_ratio);
    if (used_memory > (max_memory_limit * oom_deny_ratio)) {
      etl.stats.oom_error_cmd_cnt++;
      return facade::ErrorReply{kOutOfMemory};
    }
  }
  return VerifyConnectionAclStatus(cid, cntx, "ACL rules changed between the MULTI and EXEC",
                                   tail_args);
 }
@ -1136,16 +1148,6 @@ void Service::DispatchCommand(CmdArgList args, facade::ConnectionContext* cntx)
    return cntx->SendSimpleString("QUEUED");
  }
  if (cid->opt_mask() & CO::DENYOOM && etl.is_master) {
    uint64_t start_ns = absl::GetCurrentTimeNanos();
    uint64_t used_memory = etl.GetUsedMemory(start_ns);
    double oom_deny_ratio = GetFlag(FLAGS_oom_deny_ratio);
    if (used_memory > (max_memory_limit * oom_deny_ratio)) {
      return cntx->reply_builder()->SendError(kOutOfMemory);
    }
  }
  // Create command transaction
  intrusive_ptr<Transaction> dist_trans;
--- a/src/server/server_family.cc
+++ b/src/server/server_family.cc
@ -1025,6 +1025,13 @@ void PrintPrometheusMetrics(const Metrics& m, StringResponse* resp) {
                            &resp->body());
  AppendMetricWithoutLabels("memory_max_bytes", "", max_memory_limit, MetricType::GAUGE,
                            &resp->body());
  if (m.events.insertion_rejections | m.coordinator_stats.oom_error_cmd_cnt) {
    AppendMetricValue("oom_errors_total", m.events.insertion_rejections, {"type"}, {"insert"},
                      &resp->body());
    AppendMetricValue("oom_errors_total", m.coordinator_stats.oom_error_cmd_cnt, {"type"}, {"cmd"},
                      &resp->body());
  }
  if (sdata_res.has_value()) {
    size_t rss = sdata_res->vm_rss + sdata_res->hugetlb_pages;
    AppendMetricWithoutLabels("used_memory_rss_bytes", "", rss, MetricType::GAUGE, &resp->body());
--- a/src/server/server_state.cc
+++ b/src/server/server_state.cc
@ -28,7 +28,7 @@ ServerState::Stats::Stats(unsigned num_shards) : tx_width_freq_arr(num_shards) {
 }
 ServerState::Stats& ServerState::Stats::Add(const ServerState::Stats& other) {
-  static_assert(sizeof(Stats) == 14 * 8, "Stats size mismatch");
+  static_assert(sizeof(Stats) == 15 * 8, "Stats size mismatch");
  for (int i = 0; i < NUM_TX_TYPES; ++i) {
    this->tx_type_cnt[i] += other.tx_type_cnt[i];
@ -44,6 +44,7 @@ ServerState::Stats& ServerState::Stats::Add(const ServerState::Stats& other) {
  this->multi_squash_exec_reply_usec += other.multi_squash_exec_reply_usec;
  this->blocked_on_interpreter += other.blocked_on_interpreter;
  this->oom_error_cmd_cnt += other.oom_error_cmd_cnt;
  if (this->tx_width_freq_arr.size() > 0) {
    DCHECK_EQ(this->tx_width_freq_arr.size(), other.tx_width_freq_arr.size());
--- a/src/server/server_state.h
+++ b/src/server/server_state.h
@ -118,6 +118,9 @@ class ServerState {  // public struct - to allow initialization.
    uint64_t blocked_on_interpreter = 0;
    // Number of times we rejected command dispatch due to OOM condition.
    uint64_t oom_error_cmd_cnt = 0;
    std::valarray<uint64_t> tx_width_freq_arr;
  };