chore: add oom stats to /metrics (#2680)

* chore: add oom stats to /metrics

Expose oom/cmd errors: the number of commands we refused to execute because the server had reached the OOM state (controlled by the oom_deny_ratio flag).
Expose oom/insert errors: the number of times we did not insert a new key or did not grow a dashtable (controlled by table_growth_margin).

Move the OOM command check from Service::DispatchCommand to Service::VerifyCommandExecution so that it covers all types of transactions, including multi and squashing transactions.
 
---------

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
Roman Gershman 2024-03-03 20:01:21 +02:00 committed by GitHub
parent 7c443f3a15
commit 0c1150956b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 25 additions and 12 deletions

View file

@ -968,7 +968,19 @@ static optional<ErrorReply> VerifyConnectionAclStatus(const CommandId* cid,
optional<ErrorReply> Service::VerifyCommandExecution(const CommandId* cid, optional<ErrorReply> Service::VerifyCommandExecution(const CommandId* cid,
const ConnectionContext* cntx, const ConnectionContext* cntx,
CmdArgList tail_args) { CmdArgList tail_args) {
// TODO: Move OOM check here ServerState& etl = *ServerState::tlocal();
if ((cid->opt_mask() & CO::DENYOOM) && etl.is_master) {
uint64_t start_ns = absl::GetCurrentTimeNanos();
uint64_t used_memory = etl.GetUsedMemory(start_ns);
double oom_deny_ratio = GetFlag(FLAGS_oom_deny_ratio);
if (used_memory > (max_memory_limit * oom_deny_ratio)) {
etl.stats.oom_error_cmd_cnt++;
return facade::ErrorReply{kOutOfMemory};
}
}
return VerifyConnectionAclStatus(cid, cntx, "ACL rules changed between the MULTI and EXEC", return VerifyConnectionAclStatus(cid, cntx, "ACL rules changed between the MULTI and EXEC",
tail_args); tail_args);
} }
@ -1136,16 +1148,6 @@ void Service::DispatchCommand(CmdArgList args, facade::ConnectionContext* cntx)
return cntx->SendSimpleString("QUEUED"); return cntx->SendSimpleString("QUEUED");
} }
if (cid->opt_mask() & CO::DENYOOM && etl.is_master) {
uint64_t start_ns = absl::GetCurrentTimeNanos();
uint64_t used_memory = etl.GetUsedMemory(start_ns);
double oom_deny_ratio = GetFlag(FLAGS_oom_deny_ratio);
if (used_memory > (max_memory_limit * oom_deny_ratio)) {
return cntx->reply_builder()->SendError(kOutOfMemory);
}
}
// Create command transaction // Create command transaction
intrusive_ptr<Transaction> dist_trans; intrusive_ptr<Transaction> dist_trans;

View file

@ -1025,6 +1025,13 @@ void PrintPrometheusMetrics(const Metrics& m, StringResponse* resp) {
&resp->body()); &resp->body());
AppendMetricWithoutLabels("memory_max_bytes", "", max_memory_limit, MetricType::GAUGE, AppendMetricWithoutLabels("memory_max_bytes", "", max_memory_limit, MetricType::GAUGE,
&resp->body()); &resp->body());
if (m.events.insertion_rejections | m.coordinator_stats.oom_error_cmd_cnt) {
AppendMetricValue("oom_errors_total", m.events.insertion_rejections, {"type"}, {"insert"},
&resp->body());
AppendMetricValue("oom_errors_total", m.coordinator_stats.oom_error_cmd_cnt, {"type"}, {"cmd"},
&resp->body());
}
if (sdata_res.has_value()) { if (sdata_res.has_value()) {
size_t rss = sdata_res->vm_rss + sdata_res->hugetlb_pages; size_t rss = sdata_res->vm_rss + sdata_res->hugetlb_pages;
AppendMetricWithoutLabels("used_memory_rss_bytes", "", rss, MetricType::GAUGE, &resp->body()); AppendMetricWithoutLabels("used_memory_rss_bytes", "", rss, MetricType::GAUGE, &resp->body());

View file

@ -28,7 +28,7 @@ ServerState::Stats::Stats(unsigned num_shards) : tx_width_freq_arr(num_shards) {
} }
ServerState::Stats& ServerState::Stats::Add(const ServerState::Stats& other) { ServerState::Stats& ServerState::Stats::Add(const ServerState::Stats& other) {
static_assert(sizeof(Stats) == 14 * 8, "Stats size mismatch"); static_assert(sizeof(Stats) == 15 * 8, "Stats size mismatch");
for (int i = 0; i < NUM_TX_TYPES; ++i) { for (int i = 0; i < NUM_TX_TYPES; ++i) {
this->tx_type_cnt[i] += other.tx_type_cnt[i]; this->tx_type_cnt[i] += other.tx_type_cnt[i];
@ -44,6 +44,7 @@ ServerState::Stats& ServerState::Stats::Add(const ServerState::Stats& other) {
this->multi_squash_exec_reply_usec += other.multi_squash_exec_reply_usec; this->multi_squash_exec_reply_usec += other.multi_squash_exec_reply_usec;
this->blocked_on_interpreter += other.blocked_on_interpreter; this->blocked_on_interpreter += other.blocked_on_interpreter;
this->oom_error_cmd_cnt += other.oom_error_cmd_cnt;
if (this->tx_width_freq_arr.size() > 0) { if (this->tx_width_freq_arr.size() > 0) {
DCHECK_EQ(this->tx_width_freq_arr.size(), other.tx_width_freq_arr.size()); DCHECK_EQ(this->tx_width_freq_arr.size(), other.tx_width_freq_arr.size());

View file

@ -118,6 +118,9 @@ class ServerState { // public struct - to allow initialization.
uint64_t blocked_on_interpreter = 0; uint64_t blocked_on_interpreter = 0;
// Number of times we rejected command dispatch due to OOM condition.
uint64_t oom_error_cmd_cnt = 0;
std::valarray<uint64_t> tx_width_freq_arr; std::valarray<uint64_t> tx_width_freq_arr;
}; };