opt(server): Short-circuit ExecuteAsync(). (#1601)

* feat(server): Short-circuit ExecuteAsync(). * Do not leak (hopefully :) * Add documentation about coordinator_index_ * Use ServerState::tlocal()->thread_index()
2025-05-10 18:05:44 +02:00 · 2023-07-31 12:12:59 +03:00 · 2023-07-31 12:12:59 +03:00 · fba0800081
commit fba0800081
parent 4c85d5825d
2 changed files with 15 additions and 6 deletions
--- a/src/server/transaction.cc
+++ b/src/server/transaction.cc
@ -136,7 +136,7 @@ void Transaction::InitShardData(absl::Span<const PerShardCache> shard_index, siz
    sd.arg_count = si.args.size();
    sd.arg_start = args_.size();

-    // Multi transactions can re-intitialize on different shards, so clear ACTIVE flag.
+    // Multi transactions can re-initialize on different shards, so clear ACTIVE flag.
    if (multi_)
      sd.local_mask &= ~ACTIVE;

@ -212,7 +212,7 @@ void Transaction::StoreKeysInArgs(KeyIndex key_index, bool rev_mapping) {
 /**
 *
 * There are 4 options that we consider here:
- * a. T spans a single shard and its not multi.
+ * a. T spans a single shard and it's not multi.
 *    unique_shard_id_ is predefined before the schedule() is called.
 *    In that case only a single thread will be scheduled and it will use shard_data[0] just because
 *    shard_data.size() = 1. Coordinator thread can access any data because there is a
@ -828,7 +828,7 @@ void Transaction::ExecuteAsync() {
  // We do not necessarily Execute this transaction in 'cb' below. It well may be that it will be
  // executed by the engine shard once it has been armed and coordinator thread will finish the
  // transaction before engine shard thread stops accessing it. Therefore, we increase reference
-  // by number of callbacks accessesing 'this' to allow callbacks to execute shard->Execute(this);
+  // by number of callbacks accessing 'this' to allow callbacks to execute shard->Execute(this);
  // safely.
  use_count_.fetch_add(unique_shard_cnt_, memory_order_relaxed);

@ -844,6 +844,15 @@ void Transaction::ExecuteAsync() {
  // IsArmedInShard in other threads.
  run_count_.store(unique_shard_cnt_, memory_order_release);

+  // Execute inline when we can. We can't use coordinator_index_ because we may offload this
+  // function to run in a different thread.
+  if (unique_shard_cnt_ == 1 && ServerState::tlocal()->thread_index() == unique_shard_id_) {
+    DVLOG(1) << "Short-circuit ExecuteAsync " << DebugId();
+    EngineShard::tlocal()->PollExecution("exec_cb", this);
+    intrusive_ptr_release(this);  // against use_count_.fetch_add above.
+    return;
+  }
+
  // We verify seq lock has the same generation number. See below for more info.
  auto cb = [seq, this] {
    EngineShard* shard = EngineShard::tlocal();
@ -856,7 +865,7 @@ void Transaction::ExecuteAsync() {
      DVLOG(3) << "PollExecCb " << DebugId() << " sid(" << shard->shard_id() << ") "
               << run_count_.load(memory_order_relaxed);

-      // We also make sure that for mult-operation transactions like Multi/Eval
+      // We also make sure that for multi-operation transactions like Multi/Eval
      // this callback runs on a correct operation. We want to avoid a situation
      // where the first operation is executed and the second operation is armed and
      // now this callback from the previous operation finally runs and calls PollExecution.
--- a/src/server/transaction.h
+++ b/src/server/transaction.h
@ -131,7 +131,7 @@ class Transaction {
    // UNUSED = 1 << 1,
    OUT_OF_ORDER = 1 << 2,      // Whether it can run as out of order
    KEYLOCK_ACQUIRED = 1 << 3,  // Whether its key locks are acquired
-    SUSPENDED_Q = 1 << 4,       // Whether is suspened (by WatchInShard())
+    SUSPENDED_Q = 1 << 4,       // Whether is suspended (by WatchInShard())
    AWAKED_Q = 1 << 5,          // Whether it was awakened (by NotifySuspended())
    EXPIRED_Q = 1 << 6,         // Whether it timed out and should be dropped
    UNLOCK_MULTI = 1 << 7,      // Whether this shard executed UnlockMultiShardCb
@ -333,7 +333,7 @@ class Transaction {
    // this is the only variable that is accessed by both shard and coordinator threads.
    std::atomic_bool is_armed{false};

-    // We pad with some memory so that atomic loads won't cause false sharing betweem threads.
+    // We pad with some memory so that atomic loads won't cause false sharing between threads.
    char pad[46];  // to make sure PerShardData is 64 bytes and takes full cacheline.

    uint32_t arg_start = 0;  // Indices into args_ array.