From 1b51e82e55efccbe6e35a2907295c4a7e415322b Mon Sep 17 00:00:00 2001 From: Vladislav Date: Sun, 18 Feb 2024 15:36:14 +0300 Subject: [PATCH] chore(transaction): Add debug stats for fail printing (#2600) * chore(transaction): Add debug stats for per shard data --------- Signed-off-by: Vladislav Oleshko --- src/server/transaction.cc | 30 ++++++++++++++++++++++++++---- src/server/transaction.h | 16 +++++++++++++++- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/src/server/transaction.cc b/src/server/transaction.cc index 988113de6..59c47b931 100644 --- a/src/server/transaction.cc +++ b/src/server/transaction.cc @@ -111,7 +111,7 @@ void Transaction::PhasedBarrier::Dec(Transaction* keep_alive) { ::boost::intrusive_ptr guard(keep_alive); uint32_t before = count_.fetch_sub(1); - CHECK_GE(before, 1u); + CHECK_GE(before, 1u) << keep_alive->DEBUG_PrintFailState(EngineShard::tlocal()->shard_id()); if (before == 1) ec_.notify(); } @@ -209,6 +209,7 @@ void Transaction::InitBase(DbIndex dbid, CmdArgList args) { db_index_ = dbid; full_args_ = args; local_result_ = OpStatus::OK; + stats_.coordinator_index = ProactorBase::me() ? ProactorBase::me()->GetPoolIndex() : kInvalidSid; } void Transaction::InitGlobal() { @@ -600,10 +601,12 @@ bool Transaction::RunInShard(EngineShard* shard, bool txq_ooo) { unsigned idx = SidToId(shard->shard_id()); auto& sd = shard_data_[idx]; - CHECK(sd.local_mask & ARMED); + CHECK(sd.local_mask & ARMED) << DEBUG_PrintFailState(shard->shard_id()); sd.local_mask &= ~ARMED; - CHECK_GT(run_barrier_.DEBUG_Count(), 0u); + sd.stats.total_runs++; + + DCHECK_GT(run_barrier_.DEBUG_Count(), 0u); VLOG(2) << "RunInShard: " << DebugId() << " sid:" << shard->shard_id() << " " << sd.local_mask; bool was_suspended = sd.local_mask & SUSPENDED_Q; @@ -743,6 +746,8 @@ void Transaction::ScheduleInternal() { // Loop until successfully scheduled in all shards. while (true) { + stats_.schedule_attempts++; + txid_ = op_seq.fetch_add(1, memory_order_relaxed); InitTxTime(); @@ -1021,6 +1026,21 @@ void Transaction::FIX_ConcludeJournalExec() { MultiReportJournalOnShard(EngineShard::tlocal()); } +string Transaction::DEBUG_PrintFailState(ShardId sid) const { + auto res = StrCat( + "usc: ", unique_shard_cnt_, ", name:", GetCId()->name(), + ", usecnt:", use_count_.load(memory_order_relaxed), ", runcnt: ", run_barrier_.DEBUG_Count(), + ", coordstate: ", coordinator_state_, ", coord native thread: ", stats_.coordinator_index, + ", schedule attempts: ", stats_.schedule_attempts, ", report from sid: ", sid, "\n"); + std::atomic_thread_fence(memory_order_acquire); + for (unsigned i = 0; i < shard_data_.size(); ++i) { + const auto& sd = shard_data_[i]; + absl::StrAppend(&res, "- shard: ", i, " local_mask:", sd.local_mask, + " total_runs: ", sd.stats.total_runs, "\n"); + } + return res; +} + void Transaction::EnableShard(ShardId sid) { unique_shard_cnt_ = 1; unique_shard_id_ = sid; @@ -1048,9 +1068,11 @@ Transaction::RunnableResult Transaction::RunQuickie(EngineShard* shard) { DVLOG(1) << "RunQuickSingle " << DebugId() << " " << shard->shard_id(); DCHECK(cb_ptr_) << DebugId() << " " << shard->shard_id(); - CHECK(sd.local_mask & ARMED); + CHECK(sd.local_mask & ARMED) << DEBUG_PrintFailState(shard->shard_id()); sd.local_mask &= ~ARMED; + sd.stats.total_runs++; + // Calling the callback in somewhat safe way RunnableResult result; try { diff --git a/src/server/transaction.h b/src/server/transaction.h index b8ac0c0e8..f1ea955b0 100644 --- a/src/server/transaction.h +++ b/src/server/transaction.h @@ -354,6 +354,9 @@ class Transaction { // Send journal EXEC opcode after a series of MULTI commands on the currently active shard void FIX_ConcludeJournalExec(); + // Print in-dept failure state for debugging. + std::string DEBUG_PrintFailState(ShardId sid) const; + private: // Holds number of locks for each IntentLock::Mode: shared and exlusive. struct LockCnt { @@ -382,8 +385,13 @@ class Transaction { // Index of key relative to args in shard that the shard was woken up after blocking wait. uint16_t wake_key_pos = UINT16_MAX; + // Irrational stats purely for debugging purposes. + struct Stats { + unsigned total_runs = 0; // total number of runs + } stats; + // Prevent "false sharing" between cache lines: occupy a full cache line (64 bytes) - char pad[64 - 4 * sizeof(uint32_t)]; + char pad[64 - 4 * sizeof(uint32_t) - sizeof(Stats)]; }; static_assert(sizeof(PerShardData) == 64); // cacheline @@ -626,6 +634,12 @@ class Transaction { OpStatus local_result_ = OpStatus::OK; absl::base_internal::SpinLock local_result_mu_; + // Stats purely for debugging purposes + struct Stats { + size_t schedule_attempts = 0; + ShardId coordinator_index = 0; + } stats_; + private: struct TLTmpSpace { std::vector& GetShardIndex(unsigned size);