diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 25fe6f9db..3cd962adb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -78,7 +78,7 @@ jobs:
           echo Run ctest -V -L DFLY
           #GLOG_logtostderr=1 GLOG_vmodule=transaction=1,engine_shard_set=1
           GLOG_logtostderr=1 GLOG_vmodule=rdb_load=1,rdb_save=1,snapshot=1 ctest -V -L DFLY
-          ./dragonfly_test --mem_defrag_threshold=0.05  # trying to catch issue with defrag
+          ./dragonfly_test --gtest_repeat=100
           # GLOG_logtostderr=1 GLOG_vmodule=transaction=1,engine_shard_set=1 CTEST_OUTPUT_ON_FAILURE=1 ninja server/test
   lint-test-chart:
     runs-on: ubuntu-latest
diff --git a/src/server/dragonfly_test.cc b/src/server/dragonfly_test.cc
index b920646ba..a2e42a636 100644
--- a/src/server/dragonfly_test.cc
+++ b/src/server/dragonfly_test.cc
@@ -471,6 +471,33 @@ TEST_F(DflyEngineTest, EvalBug713) {
   fb0.Join();
 }
 
+// Tests a deadlock that happened because trans->Schedule() was called
+// before interpreter->Lock().
+//
+// The problematic scenario:
+// 1. Transaction 1 schedules itself and blocks on the interpreter lock.
+// 2. Transaction 2 schedules itself; meanwhile the interpreter is unlocked and
+//    transaction 2 grabs the lock, but it cannot progress because transaction 1
+//    was scheduled before it.
+TEST_F(DflyEngineTest, EvalBug713b) {
+  const char* script = "return redis.call('get', KEYS[1])";
+
+  const uint32_t kNumFibers = 20;
+  fibers_ext::Fiber fibers[kNumFibers];
+
+  for (unsigned j = 0; j < kNumFibers; ++j) {
+    fibers[j] = pp_->at(1)->LaunchFiber([=, this] {
+      for (unsigned i = 0; i < 50; ++i) {
+        Run(StrCat("fb", j), {"eval", script, "3", kKeySid0, kKeySid1, kKeySid2});
+      }
+    });
+  }
+
+  for (unsigned j = 0; j < kNumFibers; ++j) {
+    fibers[j].Join();
+  }
+}
+
 TEST_F(DflyEngineTest, EvalSha) {
   auto resp = Run({"script", "load", "return 5"});
   EXPECT_THAT(resp, ArgType(RespExpr::STRING));
diff --git a/src/server/main_service.cc b/src/server/main_service.cc
index b5e9d5508..d74595573 100644
--- a/src/server/main_service.cc
+++ b/src/server/main_service.cc
@@ -906,6 +906,8 @@ void Service::Unwatch(CmdArgList args, ConnectionContext* cntx) {
 void Service::CallFromScript(CmdArgList args, ObjectExplorer* reply, ConnectionContext* cntx) {
   DCHECK(cntx->transaction);
 
+  DVLOG(1) << "CallFromScript " << cntx->transaction->DebugId() << " " << ArgS(args, 0);
+
   InterpreterReplier replier(reply);
   facade::SinkReplyBuilder* orig = cntx->Inject(&replier);
 
@@ -1013,11 +1015,11 @@ void Service::EvalInternal(const EvalArgs& eval_args, Interpreter* interpreter,
   }
 
   DCHECK(cntx->transaction);
+  auto lk = interpreter->Lock();
+
   if (!eval_args.keys.empty())
     cntx->transaction->Schedule();
 
-  auto lk = interpreter->Lock();
-
   interpreter->SetGlobalArray("KEYS", eval_args.keys);
   interpreter->SetGlobalArray("ARGV", eval_args.args);
   interpreter->SetRedisFunc(
@@ -1039,12 +1041,12 @@ void Service::EvalInternal(const EvalArgs& eval_args, Interpreter* interpreter,
   CHECK(result == Interpreter::RUN_OK);
 
   EvalSerializer ser{static_cast<RedisReplyBuilder*>(cntx->reply_builder())};
-
   if (!interpreter->IsResultSafe()) {
     (*cntx)->SendError("reached lua stack limit");
   } else {
     interpreter->SerializeResult(&ser);
   }
+
   interpreter->ResetStack();
 }
 
diff --git a/src/server/transaction.cc b/src/server/transaction.cc
index 5498cce2c..88fe86f6b 100644
--- a/src/server/transaction.cc
+++ b/src/server/transaction.cc
@@ -904,7 +904,7 @@ pair<bool, bool> Transaction::ScheduleInShard(EngineShard* shard) {
     // All transactions in the queue must acquire the intent lock.
     lock_granted = shard->db_slice().Acquire(mode, lock_args) && shard_unlocked;
     sd.local_mask |= KEYLOCK_ACQUIRED;
-    DVLOG(1) << "Lock granted " << lock_granted << " for trans " << DebugId();
+    DVLOG(2) << "Lock granted " << lock_granted << " for trans " << DebugId();
   }
 
   if (!txq->Empty()) {
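
For reference, below is a minimal standalone sketch of the lock-ordering bug this diff fixes. It is NOT Dragonfly code: FifoQueue is a hypothetical stand-in for the shard transaction queue (only the head entry may make progress), interp_mu stands in for the mutex taken by Interpreter::Lock(), and RunScriptBuggy/RunScriptFixed are invented names contrasting the pre-fix order (Schedule, then Lock) with the post-fix order (Lock, then Schedule).

// Minimal standalone sketch (NOT Dragonfly code) of the deadlock fixed above.
#include <condition_variable>
#include <cstdint>
#include <deque>
#include <iostream>
#include <mutex>
#include <thread>

// Stand-in for the shard transaction queue: strict FIFO, only the head runs.
class FifoQueue {
 public:
  uint64_t Enter() {                // models trans->Schedule()
    std::lock_guard lk(mu_);
    tickets_.push_back(next_);
    return next_++;
  }
  void WaitTurn(uint64_t ticket) {  // block until our ticket reaches the head
    std::unique_lock lk(mu_);
    cv_.wait(lk, [&] { return tickets_.front() == ticket; });
  }
  void Leave() {                    // models the transaction finishing
    std::lock_guard lk(mu_);
    tickets_.pop_front();
    cv_.notify_all();
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::deque<uint64_t> tickets_;
  uint64_t next_ = 0;
};

FifoQueue txq;         // hypothetical stand-in for the shard transaction queue
std::mutex interp_mu;  // hypothetical stand-in for Interpreter::Lock()

// Pre-fix order: schedule first, lock second. If fiber A enqueues and then
// blocks on interp_mu while fiber B (enqueued after A) wins interp_mu, B waits
// forever for A to leave the queue and A waits forever for B's mutex: deadlock.
void RunScriptBuggy() {
  uint64_t ticket = txq.Enter();  // trans->Schedule()
  std::lock_guard lk(interp_mu);  // interpreter->Lock() -- taken too late
  txq.WaitTurn(ticket);
  txq.Leave();
}

// Post-fix order (this diff): lock first, schedule second. The mutex holder is
// always the earliest script transaction in the queue, so it reaches the head,
// runs, and only then hands the mutex to the next fiber.
void RunScriptFixed() {
  std::lock_guard lk(interp_mu);  // interpreter->Lock()
  uint64_t ticket = txq.Enter();  // trans->Schedule()
  txq.WaitTurn(ticket);
  txq.Leave();
}

int main() {
  // Two concurrent "scripts": with RunScriptBuggy this pairing can hang,
  // with RunScriptFixed it always completes.
  std::thread t1(RunScriptFixed), t2(RunScriptFixed);
  t1.join();
  t2.join();
  std::cout << "no deadlock\n";
}

The design point the sketch illustrates: with the fixed order, whoever holds the interpreter mutex scheduled before any other script transaction, so it can always reach the head of the queue and release the mutex; with the buggy order, a later transaction can hold the mutex while an earlier one blocks on it, and neither can proceed.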