takeover: Cancel blocking commands (#1514)

* fix: Cancel blocking commands when performing a takeover

* Add some comments

* Make CancelBlocking a method of ConnectionContext

* add a small todo
This commit is contained in:
Roy Jacobson 2023-07-05 17:09:10 +02:00 committed by GitHub
parent 646f5304a5
commit 0f69d32b11
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 47 additions and 14 deletions

View file

@ -915,4 +915,8 @@ std::string Connection::RemoteEndpointAddress() const {
return re.address().to_string();
}
ConnectionContext* Connection::cntx() {
return cc_.get();
}
} // namespace facade

View file

@ -163,6 +163,8 @@ class Connection : public util::Connection {
return name_;
}
ConnectionContext* cntx();
protected:
void OnShutdown() override;
void OnPreMigrateThread() override;

View file

@ -9,6 +9,7 @@
#include "server/engine_shard_set.h"
#include "server/server_family.h"
#include "server/server_state.h"
#include "server/transaction.h"
#include "src/facade/dragonfly_connection.h"
namespace dfly {
@ -93,6 +94,12 @@ void ConnectionContext::ChangeMonitor(bool start) {
EnableMonitoring(start);
}
void ConnectionContext::CancelBlocking() {
if (transaction) {
transaction->CancelBlocking();
}
}
vector<unsigned> ChangeSubscriptions(bool pattern, CmdArgList args, bool to_add, bool to_reply,
ConnectionContext* conn) {
vector<unsigned> result(to_reply ? args.size() : 0, 0);

View file

@ -175,6 +175,7 @@ class ConnectionContext : public facade::ConnectionContext {
void UnsubscribeAll(bool to_reply);
void PUnsubscribeAll(bool to_reply);
void ChangeMonitor(bool start); // either start or stop monitor on a given connection
void CancelBlocking(); // Cancel an ongoing blocking transaction if there is one.
// Whether this connection is a connection from a replica to its master.
// This flag is true only on replica side, where we need to setup a special ConnectionContext

View file

@ -377,8 +377,9 @@ void DflyCmd::TakeOver(CmdArgList args, ConnectionContext* cntx) {
absl::Time start = absl::Now();
AggregateStatus status;
// TODO: We should cancel blocking commands before awaiting all
// dispatches to finish.
// We need to await for all dispatches to finish: Otherwise a transaction might be scheduled
// after this function exits but before the actual shutdown.
sf_->CancelBlockingCommands();
if (!sf_->AwaitDispatches(timeout_dur, [self = cntx->owner()](util::Connection* conn) {
// The only command that is currently dispatching should be the takeover command -
// so we wait until this is true.
@ -387,7 +388,12 @@ void DflyCmd::TakeOver(CmdArgList args, ConnectionContext* cntx) {
LOG(WARNING) << "Couldn't wait for commands to finish dispatching. " << timeout_dur;
status = OpStatus::TIMED_OUT;
}
VLOG(1) << "AwaitDispatches done";
// We have this guard to disable expirations: We don't want any writes to the journal after
// we send the `PING`, and expirations could ruin that.
// TODO: Decouple disabling expirations from TransactionGuard because we don't
// really need TransactionGuard here.
TransactionGuard tg{cntx->transaction, /*disable_expirations=*/true};
if (*status == OpStatus::OK) {

View file

@ -1028,9 +1028,7 @@ facade::ConnectionContext* Service::CreateContext(util::FiberSocketBase* peer,
// a bit of a hack. I set up breaker callback here for the owner.
// Should work though it's confusing to have it here.
owner->RegisterBreakHook([res, this](uint32_t) {
if (res->transaction) {
res->transaction->BreakOnShutdown();
}
res->CancelBlocking();
this->server_family().BreakOnShutdown();
});

View file

@ -1260,6 +1260,19 @@ void ServerFamily::BreakOnShutdown() {
dfly_cmd_->BreakOnShutdown();
}
void ServerFamily::CancelBlockingCommands() {
auto cb = [](unsigned thread_index, util::Connection* conn) {
facade::ConnectionContext* fc = static_cast<facade::Connection*>(conn)->cntx();
if (fc) {
ConnectionContext* cntx = static_cast<ConnectionContext*>(fc);
cntx->CancelBlocking();
}
};
for (auto* listener : listeners_) {
listener->TraverseConnections(cb);
}
}
bool ServerFamily::AwaitDispatches(absl::Duration timeout,
const std::function<bool(util::Connection*)>& filter) {
auto start = absl::Now();

View file

@ -156,6 +156,8 @@ class ServerFamily {
void BreakOnShutdown();
void CancelBlockingCommands();
bool AwaitDispatches(absl::Duration timeout,
const std::function<bool(util::Connection*)>& filter);

View file

@ -1389,7 +1389,7 @@ void Transaction::FinishLogJournalOnShard(EngineShard* shard, uint32_t shard_cnt
journal->RecordEntry(txid_, journal::Op::EXEC, db_index_, shard_cnt, {}, false);
}
void Transaction::BreakOnShutdown() {
void Transaction::CancelBlocking() {
if (coordinator_state_ & COORD_BLOCKED) {
coordinator_state_ |= COORD_CANCELLED;
blocking_ec_.notify();

View file

@ -183,8 +183,9 @@ class Transaction {
// blocking queue.
bool NotifySuspended(TxId committed_ts, ShardId sid, std::string_view key);
// Cancel all blocking watches on shutdown. Set COORD_CANCELLED.
void BreakOnShutdown();
// Cancel all blocking watches. Set COORD_CANCELLED.
// Must be called from coordinator thread.
void CancelBlocking();
// In some cases for non auto-journaling commands we want to enable the auto journal flow.
void RenableAutoJournal() {

View file

@ -1120,12 +1120,11 @@ async def test_take_over_counters(df_local_factory, master_threads, replica_thre
async def block_during_takeover():
"Add a blocking command during takeover to make sure it doesn't block it."
# TODO: We need to make takeover interrupt blocking commands.
return
try:
await c_blocking.execute_command("BLPOP BLOCKING_KEY1 BLOCKING_KEY2 10")
except redis.exceptions.ConnectionError:
pass
start = time.time()
# The command should just be canceled
assert await c_blocking.execute_command("BLPOP BLOCKING_KEY1 BLOCKING_KEY2 100") is None
# And it should happen in reasonable amount of time.
assert time.time() - start < 10
async def delayed_takeover():
await asyncio.sleep(1)