mirror of
https://github.com/dragonflydb/dragonfly.git
synced 2025-05-11 10:25:47 +02:00
fix: do not migrate during connection close (#2570)
* fix: do not migrate during connection close Fixes #2569 Before the change we had a corner case where Dragonfly would call OnPreMigrateThread but would not call CancelOnErrorCb because OnBreakCb has already been called (it resets break_cb_engaged_) On the other hand in OnPostMigrateThread we called RegisterOnErrorCb if breaker_cb_ which resulted in double registration. This change simplifies the logic by removing break_cb_engaged_ flag since CancelOnErrorCb is safe to call if nothing is registered. Moreover, we now skip Migrate flow if a socket is being closed. --------- Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
parent
6d11f86091
commit
4000adf57f
9 changed files with 46 additions and 33 deletions
2
helio
2
helio
|
@ -1 +1 @@
|
||||||
Subproject commit d1d06c486f2a6f79e1eb6249c921cc6ecf06d30e
|
Subproject commit 432ffbae89d909f3fbbc25fdeb77689c3f09ea75
|
|
@ -502,20 +502,15 @@ void Connection::OnShutdown() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void Connection::OnPreMigrateThread() {
|
void Connection::OnPreMigrateThread() {
|
||||||
// If we migrating to another io_uring we should cancel any pending requests we have.
|
CHECK(!cc_->conn_closing);
|
||||||
if (break_cb_engaged_) {
|
|
||||||
socket_->CancelOnErrorCb();
|
socket_->CancelOnErrorCb();
|
||||||
break_cb_engaged_ = false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Connection::OnPostMigrateThread() {
|
void Connection::OnPostMigrateThread() {
|
||||||
// Once we migrated, we should rearm OnBreakCb callback.
|
// Once we migrated, we should rearm OnBreakCb callback.
|
||||||
if (breaker_cb_) {
|
if (breaker_cb_) {
|
||||||
DCHECK(!break_cb_engaged_);
|
|
||||||
|
|
||||||
socket_->RegisterOnErrorCb([this](int32_t mask) { this->OnBreakCb(mask); });
|
socket_->RegisterOnErrorCb([this](int32_t mask) { this->OnBreakCb(mask); });
|
||||||
break_cb_engaged_ = true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update tl variables
|
// Update tl variables
|
||||||
|
@ -594,14 +589,11 @@ void Connection::HandleRequests() {
|
||||||
cc_.reset(service_->CreateContext(peer, this));
|
cc_.reset(service_->CreateContext(peer, this));
|
||||||
if (breaker_cb_) {
|
if (breaker_cb_) {
|
||||||
socket_->RegisterOnErrorCb([this](int32_t mask) { this->OnBreakCb(mask); });
|
socket_->RegisterOnErrorCb([this](int32_t mask) { this->OnBreakCb(mask); });
|
||||||
break_cb_engaged_ = true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ConnectionFlow(peer);
|
ConnectionFlow(peer);
|
||||||
|
|
||||||
if (break_cb_engaged_) {
|
socket_->CancelOnErrorCb(); // noop if nothing is registered.
|
||||||
socket_->CancelOnErrorCb();
|
|
||||||
}
|
|
||||||
|
|
||||||
cc_.reset();
|
cc_.reset();
|
||||||
}
|
}
|
||||||
|
@ -975,14 +967,13 @@ void Connection::OnBreakCb(int32_t mask) {
|
||||||
<< cc_->reply_builder()->IsSendActive() << " " << cc_->reply_builder()->GetError();
|
<< cc_->reply_builder()->IsSendActive() << " " << cc_->reply_builder()->GetError();
|
||||||
|
|
||||||
cc_->conn_closing = true;
|
cc_->conn_closing = true;
|
||||||
break_cb_engaged_ = false; // do not attempt to cancel it.
|
|
||||||
|
|
||||||
breaker_cb_(mask);
|
breaker_cb_(mask);
|
||||||
evc_.notify(); // Notify dispatch fiber.
|
evc_.notify(); // Notify dispatch fiber.
|
||||||
}
|
}
|
||||||
|
|
||||||
void Connection::HandleMigrateRequest() {
|
void Connection::HandleMigrateRequest() {
|
||||||
if (!migration_request_) {
|
if (cc_->conn_closing || !migration_request_) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -996,6 +987,7 @@ void Connection::HandleMigrateRequest() {
|
||||||
// We don't support migrating with subscriptions as it would require moving thread local
|
// We don't support migrating with subscriptions as it would require moving thread local
|
||||||
// handles. We can't check above, as the queue might have contained a subscribe request.
|
// handles. We can't check above, as the queue might have contained a subscribe request.
|
||||||
if (cc_->subscriptions == 0) {
|
if (cc_->subscriptions == 0) {
|
||||||
|
stats_->num_migrations++;
|
||||||
migration_request_ = nullptr;
|
migration_request_ = nullptr;
|
||||||
|
|
||||||
DecreaseStatsOnClose();
|
DecreaseStatsOnClose();
|
||||||
|
|
|
@ -401,7 +401,6 @@ class Connection : public util::Connection {
|
||||||
std::string name_;
|
std::string name_;
|
||||||
|
|
||||||
unsigned parser_error_ = 0;
|
unsigned parser_error_ = 0;
|
||||||
bool break_cb_engaged_ = false;
|
|
||||||
|
|
||||||
BreakerCb breaker_cb_;
|
BreakerCb breaker_cb_;
|
||||||
std::unique_ptr<Shutdown> shutdown_cb_;
|
std::unique_ptr<Shutdown> shutdown_cb_;
|
||||||
|
|
|
@ -20,7 +20,7 @@ constexpr size_t kSizeConnStats = sizeof(ConnectionStats);
|
||||||
|
|
||||||
ConnectionStats& ConnectionStats::operator+=(const ConnectionStats& o) {
|
ConnectionStats& ConnectionStats::operator+=(const ConnectionStats& o) {
|
||||||
// To break this code deliberately if we add/remove a field to this struct.
|
// To break this code deliberately if we add/remove a field to this struct.
|
||||||
static_assert(kSizeConnStats == 104u);
|
static_assert(kSizeConnStats == 112u);
|
||||||
|
|
||||||
ADD(read_buf_capacity);
|
ADD(read_buf_capacity);
|
||||||
ADD(dispatch_queue_entries);
|
ADD(dispatch_queue_entries);
|
||||||
|
@ -36,6 +36,7 @@ ConnectionStats& ConnectionStats::operator+=(const ConnectionStats& o) {
|
||||||
ADD(num_conns);
|
ADD(num_conns);
|
||||||
ADD(num_replicas);
|
ADD(num_replicas);
|
||||||
ADD(num_blocked_clients);
|
ADD(num_blocked_clients);
|
||||||
|
ADD(num_migrations);
|
||||||
|
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
|
@ -61,7 +61,7 @@ struct ConnectionStats {
|
||||||
uint32_t num_conns = 0;
|
uint32_t num_conns = 0;
|
||||||
uint32_t num_replicas = 0;
|
uint32_t num_replicas = 0;
|
||||||
uint32_t num_blocked_clients = 0;
|
uint32_t num_blocked_clients = 0;
|
||||||
|
uint64_t num_migrations = 0;
|
||||||
ConnectionStats& operator+=(const ConnectionStats& o);
|
ConnectionStats& operator+=(const ConnectionStats& o);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -1441,6 +1441,9 @@ void DbSlice::TrackKeys(const facade::Connection::WeakRef& conn, const ArgSlice&
|
||||||
}
|
}
|
||||||
|
|
||||||
void DbSlice::SendInvalidationTrackingMessage(std::string_view key) {
|
void DbSlice::SendInvalidationTrackingMessage(std::string_view key) {
|
||||||
|
if (client_tracking_map_.empty())
|
||||||
|
return;
|
||||||
|
|
||||||
auto it = client_tracking_map_.find(key);
|
auto it = client_tracking_map_.find(key);
|
||||||
if (it != client_tracking_map_.end()) {
|
if (it != client_tracking_map_.end()) {
|
||||||
// notify all the clients.
|
// notify all the clients.
|
||||||
|
|
|
@ -1859,6 +1859,7 @@ void ServerFamily::Info(CmdArgList args, ConnectionContext* cntx) {
|
||||||
append("total_pipelined_commands", conn_stats.pipelined_cmd_cnt);
|
append("total_pipelined_commands", conn_stats.pipelined_cmd_cnt);
|
||||||
append("pipelined_latency_usec", conn_stats.pipelined_cmd_latency);
|
append("pipelined_latency_usec", conn_stats.pipelined_cmd_latency);
|
||||||
append("total_net_input_bytes", conn_stats.io_read_bytes);
|
append("total_net_input_bytes", conn_stats.io_read_bytes);
|
||||||
|
append("connection_migrations", conn_stats.num_migrations);
|
||||||
append("total_net_output_bytes", reply_stats.io_write_bytes);
|
append("total_net_output_bytes", reply_stats.io_write_bytes);
|
||||||
append("instantaneous_input_kbps", -1);
|
append("instantaneous_input_kbps", -1);
|
||||||
append("instantaneous_output_kbps", -1);
|
append("instantaneous_output_kbps", -1);
|
||||||
|
|
|
@ -3,11 +3,15 @@ import async_timeout
|
||||||
from redis import asyncio as aioredis
|
from redis import asyncio as aioredis
|
||||||
import time
|
import time
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import pytest
|
import pytest
|
||||||
import random
|
import random
|
||||||
import itertools
|
import itertools
|
||||||
import random
|
import random
|
||||||
import string
|
import string
|
||||||
|
|
||||||
|
from .instance import DflyInstance
|
||||||
|
|
||||||
from . import dfly_args, dfly_multi_test_args
|
from . import dfly_args, dfly_multi_test_args
|
||||||
|
|
||||||
DJANGO_CACHEOPS_SCRIPT = """
|
DJANGO_CACHEOPS_SCRIPT = """
|
||||||
|
@ -310,3 +314,28 @@ async def test_one_interpreter(async_client: aioredis.Redis):
|
||||||
# At least some connection was seen blocked
|
# At least some connection was seen blocked
|
||||||
# Flaky: release build is too fast and never blocks
|
# Flaky: release build is too fast and never blocks
|
||||||
# assert max_blocked > 0
|
# assert max_blocked > 0
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Tests migrate/close interaction for the connection
|
||||||
|
Reproduces #2569
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@dfly_args({"proactor_threads": "4", "pipeline_squash": 0})
|
||||||
|
async def test_migrate_close_connection(async_client: aioredis.Redis, df_server: DflyInstance):
|
||||||
|
sha = await async_client.script_load("return redis.call('GET', KEYS[1])")
|
||||||
|
|
||||||
|
async def run():
|
||||||
|
reader, writer = await asyncio.open_connection("localhost", df_server.port)
|
||||||
|
|
||||||
|
# write a EVALSHA that will ask for migration (75% it's on the wrong shard)
|
||||||
|
writer.write((f"EVALSHA {sha} 1 a\r\n").encode())
|
||||||
|
await writer.drain()
|
||||||
|
|
||||||
|
# disconnect the client connection
|
||||||
|
writer.close()
|
||||||
|
await writer.wait_closed()
|
||||||
|
|
||||||
|
tasks = [asyncio.create_task(run()) for _ in range(50)]
|
||||||
|
await asyncio.gather(*tasks)
|
||||||
|
|
|
@ -5,7 +5,6 @@ import aiohttp
|
||||||
import logging
|
import logging
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Dict, Optional, List, Union
|
from typing import Dict, Optional, List, Union
|
||||||
import os
|
|
||||||
import re
|
import re
|
||||||
import psutil
|
import psutil
|
||||||
import itertools
|
import itertools
|
||||||
|
@ -163,22 +162,13 @@ class DflyInstance:
|
||||||
if proc is None:
|
if proc is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
# if we have log files, it means that we started a process.
|
|
||||||
# if it died before we could stop it, we should raise an exception
|
|
||||||
if self.log_files:
|
|
||||||
exitcode = proc.poll()
|
|
||||||
if exitcode is not None:
|
|
||||||
if exitcode != 0:
|
|
||||||
raise Exception(f"Process exited with code {exitcode}")
|
|
||||||
return
|
|
||||||
|
|
||||||
logging.debug(f"Stopping instance on {self._port}")
|
logging.debug(f"Stopping instance on {self._port}")
|
||||||
try:
|
try:
|
||||||
if kill:
|
if kill:
|
||||||
proc.kill()
|
proc.kill()
|
||||||
else:
|
else:
|
||||||
proc.terminate()
|
proc.terminate()
|
||||||
proc.communicate(timeout=30)
|
proc.communicate(timeout=15)
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
# We need to send SIGUSR1 to DF such that it prints the stacktrace
|
# We need to send SIGUSR1 to DF such that it prints the stacktrace
|
||||||
proc.send_signal(signal.SIGUSR1)
|
proc.send_signal(signal.SIGUSR1)
|
||||||
|
@ -203,11 +193,9 @@ class DflyInstance:
|
||||||
self._port = None
|
self._port = None
|
||||||
|
|
||||||
all_args = self.format_args(self.args)
|
all_args = self.format_args(self.args)
|
||||||
arg_str = " ".join(all_args)
|
logging.debug(f"Starting instance with arguments {all_args} from {self.params.path}")
|
||||||
bin_path = os.path.realpath(self.params.path)
|
|
||||||
logging.debug(f"Starting {bin_path} with arguments: {arg_str}")
|
|
||||||
|
|
||||||
run_cmd = [bin_path, *all_args]
|
run_cmd = [self.params.path, *all_args]
|
||||||
if self.params.gdb:
|
if self.params.gdb:
|
||||||
run_cmd = ["gdb", "--ex", "r", "--args"] + run_cmd
|
run_cmd = ["gdb", "--ex", "r", "--args"] + run_cmd
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue