mirror of
https://github.com/dragonflydb/dragonfly.git
synced 2025-05-11 02:15:45 +02:00
feat(server): Save on shutdown (#1086)
* feat(server): Save snapshot on shutdown * CR * Change save on shutdown to be conditional on --dbfilename. * Support SHUTDOWN [NO]SAVE and fix unit test * Better wait for DB loading * Fix DF format loading state bug * Fix some fallout from auto save
This commit is contained in:
parent
dd97b36965
commit
246f6093db
6 changed files with 99 additions and 37 deletions
|
@ -170,13 +170,9 @@ void DebugCmd::Reload(CmdArgList args) {
|
||||||
|
|
||||||
if (save) {
|
if (save) {
|
||||||
string err_details;
|
string err_details;
|
||||||
const CommandId* cid = sf_.service().FindCmd("SAVE");
|
|
||||||
CHECK_NOTNULL(cid);
|
|
||||||
intrusive_ptr<Transaction> trans(new Transaction{cid, ServerState::tlocal()->thread_index()});
|
|
||||||
trans->InitByArgs(0, {});
|
|
||||||
VLOG(1) << "Performing save";
|
VLOG(1) << "Performing save";
|
||||||
|
|
||||||
GenericError ec = sf_.DoSave(absl::GetFlag(FLAGS_df_snapshot_format), trans.get());
|
GenericError ec = sf_.DoSave();
|
||||||
if (ec) {
|
if (ec) {
|
||||||
return (*cntx_)->SendError(ec.Format());
|
return (*cntx_)->SendError(ec.Format());
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,18 +38,21 @@ namespace dfly {
|
||||||
|
|
||||||
class RdbTest : public BaseFamilyTest {
|
class RdbTest : public BaseFamilyTest {
|
||||||
protected:
|
protected:
|
||||||
static void SetUpTestSuite();
|
|
||||||
void TearDown();
|
void TearDown();
|
||||||
|
void SetUp();
|
||||||
|
|
||||||
io::FileSource GetSource(string name);
|
io::FileSource GetSource(string name);
|
||||||
};
|
};
|
||||||
|
|
||||||
void RdbTest::SetUpTestSuite() {
|
void RdbTest::SetUp() {
|
||||||
BaseFamilyTest::SetUpTestSuite();
|
|
||||||
SetFlag(&FLAGS_dbfilename, "rdbtestdump");
|
SetFlag(&FLAGS_dbfilename, "rdbtestdump");
|
||||||
|
BaseFamilyTest::SetUp();
|
||||||
}
|
}
|
||||||
|
|
||||||
void RdbTest::TearDown() {
|
void RdbTest::TearDown() {
|
||||||
|
// Disable save on shutdown
|
||||||
|
SetFlag(&FLAGS_dbfilename, "");
|
||||||
|
|
||||||
auto rdb_files = io::StatFiles("rdbtestdump*");
|
auto rdb_files = io::StatFiles("rdbtestdump*");
|
||||||
CHECK(rdb_files);
|
CHECK(rdb_files);
|
||||||
for (const auto& fl : *rdb_files) {
|
for (const auto& fl : *rdb_files) {
|
||||||
|
|
|
@ -467,7 +467,7 @@ void ServerFamily::Init(util::AcceptServer* acceptor, util::ListenerInterface* m
|
||||||
if (!save_time.empty()) {
|
if (!save_time.empty()) {
|
||||||
std::optional<SnapshotSpec> spec = ParseSaveSchedule(save_time);
|
std::optional<SnapshotSpec> spec = ParseSaveSchedule(save_time);
|
||||||
if (spec) {
|
if (spec) {
|
||||||
snapshot_fiber_ = service_.proactor_pool().GetNextProactor()->LaunchFiber(
|
snapshot_schedule_fb_ = service_.proactor_pool().GetNextProactor()->LaunchFiber(
|
||||||
[save_spec = std::move(spec.value()), this] { SnapshotScheduling(save_spec); });
|
[save_spec = std::move(spec.value()), this] { SnapshotScheduling(save_spec); });
|
||||||
} else {
|
} else {
|
||||||
LOG(WARNING) << "Invalid snapshot time specifier " << save_time;
|
LOG(WARNING) << "Invalid snapshot time specifier " << save_time;
|
||||||
|
@ -481,9 +481,18 @@ void ServerFamily::Shutdown() {
|
||||||
if (load_result_.valid())
|
if (load_result_.valid())
|
||||||
load_result_.wait();
|
load_result_.wait();
|
||||||
|
|
||||||
is_snapshot_done_.Notify();
|
schedule_done_.Notify();
|
||||||
if (snapshot_fiber_.IsJoinable()) {
|
if (snapshot_schedule_fb_.IsJoinable()) {
|
||||||
snapshot_fiber_.Join();
|
snapshot_schedule_fb_.Join();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (save_on_shutdown_ && !absl::GetFlag(FLAGS_dbfilename).empty()) {
|
||||||
|
shard_set->pool()->GetNextProactor()->Await([this] {
|
||||||
|
GenericError ec = DoSave();
|
||||||
|
if (ec) {
|
||||||
|
LOG(WARNING) << "Failed to perform snapshot " << ec.Format();
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
pb_task_->Await([this] {
|
pb_task_->Await([this] {
|
||||||
|
@ -608,7 +617,7 @@ Future<std::error_code> ServerFamily::Load(const std::string& load_path) {
|
||||||
void ServerFamily::SnapshotScheduling(const SnapshotSpec& spec) {
|
void ServerFamily::SnapshotScheduling(const SnapshotSpec& spec) {
|
||||||
const auto loop_sleep_time = std::chrono::seconds(20);
|
const auto loop_sleep_time = std::chrono::seconds(20);
|
||||||
while (true) {
|
while (true) {
|
||||||
if (is_snapshot_done_.WaitFor(loop_sleep_time)) {
|
if (schedule_done_.WaitFor(loop_sleep_time)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -629,13 +638,7 @@ void ServerFamily::SnapshotScheduling(const SnapshotSpec& spec) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const CommandId* cid = service().FindCmd("SAVE");
|
GenericError ec = DoSave();
|
||||||
CHECK_NOTNULL(cid);
|
|
||||||
boost::intrusive_ptr<Transaction> trans(
|
|
||||||
new Transaction{cid, ServerState::tlocal()->thread_index()});
|
|
||||||
trans->InitByArgs(0, {});
|
|
||||||
|
|
||||||
GenericError ec = DoSave(absl::GetFlag(FLAGS_df_snapshot_format), trans.get());
|
|
||||||
if (ec) {
|
if (ec) {
|
||||||
LOG(WARNING) << "Failed to perform snapshot " << ec.Format();
|
LOG(WARNING) << "Failed to perform snapshot " << ec.Format();
|
||||||
}
|
}
|
||||||
|
@ -665,9 +668,6 @@ error_code ServerFamily::LoadRdb(const std::string& rdb_file) {
|
||||||
} else {
|
} else {
|
||||||
ec = res.error();
|
ec = res.error();
|
||||||
}
|
}
|
||||||
|
|
||||||
service_.SwitchState(GlobalState::LOADING, GlobalState::ACTIVE);
|
|
||||||
|
|
||||||
return ec;
|
return ec;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -924,6 +924,15 @@ GenericError DoPartialSave(PartialSaveOpts opts, const dfly::StringVec& scripts,
|
||||||
return local_ec;
|
return local_ec;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GenericError ServerFamily::DoSave() {
|
||||||
|
const CommandId* cid = service().FindCmd("SAVE");
|
||||||
|
CHECK_NOTNULL(cid);
|
||||||
|
boost::intrusive_ptr<Transaction> trans(
|
||||||
|
new Transaction{cid, ServerState::tlocal()->thread_index()});
|
||||||
|
trans->InitByArgs(0, {});
|
||||||
|
return DoSave(absl::GetFlag(FLAGS_df_snapshot_format), trans.get());
|
||||||
|
}
|
||||||
|
|
||||||
GenericError ServerFamily::DoSave(bool new_version, Transaction* trans) {
|
GenericError ServerFamily::DoSave(bool new_version, Transaction* trans) {
|
||||||
fs::path dir_path(GetFlag(FLAGS_dir));
|
fs::path dir_path(GetFlag(FLAGS_dir));
|
||||||
AggregateGenericError ec;
|
AggregateGenericError ec;
|
||||||
|
@ -2036,6 +2045,22 @@ void ServerFamily::Latency(CmdArgList args, ConnectionContext* cntx) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ServerFamily::_Shutdown(CmdArgList args, ConnectionContext* cntx) {
|
void ServerFamily::_Shutdown(CmdArgList args, ConnectionContext* cntx) {
|
||||||
|
if (args.size() > 1) {
|
||||||
|
(*cntx)->SendError(kSyntaxErr);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (args.size() == 1) {
|
||||||
|
auto sub_cmd = ArgS(args, 0);
|
||||||
|
if (absl::EqualsIgnoreCase(sub_cmd, "SAVE")) {
|
||||||
|
} else if (absl::EqualsIgnoreCase(sub_cmd, "NOSAVE")) {
|
||||||
|
save_on_shutdown_ = false;
|
||||||
|
} else {
|
||||||
|
(*cntx)->SendError(kSyntaxErr);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
CHECK_NOTNULL(acceptor_)->Stop();
|
CHECK_NOTNULL(acceptor_)->Stop();
|
||||||
(*cntx)->SendOk();
|
(*cntx)->SendOk();
|
||||||
}
|
}
|
||||||
|
@ -2079,7 +2104,7 @@ void ServerFamily::Register(CommandRegistry* registry) {
|
||||||
<< CI{"LATENCY", CO::NOSCRIPT | CO::LOADING | CO::FAST, -2, 0, 0, 0}.HFUNC(Latency)
|
<< CI{"LATENCY", CO::NOSCRIPT | CO::LOADING | CO::FAST, -2, 0, 0, 0}.HFUNC(Latency)
|
||||||
<< CI{"MEMORY", kMemOpts, -2, 0, 0, 0}.HFUNC(Memory)
|
<< CI{"MEMORY", kMemOpts, -2, 0, 0, 0}.HFUNC(Memory)
|
||||||
<< CI{"SAVE", CO::ADMIN | CO::GLOBAL_TRANS, -1, 0, 0, 0}.HFUNC(Save)
|
<< CI{"SAVE", CO::ADMIN | CO::GLOBAL_TRANS, -1, 0, 0, 0}.HFUNC(Save)
|
||||||
<< CI{"SHUTDOWN", CO::ADMIN | CO::NOSCRIPT | CO::LOADING, 1, 0, 0, 0}.HFUNC(_Shutdown)
|
<< CI{"SHUTDOWN", CO::ADMIN | CO::NOSCRIPT | CO::LOADING, -1, 0, 0, 0}.HFUNC(_Shutdown)
|
||||||
<< CI{"SLAVEOF", kReplicaOpts, 3, 0, 0, 0}.HFUNC(ReplicaOf)
|
<< CI{"SLAVEOF", kReplicaOpts, 3, 0, 0, 0}.HFUNC(ReplicaOf)
|
||||||
<< CI{"READONLY", CO::READONLY, 1, 0, 0, 0}.HFUNC(ReadOnly)
|
<< CI{"READONLY", CO::READONLY, 1, 0, 0, 0}.HFUNC(ReadOnly)
|
||||||
<< CI{"REPLICAOF", kReplicaOpts, 3, 0, 0, 0}.HFUNC(ReplicaOf)
|
<< CI{"REPLICAOF", kReplicaOpts, 3, 0, 0, 0}.HFUNC(ReplicaOf)
|
||||||
|
|
|
@ -91,6 +91,10 @@ class ServerFamily {
|
||||||
// if new_version is true, saves DF specific, non redis compatible snapshot.
|
// if new_version is true, saves DF specific, non redis compatible snapshot.
|
||||||
GenericError DoSave(bool new_version, Transaction* transaction);
|
GenericError DoSave(bool new_version, Transaction* transaction);
|
||||||
|
|
||||||
|
// Calls DoSave with a default generated transaction and with the format
|
||||||
|
// specified in --df_snapshot_format
|
||||||
|
GenericError DoSave();
|
||||||
|
|
||||||
// Burns down and destroy all the data from the database.
|
// Burns down and destroy all the data from the database.
|
||||||
// if kDbAll is passed, burns all the databases to the ground.
|
// if kDbAll is passed, burns all the databases to the ground.
|
||||||
std::error_code Drakarys(Transaction* transaction, DbIndex db_ind);
|
std::error_code Drakarys(Transaction* transaction, DbIndex db_ind);
|
||||||
|
@ -161,7 +165,7 @@ class ServerFamily {
|
||||||
|
|
||||||
void SnapshotScheduling(const SnapshotSpec& time);
|
void SnapshotScheduling(const SnapshotSpec& time);
|
||||||
|
|
||||||
Fiber snapshot_fiber_;
|
Fiber snapshot_schedule_fb_;
|
||||||
Future<std::error_code> load_result_;
|
Future<std::error_code> load_result_;
|
||||||
|
|
||||||
uint32_t stats_caching_task_ = 0;
|
uint32_t stats_caching_task_ = 0;
|
||||||
|
@ -186,7 +190,11 @@ class ServerFamily {
|
||||||
std::shared_ptr<LastSaveInfo> last_save_info_; // protected by save_mu_;
|
std::shared_ptr<LastSaveInfo> last_save_info_; // protected by save_mu_;
|
||||||
std::atomic_bool is_saving_{false};
|
std::atomic_bool is_saving_{false};
|
||||||
|
|
||||||
Done is_snapshot_done_;
|
// Used to override save on shutdown behavior that is usually set
|
||||||
|
// be --dbfilename.
|
||||||
|
bool save_on_shutdown_{true};
|
||||||
|
|
||||||
|
Done schedule_done_;
|
||||||
std::unique_ptr<FiberQueueThreadPool> fq_threadpool_;
|
std::unique_ptr<FiberQueueThreadPool> fq_threadpool_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -36,9 +36,9 @@ replication_cases = [
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize("t_master, t_replicas, seeder_config", replication_cases)
|
@pytest.mark.parametrize("t_master, t_replicas, seeder_config", replication_cases)
|
||||||
async def test_replication_all(df_local_factory, df_seeder_factory, t_master, t_replicas, seeder_config):
|
async def test_replication_all(df_local_factory, df_seeder_factory, t_master, t_replicas, seeder_config):
|
||||||
master = df_local_factory.create(port=BASE_PORT, proactor_threads=t_master)
|
master = df_local_factory.create(port=BASE_PORT, proactor_threads=t_master, dbfilename="")
|
||||||
replicas = [
|
replicas = [
|
||||||
df_local_factory.create(port=BASE_PORT+i+1, proactor_threads=t)
|
df_local_factory.create(port=BASE_PORT+i+1, proactor_threads=t, dbfilename="")
|
||||||
for i, t in enumerate(t_replicas)
|
for i, t in enumerate(t_replicas)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -148,10 +148,10 @@ disconnect_cases = [
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize("t_master, t_crash_fs, t_crash_ss, t_disonnect, n_keys", disconnect_cases)
|
@pytest.mark.parametrize("t_master, t_crash_fs, t_crash_ss, t_disonnect, n_keys", disconnect_cases)
|
||||||
async def test_disconnect_replica(df_local_factory: DflyInstanceFactory, df_seeder_factory, t_master, t_crash_fs, t_crash_ss, t_disonnect, n_keys):
|
async def test_disconnect_replica(df_local_factory: DflyInstanceFactory, df_seeder_factory, t_master, t_crash_fs, t_crash_ss, t_disonnect, n_keys):
|
||||||
master = df_local_factory.create(port=BASE_PORT, proactor_threads=t_master)
|
master = df_local_factory.create(port=BASE_PORT, proactor_threads=t_master, dbfilename="")
|
||||||
replicas = [
|
replicas = [
|
||||||
(df_local_factory.create(
|
(df_local_factory.create(
|
||||||
port=BASE_PORT+i+1, proactor_threads=t), crash_fs)
|
port=BASE_PORT+i+1, proactor_threads=t, dbfilename=""), crash_fs)
|
||||||
for i, (t, crash_fs) in enumerate(
|
for i, (t, crash_fs) in enumerate(
|
||||||
chain(
|
chain(
|
||||||
zip(t_crash_fs, repeat(DISCONNECT_CRASH_FULL_SYNC)),
|
zip(t_crash_fs, repeat(DISCONNECT_CRASH_FULL_SYNC)),
|
||||||
|
@ -284,10 +284,10 @@ master_crash_cases = [
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize("t_master, t_replicas, n_random_crashes, n_keys", master_crash_cases)
|
@pytest.mark.parametrize("t_master, t_replicas, n_random_crashes, n_keys", master_crash_cases)
|
||||||
async def test_disconnect_master(df_local_factory, df_seeder_factory, t_master, t_replicas, n_random_crashes, n_keys):
|
async def test_disconnect_master(df_local_factory, df_seeder_factory, t_master, t_replicas, n_random_crashes, n_keys):
|
||||||
master = df_local_factory.create(port=1111, proactor_threads=t_master)
|
master = df_local_factory.create(port=1111, proactor_threads=t_master, dbfilename="")
|
||||||
replicas = [
|
replicas = [
|
||||||
df_local_factory.create(
|
df_local_factory.create(
|
||||||
port=BASE_PORT+i+1, proactor_threads=t)
|
port=BASE_PORT+i+1, proactor_threads=t, dbfilename="")
|
||||||
for i, t in enumerate(t_replicas)
|
for i, t in enumerate(t_replicas)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -398,8 +398,8 @@ async def test_cancel_replication_immediately(df_local_factory, df_seeder_factor
|
||||||
"""
|
"""
|
||||||
COMMANDS_TO_ISSUE = 40
|
COMMANDS_TO_ISSUE = 40
|
||||||
|
|
||||||
replica = df_local_factory.create(port=BASE_PORT, v=1)
|
replica = df_local_factory.create(port=BASE_PORT, dbfilename="")
|
||||||
masters = [df_local_factory.create(port=BASE_PORT+i+1) for i in range(4)]
|
masters = [df_local_factory.create(port=BASE_PORT+i+1, dbfilename="") for i in range(4)]
|
||||||
seeders = [df_seeder_factory.create(port=m.port) for m in masters]
|
seeders = [df_seeder_factory.create(port=m.port) for m in masters]
|
||||||
|
|
||||||
df_local_factory.start_all([replica] + masters)
|
df_local_factory.start_all([replica] + masters)
|
||||||
|
|
|
@ -4,6 +4,7 @@ import os
|
||||||
import glob
|
import glob
|
||||||
import aioredis
|
import aioredis
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import aioredis
|
||||||
|
|
||||||
from . import dfly_args
|
from . import dfly_args
|
||||||
from .utility import DflySeeder, wait_available_async
|
from .utility import DflySeeder, wait_available_async
|
||||||
|
@ -18,7 +19,8 @@ class SnapshotTestBase:
|
||||||
self.tmp_dir = tmp_dir
|
self.tmp_dir = tmp_dir
|
||||||
|
|
||||||
def get_main_file(self, pattern):
|
def get_main_file(self, pattern):
|
||||||
def is_main(f): return "summary" in f if pattern.endswith("dfs") else True
|
def is_main(f): return "summary" in f if pattern.endswith(
|
||||||
|
"dfs") else True
|
||||||
files = glob.glob(str(self.tmp_dir.absolute()) + '/' + pattern)
|
files = glob.glob(str(self.tmp_dir.absolute()) + '/' + pattern)
|
||||||
possible_mains = list(filter(is_main, files))
|
possible_mains = list(filter(is_main, files))
|
||||||
assert len(possible_mains) == 1, possible_mains
|
assert len(possible_mains) == 1, possible_mains
|
||||||
|
@ -92,6 +94,8 @@ class TestDflySnapshot(SnapshotTestBase):
|
||||||
assert await seeder.compare(start_capture)
|
assert await seeder.compare(start_capture)
|
||||||
|
|
||||||
# We spawn instances manually, so reduce memory usage of default to minimum
|
# We spawn instances manually, so reduce memory usage of default to minimum
|
||||||
|
|
||||||
|
|
||||||
@dfly_args({"proactor_threads": "1"})
|
@dfly_args({"proactor_threads": "1"})
|
||||||
class TestDflyAutoLoadSnapshot(SnapshotTestBase):
|
class TestDflyAutoLoadSnapshot(SnapshotTestBase):
|
||||||
"""Test automatic loading of dump files on startup with timestamp"""
|
"""Test automatic loading of dump files on startup with timestamp"""
|
||||||
|
@ -138,7 +142,8 @@ class TestPeriodicSnapshot(SnapshotTestBase):
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_snapshot(self, df_seeder_factory, df_server):
|
async def test_snapshot(self, df_seeder_factory, df_server):
|
||||||
seeder = df_seeder_factory.create(port=df_server.port, keys=10, multi_transaction_probability=0)
|
seeder = df_seeder_factory.create(
|
||||||
|
port=df_server.port, keys=10, multi_transaction_probability=0)
|
||||||
await seeder.run(target_deviation=0.5)
|
await seeder.run(target_deviation=0.5)
|
||||||
|
|
||||||
time.sleep(60)
|
time.sleep(60)
|
||||||
|
@ -156,9 +161,34 @@ class TestPathEscapes(SnapshotTestBase):
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_snapshot(self, df_local_factory):
|
async def test_snapshot(self, df_local_factory):
|
||||||
df_server = df_local_factory.create(dbfilename="../../../../etc/passwd")
|
df_server = df_local_factory.create(
|
||||||
|
dbfilename="../../../../etc/passwd")
|
||||||
try:
|
try:
|
||||||
df_server.start()
|
df_server.start()
|
||||||
assert False, "Server should not start correctly"
|
assert False, "Server should not start correctly"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@dfly_args({**BASIC_ARGS, "dbfilename": "test-shutdown"})
|
||||||
|
class TestDflySnapshotOnShutdown(SnapshotTestBase):
|
||||||
|
"""Test multi file snapshot"""
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def setup(self, tmp_dir: Path):
|
||||||
|
self.tmp_dir = tmp_dir
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_snapshot(self, df_seeder_factory, df_server):
|
||||||
|
seeder = df_seeder_factory.create(port=df_server.port, **SEEDER_ARGS)
|
||||||
|
await seeder.run(target_deviation=0.1)
|
||||||
|
|
||||||
|
start_capture = await seeder.capture()
|
||||||
|
|
||||||
|
df_server.stop()
|
||||||
|
df_server.start()
|
||||||
|
|
||||||
|
a_client = aioredis.Redis(port=df_server.port)
|
||||||
|
await wait_available_async(a_client)
|
||||||
|
await a_client.connection_pool.disconnect()
|
||||||
|
|
||||||
|
assert await seeder.compare(start_capture)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue