Slot migration cancel crash fix (#2934)

fix(cluster): crash #2928
This commit is contained in:
Borys 2024-04-19 14:31:42 +03:00 committed by GitHub
parent c42b3dc02f
commit 7666aae6dc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 38 additions and 39 deletions

View file

@ -494,6 +494,8 @@ void ClusterFamily::DflyClusterConfig(CmdArgList args, ConnectionContext* cntx)
lock_guard gu(set_config_mu); lock_guard gu(set_config_mu);
lock_guard config_update_lk(
config_update_mu_); // to prevent a simultaneous config update from an outgoing migration
// TODO we shouldn't provide cntx into StartSlotMigrations // TODO we shouldn't provide cntx into StartSlotMigrations
if (!StartSlotMigrations(new_config->GetNewOutgoingMigrations(tl_cluster_config), cntx)) { if (!StartSlotMigrations(new_config->GetNewOutgoingMigrations(tl_cluster_config), cntx)) {
return cntx->SendError("Can't start the migration"); return cntx->SendError("Can't start the migration");
@ -706,19 +708,17 @@ void ClusterFamily::DflyMigrate(CmdArgList args, ConnectionContext* cntx) {
} }
} }
IncomingSlotMigration* ClusterFamily::CreateIncomingMigration(std::string source_id, std::shared_ptr<IncomingSlotMigration> ClusterFamily::CreateIncomingMigration(std::string source_id,
SlotRanges slots, SlotRanges slots,
uint32_t shards_num) { uint32_t shards_num) {
lock_guard lk(migration_mu_); lock_guard lk(migration_mu_);
for (const auto& mj : incoming_migrations_jobs_) { for (const auto& mj : incoming_migrations_jobs_) {
if (mj->GetSourceID() == source_id) { if (mj->GetSourceID() == source_id) {
return nullptr; return nullptr;
} }
} }
return incoming_migrations_jobs_ return incoming_migrations_jobs_.emplace_back(make_shared<IncomingSlotMigration>(
.emplace_back(make_shared<IncomingSlotMigration>( std::move(source_id), &server_family_->service(), std::move(slots), shards_num));
std::move(source_id), &server_family_->service(), std::move(slots), shards_num))
.get();
} }
std::shared_ptr<IncomingSlotMigration> ClusterFamily::GetIncomingMigration( std::shared_ptr<IncomingSlotMigration> ClusterFamily::GetIncomingMigration(
@ -742,7 +742,7 @@ void ClusterFamily::RemoveOutgoingMigrations(const std::vector<MigrationInfo>& m
OutgoingMigration& migration = *it->get(); OutgoingMigration& migration = *it->get();
LOG(INFO) << "Outgoing migration cancelled: slots " << SlotRange::ToString(migration.GetSlots()) LOG(INFO) << "Outgoing migration cancelled: slots " << SlotRange::ToString(migration.GetSlots())
<< " to " << migration.GetHostIp() << ":" << migration.GetPort(); << " to " << migration.GetHostIp() << ":" << migration.GetPort();
migration.Cancel(); migration.Finish();
outgoing_migration_jobs_.erase(it); outgoing_migration_jobs_.erase(it);
} }
@ -833,8 +833,10 @@ void ClusterFamily::DflyMigrateFlow(CmdArgList args, ConnectionContext* cntx) {
cntx->conn()->SetName(absl::StrCat("migration_flow_", source_id)); cntx->conn()->SetName(absl::StrCat("migration_flow_", source_id));
auto migration = GetIncomingMigration(source_id); auto migration = GetIncomingMigration(source_id);
if (!migration) if (!migration) {
// TODO process error when migration is canceled
return cntx->SendError(kIdNotFound); return cntx->SendError(kIdNotFound);
}
DCHECK(cntx->sync_dispatch); DCHECK(cntx->sync_dispatch);
// we do this to be ignored by the dispatch tracker // we do this to be ignored by the dispatch tracker
@ -847,7 +849,7 @@ void ClusterFamily::DflyMigrateFlow(CmdArgList args, ConnectionContext* cntx) {
} }
void ClusterFamily::UpdateConfig(const std::vector<SlotRange>& slots, bool enable) { void ClusterFamily::UpdateConfig(const std::vector<SlotRange>& slots, bool enable) {
lock_guard gu(set_config_mu); lock_guard gu(config_update_mu_);
auto new_config = tl_cluster_config->CloneWithChanges(slots, enable); auto new_config = tl_cluster_config->CloneWithChanges(slots, enable);
@ -870,6 +872,7 @@ void ClusterFamily::DflyMigrateAck(CmdArgList args, ConnectionContext* cntx) {
[source_id](const auto& m) { return m.node_id == source_id; }); [source_id](const auto& m) { return m.node_id == source_id; });
if (m_it == in_migrations.end()) { if (m_it == in_migrations.end()) {
LOG(WARNING) << "migration isn't in config"; LOG(WARNING) << "migration isn't in config";
// TODO process error if migration was canceled
return cntx->SendLong(OutgoingMigration::kInvalidAttempt); return cntx->SendLong(OutgoingMigration::kInvalidAttempt);
} }

View file

@ -73,8 +73,9 @@ class ClusterFamily {
void DflyMigrateAck(CmdArgList args, ConnectionContext* cntx); void DflyMigrateAck(CmdArgList args, ConnectionContext* cntx);
// create an IncomingSlotMigration entity which will execute migration // create an IncomingSlotMigration entity which will execute migration
IncomingSlotMigration* CreateIncomingMigration(std::string source_id, SlotRanges slots, std::shared_ptr<IncomingSlotMigration> CreateIncomingMigration(std::string source_id,
uint32_t shards_num); SlotRanges slots,
uint32_t shards_num);
std::shared_ptr<IncomingSlotMigration> GetIncomingMigration(std::string_view source_id); std::shared_ptr<IncomingSlotMigration> GetIncomingMigration(std::string_view source_id);
@ -97,6 +98,8 @@ class ClusterFamily {
private: private:
ClusterShardInfo GetEmulatedShardInfo(ConnectionContext* cntx) const; ClusterShardInfo GetEmulatedShardInfo(ConnectionContext* cntx) const;
mutable util::fb2::Mutex config_update_mu_;
std::string id_; std::string id_;
ServerFamily* server_family_ = nullptr; ServerFamily* server_family_ = nullptr;

View file

@ -81,15 +81,12 @@ OutgoingMigration::~OutgoingMigration() {
main_sync_fb_.JoinIfNeeded(); main_sync_fb_.JoinIfNeeded();
} }
void OutgoingMigration::Cancel() { void OutgoingMigration::Finish() {
state_.store(MigrationState::C_CANCELLED); shard_set->pool()->AwaitFiberOnAll([this](util::ProactorBase* pb) {
if (const auto* shard = EngineShard::tlocal(); shard)
auto start_cb = [this](util::ProactorBase* pb) {
if (auto* shard = EngineShard::tlocal(); shard) {
slot_migrations_[shard->shard_id()]->Cancel(); slot_migrations_[shard->shard_id()]->Cancel();
} });
}; state_.store(MigrationState::C_FINISHED);
shard_set->pool()->AwaitFiberOnAll(std::move(start_cb));
} }
MigrationState OutgoingMigration::GetState() const { MigrationState OutgoingMigration::GetState() const {
@ -108,8 +105,6 @@ void OutgoingMigration::SyncFb() {
} }
}; };
state_.store(MigrationState::C_SYNC);
shard_set->pool()->AwaitFiberOnAll(std::move(start_cb)); shard_set->pool()->AwaitFiberOnAll(std::move(start_cb));
for (auto& migration : slot_migrations_) { for (auto& migration : slot_migrations_) {
@ -121,13 +116,13 @@ void OutgoingMigration::SyncFb() {
// TODO implement blocking on migrated slots only // TODO implement blocking on migrated slots only
long attempt = 0; long attempt = 0;
while (state_.load() != MigrationState::C_CANCELLED && !FinishMigration(++attempt)) { while (state_.load() != MigrationState::C_FINISHED && !FinalyzeMigration(++attempt)) {
// process commands that were on pause and try again // process commands that were on pause and try again
ThisFiber::SleepFor(500ms); ThisFiber::SleepFor(500ms);
} }
} }
bool OutgoingMigration::FinishMigration(long attempt) { bool OutgoingMigration::FinalyzeMigration(long attempt) {
bool is_block_active = true; bool is_block_active = true;
auto is_pause_in_progress = [&is_block_active] { return is_block_active; }; auto is_pause_in_progress = [&is_block_active] { return is_block_active; };
auto pause_fb_opt = Pause(server_family_->GetNonPriviligedListeners(), nullptr, auto pause_fb_opt = Pause(server_family_->GetNonPriviligedListeners(), nullptr,
@ -181,12 +176,8 @@ bool OutgoingMigration::FinishMigration(long attempt) {
} }
} while (attempt_res != attempt); } while (attempt_res != attempt);
shard_set->pool()->AwaitFiberOnAll([this](util::ProactorBase* pb) { Finish();
if (const auto* shard = EngineShard::tlocal(); shard)
slot_migrations_[shard->shard_id()]->Cancel();
});
state_.store(MigrationState::C_FINISHED);
cf_->UpdateConfig(migration_info_.slot_ranges, false); cf_->UpdateConfig(migration_info_.slot_ranges, false);
VLOG(1) << "Config is updated for " << cf_->MyID(); VLOG(1) << "Config is updated for " << cf_->MyID();
return true; return true;

View file

@ -28,10 +28,8 @@ class OutgoingMigration : private ProtocolClient {
// start migration process, sends INIT command to the target node // start migration process, sends INIT command to the target node
std::error_code Start(ConnectionContext* cntx); std::error_code Start(ConnectionContext* cntx);
// should be run for all shards // mark migration as FINISHED and cancel migration if it's not finished yet
void StartFlow(journal::Journal* journal, io::Sink* dest); void Finish();
void Cancel();
MigrationState GetState() const; MigrationState GetState() const;
@ -54,18 +52,21 @@ class OutgoingMigration : private ProtocolClient {
static constexpr long kInvalidAttempt = -1; static constexpr long kInvalidAttempt = -1;
private: private:
// should be run for all shards
void StartFlow(journal::Journal* journal, io::Sink* dest);
MigrationState GetStateImpl() const; MigrationState GetStateImpl() const;
// SliceSlotMigration manages state and data transferring for the corresponding shard // SliceSlotMigration manages state and data transferring for the corresponding shard
class SliceSlotMigration; class SliceSlotMigration;
void SyncFb(); void SyncFb();
bool FinishMigration(long attempt); bool FinalyzeMigration(long attempt);
private: private:
MigrationInfo migration_info_; MigrationInfo migration_info_;
Context cntx_; Context cntx_;
mutable util::fb2::Mutex flows_mu_; mutable util::fb2::Mutex finish_mu_;
std::vector<std::unique_ptr<SliceSlotMigration>> slot_migrations_ ABSL_GUARDED_BY(flows_mu_); std::vector<std::unique_ptr<SliceSlotMigration>> slot_migrations_;
ServerFamily* server_family_; ServerFamily* server_family_;
ClusterFamily* cf_; ClusterFamily* cf_;

View file

@ -112,11 +112,12 @@ RestoreStreamer::~RestoreStreamer() {
} }
void RestoreStreamer::Cancel() { void RestoreStreamer::Cancel() {
if (snapshot_version_ != 0) { auto sver = snapshot_version_;
snapshot_version_ = 0; // to prevent double cancel in another fiber
if (sver != 0) {
fiber_cancellation_.Cancel(); fiber_cancellation_.Cancel();
db_slice_->UnregisterOnChange(snapshot_version_); db_slice_->UnregisterOnChange(sver);
JournalStreamer::Cancel(); JournalStreamer::Cancel();
snapshot_version_ = 0;
} }
} }