mirror of https://github.com/dragonflydb/dragonfly.git
synced 2025-05-10 18:05:44 +02:00
bug(replication): fix deadlock in cancel replication flow (#1007)
Signed-off-by: adi_holden <adi@dragonflydb.io>
This commit is contained in:
parent 44c477142a
commit 0312b66244
4 changed files with 15 additions and 20 deletions
src/server/dflycmd.cc

@@ -517,23 +517,11 @@ void DflyCmd::CancelReplication(uint32_t sync_id, shared_ptr<ReplicaInfo> replica_ptr)
   replica_ptr->state.store(SyncState::CANCELLED, memory_order_release);
   replica_ptr->cntx.Cancel();

   // Run cleanup for shard threads.
-  shard_set->AwaitRunningOnShardQueue([replica_ptr](EngineShard* shard) {
-    FlowInfo* flow = &replica_ptr->flows[shard->shard_id()];
-    if (flow->cleanup) {
-      flow->cleanup();
-    }
-  });
-
-  // Wait for tasks to finish.
   shard_set->pool()->AwaitFiberOnAll([replica_ptr](unsigned index, auto*) {
     FlowInfo* flow = &replica_ptr->flows[index];
-
-    // Cleanup hasn't been run for io-thread.
-    if (EngineShard::tlocal() == nullptr) {
-      if (flow->cleanup) {
-        flow->cleanup();
-      }
+    if (flow->cleanup) {
+      flow->cleanup();
     }

     if (flow->full_sync_fb.IsJoinable()) {
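After this change, cleanup runs exactly once on every pool thread, instead of once via the shard queue and then again, guarded by a tlocal() check, for io-threads. A minimal sketch of the consolidated pattern, using std::thread as a stand-in for helio's proactor pool (AwaitOnAll and the trimmed-down FlowInfo here are illustrative, not Dragonfly's real API):

// Sketch: run a per-flow cleanup exactly once on every worker thread.
// std::thread stands in for the fiber pool; names are hypothetical.
#include <functional>
#include <thread>
#include <vector>

struct FlowInfo {
  std::function<void()> cleanup;  // may be empty if the flow never started
};

// Await-style helper: invoke f(index) on one thread per flow, then join all.
void AwaitOnAll(unsigned n, const std::function<void(unsigned)>& f) {
  std::vector<std::thread> threads;
  for (unsigned i = 0; i < n; ++i)
    threads.emplace_back([&f, i] { f(i); });
  for (auto& t : threads)
    t.join();
}

int main() {
  std::vector<FlowInfo> flows(4);
  flows[1].cleanup = [] { /* close sockets, join fibers, ... */ };

  // Single pass, mirroring the patched CancelReplication: no separate
  // shard-thread pass followed by an io-thread-only pass.
  AwaitOnAll(static_cast<unsigned>(flows.size()), [&flows](unsigned index) {
    if (flows[index].cleanup)
      flows[index].cleanup();
  });
}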
src/server/io_utils.cc

@@ -5,7 +5,6 @@
 #include "server/io_utils.h"

-#include "base/flags.h"
 #include "base/logging.h"
 #include "server/error.h"

 using namespace std;

@@ -41,8 +40,7 @@ error_code BufferedStreamerBase::ConsumeIntoSink(io::Sink* dest) {
     // Wait for more data or stop signal.
     waker_.await([this]() { return buffered_ > 0 || IsStopped(); });
     // Break immediately on cancellation.
-    if (cll_->IsCancelled()) {
-      waker_.notifyAll();  // Wake consumer if it missed it.
+    if (IsStopped()) {
       break;
     }

@@ -62,7 +60,6 @@ error_code BufferedStreamerBase::ConsumeIntoSink(io::Sink* dest) {
     // TODO: shrink big stash.
     consumer_buf_.Clear();
   }

   return std::error_code{};
 }
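ConsumeIntoSink now keys its loop off IsStopped() rather than a separate cancellation context, so a single stop flag both wakes the consumer and terminates it. A rough sketch of that wait-or-stop protocol, with std::condition_variable standing in for the fiber waker_ (class and member names here are illustrative, not BufferedStreamerBase's real layout):

// Sketch of the consume loop's wake/stop protocol.
#include <condition_variable>
#include <mutex>
#include <string>
#include <utility>

class Streamer {
 public:
  void Write(std::string data) {
    { std::lock_guard lk(mu_); buf_ += std::move(data); }
    cv_.notify_all();  // wake a consumer waiting for data
  }
  void Finalize() {
    { std::lock_guard lk(mu_); stopped_ = true; }
    cv_.notify_all();  // wake a consumer blocked in Consume()
  }
  // Mirrors ConsumeIntoSink: wait for data or stop, bail out once stopped.
  std::string Consume() {
    std::string out;
    std::unique_lock lk(mu_);
    while (!stopped_) {
      cv_.wait(lk, [this] { return !buf_.empty() || stopped_; });
      if (stopped_)  // break immediately on stop, like the IsStopped() check
        break;
      out += std::exchange(buf_, {});  // drain the buffered data
    }
    return out;
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::string buf_;
  bool stopped_ = false;
};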
src/server/journal/streamer.cc

@@ -25,11 +25,17 @@ uint64_t JournalStreamer::GetRecordCount() const {
 }

 void JournalStreamer::Cancel() {
+  Finalize();  // Finalize must be called before UnregisterOnChange because we first need to stop
+               // writing to the buffer and notify all the producers.
+               // Writing to the journal holds the mutex protecting change_cb_arr_, then the fiber
+               // can preempt when calling NotifyWritten and it will not run again until notified.
+               // UnregisterOnChange will try to lock the mutex, therefore calling UnregisterOnChange
+               // before Finalize may cause a deadlock.
   journal_->UnregisterOnChange(journal_cb_id_);
-  Finalize();

-  if (write_fb_.IsJoinable())
+  if (write_fb_.IsJoinable()) {
     write_fb_.Join();
+  }
 }

 void JournalStreamer::WriterFb(io::Sink* dest) {
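The comment in Cancel() describes a lock-ordering deadlock: the journal's write path invokes change callbacks while holding the mutex that guards change_cb_arr_, and a callback can block in NotifyWritten waiting to be notified; UnregisterOnChange needs that same mutex, so taking it before waking the blocked producer hangs forever. A minimal model of the two orderings, using std primitives and hypothetical names rather than the journal's real internals:

// Sketch of the deadlock shape fixed here (names are hypothetical).
#include <condition_variable>
#include <mutex>

std::mutex cb_mu_;  // guards the callback array (change_cb_arr_ in the real code)
std::mutex data_mu_;
std::condition_variable data_cv_;
bool consumed_ = false, stopped_ = false;

void ProducerWritesJournal() {
  std::lock_guard cb_lk(cb_mu_);  // held while invoking change callbacks
  // Callback body (NotifyWritten): block until consumed or stopped.
  std::unique_lock lk(data_mu_);
  data_cv_.wait(lk, [] { return consumed_ || stopped_; });
}  // cb_mu_ released only after the wait completes

void CancelRight() {
  // Finalize() first: release the blocked producer so it drops cb_mu_ ...
  { std::lock_guard lk(data_mu_); stopped_ = true; }
  data_cv_.notify_all();
  // ... then it is safe to take cb_mu_ to unregister the callback.
  std::lock_guard cb_lk(cb_mu_);
}

void CancelWrong() {
  // Taking cb_mu_ first blocks forever: the producer holds it and will only
  // release it after being notified, which never happens. Deadlock.
  std::lock_guard cb_lk(cb_mu_);  // <- hangs here
  { std::lock_guard lk(data_mu_); stopped_ = true; }
  data_cv_.notify_all();
}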
tests/dragonfly/replication_test.py

@@ -189,6 +189,7 @@ async def test_disconnect_replica(df_local_factory: DflyInstanceFactory, df_seed
         await c_replica.execute_command("REPLICAOF localhost " + str(master.port))
         if crash_type == 0:
+            await asyncio.sleep(random.random()/100+0.01)
             await c_replica.connection_pool.disconnect()
             replica.stop(kill=True)
         else:
             await wait_available_async(c_replica)

@@ -208,6 +209,7 @@ async def test_disconnect_replica(df_local_factory: DflyInstanceFactory, df_seed
     # Run stable state crashes
     async def stable_sync(replica, c_replica, crash_type):
+        await asyncio.sleep(random.random() / 100)
         await c_replica.connection_pool.disconnect()
         replica.stop(kill=True)

     await asyncio.gather(*(stable_sync(*args) for args

@@ -249,11 +251,13 @@ async def test_disconnect_replica(df_local_factory: DflyInstanceFactory, df_seed
     for replica, c_replica, _ in replicas_of_type(lambda t: t == 2):
         assert await c_replica.ping()
         assert await seeder.compare(capture, port=replica.port)
+        await c_replica.connection_pool.disconnect()

     # Check master survived all disconnects
     assert await c_master.ping()
+    await c_master.close()


 """
 Test stopping master during different phases.