Mirror of https://github.com/dragonflydb/dragonfly.git, synced 2025-05-11 10:25:47 +02:00
bug(replication): fix deadlock in cancel replication flow (#1007)
Signed-off-by: adi_holden <adi@dragonflydb.io>
This commit is contained in: parent 44c477142a, commit 0312b66244
4 changed files with 15 additions and 20 deletions
@@ -517,24 +517,12 @@ void DflyCmd::CancelReplication(uint32_t sync_id, shared_ptr<ReplicaInfo> replic
   replica_ptr->state.store(SyncState::CANCELLED, memory_order_release);
   replica_ptr->cntx.Cancel();
 
-  // Run cleanup for shard threads.
-  shard_set->AwaitRunningOnShardQueue([replica_ptr](EngineShard* shard) {
-    FlowInfo* flow = &replica_ptr->flows[shard->shard_id()];
-    if (flow->cleanup) {
-      flow->cleanup();
-    }
-  });
-
   // Wait for tasks to finish.
   shard_set->pool()->AwaitFiberOnAll([replica_ptr](unsigned index, auto*) {
     FlowInfo* flow = &replica_ptr->flows[index];
-
-    // Cleanup hasn't been run for io-thread.
-    if (EngineShard::tlocal() == nullptr) {
     if (flow->cleanup) {
       flow->cleanup();
     }
-    }
-
     if (flow->full_sync_fb.IsJoinable()) {
       flow->full_sync_fb.Join();
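The hunk above consolidates flow cleanup in DflyCmd::CancelReplication: instead of one cleanup pass on the shard queue plus an io-thread special case inside the second pass, every flow's cleanup callback now runs exactly once in the single AwaitFiberOnAll pass. Below is a minimal, self-contained sketch of that pattern under plain std::thread; Flow and run_on_all_threads are illustrative stand-ins, not Dragonfly's API.

// Sketch: run each flow's optional cleanup exactly once from a single per-thread pass.
#include <functional>
#include <thread>
#include <vector>

struct Flow {
  std::function<void()> cleanup;  // may be empty if nothing was registered
};

// Runs fn(index) once per flow, each on its own thread, and waits for all of them.
void run_on_all_threads(size_t n, const std::function<void(unsigned)>& fn) {
  std::vector<std::thread> threads;
  for (unsigned i = 0; i < n; ++i)
    threads.emplace_back(fn, i);
  for (auto& t : threads)
    t.join();
}

int main() {
  std::vector<Flow> flows(4);
  flows[1].cleanup = [] { /* close sockets, free buffers, ... */ };

  // Single pass: invoke cleanup (if any) for every flow, exactly once.
  run_on_all_threads(flows.size(), [&](unsigned index) {
    Flow* flow = &flows[index];
    if (flow->cleanup) {
      flow->cleanup();
    }
  });
}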
@@ -5,7 +5,6 @@
 #include "server/io_utils.h"
 
 #include "base/flags.h"
-#include "base/logging.h"
 #include "server/error.h"
 
 using namespace std;
@@ -41,8 +40,7 @@ error_code BufferedStreamerBase::ConsumeIntoSink(io::Sink* dest) {
     // Wait for more data or stop signal.
     waker_.await([this]() { return buffered_ > 0 || IsStopped(); });
     // Break immediately on cancellation.
-    if (cll_->IsCancelled()) {
-      waker_.notifyAll();  // Wake consumer if it missed it.
+    if (IsStopped()) {
       break;
     }
 
@@ -62,7 +60,6 @@ error_code BufferedStreamerBase::ConsumeIntoSink(io::Sink* dest) {
     // TODO: shrink big stash.
     consumer_buf_.Clear();
   }
-
   return std::error_code{};
 }
 
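In BufferedStreamerBase::ConsumeIntoSink the consumer loop now breaks on its own stop flag (IsStopped()) rather than on the shared cancellation context, and no longer re-notifies the waker on that path. The sketch below mirrors only that control flow, "wait for data or stop, then break immediately on stop", using std::condition_variable instead of Dragonfly's fiber EventCount; the class and names are assumptions for illustration.

// Sketch of a buffered producer/consumer loop that exits on a stop flag.
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>

class BufferedStream {
 public:
  void Push(int v) {
    { std::lock_guard<std::mutex> lk(mu_); buf_.push(v); }
    cv_.notify_all();
  }
  void Stop() {
    { std::lock_guard<std::mutex> lk(mu_); stopped_ = true; }
    cv_.notify_all();
  }
  // Drains the buffer until Stop() is called.
  void ConsumeLoop() {
    std::unique_lock<std::mutex> lk(mu_);
    while (true) {
      // Wait for more data or stop signal.
      cv_.wait(lk, [this] { return !buf_.empty() || stopped_; });
      // Break immediately on stop; no extra wake-up is needed here.
      if (stopped_)
        break;
      std::cout << "consumed " << buf_.front() << "\n";
      buf_.pop();
    }
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<int> buf_;
  bool stopped_ = false;
};

int main() {
  BufferedStream s;
  std::thread consumer([&] { s.ConsumeLoop(); });
  s.Push(1);
  s.Push(2);
  s.Stop();
  consumer.join();
}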
@@ -25,11 +25,17 @@ uint64_t JournalStreamer::GetRecordCount() const {
 }
 
 void JournalStreamer::Cancel() {
+  Finalize();  // Finalize must be called before UnregisterOnChange because we first need to stop
+  // writing to buffer and notify the all the producers.
+  // Writing to journal holds mutex protecting change_cb_arr_, than the fiber can
+  // preemt when calling NotifyWritten and it will not run again till notified.
+  // UnregisterOnChange will try to lock the mutex therefor calling UnregisterOnChange
+  // before Finalize may cause deadlock.
   journal_->UnregisterOnChange(journal_cb_id_);
-  Finalize();
 
-  if (write_fb_.IsJoinable())
+  if (write_fb_.IsJoinable()) {
     write_fb_.Join();
+  }
 }
 
 void JournalStreamer::WriterFb(io::Sink* dest) {
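The comment added to JournalStreamer::Cancel describes the deadlock this commit fixes: a journal write holds the mutex guarding change_cb_arr_ while the streamer fiber blocks inside NotifyWritten waiting to be notified, and UnregisterOnChange needs that same mutex, so unregistering before Finalize can wedge both sides. The sketch below reproduces the ordering hazard with plain std::mutex and std::condition_variable instead of Dragonfly's journal and fibers; all names in it are illustrative.

// Sketch: the callback side blocks while holding the registry mutex, so the
// canceling side must signal (Finalize) before it tries to take that mutex
// to unregister; the reverse order deadlocks.
#include <condition_variable>
#include <mutex>
#include <thread>

std::mutex registry_mu;  // stands in for the mutex protecting change_cb_arr_
std::mutex state_mu;
std::condition_variable cv;
bool finalized = false;

void writer() {
  std::lock_guard<std::mutex> reg_lock(registry_mu);  // "writing to journal"
  // The registered callback blocks here until notified, still holding registry_mu.
  std::unique_lock<std::mutex> lk(state_mu);
  cv.wait(lk, [] { return finalized; });
}

void cancel() {
  // Correct order: signal first (Finalize), then unregister.
  {
    std::lock_guard<std::mutex> lk(state_mu);
    finalized = true;
  }
  cv.notify_all();
  // Unregistering needs registry_mu; the writer can now release it and we proceed.
  std::lock_guard<std::mutex> reg_lock(registry_mu);
  // ... remove callback from the registry ...
}

int main() {
  std::thread w(writer);
  std::thread c(cancel);
  w.join();
  c.join();
}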
@@ -189,6 +189,7 @@ async def test_disconnect_replica(df_local_factory: DflyInstanceFactory, df_seed
         await c_replica.execute_command("REPLICAOF localhost " + str(master.port))
         if crash_type == 0:
             await asyncio.sleep(random.random()/100+0.01)
+            await c_replica.connection_pool.disconnect()
             replica.stop(kill=True)
         else:
             await wait_available_async(c_replica)
@@ -208,6 +209,7 @@ async def test_disconnect_replica(df_local_factory: DflyInstanceFactory, df_seed
     # Run stable state crashes
     async def stable_sync(replica, c_replica, crash_type):
         await asyncio.sleep(random.random() / 100)
+        await c_replica.connection_pool.disconnect()
         replica.stop(kill=True)
 
     await asyncio.gather(*(stable_sync(*args) for args
@@ -249,11 +251,13 @@ async def test_disconnect_replica(df_local_factory: DflyInstanceFactory, df_seed
     for replica, c_replica, _ in replicas_of_type(lambda t: t == 2):
         assert await c_replica.ping()
         assert await seeder.compare(capture, port=replica.port)
+        await c_replica.connection_pool.disconnect()
+
     # Check master survived all disconnects
     assert await c_master.ping()
     await c_master.close()
 
 
 """
 Test stopping master during different phases.
 