// Copyright 2022, DragonflyDB authors. All rights reserved. // See LICENSE for licensing terms. // #include "server/replica.h" extern "C" { #include "redis/rdb.h" } #include #include #include #include #include #include "base/logging.h" #include "facade/dragonfly_connection.h" #include "facade/redis_parser.h" #include "server/error.h" #include "server/journal/executor.h" #include "server/journal/serializer.h" #include "server/main_service.h" #include "server/rdb_load.h" #include "util/proactor_base.h" namespace dfly { using namespace std; using namespace util; using namespace boost::asio; using namespace facade; using absl::StrCat; namespace { // TODO: 2. Use time-out on socket-reads so that we would not deadlock on unresponsive master. // 3. Support ipv6 at some point. int ResolveDns(std::string_view host, char* dest) { struct addrinfo hints, *servinfo; memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; hints.ai_protocol = IPPROTO_TCP; hints.ai_flags = AI_ALL; int res = getaddrinfo(host.data(), NULL, &hints, &servinfo); if (res != 0) return res; static_assert(INET_ADDRSTRLEN < INET6_ADDRSTRLEN, ""); res = EAI_FAMILY; for (addrinfo* p = servinfo; p != NULL; p = p->ai_next) { if (p->ai_family == AF_INET) { struct sockaddr_in* ipv4 = (struct sockaddr_in*)p->ai_addr; const char* inet_res = inet_ntop(p->ai_family, &ipv4->sin_addr, dest, INET6_ADDRSTRLEN); CHECK_NOTNULL(inet_res); res = 0; break; } LOG(WARNING) << "Only IPv4 is supported"; } freeaddrinfo(servinfo); return res; } error_code Recv(FiberSocketBase* input, base::IoBuf* dest) { auto buf = dest->AppendBuffer(); io::Result exp_size = input->Recv(buf); if (!exp_size) return exp_size.error(); dest->CommitWrite(*exp_size); return error_code{}; } constexpr unsigned kRdbEofMarkSize = 40; // Distribute flow indices over all available threads (shard_set pool size). vector> Partition(unsigned num_flows) { vector> partition(shard_set->pool()->size()); for (unsigned i = 0; i < num_flows; ++i) { partition[i % partition.size()].push_back(i); } return partition; } } // namespace Replica::Replica(string host, uint16_t port, Service* se) : service_(*se) { master_context_.host = std::move(host); master_context_.port = port; } Replica::Replica(const MasterContext& context, uint32_t dfly_flow_id, Service* service, std::shared_ptr shared_exe_data) : service_(*service), master_context_(context) { master_context_.dfly_flow_id = dfly_flow_id; multi_shard_exe_ = shared_exe_data; } Replica::~Replica() { if (sync_fb_.joinable()) sync_fb_.join(); if (sock_) { auto ec = sock_->Close(); LOG_IF(ERROR, ec) << "Error closing replica socket " << ec; } } static const char kConnErr[] = "could not connect to master: "; bool Replica::Start(ConnectionContext* cntx) { CHECK(!sock_); ProactorBase* mythread = ProactorBase::me(); CHECK(mythread); // 1. Connect socket. error_code ec = ConnectSocket(); if (ec) { (*cntx)->SendError(StrCat(kConnErr, ec.message())); return false; } // 2. Greet. state_mask_ = R_ENABLED | R_TCP_CONNECTED; last_io_time_ = mythread->GetMonotonicTimeNs(); ec = Greet(); if (ec) { (*cntx)->SendError(StrCat("could not greet master ", ec.message())); return false; } // 3. Init basic context. cntx_.Reset(absl::bind_front(&Replica::DefaultErrorHandler, this)); // 4. Spawn main coordination fiber. sync_fb_ = ::boost::fibers::fiber(&Replica::MainReplicationFb, this); (*cntx)->SendOk(); return true; } void Replica::Stop() { // Mark disabled, prevent from retrying. if (sock_) { sock_->proactor()->Await([this] { state_mask_ = 0; // Specifically ~R_ENABLED. cntx_.Cancel(); // Context is fully resposible for cleanup. }); } // Make sure the replica fully stopped and did all cleanup, // so we can freely release resources (connections). sync_fb_.join(); } void Replica::Pause(bool pause) { sock_->proactor()->Await([&] { is_paused_ = pause; }); } void Replica::MainReplicationFb() { error_code ec; while (state_mask_ & R_ENABLED) { // Discard all previous errors and set default error handler. cntx_.Reset(absl::bind_front(&Replica::DefaultErrorHandler, this)); // 1. Connect socket. if ((state_mask_ & R_TCP_CONNECTED) == 0) { fibers_ext::SleepFor(500ms); if (is_paused_) continue; ec = ConnectSocket(); if (ec) { LOG(ERROR) << "Error connecting " << ec; continue; } VLOG(1) << "Replica socket connected"; state_mask_ |= R_TCP_CONNECTED; } // 2. Greet. if ((state_mask_ & R_GREETED) == 0) { ec = Greet(); if (ec) { LOG(INFO) << "Error greeting " << ec; state_mask_ &= ~R_TCP_CONNECTED; continue; } } // 3. Initiate full sync if ((state_mask_ & R_SYNC_OK) == 0) { // Make sure we're in LOADING state. if (service_.SwitchState(GlobalState::ACTIVE, GlobalState::LOADING) != GlobalState::LOADING) { state_mask_ = 0; continue; } if (HasDflyMaster()) ec = InitiateDflySync(); else ec = InitiatePSync(); service_.SwitchState(GlobalState::LOADING, GlobalState::ACTIVE); if (ec) { LOG(WARNING) << "Error syncing " << ec << " " << ec.message(); state_mask_ &= R_ENABLED; // reset all flags besides R_ENABLED JoinAllFlows(); continue; } state_mask_ |= R_SYNC_OK; } // 4. Start stable state sync. DCHECK(state_mask_ & R_SYNC_OK); if (HasDflyMaster()) ec = ConsumeDflyStream(); else ec = ConsumeRedisStream(); JoinAllFlows(); state_mask_ &= ~R_SYNC_OK; } VLOG(1) << "Main replication fiber finished"; } error_code Replica::ConnectSocket() { sock_.reset(ProactorBase::me()->CreateSocket()); char ip_addr[INET6_ADDRSTRLEN]; int resolve_res = ResolveDns(master_context_.host, ip_addr); if (resolve_res != 0) { LOG(ERROR) << "Dns error " << gai_strerror(resolve_res) << ", host: " << master_context_.host; return make_error_code(errc::host_unreachable); } master_context_.endpoint = {ip::make_address(ip_addr), master_context_.port}; /* These may help but require additional field testing to learn. int yes = 1; CHECK_EQ(0, setsockopt(sock_->native_handle(), IPPROTO_TCP, TCP_NODELAY, &yes, sizeof(yes))); CHECK_EQ(0, setsockopt(sock_->native_handle(), SOL_SOCKET, SO_KEEPALIVE, &yes, sizeof(yes))); int intv = 15; CHECK_EQ(0, setsockopt(sock_->native_handle(), IPPROTO_TCP, TCP_KEEPIDLE, &intv, sizeof(intv))); intv /= 3; CHECK_EQ(0, setsockopt(sock_->native_handle(), IPPROTO_TCP, TCP_KEEPINTVL, &intv, sizeof(intv))); intv = 3; CHECK_EQ(0, setsockopt(sock_->native_handle(), IPPROTO_TCP, TCP_KEEPCNT, &intv, sizeof(intv))); */ return sock_->Connect(master_context_.endpoint); } error_code Replica::Greet() { parser_.reset(new RedisParser{false}); base::IoBuf io_buf{128}; ReqSerializer serializer{sock_.get()}; uint32_t consumed = 0; // Corresponds to server.repl_state == REPL_STATE_CONNECTING state in redis RETURN_ON_ERR(SendCommand("PING", &serializer)); // optional. RETURN_ON_ERR(ReadRespReply(&io_buf, &consumed)); if (!CheckRespIsSimpleReply("PONG")) { LOG(ERROR) << "Bad pong response " << ToSV(io_buf.InputBuffer()); return make_error_code(errc::bad_message); } io_buf.ConsumeInput(consumed); // TODO: we may also send REPLCONF listening-port, ip-address // See server.repl_state == REPL_STATE_SEND_PORT condition in replication.c // Corresponds to server.repl_state == REPL_STATE_SEND_CAPA RETURN_ON_ERR(SendCommand("REPLCONF capa eof capa psync2", &serializer)); RETURN_ON_ERR(ReadRespReply(&io_buf, &consumed)); if (!CheckRespIsSimpleReply("OK")) { LOG(ERROR) << "Bad REPLCONF response " << ToSV(io_buf.InputBuffer()); return make_error_code(errc::bad_message); } io_buf.ConsumeInput(consumed); // Announce that we are the dragonfly client. // Note that we currently do not support dragonfly->redis replication. RETURN_ON_ERR(SendCommand("REPLCONF capa dragonfly", &serializer)); RETURN_ON_ERR(ReadRespReply(&io_buf, &consumed)); if (!CheckRespFirstTypes({RespExpr::STRING})) { return make_error_code(errc::bad_message); } string_view cmd = ToSV(resp_args_[0].GetBuf()); if (resp_args_.size() == 1) { // Redis if (cmd != "OK") { LOG(ERROR) << "Unexpected response " << cmd; return make_error_code(errc::bad_message); } } else if (resp_args_.size() == 3) { // it's dragonfly master. // Reponse is: if (!CheckRespFirstTypes({RespExpr::STRING, RespExpr::STRING, RespExpr::INT64}) || resp_args_[0].GetBuf().size() != CONFIG_RUN_ID_SIZE) { LOG(ERROR) << "Unexpected response " << ToSV(io_buf.InputBuffer()); return make_error_code(errc::bad_message); } string_view param0 = ToSV(resp_args_[0].GetBuf()); string_view param1 = ToSV(resp_args_[1].GetBuf()); int64 param2 = get(resp_args_[2].u); if (param2 <= 0 || param2 > 1024) { // sanity check, we support upto 1024 shards. // It's not that we can not support more but it's probably highly unlikely that someone // will run dragonfly with more than 1024 cores. LOG(ERROR) << "Invalid flow count " << param2; return make_error_code(errc::bad_message); } master_context_.master_repl_id = param0; master_context_.dfly_session_id = param1; num_df_flows_ = param2; VLOG(1) << "Master id: " << param0 << ", sync id: " << param1 << ", num journals " << num_df_flows_; } else { LOG(ERROR) << "Bad response " << ToSV(io_buf.InputBuffer()); return make_error_code(errc::bad_message); } io_buf.ConsumeInput(consumed); state_mask_ |= R_GREETED; return error_code{}; } error_code Replica::InitiatePSync() { base::IoBuf io_buf{128}; ReqSerializer serializer{sock_.get()}; // Corresponds to server.repl_state == REPL_STATE_SEND_PSYNC string id("?"); // corresponds to null master id and null offset int64_t offs = -1; if (!master_context_.master_repl_id.empty()) { // in case we synced before id = master_context_.master_repl_id; // provide the replication offset and master id offs = repl_offs_; // to try incremental sync. } RETURN_ON_ERR(SendCommand(StrCat("PSYNC ", id, " ", offs), &serializer)); // Master may delay sync response with "repl_diskless_sync_delay" PSyncResponse repl_header; RETURN_ON_ERR(ParseReplicationHeader(&io_buf, &repl_header)); ProactorBase* sock_thread = sock_->proactor(); string* token = absl::get_if(&repl_header.fullsync); size_t snapshot_size = SIZE_MAX; if (!token) { snapshot_size = absl::get(repl_header.fullsync); } last_io_time_ = sock_thread->GetMonotonicTimeNs(); // we get token for diskless redis replication. For disk based replication // we get the snapshot size. if (snapshot_size || token != nullptr) { // full sync // Start full sync state_mask_ |= R_SYNCING; SocketSource ss{sock_.get()}; io::PrefixSource ps{io_buf.InputBuffer(), &ss}; RdbLoader loader(NULL); loader.set_source_limit(snapshot_size); // TODO: to allow registering callbacks within loader to send '\n' pings back to master. // Also to allow updating last_io_time_. error_code ec = loader.Load(&ps); RETURN_ON_ERR(ec); VLOG(1) << "full sync completed"; if (token) { uint8_t buf[kRdbEofMarkSize]; io::PrefixSource chained(loader.Leftover(), &ps); VLOG(1) << "Before reading from chained stream"; io::Result eof_res = chained.Read(io::MutableBytes{buf}); CHECK(eof_res && *eof_res == kRdbEofMarkSize); VLOG(1) << "Comparing token " << ToSV(buf); // TODO: handle gracefully... CHECK_EQ(0, memcmp(token->data(), buf, kRdbEofMarkSize)); CHECK(chained.unused_prefix().empty()); } else { CHECK_EQ(0u, loader.Leftover().size()); CHECK_EQ(snapshot_size, loader.bytes_read()); } CHECK(ps.unused_prefix().empty()); io_buf.ConsumeInput(io_buf.InputLen()); last_io_time_ = sock_thread->GetMonotonicTimeNs(); } state_mask_ &= ~R_SYNCING; state_mask_ |= R_SYNC_OK; // There is a data race condition in Redis-master code, where "ACK 0" handler may be // triggered before Redis is ready to transition to the streaming state and it silenty ignores // "ACK 0". We reduce the chance it happens with this delay. fibers_ext::SleepFor(50ms); return error_code{}; } // Initialize and start sub-replica for each flow. error_code Replica::InitiateDflySync() { DCHECK_GT(num_df_flows_, 0u); multi_shard_exe_.reset(new MultiShardExecution()); shard_flows_.resize(num_df_flows_); for (unsigned i = 0; i < num_df_flows_; ++i) { shard_flows_[i].reset(new Replica(master_context_, i, &service_, multi_shard_exe_)); } // Blocked on until all flows got full sync cut. fibers_ext::BlockingCounter sync_block{num_df_flows_}; auto err_handler = [this, sync_block](const auto& ge) mutable { sync_block.Cancel(); // Unblock this function. DefaultErrorHandler(ge); // Close sockets to unblock flows. }; RETURN_ON_ERR(cntx_.Switch(std::move(err_handler))); // Start full sync flows. auto partition = Partition(num_df_flows_); shard_set->pool()->AwaitFiberOnAll([&](unsigned index, auto*) { for (auto id : partition[index]) { auto ec = shard_flows_[id]->StartFullSyncFlow(sync_block, &cntx_); if (ec) cntx_.Error(ec); } }); RETURN_ON_ERR(cntx_.GetError()); // Send DFLY SYNC. if (auto ec = SendNextPhaseRequest(); ec) { return cntx_.Error(ec); } // Wait for all flows to receive full sync cut. // In case of an error, this is unblocked by the error handler. LOG(INFO) << "Waiting for all full sync cut confirmations"; sync_block.Wait(); LOG(INFO) << "Full sync finished"; return cntx_.GetError(); } error_code Replica::ConsumeRedisStream() { base::IoBuf io_buf(16_KB); parser_.reset(new RedisParser); ReqSerializer serializer{sock_.get()}; // Master waits for this command in order to start sending replication stream. RETURN_ON_ERR(SendCommand("REPLCONF ACK 0", &serializer)); VLOG(1) << "Before reading repl-log"; // Redis sends eiher pings every "repl_ping_slave_period" time inside replicationCron(). // or, alternatively, write commands stream coming from propagate() function. // Replica connection must send "REPLCONF ACK xxx" in order to make sure that master replication // buffer gets disposed of already processed commands. error_code ec; time_t last_ack = time(nullptr); string ack_cmd; // basically reflection of dragonfly_connection IoLoop function. while (!ec) { io::MutableBytes buf = io_buf.AppendBuffer(); io::Result size_res = sock_->Recv(buf); if (!size_res) return size_res.error(); VLOG(1) << "Read replication stream of " << *size_res << " bytes"; last_io_time_ = sock_->proactor()->GetMonotonicTimeNs(); io_buf.CommitWrite(*size_res); repl_offs_ += *size_res; // Send repl ack back to master. if (repl_offs_ > ack_offs_ + 1024 || time(nullptr) > last_ack + 5) { ack_cmd.clear(); absl::StrAppend(&ack_cmd, "REPLCONF ACK ", repl_offs_); RETURN_ON_ERR(SendCommand(ack_cmd, &serializer)); } ec = ParseAndExecute(&io_buf); } VLOG(1) << "ConsumeRedisStream finished"; return ec; } error_code Replica::ConsumeDflyStream() { // Send DFLY STARTSTABLE. if (auto ec = SendNextPhaseRequest(); ec) { return cntx_.Error(ec); } // Wait for all flows to finish full sync. JoinAllFlows(); RETURN_ON_ERR(cntx_.Switch(absl::bind_front(&Replica::DefaultErrorHandler, this))); vector> partition = Partition(num_df_flows_); shard_set->pool()->AwaitFiberOnAll([&](unsigned index, auto*) { const auto& local_ids = partition[index]; for (unsigned id : local_ids) { auto ec = shard_flows_[id]->StartStableSyncFlow(&cntx_); if (ec) cntx_.Error(ec); } }); return cntx_.GetError(); } void Replica::CloseAllSockets() { if (sock_) { sock_->proactor()->Await([this] { auto ec = sock_->Shutdown(SHUT_RDWR); LOG_IF(ERROR, ec) << "Could not shutdown socket " << ec; }); } for (auto& flow : shard_flows_) { flow->CloseAllSockets(); } } void Replica::JoinAllFlows() { for (auto& flow : shard_flows_) { if (flow->sync_fb_.joinable()) { flow->sync_fb_.join(); } } } void Replica::DefaultErrorHandler(const GenericError& err) { CloseAllSockets(); } error_code Replica::SendNextPhaseRequest() { ReqSerializer serializer{sock_.get()}; // Ask master to start sending replication stream string request = (state_mask_ & R_SYNC_OK) ? "STARTSTABLE" : "SYNC"; RETURN_ON_ERR( SendCommand(StrCat("DFLY ", request, " ", master_context_.dfly_session_id), &serializer)); base::IoBuf io_buf{128}; unsigned consumed = 0; RETURN_ON_ERR(ReadRespReply(&io_buf, &consumed)); if (!CheckRespIsSimpleReply("OK")) { LOG(ERROR) << "Phase transition failed " << ToSV(io_buf.InputBuffer()); return make_error_code(errc::bad_message); } return std::error_code{}; } error_code Replica::StartFullSyncFlow(fibers_ext::BlockingCounter sb, Context* cntx) { CHECK(!sock_); DCHECK(!master_context_.master_repl_id.empty() && !master_context_.dfly_session_id.empty()); ProactorBase* mythread = ProactorBase::me(); CHECK(mythread); sock_.reset(mythread->CreateSocket()); RETURN_ON_ERR(sock_->Connect(master_context_.endpoint)); VLOG(1) << "Sending on flow " << master_context_.master_repl_id << " " << master_context_.dfly_session_id << " " << master_context_.dfly_flow_id; ReqSerializer serializer{sock_.get()}; auto cmd = StrCat("DFLY FLOW ", master_context_.master_repl_id, " ", master_context_.dfly_session_id, " ", master_context_.dfly_flow_id); RETURN_ON_ERR(SendCommand(cmd, &serializer)); parser_.reset(new RedisParser{false}); // client mode leftover_buf_.reset(new base::IoBuf(128)); unsigned consumed = 0; RETURN_ON_ERR(ReadRespReply(leftover_buf_.get(), &consumed)); // uses parser_ if (!CheckRespFirstTypes({RespExpr::STRING, RespExpr::STRING})) { LOG(ERROR) << "Bad FLOW response " << ToSV(leftover_buf_->InputBuffer()); return make_error_code(errc::bad_message); } string_view flow_directive = ToSV(resp_args_[0].GetBuf()); string eof_token; if (flow_directive == "FULL") { eof_token = ToSV(resp_args_[1].GetBuf()); } else { LOG(ERROR) << "Bad FLOW response " << ToSV(leftover_buf_->InputBuffer()); } leftover_buf_->ConsumeInput(consumed); state_mask_ = R_ENABLED | R_TCP_CONNECTED; // We can not discard io_buf because it may contain data // besides the response we parsed. Therefore we pass it further to ReplicateDFFb. sync_fb_ = ::boost::fibers::fiber(&Replica::FullSyncDflyFb, this, move(eof_token), sb, cntx); return error_code{}; } error_code Replica::StartStableSyncFlow(Context* cntx) { DCHECK(!master_context_.master_repl_id.empty() && !master_context_.dfly_session_id.empty()); ProactorBase* mythread = ProactorBase::me(); CHECK(mythread); CHECK(sock_->IsOpen()); // sock_.reset(mythread->CreateSocket()); // RETURN_ON_ERR(sock_->Connect(master_context_.master_ep)); sync_fb_ = ::boost::fibers::fiber(&Replica::StableSyncDflyFb, this, cntx); return std::error_code{}; } void Replica::FullSyncDflyFb(string eof_token, fibers_ext::BlockingCounter bc, Context* cntx) { DCHECK(leftover_buf_); SocketSource ss{sock_.get()}; io::PrefixSource ps{leftover_buf_->InputBuffer(), &ss}; RdbLoader loader(&service_); loader.SetFullSyncCutCb([bc, ran = false]() mutable { if (!ran) { bc.Dec(); ran = true; } }); // Load incoming rdb stream. if (std::error_code ec = loader.Load(&ps); ec) { cntx->Error(ec, "Error loading rdb format"); return; } // Try finding eof token. io::PrefixSource chained_tail{loader.Leftover(), &ps}; if (!eof_token.empty()) { unique_ptr buf{new uint8_t[eof_token.size()]}; io::Result res = chained_tail.ReadAtLeast(io::MutableBytes{buf.get(), eof_token.size()}, eof_token.size()); if (!res || *res != eof_token.size()) { cntx->Error(std::make_error_code(errc::protocol_error), "Error finding eof token in stream"); return; } } // Keep loader leftover. io::Bytes unused = chained_tail.unused_prefix(); if (unused.size() > 0) { leftover_buf_.reset(new base::IoBuf{unused.size()}); auto mut_bytes = leftover_buf_->AppendBuffer(); memcpy(mut_bytes.data(), unused.data(), unused.size()); leftover_buf_->CommitWrite(unused.size()); } else { leftover_buf_.reset(); } VLOG(1) << "FullSyncDflyFb finished after reading " << loader.bytes_read() << " bytes"; } void Replica::StableSyncDflyFb(Context* cntx) { // Check leftover from full sync. io::Bytes prefix{}; if (leftover_buf_ && leftover_buf_->InputLen() > 0) { prefix = leftover_buf_->InputBuffer(); } SocketSource ss{sock_.get()}; io::PrefixSource ps{prefix, &ss}; JournalReader reader{0}; JournalExecutor executor{&service_}; while (!cntx->IsCancelled()) { auto res = reader.ReadEntry(&ps); if (!res) { cntx->Error(res.error(), "Journal format error"); return; } ExecuteEntry(&executor, res.value()); last_io_time_ = sock_->proactor()->GetMonotonicTimeNs(); } return; } void Replica::ExecuteEntry(JournalExecutor* executor, journal::ParsedEntry& entry) { if (entry.shard_cnt <= 1) { // not multi shard cmd executor->Execute(entry); return; } // Multi shard command flow: // step 1: Fiber wait until all the fibers that should execute this tranaction got // to the journal entry of the transaction. // step 2: execute the command (All fibers) // step 3: Fiber wait until all fibers finished the execution // By step 1 we enforce that replica will execute multi shard commands that finished on master // By step 3 we ensures the correctness of flushall/flushdb commands // TODO: this implemantaion does not support atomicity in replica // Although multi shard transaction happen in step 2 very close to each other, // user can query replica between executions. // To support atomicity we should have one fiber in step 2 which will excute all the entries of // the transaction together. In case of global comand such as flushdb the command can be executed // by only one fiber. // TODO: support error handler in this flow // Only the first fiber to reach the transaction will create data for transaction in map multi_shard_exe_->map_mu.lock(); auto [it, was_insert] = multi_shard_exe_->tx_sync_execution.emplace(entry.txid, entry.shard_cnt); // Note: we must release the mutex befor calling wait on barrier multi_shard_exe_->map_mu.unlock(); VLOG(2) << "txid: " << entry.txid << " unique_shard_cnt_: " << entry.shard_cnt << " was_insert: " << was_insert; // step 1 it->second.barrier.wait(); // step 2 executor->Execute(entry); // step 3 it->second.barrier.wait(); // Note: erase from map can be done only after all fibers returned from wait. // The last fiber which will decrease the counter to 0 will be the one to erase the data from map auto val = it->second.counter.fetch_sub(1, std::memory_order_relaxed); VLOG(2) << "txid: " << entry.txid << " unique_shard_cnt_: " << entry.shard_cnt << " counter: " << val; if (val == 1) { std::lock_guard lg{multi_shard_exe_->map_mu}; multi_shard_exe_->tx_sync_execution.erase(entry.txid); } } error_code Replica::ReadRespReply(base::IoBuf* io_buf, uint32_t* consumed) { DCHECK(parser_); error_code ec; // basically reflection of dragonfly_connection IoLoop function. while (!ec) { io::MutableBytes buf = io_buf->AppendBuffer(); io::Result size_res = sock_->Recv(buf); if (!size_res) return size_res.error(); VLOG(2) << "Read master response of " << *size_res << " bytes"; last_io_time_ = sock_->proactor()->GetMonotonicTimeNs(); io_buf->CommitWrite(*size_res); RedisParser::Result result = parser_->Parse(io_buf->InputBuffer(), consumed, &resp_args_); if (result == RedisParser::OK && !resp_args_.empty()) { return error_code{}; // success path } if (result != RedisParser::INPUT_PENDING) { LOG(ERROR) << "Invalid parser status " << result << " for buffer of size " << io_buf->InputLen(); return std::make_error_code(std::errc::bad_message); } io_buf->ConsumeInput(*consumed); } return ec; } error_code Replica::ParseReplicationHeader(base::IoBuf* io_buf, PSyncResponse* dest) { std::string_view str; RETURN_ON_ERR(ReadLine(io_buf, &str)); DCHECK(!str.empty()); std::string_view header; bool valid = false; // non-empty lines if (str[0] != '+') { goto bad_header; } header = str.substr(1); VLOG(1) << "header: " << header; if (absl::ConsumePrefix(&header, "FULLRESYNC ")) { // +FULLRESYNC db7bd45bf68ae9b1acac33acb 123\r\n // master_id repl_offset size_t pos = header.find(' '); if (pos != std::string_view::npos) { if (absl::SimpleAtoi(header.substr(pos + 1), &repl_offs_)) { master_context_.master_repl_id = string(header.substr(0, pos)); valid = true; VLOG(1) << "master repl_id " << master_context_.master_repl_id << " / " << repl_offs_; } } if (!valid) goto bad_header; io_buf->ConsumeInput(str.size() + 2); RETURN_ON_ERR(ReadLine(io_buf, &str)); // Read the next line parsed below. // Readline checks for non ws character first before searching for eol // so str must be non empty. DCHECK(!str.empty()); if (str[0] != '$') { goto bad_header; } std::string_view token = str.substr(1); VLOG(1) << "token: " << token; if (absl::ConsumePrefix(&token, "EOF:")) { CHECK_EQ(kRdbEofMarkSize, token.size()) << token; dest->fullsync.emplace(token); VLOG(1) << "Token: " << token; } else { size_t rdb_size = 0; if (!absl::SimpleAtoi(token, &rdb_size)) return std::make_error_code(std::errc::illegal_byte_sequence); VLOG(1) << "rdb size " << rdb_size; dest->fullsync.emplace(rdb_size); } io_buf->ConsumeInput(str.size() + 2); } else if (absl::ConsumePrefix(&header, "CONTINUE")) { // we send psync2 so we should get master replid. // That could change due to redis failovers. // TODO: part sync dest->fullsync.emplace(0); } return error_code{}; bad_header: LOG(ERROR) << "Bad replication header: " << str; return std::make_error_code(std::errc::illegal_byte_sequence); } error_code Replica::ReadLine(base::IoBuf* io_buf, string_view* line) { size_t eol_pos; std::string_view input_str = ToSV(io_buf->InputBuffer()); // consume whitespace. while (true) { auto it = find_if_not(input_str.begin(), input_str.end(), absl::ascii_isspace); size_t ws_len = it - input_str.begin(); io_buf->ConsumeInput(ws_len); input_str = ToSV(io_buf->InputBuffer()); if (!input_str.empty()) break; RETURN_ON_ERR(Recv(sock_.get(), io_buf)); input_str = ToSV(io_buf->InputBuffer()); }; // find eol. while (true) { eol_pos = input_str.find('\n'); if (eol_pos != std::string_view::npos) { DCHECK_GT(eol_pos, 0u); // can not be 0 because then would be consumed as a whitespace. if (input_str[eol_pos - 1] != '\r') { break; } *line = input_str.substr(0, eol_pos - 1); return error_code{}; } RETURN_ON_ERR(Recv(sock_.get(), io_buf)); input_str = ToSV(io_buf->InputBuffer()); } LOG(ERROR) << "Bad replication header: " << input_str; return std::make_error_code(std::errc::illegal_byte_sequence); } error_code Replica::ParseAndExecute(base::IoBuf* io_buf) { VLOG(1) << "ParseAndExecute: input len " << io_buf->InputLen(); if (parser_->stash_size() > 0) { DVLOG(1) << "Stash " << *parser_->stash()[0]; } uint32_t consumed = 0; RedisParser::Result result = RedisParser::OK; io::NullSink null_sink; // we never reply back on the commands. ConnectionContext conn_context{&null_sink, nullptr}; conn_context.is_replicating = true; do { result = parser_->Parse(io_buf->InputBuffer(), &consumed, &resp_args_); switch (result) { case RedisParser::OK: if (!resp_args_.empty()) { VLOG(2) << "Got command " << ToSV(resp_args_[0].GetBuf()) << ToSV(resp_args_[1].GetBuf()) << "\n consumed: " << consumed; facade::RespToArgList(resp_args_, &cmd_str_args_); CmdArgList arg_list{cmd_str_args_.data(), cmd_str_args_.size()}; service_.DispatchCommand(arg_list, &conn_context); } io_buf->ConsumeInput(consumed); break; case RedisParser::INPUT_PENDING: io_buf->ConsumeInput(consumed); break; default: LOG(ERROR) << "Invalid parser status " << result << " for buffer of size " << io_buf->InputLen(); return std::make_error_code(std::errc::bad_message); } } while (io_buf->InputLen() > 0 && result == RedisParser::OK); VLOG(1) << "ParseAndExecute: " << io_buf->InputLen() << " " << ToSV(io_buf->InputBuffer()); return error_code{}; } Replica::Info Replica::GetInfo() const { CHECK(sock_); return sock_->proactor()->AwaitBrief([this] { auto last_io_time = last_io_time_; for (const auto& flow : shard_flows_) { // Get last io time from all sub flows. last_io_time = std::max(last_io_time, flow->last_io_time_); } Info res; res.host = master_context_.host; res.port = master_context_.port; res.master_link_established = (state_mask_ & R_TCP_CONNECTED); res.sync_in_progress = (state_mask_ & R_SYNCING); res.master_last_io_sec = (ProactorBase::GetMonotonicTimeNs() - last_io_time) / 1000000000UL; return res; }); } bool Replica::CheckRespIsSimpleReply(string_view reply) const { return resp_args_.size() == 1 || resp_args_.front().type == RespExpr::STRING || ToSV(resp_args_.front().GetBuf()) == reply; } bool Replica::CheckRespFirstTypes(initializer_list types) const { unsigned i = 0; for (RespExpr::Type type : types) { if (i >= resp_args_.size() || resp_args_[i].type != type) return false; ++i; } return true; } error_code Replica::SendCommand(string_view command, ReqSerializer* serializer) { serializer->SendCommand(command); error_code ec = serializer->ec(); if (!ec) { last_io_time_ = sock_->proactor()->GetMonotonicTimeNs(); } return ec; } } // namespace dfly