chore: parser clean ups (#4077)

1. Eliminate redundant states
2. Eliminate redundant member variables

Added some comments. No functional changes.

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
Roman Gershman 2024-11-07 10:07:48 +02:00 committed by GitHub
parent f8b3fa0d7b
commit c19af2bd43
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 128 additions and 122 deletions

View file

@ -21,55 +21,56 @@ auto RedisParser::Parse(Buffer str, uint32_t* consumed, RespExpr::Vec* res) -> R
} }
if (state_ == CMD_COMPLETE_S) { if (state_ == CMD_COMPLETE_S) {
state_ = INIT_S;
}
if (state_ == INIT_S) {
InitStart(str[0], res); InitStart(str[0], res);
} else {
// We continue parsing in the middle.
if (!cached_expr_)
cached_expr_ = res;
} }
DCHECK(state_ != CMD_COMPLETE_S);
if (!cached_expr_) ResultConsumed resultc{OK, 0};
cached_expr_ = res;
do {
if (str.empty()) {
resultc.first = INPUT_PENDING;
break;
}
while (state_ != CMD_COMPLETE_S) {
last_consumed_ = 0;
switch (state_) { switch (state_) {
case MAP_LEN_S: case MAP_LEN_S:
case ARRAY_LEN_S: case ARRAY_LEN_S:
last_result_ = ConsumeArrayLen(str); resultc = ConsumeArrayLen(str);
break; break;
case PARSE_ARG_S: case PARSE_ARG_S:
if (str.size() == 0 || (str.size() < 4 && str[0] != '_')) { if (str.size() == 0 || (str.size() < 4 && str[0] != '_')) {
last_result_ = INPUT_PENDING; resultc.first = INPUT_PENDING;
} else { } else {
last_result_ = ParseArg(str); resultc = ParseArg(str);
} }
break; break;
case INLINE_S: case INLINE_S:
DCHECK(parse_stack_.empty()); DCHECK(parse_stack_.empty());
last_result_ = ParseInline(str); resultc = ParseInline(str);
break; break;
case BULK_STR_S: case BULK_STR_S:
last_result_ = ConsumeBulk(str); resultc = ConsumeBulk(str);
break;
case FINISH_ARG_S:
HandleFinishArg();
break; break;
default: default:
LOG(FATAL) << "Unexpected state " << int(state_); LOG(FATAL) << "Unexpected state " << int(state_);
} }
*consumed += last_consumed_; *consumed += resultc.second;
if (last_result_ != OK) { if (resultc.first != OK) {
break; break;
} }
str.remove_prefix(last_consumed_); str.remove_prefix(exchange(resultc.second, 0));
} } while (state_ != CMD_COMPLETE_S);
if (last_result_ == INPUT_PENDING) { if (resultc.first == INPUT_PENDING) {
StashState(res); StashState(res);
} else if (last_result_ == OK) { } else if (resultc.first == OK) {
DCHECK(cached_expr_); DCHECK(cached_expr_);
if (res != cached_expr_) { if (res != cached_expr_) {
DCHECK(!stash_.empty()); DCHECK(!stash_.empty());
@ -78,7 +79,7 @@ auto RedisParser::Parse(Buffer str, uint32_t* consumed, RespExpr::Vec* res) -> R
} }
} }
return last_result_; return resultc.first;
} }
void RedisParser::InitStart(uint8_t prefix_b, RespExpr::Vec* res) { void RedisParser::InitStart(uint8_t prefix_b, RespExpr::Vec* res) {
@ -150,17 +151,20 @@ void RedisParser::StashState(RespExpr::Vec* res) {
} }
} }
auto RedisParser::ParseInline(Buffer str) -> Result { auto RedisParser::ParseInline(Buffer str) -> ResultConsumed {
DCHECK(!str.empty()); DCHECK(!str.empty());
uint8_t* ptr = str.begin(); uint8_t* ptr = str.begin();
uint8_t* end = str.end(); uint8_t* end = str.end();
uint8_t* token_start = ptr; uint8_t* token_start = ptr;
if (is_broken_token_) { auto find_token_end = [&] {
while (ptr != end && *ptr > 32) while (ptr != end && *ptr > 32)
++ptr; ++ptr;
};
if (is_broken_token_) {
find_token_end();
size_t len = ptr - token_start; size_t len = ptr - token_start;
ExtendLastString(Buffer(token_start, len)); ExtendLastString(Buffer(token_start, len));
@ -182,80 +186,69 @@ auto RedisParser::ParseInline(Buffer str) -> Result {
DCHECK(!is_broken_token_); DCHECK(!is_broken_token_);
token_start = ptr; token_start = ptr;
while (ptr != end && *ptr > 32) find_token_end();
++ptr;
cached_expr_->emplace_back(RespExpr::STRING); cached_expr_->emplace_back(RespExpr::STRING);
cached_expr_->back().u = Buffer{token_start, size_t(ptr - token_start)}; cached_expr_->back().u = Buffer{token_start, size_t(ptr - token_start)};
} }
last_consumed_ = ptr - str.data(); uint32_t last_consumed = ptr - str.data();
if (ptr == end) { // we have not finished parsing. if (ptr == end) { // we have not finished parsing.
if (ptr[-1] > 32) { if (ptr[-1] > 32) {
// we stopped in the middle of the token. // we stopped in the middle of the token.
is_broken_token_ = true; is_broken_token_ = true;
} }
return INPUT_PENDING; return {INPUT_PENDING, last_consumed};
} else {
++last_consumed_; // consume the delimiter as well.
} }
++last_consumed; // consume the delimiter as well.
state_ = CMD_COMPLETE_S; state_ = CMD_COMPLETE_S;
return OK; return {OK, last_consumed};
} }
auto RedisParser::ParseNum(Buffer str, int64_t* res) -> Result { // Parse lines like:'$5\r\n' or '*2\r\n'
if (str.size() < 4) { auto RedisParser::ParseLen(Buffer str, int64_t* res) -> ResultConsumed {
return INPUT_PENDING; DCHECK(!str.empty());
}
DCHECK(str[0] == '$' || str[0] == '*' || str[0] == '%' || str[0] == '~');
char* s = reinterpret_cast<char*>(str.data() + 1); char* s = reinterpret_cast<char*>(str.data() + 1);
char* pos = reinterpret_cast<char*>(memchr(s, '\n', str.size() - 1)); char* pos = reinterpret_cast<char*>(memchr(s, '\n', str.size() - 1));
if (!pos) { if (!pos) {
return str.size() < 32 ? INPUT_PENDING : BAD_INT; Result r = str.size() < 32 ? INPUT_PENDING : BAD_ARRAYLEN;
return {r, 0};
} }
if (pos[-1] != '\r') { if (pos[-1] != '\r') {
return BAD_INT; return {BAD_ARRAYLEN, 0};
} }
bool success = absl::SimpleAtoi(std::string_view{s, size_t(pos - s - 1)}, res); bool success = absl::SimpleAtoi(std::string_view{s, size_t(pos - s - 1)}, res);
if (!success) { return ResultConsumed{success ? OK : BAD_ARRAYLEN, (pos - s) + 2};
return BAD_INT;
}
last_consumed_ = (pos - s) + 2;
return OK;
} }
auto RedisParser::ConsumeArrayLen(Buffer str) -> Result { auto RedisParser::ConsumeArrayLen(Buffer str) -> ResultConsumed {
int64_t len; int64_t len;
Result res = ParseNum(str, &len); ResultConsumed res = ParseLen(str, &len);
if (res.first != OK) {
return res;
}
if (state_ == MAP_LEN_S) { if (state_ == MAP_LEN_S) {
// Map starts with %N followed by an array of 2*N elements. // Map starts with %N followed by an array of 2*N elements.
// Even elements are keys, odd elements are values. // Even elements are keys, odd elements are values.
len *= 2; len *= 2;
} }
switch (res) {
case INPUT_PENDING:
return INPUT_PENDING;
case BAD_INT:
return BAD_ARRAYLEN;
case OK:
if (len < -1 || len > max_arr_len_) {
LOG_IF(WARNING, len > max_arr_len_) << "Multibulk len is too large " << len;
return BAD_ARRAYLEN; if (len < -1 || len > max_arr_len_) {
} LOG_IF(WARNING, len > max_arr_len_) << "Multibulk len is too large " << len;
break;
default: return {BAD_ARRAYLEN, res.second};
LOG(ERROR) << "Unexpected result " << res;
} }
if (server_mode_ && (parse_stack_.size() > 0 || !cached_expr_->empty())) if (server_mode_ && (parse_stack_.size() > 0 || !cached_expr_->empty()))
return BAD_STRING; return {BAD_STRING, res.second};
if (len <= 0) { if (len <= 0) {
cached_expr_->emplace_back(len == -1 ? RespExpr::NIL_ARRAY : RespExpr::ARRAY); cached_expr_->emplace_back(len == -1 ? RespExpr::NIL_ARRAY : RespExpr::ARRAY);
@ -265,9 +258,13 @@ auto RedisParser::ConsumeArrayLen(Buffer str) -> Result {
static RespVec empty_vec; static RespVec empty_vec;
cached_expr_->back().u = &empty_vec; cached_expr_->back().u = &empty_vec;
} }
state_ = (parse_stack_.empty()) ? CMD_COMPLETE_S : FINISH_ARG_S; if (parse_stack_.empty()) {
state_ = CMD_COMPLETE_S;
} else {
HandleFinishArg();
}
return OK; return {OK, res.second};
} }
if (state_ == PARSE_ARG_S) { if (state_ == PARSE_ARG_S) {
@ -286,54 +283,49 @@ auto RedisParser::ConsumeArrayLen(Buffer str) -> Result {
DVLOG(1) << "PushStack: (" << len << ", " << cached_expr_ << ")"; DVLOG(1) << "PushStack: (" << len << ", " << cached_expr_ << ")";
parse_stack_.emplace_back(len, cached_expr_); parse_stack_.emplace_back(len, cached_expr_);
return OK; return {OK, res.second};
} }
auto RedisParser::ParseArg(Buffer str) -> Result { auto RedisParser::ParseArg(Buffer str) -> ResultConsumed {
char c = str[0]; char c = str[0];
if (c == '$') { if (c == '$') {
int64_t len; int64_t len;
Result res = ParseNum(str, &len); ResultConsumed res = ParseLen(str, &len);
switch (res) { if (res.first != OK) {
case INPUT_PENDING: return res;
return INPUT_PENDING;
case BAD_INT:
return BAD_ARRAYLEN;
case OK:
if (len < -1 || len > kMaxBulkLen)
return BAD_ARRAYLEN;
break;
default:
LOG(ERROR) << "Unexpected result " << res;
} }
if (len < 0) { // Resp2 NIL if (len < -1 || len > kMaxBulkLen)
state_ = FINISH_ARG_S; return {BAD_ARRAYLEN, res.second};
if (len == -1) { // Resp2 NIL
cached_expr_->emplace_back(RespExpr::NIL); cached_expr_->emplace_back(RespExpr::NIL);
cached_expr_->back().u = Buffer{};
HandleFinishArg();
} else { } else {
DVLOG(1) << "String(" << len << ")"; DVLOG(1) << "String(" << len << ")";
cached_expr_->emplace_back(RespExpr::STRING); cached_expr_->emplace_back(RespExpr::STRING);
cached_expr_->back().u = Buffer{};
bulk_len_ = len; bulk_len_ = len;
state_ = BULK_STR_S; state_ = BULK_STR_S;
} }
cached_expr_->back().u = Buffer{};
return OK; return {OK, res.second};
} }
if (server_mode_) { if (server_mode_) {
return BAD_BULKLEN; return {BAD_BULKLEN, 0};
} }
if (c == '_') { // Resp3 NIL if (c == '_') { // Resp3 NIL
// TODO: Do we need to validate that str[1:2] == "\r\n"? // TODO: Do we need to validate that str[1:2] == "\r\n"?
state_ = FINISH_ARG_S;
cached_expr_->emplace_back(RespExpr::NIL); cached_expr_->emplace_back(RespExpr::NIL);
cached_expr_->back().u = Buffer{}; cached_expr_->back().u = Buffer{};
last_consumed_ += 3; // '_','\r','\n' HandleFinishArg();
return OK; return {OK, 3}; // // '_','\r','\n'
} }
if (c == '*') { if (c == '*') {
@ -346,54 +338,60 @@ auto RedisParser::ParseArg(Buffer str) -> Result {
if (c == '+' || c == '-') { // Simple string or error. if (c == '+' || c == '-') { // Simple string or error.
DCHECK(!server_mode_); DCHECK(!server_mode_);
if (!eol) { if (!eol) {
return str.size() < 256 ? INPUT_PENDING : BAD_STRING; Result r = str.size() < 256 ? INPUT_PENDING : BAD_STRING;
return {r, 0};
} }
if (eol[-1] != '\r') if (eol[-1] != '\r')
return BAD_STRING; return {BAD_STRING, 0};
cached_expr_->emplace_back(c == '+' ? RespExpr::STRING : RespExpr::ERROR); cached_expr_->emplace_back(c == '+' ? RespExpr::STRING : RespExpr::ERROR);
cached_expr_->back().u = Buffer{reinterpret_cast<uint8_t*>(s), size_t((eol - 1) - s)}; cached_expr_->back().u = Buffer{reinterpret_cast<uint8_t*>(s), size_t((eol - 1) - s)};
} else if (c == ':') { } else if (c == ':') {
DCHECK(!server_mode_); DCHECK(!server_mode_);
if (!eol) { if (!eol) {
return str.size() < 32 ? INPUT_PENDING : BAD_INT; Result r = str.size() < 32 ? INPUT_PENDING : BAD_INT;
return {r, 0};
} }
int64_t ival; int64_t ival;
std::string_view tok{s, size_t((eol - s) - 1)}; std::string_view tok{s, size_t((eol - s) - 1)};
if (eol[-1] != '\r' || !absl::SimpleAtoi(tok, &ival)) if (eol[-1] != '\r' || !absl::SimpleAtoi(tok, &ival))
return BAD_INT; return {BAD_INT, 0};
cached_expr_->emplace_back(RespExpr::INT64); cached_expr_->emplace_back(RespExpr::INT64);
cached_expr_->back().u = ival; cached_expr_->back().u = ival;
} else if (c == ',') { } else if (c == ',') {
DCHECK(!server_mode_); DCHECK(!server_mode_);
if (!eol) { if (!eol) {
return str.size() < 32 ? INPUT_PENDING : BAD_DOUBLE; Result r = str.size() < 32 ? INPUT_PENDING : BAD_DOUBLE;
return {r, 0};
} }
double_t dval; double_t dval;
std::string_view tok{s, size_t((eol - s) - 1)}; std::string_view tok{s, size_t((eol - s) - 1)};
if (eol[-1] != '\r' || !absl::SimpleAtod(tok, &dval)) if (eol[-1] != '\r' || !absl::SimpleAtod(tok, &dval))
return BAD_INT; return {BAD_INT, 0};
cached_expr_->emplace_back(RespExpr::DOUBLE); cached_expr_->emplace_back(RespExpr::DOUBLE);
cached_expr_->back().u = dval; cached_expr_->back().u = dval;
} else { } else {
return BAD_STRING; return {BAD_STRING, 0};
} }
last_consumed_ = (eol - s) + 2; HandleFinishArg();
state_ = FINISH_ARG_S;
return OK; return {OK, (eol - s) + 2};
} }
auto RedisParser::ConsumeBulk(Buffer str) -> Result { auto RedisParser::ConsumeBulk(Buffer str) -> ResultConsumed {
auto& bulk_str = get<Buffer>(cached_expr_->back().u); auto& bulk_str = get<Buffer>(cached_expr_->back().u);
uint32_t consumed = 0;
if (str.size() >= bulk_len_ + 2) { if (str.size() >= bulk_len_ + 2) {
if (str[bulk_len_] != '\r' || str[bulk_len_ + 1] != '\n') { if (str[bulk_len_] != '\r' || str[bulk_len_ + 1] != '\n') {
return BAD_STRING; return {BAD_STRING, 0};
} }
if (bulk_len_) { if (bulk_len_) {
@ -405,11 +403,11 @@ auto RedisParser::ConsumeBulk(Buffer str) -> Result {
} }
} }
is_broken_token_ = false; is_broken_token_ = false;
state_ = FINISH_ARG_S; consumed = bulk_len_ + 2;
last_consumed_ = bulk_len_ + 2;
bulk_len_ = 0; bulk_len_ = 0;
HandleFinishArg();
return OK; return {OK, consumed};
} }
if (str.size() >= 32) { if (str.size() >= 32) {
@ -429,11 +427,11 @@ auto RedisParser::ConsumeBulk(Buffer str) -> Result {
is_broken_token_ = true; is_broken_token_ = true;
cached_expr_->back().has_support = true; cached_expr_->back().has_support = true;
} }
last_consumed_ = len; consumed = len;
bulk_len_ -= len; bulk_len_ -= len;
} }
return INPUT_PENDING; return {INPUT_PENDING, consumed};
} }
void RedisParser::HandleFinishArg() { void RedisParser::HandleFinishArg() {

View file

@ -24,23 +24,31 @@ class RedisParser {
public: public:
constexpr static long kMaxBulkLen = 256 * (1ul << 20); // 256MB. constexpr static long kMaxBulkLen = 256 * (1ul << 20); // 256MB.
enum Result { OK, INPUT_PENDING, BAD_ARRAYLEN, BAD_BULKLEN, BAD_STRING, BAD_INT, BAD_DOUBLE }; enum Result : uint8_t {
OK,
INPUT_PENDING,
BAD_ARRAYLEN,
BAD_BULKLEN,
BAD_STRING,
BAD_INT,
BAD_DOUBLE
};
using Buffer = RespExpr::Buffer; using Buffer = RespExpr::Buffer;
explicit RedisParser(uint32_t max_arr_len = UINT32_MAX, bool server_mode = true) explicit RedisParser(uint32_t max_arr_len = UINT32_MAX, bool server_mode = true)
: max_arr_len_(max_arr_len), server_mode_(server_mode) { : server_mode_(server_mode), max_arr_len_(max_arr_len) {
} }
/** /**
* @brief Parses str into res. "consumed" stores number of bytes consumed from str. * @brief Parses str into res. "consumed" stores number of bytes consumed from str.
* *
* A caller should not invalidate str if the parser returns RESP_OK as long as he continues * A caller should not invalidate str if the parser returns RESP_OK as long as he continues
* accessing res. However, if parser returns MORE_INPUT a caller may discard consumed * accessing res. However, if parser returns INPUT_PENDING a caller may discard consumed
* part of str because parser caches the intermediate state internally according to 'consumed' * part of str because parser caches the intermediate state internally according to 'consumed'
* result. * result.
* *
* Note: A parser does not always guarantee progress, i.e. if a small buffer was passed it may * Note: A parser does not always guarantee progress, i.e. if a small buffer was passed it may
* returns MORE_INPUT with consumed == 0. * returns INPUT_PENDING with consumed == 0.
* *
*/ */
@ -64,49 +72,49 @@ class RedisParser {
size_t UsedMemory() const; size_t UsedMemory() const;
private: private:
using ResultConsumed = std::pair<Result, uint32_t>;
void InitStart(uint8_t prefix_b, RespVec* res); void InitStart(uint8_t prefix_b, RespVec* res);
void StashState(RespVec* res); void StashState(RespVec* res);
// Skips the first character (*). // Skips the first character (*).
Result ConsumeArrayLen(Buffer str); ResultConsumed ConsumeArrayLen(Buffer str);
Result ParseArg(Buffer str); ResultConsumed ParseArg(Buffer str);
Result ConsumeBulk(Buffer str); ResultConsumed ConsumeBulk(Buffer str);
Result ParseInline(Buffer str); ResultConsumed ParseInline(Buffer str);
ResultConsumed ParseLen(Buffer str, int64_t* res);
// Updates last_consumed_
Result ParseNum(Buffer str, int64_t* res);
void HandleFinishArg(); void HandleFinishArg();
void ExtendLastString(Buffer str); void ExtendLastString(Buffer str);
enum State : uint8_t { enum State : uint8_t {
INIT_S = 0,
INLINE_S, INLINE_S,
ARRAY_LEN_S, ARRAY_LEN_S,
MAP_LEN_S, MAP_LEN_S,
PARSE_ARG_S, // Parse [$:+-]string\r\n PARSE_ARG_S, // Parse [$:+-]string\r\n
BULK_STR_S, BULK_STR_S,
FINISH_ARG_S,
CMD_COMPLETE_S, CMD_COMPLETE_S,
}; };
State state_ = INIT_S; State state_ = CMD_COMPLETE_S;
Result last_result_ = OK; bool is_broken_token_ = false; // whether the last inline string was broken in the middle.
bool server_mode_ = true;
uint32_t last_consumed_ = 0;
uint32_t bulk_len_ = 0; uint32_t bulk_len_ = 0;
uint32_t last_stashed_level_ = 0, last_stashed_index_ = 0; uint32_t last_stashed_level_ = 0, last_stashed_index_ = 0;
uint32_t max_arr_len_;
// Points either to the result passed by the caller or to the stash.
RespVec* cached_expr_ = nullptr;
// expected expression length, pointer to expression vector. // expected expression length, pointer to expression vector.
// For server mode, the length is at most 1.
absl::InlinedVector<std::pair<uint32_t, RespVec*>, 4> parse_stack_; absl::InlinedVector<std::pair<uint32_t, RespVec*>, 4> parse_stack_;
std::vector<std::unique_ptr<RespVec>> stash_; std::vector<std::unique_ptr<RespVec>> stash_;
using Blob = std::vector<uint8_t>; using Blob = std::vector<uint8_t>;
std::vector<Blob> buf_stash_; std::vector<Blob> buf_stash_;
RespVec* cached_expr_ = nullptr;
uint32_t max_arr_len_;
bool is_broken_token_ = false;
bool server_mode_ = true;
}; };
} // namespace facade } // namespace facade