From d8fce103eb3aead98daad510766a8c838b6e00fb Mon Sep 17 00:00:00 2001 From: Roman Gershman Date: Wed, 7 May 2025 11:39:18 +0300 Subject: [PATCH] chore: reorganize compact object mask bits Specifically get rid of the MaskBit enum and replace it with explicit bits aliasing the mask. Reorganize the encoding bits to be able to store 4 states that include huffman encoding. Solves the first part of #4880 Signed-off-by: Roman Gershman --- src/core/compact_object.cc | 66 +++++++++---------- src/core/compact_object.h | 112 ++++++++++++++------------------ src/core/compact_object_test.cc | 1 + 3 files changed, 81 insertions(+), 98 deletions(-) diff --git a/src/core/compact_object.cc b/src/core/compact_object.cc index c44326c19..5e8dd187a 100644 --- a/src/core/compact_object.cc +++ b/src/core/compact_object.cc @@ -804,14 +804,13 @@ size_t CompactObj::Size() const { LOG(DFATAL) << "Should not reach " << int(taglen_); } } - uint8_t encoded = (mask_ & kEncMask); - return encoded ? DecodedLen(raw_size) : raw_size; + return mask_bits_.encoding ? DecodedLen(raw_size) : raw_size; } uint64_t CompactObj::HashCode() const { DCHECK(taglen_ != JSON_TAG) << "JSON type cannot be used for keys!"; - uint8_t encoded = (mask_ & kEncMask); + uint8_t encoded = mask_bits_.encoding; if (IsInline()) { if (encoded) { char buf[kInlineLen * 2]; @@ -887,7 +886,8 @@ void CompactObj::SetInt(int64_t val) { DCHECK(!IsExternal()); if (INT_TAG != taglen_) { - SetMeta(INT_TAG, mask_ & ~kEncMask); + SetMeta(INT_TAG, mask_); + mask_bits_.encoding = NONE_ENC; } u_.ival = val; @@ -970,8 +970,9 @@ SBF* CompactObj::GetSBF() const { } void CompactObj::SetString(std::string_view str) { - uint8_t mask = mask_ & ~kEncMask; CHECK(!IsExternal()); + mask_bits_.encoding = NONE_ENC; + // Trying auto-detection heuristics first. if (str.size() <= 20) { long long ival; @@ -979,14 +980,14 @@ void CompactObj::SetString(std::string_view str) { // We use redis string2ll to be compatible with Redis. 
if (string2ll(str.data(), str.size(), &ival)) { - SetMeta(INT_TAG, mask); + SetMeta(INT_TAG, mask_); u_.ival = ival; return; } if (str.size() <= kInlineLen) { - SetMeta(str.size(), mask); + SetMeta(str.size(), mask_); if (!str.empty()) memcpy(u_.inline_str, str.data(), str.size()); return; @@ -997,8 +998,9 @@ void CompactObj::SetString(std::string_view str) { } void CompactObj::ReserveString(size_t size) { - uint8_t mask = mask_ & ~kEncMask; - SetMeta(ROBJ_TAG, mask); + mask_bits_.encoding = NONE_ENC; + SetMeta(ROBJ_TAG, mask_); + u_.r_obj.ReserveString(size, tl.local_mr); } @@ -1006,16 +1008,16 @@ void CompactObj::AppendString(std::string_view str) { u_.r_obj.AppendString(str, tl.local_mr); } +// TODO: to simplify this code using GetString(char*) variant. string_view CompactObj::GetSlice(string* scratch) const { CHECK(!IsExternal()); - uint8_t is_encoded = mask_ & kEncMask; if (IsInline()) { - if (is_encoded) { + if (mask_bits_.encoding) { size_t decoded_len = taglen_ + 2; // must be this because we either shortened 17 or 18. - DCHECK_EQ(is_encoded, ASCII2_ENC_BIT); + DCHECK_EQ(mask_bits_.encoding, ASCII2_ENC); DCHECK_EQ(decoded_len, ascii_len(taglen_)); scratch->resize(decoded_len); @@ -1033,7 +1035,7 @@ string_view CompactObj::GetSlice(string* scratch) const { return *scratch; } - if (is_encoded) { + if (mask_bits_.encoding) { if (taglen_ == ROBJ_TAG) { CHECK_EQ(OBJ_STRING, u_.r_obj.type()); DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding()); @@ -1120,14 +1122,13 @@ void __attribute__((noinline)) CompactObj::GetString(string* res) const { void CompactObj::GetString(char* dest) const { CHECK(!IsExternal()); - uint8_t is_encoded = mask_ & kEncMask; if (IsInline()) { - if (is_encoded) { + if (mask_bits_.encoding) { size_t decoded_len = taglen_ + 2; // must be this because we either shortened 17 or 18. 
- DCHECK_EQ(is_encoded, ASCII2_ENC_BIT); + DCHECK_EQ(mask_bits_.encoding, ASCII2_ENC); DCHECK_EQ(decoded_len, ascii_len(taglen_)); detail::ascii_unpack(to_byte(u_.inline_str), decoded_len, dest); @@ -1144,7 +1145,7 @@ void CompactObj::GetString(char* dest) const { return; } - if (is_encoded) { + if (mask_bits_.encoding) { if (taglen_ == ROBJ_TAG) { CHECK_EQ(OBJ_STRING, u_.r_obj.type()); DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding()); @@ -1220,7 +1221,9 @@ auto CompactObj::GetCool() const -> CoolItem { void CompactObj::ImportExternal(const CompactObj& src) { DCHECK(src.IsExternal()); - SetMeta(EXTERNAL_TAG, src.mask_ & kEncMask); + uint8_t encoding = src.mask_bits_.encoding; + SetMeta(EXTERNAL_TAG, 0); + mask_bits_.encoding = encoding; u_.ext_ptr = src.u_.ext_ptr; } @@ -1238,13 +1241,11 @@ void CompactObj::Materialize(std::string_view blob, bool is_raw) { DCHECK_GT(blob.size(), kInlineLen); if (is_raw) { - uint8_t mask = mask_; - if (kUseSmallStrings && SmallString::CanAllocate(blob.size())) { - SetMeta(SMALL_TAG, mask); + SetMeta(SMALL_TAG, mask_); tl.small_str_bytes += u_.small_str.Assign(blob); } else { - SetMeta(ROBJ_TAG, mask); + SetMeta(ROBJ_TAG, mask_); u_.r_obj.SetString(blob, tl.local_mr); } } else { @@ -1317,8 +1318,8 @@ size_t CompactObj::MallocUsed(bool slow) const { bool CompactObj::operator==(const CompactObj& o) const { DCHECK(taglen_ != JSON_TAG && o.taglen_ != JSON_TAG) << "cannot use JSON type to check equal"; - uint8_t m1 = mask_ & kEncMask; - uint8_t m2 = o.mask_ & kEncMask; + uint8_t m1 = mask_bits_.encoding; + uint8_t m2 = o.mask_bits_.encoding; if (m1 != m2) return false; @@ -1433,8 +1434,8 @@ bool CompactObj::CmpEncoded(string_view sv) const { void CompactObj::EncodeString(string_view str) { DCHECK_GT(str.size(), kInlineLen); + DCHECK_EQ(NONE_ENC, mask_bits_.encoding); - uint8_t mask = mask_ & ~kEncMask; string_view encoded = str; bool is_ascii = kUseAsciiEncoding && detail::validate_ascii_fast(str.data(), str.size()); @@ -1443,11 +1444,10 
@@ void CompactObj::EncodeString(string_view str) { size_t rev_len = ascii_len(encode_len); if (rev_len == str.size()) { - mask |= ASCII2_ENC_BIT; // str hits its highest bound. + mask_bits_.encoding = ASCII2_ENC; // str hits its highest bound. } else { CHECK_EQ(str.size(), rev_len - 1) << "Bad ascii encoding for len " << str.size(); - - mask |= ASCII1_ENC_BIT; + mask_bits_.encoding = ASCII1_ENC; // str is shorter than its highest bound. } tl.tmp_buf.resize(encode_len); @@ -1455,7 +1455,7 @@ void CompactObj::EncodeString(string_view str) { encoded = string_view{reinterpret_cast(tl.tmp_buf.data()), encode_len}; if (encoded.size() <= kInlineLen) { - SetMeta(encoded.size(), mask); + SetMeta(encoded.size(), mask_); detail::ascii_pack(str.data(), str.size(), reinterpret_cast(u_.inline_str)); return; @@ -1464,20 +1464,19 @@ void CompactObj::EncodeString(string_view str) { if (kUseSmallStrings && SmallString::CanAllocate(encoded.size())) { if (taglen_ == 0) { - SetMeta(SMALL_TAG, mask); + SetMeta(SMALL_TAG, mask_); tl.small_str_bytes += u_.small_str.Assign(encoded); return; } if (taglen_ == SMALL_TAG && encoded.size() <= u_.small_str.size()) { - mask_ = mask; tl.small_str_bytes -= u_.small_str.MallocUsed(); tl.small_str_bytes += u_.small_str.Assign(encoded); return; } } - SetMeta(ROBJ_TAG, mask); + SetMeta(ROBJ_TAG, mask_); u_.r_obj.SetString(encoded, tl.local_mr); } @@ -1501,7 +1500,8 @@ StringOrView CompactObj::GetRawString() const { } size_t CompactObj::DecodedLen(size_t sz) const { - return ascii_len(sz) - ((mask_ & ASCII1_ENC_BIT) ? 1 : 0); + unsigned delta = (mask_bits_.encoding == ASCII1_ENC ? 
1 : 0); + return ascii_len(sz) - delta; } MemoryResource* CompactObj::memory_resource() { diff --git a/src/core/compact_object.h b/src/core/compact_object.h index 7d7272de8..fd05c2e06 100644 --- a/src/core/compact_object.h +++ b/src/core/compact_object.h @@ -125,36 +125,13 @@ class CompactObj { SBF_TAG = 22, }; - enum MaskBit { - REF_BIT = 1, - EXPIRE_BIT = 2, // Mark objects that have expiry timestamp assigned. - FLAG_BIT = 4, // Used to mark keys that have memcache flags assigned. - - // ascii encoding is not an injective function. it compresses 8 bytes to 7 but also 7 to 7. - // therefore, in order to know the original length we introduce 2 flags that - // correct the length upon decoding. ASCII1_ENC_BIT rounds down the decoded length, - // while ASCII2_ENC_BIT rounds it up. See DecodedLen implementation for more info. - ASCII1_ENC_BIT = 8, - ASCII2_ENC_BIT = 0x10, - - // IO_PENDING is set when the tiered storage has issued an i/o request to save the value. It is - // cleared when the io request finishes or is cancelled. - IO_PENDING = 0x20, - - // Applied only on keys that should be deleted asynchronously. - // (it can be the same value as IO_PENDING) that is applied only on values. - KEY_ASYNC_DELETE = 0x20, - STICKY = 0x40, - - // TOUCHED used to determin which items are hot/cold. - // by checking if the item was touched from the last time we - // reached this item while travering the database to set items as cold. 
- // https://junchengyang.com/publication/nsdi24-SIEVE.pdf - TOUCHED = 0x80, + enum Encoding : uint8_t { + NONE_ENC = 0, + ASCII1_ENC = 1, + ASCII2_ENC = 2, + HUFFMAN_ENC = 3, // TBD }; - static constexpr uint8_t kEncMask = ASCII1_ENC_BIT | ASCII2_ENC_BIT; - public: using PrefixArray = std::vector; using MemoryResource = detail::RobjWrapper::MemoryResource; @@ -185,13 +162,14 @@ class CompactObj { CompactObj res; memcpy(&res.u_, &u_, sizeof(u_)); res.taglen_ = taglen_; - res.mask_ = mask_ | REF_BIT; + res.mask_ = mask_; + res.mask_bits_.ref = 1; return res; } bool IsRef() const { - return mask_ & REF_BIT; + return mask_bits_.ref; } std::string_view GetSlice(std::string* scratch) const; @@ -222,73 +200,53 @@ class CompactObj { } bool HasExpire() const { - return mask_ & EXPIRE_BIT; + return mask_bits_.expire; } void SetExpire(bool e) { - if (e) { - mask_ |= EXPIRE_BIT; - } else { - mask_ &= ~EXPIRE_BIT; - } + mask_bits_.expire = e; } bool HasFlag() const { - return mask_ & FLAG_BIT; + return mask_bits_.mc_flag; } void SetFlag(bool e) { - if (e) { - mask_ |= FLAG_BIT; - } else { - mask_ &= ~FLAG_BIT; - } + mask_bits_.mc_flag = e; } bool WasTouched() const { - return mask_ & TOUCHED; + return mask_bits_.touched; } void SetTouched(bool e) { - if (e) { - mask_ |= TOUCHED; - } else { - mask_ &= ~TOUCHED; - } + mask_bits_.touched = e; } bool DefragIfNeeded(float ratio); void SetAsyncDelete() { - mask_ |= KEY_ASYNC_DELETE; + mask_bits_.io_pending = 1; // io_pending flag is used for async delete for keys. 
 } bool IsAsyncDelete() const { - return mask_ & KEY_ASYNC_DELETE; + return mask_bits_.io_pending; } bool HasStashPending() const { - return mask_ & IO_PENDING; + return mask_bits_.io_pending; } void SetStashPending(bool b) { - if (b) { - mask_ |= IO_PENDING; - } else { - mask_ &= ~IO_PENDING; - } + mask_bits_.io_pending = b; } bool IsSticky() const { - return mask_ & STICKY; + return mask_bits_.sticky; } - void SetSticky(bool s) { - if (s) { - mask_ |= STICKY; - } else { - mask_ &= ~STICKY; - } + void SetSticky(bool e) { + mask_bits_.sticky = e; } unsigned Encoding() const; @@ -525,14 +483,38 @@ class CompactObj { // static_assert(sizeof(u_) == 16); - uint8_t mask_ = 0; + union { + uint8_t mask_ = 0; + struct { + uint8_t ref : 1; + uint8_t expire : 1; // Marks objects that have expiry timestamp assigned. + uint8_t mc_flag : 1; // Marks keys that have memcache flags assigned. + + // ascii encoding is not an injective function. it compresses 8 bytes to 7 but also 7 to 7. + // therefore, in order to know the original length we introduce 2 encoding values that + // correct the length upon decoding. ASCII1_ENC rounds down the decoded length, + // while ASCII2_ENC rounds it up. See DecodedLen implementation for more info. + uint8_t encoding : 2; + + // io_pending is set when the tiered storage has issued an i/o request to save the value. + // It is cleared when the io request finishes or is cancelled. + uint8_t io_pending : 1; // also serves as async-delete for keys. + uint8_t sticky : 1; + + // touched is used to determine which items are hot/cold, + // by checking if the item was touched from the last time we + // reached this item while traversing the database to set items as cold. + // https://junchengyang.com/publication/nsdi24-SIEVE.pdf + uint8_t touched : 1; // used to mark keys that were accessed. + } mask_bits_; + }; // We currently reserve 5 bits for tags and 3 bits for extending the mask. currently reserved. 
uint8_t taglen_ = 0; }; inline bool CompactObj::operator==(std::string_view sv) const { - if (mask_ & kEncMask) + if (mask_bits_.encoding) return CmpEncoded(sv); if (IsInline()) { diff --git a/src/core/compact_object_test.cc b/src/core/compact_object_test.cc index eedce4fee..9c2cf09e0 100644 --- a/src/core/compact_object_test.cc +++ b/src/core/compact_object_test.cc @@ -607,6 +607,7 @@ TEST_F(CompactObjectTest, RawInterface) { str.assign(50, char(200)); // non ascii cobj_.SetString(str); + ASSERT_EQ(str, cobj_.GetSlice(&tmp)); { auto raw_blob = cobj_.GetRawString();