chore: reorganize compact object mask bits (#5077)

Specifically, get rid of MaskEnum and replace it with explicit bit fields aliasing the mask.
Reorganize the encoding bits so that they can store 4 states, including Huffman encoding.

Solves the first part of #4880

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
Roman Gershman 2025-05-07 15:08:22 +03:00 committed by GitHub
parent 05d99769e1
commit 54328fd00e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 81 additions and 98 deletions

View file

@ -804,14 +804,13 @@ size_t CompactObj::Size() const {
LOG(DFATAL) << "Should not reach " << int(taglen_); LOG(DFATAL) << "Should not reach " << int(taglen_);
} }
} }
uint8_t encoded = (mask_ & kEncMask); return mask_bits_.encoding ? DecodedLen(raw_size) : raw_size;
return encoded ? DecodedLen(raw_size) : raw_size;
} }
uint64_t CompactObj::HashCode() const { uint64_t CompactObj::HashCode() const {
DCHECK(taglen_ != JSON_TAG) << "JSON type cannot be used for keys!"; DCHECK(taglen_ != JSON_TAG) << "JSON type cannot be used for keys!";
uint8_t encoded = (mask_ & kEncMask); uint8_t encoded = mask_bits_.encoding;
if (IsInline()) { if (IsInline()) {
if (encoded) { if (encoded) {
char buf[kInlineLen * 2]; char buf[kInlineLen * 2];
@ -887,7 +886,8 @@ void CompactObj::SetInt(int64_t val) {
DCHECK(!IsExternal()); DCHECK(!IsExternal());
if (INT_TAG != taglen_) { if (INT_TAG != taglen_) {
SetMeta(INT_TAG, mask_ & ~kEncMask); SetMeta(INT_TAG, mask_);
mask_bits_.encoding = NONE_ENC;
} }
u_.ival = val; u_.ival = val;
@ -970,8 +970,9 @@ SBF* CompactObj::GetSBF() const {
} }
void CompactObj::SetString(std::string_view str) { void CompactObj::SetString(std::string_view str) {
uint8_t mask = mask_ & ~kEncMask;
CHECK(!IsExternal()); CHECK(!IsExternal());
mask_bits_.encoding = NONE_ENC;
// Trying auto-detection heuristics first. // Trying auto-detection heuristics first.
if (str.size() <= 20) { if (str.size() <= 20) {
long long ival; long long ival;
@ -979,14 +980,14 @@ void CompactObj::SetString(std::string_view str) {
// We use redis string2ll to be compatible with Redis. // We use redis string2ll to be compatible with Redis.
if (string2ll(str.data(), str.size(), &ival)) { if (string2ll(str.data(), str.size(), &ival)) {
SetMeta(INT_TAG, mask); SetMeta(INT_TAG, mask_);
u_.ival = ival; u_.ival = ival;
return; return;
} }
if (str.size() <= kInlineLen) { if (str.size() <= kInlineLen) {
SetMeta(str.size(), mask); SetMeta(str.size(), mask_);
if (!str.empty()) if (!str.empty())
memcpy(u_.inline_str, str.data(), str.size()); memcpy(u_.inline_str, str.data(), str.size());
return; return;
@ -997,8 +998,9 @@ void CompactObj::SetString(std::string_view str) {
} }
void CompactObj::ReserveString(size_t size) { void CompactObj::ReserveString(size_t size) {
uint8_t mask = mask_ & ~kEncMask; mask_bits_.encoding = NONE_ENC;
SetMeta(ROBJ_TAG, mask); SetMeta(ROBJ_TAG, mask_);
u_.r_obj.ReserveString(size, tl.local_mr); u_.r_obj.ReserveString(size, tl.local_mr);
} }
@ -1006,16 +1008,16 @@ void CompactObj::AppendString(std::string_view str) {
u_.r_obj.AppendString(str, tl.local_mr); u_.r_obj.AppendString(str, tl.local_mr);
} }
// TODO: to simplify this code using GetString(char*) variant.
string_view CompactObj::GetSlice(string* scratch) const { string_view CompactObj::GetSlice(string* scratch) const {
CHECK(!IsExternal()); CHECK(!IsExternal());
uint8_t is_encoded = mask_ & kEncMask;
if (IsInline()) { if (IsInline()) {
if (is_encoded) { if (mask_bits_.encoding) {
size_t decoded_len = taglen_ + 2; size_t decoded_len = taglen_ + 2;
// must be this because we either shortened 17 or 18. // must be this because we either shortened 17 or 18.
DCHECK_EQ(is_encoded, ASCII2_ENC_BIT); DCHECK_EQ(mask_bits_.encoding, ASCII2_ENC);
DCHECK_EQ(decoded_len, ascii_len(taglen_)); DCHECK_EQ(decoded_len, ascii_len(taglen_));
scratch->resize(decoded_len); scratch->resize(decoded_len);
@ -1033,7 +1035,7 @@ string_view CompactObj::GetSlice(string* scratch) const {
return *scratch; return *scratch;
} }
if (is_encoded) { if (mask_bits_.encoding) {
if (taglen_ == ROBJ_TAG) { if (taglen_ == ROBJ_TAG) {
CHECK_EQ(OBJ_STRING, u_.r_obj.type()); CHECK_EQ(OBJ_STRING, u_.r_obj.type());
DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding()); DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
@ -1120,14 +1122,13 @@ void __attribute__((noinline)) CompactObj::GetString(string* res) const {
void CompactObj::GetString(char* dest) const { void CompactObj::GetString(char* dest) const {
CHECK(!IsExternal()); CHECK(!IsExternal());
uint8_t is_encoded = mask_ & kEncMask;
if (IsInline()) { if (IsInline()) {
if (is_encoded) { if (mask_bits_.encoding) {
size_t decoded_len = taglen_ + 2; size_t decoded_len = taglen_ + 2;
// must be this because we either shortened 17 or 18. // must be this because we either shortened 17 or 18.
DCHECK_EQ(is_encoded, ASCII2_ENC_BIT); DCHECK_EQ(mask_bits_.encoding, ASCII2_ENC);
DCHECK_EQ(decoded_len, ascii_len(taglen_)); DCHECK_EQ(decoded_len, ascii_len(taglen_));
detail::ascii_unpack(to_byte(u_.inline_str), decoded_len, dest); detail::ascii_unpack(to_byte(u_.inline_str), decoded_len, dest);
@ -1144,7 +1145,7 @@ void CompactObj::GetString(char* dest) const {
return; return;
} }
if (is_encoded) { if (mask_bits_.encoding) {
if (taglen_ == ROBJ_TAG) { if (taglen_ == ROBJ_TAG) {
CHECK_EQ(OBJ_STRING, u_.r_obj.type()); CHECK_EQ(OBJ_STRING, u_.r_obj.type());
DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding()); DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
@ -1220,7 +1221,9 @@ auto CompactObj::GetCool() const -> CoolItem {
void CompactObj::ImportExternal(const CompactObj& src) { void CompactObj::ImportExternal(const CompactObj& src) {
DCHECK(src.IsExternal()); DCHECK(src.IsExternal());
SetMeta(EXTERNAL_TAG, src.mask_ & kEncMask); uint8_t encoding = src.mask_bits_.encoding;
SetMeta(EXTERNAL_TAG, 0);
mask_bits_.encoding = encoding;
u_.ext_ptr = src.u_.ext_ptr; u_.ext_ptr = src.u_.ext_ptr;
} }
@ -1238,13 +1241,11 @@ void CompactObj::Materialize(std::string_view blob, bool is_raw) {
DCHECK_GT(blob.size(), kInlineLen); DCHECK_GT(blob.size(), kInlineLen);
if (is_raw) { if (is_raw) {
uint8_t mask = mask_;
if (kUseSmallStrings && SmallString::CanAllocate(blob.size())) { if (kUseSmallStrings && SmallString::CanAllocate(blob.size())) {
SetMeta(SMALL_TAG, mask); SetMeta(SMALL_TAG, mask_);
tl.small_str_bytes += u_.small_str.Assign(blob); tl.small_str_bytes += u_.small_str.Assign(blob);
} else { } else {
SetMeta(ROBJ_TAG, mask); SetMeta(ROBJ_TAG, mask_);
u_.r_obj.SetString(blob, tl.local_mr); u_.r_obj.SetString(blob, tl.local_mr);
} }
} else { } else {
@ -1317,8 +1318,8 @@ size_t CompactObj::MallocUsed(bool slow) const {
bool CompactObj::operator==(const CompactObj& o) const { bool CompactObj::operator==(const CompactObj& o) const {
DCHECK(taglen_ != JSON_TAG && o.taglen_ != JSON_TAG) << "cannot use JSON type to check equal"; DCHECK(taglen_ != JSON_TAG && o.taglen_ != JSON_TAG) << "cannot use JSON type to check equal";
uint8_t m1 = mask_ & kEncMask; uint8_t m1 = mask_bits_.encoding;
uint8_t m2 = o.mask_ & kEncMask; uint8_t m2 = o.mask_bits_.encoding;
if (m1 != m2) if (m1 != m2)
return false; return false;
@ -1433,8 +1434,8 @@ bool CompactObj::CmpEncoded(string_view sv) const {
void CompactObj::EncodeString(string_view str) { void CompactObj::EncodeString(string_view str) {
DCHECK_GT(str.size(), kInlineLen); DCHECK_GT(str.size(), kInlineLen);
DCHECK_EQ(NONE_ENC, mask_bits_.encoding);
uint8_t mask = mask_ & ~kEncMask;
string_view encoded = str; string_view encoded = str;
bool is_ascii = kUseAsciiEncoding && detail::validate_ascii_fast(str.data(), str.size()); bool is_ascii = kUseAsciiEncoding && detail::validate_ascii_fast(str.data(), str.size());
@ -1443,11 +1444,10 @@ void CompactObj::EncodeString(string_view str) {
size_t rev_len = ascii_len(encode_len); size_t rev_len = ascii_len(encode_len);
if (rev_len == str.size()) { if (rev_len == str.size()) {
mask |= ASCII2_ENC_BIT; // str hits its highest bound. mask_bits_.encoding = ASCII2_ENC; // str hits its highest bound.
} else { } else {
CHECK_EQ(str.size(), rev_len - 1) << "Bad ascii encoding for len " << str.size(); CHECK_EQ(str.size(), rev_len - 1) << "Bad ascii encoding for len " << str.size();
mask_bits_.encoding = ASCII1_ENC; // str is shorter than its highest bound.
mask |= ASCII1_ENC_BIT;
} }
tl.tmp_buf.resize(encode_len); tl.tmp_buf.resize(encode_len);
@ -1455,7 +1455,7 @@ void CompactObj::EncodeString(string_view str) {
encoded = string_view{reinterpret_cast<char*>(tl.tmp_buf.data()), encode_len}; encoded = string_view{reinterpret_cast<char*>(tl.tmp_buf.data()), encode_len};
if (encoded.size() <= kInlineLen) { if (encoded.size() <= kInlineLen) {
SetMeta(encoded.size(), mask); SetMeta(encoded.size(), mask_);
detail::ascii_pack(str.data(), str.size(), reinterpret_cast<uint8_t*>(u_.inline_str)); detail::ascii_pack(str.data(), str.size(), reinterpret_cast<uint8_t*>(u_.inline_str));
return; return;
@ -1464,20 +1464,19 @@ void CompactObj::EncodeString(string_view str) {
if (kUseSmallStrings && SmallString::CanAllocate(encoded.size())) { if (kUseSmallStrings && SmallString::CanAllocate(encoded.size())) {
if (taglen_ == 0) { if (taglen_ == 0) {
SetMeta(SMALL_TAG, mask); SetMeta(SMALL_TAG, mask_);
tl.small_str_bytes += u_.small_str.Assign(encoded); tl.small_str_bytes += u_.small_str.Assign(encoded);
return; return;
} }
if (taglen_ == SMALL_TAG && encoded.size() <= u_.small_str.size()) { if (taglen_ == SMALL_TAG && encoded.size() <= u_.small_str.size()) {
mask_ = mask;
tl.small_str_bytes -= u_.small_str.MallocUsed(); tl.small_str_bytes -= u_.small_str.MallocUsed();
tl.small_str_bytes += u_.small_str.Assign(encoded); tl.small_str_bytes += u_.small_str.Assign(encoded);
return; return;
} }
} }
SetMeta(ROBJ_TAG, mask); SetMeta(ROBJ_TAG, mask_);
u_.r_obj.SetString(encoded, tl.local_mr); u_.r_obj.SetString(encoded, tl.local_mr);
} }
@ -1501,7 +1500,8 @@ StringOrView CompactObj::GetRawString() const {
} }
size_t CompactObj::DecodedLen(size_t sz) const { size_t CompactObj::DecodedLen(size_t sz) const {
return ascii_len(sz) - ((mask_ & ASCII1_ENC_BIT) ? 1 : 0); unsigned delta = (mask_bits_.encoding == ASCII1_ENC ? 1 : 0);
return ascii_len(sz) - delta;
} }
MemoryResource* CompactObj::memory_resource() { MemoryResource* CompactObj::memory_resource() {

View file

@ -125,36 +125,13 @@ class CompactObj {
SBF_TAG = 22, SBF_TAG = 22,
}; };
enum MaskBit { enum Encoding : uint8_t {
REF_BIT = 1, NONE_ENC = 0,
EXPIRE_BIT = 2, // Mark objects that have expiry timestamp assigned. ASCII1_ENC = 1,
FLAG_BIT = 4, // Used to mark keys that have memcache flags assigned. ASCII2_ENC = 2,
HUFFMAN_ENC = 3, // TBD
// ascii encoding is not an injective function. it compresses 8 bytes to 7 but also 7 to 7.
// therefore, in order to know the original length we introduce 2 flags that
// correct the length upon decoding. ASCII1_ENC_BIT rounds down the decoded length,
// while ASCII2_ENC_BIT rounds it up. See DecodedLen implementation for more info.
ASCII1_ENC_BIT = 8,
ASCII2_ENC_BIT = 0x10,
// IO_PENDING is set when the tiered storage has issued an i/o request to save the value. It is
// cleared when the io request finishes or is cancelled.
IO_PENDING = 0x20,
// Applied only on keys that should be deleted asynchronously.
// (it can be the same value as IO_PENDING) that is applied only on values.
KEY_ASYNC_DELETE = 0x20,
STICKY = 0x40,
// TOUCHED used to determin which items are hot/cold.
// by checking if the item was touched from the last time we
// reached this item while travering the database to set items as cold.
// https://junchengyang.com/publication/nsdi24-SIEVE.pdf
TOUCHED = 0x80,
}; };
static constexpr uint8_t kEncMask = ASCII1_ENC_BIT | ASCII2_ENC_BIT;
public: public:
using PrefixArray = std::vector<std::string_view>; using PrefixArray = std::vector<std::string_view>;
using MemoryResource = detail::RobjWrapper::MemoryResource; using MemoryResource = detail::RobjWrapper::MemoryResource;
@ -185,13 +162,14 @@ class CompactObj {
CompactObj res; CompactObj res;
memcpy(&res.u_, &u_, sizeof(u_)); memcpy(&res.u_, &u_, sizeof(u_));
res.taglen_ = taglen_; res.taglen_ = taglen_;
res.mask_ = mask_ | REF_BIT; res.mask_ = mask_;
res.mask_bits_.ref = 1;
return res; return res;
} }
bool IsRef() const { bool IsRef() const {
return mask_ & REF_BIT; return mask_bits_.ref;
} }
std::string_view GetSlice(std::string* scratch) const; std::string_view GetSlice(std::string* scratch) const;
@ -222,73 +200,53 @@ class CompactObj {
} }
bool HasExpire() const { bool HasExpire() const {
return mask_ & EXPIRE_BIT; return mask_bits_.expire;
} }
void SetExpire(bool e) { void SetExpire(bool e) {
if (e) { mask_bits_.expire = e;
mask_ |= EXPIRE_BIT;
} else {
mask_ &= ~EXPIRE_BIT;
}
} }
bool HasFlag() const { bool HasFlag() const {
return mask_ & FLAG_BIT; return mask_bits_.mc_flag;
} }
void SetFlag(bool e) { void SetFlag(bool e) {
if (e) { mask_bits_.mc_flag = e;
mask_ |= FLAG_BIT;
} else {
mask_ &= ~FLAG_BIT;
}
} }
bool WasTouched() const { bool WasTouched() const {
return mask_ & TOUCHED; return mask_bits_.touched;
} }
void SetTouched(bool e) { void SetTouched(bool e) {
if (e) { mask_bits_.touched = e;
mask_ |= TOUCHED;
} else {
mask_ &= ~TOUCHED;
}
} }
bool DefragIfNeeded(float ratio); bool DefragIfNeeded(float ratio);
void SetAsyncDelete() { void SetAsyncDelete() {
mask_ |= KEY_ASYNC_DELETE; mask_bits_.io_pending = 1; // io_pending flag is used for async delete for keys.
} }
bool IsAsyncDelete() const { bool IsAsyncDelete() const {
return mask_ & KEY_ASYNC_DELETE; return mask_bits_.io_pending;
} }
bool HasStashPending() const { bool HasStashPending() const {
return mask_ & IO_PENDING; return mask_bits_.io_pending;
} }
void SetStashPending(bool b) { void SetStashPending(bool b) {
if (b) { mask_bits_.io_pending = b;
mask_ |= IO_PENDING;
} else {
mask_ &= ~IO_PENDING;
}
} }
bool IsSticky() const { bool IsSticky() const {
return mask_ & STICKY; return mask_bits_.sticky;
} }
void SetSticky(bool s) { void SetSticky(bool e) {
if (s) { mask_bits_.sticky = e;
mask_ |= STICKY;
} else {
mask_ &= ~STICKY;
}
} }
unsigned Encoding() const; unsigned Encoding() const;
@ -525,14 +483,38 @@ class CompactObj {
// //
static_assert(sizeof(u_) == 16); static_assert(sizeof(u_) == 16);
uint8_t mask_ = 0; union {
uint8_t mask_ = 0;
struct {
uint8_t ref : 1; // Mark objects that have expiry timestamp assigned.
uint8_t expire : 1;
uint8_t mc_flag : 1; // Marks keys that have memcache flags assigned.
// ascii encoding is not an injective function. it compresses 8 bytes to 7 but also 7 to 7.
// therefore, in order to know the original length we introduce 2 flags that
// correct the length upon decoding. ASCII1_ENC_BIT rounds down the decoded length,
// while ASCII2_ENC_BIT rounds it up. See DecodedLen implementation for more info.
uint8_t encoding : 2;
// IO_PENDING is set when the tiered storage has issued an i/o request to save the value.
// It is cleared when the io request finishes or is cancelled.
uint8_t io_pending : 1; // also serves as async-delete for keys.
uint8_t sticky : 1;
// TOUCHED used to determin which items are hot/cold.
// by checking if the item was touched from the last time we
// reached this item while travering the database to set items as cold.
// https://junchengyang.com/publication/nsdi24-SIEVE.pdf
uint8_t touched : 1; // used to mark keys that were accessed.
} mask_bits_;
};
// We currently reserve 5 bits for tags and 3 bits for extending the mask. currently reserved. // We currently reserve 5 bits for tags and 3 bits for extending the mask. currently reserved.
uint8_t taglen_ = 0; uint8_t taglen_ = 0;
}; };
inline bool CompactObj::operator==(std::string_view sv) const { inline bool CompactObj::operator==(std::string_view sv) const {
if (mask_ & kEncMask) if (mask_bits_.encoding)
return CmpEncoded(sv); return CmpEncoded(sv);
if (IsInline()) { if (IsInline()) {

View file

@ -607,6 +607,7 @@ TEST_F(CompactObjectTest, RawInterface) {
str.assign(50, char(200)); // non ascii str.assign(50, char(200)); // non ascii
cobj_.SetString(str); cobj_.SetString(str);
ASSERT_EQ(str, cobj_.GetSlice(&tmp));
{ {
auto raw_blob = cobj_.GetRawString(); auto raw_blob = cobj_.GetRawString();