chore: reorganize compact object mask bits (#5077)

Specifically, get rid of the MaskBit enum and replace it with explicit bit-fields aliasing the mask.
Reorganize the encoding bits to be able to store 4 states that include huffman encoding.

Solves the first part of #4880

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
Roman Gershman 2025-05-07 15:08:22 +03:00 committed by GitHub
parent 05d99769e1
commit 54328fd00e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 81 additions and 98 deletions

View file

@ -804,14 +804,13 @@ size_t CompactObj::Size() const {
LOG(DFATAL) << "Should not reach " << int(taglen_);
}
}
uint8_t encoded = (mask_ & kEncMask);
return encoded ? DecodedLen(raw_size) : raw_size;
return mask_bits_.encoding ? DecodedLen(raw_size) : raw_size;
}
uint64_t CompactObj::HashCode() const {
DCHECK(taglen_ != JSON_TAG) << "JSON type cannot be used for keys!";
uint8_t encoded = (mask_ & kEncMask);
uint8_t encoded = mask_bits_.encoding;
if (IsInline()) {
if (encoded) {
char buf[kInlineLen * 2];
@ -887,7 +886,8 @@ void CompactObj::SetInt(int64_t val) {
DCHECK(!IsExternal());
if (INT_TAG != taglen_) {
SetMeta(INT_TAG, mask_ & ~kEncMask);
SetMeta(INT_TAG, mask_);
mask_bits_.encoding = NONE_ENC;
}
u_.ival = val;
@ -970,8 +970,9 @@ SBF* CompactObj::GetSBF() const {
}
void CompactObj::SetString(std::string_view str) {
uint8_t mask = mask_ & ~kEncMask;
CHECK(!IsExternal());
mask_bits_.encoding = NONE_ENC;
// Trying auto-detection heuristics first.
if (str.size() <= 20) {
long long ival;
@ -979,14 +980,14 @@ void CompactObj::SetString(std::string_view str) {
// We use redis string2ll to be compatible with Redis.
if (string2ll(str.data(), str.size(), &ival)) {
SetMeta(INT_TAG, mask);
SetMeta(INT_TAG, mask_);
u_.ival = ival;
return;
}
if (str.size() <= kInlineLen) {
SetMeta(str.size(), mask);
SetMeta(str.size(), mask_);
if (!str.empty())
memcpy(u_.inline_str, str.data(), str.size());
return;
@ -997,8 +998,9 @@ void CompactObj::SetString(std::string_view str) {
}
void CompactObj::ReserveString(size_t size) {
uint8_t mask = mask_ & ~kEncMask;
SetMeta(ROBJ_TAG, mask);
mask_bits_.encoding = NONE_ENC;
SetMeta(ROBJ_TAG, mask_);
u_.r_obj.ReserveString(size, tl.local_mr);
}
@ -1006,16 +1008,16 @@ void CompactObj::AppendString(std::string_view str) {
u_.r_obj.AppendString(str, tl.local_mr);
}
// TODO: to simplify this code using GetString(char*) variant.
string_view CompactObj::GetSlice(string* scratch) const {
CHECK(!IsExternal());
uint8_t is_encoded = mask_ & kEncMask;
if (IsInline()) {
if (is_encoded) {
if (mask_bits_.encoding) {
size_t decoded_len = taglen_ + 2;
// must be this because we either shortened 17 or 18.
DCHECK_EQ(is_encoded, ASCII2_ENC_BIT);
DCHECK_EQ(mask_bits_.encoding, ASCII2_ENC);
DCHECK_EQ(decoded_len, ascii_len(taglen_));
scratch->resize(decoded_len);
@ -1033,7 +1035,7 @@ string_view CompactObj::GetSlice(string* scratch) const {
return *scratch;
}
if (is_encoded) {
if (mask_bits_.encoding) {
if (taglen_ == ROBJ_TAG) {
CHECK_EQ(OBJ_STRING, u_.r_obj.type());
DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
@ -1120,14 +1122,13 @@ void __attribute__((noinline)) CompactObj::GetString(string* res) const {
void CompactObj::GetString(char* dest) const {
CHECK(!IsExternal());
uint8_t is_encoded = mask_ & kEncMask;
if (IsInline()) {
if (is_encoded) {
if (mask_bits_.encoding) {
size_t decoded_len = taglen_ + 2;
// must be this because we either shortened 17 or 18.
DCHECK_EQ(is_encoded, ASCII2_ENC_BIT);
DCHECK_EQ(mask_bits_.encoding, ASCII2_ENC);
DCHECK_EQ(decoded_len, ascii_len(taglen_));
detail::ascii_unpack(to_byte(u_.inline_str), decoded_len, dest);
@ -1144,7 +1145,7 @@ void CompactObj::GetString(char* dest) const {
return;
}
if (is_encoded) {
if (mask_bits_.encoding) {
if (taglen_ == ROBJ_TAG) {
CHECK_EQ(OBJ_STRING, u_.r_obj.type());
DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
@ -1220,7 +1221,9 @@ auto CompactObj::GetCool() const -> CoolItem {
void CompactObj::ImportExternal(const CompactObj& src) {
DCHECK(src.IsExternal());
SetMeta(EXTERNAL_TAG, src.mask_ & kEncMask);
uint8_t encoding = src.mask_bits_.encoding;
SetMeta(EXTERNAL_TAG, 0);
mask_bits_.encoding = encoding;
u_.ext_ptr = src.u_.ext_ptr;
}
@ -1238,13 +1241,11 @@ void CompactObj::Materialize(std::string_view blob, bool is_raw) {
DCHECK_GT(blob.size(), kInlineLen);
if (is_raw) {
uint8_t mask = mask_;
if (kUseSmallStrings && SmallString::CanAllocate(blob.size())) {
SetMeta(SMALL_TAG, mask);
SetMeta(SMALL_TAG, mask_);
tl.small_str_bytes += u_.small_str.Assign(blob);
} else {
SetMeta(ROBJ_TAG, mask);
SetMeta(ROBJ_TAG, mask_);
u_.r_obj.SetString(blob, tl.local_mr);
}
} else {
@ -1317,8 +1318,8 @@ size_t CompactObj::MallocUsed(bool slow) const {
bool CompactObj::operator==(const CompactObj& o) const {
DCHECK(taglen_ != JSON_TAG && o.taglen_ != JSON_TAG) << "cannot use JSON type to check equal";
uint8_t m1 = mask_ & kEncMask;
uint8_t m2 = o.mask_ & kEncMask;
uint8_t m1 = mask_bits_.encoding;
uint8_t m2 = o.mask_bits_.encoding;
if (m1 != m2)
return false;
@ -1433,8 +1434,8 @@ bool CompactObj::CmpEncoded(string_view sv) const {
void CompactObj::EncodeString(string_view str) {
DCHECK_GT(str.size(), kInlineLen);
DCHECK_EQ(NONE_ENC, mask_bits_.encoding);
uint8_t mask = mask_ & ~kEncMask;
string_view encoded = str;
bool is_ascii = kUseAsciiEncoding && detail::validate_ascii_fast(str.data(), str.size());
@ -1443,11 +1444,10 @@ void CompactObj::EncodeString(string_view str) {
size_t rev_len = ascii_len(encode_len);
if (rev_len == str.size()) {
mask |= ASCII2_ENC_BIT; // str hits its highest bound.
mask_bits_.encoding = ASCII2_ENC; // str hits its highest bound.
} else {
CHECK_EQ(str.size(), rev_len - 1) << "Bad ascii encoding for len " << str.size();
mask |= ASCII1_ENC_BIT;
mask_bits_.encoding = ASCII1_ENC; // str is shorter than its highest bound.
}
tl.tmp_buf.resize(encode_len);
@ -1455,7 +1455,7 @@ void CompactObj::EncodeString(string_view str) {
encoded = string_view{reinterpret_cast<char*>(tl.tmp_buf.data()), encode_len};
if (encoded.size() <= kInlineLen) {
SetMeta(encoded.size(), mask);
SetMeta(encoded.size(), mask_);
detail::ascii_pack(str.data(), str.size(), reinterpret_cast<uint8_t*>(u_.inline_str));
return;
@ -1464,20 +1464,19 @@ void CompactObj::EncodeString(string_view str) {
if (kUseSmallStrings && SmallString::CanAllocate(encoded.size())) {
if (taglen_ == 0) {
SetMeta(SMALL_TAG, mask);
SetMeta(SMALL_TAG, mask_);
tl.small_str_bytes += u_.small_str.Assign(encoded);
return;
}
if (taglen_ == SMALL_TAG && encoded.size() <= u_.small_str.size()) {
mask_ = mask;
tl.small_str_bytes -= u_.small_str.MallocUsed();
tl.small_str_bytes += u_.small_str.Assign(encoded);
return;
}
}
SetMeta(ROBJ_TAG, mask);
SetMeta(ROBJ_TAG, mask_);
u_.r_obj.SetString(encoded, tl.local_mr);
}
@ -1501,7 +1500,8 @@ StringOrView CompactObj::GetRawString() const {
}
size_t CompactObj::DecodedLen(size_t sz) const {
return ascii_len(sz) - ((mask_ & ASCII1_ENC_BIT) ? 1 : 0);
unsigned delta = (mask_bits_.encoding == ASCII1_ENC ? 1 : 0);
return ascii_len(sz) - delta;
}
MemoryResource* CompactObj::memory_resource() {

View file

@ -125,36 +125,13 @@ class CompactObj {
SBF_TAG = 22,
};
enum MaskBit {
REF_BIT = 1,
EXPIRE_BIT = 2, // Mark objects that have expiry timestamp assigned.
FLAG_BIT = 4, // Used to mark keys that have memcache flags assigned.
// ascii encoding is not an injective function. it compresses 8 bytes to 7 but also 7 to 7.
// therefore, in order to know the original length we introduce 2 flags that
// correct the length upon decoding. ASCII1_ENC_BIT rounds down the decoded length,
// while ASCII2_ENC_BIT rounds it up. See DecodedLen implementation for more info.
ASCII1_ENC_BIT = 8,
ASCII2_ENC_BIT = 0x10,
// IO_PENDING is set when the tiered storage has issued an i/o request to save the value. It is
// cleared when the io request finishes or is cancelled.
IO_PENDING = 0x20,
// Applied only on keys that should be deleted asynchronously.
// (it can be the same value as IO_PENDING) that is applied only on values.
KEY_ASYNC_DELETE = 0x20,
STICKY = 0x40,
// TOUCHED used to determin which items are hot/cold.
// by checking if the item was touched from the last time we
// reached this item while travering the database to set items as cold.
// https://junchengyang.com/publication/nsdi24-SIEVE.pdf
TOUCHED = 0x80,
// Values stored in the 2-bit `encoding` field of the mask.
// ASCII packing is not injective (it compresses 8 bytes to 7 but also 7 to 7), so two
// distinct ASCII values are needed to recover the original length when decoding;
// see DecodedLen.
enum Encoding : uint8_t {
// The string is stored as is.
NONE_ENC = 0,
// ASCII-packed; the original length is one less than the upper bound ascii_len(sz).
ASCII1_ENC = 1,
// ASCII-packed; the original length equals the upper bound ascii_len(sz).
ASCII2_ENC = 2,
HUFFMAN_ENC = 3, // TBD
};
static constexpr uint8_t kEncMask = ASCII1_ENC_BIT | ASCII2_ENC_BIT;
public:
using PrefixArray = std::vector<std::string_view>;
using MemoryResource = detail::RobjWrapper::MemoryResource;
@ -185,13 +162,14 @@ class CompactObj {
CompactObj res;
memcpy(&res.u_, &u_, sizeof(u_));
res.taglen_ = taglen_;
res.mask_ = mask_ | REF_BIT;
res.mask_ = mask_;
res.mask_bits_.ref = 1;
return res;
}
bool IsRef() const {
return mask_ & REF_BIT;
return mask_bits_.ref;
}
std::string_view GetSlice(std::string* scratch) const;
@ -222,73 +200,53 @@ class CompactObj {
}
bool HasExpire() const {
return mask_ & EXPIRE_BIT;
return mask_bits_.expire;
}
void SetExpire(bool e) {
if (e) {
mask_ |= EXPIRE_BIT;
} else {
mask_ &= ~EXPIRE_BIT;
}
mask_bits_.expire = e;
}
bool HasFlag() const {
return mask_ & FLAG_BIT;
return mask_bits_.mc_flag;
}
void SetFlag(bool e) {
if (e) {
mask_ |= FLAG_BIT;
} else {
mask_ &= ~FLAG_BIT;
}
mask_bits_.mc_flag = e;
}
bool WasTouched() const {
return mask_ & TOUCHED;
return mask_bits_.touched;
}
void SetTouched(bool e) {
if (e) {
mask_ |= TOUCHED;
} else {
mask_ &= ~TOUCHED;
}
mask_bits_.touched = e;
}
bool DefragIfNeeded(float ratio);
void SetAsyncDelete() {
mask_ |= KEY_ASYNC_DELETE;
mask_bits_.io_pending = 1; // io_pending flag is used for async delete for keys.
}
bool IsAsyncDelete() const {
return mask_ & KEY_ASYNC_DELETE;
return mask_bits_.io_pending;
}
bool HasStashPending() const {
return mask_ & IO_PENDING;
return mask_bits_.io_pending;
}
void SetStashPending(bool b) {
if (b) {
mask_ |= IO_PENDING;
} else {
mask_ &= ~IO_PENDING;
}
mask_bits_.io_pending = b;
}
bool IsSticky() const {
return mask_ & STICKY;
return mask_bits_.sticky;
}
void SetSticky(bool s) {
if (s) {
mask_ |= STICKY;
} else {
mask_ &= ~STICKY;
}
void SetSticky(bool e) {
mask_bits_.sticky = e;
}
unsigned Encoding() const;
@ -525,14 +483,38 @@ class CompactObj {
//
static_assert(sizeof(u_) == 16);
uint8_t mask_ = 0;
// The mask byte. `mask_` exposes the whole byte, while `mask_bits_` aliases it and
// names the individual bit-fields.
union {
uint8_t mask_ = 0;
struct {
uint8_t ref : 1; // Set on the copy returned by AsRef().
uint8_t expire : 1; // Marks objects that have an expiry timestamp assigned.
uint8_t mc_flag : 1; // Marks keys that have memcache flags assigned.
// Holds an Encoding value (NONE_ENC, ASCII1_ENC, ASCII2_ENC, HUFFMAN_ENC).
// ASCII encoding is not an injective function: it compresses 8 bytes to 7 but also 7 to 7.
// Therefore, in order to know the original length, we use two distinct encoding values that
// correct the length upon decoding: ASCII1_ENC rounds the decoded length down by one,
// while ASCII2_ENC keeps the upper bound. See DecodedLen implementation for more info.
uint8_t encoding : 2;
// io_pending is set when the tiered storage has issued an i/o request to save the value.
// It is cleared when the io request finishes or is cancelled.
uint8_t io_pending : 1; // also serves as async-delete for keys.
uint8_t sticky : 1;
// touched is used to determine which items are hot/cold,
// by checking if the item was touched since the last time we
// reached this item while traversing the database to set items as cold.
// https://junchengyang.com/publication/nsdi24-SIEVE.pdf
uint8_t touched : 1; // used to mark keys that were accessed.
} mask_bits_;
};
// We currently reserve 5 bits for tags and 3 bits for extending the mask (currently reserved).
uint8_t taglen_ = 0;
};
inline bool CompactObj::operator==(std::string_view sv) const {
if (mask_ & kEncMask)
if (mask_bits_.encoding)
return CmpEncoded(sv);
if (IsInline()) {

View file

@ -607,6 +607,7 @@ TEST_F(CompactObjectTest, RawInterface) {
str.assign(50, char(200)); // non ascii
cobj_.SetString(str);
ASSERT_EQ(str, cobj_.GetSlice(&tmp));
{
auto raw_blob = cobj_.GetRawString();