mirror of
https://github.com/dragonflydb/dragonfly.git
synced 2025-05-10 18:05:44 +02:00
chore: support fetching a huffman encoded string from CompactObj (#5086)
chore: supports reading huffman encoded strings from CompactObj This requires implementing HashCode, operator== methods as well. Fixes #4880 Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
parent
e148f1aa69
commit
8ce8ee68df
5 changed files with 122 additions and 40 deletions
|
@ -787,13 +787,17 @@ CompactObj& CompactObj::operator=(CompactObj&& o) noexcept {
|
|||
|
||||
size_t CompactObj::Size() const {
|
||||
size_t raw_size = 0;
|
||||
|
||||
uint8_t first_byte = 0;
|
||||
if (IsInline()) {
|
||||
raw_size = taglen_;
|
||||
first_byte = u_.inline_str[0];
|
||||
} else {
|
||||
switch (taglen_) {
|
||||
case SMALL_TAG:
|
||||
raw_size = u_.small_str.size();
|
||||
if (mask_bits_.encoding == HUFFMAN_ENC) {
|
||||
return DecodedLen(raw_size, u_.small_str.first_byte());
|
||||
}
|
||||
break;
|
||||
case INT_TAG: {
|
||||
absl::AlphaNum an(u_.ival);
|
||||
|
@ -802,11 +806,16 @@ size_t CompactObj::Size() const {
|
|||
}
|
||||
case EXTERNAL_TAG:
|
||||
raw_size = u_.ext_ptr.serialized_size;
|
||||
CHECK(mask_bits_.encoding != HUFFMAN_ENC);
|
||||
break;
|
||||
case ROBJ_TAG:
|
||||
raw_size = u_.r_obj.Size();
|
||||
if (mask_bits_.encoding == HUFFMAN_ENC) {
|
||||
return DecodedLen(raw_size, *(uint8_t*)u_.r_obj.inner_obj());
|
||||
}
|
||||
break;
|
||||
case JSON_TAG:
|
||||
DCHECK_EQ(mask_bits_.encoding, NONE_ENC);
|
||||
if (JsonEnconding() == kEncodingJsonFlat) {
|
||||
raw_size = u_.json_obj.flat.json_len;
|
||||
} else {
|
||||
|
@ -814,48 +823,54 @@ size_t CompactObj::Size() const {
|
|||
}
|
||||
break;
|
||||
case SBF_TAG:
|
||||
DCHECK_EQ(mask_bits_.encoding, NONE_ENC);
|
||||
raw_size = u_.sbf->current_size();
|
||||
break;
|
||||
default:
|
||||
LOG(DFATAL) << "Should not reach " << int(taglen_);
|
||||
}
|
||||
}
|
||||
return mask_bits_.encoding ? DecodedLen(raw_size) : raw_size;
|
||||
return mask_bits_.encoding ? DecodedLen(raw_size, first_byte) : raw_size;
|
||||
}
|
||||
|
||||
uint64_t CompactObj::HashCode() const {
|
||||
DCHECK(taglen_ != JSON_TAG) << "JSON type cannot be used for keys!";
|
||||
|
||||
uint8_t encoded = mask_bits_.encoding;
|
||||
if (mask_bits_.encoding == NONE_ENC) {
|
||||
if (IsInline()) {
|
||||
return XXH3_64bits_withSeed(u_.inline_str, taglen_, kHashSeed);
|
||||
}
|
||||
|
||||
switch (taglen_) {
|
||||
case SMALL_TAG:
|
||||
return u_.small_str.HashCode();
|
||||
case ROBJ_TAG:
|
||||
return u_.r_obj.HashCode();
|
||||
case INT_TAG: {
|
||||
absl::AlphaNum an(u_.ival);
|
||||
return XXH3_64bits_withSeed(an.data(), an.size(), kHashSeed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
DCHECK(mask_bits_.encoding);
|
||||
|
||||
if (IsInline()) {
|
||||
if (encoded) {
|
||||
char buf[kInlineLen * 2];
|
||||
size_t decoded_len = DecodedLen(taglen_);
|
||||
char buf[kInlineLen * 3]; // should suffice for most huffman decodings.
|
||||
size_t decoded_len = DecodedLen(taglen_, u_.inline_str[0]);
|
||||
if (mask_bits_.encoding == HUFFMAN_ENC) {
|
||||
if (decoded_len <= sizeof(buf) &&
|
||||
tl.huff_decoder.Decode({u_.inline_str + 1, size_t(taglen_ - 1)}, decoded_len, buf)) {
|
||||
return XXH3_64bits_withSeed(buf, decoded_len, kHashSeed);
|
||||
}
|
||||
} else {
|
||||
detail::ascii_unpack(to_byte(u_.inline_str), decoded_len, buf);
|
||||
return XXH3_64bits_withSeed(buf, decoded_len, kHashSeed);
|
||||
}
|
||||
return XXH3_64bits_withSeed(u_.inline_str, taglen_, kHashSeed);
|
||||
}
|
||||
|
||||
if (encoded) {
|
||||
string_view sv = GetSlice(&tl.tmp_str);
|
||||
return XXH3_64bits_withSeed(sv.data(), sv.size(), kHashSeed);
|
||||
}
|
||||
|
||||
switch (taglen_) {
|
||||
case SMALL_TAG:
|
||||
return u_.small_str.HashCode();
|
||||
case ROBJ_TAG:
|
||||
return u_.r_obj.HashCode();
|
||||
case INT_TAG: {
|
||||
absl::AlphaNum an(u_.ival);
|
||||
return XXH3_64bits_withSeed(an.data(), an.size(), kHashSeed);
|
||||
}
|
||||
}
|
||||
// We need hash only for keys.
|
||||
LOG(DFATAL) << "Should not reach " << int(taglen_);
|
||||
|
||||
return 0;
|
||||
string_view sv = GetSlice(&tl.tmp_str);
|
||||
return XXH3_64bits_withSeed(sv.data(), sv.size(), kHashSeed);
|
||||
}
|
||||
|
||||
uint64_t CompactObj::HashCode(string_view str) {
|
||||
|
@ -1111,7 +1126,8 @@ void CompactObj::GetString(char* dest) const {
|
|||
detail::ascii_unpack(to_byte(u_.inline_str), taglen_ + 2, dest);
|
||||
break;
|
||||
case HUFFMAN_ENC:
|
||||
tl.huff_decoder.Decode(u_.inline_str, taglen_, dest);
|
||||
tl.huff_decoder.Decode({u_.inline_str + 1, size_t(taglen_ - 1)},
|
||||
u_.inline_str[0] + taglen_ - 1, dest);
|
||||
break;
|
||||
case NONE_ENC:
|
||||
memcpy(dest, u_.inline_str, taglen_);
|
||||
|
@ -1132,24 +1148,39 @@ void CompactObj::GetString(char* dest) const {
|
|||
if (taglen_ == ROBJ_TAG) {
|
||||
CHECK_EQ(OBJ_STRING, u_.r_obj.type());
|
||||
DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
|
||||
size_t decoded_len = DecodedLen(u_.r_obj.Size());
|
||||
size_t decoded_len = DecodedLen(u_.r_obj.Size(), *(const uint8_t*)u_.r_obj.inner_obj());
|
||||
if (mask_bits_.encoding == HUFFMAN_ENC) {
|
||||
CHECK(tl.huff_decoder.Decode({(const char*)u_.r_obj.inner_obj() + 1, u_.r_obj.Size() - 1},
|
||||
decoded_len, dest));
|
||||
return;
|
||||
}
|
||||
detail::ascii_unpack_simd(to_byte(u_.r_obj.inner_obj()), decoded_len, dest);
|
||||
} else if (taglen_ == SMALL_TAG) {
|
||||
size_t decoded_len = DecodedLen(u_.small_str.size());
|
||||
} else {
|
||||
CHECK_EQ(SMALL_TAG, taglen_);
|
||||
string_view slices[2];
|
||||
unsigned num = u_.small_str.GetV(slices);
|
||||
DCHECK_EQ(2u, num);
|
||||
size_t decoded_len = DecodedLen(u_.small_str.size(), slices[0][0]);
|
||||
|
||||
if (mask_bits_.encoding == HUFFMAN_ENC) {
|
||||
tl.tmp_buf.resize(slices[0].size() + slices[1].size() - 1);
|
||||
uint8_t* next = tl.tmp_buf.data();
|
||||
memcpy(next, slices[0].data() + 1, slices[0].size() - 1);
|
||||
next += slices[0].size() - 1;
|
||||
memcpy(next, slices[1].data(), slices[1].size());
|
||||
string_view src(reinterpret_cast<const char*>(tl.tmp_buf.data()), tl.tmp_buf.size());
|
||||
CHECK(tl.huff_decoder.Decode(src, decoded_len, dest));
|
||||
return;
|
||||
}
|
||||
|
||||
// we left some space on the left to allow inplace ascii unpacking.
|
||||
size_t space_left = decoded_len - u_.small_str.size();
|
||||
|
||||
string_view slices[2];
|
||||
unsigned num = u_.small_str.GetV(slices);
|
||||
DCHECK_EQ(2u, num);
|
||||
char* next = dest + space_left;
|
||||
memcpy(next, slices[0].data(), slices[0].size());
|
||||
next += slices[0].size();
|
||||
memcpy(next, slices[1].data(), slices[1].size());
|
||||
detail::ascii_unpack_simd(reinterpret_cast<uint8_t*>(dest + space_left), decoded_len, dest);
|
||||
} else {
|
||||
LOG(FATAL) << "Unsupported tag " << int(taglen_);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -1343,8 +1374,25 @@ bool CompactObj::EqualNonInline(std::string_view sv) const {
|
|||
}
|
||||
|
||||
bool CompactObj::CmpEncoded(string_view sv) const {
|
||||
size_t encode_len = binpacked_len(sv.size());
|
||||
if (mask_bits_.encoding == HUFFMAN_ENC) {
|
||||
size_t sz = Size();
|
||||
if (sv.size() != sz)
|
||||
return false;
|
||||
|
||||
if (IsInline()) {
|
||||
constexpr size_t kMaxHuffLen = kInlineLen * 3;
|
||||
if (sz <= kMaxHuffLen) {
|
||||
char buf[kMaxHuffLen];
|
||||
CHECK(tl.huff_decoder.Decode({u_.inline_str + 1, size_t(taglen_ - 1)}, sz, buf));
|
||||
return sv == string_view(buf, sz);
|
||||
}
|
||||
}
|
||||
tl.tmp_str.resize(sz);
|
||||
GetString(tl.tmp_str.data());
|
||||
return sv == tl.tmp_str;
|
||||
}
|
||||
|
||||
size_t encode_len = binpacked_len(sv.size());
|
||||
if (IsInline()) {
|
||||
if (encode_len != taglen_)
|
||||
return false;
|
||||
|
@ -1524,8 +1572,12 @@ StringOrView CompactObj::GetRawString() const {
|
|||
return {};
|
||||
}
|
||||
|
||||
size_t CompactObj::DecodedLen(size_t sz) const {
|
||||
unsigned delta = (mask_bits_.encoding == ASCII1_ENC ? 1 : 0);
|
||||
size_t CompactObj::DecodedLen(size_t sz, uint8_t b) const {
|
||||
DCHECK(mask_bits_.encoding);
|
||||
if (mask_bits_.encoding == HUFFMAN_ENC) {
|
||||
return sz + b - 1;
|
||||
}
|
||||
unsigned delta = (mask_bits_.encoding == ASCII1_ENC) ? 1 : 0;
|
||||
return ascii_len(sz) - delta;
|
||||
}
|
||||
|
||||
|
|
|
@ -410,7 +410,7 @@ class CompactObj {
|
|||
|
||||
private:
|
||||
void EncodeString(std::string_view str);
|
||||
size_t DecodedLen(size_t sz) const;
|
||||
size_t DecodedLen(size_t sz, uint8_t firstb) const;
|
||||
|
||||
bool EqualNonInline(std::string_view sv) const;
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
#include "core/compact_object.h"
|
||||
|
||||
#include <absl/strings/str_cat.h>
|
||||
#include <gtest/gtest.h>
|
||||
#include <mimalloc.h>
|
||||
#include <xxhash.h>
|
||||
|
||||
|
@ -13,6 +14,7 @@
|
|||
#include "base/logging.h"
|
||||
#include "core/detail/bitpacking.h"
|
||||
#include "core/flat_set.h"
|
||||
#include "core/huff_coder.h"
|
||||
#include "core/mi_memory_resource.h"
|
||||
#include "core/string_set.h"
|
||||
|
||||
|
@ -656,6 +658,30 @@ TEST_F(CompactObjectTest, lpGetInteger) {
|
|||
lpFree(lp);
|
||||
}
|
||||
|
||||
TEST_F(CompactObjectTest, HuffMan) {
|
||||
array<unsigned, 256> hist;
|
||||
hist.fill(1);
|
||||
hist['a'] = 100;
|
||||
hist['b'] = 50;
|
||||
HuffmanEncoder encoder;
|
||||
ASSERT_TRUE(encoder.Build(hist.data(), hist.size() - 1, nullptr));
|
||||
string bindata = encoder.Export();
|
||||
ASSERT_TRUE(CompactObj::InitHuffmanThreadLocal(bindata));
|
||||
for (unsigned i = 30; i < 2048; i += 10) {
|
||||
string data(i, 'a');
|
||||
cobj_.SetString(data);
|
||||
bool malloc_used = i >= 60;
|
||||
ASSERT_EQ(malloc_used, cobj_.MallocUsed() > 0) << i;
|
||||
ASSERT_EQ(data.size(), cobj_.Size());
|
||||
ASSERT_EQ(CompactObj::HashCode(data), cobj_.HashCode());
|
||||
|
||||
string actual;
|
||||
cobj_.GetString(&actual);
|
||||
EXPECT_EQ(data, actual);
|
||||
EXPECT_EQ(cobj_, data);
|
||||
}
|
||||
}
|
||||
|
||||
static void ascii_pack_naive(const char* ascii, size_t len, uint8_t* bin) {
|
||||
const char* end = ascii + len;
|
||||
|
||||
|
|
|
@ -144,7 +144,7 @@ bool HuffmanDecoder::Decode(std::string_view src, size_t dest_size, char* dest)
|
|||
HUF_decompress1X_usingDTable(dest, dest_size, src.data(), src.size(), huf_dtable_.get(), 1);
|
||||
|
||||
if (HUF_isError(res)) {
|
||||
LOG(FATAL) << "Failed to decompress: " << HUF_getErrorName(res);
|
||||
LOG(DFATAL) << "Failed to decompress: " << HUF_getErrorName(res);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
|
|
@ -50,6 +50,10 @@ class SmallString {
|
|||
|
||||
bool DefragIfNeeded(float ratio);
|
||||
|
||||
uint8_t first_byte() const {
|
||||
return prefix_[0];
|
||||
}
|
||||
|
||||
private:
|
||||
// prefix of the string that is broken down into 2 parts.
|
||||
char prefix_[kPrefLen];
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue