chore: introduce huffman encoder class (#5076)

Move the Huffman table handling code from debugcmd.cc into the new HuffmanEncoder class.

One of the building blocks for #4883

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
Roman Gershman 2025-05-07 10:02:40 +03:00 committed by GitHub
parent 3d79664a19
commit 3f3d232211
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 220 additions and 44 deletions

View file

@ -21,7 +21,7 @@ add_subdirectory(json)
set(SEARCH_LIB query_parser)
add_library(dfly_core allocation_tracker.cc bloom.cc compact_object.cc dense_set.cc
dragonfly_core.cc extent_tree.cc
dragonfly_core.cc extent_tree.cc huff_coder.cc
interpreter.cc glob_matcher.cc mi_memory_resource.cc qlist.cc sds_utils.cc
segment_allocator.cc score_map.cc small_string.cc sorted_map.cc task_queue.cc
tx_queue.cc string_set.cc string_map.cc top_keys.cc detail/bitpacking.cc)

View file

@ -23,6 +23,7 @@
#include "base/gtest.h"
#include "base/logging.h"
#include "core/glob_matcher.h"
#include "core/huff_coder.h"
#include "core/intent_lock.h"
#include "core/tx_queue.h"
@ -188,6 +189,43 @@ TEST_F(StringMatchTest, Special) {
EXPECT_TRUE(MatchLen("foo\\", "foo\\", 0));
}
// Fixture for HuffmanEncoder tests; provides a fresh encoder per test and a
// pre-serialized Huffman table blob known to load successfully.
class HuffCoderTest : public ::testing::Test {
 protected:
  HuffmanEncoder encoder_;
  string error_msg_;  // receives failure details from encoder calls

  // Binary table in the serialized format consumed by HuffmanEncoder::Load()
  // (i.e. what HUF_writeCTable/Export produces).
  const string_view good_table_{
      "\x1b\x10\xd8\n\n\x19\xc6\x0c\xc3\x30\x0c\x43\x1e\x93\xe4\x11roB\xf6\xde\xbb\x18V\xc2Zk\x03"sv};
};
// Load() must reject garbage, accept a well-formed table, and reject a
// well-formed table followed by trailing bytes.
TEST_F(HuffCoderTest, Load) {
  // Arbitrary junk is not a valid serialized table.
  string blob("bad");
  ASSERT_FALSE(encoder_.Load(blob, &error_msg_));

  // A valid serialized table loads cleanly.
  blob = good_table_;
  ASSERT_TRUE(encoder_.Load(blob, &error_msg_)) << error_msg_;

  // Extra bytes after the table make the whole blob invalid.
  blob.append("foo");
  encoder_.Reset();
  ASSERT_FALSE(encoder_.Load(blob, &error_msg_));
}
// Encoding with a loaded table: per-symbol bit lengths match the table, and a
// short string compresses to the expected byte count.
TEST_F(HuffCoderTest, Encode) {
  ASSERT_TRUE(encoder_.Load(good_table_, &error_msg_)) << error_msg_;

  // Code lengths assigned by the loaded table (independent checks).
  EXPECT_EQ(3, encoder_.BitCount(':'));
  EXPECT_EQ(5, encoder_.BitCount('2'));
  EXPECT_EQ(5, encoder_.BitCount('3'));
  EXPECT_EQ(1, encoder_.BitCount('x'));

  // 1+3+5+5+1+1 = 16 bits of payload -> 3 bytes including framing.
  string input("x:23xx");
  uint8_t buf[100];
  uint32_t buf_size = sizeof(buf);
  ASSERT_TRUE(encoder_.Encode(input, buf, &buf_size, &error_msg_));
  ASSERT_EQ(3, buf_size);
}
using benchmark::DoNotOptimize;
// Parse Double benchmarks

110
src/core/huff_coder.cc Normal file
View file

@ -0,0 +1,110 @@
// Copyright 2025, DragonflyDB authors. All rights reserved.
// See LICENSE for licensing terms.
//
#include "core/huff_coder.h"
#include "base/logging.h"
extern "C" {
#include "huff/huf.h"
}
using namespace std;
namespace dfly {
constexpr size_t kWspSize = HUF_CTABLE_WORKSPACE_SIZE;
bool HuffmanEncoder::Load(std::string_view binary_data, std::string* error_msg) {
CHECK(!huf_ctable_);
huf_ctable_.reset(new HUF_CElt[HUF_CTABLE_SIZE_ST(255)]);
table_max_symbol_ = 255;
unsigned has_zero_weights = 0;
size_t read_size = HUF_readCTable(huf_ctable_.get(), &table_max_symbol_, binary_data.data(),
binary_data.size(), &has_zero_weights);
if (HUF_isError(read_size)) {
huf_ctable_.reset();
*error_msg = HUF_getErrorName(read_size);
return false;
}
if (read_size != binary_data.size()) {
*error_msg = "Corrupted data";
huf_ctable_.reset();
return false;
}
return true;
}
bool HuffmanEncoder::Build(const unsigned hist[], unsigned max_symbol, std::string* error_msg) {
CHECK(!huf_ctable_);
huf_ctable_.reset(new HUF_CElt[HUF_CTABLE_SIZE_ST(max_symbol)]);
unique_ptr<uint32_t[]> wrkspace(new uint32_t[HUF_CTABLE_WORKSPACE_SIZE_U32]);
size_t num_bits =
HUF_buildCTable_wksp(huf_ctable_.get(), hist, max_symbol, 0, wrkspace.get(), kWspSize);
if (HUF_isError(num_bits)) {
*error_msg = HUF_getErrorName(num_bits);
huf_ctable_.reset();
return false;
}
num_bits_ = static_cast<uint8_t>(num_bits);
table_max_symbol_ = max_symbol;
return true;
}
void HuffmanEncoder::Reset() {
huf_ctable_.reset();
table_max_symbol_ = 0;
}
// Compresses `data` into `dest`. On entry *dest_size is the capacity of
// `dest`; on success it is updated to the number of bytes written.
bool HuffmanEncoder::Encode(std::string_view data, uint8_t* dest, uint32_t* dest_size,
                            std::string* error_msg) const {
  DCHECK(huf_ctable_);

  size_t written = HUF_compress1X_usingCTable(dest, *dest_size, data.data(), data.size(),
                                              huf_ctable_.get(), 0);
  if (HUF_isError(written)) {
    *error_msg = HUF_getErrorName(written);
    return false;
  }

  // Report the actual compressed length back to the caller.
  *dest_size = static_cast<uint32_t>(written);
  return true;
}
// Returns the code length, in bits, that the current table assigns to
// `symbol`.
unsigned HuffmanEncoder::BitCount(uint8_t symbol) const {
  DCHECK(huf_ctable_);
  return HUF_getNbBitsFromCTable(huf_ctable_.get(), symbol);
}
// Estimates the compressed size, in bytes, of data whose symbol histogram is
// `hist` over symbols [0, max_symbol], using the current table.
size_t HuffmanEncoder::EstimateCompressedSize(const unsigned hist[], unsigned max_symbol) const {
  DCHECK(huf_ctable_);
  return HUF_estimateCompressedSize(huf_ctable_.get(), hist, max_symbol);
}
// Serializes the current table into a binary blob that Load() accepts.
string HuffmanEncoder::Export() const {
  DCHECK(huf_ctable_);

  // Reverse engineered: (maxSymbolValue + 1) / 2 + 1.
  constexpr unsigned kMaxTableSize = 130;

  string serialized(kMaxTableSize, '\0');
  auto workspace = make_unique<uint32_t[]>(HUF_CTABLE_WORKSPACE_SIZE_U32);

  // Seems we can reuse the same workspace, its capacity is enough.
  size_t written = HUF_writeCTable_wksp(serialized.data(), serialized.size(), huf_ctable_.get(),
                                        table_max_symbol_, num_bits_, workspace.get(), kWspSize);
  CHECK(!HUF_isError(written));

  serialized.resize(written);
  return serialized;
}
} // namespace dfly

50
src/core/huff_coder.h Normal file
View file

@ -0,0 +1,50 @@
// Copyright 2025, DragonflyDB authors. All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
namespace dfly {
// Thin wrapper around zstd's internal Huffman (HUF) entropy coder.
// A table is first built from a histogram (Build) or deserialized (Load);
// the encoder can then compress byte strings with Encode().
class HuffmanEncoder {
 public:
  // Builds a Huffman table from the symbol histogram `hist` covering
  // symbols [0, max_symbol]. Returns false and fills *error_msg on failure.
  bool Build(const unsigned hist[], unsigned max_symbol, std::string* error_msg);

  // Compresses `data` into `dest`. On entry *dest_size is the capacity of
  // `dest`; on success it is updated to the number of bytes written.
  bool Encode(std::string_view data, uint8_t* dest, uint32_t* dest_size,
              std::string* error_msg) const;

  // Returns the code length (bits) the table assigns to `symbol`.
  unsigned BitCount(uint8_t symbol) const;

  // Estimates the compressed size (bytes) of data with histogram `hist`.
  size_t EstimateCompressedSize(const unsigned hist[], unsigned max_symbol) const;

  // Discards the table so Build()/Load() may be called again.
  void Reset();

  // Load using the serialized data produced by Export().
  bool Load(std::string_view binary_data, std::string* error_msg);

  // Exports a binary representation of the table, that can be loaded using Load().
  std::string Export() const;

  // Maximum code length in bits of the table (set by Build()).
  // NOTE(review): verify it is also populated for tables obtained via Load()
  // before relying on it there.
  uint8_t num_bits() const {
    return num_bits_;
  }

  // True once a table has been successfully built or loaded.
  bool valid() const {
    return bool(huf_ctable_);
  }

  // Largest symbol value covered by the current table.
  unsigned max_symbol() const {
    return table_max_symbol_;
  }

 private:
  // Opaque element type of the HUF compression table (matches HUF_CElt).
  using HUF_CElt = size_t;

  std::unique_ptr<HUF_CElt[]> huf_ctable_;  // null until Build()/Load() succeeds
  unsigned table_max_symbol_ = 0;
  uint8_t num_bits_ = 0;
};
} // namespace dfly

View file

@ -13,6 +13,7 @@ extern "C" {
#include <absl/cleanup/cleanup.h>
#include <absl/random/random.h>
#include <absl/strings/escaping.h>
#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>
#include <lz4.h>
@ -24,6 +25,7 @@ extern "C" {
#include "base/flags.h"
#include "base/logging.h"
#include "core/compact_object.h"
#include "core/huff_coder.h"
#include "core/qlist.h"
#include "core/sorted_map.h"
#include "core/string_map.h"
@ -1318,71 +1320,47 @@ void DebugCmd::Compression(CmdArgList args, facade::SinkReplyBuilder* builder) {
});
size_t num_bits = 0, compressed_size = 0, raw_size = 0;
unsigned table_max_symbol = 255;
if (hist.max_symbol) {
HUF_CREATE_STATIC_CTABLE(huf_ctable, HufHist::kMaxSymbol);
HuffmanEncoder huff_enc;
string err_msg;
unique_ptr<uint32_t[]> wrkspace(new uint32_t[HUF_CTABLE_WORKSPACE_SIZE_U32]);
constexpr size_t kWspSize = HUF_CTABLE_WORKSPACE_SIZE;
raw_size = 0;
for (unsigned i = 0; i <= HufHist::kMaxSymbol; i++) {
raw_size += hist.hist[i];
// force non-zero weights for all symbols.
if (hist.hist[i] == 0)
hist.hist[i] = 1;
}
if (bintable.empty()) {
table_max_symbol = hist.max_symbol;
num_bits = HUF_buildCTable_wksp(huf_ctable, hist.hist.data(), table_max_symbol, 0,
wrkspace.get(), kWspSize);
if (HUF_isError(num_bits)) {
return rb->SendError(StrCat("Internal error: ", HUF_getErrorName(num_bits)));
if (!huff_enc.Build(hist.hist.data(), HufHist::kMaxSymbol, &err_msg)) {
return rb->SendError(StrCat("Internal error: ", err_msg));
}
} else {
// Try to read the bintable and create a ctable from it.
unsigned has_zero_weights = 1;
size_t read_size = HUF_readCTable(huf_ctable, &table_max_symbol, bintable.data(),
bintable.size(), &has_zero_weights);
if (HUF_isError(read_size)) {
return rb->SendError(StrCat("Internal error: ", HUF_getErrorName(read_size)));
}
if (read_size != bintable.size()) {
return rb->SendError("Invalid bintable");
if (!huff_enc.Load(bintable, &err_msg)) {
return rb->SendError(StrCat("Internal error: ", err_msg));
}
}
compressed_size = HUF_estimateCompressedSize(huf_ctable, hist.hist.data(), table_max_symbol);
for (unsigned i = table_max_symbol + 1; i <= hist.max_symbol; i++) {
compressed_size += hist.hist[i];
}
raw_size = 0;
for (unsigned i = 0; i <= hist.max_symbol; i++) {
raw_size += hist.hist[i];
}
num_bits = huff_enc.num_bits();
compressed_size = huff_enc.EstimateCompressedSize(hist.hist.data(), HufHist::kMaxSymbol);
if (print_bintable) {
// Reverse engineered: (maxSymbolValue + 1) / 2 + 1.
constexpr unsigned kMaxTableSize = 130;
bintable.resize(kMaxTableSize);
// Seems we can reuse the same workspace, its capacity is enough.
size_t res = HUF_writeCTable_wksp(bintable.data(), kMaxTableSize, huf_ctable,
table_max_symbol, num_bits, wrkspace.get(), kWspSize);
if (HUF_isError(res)) {
return rb->SendError(StrCat("Internal error: ", HUF_getErrorName(res)));
}
bintable.resize(res);
bintable = huff_enc.Export();
VLOG(1) << "bintable: " << absl::CHexEscape(bintable);
} else {
bintable.clear();
}
}
unsigned map_len = print_bintable ? 7 : 6;
unsigned map_len = print_bintable ? 6 : 5;
rb->StartCollection(map_len, RedisReplyBuilder::CollectionType::MAP);
rb->SendSimpleString("max_symbol");
rb->SendLong(hist.max_symbol);
// in case we load a bintable, table_max_symbol may be different from max_symbol.
// if it's smaller, it means our table can not encode all symbols.
rb->SendSimpleString("table_max_symbol");
rb->SendLong(table_max_symbol);
rb->SendSimpleString("max_bits");
rb->SendLong(num_bits);
rb->SendSimpleString("raw_size");