Mirror of https://github.com/dragonflydb/dragonfly.git, synced 2025-05-10 18:05:44 +02:00
chore: introduce huffman encoder class
Move the code in debugcmd.cc into HuffmanEncoder. One of the building blocks for #4883.

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
parent
b3e0bcfb31
commit
9595453ce1
5 changed files with 220 additions and 44 deletions
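Before the diff, a minimal usage sketch of the new class, pieced together from the HuffmanEncoder API in src/core/huff_coder.h below. The histogram values, symbols, and output messages are illustrative assumptions rather than anything in the commit, and the snippet presumes Dragonfly's build tree for the include:

#include <cstdint>
#include <iostream>
#include <string>

#include "core/huff_coder.h"  // added by this commit

int main() {
  // Hypothetical histogram: byte frequencies of the data we expect to encode.
  unsigned hist[256];
  for (unsigned i = 0; i < 256; ++i)
    hist[i] = 1;  // non-zero weight for every symbol, mirroring DebugCmd::Compression
  hist['a'] = 1000;  // skew the distribution so 'a' gets a short code
  hist['b'] = 500;

  dfly::HuffmanEncoder enc;
  std::string err;
  if (!enc.Build(hist, 255, &err)) {
    std::cerr << "Build failed: " << err << '\n';
    return 1;
  }

  // The table round-trips through Export()/Load(), e.g. to hand it to another encoder.
  std::string table = enc.Export();
  dfly::HuffmanEncoder loaded;
  if (!loaded.Load(table, &err)) {
    std::cerr << "Load failed: " << err << '\n';
    return 1;
  }

  // Encode a short string with the loaded table.
  std::string data = "abababab";
  uint8_t dest[64];
  uint32_t dest_size = sizeof(dest);
  if (!loaded.Encode(data, dest, &dest_size, &err)) {
    std::cerr << "Encode failed: " << err << '\n';
    return 1;
  }
  std::cout << data.size() << " bytes -> " << dest_size << " bytes, "
            << unsigned(enc.num_bits()) << " max bits per symbol\n";
  return 0;
}

Note that Build() and Load() are mutually exclusive initializers (both CHECK that no table is loaded yet), and Reset() clears the table so an encoder can be reused; the HuffCoderTest cases below exercise exactly this.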
@@ -21,7 +21,7 @@ add_subdirectory(json)
 set(SEARCH_LIB query_parser)
 
 add_library(dfly_core allocation_tracker.cc bloom.cc compact_object.cc dense_set.cc
-            dragonfly_core.cc extent_tree.cc
+            dragonfly_core.cc extent_tree.cc huff_coder.cc
             interpreter.cc glob_matcher.cc mi_memory_resource.cc qlist.cc sds_utils.cc
             segment_allocator.cc score_map.cc small_string.cc sorted_map.cc task_queue.cc
             tx_queue.cc string_set.cc string_map.cc top_keys.cc detail/bitpacking.cc)
@@ -23,6 +23,7 @@
 #include "base/gtest.h"
 #include "base/logging.h"
 #include "core/glob_matcher.h"
+#include "core/huff_coder.h"
 #include "core/intent_lock.h"
 #include "core/tx_queue.h"
 
@@ -188,6 +189,43 @@ TEST_F(StringMatchTest, Special) {
   EXPECT_TRUE(MatchLen("foo\\", "foo\\", 0));
 }
 
+class HuffCoderTest : public ::testing::Test {
+ protected:
+  HuffmanEncoder encoder_;
+  string error_msg_;
+  const string_view good_table_{
+      "\x1b\x10\xd8\n\n\x19\xc6\x0c\xc3\x30\x0c\x43\x1e\x93\xe4\x11roB\xf6\xde\xbb\x18V\xc2Zk\x03"sv};
+};
+
+TEST_F(HuffCoderTest, Load) {
+  string data("bad");
+
+  ASSERT_FALSE(encoder_.Load(data, &error_msg_));
+
+  data = good_table_;
+  ASSERT_TRUE(encoder_.Load(data, &error_msg_)) << error_msg_;
+
+  data.append("foo");
+  encoder_.Reset();
+  ASSERT_FALSE(encoder_.Load(data, &error_msg_));
+}
+
+TEST_F(HuffCoderTest, Encode) {
+  ASSERT_TRUE(encoder_.Load(good_table_, &error_msg_)) << error_msg_;
+
+  EXPECT_EQ(1, encoder_.BitCount('x'));
+  EXPECT_EQ(3, encoder_.BitCount(':'));
+  EXPECT_EQ(5, encoder_.BitCount('2'));
+  EXPECT_EQ(5, encoder_.BitCount('3'));
+
+  string data("x:23xx");
+
+  uint8_t dest[100];
+  uint32_t dest_size = sizeof(dest);
+  ASSERT_TRUE(encoder_.Encode(data, dest, &dest_size, &error_msg_));
+  ASSERT_EQ(3, dest_size);
+}
+
 using benchmark::DoNotOptimize;
 
 // Parse Double benchmarks
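A note on the Encode test's expected size, as an inference from the assertions above: "x:23xx" costs 1 + 3 + 5 + 5 + 1 + 1 = 16 bits per the BitCount checks, and zstd's single-stream Huffman writer (HUF_compress1X_usingCTable, used by Encode below) closes the bitstream with an end-of-stream marker bit before padding to a whole byte, so the payload rounds up to 17 bits, i.e. the 3 bytes the test asserts rather than 2.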
src/core/huff_coder.cc (new file, 110 lines)
@@ -0,0 +1,110 @@
// Copyright 2025, DragonflyDB authors. All rights reserved.
// See LICENSE for licensing terms.
//

#include "core/huff_coder.h"

#include "base/logging.h"

extern "C" {
#include "huff/huf.h"
}

using namespace std;

namespace dfly {

constexpr size_t kWspSize = HUF_CTABLE_WORKSPACE_SIZE;

bool HuffmanEncoder::Load(std::string_view binary_data, std::string* error_msg) {
  CHECK(!huf_ctable_);

  huf_ctable_.reset(new HUF_CElt[HUF_CTABLE_SIZE_ST(255)]);
  table_max_symbol_ = 255;

  unsigned has_zero_weights = 0;
  size_t read_size = HUF_readCTable(huf_ctable_.get(), &table_max_symbol_, binary_data.data(),
                                    binary_data.size(), &has_zero_weights);

  if (HUF_isError(read_size)) {
    huf_ctable_.reset();
    *error_msg = HUF_getErrorName(read_size);
    return false;
  }
  if (read_size != binary_data.size()) {
    *error_msg = "Corrupted data";
    huf_ctable_.reset();
    return false;
  }

  return true;
}

bool HuffmanEncoder::Build(const unsigned hist[], unsigned max_symbol, std::string* error_msg) {
  CHECK(!huf_ctable_);
  huf_ctable_.reset(new HUF_CElt[HUF_CTABLE_SIZE_ST(max_symbol)]);

  unique_ptr<uint32_t[]> wrkspace(new uint32_t[HUF_CTABLE_WORKSPACE_SIZE_U32]);

  size_t num_bits =
      HUF_buildCTable_wksp(huf_ctable_.get(), hist, max_symbol, 0, wrkspace.get(), kWspSize);
  if (HUF_isError(num_bits)) {
    *error_msg = HUF_getErrorName(num_bits);
    huf_ctable_.reset();
    return false;
  }
  num_bits_ = static_cast<uint8_t>(num_bits);
  table_max_symbol_ = max_symbol;
  return true;
}

void HuffmanEncoder::Reset() {
  huf_ctable_.reset();
  table_max_symbol_ = 0;
}

bool HuffmanEncoder::Encode(std::string_view data, uint8_t* dest, uint32_t* dest_size,
                            std::string* error_msg) const {
  DCHECK(huf_ctable_);

  size_t res =
      HUF_compress1X_usingCTable(dest, *dest_size, data.data(), data.size(), huf_ctable_.get(), 0);

  if (HUF_isError(res)) {
    *error_msg = HUF_getErrorName(res);
    return false;
  }
  *dest_size = static_cast<uint32_t>(res);
  return true;
}

unsigned HuffmanEncoder::BitCount(uint8_t symbol) const {
  DCHECK(huf_ctable_);
  return HUF_getNbBitsFromCTable(huf_ctable_.get(), symbol);
}

size_t HuffmanEncoder::EstimateCompressedSize(const unsigned hist[], unsigned max_symbol) const {
  DCHECK(huf_ctable_);
  size_t res = HUF_estimateCompressedSize(huf_ctable_.get(), hist, max_symbol);
  return res;
}

string HuffmanEncoder::Export() const {
  DCHECK(huf_ctable_);

  // Reverse engineered: (maxSymbolValue + 1) / 2 + 1.
  constexpr unsigned kMaxTableSize = 130;
  string res;
  res.resize(kMaxTableSize);

  unique_ptr<uint32_t[]> wrkspace(new uint32_t[HUF_CTABLE_WORKSPACE_SIZE_U32]);

  // Seems we can reuse the same workspace, its capacity is enough.
  size_t size = HUF_writeCTable_wksp(res.data(), res.size(), huf_ctable_.get(), table_max_symbol_,
                                     num_bits_, wrkspace.get(), kWspSize);
  CHECK(!HUF_isError(size));
  res.resize(size);
  return res;
}

}  // namespace dfly
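A quick sanity check on Export()'s buffer, using the reverse-engineered worst case noted in the code: (maxSymbolValue + 1) / 2 + 1 gives (255 + 1) / 2 + 1 = 129 bytes for a full 256-symbol table, so the hard-coded kMaxTableSize of 130 leaves one byte of slack. Typical tables are much smaller; the good_table_ fixture in the tests above is only 28 bytes.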
src/core/huff_coder.h (new file, 50 lines)
@@ -0,0 +1,50 @@
// Copyright 2025, DragonflyDB authors. All rights reserved.
// See LICENSE for licensing terms.
//

#pragma once

#include <cstdint>
#include <memory>
#include <string>
#include <string_view>

namespace dfly {

class HuffmanEncoder {
 public:
  bool Build(const unsigned hist[], unsigned max_symbol, std::string* error_msg);

  bool Encode(std::string_view data, uint8_t* dest, uint32_t* dest_size,
              std::string* error_msg) const;
  unsigned BitCount(uint8_t symbol) const;

  size_t EstimateCompressedSize(const unsigned hist[], unsigned max_symbol) const;

  void Reset();

  // Load using the serialized data produced by Export().
  bool Load(std::string_view binary_data, std::string* error_msg);

  // Exports a binary representation of the table that can be loaded using Load().
  std::string Export() const;

  uint8_t num_bits() const {
    return num_bits_;
  }

  bool valid() const {
    return bool(huf_ctable_);
  }

  unsigned max_symbol() const {
    return table_max_symbol_;
  }

 private:
  using HUF_CElt = size_t;
  std::unique_ptr<HUF_CElt[]> huf_ctable_;
  unsigned table_max_symbol_ = 0;
  uint8_t num_bits_ = 0;
};

}  // namespace dfly
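The Build/EstimateCompressedSize pair in this header is how DebugCmd::Compression (diff below) derives its statistics. A hedged sketch of that flow, with a hypothetical helper name and the 256-entry histogram as an assumption:

#include <string>

#include "core/huff_coder.h"

// Hypothetical helper: estimate the achievable compression ratio for data
// whose byte frequencies are given in hist[0..255]. Mirrors the Build +
// EstimateCompressedSize sequence in DebugCmd::Compression below.
double EstimateRatio(unsigned hist[256], std::string* err) {
  size_t raw_size = 0;
  for (unsigned i = 0; i < 256; ++i) {
    raw_size += hist[i];
    if (hist[i] == 0)
      hist[i] = 1;  // force non-zero weights so the table covers every byte
  }
  dfly::HuffmanEncoder enc;
  if (!enc.Build(hist, 255, err))
    return 0.0;
  size_t compressed = enc.EstimateCompressedSize(hist, 255);
  return raw_size ? double(compressed) / double(raw_size) : 0.0;
}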
@@ -13,6 +13,7 @@ extern "C" {
 
 #include <absl/cleanup/cleanup.h>
 #include <absl/random/random.h>
 #include <absl/strings/escaping.h>
 #include <absl/strings/match.h>
 #include <absl/strings/str_cat.h>
 #include <lz4.h>
@@ -24,6 +25,7 @@ extern "C" {
 #include "base/flags.h"
 #include "base/logging.h"
 #include "core/compact_object.h"
+#include "core/huff_coder.h"
 #include "core/qlist.h"
 #include "core/sorted_map.h"
 #include "core/string_map.h"
@ -1318,71 +1320,47 @@ void DebugCmd::Compression(CmdArgList args, facade::SinkReplyBuilder* builder) {
|
|||
});
|
||||
|
||||
size_t num_bits = 0, compressed_size = 0, raw_size = 0;
|
||||
unsigned table_max_symbol = 255;
|
||||
|
||||
if (hist.max_symbol) {
|
||||
HUF_CREATE_STATIC_CTABLE(huf_ctable, HufHist::kMaxSymbol);
|
||||
HuffmanEncoder huff_enc;
|
||||
string err_msg;
|
||||
|
||||
unique_ptr<uint32_t[]> wrkspace(new uint32_t[HUF_CTABLE_WORKSPACE_SIZE_U32]);
|
||||
constexpr size_t kWspSize = HUF_CTABLE_WORKSPACE_SIZE;
|
||||
raw_size = 0;
|
||||
for (unsigned i = 0; i <= HufHist::kMaxSymbol; i++) {
|
||||
raw_size += hist.hist[i];
|
||||
|
||||
// force non-zero weights for all symbols.
|
||||
if (hist.hist[i] == 0)
|
||||
hist.hist[i] = 1;
|
||||
}
|
||||
|
||||
if (bintable.empty()) {
|
||||
table_max_symbol = hist.max_symbol;
|
||||
num_bits = HUF_buildCTable_wksp(huf_ctable, hist.hist.data(), table_max_symbol, 0,
|
||||
wrkspace.get(), kWspSize);
|
||||
if (HUF_isError(num_bits)) {
|
||||
return rb->SendError(StrCat("Internal error: ", HUF_getErrorName(num_bits)));
|
||||
if (!huff_enc.Build(hist.hist.data(), HufHist::kMaxSymbol, &err_msg)) {
|
||||
return rb->SendError(StrCat("Internal error: ", err_msg));
|
||||
}
|
||||
} else {
|
||||
// Try to read the bintable and create a ctable from it.
|
||||
unsigned has_zero_weights = 1;
|
||||
|
||||
size_t read_size = HUF_readCTable(huf_ctable, &table_max_symbol, bintable.data(),
|
||||
bintable.size(), &has_zero_weights);
|
||||
if (HUF_isError(read_size)) {
|
||||
return rb->SendError(StrCat("Internal error: ", HUF_getErrorName(read_size)));
|
||||
}
|
||||
if (read_size != bintable.size()) {
|
||||
return rb->SendError("Invalid bintable");
|
||||
if (!huff_enc.Load(bintable, &err_msg)) {
|
||||
return rb->SendError(StrCat("Internal error: ", err_msg));
|
||||
}
|
||||
}
|
||||
|
||||
compressed_size = HUF_estimateCompressedSize(huf_ctable, hist.hist.data(), table_max_symbol);
|
||||
for (unsigned i = table_max_symbol + 1; i <= hist.max_symbol; i++) {
|
||||
compressed_size += hist.hist[i];
|
||||
}
|
||||
raw_size = 0;
|
||||
for (unsigned i = 0; i <= hist.max_symbol; i++) {
|
||||
raw_size += hist.hist[i];
|
||||
}
|
||||
num_bits = huff_enc.num_bits();
|
||||
compressed_size = huff_enc.EstimateCompressedSize(hist.hist.data(), HufHist::kMaxSymbol);
|
||||
|
||||
if (print_bintable) {
|
||||
// Reverse engineered: (maxSymbolValue + 1) / 2 + 1.
|
||||
constexpr unsigned kMaxTableSize = 130;
|
||||
bintable.resize(kMaxTableSize);
|
||||
|
||||
// Seems we can reuse the same workspace, its capacity is enough.
|
||||
size_t res = HUF_writeCTable_wksp(bintable.data(), kMaxTableSize, huf_ctable,
|
||||
table_max_symbol, num_bits, wrkspace.get(), kWspSize);
|
||||
if (HUF_isError(res)) {
|
||||
return rb->SendError(StrCat("Internal error: ", HUF_getErrorName(res)));
|
||||
}
|
||||
bintable.resize(res);
|
||||
bintable = huff_enc.Export();
|
||||
VLOG(1) << "bintable: " << absl::CHexEscape(bintable);
|
||||
} else {
|
||||
bintable.clear();
|
||||
}
|
||||
}
|
||||
|
||||
unsigned map_len = print_bintable ? 7 : 6;
|
||||
unsigned map_len = print_bintable ? 6 : 5;
|
||||
|
||||
rb->StartCollection(map_len, RedisReplyBuilder::CollectionType::MAP);
|
||||
rb->SendSimpleString("max_symbol");
|
||||
rb->SendLong(hist.max_symbol);
|
||||
|
||||
// in case we load a bintable, table_max_symbol may be different from max_symbol.
|
||||
// if it's smaller, it means our table can not encode all symbols.
|
||||
rb->SendSimpleString("table_max_symbol");
|
||||
rb->SendLong(table_max_symbol);
|
||||
rb->SendSimpleString("max_bits");
|
||||
rb->SendLong(num_bits);
|
||||
rb->SendSimpleString("raw_size");
|
||||
|