From 3f3d232211e2d0f4eba574baae05abcc36b3ee7a Mon Sep 17 00:00:00 2001
From: Roman Gershman
Date: Wed, 7 May 2025 10:02:40 +0300
Subject: [PATCH] chore: introduce huffman encoder class (#5076)

Move the code in debugcmd.cc into HuffmanEncoder.
One of the building blocks for #4883

Signed-off-by: Roman Gershman
---
 src/core/CMakeLists.txt    |   2 +-
 src/core/dfly_core_test.cc |  38 +++++++++++++
 src/core/huff_coder.cc     | 110 +++++++++++++++++++++++++++++++++++++
 src/core/huff_coder.h      |  50 +++++++++++++++++
 src/server/debugcmd.cc     |  64 +++++++--------------
 5 files changed, 220 insertions(+), 44 deletions(-)
 create mode 100644 src/core/huff_coder.cc
 create mode 100644 src/core/huff_coder.h

diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index f3bcbd567..833dbc7c6 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -21,7 +21,7 @@ add_subdirectory(json)
 set(SEARCH_LIB query_parser)
 
 add_library(dfly_core allocation_tracker.cc bloom.cc compact_object.cc dense_set.cc
-            dragonfly_core.cc extent_tree.cc
+            dragonfly_core.cc extent_tree.cc huff_coder.cc
             interpreter.cc glob_matcher.cc mi_memory_resource.cc qlist.cc sds_utils.cc
             segment_allocator.cc score_map.cc small_string.cc sorted_map.cc task_queue.cc
             tx_queue.cc string_set.cc string_map.cc top_keys.cc detail/bitpacking.cc)
diff --git a/src/core/dfly_core_test.cc b/src/core/dfly_core_test.cc
index c08900451..b5f907c2b 100644
--- a/src/core/dfly_core_test.cc
+++ b/src/core/dfly_core_test.cc
@@ -23,6 +23,7 @@
 #include "base/gtest.h"
 #include "base/logging.h"
 #include "core/glob_matcher.h"
+#include "core/huff_coder.h"
 #include "core/intent_lock.h"
 #include "core/tx_queue.h"
 
@@ -188,6 +189,43 @@ TEST_F(StringMatchTest, Special) {
   EXPECT_TRUE(MatchLen("foo\\", "foo\\", 0));
 }
 
+class HuffCoderTest : public ::testing::Test {
+ protected:
+  HuffmanEncoder encoder_;
+  string error_msg_;
+  const string_view good_table_{
+      "\x1b\x10\xd8\n\n\x19\xc6\x0c\xc3\x30\x0c\x43\x1e\x93\xe4\x11roB\xf6\xde\xbb\x18V\xc2Zk\x03"sv};
+};
+
+TEST_F(HuffCoderTest, Load) {
+  string data("bad");
+
+  ASSERT_FALSE(encoder_.Load(data, &error_msg_));
+
+  data = good_table_;
+  ASSERT_TRUE(encoder_.Load(data, &error_msg_)) << error_msg_;
+
+  data.append("foo");
+  encoder_.Reset();
+  ASSERT_FALSE(encoder_.Load(data, &error_msg_));
+}
+
+TEST_F(HuffCoderTest, Encode) {
+  ASSERT_TRUE(encoder_.Load(good_table_, &error_msg_)) << error_msg_;
+
+  EXPECT_EQ(1, encoder_.BitCount('x'));
+  EXPECT_EQ(3, encoder_.BitCount(':'));
+  EXPECT_EQ(5, encoder_.BitCount('2'));
+  EXPECT_EQ(5, encoder_.BitCount('3'));
+
+  string data("x:23xx");
+
+  uint8_t dest[100];
+  uint32_t dest_size = sizeof(dest);
+  ASSERT_TRUE(encoder_.Encode(data, dest, &dest_size, &error_msg_));
+  ASSERT_EQ(3, dest_size);
+}
+
 using benchmark::DoNotOptimize;
 
 // Parse Double benchmarks
diff --git a/src/core/huff_coder.cc b/src/core/huff_coder.cc
new file mode 100644
index 000000000..aecd2acc7
--- /dev/null
+++ b/src/core/huff_coder.cc
@@ -0,0 +1,110 @@
+// Copyright 2025, DragonflyDB authors. All rights reserved.
+// See LICENSE for licensing terms.
+//
+
+#include "core/huff_coder.h"
+
+#include "base/logging.h"
+
+extern "C" {
+#include "huff/huf.h"
+}
+
+using namespace std;
+
+namespace dfly {
+
+constexpr size_t kWspSize = HUF_CTABLE_WORKSPACE_SIZE;
+
+bool HuffmanEncoder::Load(std::string_view binary_data, std::string* error_msg) {
+  CHECK(!huf_ctable_);
+
+  huf_ctable_.reset(new HUF_CElt[HUF_CTABLE_SIZE_ST(255)]);
+  table_max_symbol_ = 255;
+
+  unsigned has_zero_weights = 0;
+  size_t read_size = HUF_readCTable(huf_ctable_.get(), &table_max_symbol_, binary_data.data(),
+                                    binary_data.size(), &has_zero_weights);
+
+  if (HUF_isError(read_size)) {
+    huf_ctable_.reset();
+    *error_msg = HUF_getErrorName(read_size);
+    return false;
+  }
+  if (read_size != binary_data.size()) {
+    *error_msg = "Corrupted data";
+    huf_ctable_.reset();
+    return false;
+  }
+
+  return true;
+}
+
+bool HuffmanEncoder::Build(const unsigned hist[], unsigned max_symbol, std::string* error_msg) {
+  CHECK(!huf_ctable_);
+  huf_ctable_.reset(new HUF_CElt[HUF_CTABLE_SIZE_ST(max_symbol)]);
+
+  unique_ptr<uint32_t[]> wrkspace(new uint32_t[HUF_CTABLE_WORKSPACE_SIZE_U32]);
+
+  size_t num_bits =
+      HUF_buildCTable_wksp(huf_ctable_.get(), hist, max_symbol, 0, wrkspace.get(), kWspSize);
+  if (HUF_isError(num_bits)) {
+    *error_msg = HUF_getErrorName(num_bits);
+    huf_ctable_.reset();
+    return false;
+  }
+  num_bits_ = static_cast<uint8_t>(num_bits);
+  table_max_symbol_ = max_symbol;
+  return true;
+}
+
+void HuffmanEncoder::Reset() {
+  huf_ctable_.reset();
+  table_max_symbol_ = 0;
+}
+
+bool HuffmanEncoder::Encode(std::string_view data, uint8_t* dest, uint32_t* dest_size,
+                            std::string* error_msg) const {
+  DCHECK(huf_ctable_);
+
+  size_t res =
+      HUF_compress1X_usingCTable(dest, *dest_size, data.data(), data.size(), huf_ctable_.get(), 0);
+
+  if (HUF_isError(res)) {
+    *error_msg = HUF_getErrorName(res);
+    return false;
+  }
+  *dest_size = static_cast<uint32_t>(res);
+  return true;
+}
+
+unsigned HuffmanEncoder::BitCount(uint8_t symbol) const {
+  DCHECK(huf_ctable_);
+  return HUF_getNbBitsFromCTable(huf_ctable_.get(), symbol);
+}
+
+size_t HuffmanEncoder::EstimateCompressedSize(const unsigned hist[], unsigned max_symbol) const {
+  DCHECK(huf_ctable_);
+  size_t res = HUF_estimateCompressedSize(huf_ctable_.get(), hist, max_symbol);
+  return res;
+}
+
+string HuffmanEncoder::Export() const {
+  DCHECK(huf_ctable_);
+
+  // Reverse engineered: (maxSymbolValue + 1) / 2 + 1.
+  constexpr unsigned kMaxTableSize = 130;
+  string res;
+  res.resize(kMaxTableSize);
+
+  unique_ptr<uint32_t[]> wrkspace(new uint32_t[HUF_CTABLE_WORKSPACE_SIZE_U32]);
+
+  // Seems we can reuse the same workspace, its capacity is enough.
+  size_t size = HUF_writeCTable_wksp(res.data(), res.size(), huf_ctable_.get(), table_max_symbol_,
+                                     num_bits_, wrkspace.get(), kWspSize);
+  CHECK(!HUF_isError(size));
+  res.resize(size);
+  return res;
+}
+
+}  // namespace dfly
diff --git a/src/core/huff_coder.h b/src/core/huff_coder.h
new file mode 100644
index 000000000..d9bf13b1e
--- /dev/null
+++ b/src/core/huff_coder.h
@@ -0,0 +1,50 @@
+// Copyright 2025, DragonflyDB authors. All rights reserved.
+// See LICENSE for licensing terms.
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+namespace dfly {
+
+class HuffmanEncoder {
+ public:
+  bool Build(const unsigned hist[], unsigned max_symbol, std::string* error_msg);
+
+  bool Encode(std::string_view data, uint8_t* dest, uint32_t* dest_size,
+              std::string* error_msg) const;
+  unsigned BitCount(uint8_t symbol) const;
+
+  size_t EstimateCompressedSize(const unsigned hist[], unsigned max_symbol) const;
+
+  void Reset();
+
+  // Load using the serialized data produced by Export().
+  bool Load(std::string_view binary_data, std::string* error_msg);
+
+  // Exports a binary representation of the table, that can be loaded using Load().
+  std::string Export() const;
+
+  uint8_t num_bits() const {
+    return num_bits_;
+  }
+
+  bool valid() const {
+    return bool(huf_ctable_);
+  }
+
+  unsigned max_symbol() const {
+    return table_max_symbol_;
+  }
+
+ private:
+  using HUF_CElt = size_t;
+  std::unique_ptr<HUF_CElt[]> huf_ctable_;
+  unsigned table_max_symbol_ = 0;
+  uint8_t num_bits_ = 0;
+};
+
+}  // namespace dfly
diff --git a/src/server/debugcmd.cc b/src/server/debugcmd.cc
index ed75a4a21..bee5f1807 100644
--- a/src/server/debugcmd.cc
+++ b/src/server/debugcmd.cc
@@ -13,6 +13,7 @@ extern "C" {
 #include
 #include
+#include
 #include
 #include
 #include
@@ -24,6 +25,7 @@ extern "C" {
 #include "base/flags.h"
 #include "base/logging.h"
 #include "core/compact_object.h"
+#include "core/huff_coder.h"
 #include "core/qlist.h"
 #include "core/sorted_map.h"
 #include "core/string_map.h"
@@ -1318,71 +1320,47 @@ void DebugCmd::Compression(CmdArgList args, facade::SinkReplyBuilder* builder) {
   });
 
   size_t num_bits = 0, compressed_size = 0, raw_size = 0;
-  unsigned table_max_symbol = 255;
 
   if (hist.max_symbol) {
-    HUF_CREATE_STATIC_CTABLE(huf_ctable, HufHist::kMaxSymbol);
+    HuffmanEncoder huff_enc;
+    string err_msg;
 
-    unique_ptr<uint32_t[]> wrkspace(new uint32_t[HUF_CTABLE_WORKSPACE_SIZE_U32]);
-    constexpr size_t kWspSize = HUF_CTABLE_WORKSPACE_SIZE;
+    raw_size = 0;
+    for (unsigned i = 0; i <= HufHist::kMaxSymbol; i++) {
+      raw_size += hist.hist[i];
+
+      // force non-zero weights for all symbols.
+      if (hist.hist[i] == 0)
+        hist.hist[i] = 1;
+    }
 
     if (bintable.empty()) {
-      table_max_symbol = hist.max_symbol;
-      num_bits = HUF_buildCTable_wksp(huf_ctable, hist.hist.data(), table_max_symbol, 0,
-                                      wrkspace.get(), kWspSize);
-      if (HUF_isError(num_bits)) {
-        return rb->SendError(StrCat("Internal error: ", HUF_getErrorName(num_bits)));
+      if (!huff_enc.Build(hist.hist.data(), HufHist::kMaxSymbol, &err_msg)) {
+        return rb->SendError(StrCat("Internal error: ", err_msg));
       }
     } else {  // Try to read the bintable and create a ctable from it.
-      unsigned has_zero_weights = 1;
-
-      size_t read_size = HUF_readCTable(huf_ctable, &table_max_symbol, bintable.data(),
-                                        bintable.size(), &has_zero_weights);
-      if (HUF_isError(read_size)) {
-        return rb->SendError(StrCat("Internal error: ", HUF_getErrorName(read_size)));
-      }
-      if (read_size != bintable.size()) {
-        return rb->SendError("Invalid bintable");
+      if (!huff_enc.Load(bintable, &err_msg)) {
+        return rb->SendError(StrCat("Internal error: ", err_msg));
       }
     }
-
-    compressed_size = HUF_estimateCompressedSize(huf_ctable, hist.hist.data(), table_max_symbol);
-    for (unsigned i = table_max_symbol + 1; i <= hist.max_symbol; i++) {
-      compressed_size += hist.hist[i];
-    }
-    raw_size = 0;
-    for (unsigned i = 0; i <= hist.max_symbol; i++) {
-      raw_size += hist.hist[i];
-    }
+    num_bits = huff_enc.num_bits();
+    compressed_size = huff_enc.EstimateCompressedSize(hist.hist.data(), HufHist::kMaxSymbol);
 
     if (print_bintable) {
-      // Reverse engineered: (maxSymbolValue + 1) / 2 + 1.
-      constexpr unsigned kMaxTableSize = 130;
-      bintable.resize(kMaxTableSize);
-
-      // Seems we can reuse the same workspace, its capacity is enough.
-      size_t res = HUF_writeCTable_wksp(bintable.data(), kMaxTableSize, huf_ctable,
-                                        table_max_symbol, num_bits, wrkspace.get(), kWspSize);
-      if (HUF_isError(res)) {
-        return rb->SendError(StrCat("Internal error: ", HUF_getErrorName(res)));
-      }
-      bintable.resize(res);
+      bintable = huff_enc.Export();
+      VLOG(1) << "bintable: " << absl::CHexEscape(bintable);
     } else {
       bintable.clear();
     }
   }
 
-  unsigned map_len = print_bintable ? 7 : 6;
+  unsigned map_len = print_bintable ? 6 : 5;
 
   rb->StartCollection(map_len, RedisReplyBuilder::CollectionType::MAP);
   rb->SendSimpleString("max_symbol");
   rb->SendLong(hist.max_symbol);
 
-  // in case we load a bintable, table_max_symbol may be different from max_symbol.
-  // if it's smaller, it means our table can not encode all symbols.
-  rb->SendSimpleString("table_max_symbol");
-  rb->SendLong(table_max_symbol);
   rb->SendSimpleString("max_bits");
   rb->SendLong(num_bits);
   rb->SendSimpleString("raw_size");
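
Not part of the patch: a minimal usage sketch of the new HuffmanEncoder API, mirroring the flow that HuffCoderTest and DebugCmd::Compression exercise above (build a table from a byte histogram, then encode). The helper name CompressOnce and the output-buffer sizing are illustrative assumptions, not code from this change.

// Illustrative only -- not part of the patch.
#include <cstdint>
#include <string>
#include <string_view>

#include "core/huff_coder.h"

std::string CompressOnce(std::string_view sample) {
  // Histogram over byte values, with forced non-zero weights as DebugCmd::Compression does,
  // so that every symbol is encodable by the resulting table.
  unsigned hist[256] = {0};
  for (unsigned char c : sample)
    ++hist[c];
  for (unsigned& h : hist)
    if (h == 0)
      h = 1;

  dfly::HuffmanEncoder enc;
  std::string err;
  if (!enc.Build(hist, 255, &err))
    return {};  // err carries the HUF error name.

  // Encode into a buffer; Encode() shrinks *dest_size to the compressed length.
  std::string out(sample.size() + 64, '\0');  // ad-hoc upper bound for this sketch
  uint32_t out_size = static_cast<uint32_t>(out.size());
  if (!enc.Encode(sample, reinterpret_cast<uint8_t*>(out.data()), &out_size, &err))
    return {};
  out.resize(out_size);
  return out;  // enc.Export() would serialize the table so a later Load() can rebuild the encoder.
}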