mirror of
https://github.com/dragonflydb/dragonfly.git
synced 2025-05-11 18:35:46 +02:00
feat: implement ascii_unpack using SIMD instructions (#573)
Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
parent
bcafd7e25d
commit
c18cb8208e
4 changed files with 117 additions and 25 deletions
|
@ -736,7 +736,7 @@ string_view CompactObj::GetSlice(string* scratch) const {
|
|||
DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
|
||||
size_t decoded_len = DecodedLen(u_.r_obj.Size());
|
||||
scratch->resize(decoded_len);
|
||||
detail::ascii_unpack(to_byte(u_.r_obj.inner_obj()), decoded_len, scratch->data());
|
||||
detail::ascii_unpack_simd(to_byte(u_.r_obj.inner_obj()), decoded_len, scratch->data());
|
||||
} else if (taglen_ == SMALL_TAG) {
|
||||
size_t decoded_len = DecodedLen(u_.small_str.size());
|
||||
size_t space_left = decoded_len - u_.small_str.size();
|
||||
|
@ -749,8 +749,8 @@ string_view CompactObj::GetSlice(string* scratch) const {
|
|||
memcpy(next, slices[0].data(), slices[0].size());
|
||||
next += slices[0].size();
|
||||
memcpy(next, slices[1].data(), slices[1].size());
|
||||
detail::ascii_unpack(reinterpret_cast<uint8_t*>(scratch->data() + space_left), decoded_len,
|
||||
scratch->data());
|
||||
detail::ascii_unpack_simd(reinterpret_cast<uint8_t*>(scratch->data() + space_left),
|
||||
decoded_len, scratch->data());
|
||||
} else {
|
||||
LOG(FATAL) << "Unsupported tag " << int(taglen_);
|
||||
}
|
||||
|
@ -839,7 +839,7 @@ void CompactObj::GetString(char* dest) const {
|
|||
CHECK_EQ(OBJ_STRING, u_.r_obj.type());
|
||||
DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
|
||||
size_t decoded_len = DecodedLen(u_.r_obj.Size());
|
||||
detail::ascii_unpack(to_byte(u_.r_obj.inner_obj()), decoded_len, dest);
|
||||
detail::ascii_unpack_simd(to_byte(u_.r_obj.inner_obj()), decoded_len, dest);
|
||||
} else if (taglen_ == SMALL_TAG) {
|
||||
size_t decoded_len = DecodedLen(u_.small_str.size());
|
||||
|
||||
|
@ -853,7 +853,7 @@ void CompactObj::GetString(char* dest) const {
|
|||
memcpy(next, slices[0].data(), slices[0].size());
|
||||
next += slices[0].size();
|
||||
memcpy(next, slices[1].data(), slices[1].size());
|
||||
detail::ascii_unpack(reinterpret_cast<uint8_t*>(dest + space_left), decoded_len, dest);
|
||||
detail::ascii_unpack_simd(reinterpret_cast<uint8_t*>(dest + space_left), decoded_len, dest);
|
||||
} else {
|
||||
LOG(FATAL) << "Unsupported tag " << int(taglen_);
|
||||
}
|
||||
|
|
|
@ -192,20 +192,20 @@ TEST_F(CompactObjectTest, AsciiUtil) {
|
|||
|
||||
char outbuf[32] = "xxxxxxxxxxxxxx";
|
||||
detail::ascii_pack_simd(data.data(), 7, buf);
|
||||
detail::ascii_unpack(buf, 7, outbuf);
|
||||
detail::ascii_unpack_simd(buf, 7, outbuf);
|
||||
|
||||
ASSERT_EQ('x', outbuf[7]) << outbuf;
|
||||
std::string_view actual{outbuf, 7};
|
||||
ASSERT_EQ(data.substr(0, 7), actual);
|
||||
|
||||
string data3;
|
||||
for (unsigned i = 0; i < 97; ++i) {
|
||||
for (unsigned i = 0; i < 13; ++i) {
|
||||
data3.append("12345678910");
|
||||
}
|
||||
string act_str(data3.size(), 'y');
|
||||
std::vector<uint8_t> binvec(detail::binpacked_len(data3.size()));
|
||||
detail::ascii_pack_simd(data3.data(), data3.size(), binvec.data());
|
||||
detail::ascii_unpack(binvec.data(), data3.size(), act_str.data());
|
||||
detail::ascii_unpack_simd(binvec.data(), data3.size(), act_str.data());
|
||||
|
||||
ASSERT_EQ(data3, act_str);
|
||||
}
|
||||
|
@ -483,6 +483,29 @@ static void ascii_pack_naive(const char* ascii, size_t len, uint8_t* bin) {
|
|||
}
|
||||
}
|
||||
|
||||
// Reference decoder kept for benchmarking: expands 7-bit packed data back into
// one character per byte, one output byte at a time.
//   bin       - packed input.
//   ascii_len - number of characters to produce.
//   ascii     - output buffer receiving ascii_len characters.
// Every full group of 8 characters is rebuilt from 7 packed bytes; the
// remaining (< 8) characters are copied verbatim.
static void ascii_unpack_naive(const uint8_t* bin, size_t ascii_len, char* ascii) {
  constexpr uint8_t kLow7Mask = 0x7F;
  uint8_t prev = 0;

  for (; ascii_len >= 8; ascii_len -= 8) {
    for (unsigned shift = 0; shift < 7; ++shift) {
      // Read the packed byte into a local before storing, in case we unpack inplace.
      const uint8_t cur = *bin++;
      *ascii++ = (prev >> (8 - shift)) | ((cur << shift) & kLow7Mask);
      prev = cur;
    }

    // The 8th character of the group is the upper 7 bits of the last packed byte.
    *ascii++ = prev >> 1;
  }

  DCHECK_LT(ascii_len, 8u);
  for (size_t k = 0; k < ascii_len; ++k) {
    *ascii++ = *bin++;
  }
}
|
||||
|
||||
static void BM_PackNaive(benchmark::State& state) {
|
||||
string val(1024, 'a');
|
||||
uint8_t buf[1024];
|
||||
|
@ -523,4 +546,40 @@ static void BM_PackSimd(benchmark::State& state) {
|
|||
}
|
||||
BENCHMARK(BM_PackSimd);
|
||||
|
||||
static void BM_UnpackNaive(benchmark::State& state) {
|
||||
string val(1024, 'a');
|
||||
uint8_t buf[1024];
|
||||
|
||||
detail::ascii_pack(val.data(), val.size(), buf);
|
||||
|
||||
while (state.KeepRunning()) {
|
||||
ascii_unpack_naive(buf, val.size(), val.data());
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_UnpackNaive);
|
||||
|
||||
static void BM_Unpack(benchmark::State& state) {
|
||||
string val(1024, 'a');
|
||||
uint8_t buf[1024];
|
||||
|
||||
detail::ascii_pack(val.data(), val.size(), buf);
|
||||
|
||||
while (state.KeepRunning()) {
|
||||
detail::ascii_unpack(buf, val.size(), val.data());
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_Unpack);
|
||||
|
||||
static void BM_UnpackSimd(benchmark::State& state) {
|
||||
string val(1024, 'a');
|
||||
uint8_t buf[1024];
|
||||
|
||||
detail::ascii_pack(val.data(), val.size(), buf);
|
||||
|
||||
while (state.KeepRunning()) {
|
||||
detail::ascii_unpack_simd(buf, val.size(), val.data());
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_UnpackSimd);
|
||||
|
||||
} // namespace dfly
|
||||
|
|
|
@ -98,8 +98,6 @@ void ascii_pack2(const char* ascii, size_t len, uint8_t* bin) {
|
|||
|
||||
// The algo - do in parallel what ascii_pack does on two uint64_t integers
|
||||
void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin) {
|
||||
__m128i val;
|
||||
|
||||
// I leave out 16 bytes in addition to 16 that we load in the loop
|
||||
// because we store into bin full 16 bytes instead of 14. To prevent data
|
||||
// overwrite we finish loop one iteration earlier.
|
||||
|
@ -108,7 +106,7 @@ void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin) {
|
|||
// Skips the 8th byte (index 7) in the lower 8-byte part.
|
||||
const __m128i control = _mm_set_epi8(-1, -1, 14, 13, 12, 11, 10, 9, 8, 6, 5, 4, 3, 2, 1, 0);
|
||||
|
||||
__m128i rpart, lpart;
|
||||
__m128i val, rpart, lpart;
|
||||
|
||||
// Based on the question I asked here: https://stackoverflow.com/q/74831843/2280111
|
||||
while (ascii <= end) {
|
||||
|
@ -149,28 +147,62 @@ void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin) {
|
|||
// however, if binary data is positioned on the right of the ascii buffer with empty space on the
|
||||
// left, then we can unpack in place.
|
||||
// Expands 7-bit packed data back into one character per byte.
//   bin       - packed input: every full group of 8 characters occupies 7 bytes,
//               the trailing (< 8) characters are stored verbatim.
//   ascii_len - number of characters to produce.
//   ascii     - output buffer receiving exactly ascii_len characters.
// Each group is loaded into a local 64-bit word before anything is stored, so a
// group is always fully read before its output is written.
// NOTE: the word-based expansion relies on little-endian byte order.
void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii) {
  // Spreads 8 consecutive 7-bit fields (the low 56 bits of v) into 8 separate bytes.
  const auto expand = [](uint64_t v) -> uint64_t {
    v = ((v & 0x00FFFFFFF0000000ULL) << 4) | (v & 0x000000000FFFFFFFULL);
    v = ((v & 0xFFFFC000FFFFC000ULL) << 2) | (v & 0x00003FFF00003FFFULL);
    v = ((v & 0x7F807F807F807F80ULL) << 1) | (v & 0x007F007F007F007FULL);
    return v;
  };

  uint64_t val;

  // While more than 8 characters remain, at least 8 packed bytes remain as well,
  // so a full 8-byte load stays inside the packed buffer. The 8th byte loaded is
  // discarded by the first mask in expand().
  while (ascii_len > 8) {
    memcpy(&val, bin, 8);
    val = expand(val);
    memcpy(ascii, &val, 8);

    ascii += 8;
    bin += 7;
    ascii_len -= 8;
  }

  if (ascii_len == 8) {
    // Final full group: only 7 packed bytes are left, so load exactly 7 to avoid
    // reading one byte past the end of the packed buffer.
    val = 0;
    memcpy(&val, bin, 7);
    val = expand(val);
    memcpy(ascii, &val, 8);
    return;
  }

  // Fewer than 8 characters left: they are stored unpacked.
  for (; ascii_len > 0; --ascii_len) {
    *ascii++ = *bin++;
  }
}
|
||||
|
||||
void ascii_unpack_simd(const uint8_t* bin, size_t ascii_len, char* ascii) {
|
||||
__m128i val, rpart, lpart;
|
||||
|
||||
size_t round_down_len = (ascii_len & ~size_t(0x0F));
|
||||
const char* end = ascii + round_down_len;
|
||||
|
||||
// shifts the second 7-byte blob to the left.
|
||||
const __m128i control = _mm_set_epi8(14, 13, 12, 11, 10, 9, 8, 7, -1, 6, 5, 4, 3, 2, 1, 0);
|
||||
|
||||
while (ascii < end) {
|
||||
val = _mm_loadu_si128(reinterpret_cast<const __m128i*>(bin));
|
||||
val = _mm_shuffle_epi8(val, control);
|
||||
|
||||
rpart = _mm_and_si128(val, _mm_set1_epi64x(0x000000000FFFFFFF));
|
||||
lpart = _mm_and_si128(val, _mm_set1_epi64x(0x00FFFFFFF0000000));
|
||||
val = _mm_or_si128(_mm_slli_epi64(lpart, 4), rpart);
|
||||
|
||||
rpart = _mm_and_si128(val, _mm_set1_epi64x(0x00003FFF00003FFF));
|
||||
lpart = _mm_and_si128(val, _mm_set1_epi64x(0xFFFFC000FFFFC000));
|
||||
val = _mm_or_si128(_mm_slli_epi64(lpart, 2), rpart);
|
||||
|
||||
rpart = _mm_and_si128(val, _mm_set1_epi64x(0x007F007F007F007F));
|
||||
lpart = _mm_and_si128(val, _mm_set1_epi64x(0x7F807F807F807F80));
|
||||
val = _mm_or_si128(_mm_slli_epi64(lpart, 1), rpart);
|
||||
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(ascii), val);
|
||||
ascii += 16;
|
||||
bin += 14;
|
||||
}
|
||||
|
||||
ascii_len -= round_down_len;
|
||||
if (ascii_len)
|
||||
ascii_unpack(bin, ascii_len, ascii);
|
||||
}
|
||||
|
||||
// compares packed and unpacked strings. packed must be of length = binpacked_len(ascii_len).
|
||||
bool compare_packed(const uint8_t* packed, const char* ascii, size_t ascii_len) {
|
||||
unsigned i = 0;
|
||||
|
|
|
@ -19,6 +19,7 @@ bool validate_ascii_fast(const char* src, size_t len);
|
|||
// however, if binary data is positioned on the right of the ascii buffer with empty space on the
|
||||
// left, then we can unpack in place.
|
||||
void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii);
|
||||
void ascii_unpack_simd(const uint8_t* bin, size_t ascii_len, char* ascii);
|
||||
|
||||
// packs ascii string (does not verify) into binary form saving 1 bit per byte on average (12.5%).
|
||||
void ascii_pack(const char* ascii, size_t len, uint8_t* bin);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue