From d64b4e01ea4fe3abb3c79798a2967a001722c0d5 Mon Sep 17 00:00:00 2001 From: Roman Gershman Date: Sun, 2 Jan 2022 10:40:16 +0200 Subject: [PATCH] Implement basic list operations Fix redis linker dependencies. Bring more redis (BSD-3) code. Fix bugs related to multi-database redis. Introduce multiple types for redis values. --- core/op_status.h | 1 + redis/CMakeLists.txt | 6 +- redis/crc64.c | 161 ++ redis/crc64.h | 13 + redis/crcspeed.c | 282 +++ redis/crcspeed.h | 60 + redis/debug.c | 113 + redis/dict.c | 2 - redis/object.h | 2 - redis/redis_aux.c | 50 +- redis/redis_aux.h | 6 - redis/siphash.c | 373 +++ redis/t_zset.c | 4358 ++++++++++++++++++++++++++++++++++++ server/CMakeLists.txt | 4 +- server/common.cc | 1 + server/common_types.h | 2 + server/db_slice.cc | 2 +- server/db_slice.h | 2 +- server/engine_shard_set.cc | 3 + server/engine_shard_set.h | 6 + server/error.h | 1 + server/list_family.cc | 282 +++ server/list_family.h | 38 + server/main_service.cc | 2 + server/string_family.cc | 31 +- server/table.h | 6 + 26 files changed, 5734 insertions(+), 73 deletions(-) create mode 100644 redis/crc64.c create mode 100644 redis/crc64.h create mode 100644 redis/crcspeed.c create mode 100644 redis/crcspeed.h create mode 100644 redis/debug.c create mode 100644 redis/siphash.c create mode 100644 redis/t_zset.c create mode 100644 server/list_family.cc create mode 100644 server/list_family.h diff --git a/core/op_status.h b/core/op_status.h index 8be7e524a..060db350c 100644 --- a/core/op_status.h +++ b/core/op_status.h @@ -12,6 +12,7 @@ enum class OpStatus : uint16_t { OK, KEY_NOTFOUND, SKIPPED, + WRONG_TYPE, }; class OpResultBase { diff --git a/redis/CMakeLists.txt b/redis/CMakeLists.txt index d69da47ec..7f4166da7 100644 --- a/redis/CMakeLists.txt +++ b/redis/CMakeLists.txt @@ -1,5 +1,5 @@ -add_library(redis_lib endianconv.c intset.c listpack.c object.c - lzf_c.c lzf_d.c sds.c sha256.c - quicklist.c util.c zmalloc.c) +add_library(redis_lib crc64.c crcspeed.c debug.c dict.c endianconv.c intset.c + listpack.c mt19937-64.c object.c lzf_c.c lzf_d.c sds.c sha256.c + quicklist.c redis_aux.c siphash.c t_zset.c util.c zmalloc.c) cxx_link(redis_lib) diff --git a/redis/crc64.c b/redis/crc64.c new file mode 100644 index 000000000..73e039145 --- /dev/null +++ b/redis/crc64.c @@ -0,0 +1,161 @@ +/* Copyright (c) 2014, Matt Stancliff + * Copyright (c) 2020, Amazon Web Services + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. */ + +#include "crc64.h" +#include "crcspeed.h" +static uint64_t crc64_table[8][256] = {{0}}; + +#define POLY UINT64_C(0xad93d23594c935a9) +/******************** BEGIN GENERATED PYCRC FUNCTIONS ********************/ +/** + * Generated on Sun Dec 21 14:14:07 2014, + * by pycrc v0.8.2, https://www.tty1.net/pycrc/ + * + * LICENSE ON GENERATED CODE: + * ========================== + * As of version 0.6, pycrc is released under the terms of the MIT licence. + * The code generated by pycrc is not considered a substantial portion of the + * software, therefore the author of pycrc will not claim any copyright on + * the generated code. + * ========================== + * + * CRC configuration: + * Width = 64 + * Poly = 0xad93d23594c935a9 + * XorIn = 0xffffffffffffffff + * ReflectIn = True + * XorOut = 0x0000000000000000 + * ReflectOut = True + * Algorithm = bit-by-bit-fast + * + * Modifications after generation (by matt): + * - included finalize step in-line with update for single-call generation + * - re-worked some inner variable architectures + * - adjusted function parameters to match expected prototypes. + *****************************************************************************/ + +/** + * Reflect all bits of a \a data word of \a data_len bytes. + * + * \param data The data word to be reflected. + * \param data_len The width of \a data expressed in number of bits. + * \return The reflected data. + *****************************************************************************/ +static inline uint_fast64_t crc_reflect(uint_fast64_t data, size_t data_len) { + uint_fast64_t ret = data & 0x01; + + for (size_t i = 1; i < data_len; i++) { + data >>= 1; + ret = (ret << 1) | (data & 0x01); + } + + return ret; +} + +/** + * Update the crc value with new data. + * + * \param crc The current crc value. + * \param data Pointer to a buffer of \a data_len bytes. + * \param data_len Number of bytes in the \a data buffer. + * \return The updated crc value. + ******************************************************************************/ +uint64_t _crc64(uint_fast64_t crc, const void *in_data, const uint64_t len) { + const uint8_t *data = in_data; + unsigned long long bit; + + for (uint64_t offset = 0; offset < len; offset++) { + uint8_t c = data[offset]; + for (uint_fast8_t i = 0x01; i & 0xff; i <<= 1) { + bit = crc & 0x8000000000000000; + if (c & i) { + bit = !bit; + } + + crc <<= 1; + if (bit) { + crc ^= POLY; + } + } + + crc &= 0xffffffffffffffff; + } + + crc = crc & 0xffffffffffffffff; + return crc_reflect(crc, 64) ^ 0x0000000000000000; +} + +/******************** END GENERATED PYCRC FUNCTIONS ********************/ + +/* Initializes the 16KB lookup tables. */ +void crc64_init(void) { + crcspeed64native_init(_crc64, crc64_table); +} + +/* Compute crc64 */ +uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l) { + return crcspeed64native(crc64_table, crc, (void *) s, l); +} + +/* Test main */ +#ifdef REDIS_TEST +#include + +#define UNUSED(x) (void)(x) +int crc64Test(int argc, char *argv[], int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + crc64_init(); + printf("[calcula]: e9c6d914c4b8d9ca == %016" PRIx64 "\n", + (uint64_t)_crc64(0, "123456789", 9)); + printf("[64speed]: e9c6d914c4b8d9ca == %016" PRIx64 "\n", + (uint64_t)crc64(0, (unsigned char*)"123456789", 9)); + char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed " + "do eiusmod tempor incididunt ut labore et dolore magna " + "aliqua. Ut enim ad minim veniam, quis nostrud exercitation " + "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis " + "aute irure dolor in reprehenderit in voluptate velit esse " + "cillum dolore eu fugiat nulla pariatur. Excepteur sint " + "occaecat cupidatat non proident, sunt in culpa qui officia " + "deserunt mollit anim id est laborum."; + printf("[calcula]: c7794709e69683b3 == %016" PRIx64 "\n", + (uint64_t)_crc64(0, li, sizeof(li))); + printf("[64speed]: c7794709e69683b3 == %016" PRIx64 "\n", + (uint64_t)crc64(0, (unsigned char*)li, sizeof(li))); + return 0; +} + +#endif + +#ifdef REDIS_TEST_MAIN +int main(int argc, char *argv[]) { + return crc64Test(argc, argv); +} + +#endif diff --git a/redis/crc64.h b/redis/crc64.h new file mode 100644 index 000000000..e0fccd98b --- /dev/null +++ b/redis/crc64.h @@ -0,0 +1,13 @@ +#ifndef CRC64_H +#define CRC64_H + +#include + +void crc64_init(void); +uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l); + +#ifdef REDIS_TEST +int crc64Test(int argc, char *argv[], int flags); +#endif + +#endif diff --git a/redis/crcspeed.c b/redis/crcspeed.c new file mode 100644 index 000000000..67cb8fd9f --- /dev/null +++ b/redis/crcspeed.c @@ -0,0 +1,282 @@ +/* + * Copyright (C) 2013 Mark Adler + * Originally by: crc64.c Version 1.4 16 Dec 2013 Mark Adler + * Modifications by Matt Stancliff : + * - removed CRC64-specific behavior + * - added generation of lookup tables by parameters + * - removed inversion of CRC input/result + * - removed automatic initialization in favor of explicit initialization + + This software is provided 'as-is', without any express or implied + warranty. In no event will the author be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Mark Adler + madler@alumni.caltech.edu + */ + +#include "crcspeed.h" + +/* Fill in a CRC constants table. */ +void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) { + uint64_t crc; + + /* generate CRCs for all single byte sequences */ + for (int n = 0; n < 256; n++) { + unsigned char v = n; + table[0][n] = crcfn(0, &v, 1); + } + + /* generate nested CRC table for future slice-by-8 lookup */ + for (int n = 0; n < 256; n++) { + crc = table[0][n]; + for (int k = 1; k < 8; k++) { + crc = table[0][crc & 0xff] ^ (crc >> 8); + table[k][n] = crc; + } + } +} + +void crcspeed16little_init(crcfn16 crcfn, uint16_t table[8][256]) { + uint16_t crc; + + /* generate CRCs for all single byte sequences */ + for (int n = 0; n < 256; n++) { + table[0][n] = crcfn(0, &n, 1); + } + + /* generate nested CRC table for future slice-by-8 lookup */ + for (int n = 0; n < 256; n++) { + crc = table[0][n]; + for (int k = 1; k < 8; k++) { + crc = table[0][(crc >> 8) & 0xff] ^ (crc << 8); + table[k][n] = crc; + } + } +} + +/* Reverse the bytes in a 64-bit word. */ +static inline uint64_t rev8(uint64_t a) { +#if defined(__GNUC__) || defined(__clang__) + return __builtin_bswap64(a); +#else + uint64_t m; + + m = UINT64_C(0xff00ff00ff00ff); + a = ((a >> 8) & m) | (a & m) << 8; + m = UINT64_C(0xffff0000ffff); + a = ((a >> 16) & m) | (a & m) << 16; + return a >> 32 | a << 32; +#endif +} + +/* This function is called once to initialize the CRC table for use on a + big-endian architecture. */ +void crcspeed64big_init(crcfn64 fn, uint64_t big_table[8][256]) { + /* Create the little endian table then reverse all the entries. */ + crcspeed64little_init(fn, big_table); + for (int k = 0; k < 8; k++) { + for (int n = 0; n < 256; n++) { + big_table[k][n] = rev8(big_table[k][n]); + } + } +} + +void crcspeed16big_init(crcfn16 fn, uint16_t big_table[8][256]) { + /* Create the little endian table then reverse all the entries. */ + crcspeed16little_init(fn, big_table); + for (int k = 0; k < 8; k++) { + for (int n = 0; n < 256; n++) { + big_table[k][n] = rev8(big_table[k][n]); + } + } +} + +/* Calculate a non-inverted CRC multiple bytes at a time on a little-endian + * architecture. If you need inverted CRC, invert *before* calling and invert + * *after* calling. + * 64 bit crc = process 8 bytes at once; + */ +uint64_t crcspeed64little(uint64_t little_table[8][256], uint64_t crc, + void *buf, size_t len) { + unsigned char *next = buf; + + /* process individual bytes until we reach an 8-byte aligned pointer */ + while (len && ((uintptr_t)next & 7) != 0) { + crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + + /* fast middle processing, 8 bytes (aligned!) per loop */ + while (len >= 8) { + crc ^= *(uint64_t *)next; + crc = little_table[7][crc & 0xff] ^ + little_table[6][(crc >> 8) & 0xff] ^ + little_table[5][(crc >> 16) & 0xff] ^ + little_table[4][(crc >> 24) & 0xff] ^ + little_table[3][(crc >> 32) & 0xff] ^ + little_table[2][(crc >> 40) & 0xff] ^ + little_table[1][(crc >> 48) & 0xff] ^ + little_table[0][crc >> 56]; + next += 8; + len -= 8; + } + + /* process remaining bytes (can't be larger than 8) */ + while (len) { + crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + + return crc; +} + +uint16_t crcspeed16little(uint16_t little_table[8][256], uint16_t crc, + void *buf, size_t len) { + unsigned char *next = buf; + + /* process individual bytes until we reach an 8-byte aligned pointer */ + while (len && ((uintptr_t)next & 7) != 0) { + crc = little_table[0][((crc >> 8) ^ *next++) & 0xff] ^ (crc << 8); + len--; + } + + /* fast middle processing, 8 bytes (aligned!) per loop */ + while (len >= 8) { + uint64_t n = *(uint64_t *)next; + crc = little_table[7][(n & 0xff) ^ ((crc >> 8) & 0xff)] ^ + little_table[6][((n >> 8) & 0xff) ^ (crc & 0xff)] ^ + little_table[5][(n >> 16) & 0xff] ^ + little_table[4][(n >> 24) & 0xff] ^ + little_table[3][(n >> 32) & 0xff] ^ + little_table[2][(n >> 40) & 0xff] ^ + little_table[1][(n >> 48) & 0xff] ^ + little_table[0][n >> 56]; + next += 8; + len -= 8; + } + + /* process remaining bytes (can't be larger than 8) */ + while (len) { + crc = little_table[0][((crc >> 8) ^ *next++) & 0xff] ^ (crc << 8); + len--; + } + + return crc; +} + +/* Calculate a non-inverted CRC eight bytes at a time on a big-endian + * architecture. + */ +uint64_t crcspeed64big(uint64_t big_table[8][256], uint64_t crc, void *buf, + size_t len) { + unsigned char *next = buf; + + crc = rev8(crc); + while (len && ((uintptr_t)next & 7) != 0) { + crc = big_table[0][(crc >> 56) ^ *next++] ^ (crc << 8); + len--; + } + + while (len >= 8) { + crc ^= *(uint64_t *)next; + crc = big_table[0][crc & 0xff] ^ + big_table[1][(crc >> 8) & 0xff] ^ + big_table[2][(crc >> 16) & 0xff] ^ + big_table[3][(crc >> 24) & 0xff] ^ + big_table[4][(crc >> 32) & 0xff] ^ + big_table[5][(crc >> 40) & 0xff] ^ + big_table[6][(crc >> 48) & 0xff] ^ + big_table[7][crc >> 56]; + next += 8; + len -= 8; + } + + while (len) { + crc = big_table[0][(crc >> 56) ^ *next++] ^ (crc << 8); + len--; + } + + return rev8(crc); +} + +/* WARNING: Completely untested on big endian architecture. Possibly broken. */ +uint16_t crcspeed16big(uint16_t big_table[8][256], uint16_t crc_in, void *buf, + size_t len) { + unsigned char *next = buf; + uint64_t crc = crc_in; + + crc = rev8(crc); + while (len && ((uintptr_t)next & 7) != 0) { + crc = big_table[0][((crc >> (56 - 8)) ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + + while (len >= 8) { + uint64_t n = *(uint64_t *)next; + crc = big_table[0][(n & 0xff) ^ ((crc >> (56 - 8)) & 0xff)] ^ + big_table[1][((n >> 8) & 0xff) ^ (crc & 0xff)] ^ + big_table[2][(n >> 16) & 0xff] ^ + big_table[3][(n >> 24) & 0xff] ^ + big_table[4][(n >> 32) & 0xff] ^ + big_table[5][(n >> 40) & 0xff] ^ + big_table[6][(n >> 48) & 0xff] ^ + big_table[7][n >> 56]; + next += 8; + len -= 8; + } + + while (len) { + crc = big_table[0][((crc >> (56 - 8)) ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + + return rev8(crc); +} + +/* Return the CRC of buf[0..len-1] with initial crc, processing eight bytes + at a time using passed-in lookup table. + This selects one of two routines depending on the endianess of + the architecture. */ +uint64_t crcspeed64native(uint64_t table[8][256], uint64_t crc, void *buf, + size_t len) { + uint64_t n = 1; + + return *(char *)&n ? crcspeed64little(table, crc, buf, len) + : crcspeed64big(table, crc, buf, len); +} + +uint16_t crcspeed16native(uint16_t table[8][256], uint16_t crc, void *buf, + size_t len) { + uint64_t n = 1; + + return *(char *)&n ? crcspeed16little(table, crc, buf, len) + : crcspeed16big(table, crc, buf, len); +} + +/* Initialize CRC lookup table in architecture-dependent manner. */ +void crcspeed64native_init(crcfn64 fn, uint64_t table[8][256]) { + uint64_t n = 1; + + *(char *)&n ? crcspeed64little_init(fn, table) + : crcspeed64big_init(fn, table); +} + +void crcspeed16native_init(crcfn16 fn, uint16_t table[8][256]) { + uint64_t n = 1; + + *(char *)&n ? crcspeed16little_init(fn, table) + : crcspeed16big_init(fn, table); +} diff --git a/redis/crcspeed.h b/redis/crcspeed.h new file mode 100644 index 000000000..d7ee95ebb --- /dev/null +++ b/redis/crcspeed.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2014, Matt Stancliff + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. */ + +#ifndef CRCSPEED_H +#define CRCSPEED_H + +#include +#include + +typedef uint64_t (*crcfn64)(uint64_t, const void *, const uint64_t); +typedef uint16_t (*crcfn16)(uint16_t, const void *, const uint64_t); + +/* CRC-64 */ +void crcspeed64little_init(crcfn64 fn, uint64_t table[8][256]); +void crcspeed64big_init(crcfn64 fn, uint64_t table[8][256]); +void crcspeed64native_init(crcfn64 fn, uint64_t table[8][256]); + +uint64_t crcspeed64little(uint64_t table[8][256], uint64_t crc, void *buf, + size_t len); +uint64_t crcspeed64big(uint64_t table[8][256], uint64_t crc, void *buf, + size_t len); +uint64_t crcspeed64native(uint64_t table[8][256], uint64_t crc, void *buf, + size_t len); + +/* CRC-16 */ +void crcspeed16little_init(crcfn16 fn, uint16_t table[8][256]); +void crcspeed16big_init(crcfn16 fn, uint16_t table[8][256]); +void crcspeed16native_init(crcfn16 fn, uint16_t table[8][256]); + +uint16_t crcspeed16little(uint16_t table[8][256], uint16_t crc, void *buf, + size_t len); +uint16_t crcspeed16big(uint16_t table[8][256], uint16_t crc, void *buf, + size_t len); +uint16_t crcspeed16native(uint16_t table[8][256], uint16_t crc, void *buf, + size_t len); +#endif diff --git a/redis/debug.c b/redis/debug.c new file mode 100644 index 000000000..444ab8668 --- /dev/null +++ b/redis/debug.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2009-2020, Salvatore Sanfilippo + * Copyright (c) 2020, Redis Labs, Inc + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#include +#include +#include +#include + +#include "util.h" + +int verbosity = LL_NOTICE; + +void serverLog(int level, const char *fmt, ...) { + va_list ap; + char msg[LOG_MAX_LEN]; + + if ((level&0xff) < verbosity) return; + + va_start(ap, fmt); + vsnprintf(msg, sizeof(msg), fmt, ap); + va_end(ap); + + fprintf(stdout, "%s\n",msg); +} + +void _serverPanic(const char *file, int line, const char *msg, ...) { + va_list ap; + va_start(ap,msg); + char fmtmsg[256]; + vsnprintf(fmtmsg,sizeof(fmtmsg),msg,ap); + va_end(ap); + + serverLog(LL_WARNING, "------------------------------------------------"); + serverLog(LL_WARNING, "!!! Software Failure. Press left mouse button to continue"); + serverLog(LL_WARNING, "Guru Meditation: %s #%s:%d", fmtmsg,file,line); +#ifndef NDEBUG + __assert_fail(msg, file, line, ""); +#endif +} + +void _serverAssert(const char *estr, const char *file, int line) { + serverLog(LL_WARNING,"=== ASSERTION FAILED ==="); + serverLog(LL_WARNING,"==> %s:%d '%s' is not true",file,line,estr); +} + + + +/* Low level logging. To use only for very big messages, otherwise + * serverLog() is to prefer. */ +void serverLogRaw(int level, const char *msg) { + FILE *fp; + int log_to_stdout = 1; + + level &= 0xff; /* clear flags */ + if (level < verbosity) return; + + fp = stdout; + + fprintf(fp,"%s",msg); + fflush(fp); + + if (!log_to_stdout) fclose(fp); +} + +void serverLogHexDump(int level, char *descr, void *value, size_t len) { + char buf[65], *b; + unsigned char *v = value; + char charset[] = "0123456789abcdef"; + + serverLog(level,"%s (hexdump of %zu bytes):", descr, len); + b = buf; + while(len) { + b[0] = charset[(*v)>>4]; + b[1] = charset[(*v)&0xf]; + b[2] = '\0'; + b += 2; + len--; + v++; + if (b-buf == 64 || len == 0) { + serverLogRaw(level|LL_RAW,buf); + b = buf; + } + } + serverLogRaw(level|LL_RAW,"\n"); +} diff --git a/redis/dict.c b/redis/dict.c index 2e14da5f3..fb5b81fa6 100644 --- a/redis/dict.c +++ b/redis/dict.c @@ -33,8 +33,6 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include "fmacros.h" - #include #include #include diff --git a/redis/object.h b/redis/object.h index b64a82cc7..9536eb12c 100644 --- a/redis/object.h +++ b/redis/object.h @@ -6,8 +6,6 @@ #include "dict.h" #include "sds.h" #include "quicklist.h" -#include "util.h" - /* The actual Redis Object */ #define OBJ_STRING 0U /* String object. */ diff --git a/redis/redis_aux.c b/redis/redis_aux.c index 252bea9f2..66e4cb927 100644 --- a/redis/redis_aux.c +++ b/redis/redis_aux.c @@ -1,6 +1,7 @@ #include "redis_aux.h" #include +#include #include "crc64.h" #include "object.h" @@ -78,20 +79,6 @@ size_t sdsZmallocSize(sds s) { return zmalloc_size(sh); } -/* Return 1 if currently we allow dict to expand. Dict may allocate huge - * memory to contain hash buckets when dict expands, that may lead redis - * rejects user's requests or evicts some keys, we can stop dict to expand - * provisionally if used memory will be over maxmemory after dict expands, - * but to guarantee the performance of redis, we still allow dict to expand - * if dict load factor exceeds HASHTABLE_MAX_LOAD_FACTOR. */ -static int dictExpandAllowed(size_t moreMem, double usedRatio) { - if (usedRatio <= HASHTABLE_MAX_LOAD_FACTOR) { - return !overMaxmemoryAfterAlloc(moreMem); - } else { - return 1; - } -} - /* Set dictionary type. Keys are SDS strings, values are not used. */ dictType setDictType = { dictSdsHash, /* hash function */ @@ -113,38 +100,3 @@ dictType zsetDictType = { NULL, /* val destructor */ NULL /* allow to expand */ }; - -static void dictObjectDestructor(dict* privdata, void* val) { - DICT_NOTUSED(privdata); - - if (val == NULL) - return; /* Lazy freeing will set value to NULL. */ - decrRefCount(val); -} - -/* Db->dict, keys are sds strings, vals are Redis objects. */ -dictType dbDictType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictSdsKeyCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - dictObjectDestructor, /* val destructor */ - dictExpandAllowed /* allow to expand */ -}; - -/* Db->expires */ -/* Db->expires stores keys that are contained in redis_dict_. - Which means that uniqueness of the strings is guaranteed through uniqueness of the pointers - Therefore, it's enough to compare and hash keys by their addresses without reading the contents - of the key -*/ -dictType keyPtrDictType = { - dictPtrHash, /* hash function */ - NULL, /* key dup */ - NULL, /* val dup */ - dictPtrKeyCompare, /* key compare */ - NULL, /* key destructor */ - NULL, /* val destructor */ - NULL /* allow to expand */ -}; diff --git a/redis/redis_aux.h b/redis/redis_aux.h index bb691f978..761485b41 100644 --- a/redis/redis_aux.h +++ b/redis/redis_aux.h @@ -35,10 +35,8 @@ int htNeedsResize(dict *dict); // moved from server.cc /* Hash table types */ -extern dictType dbDictType; extern dictType zsetDictType; extern dictType setDictType; -extern dictType keyPtrDictType; extern dictType hashDictType; /* To improve the quality of the LRU approximation we take a set of keys @@ -63,10 +61,6 @@ struct evictionPoolEntry { uint64_t dictSdsHash(const void *key); int dictSdsKeyCompare(dict *privdata, const void *key1, const void *key2); void dictSdsDestructor(dict *privdata, void *val); -int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *level); - -void evictionPoolPopulate(int dbid, dict *sampledict, dict *keydict, struct evictionPoolEntry *pool); -int overMaxmemoryAfterAlloc(size_t moremem); size_t sdsZmallocSize(sds s) ; typedef struct ServerStub { diff --git a/redis/siphash.c b/redis/siphash.c new file mode 100644 index 000000000..2713d89ee --- /dev/null +++ b/redis/siphash.c @@ -0,0 +1,373 @@ +/* + SipHash reference C implementation + + Copyright (c) 2012-2016 Jean-Philippe Aumasson + + Copyright (c) 2012-2014 Daniel J. Bernstein + Copyright (c) 2017 Salvatore Sanfilippo + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along + with this software. If not, see + . + + ---------------------------------------------------------------------------- + + This version was modified by Salvatore Sanfilippo + in the following ways: + + 1. We use SipHash 1-2. This is not believed to be as strong as the + suggested 2-4 variant, but AFAIK there are not trivial attacks + against this reduced-rounds version, and it runs at the same speed + as Murmurhash2 that we used previously, while the 2-4 variant slowed + down Redis by a 4% figure more or less. + 2. Hard-code rounds in the hope the compiler can optimize it more + in this raw from. Anyway we always want the standard 2-4 variant. + 3. Modify the prototype and implementation so that the function directly + returns an uint64_t value, the hash itself, instead of receiving an + output buffer. This also means that the output size is set to 8 bytes + and the 16 bytes output code handling was removed. + 4. Provide a case insensitive variant to be used when hashing strings that + must be considered identical by the hash table regardless of the case. + If we don't have directly a case insensitive hash function, we need to + perform a text transformation in some temporary buffer, which is costly. + 5. Remove debugging code. + 6. Modified the original test.c file to be a stand-alone function testing + the function in the new form (returning an uint64_t) using just the + relevant test vector. + */ +#include +#include +#include +#include +#include + +/* Fast tolower() alike function that does not care about locale + * but just returns a-z instead of A-Z. */ +int siptlw(int c) { + if (c >= 'A' && c <= 'Z') { + return c+('a'-'A'); + } else { + return c; + } +} + +#if defined(__has_attribute) +#if __has_attribute(no_sanitize) +#define NO_SANITIZE(sanitizer) __attribute__((no_sanitize(sanitizer))) +#endif +#endif + +#if !defined(NO_SANITIZE) +#define NO_SANITIZE(sanitizer) +#endif + +/* Test of the CPU is Little Endian and supports not aligned accesses. + * Two interesting conditions to speedup the function that happen to be + * in most of x86 servers. */ +#if defined(__X86_64__) || defined(__x86_64__) || defined (__i386__) \ + || defined (__aarch64__) || defined (__arm64__) +#define UNALIGNED_LE_CPU +#endif + +#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b)))) + +#define U32TO8_LE(p, v) \ + (p)[0] = (uint8_t)((v)); \ + (p)[1] = (uint8_t)((v) >> 8); \ + (p)[2] = (uint8_t)((v) >> 16); \ + (p)[3] = (uint8_t)((v) >> 24); + +#define U64TO8_LE(p, v) \ + U32TO8_LE((p), (uint32_t)((v))); \ + U32TO8_LE((p) + 4, (uint32_t)((v) >> 32)); + +#ifdef UNALIGNED_LE_CPU +#define U8TO64_LE(p) (*((uint64_t*)(p))) +#else +#define U8TO64_LE(p) \ + (((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) | \ + ((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) | \ + ((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) | \ + ((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56)) +#endif + +#define U8TO64_LE_NOCASE(p) \ + (((uint64_t)(siptlw((p)[0]))) | \ + ((uint64_t)(siptlw((p)[1])) << 8) | \ + ((uint64_t)(siptlw((p)[2])) << 16) | \ + ((uint64_t)(siptlw((p)[3])) << 24) | \ + ((uint64_t)(siptlw((p)[4])) << 32) | \ + ((uint64_t)(siptlw((p)[5])) << 40) | \ + ((uint64_t)(siptlw((p)[6])) << 48) | \ + ((uint64_t)(siptlw((p)[7])) << 56)) + +#define SIPROUND \ + do { \ + v0 += v1; \ + v1 = ROTL(v1, 13); \ + v1 ^= v0; \ + v0 = ROTL(v0, 32); \ + v2 += v3; \ + v3 = ROTL(v3, 16); \ + v3 ^= v2; \ + v0 += v3; \ + v3 = ROTL(v3, 21); \ + v3 ^= v0; \ + v2 += v1; \ + v1 = ROTL(v1, 17); \ + v1 ^= v2; \ + v2 = ROTL(v2, 32); \ + } while (0) + +NO_SANITIZE("alignment") +uint64_t siphash(const uint8_t *in, const size_t inlen, const uint8_t *k) { +#ifndef UNALIGNED_LE_CPU + uint64_t hash; + uint8_t *out = (uint8_t*) &hash; +#endif + uint64_t v0 = 0x736f6d6570736575ULL; + uint64_t v1 = 0x646f72616e646f6dULL; + uint64_t v2 = 0x6c7967656e657261ULL; + uint64_t v3 = 0x7465646279746573ULL; + uint64_t k0 = U8TO64_LE(k); + uint64_t k1 = U8TO64_LE(k + 8); + uint64_t m; + const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t)); + const int left = inlen & 7; + uint64_t b = ((uint64_t)inlen) << 56; + v3 ^= k1; + v2 ^= k0; + v1 ^= k1; + v0 ^= k0; + + for (; in != end; in += 8) { + m = U8TO64_LE(in); + v3 ^= m; + + SIPROUND; + + v0 ^= m; + } + + switch (left) { + case 7: b |= ((uint64_t)in[6]) << 48; /* fall-thru */ + case 6: b |= ((uint64_t)in[5]) << 40; /* fall-thru */ + case 5: b |= ((uint64_t)in[4]) << 32; /* fall-thru */ + case 4: b |= ((uint64_t)in[3]) << 24; /* fall-thru */ + case 3: b |= ((uint64_t)in[2]) << 16; /* fall-thru */ + case 2: b |= ((uint64_t)in[1]) << 8; /* fall-thru */ + case 1: b |= ((uint64_t)in[0]); break; + case 0: break; + } + + v3 ^= b; + + SIPROUND; + + v0 ^= b; + v2 ^= 0xff; + + SIPROUND; + SIPROUND; + + b = v0 ^ v1 ^ v2 ^ v3; +#ifndef UNALIGNED_LE_CPU + U64TO8_LE(out, b); + return hash; +#else + return b; +#endif +} + +NO_SANITIZE("alignment") +uint64_t siphash_nocase(const uint8_t *in, const size_t inlen, const uint8_t *k) +{ +#ifndef UNALIGNED_LE_CPU + uint64_t hash; + uint8_t *out = (uint8_t*) &hash; +#endif + uint64_t v0 = 0x736f6d6570736575ULL; + uint64_t v1 = 0x646f72616e646f6dULL; + uint64_t v2 = 0x6c7967656e657261ULL; + uint64_t v3 = 0x7465646279746573ULL; + uint64_t k0 = U8TO64_LE(k); + uint64_t k1 = U8TO64_LE(k + 8); + uint64_t m; + const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t)); + const int left = inlen & 7; + uint64_t b = ((uint64_t)inlen) << 56; + v3 ^= k1; + v2 ^= k0; + v1 ^= k1; + v0 ^= k0; + + for (; in != end; in += 8) { + m = U8TO64_LE_NOCASE(in); + v3 ^= m; + + SIPROUND; + + v0 ^= m; + } + + switch (left) { + case 7: b |= ((uint64_t)siptlw(in[6])) << 48; /* fall-thru */ + case 6: b |= ((uint64_t)siptlw(in[5])) << 40; /* fall-thru */ + case 5: b |= ((uint64_t)siptlw(in[4])) << 32; /* fall-thru */ + case 4: b |= ((uint64_t)siptlw(in[3])) << 24; /* fall-thru */ + case 3: b |= ((uint64_t)siptlw(in[2])) << 16; /* fall-thru */ + case 2: b |= ((uint64_t)siptlw(in[1])) << 8; /* fall-thru */ + case 1: b |= ((uint64_t)siptlw(in[0])); break; + case 0: break; + } + + v3 ^= b; + + SIPROUND; + + v0 ^= b; + v2 ^= 0xff; + + SIPROUND; + SIPROUND; + + b = v0 ^ v1 ^ v2 ^ v3; +#ifndef UNALIGNED_LE_CPU + U64TO8_LE(out, b); + return hash; +#else + return b; +#endif +} + + +/* --------------------------------- TEST ------------------------------------ */ + +#ifdef SIPHASH_TEST + +const uint8_t vectors_sip64[64][8] = { + { 0x31, 0x0e, 0x0e, 0xdd, 0x47, 0xdb, 0x6f, 0x72, }, + { 0xfd, 0x67, 0xdc, 0x93, 0xc5, 0x39, 0xf8, 0x74, }, + { 0x5a, 0x4f, 0xa9, 0xd9, 0x09, 0x80, 0x6c, 0x0d, }, + { 0x2d, 0x7e, 0xfb, 0xd7, 0x96, 0x66, 0x67, 0x85, }, + { 0xb7, 0x87, 0x71, 0x27, 0xe0, 0x94, 0x27, 0xcf, }, + { 0x8d, 0xa6, 0x99, 0xcd, 0x64, 0x55, 0x76, 0x18, }, + { 0xce, 0xe3, 0xfe, 0x58, 0x6e, 0x46, 0xc9, 0xcb, }, + { 0x37, 0xd1, 0x01, 0x8b, 0xf5, 0x00, 0x02, 0xab, }, + { 0x62, 0x24, 0x93, 0x9a, 0x79, 0xf5, 0xf5, 0x93, }, + { 0xb0, 0xe4, 0xa9, 0x0b, 0xdf, 0x82, 0x00, 0x9e, }, + { 0xf3, 0xb9, 0xdd, 0x94, 0xc5, 0xbb, 0x5d, 0x7a, }, + { 0xa7, 0xad, 0x6b, 0x22, 0x46, 0x2f, 0xb3, 0xf4, }, + { 0xfb, 0xe5, 0x0e, 0x86, 0xbc, 0x8f, 0x1e, 0x75, }, + { 0x90, 0x3d, 0x84, 0xc0, 0x27, 0x56, 0xea, 0x14, }, + { 0xee, 0xf2, 0x7a, 0x8e, 0x90, 0xca, 0x23, 0xf7, }, + { 0xe5, 0x45, 0xbe, 0x49, 0x61, 0xca, 0x29, 0xa1, }, + { 0xdb, 0x9b, 0xc2, 0x57, 0x7f, 0xcc, 0x2a, 0x3f, }, + { 0x94, 0x47, 0xbe, 0x2c, 0xf5, 0xe9, 0x9a, 0x69, }, + { 0x9c, 0xd3, 0x8d, 0x96, 0xf0, 0xb3, 0xc1, 0x4b, }, + { 0xbd, 0x61, 0x79, 0xa7, 0x1d, 0xc9, 0x6d, 0xbb, }, + { 0x98, 0xee, 0xa2, 0x1a, 0xf2, 0x5c, 0xd6, 0xbe, }, + { 0xc7, 0x67, 0x3b, 0x2e, 0xb0, 0xcb, 0xf2, 0xd0, }, + { 0x88, 0x3e, 0xa3, 0xe3, 0x95, 0x67, 0x53, 0x93, }, + { 0xc8, 0xce, 0x5c, 0xcd, 0x8c, 0x03, 0x0c, 0xa8, }, + { 0x94, 0xaf, 0x49, 0xf6, 0xc6, 0x50, 0xad, 0xb8, }, + { 0xea, 0xb8, 0x85, 0x8a, 0xde, 0x92, 0xe1, 0xbc, }, + { 0xf3, 0x15, 0xbb, 0x5b, 0xb8, 0x35, 0xd8, 0x17, }, + { 0xad, 0xcf, 0x6b, 0x07, 0x63, 0x61, 0x2e, 0x2f, }, + { 0xa5, 0xc9, 0x1d, 0xa7, 0xac, 0xaa, 0x4d, 0xde, }, + { 0x71, 0x65, 0x95, 0x87, 0x66, 0x50, 0xa2, 0xa6, }, + { 0x28, 0xef, 0x49, 0x5c, 0x53, 0xa3, 0x87, 0xad, }, + { 0x42, 0xc3, 0x41, 0xd8, 0xfa, 0x92, 0xd8, 0x32, }, + { 0xce, 0x7c, 0xf2, 0x72, 0x2f, 0x51, 0x27, 0x71, }, + { 0xe3, 0x78, 0x59, 0xf9, 0x46, 0x23, 0xf3, 0xa7, }, + { 0x38, 0x12, 0x05, 0xbb, 0x1a, 0xb0, 0xe0, 0x12, }, + { 0xae, 0x97, 0xa1, 0x0f, 0xd4, 0x34, 0xe0, 0x15, }, + { 0xb4, 0xa3, 0x15, 0x08, 0xbe, 0xff, 0x4d, 0x31, }, + { 0x81, 0x39, 0x62, 0x29, 0xf0, 0x90, 0x79, 0x02, }, + { 0x4d, 0x0c, 0xf4, 0x9e, 0xe5, 0xd4, 0xdc, 0xca, }, + { 0x5c, 0x73, 0x33, 0x6a, 0x76, 0xd8, 0xbf, 0x9a, }, + { 0xd0, 0xa7, 0x04, 0x53, 0x6b, 0xa9, 0x3e, 0x0e, }, + { 0x92, 0x59, 0x58, 0xfc, 0xd6, 0x42, 0x0c, 0xad, }, + { 0xa9, 0x15, 0xc2, 0x9b, 0xc8, 0x06, 0x73, 0x18, }, + { 0x95, 0x2b, 0x79, 0xf3, 0xbc, 0x0a, 0xa6, 0xd4, }, + { 0xf2, 0x1d, 0xf2, 0xe4, 0x1d, 0x45, 0x35, 0xf9, }, + { 0x87, 0x57, 0x75, 0x19, 0x04, 0x8f, 0x53, 0xa9, }, + { 0x10, 0xa5, 0x6c, 0xf5, 0xdf, 0xcd, 0x9a, 0xdb, }, + { 0xeb, 0x75, 0x09, 0x5c, 0xcd, 0x98, 0x6c, 0xd0, }, + { 0x51, 0xa9, 0xcb, 0x9e, 0xcb, 0xa3, 0x12, 0xe6, }, + { 0x96, 0xaf, 0xad, 0xfc, 0x2c, 0xe6, 0x66, 0xc7, }, + { 0x72, 0xfe, 0x52, 0x97, 0x5a, 0x43, 0x64, 0xee, }, + { 0x5a, 0x16, 0x45, 0xb2, 0x76, 0xd5, 0x92, 0xa1, }, + { 0xb2, 0x74, 0xcb, 0x8e, 0xbf, 0x87, 0x87, 0x0a, }, + { 0x6f, 0x9b, 0xb4, 0x20, 0x3d, 0xe7, 0xb3, 0x81, }, + { 0xea, 0xec, 0xb2, 0xa3, 0x0b, 0x22, 0xa8, 0x7f, }, + { 0x99, 0x24, 0xa4, 0x3c, 0xc1, 0x31, 0x57, 0x24, }, + { 0xbd, 0x83, 0x8d, 0x3a, 0xaf, 0xbf, 0x8d, 0xb7, }, + { 0x0b, 0x1a, 0x2a, 0x32, 0x65, 0xd5, 0x1a, 0xea, }, + { 0x13, 0x50, 0x79, 0xa3, 0x23, 0x1c, 0xe6, 0x60, }, + { 0x93, 0x2b, 0x28, 0x46, 0xe4, 0xd7, 0x06, 0x66, }, + { 0xe1, 0x91, 0x5f, 0x5c, 0xb1, 0xec, 0xa4, 0x6c, }, + { 0xf3, 0x25, 0x96, 0x5c, 0xa1, 0x6d, 0x62, 0x9f, }, + { 0x57, 0x5f, 0xf2, 0x8e, 0x60, 0x38, 0x1b, 0xe5, }, + { 0x72, 0x45, 0x06, 0xeb, 0x4c, 0x32, 0x8a, 0x95, }, +}; + + +/* Test siphash using a test vector. Returns 0 if the function passed + * all the tests, otherwise 1 is returned. + * + * IMPORTANT: The test vector is for SipHash 2-4. Before running + * the test revert back the siphash() function to 2-4 rounds since + * now it uses 1-2 rounds. */ +int siphash_test(void) { + uint8_t in[64], k[16]; + int i; + int fails = 0; + + for (i = 0; i < 16; ++i) + k[i] = i; + + for (i = 0; i < 64; ++i) { + in[i] = i; + uint64_t hash = siphash(in, i, k); + const uint8_t *v = NULL; + v = (uint8_t *)vectors_sip64; + if (memcmp(&hash, v + (i * 8), 8)) { + /* printf("fail for %d bytes\n", i); */ + fails++; + } + } + + /* Run a few basic tests with the case insensitive version. */ + uint64_t h1, h2; + h1 = siphash((uint8_t*)"hello world",11,(uint8_t*)"1234567812345678"); + h2 = siphash_nocase((uint8_t*)"hello world",11,(uint8_t*)"1234567812345678"); + if (h1 != h2) fails++; + + h1 = siphash((uint8_t*)"hello world",11,(uint8_t*)"1234567812345678"); + h2 = siphash_nocase((uint8_t*)"HELLO world",11,(uint8_t*)"1234567812345678"); + if (h1 != h2) fails++; + + h1 = siphash((uint8_t*)"HELLO world",11,(uint8_t*)"1234567812345678"); + h2 = siphash_nocase((uint8_t*)"HELLO world",11,(uint8_t*)"1234567812345678"); + if (h1 == h2) fails++; + + if (!fails) return 0; + return 1; +} + +int main(void) { + if (siphash_test() == 0) { + printf("SipHash test: OK\n"); + return 0; + } else { + printf("SipHash test: FAILED\n"); + return 1; + } +} + +#endif diff --git a/redis/t_zset.c b/redis/t_zset.c new file mode 100644 index 000000000..70df3b718 --- /dev/null +++ b/redis/t_zset.c @@ -0,0 +1,4358 @@ +/* + * Copyright (c) 2009-2012, Salvatore Sanfilippo + * Copyright (c) 2009-2012, Pieter Noordhuis + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*----------------------------------------------------------------------------- + * Sorted set API + *----------------------------------------------------------------------------*/ + +/* ZSETs are ordered sets using two data structures to hold the same elements + * in order to get O(log(N)) INSERT and REMOVE operations into a sorted + * data structure. + * + * The elements are added to a hash table mapping Redis objects to scores. + * At the same time the elements are added to a skip list mapping scores + * to Redis objects (so objects are sorted by scores in this "view"). + * + * Note that the SDS string representing the element is the same in both + * the hash table and skiplist in order to save memory. What we do in order + * to manage the shared SDS string more easily is to free the SDS string + * only in zslFreeNode(). The dictionary has no value free method set. + * So we should always remove an element from the dictionary, and later from + * the skiplist. + * + * This skiplist implementation is almost a C translation of the original + * algorithm described by William Pugh in "Skip Lists: A Probabilistic + * Alternative to Balanced Trees", modified in three ways: + * a) this implementation allows for repeated scores. + * b) the comparison is not just by key (our 'score') but by satellite data. + * c) there is a back pointer, so it's a doubly linked list with the back + * pointers being only at "level 1". This allows to traverse the list + * from tail to head, useful for ZREVRANGE. */ + +#include +#include +#include + +#include "listpack.h" +#include "redis_aux.h" +#include "sds.h" +#include "object.h" +#include "zmalloc.h" +#include "zset.h" +#include "util.h" + +/*----------------------------------------------------------------------------- + * Skiplist implementation of the low level API + *----------------------------------------------------------------------------*/ + +int zslLexValueGteMin(sds value, const zlexrangespec *spec); +int zslLexValueLteMax(sds value, const zlexrangespec *spec); + +/* Create a skiplist node with the specified number of levels. + * The SDS string 'ele' is referenced by the node after the call. */ +zskiplistNode *zslCreateNode(int level, double score, sds ele) { + zskiplistNode *zn = + zmalloc(sizeof(*zn)+level*sizeof(struct zskiplistLevel)); + zn->score = score; + zn->ele = ele; + return zn; +} + +/* Create a new skiplist. */ +zskiplist *zslCreate(void) { + int j; + zskiplist *zsl; + + zsl = zmalloc(sizeof(*zsl)); + zsl->level = 1; + zsl->length = 0; + zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL); + for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) { + zsl->header->level[j].forward = NULL; + zsl->header->level[j].span = 0; + } + zsl->header->backward = NULL; + zsl->tail = NULL; + return zsl; +} + +/* Free the specified skiplist node. The referenced SDS string representation + * of the element is freed too, unless node->ele is set to NULL before calling + * this function. */ +void zslFreeNode(zskiplistNode *node) { + sdsfree(node->ele); + zfree(node); +} + +/* Free a whole skiplist. */ +void zslFree(zskiplist *zsl) { + zskiplistNode *node = zsl->header->level[0].forward, *next; + + zfree(zsl->header); + while(node) { + next = node->level[0].forward; + zslFreeNode(node); + node = next; + } + zfree(zsl); +} + +/* Returns a random level for the new skiplist node we are going to create. + * The return value of this function is between 1 and ZSKIPLIST_MAXLEVEL + * (both inclusive), with a powerlaw-alike distribution where higher + * levels are less likely to be returned. */ +int zslRandomLevel(void) { + int level = 1; + while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF)) + level += 1; + return (levelheader; + for (i = zsl->level-1; i >= 0; i--) { + /* store rank that is crossed to reach the insert position */ + rank[i] = i == (zsl->level-1) ? 0 : rank[i+1]; + while (x->level[i].forward && + (x->level[i].forward->score < score || + (x->level[i].forward->score == score && + sdscmp(x->level[i].forward->ele,ele) < 0))) + { + rank[i] += x->level[i].span; + x = x->level[i].forward; + } + update[i] = x; + } + /* we assume the element is not already inside, since we allow duplicated + * scores, reinserting the same element should never happen since the + * caller of zslInsert() should test in the hash table if the element is + * already inside or not. */ + level = zslRandomLevel(); + if (level > zsl->level) { + for (i = zsl->level; i < level; i++) { + rank[i] = 0; + update[i] = zsl->header; + update[i]->level[i].span = zsl->length; + } + zsl->level = level; + } + x = zslCreateNode(level,score,ele); + for (i = 0; i < level; i++) { + x->level[i].forward = update[i]->level[i].forward; + update[i]->level[i].forward = x; + + /* update span covered by update[i] as x is inserted here */ + x->level[i].span = update[i]->level[i].span - (rank[0] - rank[i]); + update[i]->level[i].span = (rank[0] - rank[i]) + 1; + } + + /* increment span for untouched levels */ + for (i = level; i < zsl->level; i++) { + update[i]->level[i].span++; + } + + x->backward = (update[0] == zsl->header) ? NULL : update[0]; + if (x->level[0].forward) + x->level[0].forward->backward = x; + else + zsl->tail = x; + zsl->length++; + return x; +} + +/* Internal function used by zslDelete, zslDeleteRangeByScore and + * zslDeleteRangeByRank. */ +void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) { + int i; + for (i = 0; i < zsl->level; i++) { + if (update[i]->level[i].forward == x) { + update[i]->level[i].span += x->level[i].span - 1; + update[i]->level[i].forward = x->level[i].forward; + } else { + update[i]->level[i].span -= 1; + } + } + if (x->level[0].forward) { + x->level[0].forward->backward = x->backward; + } else { + zsl->tail = x->backward; + } + while(zsl->level > 1 && zsl->header->level[zsl->level-1].forward == NULL) + zsl->level--; + zsl->length--; +} + +/* Delete an element with matching score/element from the skiplist. + * The function returns 1 if the node was found and deleted, otherwise + * 0 is returned. + * + * If 'node' is NULL the deleted node is freed by zslFreeNode(), otherwise + * it is not freed (but just unlinked) and *node is set to the node pointer, + * so that it is possible for the caller to reuse the node (including the + * referenced SDS string at node->ele). */ +int zslDelete(zskiplist *zsl, double score, sds ele, zskiplistNode **node) { + zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->level[i].forward && + (x->level[i].forward->score < score || + (x->level[i].forward->score == score && + sdscmp(x->level[i].forward->ele,ele) < 0))) + { + x = x->level[i].forward; + } + update[i] = x; + } + /* We may have multiple elements with the same score, what we need + * is to find the element with both the right score and object. */ + x = x->level[0].forward; + if (x && score == x->score && sdscmp(x->ele,ele) == 0) { + zslDeleteNode(zsl, x, update); + if (!node) + zslFreeNode(x); + else + *node = x; + return 1; + } + return 0; /* not found */ +} + +/* Update the score of an element inside the sorted set skiplist. + * Note that the element must exist and must match 'score'. + * This function does not update the score in the hash table side, the + * caller should take care of it. + * + * Note that this function attempts to just update the node, in case after + * the score update, the node would be exactly at the same position. + * Otherwise the skiplist is modified by removing and re-adding a new + * element, which is more costly. + * + * The function returns the updated element skiplist node pointer. */ +zskiplistNode *zslUpdateScore(zskiplist *zsl, double curscore, sds ele, double newscore) { + zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; + int i; + + /* We need to seek to element to update to start: this is useful anyway, + * we'll have to update or remove it. */ + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->level[i].forward && + (x->level[i].forward->score < curscore || + (x->level[i].forward->score == curscore && + sdscmp(x->level[i].forward->ele,ele) < 0))) + { + x = x->level[i].forward; + } + update[i] = x; + } + + /* Jump to our element: note that this function assumes that the + * element with the matching score exists. */ + x = x->level[0].forward; + serverAssert(x && curscore == x->score && sdscmp(x->ele,ele) == 0); + + /* If the node, after the score update, would be still exactly + * at the same position, we can just update the score without + * actually removing and re-inserting the element in the skiplist. */ + if ((x->backward == NULL || x->backward->score < newscore) && + (x->level[0].forward == NULL || x->level[0].forward->score > newscore)) + { + x->score = newscore; + return x; + } + + /* No way to reuse the old node: we need to remove and insert a new + * one at a different place. */ + zslDeleteNode(zsl, x, update); + zskiplistNode *newnode = zslInsert(zsl,newscore,x->ele); + /* We reused the old node x->ele SDS string, free the node now + * since zslInsert created a new one. */ + x->ele = NULL; + zslFreeNode(x); + return newnode; +} + +int zslValueGteMin(double value, const zrangespec *spec) { + return spec->minex ? (value > spec->min) : (value >= spec->min); +} + +int zslValueLteMax(double value, const zrangespec *spec) { + return spec->maxex ? (value < spec->max) : (value <= spec->max); +} + +/* Returns if there is a part of the zset is in range. */ +int zslIsInRange(zskiplist *zsl, const zrangespec *range) { + zskiplistNode *x; + + /* Test for ranges that will always be empty. */ + if (range->min > range->max || + (range->min == range->max && (range->minex || range->maxex))) + return 0; + x = zsl->tail; + if (x == NULL || !zslValueGteMin(x->score,range)) + return 0; + x = zsl->header->level[0].forward; + if (x == NULL || !zslValueLteMax(x->score,range)) + return 0; + return 1; +} + +/* Find the first node that is contained in the specified range. + * Returns NULL when no element is contained in the range. */ +zskiplistNode *zslFirstInRange(zskiplist *zsl, const zrangespec *range) { + zskiplistNode *x; + int i; + + /* If everything is out of range, return early. */ + if (!zslIsInRange(zsl,range)) return NULL; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + /* Go forward while *OUT* of range. */ + while (x->level[i].forward && + !zslValueGteMin(x->level[i].forward->score,range)) + x = x->level[i].forward; + } + + /* This is an inner range, so the next node cannot be NULL. */ + x = x->level[0].forward; + serverAssert(x != NULL); + + /* Check if score <= max. */ + if (!zslValueLteMax(x->score,range)) return NULL; + return x; +} + +/* Find the last node that is contained in the specified range. + * Returns NULL when no element is contained in the range. */ +zskiplistNode *zslLastInRange(zskiplist *zsl, const zrangespec *range) { + zskiplistNode *x; + int i; + + /* If everything is out of range, return early. */ + if (!zslIsInRange(zsl,range)) return NULL; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + /* Go forward while *IN* range. */ + while (x->level[i].forward && + zslValueLteMax(x->level[i].forward->score,range)) + x = x->level[i].forward; + } + + /* This is an inner range, so this node cannot be NULL. */ + serverAssert(x != NULL); + + /* Check if score >= min. */ + if (!zslValueGteMin(x->score,range)) return NULL; + return x; +} + +/* Delete all the elements with score between min and max from the skiplist. + * Both min and max can be inclusive or exclusive (see range->minex and + * range->maxex). When inclusive a score >= min && score <= max is deleted. + * Note that this function takes the reference to the hash table view of the + * sorted set, in order to remove the elements from the hash table too. */ +unsigned long zslDeleteRangeByScore(zskiplist *zsl, zrangespec *range, dict *dict) { + zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; + unsigned long removed = 0; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->level[i].forward && + !zslValueGteMin(x->level[i].forward->score, range)) + x = x->level[i].forward; + update[i] = x; + } + + /* Current node is the last with score < or <= min. */ + x = x->level[0].forward; + + /* Delete nodes while in range. */ + while (x && zslValueLteMax(x->score, range)) { + zskiplistNode *next = x->level[0].forward; + zslDeleteNode(zsl,x,update); + dictDelete(dict,x->ele); + zslFreeNode(x); /* Here is where x->ele is actually released. */ + removed++; + x = next; + } + return removed; +} + +unsigned long zslDeleteRangeByLex(zskiplist *zsl, zlexrangespec *range, dict *dict) { + zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; + unsigned long removed = 0; + int i; + + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->level[i].forward && + !zslLexValueGteMin(x->level[i].forward->ele,range)) + x = x->level[i].forward; + update[i] = x; + } + + /* Current node is the last with score < or <= min. */ + x = x->level[0].forward; + + /* Delete nodes while in range. */ + while (x && zslLexValueLteMax(x->ele,range)) { + zskiplistNode *next = x->level[0].forward; + zslDeleteNode(zsl,x,update); + dictDelete(dict,x->ele); + zslFreeNode(x); /* Here is where x->ele is actually released. */ + removed++; + x = next; + } + return removed; +} + +/* Delete all the elements with rank between start and end from the skiplist. + * Start and end are inclusive. Note that start and end need to be 1-based */ +unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) { + zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; + unsigned long traversed = 0, removed = 0; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->level[i].forward && (traversed + x->level[i].span) < start) { + traversed += x->level[i].span; + x = x->level[i].forward; + } + update[i] = x; + } + + traversed++; + x = x->level[0].forward; + while (x && traversed <= end) { + zskiplistNode *next = x->level[0].forward; + zslDeleteNode(zsl,x,update); + dictDelete(dict,x->ele); + zslFreeNode(x); + removed++; + traversed++; + x = next; + } + return removed; +} + +/* Find the rank for an element by both score and key. + * Returns 0 when the element cannot be found, rank otherwise. + * Note that the rank is 1-based due to the span of zsl->header to the + * first element. */ +unsigned long zslGetRank(zskiplist *zsl, double score, sds ele) { + zskiplistNode *x; + unsigned long rank = 0; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->level[i].forward && + (x->level[i].forward->score < score || + (x->level[i].forward->score == score && + sdscmp(x->level[i].forward->ele,ele) <= 0))) { + rank += x->level[i].span; + x = x->level[i].forward; + } + + /* x might be equal to zsl->header, so test if obj is non-NULL */ + if (x->ele && x->score == score && sdscmp(x->ele,ele) == 0) { + return rank; + } + } + return 0; +} + +/* Finds an element by its rank. The rank argument needs to be 1-based. */ +zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) { + zskiplistNode *x; + unsigned long traversed = 0; + int i; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + while (x->level[i].forward && (traversed + x->level[i].span) <= rank) + { + traversed += x->level[i].span; + x = x->level[i].forward; + } + if (traversed == rank) { + return x; + } + } + return NULL; +} + +#ifdef ROMAN_CLIENT_DISABLE + +/* Populate the rangespec according to the objects min and max. */ +static int zslParseRange(robj *min, robj *max, zrangespec *spec) { + char *eptr; + spec->minex = spec->maxex = 0; + + /* Parse the min-max interval. If one of the values is prefixed + * by the "(" character, it's considered "open". For instance + * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max + * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */ + if (min->encoding == OBJ_ENCODING_INT) { + spec->min = (long)min->ptr; + } else { + if (((char*)min->ptr)[0] == '(') { + spec->min = strtod((char*)min->ptr+1,&eptr); + if (eptr[0] != '\0' || isnan(spec->min)) return C_ERR; + spec->minex = 1; + } else { + spec->min = strtod((char*)min->ptr,&eptr); + if (eptr[0] != '\0' || isnan(spec->min)) return C_ERR; + } + } + if (max->encoding == OBJ_ENCODING_INT) { + spec->max = (long)max->ptr; + } else { + if (((char*)max->ptr)[0] == '(') { + spec->max = strtod((char*)max->ptr+1,&eptr); + if (eptr[0] != '\0' || isnan(spec->max)) return C_ERR; + spec->maxex = 1; + } else { + spec->max = strtod((char*)max->ptr,&eptr); + if (eptr[0] != '\0' || isnan(spec->max)) return C_ERR; + } + } + + return C_OK; +} +#endif + +/* ------------------------ Lexicographic ranges ---------------------------- */ + +/* Parse max or min argument of ZRANGEBYLEX. + * (foo means foo (open interval) + * [foo means foo (closed interval) + * - means the min string possible + * + means the max string possible + * + * If the string is valid the *dest pointer is set to the redis object + * that will be used for the comparison, and ex will be set to 0 or 1 + * respectively if the item is exclusive or inclusive. C_OK will be + * returned. + * + * If the string is not a valid range C_ERR is returned, and the value + * of *dest and *ex is undefined. */ +int zslParseLexRangeItem(robj *item, sds *dest, int *ex) { + char *c = item->ptr; + + switch(c[0]) { + case '+': + if (c[1] != '\0') return C_ERR; + *ex = 1; + *dest = shared.maxstring; + return C_OK; + case '-': + if (c[1] != '\0') return C_ERR; + *ex = 1; + *dest = shared.minstring; + return C_OK; + case '(': + *ex = 1; + *dest = sdsnewlen(c+1,sdslen(c)-1); + return C_OK; + case '[': + *ex = 0; + *dest = sdsnewlen(c+1,sdslen(c)-1); + return C_OK; + default: + return C_ERR; + } +} + +/* Free a lex range structure, must be called only after zslParseLexRange() + * populated the structure with success (C_OK returned). */ +void zslFreeLexRange(zlexrangespec *spec) { + if (spec->min != shared.minstring && + spec->min != shared.maxstring) sdsfree(spec->min); + if (spec->max != shared.minstring && + spec->max != shared.maxstring) sdsfree(spec->max); +} + +/* Populate the lex rangespec according to the objects min and max. + * + * Return C_OK on success. On error C_ERR is returned. + * When OK is returned the structure must be freed with zslFreeLexRange(), + * otherwise no release is needed. */ +int zslParseLexRange(robj *min, robj *max, zlexrangespec *spec) { + /* The range can't be valid if objects are integer encoded. + * Every item must start with ( or [. */ + if (min->encoding == OBJ_ENCODING_INT || + max->encoding == OBJ_ENCODING_INT) return C_ERR; + + spec->min = spec->max = NULL; + if (zslParseLexRangeItem(min, &spec->min, &spec->minex) == C_ERR || + zslParseLexRangeItem(max, &spec->max, &spec->maxex) == C_ERR) { + zslFreeLexRange(spec); + return C_ERR; + } else { + return C_OK; + } +} + +/* This is just a wrapper to sdscmp() that is able to + * handle shared.minstring and shared.maxstring as the equivalent of + * -inf and +inf for strings */ +int sdscmplex(sds a, sds b) { + if (a == b) return 0; + if (a == shared.minstring || b == shared.maxstring) return -1; + if (a == shared.maxstring || b == shared.minstring) return 1; + return sdscmp(a,b); +} + +int zslLexValueGteMin(sds value, const zlexrangespec *spec) { + return spec->minex ? + (sdscmplex(value,spec->min) > 0) : + (sdscmplex(value,spec->min) >= 0); +} + +int zslLexValueLteMax(sds value, const zlexrangespec *spec) { + return spec->maxex ? + (sdscmplex(value,spec->max) < 0) : + (sdscmplex(value,spec->max) <= 0); +} + +/* Returns if there is a part of the zset is in the lex range. */ +int zslIsInLexRange(zskiplist *zsl, const zlexrangespec *range) { + zskiplistNode *x; + + /* Test for ranges that will always be empty. */ + int cmp = sdscmplex(range->min,range->max); + if (cmp > 0 || (cmp == 0 && (range->minex || range->maxex))) + return 0; + x = zsl->tail; + if (x == NULL || !zslLexValueGteMin(x->ele,range)) + return 0; + x = zsl->header->level[0].forward; + if (x == NULL || !zslLexValueLteMax(x->ele,range)) + return 0; + return 1; +} + +/* Find the first node that is contained in the specified lex range. + * Returns NULL when no element is contained in the range. */ +zskiplistNode *zslFirstInLexRange(zskiplist *zsl, zlexrangespec *range) { + zskiplistNode *x; + int i; + + /* If everything is out of range, return early. */ + if (!zslIsInLexRange(zsl,range)) return NULL; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + /* Go forward while *OUT* of range. */ + while (x->level[i].forward && + !zslLexValueGteMin(x->level[i].forward->ele,range)) + x = x->level[i].forward; + } + + /* This is an inner range, so the next node cannot be NULL. */ + x = x->level[0].forward; + serverAssert(x != NULL); + + /* Check if score <= max. */ + if (!zslLexValueLteMax(x->ele,range)) return NULL; + return x; +} + +/* Find the last node that is contained in the specified range. + * Returns NULL when no element is contained in the range. */ +zskiplistNode *zslLastInLexRange(zskiplist *zsl, zlexrangespec *range) { + zskiplistNode *x; + int i; + + /* If everything is out of range, return early. */ + if (!zslIsInLexRange(zsl,range)) return NULL; + + x = zsl->header; + for (i = zsl->level-1; i >= 0; i--) { + /* Go forward while *IN* range. */ + while (x->level[i].forward && + zslLexValueLteMax(x->level[i].forward->ele,range)) + x = x->level[i].forward; + } + + /* This is an inner range, so this node cannot be NULL. */ + serverAssert(x != NULL); + + /* Check if score >= min. */ + if (!zslLexValueGteMin(x->ele,range)) return NULL; + return x; +} + +/*----------------------------------------------------------------------------- + * Listpack-backed sorted set API + *----------------------------------------------------------------------------*/ + +double zzlStrtod(unsigned char *vstr, unsigned int vlen) { + char buf[128]; + if (vlen > sizeof(buf)) + vlen = sizeof(buf); + memcpy(buf,vstr,vlen); + buf[vlen] = '\0'; + return strtod(buf,NULL); + } + +double zzlGetScore(unsigned char *sptr) { + unsigned char *vstr; + unsigned int vlen; + long long vlong; + double score; + + serverAssert(sptr != NULL); + vstr = lpGetValue(sptr,&vlen,&vlong); + + if (vstr) { + score = zzlStrtod(vstr,vlen); + } else { + score = vlong; + } + + return score; +} + +/* Return a listpack element as an SDS string. */ +sds lpGetObject(unsigned char *sptr) { + unsigned char *vstr; + unsigned int vlen; + long long vlong; + + serverAssert(sptr != NULL); + vstr = lpGetValue(sptr,&vlen,&vlong); + + if (vstr) { + return sdsnewlen((char*)vstr,vlen); + } else { + return sdsfromlonglong(vlong); + } +} + +/* Compare element in sorted set with given element. */ +int zzlCompareElements(unsigned char *eptr, unsigned char *cstr, unsigned int clen) { + unsigned char *vstr; + unsigned int vlen; + long long vlong; + unsigned char vbuf[32]; + int minlen, cmp; + + vstr = lpGetValue(eptr,&vlen,&vlong); + if (vstr == NULL) { + /* Store string representation of long long in buf. */ + vlen = ll2string((char*)vbuf,sizeof(vbuf),vlong); + vstr = vbuf; + } + + minlen = (vlen < clen) ? vlen : clen; + cmp = memcmp(vstr,cstr,minlen); + if (cmp == 0) return vlen-clen; + return cmp; +} + +unsigned int zzlLength(unsigned char *zl) { + return lpLength(zl)/2; +} + +/* Move to next entry based on the values in eptr and sptr. Both are set to + * NULL when there is no next entry. */ +void zzlNext(unsigned char *zl, unsigned char **eptr, unsigned char **sptr) { + unsigned char *_eptr, *_sptr; + serverAssert(*eptr != NULL && *sptr != NULL); + + _eptr = lpNext(zl,*sptr); + if (_eptr != NULL) { + _sptr = lpNext(zl,_eptr); + serverAssert(_sptr != NULL); + } else { + /* No next entry. */ + _sptr = NULL; + } + + *eptr = _eptr; + *sptr = _sptr; +} + +/* Move to the previous entry based on the values in eptr and sptr. Both are + * set to NULL when there is no prev entry. */ +void zzlPrev(unsigned char *zl, unsigned char **eptr, unsigned char **sptr) { + unsigned char *_eptr, *_sptr; + serverAssert(*eptr != NULL && *sptr != NULL); + + _sptr = lpPrev(zl,*eptr); + if (_sptr != NULL) { + _eptr = lpPrev(zl,_sptr); + serverAssert(_eptr != NULL); + } else { + /* No previous entry. */ + _eptr = NULL; + } + + *eptr = _eptr; + *sptr = _sptr; +} + +/* Returns if there is a part of the zset is in range. Should only be used + * internally by zzlFirstInRange and zzlLastInRange. */ +int zzlIsInRange(unsigned char *zl, const zrangespec *range) { + unsigned char *p; + double score; + + /* Test for ranges that will always be empty. */ + if (range->min > range->max || + (range->min == range->max && (range->minex || range->maxex))) + return 0; + + p = lpSeek(zl,-1); /* Last score. */ + if (p == NULL) return 0; /* Empty sorted set */ + score = zzlGetScore(p); + if (!zslValueGteMin(score,range)) + return 0; + + p = lpSeek(zl,1); /* First score. */ + serverAssert(p != NULL); + score = zzlGetScore(p); + if (!zslValueLteMax(score,range)) + return 0; + + return 1; +} + +/* Find pointer to the first element contained in the specified range. + * Returns NULL when no element is contained in the range. */ +unsigned char *zzlFirstInRange(unsigned char *zl, const zrangespec *range) { + unsigned char *eptr = lpSeek(zl,0), *sptr; + double score; + + /* If everything is out of range, return early. */ + if (!zzlIsInRange(zl,range)) return NULL; + + while (eptr != NULL) { + sptr = lpNext(zl,eptr); + serverAssert(sptr != NULL); + + score = zzlGetScore(sptr); + if (zslValueGteMin(score,range)) { + /* Check if score <= max. */ + if (zslValueLteMax(score,range)) + return eptr; + return NULL; + } + + /* Move to next element. */ + eptr = lpNext(zl,sptr); + } + + return NULL; +} + +/* Find pointer to the last element contained in the specified range. + * Returns NULL when no element is contained in the range. */ +unsigned char *zzlLastInRange(unsigned char *zl, const zrangespec *range) { + unsigned char *eptr = lpSeek(zl,-2), *sptr; + double score; + + /* If everything is out of range, return early. */ + if (!zzlIsInRange(zl,range)) return NULL; + + while (eptr != NULL) { + sptr = lpNext(zl,eptr); + serverAssert(sptr != NULL); + + score = zzlGetScore(sptr); + if (zslValueLteMax(score,range)) { + /* Check if score >= min. */ + if (zslValueGteMin(score,range)) + return eptr; + return NULL; + } + + /* Move to previous element by moving to the score of previous element. + * When this returns NULL, we know there also is no element. */ + sptr = lpPrev(zl,eptr); + if (sptr != NULL) + serverAssert((eptr = lpPrev(zl,sptr)) != NULL); + else + eptr = NULL; + } + + return NULL; +} + +int zzlLexValueGteMin(unsigned char *p, const zlexrangespec *spec) { + sds value = lpGetObject(p); + int res = zslLexValueGteMin(value,spec); + sdsfree(value); + return res; +} + +int zzlLexValueLteMax(unsigned char *p, const zlexrangespec *spec) { + sds value = lpGetObject(p); + int res = zslLexValueLteMax(value,spec); + sdsfree(value); + return res; +} + +/* Returns if there is a part of the zset is in range. Should only be used + * internally by zzlFirstInRange and zzlLastInRange. */ +int zzlIsInLexRange(unsigned char *zl, zlexrangespec *range) { + unsigned char *p; + + /* Test for ranges that will always be empty. */ + int cmp = sdscmplex(range->min,range->max); + if (cmp > 0 || (cmp == 0 && (range->minex || range->maxex))) + return 0; + + p = lpSeek(zl,-2); /* Last element. */ + if (p == NULL) return 0; + if (!zzlLexValueGteMin(p,range)) + return 0; + + p = lpSeek(zl,0); /* First element. */ + serverAssert(p != NULL); + if (!zzlLexValueLteMax(p,range)) + return 0; + + return 1; +} + +/* Find pointer to the first element contained in the specified lex range. + * Returns NULL when no element is contained in the range. */ +unsigned char *zzlFirstInLexRange(unsigned char *zl, zlexrangespec *range) { + unsigned char *eptr = lpSeek(zl,0), *sptr; + + /* If everything is out of range, return early. */ + if (!zzlIsInLexRange(zl,range)) return NULL; + + while (eptr != NULL) { + if (zzlLexValueGteMin(eptr,range)) { + /* Check if score <= max. */ + if (zzlLexValueLteMax(eptr,range)) + return eptr; + return NULL; + } + + /* Move to next element. */ + sptr = lpNext(zl,eptr); /* This element score. Skip it. */ + serverAssert(sptr != NULL); + eptr = lpNext(zl,sptr); /* Next element. */ + } + + return NULL; +} + +/* Find pointer to the last element contained in the specified lex range. + * Returns NULL when no element is contained in the range. */ +unsigned char *zzlLastInLexRange(unsigned char *zl, zlexrangespec *range) { + unsigned char *eptr = lpSeek(zl,-2), *sptr; + + /* If everything is out of range, return early. */ + if (!zzlIsInLexRange(zl,range)) return NULL; + + while (eptr != NULL) { + if (zzlLexValueLteMax(eptr,range)) { + /* Check if score >= min. */ + if (zzlLexValueGteMin(eptr,range)) + return eptr; + return NULL; + } + + /* Move to previous element by moving to the score of previous element. + * When this returns NULL, we know there also is no element. */ + sptr = lpPrev(zl,eptr); + if (sptr != NULL) + serverAssert((eptr = lpPrev(zl,sptr)) != NULL); + else + eptr = NULL; + } + + return NULL; +} + +unsigned char *zzlFind(unsigned char *lp, sds ele, double *score) { + unsigned char *eptr, *sptr; + + if ((eptr = lpFirst(lp)) == NULL) return NULL; + eptr = lpFind(lp, eptr, (unsigned char*)ele, sdslen(ele), 1); + if (eptr) { + sptr = lpNext(lp,eptr); + serverAssert(sptr != NULL); + + /* Matching element, pull out score. */ + if (score != NULL) *score = zzlGetScore(sptr); + return eptr; + } + + return NULL; +} + +/* Delete (element,score) pair from listpack. Use local copy of eptr because we + * don't want to modify the one given as argument. */ +unsigned char *zzlDelete(unsigned char *zl, unsigned char *eptr) { + return lpDeleteRangeWithEntry(zl,&eptr,2); +} + +unsigned char *zzlInsertAt(unsigned char *zl, unsigned char *eptr, sds ele, double score) { + unsigned char *sptr; + char scorebuf[128]; + int scorelen; + + scorelen = d2string(scorebuf,sizeof(scorebuf),score); + if (eptr == NULL) { + zl = lpAppend(zl,(unsigned char*)ele,sdslen(ele)); + zl = lpAppend(zl,(unsigned char*)scorebuf,scorelen); + } else { + /* Insert member before the element 'eptr'. */ + zl = lpInsertString(zl,(unsigned char*)ele,sdslen(ele),eptr,LP_BEFORE,&sptr); + + /* Insert score after the member. */ + zl = lpInsertString(zl,(unsigned char*)scorebuf,scorelen,sptr,LP_AFTER,NULL); + } + return zl; +} + +/* Insert (element,score) pair in listpack. This function assumes the element is + * not yet present in the list. */ +unsigned char *zzlInsert(unsigned char *zl, sds ele, double score) { + unsigned char *eptr = lpSeek(zl,0), *sptr; + double s; + + while (eptr != NULL) { + sptr = lpNext(zl,eptr); + serverAssert(sptr != NULL); + s = zzlGetScore(sptr); + + if (s > score) { + /* First element with score larger than score for element to be + * inserted. This means we should take its spot in the list to + * maintain ordering. */ + zl = zzlInsertAt(zl,eptr,ele,score); + break; + } else if (s == score) { + /* Ensure lexicographical ordering for elements. */ + if (zzlCompareElements(eptr,(unsigned char*)ele,sdslen(ele)) > 0) { + zl = zzlInsertAt(zl,eptr,ele,score); + break; + } + } + + /* Move to next element. */ + eptr = lpNext(zl,sptr); + } + + /* Push on tail of list when it was not yet inserted. */ + if (eptr == NULL) + zl = zzlInsertAt(zl,NULL,ele,score); + return zl; +} + +unsigned char *zzlDeleteRangeByScore(unsigned char *zl, zrangespec *range, unsigned long *deleted) { + unsigned char *eptr, *sptr; + double score; + unsigned long num = 0; + + if (deleted != NULL) *deleted = 0; + + eptr = zzlFirstInRange(zl,range); + if (eptr == NULL) return zl; + + /* When the tail of the listpack is deleted, eptr will be NULL. */ + while (eptr && (sptr = lpNext(zl,eptr)) != NULL) { + score = zzlGetScore(sptr); + if (zslValueLteMax(score,range)) { + /* Delete both the element and the score. */ + zl = lpDeleteRangeWithEntry(zl,&eptr,2); + num++; + } else { + /* No longer in range. */ + break; + } + } + + if (deleted != NULL) *deleted = num; + return zl; +} + +unsigned char *zzlDeleteRangeByLex(unsigned char *zl, zlexrangespec *range, unsigned long *deleted) { + unsigned char *eptr, *sptr; + unsigned long num = 0; + + if (deleted != NULL) *deleted = 0; + + eptr = zzlFirstInLexRange(zl,range); + if (eptr == NULL) return zl; + + /* When the tail of the listpack is deleted, eptr will be NULL. */ + while (eptr && (sptr = lpNext(zl,eptr)) != NULL) { + if (zzlLexValueLteMax(eptr,range)) { + /* Delete both the element and the score. */ + zl = lpDeleteRangeWithEntry(zl,&eptr,2); + num++; + } else { + /* No longer in range. */ + break; + } + } + + if (deleted != NULL) *deleted = num; + return zl; +} + +/* Delete all the elements with rank between start and end from the skiplist. + * Start and end are inclusive. Note that start and end need to be 1-based */ +unsigned char *zzlDeleteRangeByRank(unsigned char *zl, unsigned int start, unsigned int end, unsigned long *deleted) { + unsigned int num = (end-start)+1; + if (deleted) *deleted = num; + zl = lpDeleteRange(zl,2*(start-1),2*num); + return zl; +} + +/*----------------------------------------------------------------------------- + * Common sorted set API + *----------------------------------------------------------------------------*/ + +unsigned long zsetLength(const robj *zobj) { + unsigned long length = 0; + if (zobj->encoding == OBJ_ENCODING_LISTPACK) { + length = zzlLength(zobj->ptr); + } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { + length = ((const zset*)zobj->ptr)->zsl->length; + } else { + serverPanic("Unknown sorted set encoding"); + } + return length; +} + +void zsetConvert(robj *zobj, int encoding) { + zset *zs; + zskiplistNode *node, *next; + sds ele; + double score; + + if (zobj->encoding == encoding) return; + if (zobj->encoding == OBJ_ENCODING_LISTPACK) { + unsigned char *zl = zobj->ptr; + unsigned char *eptr, *sptr; + unsigned char *vstr; + unsigned int vlen; + long long vlong; + + if (encoding != OBJ_ENCODING_SKIPLIST) + serverPanic("Unknown target encoding"); + + zs = zmalloc(sizeof(*zs)); + zs->dict = dictCreate(&zsetDictType); + zs->zsl = zslCreate(); + + eptr = lpSeek(zl,0); + serverAssertWithInfo(NULL,zobj,eptr != NULL); + sptr = lpNext(zl,eptr); + serverAssertWithInfo(NULL,zobj,sptr != NULL); + + while (eptr != NULL) { + score = zzlGetScore(sptr); + vstr = lpGetValue(eptr,&vlen,&vlong); + if (vstr == NULL) + ele = sdsfromlonglong(vlong); + else + ele = sdsnewlen((char*)vstr,vlen); + + node = zslInsert(zs->zsl,score,ele); + serverAssert(dictAdd(zs->dict,ele,&node->score) == DICT_OK); + zzlNext(zl,&eptr,&sptr); + } + + zfree(zobj->ptr); + zobj->ptr = zs; + zobj->encoding = OBJ_ENCODING_SKIPLIST; + } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { + unsigned char *zl = lpNew(0); + + if (encoding != OBJ_ENCODING_LISTPACK) + serverPanic("Unknown target encoding"); + + /* Approach similar to zslFree(), since we want to free the skiplist at + * the same time as creating the listpack. */ + zs = zobj->ptr; + dictRelease(zs->dict); + node = zs->zsl->header->level[0].forward; + zfree(zs->zsl->header); + zfree(zs->zsl); + + while (node) { + zl = zzlInsertAt(zl,NULL,node->ele,node->score); + next = node->level[0].forward; + zslFreeNode(node); + node = next; + } + + zfree(zs); + zobj->ptr = zl; + zobj->encoding = OBJ_ENCODING_LISTPACK; + } else { + serverPanic("Unknown sorted set encoding"); + } +} + +/* Convert the sorted set object into a listpack if it is not already a listpack + * and if the number of elements and the maximum element size and total elements size + * are within the expected ranges. */ +void zsetConvertToListpackIfNeeded(robj *zobj, size_t maxelelen, size_t totelelen) { + if (zobj->encoding == OBJ_ENCODING_LISTPACK) return; + zset *zset = zobj->ptr; + + if (zset->zsl->length <= server.zset_max_listpack_entries && + maxelelen <= server.zset_max_listpack_value && + lpSafeToAdd(NULL, totelelen)) + { + zsetConvert(zobj,OBJ_ENCODING_LISTPACK); + } +} + +/* Return (by reference) the score of the specified member of the sorted set + * storing it into *score. If the element does not exist C_ERR is returned + * otherwise C_OK is returned and *score is correctly populated. + * If 'zobj' or 'member' is NULL, C_ERR is returned. */ +int zsetScore(robj *zobj, sds member, double *score) { + if (!zobj || !member) return C_ERR; + + if (zobj->encoding == OBJ_ENCODING_LISTPACK) { + if (zzlFind(zobj->ptr, member, score) == NULL) return C_ERR; + } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { + zset *zs = zobj->ptr; + dictEntry *de = dictFind(zs->dict, member); + if (de == NULL) return C_ERR; + *score = *(double*)dictGetVal(de); + } else { + serverPanic("Unknown sorted set encoding"); + } + return C_OK; +} + +/* Add a new element or update the score of an existing element in a sorted + * set, regardless of its encoding. + * + * The set of flags change the command behavior. + * + * The input flags are the following: + * + * ZADD_INCR: Increment the current element score by 'score' instead of updating + * the current element score. If the element does not exist, we + * assume 0 as previous score. + * ZADD_NX: Perform the operation only if the element does not exist. + * ZADD_XX: Perform the operation only if the element already exist. + * ZADD_GT: Perform the operation on existing elements only if the new score is + * greater than the current score. + * ZADD_LT: Perform the operation on existing elements only if the new score is + * less than the current score. + * + * When ZADD_INCR is used, the new score of the element is stored in + * '*newscore' if 'newscore' is not NULL. + * + * The returned flags are the following: + * + * ZADD_NAN: The resulting score is not a number. + * ZADD_ADDED: The element was added (not present before the call). + * ZADD_UPDATED: The element score was updated. + * ZADD_NOP: No operation was performed because of NX or XX. + * + * Return value: + * + * The function returns 1 on success, and sets the appropriate flags + * ADDED or UPDATED to signal what happened during the operation (note that + * none could be set if we re-added an element using the same score it used + * to have, or in the case a zero increment is used). + * + * The function returns 0 on error, currently only when the increment + * produces a NAN condition, or when the 'score' value is NAN since the + * start. + * + * The command as a side effect of adding a new element may convert the sorted + * set internal encoding from listpack to hashtable+skiplist. + * + * Memory management of 'ele': + * + * The function does not take ownership of the 'ele' SDS string, but copies + * it if needed. */ +int zsetAdd(robj *zobj, double score, sds ele, int in_flags, int *out_flags, double *newscore) { + /* Turn options into simple to check vars. */ + int incr = (in_flags & ZADD_IN_INCR) != 0; + int nx = (in_flags & ZADD_IN_NX) != 0; + int xx = (in_flags & ZADD_IN_XX) != 0; + int gt = (in_flags & ZADD_IN_GT) != 0; + int lt = (in_flags & ZADD_IN_LT) != 0; + *out_flags = 0; /* We'll return our response flags. */ + double curscore; + + /* NaN as input is an error regardless of all the other parameters. */ + if (isnan(score)) { + *out_flags = ZADD_OUT_NAN; + return 0; + } + + /* Update the sorted set according to its encoding. */ + if (zobj->encoding == OBJ_ENCODING_LISTPACK) { + unsigned char *eptr; + + if ((eptr = zzlFind(zobj->ptr,ele,&curscore)) != NULL) { + /* NX? Return, same element already exists. */ + if (nx) { + *out_flags |= ZADD_OUT_NOP; + return 1; + } + + /* Prepare the score for the increment if needed. */ + if (incr) { + score += curscore; + if (isnan(score)) { + *out_flags |= ZADD_OUT_NAN; + return 0; + } + } + + /* GT/LT? Only update if score is greater/less than current. */ + if ((lt && score >= curscore) || (gt && score <= curscore)) { + *out_flags |= ZADD_OUT_NOP; + return 1; + } + + if (newscore) *newscore = score; + + /* Remove and re-insert when score changed. */ + if (score != curscore) { + zobj->ptr = zzlDelete(zobj->ptr,eptr); + zobj->ptr = zzlInsert(zobj->ptr,ele,score); + *out_flags |= ZADD_OUT_UPDATED; + } + return 1; + } else if (!xx) { + /* check if the element is too large or the list + * becomes too long *before* executing zzlInsert. */ + if (zzlLength(zobj->ptr)+1 > server.zset_max_listpack_entries || + sdslen(ele) > server.zset_max_listpack_value || + !lpSafeToAdd(zobj->ptr, sdslen(ele))) + { + zsetConvert(zobj,OBJ_ENCODING_SKIPLIST); + } else { + zobj->ptr = zzlInsert(zobj->ptr,ele,score); + if (newscore) *newscore = score; + *out_flags |= ZADD_OUT_ADDED; + return 1; + } + } else { + *out_flags |= ZADD_OUT_NOP; + return 1; + } + } + + /* Note that the above block handling listpack would have either returned or + * converted the key to skiplist. */ + if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { + zset *zs = zobj->ptr; + zskiplistNode *znode; + dictEntry *de; + + de = dictFind(zs->dict,ele); + if (de != NULL) { + /* NX? Return, same element already exists. */ + if (nx) { + *out_flags |= ZADD_OUT_NOP; + return 1; + } + + curscore = *(double*)dictGetVal(de); + + /* Prepare the score for the increment if needed. */ + if (incr) { + score += curscore; + if (isnan(score)) { + *out_flags |= ZADD_OUT_NAN; + return 0; + } + } + + /* GT/LT? Only update if score is greater/less than current. */ + if ((lt && score >= curscore) || (gt && score <= curscore)) { + *out_flags |= ZADD_OUT_NOP; + return 1; + } + + if (newscore) *newscore = score; + + /* Remove and re-insert when score changes. */ + if (score != curscore) { + znode = zslUpdateScore(zs->zsl,curscore,ele,score); + /* Note that we did not removed the original element from + * the hash table representing the sorted set, so we just + * update the score. */ + dictGetVal(de) = &znode->score; /* Update score ptr. */ + *out_flags |= ZADD_OUT_UPDATED; + } + return 1; + } else if (!xx) { + ele = sdsdup(ele); + znode = zslInsert(zs->zsl,score,ele); + serverAssert(dictAdd(zs->dict,ele,&znode->score) == DICT_OK); + *out_flags |= ZADD_OUT_ADDED; + if (newscore) *newscore = score; + return 1; + } else { + *out_flags |= ZADD_OUT_NOP; + return 1; + } + } else { + serverPanic("Unknown sorted set encoding"); + } + return 0; /* Never reached. */ +} + +/* Deletes the element 'ele' from the sorted set encoded as a skiplist+dict, + * returning 1 if the element existed and was deleted, 0 otherwise (the + * element was not there). It does not resize the dict after deleting the + * element. */ +static int zsetRemoveFromSkiplist(zset *zs, sds ele) { + dictEntry *de; + double score; + + de = dictUnlink(zs->dict,ele); + if (de != NULL) { + /* Get the score in order to delete from the skiplist later. */ + score = *(double*)dictGetVal(de); + + /* Delete from the hash table and later from the skiplist. + * Note that the order is important: deleting from the skiplist + * actually releases the SDS string representing the element, + * which is shared between the skiplist and the hash table, so + * we need to delete from the skiplist as the final step. */ + dictFreeUnlinkedEntry(zs->dict,de); + + /* Delete from skiplist. */ + int retval = zslDelete(zs->zsl,score,ele,NULL); + serverAssert(retval); + + return 1; + } + + return 0; +} + +/* Delete the element 'ele' from the sorted set, returning 1 if the element + * existed and was deleted, 0 otherwise (the element was not there). */ +int zsetDel(robj *zobj, sds ele) { + if (zobj->encoding == OBJ_ENCODING_LISTPACK) { + unsigned char *eptr; + + if ((eptr = zzlFind(zobj->ptr,ele,NULL)) != NULL) { + zobj->ptr = zzlDelete(zobj->ptr,eptr); + return 1; + } + } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { + zset *zs = zobj->ptr; + if (zsetRemoveFromSkiplist(zs, ele)) { + if (htNeedsResize(zs->dict)) dictResize(zs->dict); + return 1; + } + } else { + serverPanic("Unknown sorted set encoding"); + } + return 0; /* No such element found. */ +} + +/* Given a sorted set object returns the 0-based rank of the object or + * -1 if the object does not exist. + * + * For rank we mean the position of the element in the sorted collection + * of elements. So the first element has rank 0, the second rank 1, and so + * forth up to length-1 elements. + * + * If 'reverse' is false, the rank is returned considering as first element + * the one with the lowest score. Otherwise if 'reverse' is non-zero + * the rank is computed considering as element with rank 0 the one with + * the highest score. */ +long zsetRank(robj *zobj, sds ele, int reverse) { + unsigned long llen; + unsigned long rank; + + llen = zsetLength(zobj); + + if (zobj->encoding == OBJ_ENCODING_LISTPACK) { + unsigned char *zl = zobj->ptr; + unsigned char *eptr, *sptr; + + eptr = lpSeek(zl,0); + serverAssert(eptr != NULL); + sptr = lpNext(zl,eptr); + serverAssert(sptr != NULL); + + rank = 1; + while(eptr != NULL) { + if (lpCompare(eptr,(unsigned char*)ele,sdslen(ele))) + break; + rank++; + zzlNext(zl,&eptr,&sptr); + } + + if (eptr != NULL) { + if (reverse) + return llen-rank; + else + return rank-1; + } else { + return -1; + } + } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { + zset *zs = zobj->ptr; + zskiplist *zsl = zs->zsl; + dictEntry *de; + double score; + + de = dictFind(zs->dict,ele); + if (de != NULL) { + score = *(double*)dictGetVal(de); + rank = zslGetRank(zsl,score,ele); + /* Existing elements always have a rank. */ + serverAssert(rank != 0); + if (reverse) + return llen-rank; + else + return rank-1; + } else { + return -1; + } + } else { + serverPanic("Unknown sorted set encoding"); + } +} + +/* This is a helper function for the COPY command. + * Duplicate a sorted set object, with the guarantee that the returned object + * has the same encoding as the original one. + * + * The resulting object always has refcount set to 1 */ +robj *zsetDup(robj *o) { + robj *zobj; + zset *zs; + zset *new_zs; + + serverAssert(o->type == OBJ_ZSET); + + /* Create a new sorted set object that have the same encoding as the original object's encoding */ + if (o->encoding == OBJ_ENCODING_LISTPACK) { + unsigned char *zl = o->ptr; + size_t sz = lpBytes(zl); + unsigned char *new_zl = zmalloc(sz); + memcpy(new_zl, zl, sz); + zobj = createObject(OBJ_ZSET, new_zl); + zobj->encoding = OBJ_ENCODING_LISTPACK; + } else if (o->encoding == OBJ_ENCODING_SKIPLIST) { + zobj = createZsetObject(); + zs = o->ptr; + new_zs = zobj->ptr; + dictExpand(new_zs->dict,dictSize(zs->dict)); + zskiplist *zsl = zs->zsl; + zskiplistNode *ln; + sds ele; + long llen = zsetLength(o); + + /* We copy the skiplist elements from the greatest to the + * smallest (that's trivial since the elements are already ordered in + * the skiplist): this improves the load process, since the next loaded + * element will always be the smaller, so adding to the skiplist + * will always immediately stop at the head, making the insertion + * O(1) instead of O(log(N)). */ + ln = zsl->tail; + while (llen--) { + ele = ln->ele; + sds new_ele = sdsdup(ele); + zskiplistNode *znode = zslInsert(new_zs->zsl,ln->score,new_ele); + dictAdd(new_zs->dict,new_ele,&znode->score); + ln = ln->backward; + } + } else { + serverPanic("Unknown sorted set encoding"); + } + return zobj; +} + +/* Create a new sds string from the listpack entry. */ +sds zsetSdsFromListpackEntry(listpackEntry *e) { + return e->sval ? sdsnewlen(e->sval, e->slen) : sdsfromlonglong(e->lval); +} + +#ifdef ROMAN_CLIENT_DISABLE + +/* Reply with bulk string from the listpack entry. */ +void zsetReplyFromListpackEntry(client *c, listpackEntry *e) { + if (e->sval) + addReplyBulkCBuffer(c, e->sval, e->slen); + else + addReplyBulkLongLong(c, e->lval); +} + + +/* Return random element from a non empty zset. + * 'key' and 'val' will be set to hold the element. + * The memory in `key` is not to be freed or modified by the caller. + * 'score' can be NULL in which case it's not extracted. */ +void zsetTypeRandomElement(robj *zsetobj, unsigned long zsetsize, listpackEntry *key, double *score) { + if (zsetobj->encoding == OBJ_ENCODING_SKIPLIST) { + zset *zs = zsetobj->ptr; + dictEntry *de = dictGetFairRandomKey(zs->dict); + sds s = dictGetKey(de); + key->sval = (unsigned char*)s; + key->slen = sdslen(s); + if (score) + *score = *(double*)dictGetVal(de); + } else if (zsetobj->encoding == OBJ_ENCODING_LISTPACK) { + listpackEntry val; + lpRandomPair(zsetobj->ptr, zsetsize, key, &val); + if (score) { + if (val.sval) { + *score = zzlStrtod(val.sval,val.slen); + } else { + *score = (double)val.lval; + } + } + } else { + serverPanic("Unknown zset encoding"); + } +} + +/*----------------------------------------------------------------------------- + * Sorted set commands + *----------------------------------------------------------------------------*/ + +/* This generic command implements both ZADD and ZINCRBY. */ +void zaddGenericCommand(client *c, int flags) { + static char *nanerr = "resulting score is not a number (NaN)"; + robj *key = c->argv[1]; + robj *zobj; + sds ele; + double score = 0, *scores = NULL; + int j, elements, ch = 0; + int scoreidx = 0; + /* The following vars are used in order to track what the command actually + * did during the execution, to reply to the client and to trigger the + * notification of keyspace change. */ + int added = 0; /* Number of new elements added. */ + int updated = 0; /* Number of elements with updated score. */ + int processed = 0; /* Number of elements processed, may remain zero with + options like XX. */ + + /* Parse options. At the end 'scoreidx' is set to the argument position + * of the score of the first score-element pair. */ + scoreidx = 2; + while(scoreidx < c->argc) { + char *opt = c->argv[scoreidx]->ptr; + if (!strcasecmp(opt,"nx")) flags |= ZADD_IN_NX; + else if (!strcasecmp(opt,"xx")) flags |= ZADD_IN_XX; + else if (!strcasecmp(opt,"ch")) ch = 1; /* Return num of elements added or updated. */ + else if (!strcasecmp(opt,"incr")) flags |= ZADD_IN_INCR; + else if (!strcasecmp(opt,"gt")) flags |= ZADD_IN_GT; + else if (!strcasecmp(opt,"lt")) flags |= ZADD_IN_LT; + else break; + scoreidx++; + } + + /* Turn options into simple to check vars. */ + int incr = (flags & ZADD_IN_INCR) != 0; + int nx = (flags & ZADD_IN_NX) != 0; + int xx = (flags & ZADD_IN_XX) != 0; + int gt = (flags & ZADD_IN_GT) != 0; + int lt = (flags & ZADD_IN_LT) != 0; + + /* After the options, we expect to have an even number of args, since + * we expect any number of score-element pairs. */ + elements = c->argc-scoreidx; + if (elements % 2 || !elements) { + addReplyErrorObject(c,shared.syntaxerr); + return; + } + elements /= 2; /* Now this holds the number of score-element pairs. */ + + /* Check for incompatible options. */ + if (nx && xx) { + addReplyError(c, + "XX and NX options at the same time are not compatible"); + return; + } + + if ((gt && nx) || (lt && nx) || (gt && lt)) { + addReplyError(c, + "GT, LT, and/or NX options at the same time are not compatible"); + return; + } + /* Note that XX is compatible with either GT or LT */ + + if (incr && elements > 1) { + addReplyError(c, + "INCR option supports a single increment-element pair"); + return; + } + + /* Start parsing all the scores, we need to emit any syntax error + * before executing additions to the sorted set, as the command should + * either execute fully or nothing at all. */ + scores = zmalloc(sizeof(double)*elements); + for (j = 0; j < elements; j++) { + if (getDoubleFromObjectOrReply(c,c->argv[scoreidx+j*2],&scores[j],NULL) + != C_OK) goto cleanup; + } + + /* Lookup the key and create the sorted set if does not exist. */ + zobj = lookupKeyWrite(c->db,key); + if (checkType(c,zobj,OBJ_ZSET)) goto cleanup; + if (zobj == NULL) { + if (xx) goto reply_to_client; /* No key + XX option: nothing to do. */ + if (server.zset_max_listpack_entries == 0 || + server.zset_max_listpack_value < sdslen(c->argv[scoreidx+1]->ptr)) + { + zobj = createZsetObject(); + } else { + zobj = createZsetListpackObject(); + } + dbAdd(c->db,key,zobj); + } + + for (j = 0; j < elements; j++) { + double newscore; + score = scores[j]; + int retflags = 0; + + ele = c->argv[scoreidx+1+j*2]->ptr; + int retval = zsetAdd(zobj, score, ele, flags, &retflags, &newscore); + if (retval == 0) { + addReplyError(c,nanerr); + goto cleanup; + } + if (retflags & ZADD_OUT_ADDED) added++; + if (retflags & ZADD_OUT_UPDATED) updated++; + if (!(retflags & ZADD_OUT_NOP)) processed++; + score = newscore; + } + server.dirty += (added+updated); + +reply_to_client: + if (incr) { /* ZINCRBY or INCR option. */ + if (processed) + addReplyDouble(c,score); + else + addReplyNull(c); + } else { /* ZADD. */ + addReplyLongLong(c,ch ? added+updated : added); + } + +cleanup: + zfree(scores); + if (added || updated) { + signalModifiedKey(c,c->db,key); + notifyKeyspaceEvent(NOTIFY_ZSET, + incr ? "zincr" : "zadd", key, c->db->id); + } +} + +void zaddCommand(client *c) { + zaddGenericCommand(c,ZADD_IN_NONE); +} + +void zincrbyCommand(client *c) { + zaddGenericCommand(c,ZADD_IN_INCR); +} + +void zremCommand(client *c) { + robj *key = c->argv[1]; + robj *zobj; + int deleted = 0, keyremoved = 0, j; + + if ((zobj = lookupKeyWriteOrReply(c,key,shared.czero)) == NULL || + checkType(c,zobj,OBJ_ZSET)) return; + + for (j = 2; j < c->argc; j++) { + if (zsetDel(zobj,c->argv[j]->ptr)) deleted++; + if (zsetLength(zobj) == 0) { + dbDelete(c->db,key); + keyremoved = 1; + break; + } + } + + if (deleted) { + notifyKeyspaceEvent(NOTIFY_ZSET,"zrem",key,c->db->id); + if (keyremoved) + notifyKeyspaceEvent(NOTIFY_GENERIC,"del",key,c->db->id); + signalModifiedKey(c,c->db,key); + server.dirty += deleted; + } + addReplyLongLong(c,deleted); +} + +typedef enum { + ZRANGE_AUTO = 0, + ZRANGE_RANK, + ZRANGE_SCORE, + ZRANGE_LEX, +} zrange_type; + +/* Implements ZREMRANGEBYRANK, ZREMRANGEBYSCORE, ZREMRANGEBYLEX commands. */ +void zremrangeGenericCommand(client *c, zrange_type rangetype) { + robj *key = c->argv[1]; + robj *zobj; + int keyremoved = 0; + unsigned long deleted = 0; + zrangespec range; + zlexrangespec lexrange; + long start, end, llen; + char *notify_type = NULL; + + /* Step 1: Parse the range. */ + if (rangetype == ZRANGE_RANK) { + notify_type = "zremrangebyrank"; + if ((getLongFromObjectOrReply(c,c->argv[2],&start,NULL) != C_OK) || + (getLongFromObjectOrReply(c,c->argv[3],&end,NULL) != C_OK)) + return; + } else if (rangetype == ZRANGE_SCORE) { + notify_type = "zremrangebyscore"; + if (zslParseRange(c->argv[2],c->argv[3],&range) != C_OK) { + addReplyError(c,"min or max is not a float"); + return; + } + } else if (rangetype == ZRANGE_LEX) { + notify_type = "zremrangebylex"; + if (zslParseLexRange(c->argv[2],c->argv[3],&lexrange) != C_OK) { + addReplyError(c,"min or max not valid string range item"); + return; + } + } else { + serverPanic("unknown rangetype %d", (int)rangetype); + } + + /* Step 2: Lookup & range sanity checks if needed. */ + if ((zobj = lookupKeyWriteOrReply(c,key,shared.czero)) == NULL || + checkType(c,zobj,OBJ_ZSET)) goto cleanup; + + if (rangetype == ZRANGE_RANK) { + /* Sanitize indexes. */ + llen = zsetLength(zobj); + if (start < 0) start = llen+start; + if (end < 0) end = llen+end; + if (start < 0) start = 0; + + /* Invariant: start >= 0, so this test will be true when end < 0. + * The range is empty when start > end or start >= length. */ + if (start > end || start >= llen) { + addReply(c,shared.czero); + goto cleanup; + } + if (end >= llen) end = llen-1; + } + + /* Step 3: Perform the range deletion operation. */ + if (zobj->encoding == OBJ_ENCODING_LISTPACK) { + switch(rangetype) { + case ZRANGE_AUTO: + case ZRANGE_RANK: + zobj->ptr = zzlDeleteRangeByRank(zobj->ptr,start+1,end+1,&deleted); + break; + case ZRANGE_SCORE: + zobj->ptr = zzlDeleteRangeByScore(zobj->ptr,&range,&deleted); + break; + case ZRANGE_LEX: + zobj->ptr = zzlDeleteRangeByLex(zobj->ptr,&lexrange,&deleted); + break; + } + if (zzlLength(zobj->ptr) == 0) { + dbDelete(c->db,key); + keyremoved = 1; + } + } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { + zset *zs = zobj->ptr; + switch(rangetype) { + case ZRANGE_AUTO: + case ZRANGE_RANK: + deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict); + break; + case ZRANGE_SCORE: + deleted = zslDeleteRangeByScore(zs->zsl,&range,zs->dict); + break; + case ZRANGE_LEX: + deleted = zslDeleteRangeByLex(zs->zsl,&lexrange,zs->dict); + break; + } + if (htNeedsResize(zs->dict)) dictResize(zs->dict); + if (dictSize(zs->dict) == 0) { + dbDelete(c->db,key); + keyremoved = 1; + } + } else { + serverPanic("Unknown sorted set encoding"); + } + + /* Step 4: Notifications and reply. */ + if (deleted) { + signalModifiedKey(c,c->db,key); + notifyKeyspaceEvent(NOTIFY_ZSET,notify_type,key,c->db->id); + if (keyremoved) + notifyKeyspaceEvent(NOTIFY_GENERIC,"del",key,c->db->id); + } + server.dirty += deleted; + addReplyLongLong(c,deleted); + +cleanup: + if (rangetype == ZRANGE_LEX) zslFreeLexRange(&lexrange); +} + +void zremrangebyrankCommand(client *c) { + zremrangeGenericCommand(c,ZRANGE_RANK); +} + +void zremrangebyscoreCommand(client *c) { + zremrangeGenericCommand(c,ZRANGE_SCORE); +} + +void zremrangebylexCommand(client *c) { + zremrangeGenericCommand(c,ZRANGE_LEX); +} + +typedef struct { + robj *subject; + int type; /* Set, sorted set */ + int encoding; + double weight; + + union { + /* Set iterators. */ + union _iterset { + struct { + intset *is; + int ii; + } is; + struct { + dict *dict; + dictIterator *di; + dictEntry *de; + } ht; + } set; + + /* Sorted set iterators. */ + union _iterzset { + struct { + unsigned char *zl; + unsigned char *eptr, *sptr; + } zl; + struct { + zset *zs; + zskiplistNode *node; + } sl; + } zset; + } iter; +} zsetopsrc; + + +/* Use dirty flags for pointers that need to be cleaned up in the next + * iteration over the zsetopval. The dirty flag for the long long value is + * special, since long long values don't need cleanup. Instead, it means that + * we already checked that "ell" holds a long long, or tried to convert another + * representation into a long long value. When this was successful, + * OPVAL_VALID_LL is set as well. */ +#define OPVAL_DIRTY_SDS 1 +#define OPVAL_DIRTY_LL 2 +#define OPVAL_VALID_LL 4 + +/* Store value retrieved from the iterator. */ +typedef struct { + int flags; + unsigned char _buf[32]; /* Private buffer. */ + sds ele; + unsigned char *estr; + unsigned int elen; + long long ell; + double score; +} zsetopval; + +typedef union _iterset iterset; +typedef union _iterzset iterzset; + +void zuiInitIterator(zsetopsrc *op) { + if (op->subject == NULL) + return; + + if (op->type == OBJ_SET) { + iterset *it = &op->iter.set; + if (op->encoding == OBJ_ENCODING_INTSET) { + it->is.is = op->subject->ptr; + it->is.ii = 0; + } else if (op->encoding == OBJ_ENCODING_HT) { + it->ht.dict = op->subject->ptr; + it->ht.di = dictGetIterator(op->subject->ptr); + it->ht.de = dictNext(it->ht.di); + } else { + serverPanic("Unknown set encoding"); + } + } else if (op->type == OBJ_ZSET) { + /* Sorted sets are traversed in reverse order to optimize for + * the insertion of the elements in a new list as in + * ZDIFF/ZINTER/ZUNION */ + iterzset *it = &op->iter.zset; + if (op->encoding == OBJ_ENCODING_LISTPACK) { + it->zl.zl = op->subject->ptr; + it->zl.eptr = lpSeek(it->zl.zl,-2); + if (it->zl.eptr != NULL) { + it->zl.sptr = lpNext(it->zl.zl,it->zl.eptr); + serverAssert(it->zl.sptr != NULL); + } + } else if (op->encoding == OBJ_ENCODING_SKIPLIST) { + it->sl.zs = op->subject->ptr; + it->sl.node = it->sl.zs->zsl->tail; + } else { + serverPanic("Unknown sorted set encoding"); + } + } else { + serverPanic("Unsupported type"); + } +} + +void zuiClearIterator(zsetopsrc *op) { + if (op->subject == NULL) + return; + + if (op->type == OBJ_SET) { + iterset *it = &op->iter.set; + if (op->encoding == OBJ_ENCODING_INTSET) { + UNUSED(it); /* skip */ + } else if (op->encoding == OBJ_ENCODING_HT) { + dictReleaseIterator(it->ht.di); + } else { + serverPanic("Unknown set encoding"); + } + } else if (op->type == OBJ_ZSET) { + iterzset *it = &op->iter.zset; + if (op->encoding == OBJ_ENCODING_LISTPACK) { + UNUSED(it); /* skip */ + } else if (op->encoding == OBJ_ENCODING_SKIPLIST) { + UNUSED(it); /* skip */ + } else { + serverPanic("Unknown sorted set encoding"); + } + } else { + serverPanic("Unsupported type"); + } +} + +void zuiDiscardDirtyValue(zsetopval *val) { + if (val->flags & OPVAL_DIRTY_SDS) { + sdsfree(val->ele); + val->ele = NULL; + val->flags &= ~OPVAL_DIRTY_SDS; + } +} + +unsigned long zuiLength(zsetopsrc *op) { + if (op->subject == NULL) + return 0; + + if (op->type == OBJ_SET) { + if (op->encoding == OBJ_ENCODING_INTSET) { + return intsetLen(op->subject->ptr); + } else if (op->encoding == OBJ_ENCODING_HT) { + dict *ht = op->subject->ptr; + return dictSize(ht); + } else { + serverPanic("Unknown set encoding"); + } + } else if (op->type == OBJ_ZSET) { + if (op->encoding == OBJ_ENCODING_LISTPACK) { + return zzlLength(op->subject->ptr); + } else if (op->encoding == OBJ_ENCODING_SKIPLIST) { + zset *zs = op->subject->ptr; + return zs->zsl->length; + } else { + serverPanic("Unknown sorted set encoding"); + } + } else { + serverPanic("Unsupported type"); + } +} + +/* Check if the current value is valid. If so, store it in the passed structure + * and move to the next element. If not valid, this means we have reached the + * end of the structure and can abort. */ +int zuiNext(zsetopsrc *op, zsetopval *val) { + if (op->subject == NULL) + return 0; + + zuiDiscardDirtyValue(val); + + memset(val,0,sizeof(zsetopval)); + + if (op->type == OBJ_SET) { + iterset *it = &op->iter.set; + if (op->encoding == OBJ_ENCODING_INTSET) { + int64_t ell; + + if (!intsetGet(it->is.is,it->is.ii,&ell)) + return 0; + val->ell = ell; + val->score = 1.0; + + /* Move to next element. */ + it->is.ii++; + } else if (op->encoding == OBJ_ENCODING_HT) { + if (it->ht.de == NULL) + return 0; + val->ele = dictGetKey(it->ht.de); + val->score = 1.0; + + /* Move to next element. */ + it->ht.de = dictNext(it->ht.di); + } else { + serverPanic("Unknown set encoding"); + } + } else if (op->type == OBJ_ZSET) { + iterzset *it = &op->iter.zset; + if (op->encoding == OBJ_ENCODING_LISTPACK) { + /* No need to check both, but better be explicit. */ + if (it->zl.eptr == NULL || it->zl.sptr == NULL) + return 0; + val->estr = lpGetValue(it->zl.eptr,&val->elen,&val->ell); + val->score = zzlGetScore(it->zl.sptr); + + /* Move to next element (going backwards, see zuiInitIterator). */ + zzlPrev(it->zl.zl,&it->zl.eptr,&it->zl.sptr); + } else if (op->encoding == OBJ_ENCODING_SKIPLIST) { + if (it->sl.node == NULL) + return 0; + val->ele = it->sl.node->ele; + val->score = it->sl.node->score; + + /* Move to next element. (going backwards, see zuiInitIterator) */ + it->sl.node = it->sl.node->backward; + } else { + serverPanic("Unknown sorted set encoding"); + } + } else { + serverPanic("Unsupported type"); + } + return 1; +} + +int zuiLongLongFromValue(zsetopval *val) { + if (!(val->flags & OPVAL_DIRTY_LL)) { + val->flags |= OPVAL_DIRTY_LL; + + if (val->ele != NULL) { + if (string2ll(val->ele,sdslen(val->ele),&val->ell)) + val->flags |= OPVAL_VALID_LL; + } else if (val->estr != NULL) { + if (string2ll((char*)val->estr,val->elen,&val->ell)) + val->flags |= OPVAL_VALID_LL; + } else { + /* The long long was already set, flag as valid. */ + val->flags |= OPVAL_VALID_LL; + } + } + return val->flags & OPVAL_VALID_LL; +} + +sds zuiSdsFromValue(zsetopval *val) { + if (val->ele == NULL) { + if (val->estr != NULL) { + val->ele = sdsnewlen((char*)val->estr,val->elen); + } else { + val->ele = sdsfromlonglong(val->ell); + } + val->flags |= OPVAL_DIRTY_SDS; + } + return val->ele; +} + +/* This is different from zuiSdsFromValue since returns a new SDS string + * which is up to the caller to free. */ +sds zuiNewSdsFromValue(zsetopval *val) { + if (val->flags & OPVAL_DIRTY_SDS) { + /* We have already one to return! */ + sds ele = val->ele; + val->flags &= ~OPVAL_DIRTY_SDS; + val->ele = NULL; + return ele; + } else if (val->ele) { + return sdsdup(val->ele); + } else if (val->estr) { + return sdsnewlen((char*)val->estr,val->elen); + } else { + return sdsfromlonglong(val->ell); + } +} + +int zuiBufferFromValue(zsetopval *val) { + if (val->estr == NULL) { + if (val->ele != NULL) { + val->elen = sdslen(val->ele); + val->estr = (unsigned char*)val->ele; + } else { + val->elen = ll2string((char*)val->_buf,sizeof(val->_buf),val->ell); + val->estr = val->_buf; + } + } + return 1; +} + +/* Find value pointed to by val in the source pointer to by op. When found, + * return 1 and store its score in target. Return 0 otherwise. */ +int zuiFind(zsetopsrc *op, zsetopval *val, double *score) { + if (op->subject == NULL) + return 0; + + if (op->type == OBJ_SET) { + if (op->encoding == OBJ_ENCODING_INTSET) { + if (zuiLongLongFromValue(val) && + intsetFind(op->subject->ptr,val->ell)) + { + *score = 1.0; + return 1; + } else { + return 0; + } + } else if (op->encoding == OBJ_ENCODING_HT) { + dict *ht = op->subject->ptr; + zuiSdsFromValue(val); + if (dictFind(ht,val->ele) != NULL) { + *score = 1.0; + return 1; + } else { + return 0; + } + } else { + serverPanic("Unknown set encoding"); + } + } else if (op->type == OBJ_ZSET) { + zuiSdsFromValue(val); + + if (op->encoding == OBJ_ENCODING_LISTPACK) { + if (zzlFind(op->subject->ptr,val->ele,score) != NULL) { + /* Score is already set by zzlFind. */ + return 1; + } else { + return 0; + } + } else if (op->encoding == OBJ_ENCODING_SKIPLIST) { + zset *zs = op->subject->ptr; + dictEntry *de; + if ((de = dictFind(zs->dict,val->ele)) != NULL) { + *score = *(double*)dictGetVal(de); + return 1; + } else { + return 0; + } + } else { + serverPanic("Unknown sorted set encoding"); + } + } else { + serverPanic("Unsupported type"); + } +} + +int zuiCompareByCardinality(const void *s1, const void *s2) { + unsigned long first = zuiLength((zsetopsrc*)s1); + unsigned long second = zuiLength((zsetopsrc*)s2); + if (first > second) return 1; + if (first < second) return -1; + return 0; +} + +static int zuiCompareByRevCardinality(const void *s1, const void *s2) { + return zuiCompareByCardinality(s1, s2) * -1; +} + +#define REDIS_AGGR_SUM 1 +#define REDIS_AGGR_MIN 2 +#define REDIS_AGGR_MAX 3 +#define zunionInterDictValue(_e) (dictGetVal(_e) == NULL ? 1.0 : *(double*)dictGetVal(_e)) + +inline static void zunionInterAggregate(double *target, double val, int aggregate) { + if (aggregate == REDIS_AGGR_SUM) { + *target = *target + val; + /* The result of adding two doubles is NaN when one variable + * is +inf and the other is -inf. When these numbers are added, + * we maintain the convention of the result being 0.0. */ + if (isnan(*target)) *target = 0.0; + } else if (aggregate == REDIS_AGGR_MIN) { + *target = val < *target ? val : *target; + } else if (aggregate == REDIS_AGGR_MAX) { + *target = val > *target ? val : *target; + } else { + /* safety net */ + serverPanic("Unknown ZUNION/INTER aggregate type"); + } +} + +static size_t zsetDictGetMaxElementLength(dict *d, size_t *totallen) { + dictIterator *di; + dictEntry *de; + size_t maxelelen = 0; + + di = dictGetIterator(d); + + while((de = dictNext(di)) != NULL) { + sds ele = dictGetKey(de); + if (sdslen(ele) > maxelelen) maxelelen = sdslen(ele); + if (totallen) + (*totallen) += sdslen(ele); + } + + dictReleaseIterator(di); + + return maxelelen; +} + +static void zdiffAlgorithm1(zsetopsrc *src, long setnum, zset *dstzset, size_t *maxelelen, size_t *totelelen) { + /* DIFF Algorithm 1: + * + * We perform the diff by iterating all the elements of the first set, + * and only adding it to the target set if the element does not exist + * into all the other sets. + * + * This way we perform at max N*M operations, where N is the size of + * the first set, and M the number of sets. + * + * There is also a O(K*log(K)) cost for adding the resulting elements + * to the target set, where K is the final size of the target set. + * + * The final complexity of this algorithm is O(N*M + K*log(K)). */ + int j; + zsetopval zval; + zskiplistNode *znode; + sds tmp; + + /* With algorithm 1 it is better to order the sets to subtract + * by decreasing size, so that we are more likely to find + * duplicated elements ASAP. */ + qsort(src+1,setnum-1,sizeof(zsetopsrc),zuiCompareByRevCardinality); + + memset(&zval, 0, sizeof(zval)); + zuiInitIterator(&src[0]); + while (zuiNext(&src[0],&zval)) { + double value; + int exists = 0; + + for (j = 1; j < setnum; j++) { + /* It is not safe to access the zset we are + * iterating, so explicitly check for equal object. + * This check isn't really needed anymore since we already + * check for a duplicate set in the zsetChooseDiffAlgorithm + * function, but we're leaving it for future-proofing. */ + if (src[j].subject == src[0].subject || + zuiFind(&src[j],&zval,&value)) { + exists = 1; + break; + } + } + + if (!exists) { + tmp = zuiNewSdsFromValue(&zval); + znode = zslInsert(dstzset->zsl,zval.score,tmp); + dictAdd(dstzset->dict,tmp,&znode->score); + if (sdslen(tmp) > *maxelelen) *maxelelen = sdslen(tmp); + (*totelelen) += sdslen(tmp); + } + } + zuiClearIterator(&src[0]); +} + + +static void zdiffAlgorithm2(zsetopsrc *src, long setnum, zset *dstzset, size_t *maxelelen, size_t *totelelen) { + /* DIFF Algorithm 2: + * + * Add all the elements of the first set to the auxiliary set. + * Then remove all the elements of all the next sets from it. + * + + * This is O(L + (N-K)log(N)) where L is the sum of all the elements in every + * set, N is the size of the first set, and K is the size of the result set. + * + * Note that from the (L-N) dict searches, (N-K) got to the zsetRemoveFromSkiplist + * which costs log(N) + * + * There is also a O(K) cost at the end for finding the largest element + * size, but this doesn't change the algorithm complexity since K < L, and + * O(2L) is the same as O(L). */ + int j; + int cardinality = 0; + zsetopval zval; + zskiplistNode *znode; + sds tmp; + + for (j = 0; j < setnum; j++) { + if (zuiLength(&src[j]) == 0) continue; + + memset(&zval, 0, sizeof(zval)); + zuiInitIterator(&src[j]); + while (zuiNext(&src[j],&zval)) { + if (j == 0) { + tmp = zuiNewSdsFromValue(&zval); + znode = zslInsert(dstzset->zsl,zval.score,tmp); + dictAdd(dstzset->dict,tmp,&znode->score); + cardinality++; + } else { + tmp = zuiSdsFromValue(&zval); + if (zsetRemoveFromSkiplist(dstzset, tmp)) { + cardinality--; + } + } + + /* Exit if result set is empty as any additional removal + * of elements will have no effect. */ + if (cardinality == 0) break; + } + zuiClearIterator(&src[j]); + + if (cardinality == 0) break; + } + + /* Resize dict if needed after removing multiple elements */ + if (htNeedsResize(dstzset->dict)) dictResize(dstzset->dict); + + /* Using this algorithm, we can't calculate the max element as we go, + * we have to iterate through all elements to find the max one after. */ + *maxelelen = zsetDictGetMaxElementLength(dstzset->dict, totelelen); +} + +static int zsetChooseDiffAlgorithm(zsetopsrc *src, long setnum) { + int j; + + /* Select what DIFF algorithm to use. + * + * Algorithm 1 is O(N*M + K*log(K)) where N is the size of the + * first set, M the total number of sets, and K is the size of the + * result set. + * + * Algorithm 2 is O(L + (N-K)log(N)) where L is the total number of elements + * in all the sets, N is the size of the first set, and K is the size of the + * result set. + * + * We compute what is the best bet with the current input here. */ + long long algo_one_work = 0; + long long algo_two_work = 0; + + for (j = 0; j < setnum; j++) { + /* If any other set is equal to the first set, there is nothing to be + * done, since we would remove all elements anyway. */ + if (j > 0 && src[0].subject == src[j].subject) { + return 0; + } + + algo_one_work += zuiLength(&src[0]); + algo_two_work += zuiLength(&src[j]); + } + + /* Algorithm 1 has better constant times and performs less operations + * if there are elements in common. Give it some advantage. */ + algo_one_work /= 2; + return (algo_one_work <= algo_two_work) ? 1 : 2; +} + +static void zdiff(zsetopsrc *src, long setnum, zset *dstzset, size_t *maxelelen, size_t *totelelen) { + /* Skip everything if the smallest input is empty. */ + if (zuiLength(&src[0]) > 0) { + int diff_algo = zsetChooseDiffAlgorithm(src, setnum); + if (diff_algo == 1) { + zdiffAlgorithm1(src, setnum, dstzset, maxelelen, totelelen); + } else if (diff_algo == 2) { + zdiffAlgorithm2(src, setnum, dstzset, maxelelen, totelelen); + } else if (diff_algo != 0) { + serverPanic("Unknown algorithm"); + } + } +} + +dictType setAccumulatorDictType = { + dictSdsHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictSdsKeyCompare, /* key compare */ + NULL, /* key destructor */ + NULL, /* val destructor */ + NULL /* allow to expand */ +}; + +/* The zunionInterDiffGenericCommand() function is called in order to implement the + * following commands: ZUNION, ZINTER, ZDIFF, ZUNIONSTORE, ZINTERSTORE, ZDIFFSTORE, + * ZINTERCARD. + * + * 'numkeysIndex' parameter position of key number. for ZUNION/ZINTER/ZDIFF command, + * this value is 1, for ZUNIONSTORE/ZINTERSTORE/ZDIFFSTORE command, this value is 2. + * + * 'op' SET_OP_INTER, SET_OP_UNION or SET_OP_DIFF. + * + * 'cardinality_only' is currently only applicable when 'op' is SET_OP_INTER. + * Work for SINTERCARD, only return the cardinality with minimum processing and memory overheads. + */ +void zunionInterDiffGenericCommand(client *c, robj *dstkey, int numkeysIndex, int op, + int cardinality_only) { + int i, j; + long setnum; + int aggregate = REDIS_AGGR_SUM; + zsetopsrc *src; + zsetopval zval; + sds tmp; + size_t maxelelen = 0, totelelen = 0; + robj *dstobj; + zset *dstzset; + zskiplistNode *znode; + int withscores = 0; + unsigned long cardinality = 0; + long limit = 0; /* Stop searching after reaching the limit. 0 means unlimited. */ + + /* expect setnum input keys to be given */ + if ((getLongFromObjectOrReply(c, c->argv[numkeysIndex], &setnum, NULL) != C_OK)) + return; + + if (setnum < 1) { + addReplyErrorFormat(c, + "at least 1 input key is needed for %s", c->cmd->name); + return; + } + + /* test if the expected number of keys would overflow */ + if (setnum > (c->argc-(numkeysIndex+1))) { + addReplyErrorObject(c,shared.syntaxerr); + return; + } + + /* read keys to be used for input */ + src = zcalloc(sizeof(zsetopsrc) * setnum); + for (i = 0, j = numkeysIndex+1; i < setnum; i++, j++) { + robj *obj = lookupKeyRead(c->db, c->argv[j]); + if (obj != NULL) { + if (obj->type != OBJ_ZSET && obj->type != OBJ_SET) { + zfree(src); + addReplyErrorObject(c,shared.wrongtypeerr); + return; + } + + src[i].subject = obj; + src[i].type = obj->type; + src[i].encoding = obj->encoding; + } else { + src[i].subject = NULL; + } + + /* Default all weights to 1. */ + src[i].weight = 1.0; + } + + /* parse optional extra arguments */ + if (j < c->argc) { + int remaining = c->argc - j; + + while (remaining) { + if (op != SET_OP_DIFF && !cardinality_only && + remaining >= (setnum + 1) && + !strcasecmp(c->argv[j]->ptr,"weights")) + { + j++; remaining--; + for (i = 0; i < setnum; i++, j++, remaining--) { + if (getDoubleFromObjectOrReply(c,c->argv[j],&src[i].weight, + "weight value is not a float") != C_OK) + { + zfree(src); + return; + } + } + } else if (op != SET_OP_DIFF && !cardinality_only && + remaining >= 2 && + !strcasecmp(c->argv[j]->ptr,"aggregate")) + { + j++; remaining--; + if (!strcasecmp(c->argv[j]->ptr,"sum")) { + aggregate = REDIS_AGGR_SUM; + } else if (!strcasecmp(c->argv[j]->ptr,"min")) { + aggregate = REDIS_AGGR_MIN; + } else if (!strcasecmp(c->argv[j]->ptr,"max")) { + aggregate = REDIS_AGGR_MAX; + } else { + zfree(src); + addReplyErrorObject(c,shared.syntaxerr); + return; + } + j++; remaining--; + } else if (remaining >= 1 && + !dstkey && !cardinality_only && + !strcasecmp(c->argv[j]->ptr,"withscores")) + { + j++; remaining--; + withscores = 1; + } else if (cardinality_only && remaining >= 2 && + !strcasecmp(c->argv[j]->ptr, "limit")) + { + j++; remaining--; + if (getPositiveLongFromObjectOrReply(c, c->argv[j], &limit, + "LIMIT can't be negative") != C_OK) + { + zfree(src); + return; + } + j++; remaining--; + } else { + zfree(src); + addReplyErrorObject(c,shared.syntaxerr); + return; + } + } + } + + if (op != SET_OP_DIFF) { + /* sort sets from the smallest to largest, this will improve our + * algorithm's performance */ + qsort(src,setnum,sizeof(zsetopsrc),zuiCompareByCardinality); + } + + dstobj = createZsetObject(); + dstzset = dstobj->ptr; + memset(&zval, 0, sizeof(zval)); + + if (op == SET_OP_INTER) { + /* Skip everything if the smallest input is empty. */ + if (zuiLength(&src[0]) > 0) { + /* Precondition: as src[0] is non-empty and the inputs are ordered + * by size, all src[i > 0] are non-empty too. */ + zuiInitIterator(&src[0]); + while (zuiNext(&src[0],&zval)) { + double score, value; + + score = src[0].weight * zval.score; + if (isnan(score)) score = 0; + + for (j = 1; j < setnum; j++) { + /* It is not safe to access the zset we are + * iterating, so explicitly check for equal object. */ + if (src[j].subject == src[0].subject) { + value = zval.score*src[j].weight; + zunionInterAggregate(&score,value,aggregate); + } else if (zuiFind(&src[j],&zval,&value)) { + value *= src[j].weight; + zunionInterAggregate(&score,value,aggregate); + } else { + break; + } + } + + /* Only continue when present in every input. */ + if (j == setnum && cardinality_only) { + cardinality++; + + /* We stop the searching after reaching the limit. */ + if (limit && cardinality >= (unsigned long)limit) { + /* Cleanup before we break the zuiNext loop. */ + zuiDiscardDirtyValue(&zval); + break; + } + } else if (j == setnum) { + tmp = zuiNewSdsFromValue(&zval); + znode = zslInsert(dstzset->zsl,score,tmp); + dictAdd(dstzset->dict,tmp,&znode->score); + totelelen += sdslen(tmp); + if (sdslen(tmp) > maxelelen) maxelelen = sdslen(tmp); + } + } + zuiClearIterator(&src[0]); + } + } else if (op == SET_OP_UNION) { + dict *accumulator = dictCreate(&setAccumulatorDictType); + dictIterator *di; + dictEntry *de, *existing; + double score; + + if (setnum) { + /* Our union is at least as large as the largest set. + * Resize the dictionary ASAP to avoid useless rehashing. */ + dictExpand(accumulator,zuiLength(&src[setnum-1])); + } + + /* Step 1: Create a dictionary of elements -> aggregated-scores + * by iterating one sorted set after the other. */ + for (i = 0; i < setnum; i++) { + if (zuiLength(&src[i]) == 0) continue; + + zuiInitIterator(&src[i]); + while (zuiNext(&src[i],&zval)) { + /* Initialize value */ + score = src[i].weight * zval.score; + if (isnan(score)) score = 0; + + /* Search for this element in the accumulating dictionary. */ + de = dictAddRaw(accumulator,zuiSdsFromValue(&zval),&existing); + /* If we don't have it, we need to create a new entry. */ + if (!existing) { + tmp = zuiNewSdsFromValue(&zval); + /* Remember the longest single element encountered, + * to understand if it's possible to convert to listpack + * at the end. */ + totelelen += sdslen(tmp); + if (sdslen(tmp) > maxelelen) maxelelen = sdslen(tmp); + /* Update the element with its initial score. */ + dictSetKey(accumulator, de, tmp); + dictSetDoubleVal(de,score); + } else { + /* Update the score with the score of the new instance + * of the element found in the current sorted set. + * + * Here we access directly the dictEntry double + * value inside the union as it is a big speedup + * compared to using the getDouble/setDouble API. */ + zunionInterAggregate(&existing->v.d,score,aggregate); + } + } + zuiClearIterator(&src[i]); + } + + /* Step 2: convert the dictionary into the final sorted set. */ + di = dictGetIterator(accumulator); + + /* We now are aware of the final size of the resulting sorted set, + * let's resize the dictionary embedded inside the sorted set to the + * right size, in order to save rehashing time. */ + dictExpand(dstzset->dict,dictSize(accumulator)); + + while((de = dictNext(di)) != NULL) { + sds ele = dictGetKey(de); + score = dictGetDoubleVal(de); + znode = zslInsert(dstzset->zsl,score,ele); + dictAdd(dstzset->dict,ele,&znode->score); + } + dictReleaseIterator(di); + dictRelease(accumulator); + } else if (op == SET_OP_DIFF) { + zdiff(src, setnum, dstzset, &maxelelen, &totelelen); + } else { + serverPanic("Unknown operator"); + } + + if (dstkey) { + if (dstzset->zsl->length) { + zsetConvertToListpackIfNeeded(dstobj, maxelelen, totelelen); + setKey(c, c->db, dstkey, dstobj, 0); + addReplyLongLong(c, zsetLength(dstobj)); + notifyKeyspaceEvent(NOTIFY_ZSET, + (op == SET_OP_UNION) ? "zunionstore" : + (op == SET_OP_INTER ? "zinterstore" : "zdiffstore"), + dstkey, c->db->id); + server.dirty++; + } else { + addReply(c, shared.czero); + if (dbDelete(c->db, dstkey)) { + signalModifiedKey(c, c->db, dstkey); + notifyKeyspaceEvent(NOTIFY_GENERIC, "del", dstkey, c->db->id); + server.dirty++; + } + } + } else if (cardinality_only) { + addReplyLongLong(c, cardinality); + } else { + unsigned long length = dstzset->zsl->length; + zskiplist *zsl = dstzset->zsl; + zskiplistNode *zn = zsl->header->level[0].forward; + /* In case of WITHSCORES, respond with a single array in RESP2, and + * nested arrays in RESP3. We can't use a map response type since the + * client library needs to know to respect the order. */ + if (withscores && c->resp == 2) + addReplyArrayLen(c, length*2); + else + addReplyArrayLen(c, length); + + while (zn != NULL) { + if (withscores && c->resp > 2) addReplyArrayLen(c,2); + addReplyBulkCBuffer(c,zn->ele,sdslen(zn->ele)); + if (withscores) addReplyDouble(c,zn->score); + zn = zn->level[0].forward; + } + } + decrRefCount(dstobj); + zfree(src); +} + +/* ZUNIONSTORE destination numkeys key [key ...] [WEIGHTS weight] [AGGREGATE SUM|MIN|MAX] */ +void zunionstoreCommand(client *c) { + zunionInterDiffGenericCommand(c, c->argv[1], 2, SET_OP_UNION, 0); +} + +/* ZINTERSTORE destination numkeys key [key ...] [WEIGHTS weight] [AGGREGATE SUM|MIN|MAX] */ +void zinterstoreCommand(client *c) { + zunionInterDiffGenericCommand(c, c->argv[1], 2, SET_OP_INTER, 0); +} + +/* ZDIFFSTORE destination numkeys key [key ...] */ +void zdiffstoreCommand(client *c) { + zunionInterDiffGenericCommand(c, c->argv[1], 2, SET_OP_DIFF, 0); +} + +/* ZUNION numkeys key [key ...] [WEIGHTS weight] [AGGREGATE SUM|MIN|MAX] [WITHSCORES] */ +void zunionCommand(client *c) { + zunionInterDiffGenericCommand(c, NULL, 1, SET_OP_UNION, 0); +} + +/* ZINTER numkeys key [key ...] [WEIGHTS weight] [AGGREGATE SUM|MIN|MAX] [WITHSCORES] */ +void zinterCommand(client *c) { + zunionInterDiffGenericCommand(c, NULL, 1, SET_OP_INTER, 0); +} + +/* ZINTERCARD numkeys key [key ...] [LIMIT limit] */ +void zinterCardCommand(client *c) { + zunionInterDiffGenericCommand(c, NULL, 1, SET_OP_INTER, 1); +} + +/* ZDIFF numkeys key [key ...] [WITHSCORES] */ +void zdiffCommand(client *c) { + zunionInterDiffGenericCommand(c, NULL, 1, SET_OP_DIFF, 0); +} + +typedef enum { + ZRANGE_DIRECTION_AUTO = 0, + ZRANGE_DIRECTION_FORWARD, + ZRANGE_DIRECTION_REVERSE +} zrange_direction; + +typedef enum { + ZRANGE_CONSUMER_TYPE_CLIENT = 0, + ZRANGE_CONSUMER_TYPE_INTERNAL +} zrange_consumer_type; + +typedef struct zrange_result_handler zrange_result_handler; + +typedef void (*zrangeResultBeginFunction)(zrange_result_handler *c); +typedef void (*zrangeResultFinalizeFunction)( + zrange_result_handler *c, size_t result_count); +typedef void (*zrangeResultEmitCBufferFunction)( + zrange_result_handler *c, const void *p, size_t len, double score); +typedef void (*zrangeResultEmitLongLongFunction)( + zrange_result_handler *c, long long ll, double score); + +void zrangeGenericCommand (zrange_result_handler *handler, int argc_start, int store, + zrange_type rangetype, zrange_direction direction); + +/* Interface struct for ZRANGE/ZRANGESTORE generic implementation. + * There is one implementation of this interface that sends a RESP reply to clients. + * and one implementation that stores the range result into a zset object. */ +struct zrange_result_handler { + zrange_consumer_type type; + client *client; + robj *dstkey; + robj *dstobj; + void *userdata; + int withscores; + int should_emit_array_length; + zrangeResultBeginFunction beginResultEmission; + zrangeResultFinalizeFunction finalizeResultEmission; + zrangeResultEmitCBufferFunction emitResultFromCBuffer; + zrangeResultEmitLongLongFunction emitResultFromLongLong; +}; + +/* Result handler methods for responding the ZRANGE to clients. */ +static void zrangeResultBeginClient(zrange_result_handler *handler) { + handler->userdata = addReplyDeferredLen(handler->client); +} + +static void zrangeResultEmitCBufferToClient(zrange_result_handler *handler, + const void *value, size_t value_length_in_bytes, double score) +{ + if (handler->should_emit_array_length) { + addReplyArrayLen(handler->client, 2); + } + + addReplyBulkCBuffer(handler->client, value, value_length_in_bytes); + + if (handler->withscores) { + addReplyDouble(handler->client, score); + } +} + +static void zrangeResultEmitLongLongToClient(zrange_result_handler *handler, + long long value, double score) +{ + if (handler->should_emit_array_length) { + addReplyArrayLen(handler->client, 2); + } + + addReplyBulkLongLong(handler->client, value); + + if (handler->withscores) { + addReplyDouble(handler->client, score); + } +} + +static void zrangeResultFinalizeClient(zrange_result_handler *handler, + size_t result_count) +{ + /* In case of WITHSCORES, respond with a single array in RESP2, and + * nested arrays in RESP3. We can't use a map response type since the + * client library needs to know to respect the order. */ + if (handler->withscores && (handler->client->resp == 2)) { + result_count *= 2; + } + + setDeferredArrayLen(handler->client, handler->userdata, result_count); +} + +/* Result handler methods for storing the ZRANGESTORE to a zset. */ +static void zrangeResultBeginStore(zrange_result_handler *handler) +{ + handler->dstobj = createZsetListpackObject(); +} + +static void zrangeResultEmitCBufferForStore(zrange_result_handler *handler, + const void *value, size_t value_length_in_bytes, double score) +{ + double newscore; + int retflags = 0; + sds ele = sdsnewlen(value, value_length_in_bytes); + int retval = zsetAdd(handler->dstobj, score, ele, ZADD_IN_NONE, &retflags, &newscore); + sdsfree(ele); + serverAssert(retval); +} + +static void zrangeResultEmitLongLongForStore(zrange_result_handler *handler, + long long value, double score) +{ + double newscore; + int retflags = 0; + sds ele = sdsfromlonglong(value); + int retval = zsetAdd(handler->dstobj, score, ele, ZADD_IN_NONE, &retflags, &newscore); + sdsfree(ele); + serverAssert(retval); +} + +static void zrangeResultFinalizeStore(zrange_result_handler *handler, size_t result_count) +{ + if (result_count) { + setKey(handler->client, handler->client->db, handler->dstkey, handler->dstobj, 0); + addReplyLongLong(handler->client, result_count); + notifyKeyspaceEvent(NOTIFY_ZSET, "zrangestore", handler->dstkey, handler->client->db->id); + server.dirty++; + } else { + addReply(handler->client, shared.czero); + if (dbDelete(handler->client->db, handler->dstkey)) { + signalModifiedKey(handler->client, handler->client->db, handler->dstkey); + notifyKeyspaceEvent(NOTIFY_GENERIC, "del", handler->dstkey, handler->client->db->id); + server.dirty++; + } + } + decrRefCount(handler->dstobj); +} + +/* Initialize the consumer interface type with the requested type. */ +static void zrangeResultHandlerInit(zrange_result_handler *handler, + client *client, zrange_consumer_type type) +{ + memset(handler, 0, sizeof(*handler)); + + handler->client = client; + + switch (type) { + case ZRANGE_CONSUMER_TYPE_CLIENT: + handler->beginResultEmission = zrangeResultBeginClient; + handler->finalizeResultEmission = zrangeResultFinalizeClient; + handler->emitResultFromCBuffer = zrangeResultEmitCBufferToClient; + handler->emitResultFromLongLong = zrangeResultEmitLongLongToClient; + break; + + case ZRANGE_CONSUMER_TYPE_INTERNAL: + handler->beginResultEmission = zrangeResultBeginStore; + handler->finalizeResultEmission = zrangeResultFinalizeStore; + handler->emitResultFromCBuffer = zrangeResultEmitCBufferForStore; + handler->emitResultFromLongLong = zrangeResultEmitLongLongForStore; + break; + } +} + +static void zrangeResultHandlerScoreEmissionEnable(zrange_result_handler *handler) { + handler->withscores = 1; + handler->should_emit_array_length = (handler->client->resp > 2); +} + +static void zrangeResultHandlerDestinationKeySet (zrange_result_handler *handler, + robj *dstkey) +{ + handler->dstkey = dstkey; +} + +/* This command implements ZRANGE, ZREVRANGE. */ +void genericZrangebyrankCommand(zrange_result_handler *handler, + robj *zobj, long start, long end, int withscores, int reverse) { + + client *c = handler->client; + long llen; + long rangelen; + size_t result_cardinality; + + /* Sanitize indexes. */ + llen = zsetLength(zobj); + if (start < 0) start = llen+start; + if (end < 0) end = llen+end; + if (start < 0) start = 0; + + handler->beginResultEmission(handler); + + /* Invariant: start >= 0, so this test will be true when end < 0. + * The range is empty when start > end or start >= length. */ + if (start > end || start >= llen) { + handler->finalizeResultEmission(handler, 0); + return; + } + if (end >= llen) end = llen-1; + rangelen = (end-start)+1; + result_cardinality = rangelen; + + if (zobj->encoding == OBJ_ENCODING_LISTPACK) { + unsigned char *zl = zobj->ptr; + unsigned char *eptr, *sptr; + unsigned char *vstr; + unsigned int vlen; + long long vlong; + double score = 0.0; + + if (reverse) + eptr = lpSeek(zl,-2-(2*start)); + else + eptr = lpSeek(zl,2*start); + + serverAssertWithInfo(c,zobj,eptr != NULL); + sptr = lpNext(zl,eptr); + + while (rangelen--) { + serverAssertWithInfo(c,zobj,eptr != NULL && sptr != NULL); + vstr = lpGetValue(eptr,&vlen,&vlong); + + if (withscores) /* don't bother to extract the score if it's gonna be ignored. */ + score = zzlGetScore(sptr); + + if (vstr == NULL) { + handler->emitResultFromLongLong(handler, vlong, score); + } else { + handler->emitResultFromCBuffer(handler, vstr, vlen, score); + } + + if (reverse) + zzlPrev(zl,&eptr,&sptr); + else + zzlNext(zl,&eptr,&sptr); + } + + } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { + zset *zs = zobj->ptr; + zskiplist *zsl = zs->zsl; + zskiplistNode *ln; + + /* Check if starting point is trivial, before doing log(N) lookup. */ + if (reverse) { + ln = zsl->tail; + if (start > 0) + ln = zslGetElementByRank(zsl,llen-start); + } else { + ln = zsl->header->level[0].forward; + if (start > 0) + ln = zslGetElementByRank(zsl,start+1); + } + + while(rangelen--) { + serverAssertWithInfo(c,zobj,ln != NULL); + sds ele = ln->ele; + handler->emitResultFromCBuffer(handler, ele, sdslen(ele), ln->score); + ln = reverse ? ln->backward : ln->level[0].forward; + } + } else { + serverPanic("Unknown sorted set encoding"); + } + + handler->finalizeResultEmission(handler, result_cardinality); +} + +/* ZRANGESTORE [BYSCORE | BYLEX] [REV] [LIMIT offset count] */ +void zrangestoreCommand (client *c) { + robj *dstkey = c->argv[1]; + zrange_result_handler handler; + zrangeResultHandlerInit(&handler, c, ZRANGE_CONSUMER_TYPE_INTERNAL); + zrangeResultHandlerDestinationKeySet(&handler, dstkey); + zrangeGenericCommand(&handler, 2, 1, ZRANGE_AUTO, ZRANGE_DIRECTION_AUTO); +} + +/* ZRANGE [BYSCORE | BYLEX] [REV] [WITHSCORES] [LIMIT offset count] */ +void zrangeCommand(client *c) { + zrange_result_handler handler; + zrangeResultHandlerInit(&handler, c, ZRANGE_CONSUMER_TYPE_CLIENT); + zrangeGenericCommand(&handler, 1, 0, ZRANGE_AUTO, ZRANGE_DIRECTION_AUTO); +} + +/* ZREVRANGE [WITHSCORES] */ +void zrevrangeCommand(client *c) { + zrange_result_handler handler; + zrangeResultHandlerInit(&handler, c, ZRANGE_CONSUMER_TYPE_CLIENT); + zrangeGenericCommand(&handler, 1, 0, ZRANGE_RANK, ZRANGE_DIRECTION_REVERSE); +} + +/* This command implements ZRANGEBYSCORE, ZREVRANGEBYSCORE. */ +void genericZrangebyscoreCommand(zrange_result_handler *handler, + zrangespec *range, robj *zobj, long offset, long limit, + int reverse) { + unsigned long rangelen = 0; + + handler->beginResultEmission(handler); + + /* For invalid offset, return directly. */ + if (offset > 0 && offset >= (long)zsetLength(zobj)) { + handler->finalizeResultEmission(handler, 0); + return; + } + + if (zobj->encoding == OBJ_ENCODING_LISTPACK) { + unsigned char *zl = zobj->ptr; + unsigned char *eptr, *sptr; + unsigned char *vstr; + unsigned int vlen; + long long vlong; + + /* If reversed, get the last node in range as starting point. */ + if (reverse) { + eptr = zzlLastInRange(zl,range); + } else { + eptr = zzlFirstInRange(zl,range); + } + + /* Get score pointer for the first element. */ + if (eptr) + sptr = lpNext(zl,eptr); + + /* If there is an offset, just traverse the number of elements without + * checking the score because that is done in the next loop. */ + while (eptr && offset--) { + if (reverse) { + zzlPrev(zl,&eptr,&sptr); + } else { + zzlNext(zl,&eptr,&sptr); + } + } + + while (eptr && limit--) { + double score = zzlGetScore(sptr); + + /* Abort when the node is no longer in range. */ + if (reverse) { + if (!zslValueGteMin(score,range)) break; + } else { + if (!zslValueLteMax(score,range)) break; + } + + vstr = lpGetValue(eptr,&vlen,&vlong); + rangelen++; + if (vstr == NULL) { + handler->emitResultFromLongLong(handler, vlong, score); + } else { + handler->emitResultFromCBuffer(handler, vstr, vlen, score); + } + + /* Move to next node */ + if (reverse) { + zzlPrev(zl,&eptr,&sptr); + } else { + zzlNext(zl,&eptr,&sptr); + } + } + } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { + zset *zs = zobj->ptr; + zskiplist *zsl = zs->zsl; + zskiplistNode *ln; + + /* If reversed, get the last node in range as starting point. */ + if (reverse) { + ln = zslLastInRange(zsl,range); + } else { + ln = zslFirstInRange(zsl,range); + } + + /* If there is an offset, just traverse the number of elements without + * checking the score because that is done in the next loop. */ + while (ln && offset--) { + if (reverse) { + ln = ln->backward; + } else { + ln = ln->level[0].forward; + } + } + + while (ln && limit--) { + /* Abort when the node is no longer in range. */ + if (reverse) { + if (!zslValueGteMin(ln->score,range)) break; + } else { + if (!zslValueLteMax(ln->score,range)) break; + } + + rangelen++; + handler->emitResultFromCBuffer(handler, ln->ele, sdslen(ln->ele), ln->score); + + /* Move to next node */ + if (reverse) { + ln = ln->backward; + } else { + ln = ln->level[0].forward; + } + } + } else { + serverPanic("Unknown sorted set encoding"); + } + + handler->finalizeResultEmission(handler, rangelen); +} + +/* ZRANGEBYSCORE [WITHSCORES] [LIMIT offset count] */ +void zrangebyscoreCommand(client *c) { + zrange_result_handler handler; + zrangeResultHandlerInit(&handler, c, ZRANGE_CONSUMER_TYPE_CLIENT); + zrangeGenericCommand(&handler, 1, 0, ZRANGE_SCORE, ZRANGE_DIRECTION_FORWARD); +} + +/* ZREVRANGEBYSCORE [WITHSCORES] [LIMIT offset count] */ +void zrevrangebyscoreCommand(client *c) { + zrange_result_handler handler; + zrangeResultHandlerInit(&handler, c, ZRANGE_CONSUMER_TYPE_CLIENT); + zrangeGenericCommand(&handler, 1, 0, ZRANGE_SCORE, ZRANGE_DIRECTION_REVERSE); +} + +void zcountCommand(client *c) { + robj *key = c->argv[1]; + robj *zobj; + zrangespec range; + unsigned long count = 0; + + /* Parse the range arguments */ + if (zslParseRange(c->argv[2],c->argv[3],&range) != C_OK) { + addReplyError(c,"min or max is not a float"); + return; + } + + /* Lookup the sorted set */ + if ((zobj = lookupKeyReadOrReply(c, key, shared.czero)) == NULL || + checkType(c, zobj, OBJ_ZSET)) return; + + if (zobj->encoding == OBJ_ENCODING_LISTPACK) { + unsigned char *zl = zobj->ptr; + unsigned char *eptr, *sptr; + double score; + + /* Use the first element in range as the starting point */ + eptr = zzlFirstInRange(zl,&range); + + /* No "first" element */ + if (eptr == NULL) { + addReply(c, shared.czero); + return; + } + + /* First element is in range */ + sptr = lpNext(zl,eptr); + score = zzlGetScore(sptr); + serverAssertWithInfo(c,zobj,zslValueLteMax(score,&range)); + + /* Iterate over elements in range */ + while (eptr) { + score = zzlGetScore(sptr); + + /* Abort when the node is no longer in range. */ + if (!zslValueLteMax(score,&range)) { + break; + } else { + count++; + zzlNext(zl,&eptr,&sptr); + } + } + } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { + zset *zs = zobj->ptr; + zskiplist *zsl = zs->zsl; + zskiplistNode *zn; + unsigned long rank; + + /* Find first element in range */ + zn = zslFirstInRange(zsl, &range); + + /* Use rank of first element, if any, to determine preliminary count */ + if (zn != NULL) { + rank = zslGetRank(zsl, zn->score, zn->ele); + count = (zsl->length - (rank - 1)); + + /* Find last element in range */ + zn = zslLastInRange(zsl, &range); + + /* Use rank of last element, if any, to determine the actual count */ + if (zn != NULL) { + rank = zslGetRank(zsl, zn->score, zn->ele); + count -= (zsl->length - rank); + } + } + } else { + serverPanic("Unknown sorted set encoding"); + } + + addReplyLongLong(c, count); +} + +void zlexcountCommand(client *c) { + robj *key = c->argv[1]; + robj *zobj; + zlexrangespec range; + unsigned long count = 0; + + /* Parse the range arguments */ + if (zslParseLexRange(c->argv[2],c->argv[3],&range) != C_OK) { + addReplyError(c,"min or max not valid string range item"); + return; + } + + /* Lookup the sorted set */ + if ((zobj = lookupKeyReadOrReply(c, key, shared.czero)) == NULL || + checkType(c, zobj, OBJ_ZSET)) + { + zslFreeLexRange(&range); + return; + } + + if (zobj->encoding == OBJ_ENCODING_LISTPACK) { + unsigned char *zl = zobj->ptr; + unsigned char *eptr, *sptr; + + /* Use the first element in range as the starting point */ + eptr = zzlFirstInLexRange(zl,&range); + + /* No "first" element */ + if (eptr == NULL) { + zslFreeLexRange(&range); + addReply(c, shared.czero); + return; + } + + /* First element is in range */ + sptr = lpNext(zl,eptr); + serverAssertWithInfo(c,zobj,zzlLexValueLteMax(eptr,&range)); + + /* Iterate over elements in range */ + while (eptr) { + /* Abort when the node is no longer in range. */ + if (!zzlLexValueLteMax(eptr,&range)) { + break; + } else { + count++; + zzlNext(zl,&eptr,&sptr); + } + } + } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { + zset *zs = zobj->ptr; + zskiplist *zsl = zs->zsl; + zskiplistNode *zn; + unsigned long rank; + + /* Find first element in range */ + zn = zslFirstInLexRange(zsl, &range); + + /* Use rank of first element, if any, to determine preliminary count */ + if (zn != NULL) { + rank = zslGetRank(zsl, zn->score, zn->ele); + count = (zsl->length - (rank - 1)); + + /* Find last element in range */ + zn = zslLastInLexRange(zsl, &range); + + /* Use rank of last element, if any, to determine the actual count */ + if (zn != NULL) { + rank = zslGetRank(zsl, zn->score, zn->ele); + count -= (zsl->length - rank); + } + } + } else { + serverPanic("Unknown sorted set encoding"); + } + + zslFreeLexRange(&range); + addReplyLongLong(c, count); +} + +/* This command implements ZRANGEBYLEX, ZREVRANGEBYLEX. */ +void genericZrangebylexCommand(zrange_result_handler *handler, + zlexrangespec *range, robj *zobj, int withscores, long offset, long limit, + int reverse) +{ + unsigned long rangelen = 0; + + handler->beginResultEmission(handler); + + if (zobj->encoding == OBJ_ENCODING_LISTPACK) { + unsigned char *zl = zobj->ptr; + unsigned char *eptr, *sptr; + unsigned char *vstr; + unsigned int vlen; + long long vlong; + + /* If reversed, get the last node in range as starting point. */ + if (reverse) { + eptr = zzlLastInLexRange(zl,range); + } else { + eptr = zzlFirstInLexRange(zl,range); + } + + /* Get score pointer for the first element. */ + if (eptr) + sptr = lpNext(zl,eptr); + + /* If there is an offset, just traverse the number of elements without + * checking the score because that is done in the next loop. */ + while (eptr && offset--) { + if (reverse) { + zzlPrev(zl,&eptr,&sptr); + } else { + zzlNext(zl,&eptr,&sptr); + } + } + + while (eptr && limit--) { + double score = 0; + if (withscores) /* don't bother to extract the score if it's gonna be ignored. */ + score = zzlGetScore(sptr); + + /* Abort when the node is no longer in range. */ + if (reverse) { + if (!zzlLexValueGteMin(eptr,range)) break; + } else { + if (!zzlLexValueLteMax(eptr,range)) break; + } + + vstr = lpGetValue(eptr,&vlen,&vlong); + rangelen++; + if (vstr == NULL) { + handler->emitResultFromLongLong(handler, vlong, score); + } else { + handler->emitResultFromCBuffer(handler, vstr, vlen, score); + } + + /* Move to next node */ + if (reverse) { + zzlPrev(zl,&eptr,&sptr); + } else { + zzlNext(zl,&eptr,&sptr); + } + } + } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { + zset *zs = zobj->ptr; + zskiplist *zsl = zs->zsl; + zskiplistNode *ln; + + /* If reversed, get the last node in range as starting point. */ + if (reverse) { + ln = zslLastInLexRange(zsl,range); + } else { + ln = zslFirstInLexRange(zsl,range); + } + + /* If there is an offset, just traverse the number of elements without + * checking the score because that is done in the next loop. */ + while (ln && offset--) { + if (reverse) { + ln = ln->backward; + } else { + ln = ln->level[0].forward; + } + } + + while (ln && limit--) { + /* Abort when the node is no longer in range. */ + if (reverse) { + if (!zslLexValueGteMin(ln->ele,range)) break; + } else { + if (!zslLexValueLteMax(ln->ele,range)) break; + } + + rangelen++; + handler->emitResultFromCBuffer(handler, ln->ele, sdslen(ln->ele), ln->score); + + /* Move to next node */ + if (reverse) { + ln = ln->backward; + } else { + ln = ln->level[0].forward; + } + } + } else { + serverPanic("Unknown sorted set encoding"); + } + + handler->finalizeResultEmission(handler, rangelen); +} + +/* ZRANGEBYLEX [LIMIT offset count] */ +void zrangebylexCommand(client *c) { + zrange_result_handler handler; + zrangeResultHandlerInit(&handler, c, ZRANGE_CONSUMER_TYPE_CLIENT); + zrangeGenericCommand(&handler, 1, 0, ZRANGE_LEX, ZRANGE_DIRECTION_FORWARD); +} + +/* ZREVRANGEBYLEX [LIMIT offset count] */ +void zrevrangebylexCommand(client *c) { + zrange_result_handler handler; + zrangeResultHandlerInit(&handler, c, ZRANGE_CONSUMER_TYPE_CLIENT); + zrangeGenericCommand(&handler, 1, 0, ZRANGE_LEX, ZRANGE_DIRECTION_REVERSE); +} + +/** + * This function handles ZRANGE and ZRANGESTORE, and also the deprecated + * Z[REV]RANGE[BYPOS|BYLEX] commands. + * + * The simple ZRANGE and ZRANGESTORE can take _AUTO in rangetype and direction, + * other command pass explicit value. + * + * The argc_start points to the src key argument, so following syntax is like: + * [BYSCORE | BYLEX] [REV] [WITHSCORES] [LIMIT offset count] + */ +void zrangeGenericCommand(zrange_result_handler *handler, int argc_start, int store, + zrange_type rangetype, zrange_direction direction) +{ + client *c = handler->client; + robj *key = c->argv[argc_start]; + robj *zobj; + zrangespec range; + zlexrangespec lexrange; + int minidx = argc_start + 1; + int maxidx = argc_start + 2; + + /* Options common to all */ + long opt_start = 0; + long opt_end = 0; + int opt_withscores = 0; + long opt_offset = 0; + long opt_limit = -1; + + /* Step 1: Skip the args and parse remaining optional arguments. */ + for (int j=argc_start + 3; j < c->argc; j++) { + int leftargs = c->argc-j-1; + if (!store && !strcasecmp(c->argv[j]->ptr,"withscores")) { + opt_withscores = 1; + } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) { + if ((getLongFromObjectOrReply(c, c->argv[j+1], &opt_offset, NULL) != C_OK) || + (getLongFromObjectOrReply(c, c->argv[j+2], &opt_limit, NULL) != C_OK)) + { + return; + } + j += 2; + } else if (direction == ZRANGE_DIRECTION_AUTO && + !strcasecmp(c->argv[j]->ptr,"rev")) + { + direction = ZRANGE_DIRECTION_REVERSE; + } else if (rangetype == ZRANGE_AUTO && + !strcasecmp(c->argv[j]->ptr,"bylex")) + { + rangetype = ZRANGE_LEX; + } else if (rangetype == ZRANGE_AUTO && + !strcasecmp(c->argv[j]->ptr,"byscore")) + { + rangetype = ZRANGE_SCORE; + } else { + addReplyErrorObject(c,shared.syntaxerr); + return; + } + } + + /* Use defaults if not overridden by arguments. */ + if (direction == ZRANGE_DIRECTION_AUTO) + direction = ZRANGE_DIRECTION_FORWARD; + if (rangetype == ZRANGE_AUTO) + rangetype = ZRANGE_RANK; + + /* Check for conflicting arguments. */ + if (opt_limit != -1 && rangetype == ZRANGE_RANK) { + addReplyError(c,"syntax error, LIMIT is only supported in combination with either BYSCORE or BYLEX"); + return; + } + if (opt_withscores && rangetype == ZRANGE_LEX) { + addReplyError(c,"syntax error, WITHSCORES not supported in combination with BYLEX"); + return; + } + + if (direction == ZRANGE_DIRECTION_REVERSE && + ((ZRANGE_SCORE == rangetype) || (ZRANGE_LEX == rangetype))) + { + /* Range is given as [max,min] */ + int tmp = maxidx; + maxidx = minidx; + minidx = tmp; + } + + /* Step 2: Parse the range. */ + switch (rangetype) { + case ZRANGE_AUTO: + case ZRANGE_RANK: + /* Z[REV]RANGE, ZRANGESTORE [REV]RANGE */ + if ((getLongFromObjectOrReply(c, c->argv[minidx], &opt_start,NULL) != C_OK) || + (getLongFromObjectOrReply(c, c->argv[maxidx], &opt_end,NULL) != C_OK)) + { + return; + } + break; + + case ZRANGE_SCORE: + /* Z[REV]RANGEBYSCORE, ZRANGESTORE [REV]RANGEBYSCORE */ + if (zslParseRange(c->argv[minidx], c->argv[maxidx], &range) != C_OK) { + addReplyError(c, "min or max is not a float"); + return; + } + break; + + case ZRANGE_LEX: + /* Z[REV]RANGEBYLEX, ZRANGESTORE [REV]RANGEBYLEX */ + if (zslParseLexRange(c->argv[minidx], c->argv[maxidx], &lexrange) != C_OK) { + addReplyError(c, "min or max not valid string range item"); + return; + } + break; + } + + if (opt_withscores || store) { + zrangeResultHandlerScoreEmissionEnable(handler); + } + + /* Step 3: Lookup the key and get the range. */ + zobj = lookupKeyRead(c->db, key); + if (zobj == NULL) { + if (store) { + handler->beginResultEmission(handler); + handler->finalizeResultEmission(handler, 0); + } else { + addReply(c,shared.emptyarray); + } + goto cleanup; + } + + if (checkType(c,zobj,OBJ_ZSET)) goto cleanup; + + /* Step 4: Pass this to the command-specific handler. */ + switch (rangetype) { + case ZRANGE_AUTO: + case ZRANGE_RANK: + genericZrangebyrankCommand(handler, zobj, opt_start, opt_end, + opt_withscores || store, direction == ZRANGE_DIRECTION_REVERSE); + break; + + case ZRANGE_SCORE: + genericZrangebyscoreCommand(handler, &range, zobj, opt_offset, + opt_limit, direction == ZRANGE_DIRECTION_REVERSE); + break; + + case ZRANGE_LEX: + genericZrangebylexCommand(handler, &lexrange, zobj, opt_withscores || store, + opt_offset, opt_limit, direction == ZRANGE_DIRECTION_REVERSE); + break; + } + + /* Instead of returning here, we'll just fall-through the clean-up. */ + +cleanup: + + if (rangetype == ZRANGE_LEX) { + zslFreeLexRange(&lexrange); + } +} + +void zcardCommand(client *c) { + robj *key = c->argv[1]; + robj *zobj; + + if ((zobj = lookupKeyReadOrReply(c,key,shared.czero)) == NULL || + checkType(c,zobj,OBJ_ZSET)) return; + + addReplyLongLong(c,zsetLength(zobj)); +} + +void zscoreCommand(client *c) { + robj *key = c->argv[1]; + robj *zobj; + double score; + + if ((zobj = lookupKeyReadOrReply(c,key,shared.null[c->resp])) == NULL || + checkType(c,zobj,OBJ_ZSET)) return; + + if (zsetScore(zobj,c->argv[2]->ptr,&score) == C_ERR) { + addReplyNull(c); + } else { + addReplyDouble(c,score); + } +} + +void zmscoreCommand(client *c) { + robj *key = c->argv[1]; + robj *zobj; + double score; + zobj = lookupKeyRead(c->db,key); + if (checkType(c,zobj,OBJ_ZSET)) return; + + addReplyArrayLen(c,c->argc - 2); + for (int j = 2; j < c->argc; j++) { + /* Treat a missing set the same way as an empty set */ + if (zobj == NULL || zsetScore(zobj,c->argv[j]->ptr,&score) == C_ERR) { + addReplyNull(c); + } else { + addReplyDouble(c,score); + } + } +} + +void zrankGenericCommand(client *c, int reverse) { + robj *key = c->argv[1]; + robj *ele = c->argv[2]; + robj *zobj; + long rank; + + if ((zobj = lookupKeyReadOrReply(c,key,shared.null[c->resp])) == NULL || + checkType(c,zobj,OBJ_ZSET)) return; + + serverAssertWithInfo(c,ele,sdsEncodedObject(ele)); + rank = zsetRank(zobj,ele->ptr,reverse); + if (rank >= 0) { + addReplyLongLong(c,rank); + } else { + addReplyNull(c); + } +} + +void zrankCommand(client *c) { + zrankGenericCommand(c, 0); +} + +void zrevrankCommand(client *c) { + zrankGenericCommand(c, 1); +} + +void zscanCommand(client *c) { + robj *o; + unsigned long cursor; + + if (parseScanCursorOrReply(c,c->argv[2],&cursor) == C_ERR) return; + if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptyscan)) == NULL || + checkType(c,o,OBJ_ZSET)) return; + scanGenericCommand(c,o,cursor); +} + +/* This command implements the generic zpop operation, used by: + * ZPOPMIN, ZPOPMAX, BZPOPMIN, BZPOPMAX and ZMPOP. This function is also used + * inside blocked.c in the unblocking stage of BZPOPMIN, BZPOPMAX and BZMPOP. + * + * If 'emitkey' is true also the key name is emitted, useful for the blocking + * behavior of BZPOP[MIN|MAX], since we can block into multiple keys. + * Or in ZMPOP/BZMPOP, because we also can take multiple keys. + * + * 'count' is the number of elements requested to pop, or -1 for plain single pop. + * + * 'use_nested_array' when false it generates a flat array (with or without key name). + * When true, it generates a nested 2 level array of field + score pairs, or 3 level when emitkey is set. + * + * 'reply_nil_when_empty' when true we reply a NIL if we are not able to pop up any elements. + * Like in ZMPOP/BZMPOP we reply with a structured nested array containing key name + * and member + score pairs. In these commands, we reply with null when we have no result. + * Otherwise in ZPOPMIN/ZPOPMAX we reply an empty array by default. + * + * 'deleted' is an optional output argument to get an indication + * if the key got deleted by this function. + * */ +void genericZpopCommand(client *c, robj **keyv, int keyc, int where, int emitkey, + long count, int use_nested_array, int reply_nil_when_empty, int *deleted) { + int idx; + robj *key = NULL; + robj *zobj = NULL; + sds ele; + double score; + + if (deleted) *deleted = 0; + + /* Check type and break on the first error, otherwise identify candidate. */ + idx = 0; + while (idx < keyc) { + key = keyv[idx++]; + zobj = lookupKeyWrite(c->db,key); + if (!zobj) continue; + if (checkType(c,zobj,OBJ_ZSET)) return; + break; + } + + /* No candidate for zpopping, return empty. */ + if (!zobj) { + if (reply_nil_when_empty) { + addReplyNullArray(c); + } else { + addReply(c,shared.emptyarray); + } + return; + } + + if (count == 0) { + /* ZPOPMIN/ZPOPMAX with count 0. */ + addReply(c, shared.emptyarray); + return; + } + + long result_count = 0; + + /* When count is -1, we need to correct it to 1 for plain single pop. */ + if (count == -1) count = 1; + + long llen = zsetLength(zobj); + long rangelen = (count > llen) ? llen : count; + + if (!use_nested_array && !emitkey) { + /* ZPOPMIN/ZPOPMAX with or without COUNT option in RESP2. */ + addReplyArrayLen(c, rangelen * 2); + } else if (use_nested_array && !emitkey) { + /* ZPOPMIN/ZPOPMAX with COUNT option in RESP3. */ + addReplyArrayLen(c, rangelen); + } else if (!use_nested_array && emitkey) { + /* BZPOPMIN/BZPOPMAX in RESP2 and RESP3. */ + addReplyArrayLen(c, rangelen * 2 + 1); + addReplyBulk(c, key); + } else if (use_nested_array && emitkey) { + /* ZMPOP/BZMPOP in RESP2 and RESP3. */ + addReplyArrayLen(c, 2); + addReplyBulk(c, key); + addReplyArrayLen(c, rangelen); + } + + /* Remove the element. */ + do { + if (zobj->encoding == OBJ_ENCODING_LISTPACK) { + unsigned char *zl = zobj->ptr; + unsigned char *eptr, *sptr; + unsigned char *vstr; + unsigned int vlen; + long long vlong; + + /* Get the first or last element in the sorted set. */ + eptr = lpSeek(zl,where == ZSET_MAX ? -2 : 0); + serverAssertWithInfo(c,zobj,eptr != NULL); + vstr = lpGetValue(eptr,&vlen,&vlong); + if (vstr == NULL) + ele = sdsfromlonglong(vlong); + else + ele = sdsnewlen(vstr,vlen); + + /* Get the score. */ + sptr = lpNext(zl,eptr); + serverAssertWithInfo(c,zobj,sptr != NULL); + score = zzlGetScore(sptr); + } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { + zset *zs = zobj->ptr; + zskiplist *zsl = zs->zsl; + zskiplistNode *zln; + + /* Get the first or last element in the sorted set. */ + zln = (where == ZSET_MAX ? zsl->tail : + zsl->header->level[0].forward); + + /* There must be an element in the sorted set. */ + serverAssertWithInfo(c,zobj,zln != NULL); + ele = sdsdup(zln->ele); + score = zln->score; + } else { + serverPanic("Unknown sorted set encoding"); + } + + serverAssertWithInfo(c,zobj,zsetDel(zobj,ele)); + server.dirty++; + + if (result_count == 0) { /* Do this only for the first iteration. */ + char *events[2] = {"zpopmin","zpopmax"}; + notifyKeyspaceEvent(NOTIFY_ZSET,events[where],key,c->db->id); + signalModifiedKey(c,c->db,key); + } + + if (use_nested_array) { + addReplyArrayLen(c,2); + } + addReplyBulkCBuffer(c,ele,sdslen(ele)); + addReplyDouble(c,score); + sdsfree(ele); + ++result_count; + } while(--rangelen); + + /* Remove the key, if indeed needed. */ + if (zsetLength(zobj) == 0) { + if (deleted) *deleted = 1; + + dbDelete(c->db,key); + notifyKeyspaceEvent(NOTIFY_GENERIC,"del",key,c->db->id); + } + + if (c->cmd->proc == zmpopCommand) { + /* Always replicate it as ZPOP[MIN|MAX] with COUNT option instead of ZMPOP. */ + robj *count_obj = createStringObjectFromLongLong((count > llen) ? llen : count); + rewriteClientCommandVector(c, 3, + (where == ZSET_MAX) ? shared.zpopmax : shared.zpopmin, + key, count_obj); + decrRefCount(count_obj); + } +} + +/* ZPOPMIN/ZPOPMAX key [] */ +void zpopMinMaxCommand(client *c, int where) { + if (c->argc > 3) { + addReplyErrorObject(c,shared.syntaxerr); + return; + } + + long count = -1; /* -1 for plain single pop. */ + if (c->argc == 3 && getPositiveLongFromObjectOrReply(c, c->argv[2], &count, NULL) != C_OK) + return; + + /* Respond with a single (flat) array in RESP2 or if count is -1 + * (returning a single element). In RESP3, when count > 0 use nested array. */ + int use_nested_array = (c->resp > 2 && count != -1); + + genericZpopCommand(c, &c->argv[1], 1, where, 0, count, use_nested_array, 0, NULL); +} + +/* ZPOPMIN key [] */ +void zpopminCommand(client *c) { + zpopMinMaxCommand(c, ZSET_MIN); +} + +/* ZMAXPOP key [] */ +void zpopmaxCommand(client *c) { + zpopMinMaxCommand(c, ZSET_MAX); +} + +/* BZPOPMIN, BZPOPMAX, BZMPOP actual implementation. + * + * 'numkeys' is the number of keys. + * + * 'timeout_idx' parameter position of block timeout. + * + * 'where' ZSET_MIN or ZSET_MAX. + * + * 'count' is the number of elements requested to pop, or -1 for plain single pop. + * + * 'use_nested_array' when false it generates a flat array (with or without key name). + * When true, it generates a nested 3 level array of keyname, field + score pairs. + * */ +void blockingGenericZpopCommand(client *c, robj **keys, int numkeys, int where, + int timeout_idx, long count, int use_nested_array, int reply_nil_when_empty) { + robj *o; + robj *key; + mstime_t timeout; + int j; + + if (getTimeoutFromObjectOrReply(c,c->argv[timeout_idx],&timeout,UNIT_SECONDS) + != C_OK) return; + + for (j = 0; j < numkeys; j++) { + key = keys[j]; + o = lookupKeyWrite(c->db,key); + /* Non-existing key, move to next key. */ + if (o == NULL) continue; + + if (checkType(c,o,OBJ_ZSET)) return; + + long llen = zsetLength(o); + /* Empty zset, move to next key. */ + if (llen == 0) continue; + + /* Non empty zset, this is like a normal ZPOP[MIN|MAX]. */ + genericZpopCommand(c, &key, 1, where, 1, count, use_nested_array, reply_nil_when_empty, NULL); + + if (count == -1) { + /* Replicate it as ZPOP[MIN|MAX] instead of BZPOP[MIN|MAX]. */ + rewriteClientCommandVector(c,2, + (where == ZSET_MAX) ? shared.zpopmax : shared.zpopmin, + key); + } else { + /* Replicate it as ZPOP[MIN|MAX] with COUNT option. */ + robj *count_obj = createStringObjectFromLongLong((count > llen) ? llen : count); + rewriteClientCommandVector(c, 3, + (where == ZSET_MAX) ? shared.zpopmax : shared.zpopmin, + key, count_obj); + decrRefCount(count_obj); + } + + return; + } + + /* If we are not allowed to block the client and the zset is empty the only thing + * we can do is treating it as a timeout (even with timeout 0). */ + if (c->flags & CLIENT_DENY_BLOCKING) { + addReplyNullArray(c); + return; + } + + /* If the keys do not exist we must block */ + struct blockPos pos = {where}; + blockForKeys(c,BLOCKED_ZSET,c->argv+1,c->argc-2,count,timeout,NULL,&pos,NULL); +} + +// BZPOPMIN key [key ...] timeout +void bzpopminCommand(client *c) { + blockingGenericZpopCommand(c, c->argv+1, c->argc-2, ZSET_MIN, c->argc-1, -1, 0, 0); +} + +// BZPOPMAX key [key ...] timeout +void bzpopmaxCommand(client *c) { + blockingGenericZpopCommand(c, c->argv+1, c->argc-2, ZSET_MAX, c->argc-1, -1, 0, 0); +} + +static void zarndmemberReplyWithListpack(client *c, unsigned int count, listpackEntry *keys, listpackEntry *vals) { + for (unsigned long i = 0; i < count; i++) { + if (vals && c->resp > 2) + addReplyArrayLen(c,2); + if (keys[i].sval) + addReplyBulkCBuffer(c, keys[i].sval, keys[i].slen); + else + addReplyBulkLongLong(c, keys[i].lval); + if (vals) { + if (vals[i].sval) { + addReplyDouble(c, zzlStrtod(vals[i].sval,vals[i].slen)); + } else + addReplyDouble(c, vals[i].lval); + } + } +} + +/* How many times bigger should be the zset compared to the requested size + * for us to not use the "remove elements" strategy? Read later in the + * implementation for more info. */ +#define ZRANDMEMBER_SUB_STRATEGY_MUL 3 + +/* If client is trying to ask for a very large number of random elements, + * queuing may consume an unlimited amount of memory, so we want to limit + * the number of randoms per time. */ +#define ZRANDMEMBER_RANDOM_SAMPLE_LIMIT 1000 + +void zrandmemberWithCountCommand(client *c, long l, int withscores) { + unsigned long count, size; + int uniq = 1; + robj *zsetobj; + + if ((zsetobj = lookupKeyReadOrReply(c, c->argv[1], shared.emptyarray)) + == NULL || checkType(c, zsetobj, OBJ_ZSET)) return; + size = zsetLength(zsetobj); + + if(l >= 0) { + count = (unsigned long) l; + } else { + count = -l; + uniq = 0; + } + + /* If count is zero, serve it ASAP to avoid special cases later. */ + if (count == 0) { + addReply(c,shared.emptyarray); + return; + } + + /* CASE 1: The count was negative, so the extraction method is just: + * "return N random elements" sampling the whole set every time. + * This case is trivial and can be served without auxiliary data + * structures. This case is the only one that also needs to return the + * elements in random order. */ + if (!uniq || count == 1) { + if (withscores && c->resp == 2) + addReplyArrayLen(c, count*2); + else + addReplyArrayLen(c, count); + if (zsetobj->encoding == OBJ_ENCODING_SKIPLIST) { + zset *zs = zsetobj->ptr; + while (count--) { + dictEntry *de = dictGetFairRandomKey(zs->dict); + sds key = dictGetKey(de); + if (withscores && c->resp > 2) + addReplyArrayLen(c,2); + addReplyBulkCBuffer(c, key, sdslen(key)); + if (withscores) + addReplyDouble(c, *(double*)dictGetVal(de)); + } + } else if (zsetobj->encoding == OBJ_ENCODING_LISTPACK) { + listpackEntry *keys, *vals = NULL; + unsigned long limit, sample_count; + limit = count > ZRANDMEMBER_RANDOM_SAMPLE_LIMIT ? ZRANDMEMBER_RANDOM_SAMPLE_LIMIT : count; + keys = zmalloc(sizeof(listpackEntry)*limit); + if (withscores) + vals = zmalloc(sizeof(listpackEntry)*limit); + while (count) { + sample_count = count > limit ? limit : count; + count -= sample_count; + lpRandomPairs(zsetobj->ptr, sample_count, keys, vals); + zarndmemberReplyWithListpack(c, sample_count, keys, vals); + } + zfree(keys); + zfree(vals); + } + return; + } + + zsetopsrc src; + zsetopval zval; + src.subject = zsetobj; + src.type = zsetobj->type; + src.encoding = zsetobj->encoding; + zuiInitIterator(&src); + memset(&zval, 0, sizeof(zval)); + + /* Initiate reply count, RESP3 responds with nested array, RESP2 with flat one. */ + long reply_size = count < size ? count : size; + if (withscores && c->resp == 2) + addReplyArrayLen(c, reply_size*2); + else + addReplyArrayLen(c, reply_size); + + /* CASE 2: + * The number of requested elements is greater than the number of + * elements inside the zset: simply return the whole zset. */ + if (count >= size) { + while (zuiNext(&src, &zval)) { + if (withscores && c->resp > 2) + addReplyArrayLen(c,2); + addReplyBulkSds(c, zuiNewSdsFromValue(&zval)); + if (withscores) + addReplyDouble(c, zval.score); + } + zuiClearIterator(&src); + return; + } + + /* CASE 3: + * The number of elements inside the zset is not greater than + * ZRANDMEMBER_SUB_STRATEGY_MUL times the number of requested elements. + * In this case we create a dict from scratch with all the elements, and + * subtract random elements to reach the requested number of elements. + * + * This is done because if the number of requested elements is just + * a bit less than the number of elements in the set, the natural approach + * used into CASE 4 is highly inefficient. */ + if (count*ZRANDMEMBER_SUB_STRATEGY_MUL > size) { + dict *d = dictCreate(&sdsReplyDictType); + dictExpand(d, size); + /* Add all the elements into the temporary dictionary. */ + while (zuiNext(&src, &zval)) { + sds key = zuiNewSdsFromValue(&zval); + dictEntry *de = dictAddRaw(d, key, NULL); + serverAssert(de); + if (withscores) + dictSetDoubleVal(de, zval.score); + } + serverAssert(dictSize(d) == size); + + /* Remove random elements to reach the right count. */ + while (size > count) { + dictEntry *de; + de = dictGetFairRandomKey(d); + dictUnlink(d,dictGetKey(de)); + sdsfree(dictGetKey(de)); + dictFreeUnlinkedEntry(d,de); + size--; + } + + /* Reply with what's in the dict and release memory */ + dictIterator *di; + dictEntry *de; + di = dictGetIterator(d); + while ((de = dictNext(di)) != NULL) { + if (withscores && c->resp > 2) + addReplyArrayLen(c,2); + addReplyBulkSds(c, dictGetKey(de)); + if (withscores) + addReplyDouble(c, dictGetDoubleVal(de)); + } + + dictReleaseIterator(di); + dictRelease(d); + } + + /* CASE 4: We have a big zset compared to the requested number of elements. + * In this case we can simply get random elements from the zset and add + * to the temporary set, trying to eventually get enough unique elements + * to reach the specified count. */ + else { + if (zsetobj->encoding == OBJ_ENCODING_LISTPACK) { + /* it is inefficient to repeatedly pick one random element from a + * listpack. so we use this instead: */ + listpackEntry *keys, *vals = NULL; + keys = zmalloc(sizeof(listpackEntry)*count); + if (withscores) + vals = zmalloc(sizeof(listpackEntry)*count); + serverAssert(lpRandomPairsUnique(zsetobj->ptr, count, keys, vals) == count); + zarndmemberReplyWithListpack(c, count, keys, vals); + zfree(keys); + zfree(vals); + zuiClearIterator(&src); + return; + } + + /* Hashtable encoding (generic implementation) */ + unsigned long added = 0; + dict *d = dictCreate(&hashDictType); + dictExpand(d, count); + + while (added < count) { + listpackEntry key; + double score; + zsetTypeRandomElement(zsetobj, size, &key, withscores ? &score: NULL); + + /* Try to add the object to the dictionary. If it already exists + * free it, otherwise increment the number of objects we have + * in the result dictionary. */ + sds skey = zsetSdsFromListpackEntry(&key); + if (dictAdd(d,skey,NULL) != DICT_OK) { + sdsfree(skey); + continue; + } + added++; + + if (withscores && c->resp > 2) + addReplyArrayLen(c,2); + zsetReplyFromListpackEntry(c, &key); + if (withscores) + addReplyDouble(c, score); + } + + /* Release memory */ + dictRelease(d); + } + zuiClearIterator(&src); +} + +/* ZRANDMEMBER key [ [WITHSCORES]] */ +void zrandmemberCommand(client *c) { + long l; + int withscores = 0; + robj *zset; + listpackEntry ele; + + if (c->argc >= 3) { + if (getLongFromObjectOrReply(c,c->argv[2],&l,NULL) != C_OK) return; + if (c->argc > 4 || (c->argc == 4 && strcasecmp(c->argv[3]->ptr,"withscores"))) { + addReplyErrorObject(c,shared.syntaxerr); + return; + } else if (c->argc == 4) + withscores = 1; + zrandmemberWithCountCommand(c, l, withscores); + return; + } + + /* Handle variant without argument. Reply with simple bulk string */ + if ((zset = lookupKeyReadOrReply(c,c->argv[1],shared.null[c->resp]))== NULL || + checkType(c,zset,OBJ_ZSET)) { + return; + } + + zsetTypeRandomElement(zset, zsetLength(zset), &ele,NULL); + zsetReplyFromListpackEntry(c,&ele); +} + +/* ZMPOP/BZMPOP + * + * 'numkeys_idx' parameter position of key number. + * 'is_block' this indicates whether it is a blocking variant. */ +void zmpopGenericCommand(client *c, int numkeys_idx, int is_block) { + long j; + long numkeys = 0; /* Number of keys. */ + int where = 0; /* ZSET_MIN or ZSET_MAX. */ + long count = -1; /* Reply will consist of up to count elements, depending on the zset's length. */ + + /* Parse the numkeys. */ + if (getRangeLongFromObjectOrReply(c, c->argv[numkeys_idx], 1, LONG_MAX, + &numkeys, "numkeys should be greater than 0") != C_OK) + return; + + /* Parse the where. where_idx: the index of where in the c->argv. */ + long where_idx = numkeys_idx + numkeys + 1; + if (where_idx >= c->argc) { + addReplyErrorObject(c, shared.syntaxerr); + return; + } + if (!strcasecmp(c->argv[where_idx]->ptr, "MIN")) { + where = ZSET_MIN; + } else if (!strcasecmp(c->argv[where_idx]->ptr, "MAX")) { + where = ZSET_MAX; + } else { + addReplyErrorObject(c, shared.syntaxerr); + return; + } + + /* Parse the optional arguments. */ + for (j = where_idx + 1; j < c->argc; j++) { + char *opt = c->argv[j]->ptr; + int moreargs = (c->argc - 1) - j; + + if (count == -1 && !strcasecmp(opt, "COUNT") && moreargs) { + j++; + if (getRangeLongFromObjectOrReply(c, c->argv[j], 1, LONG_MAX, + &count,"count should be greater than 0") != C_OK) + return; + } else { + addReplyErrorObject(c, shared.syntaxerr); + return; + } + } + + if (count == -1) count = 1; + + if (is_block) { + /* BLOCK. We will handle CLIENT_DENY_BLOCKING flag in blockingGenericZpopCommand. */ + blockingGenericZpopCommand(c, c->argv+numkeys_idx+1, numkeys, where, 1, count, 1, 1); + } else { + /* NON-BLOCK */ + genericZpopCommand(c, c->argv+numkeys_idx+1, numkeys, where, 1, count, 1, 1, NULL); + } +} + +/* ZMPOP numkeys [ ...] MIN|MAX [COUNT count] */ +void zmpopCommand(client *c) { + zmpopGenericCommand(c, 1, 0); +} + +/* BZMPOP timeout numkeys [ ...] MIN|MAX [COUNT count] */ +void bzmpopCommand(client *c) { + zmpopGenericCommand(c, 2, 1); +} + +#endif diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt index 02a5e69f9..b502ca915 100644 --- a/server/CMakeLists.txt +++ b/server/CMakeLists.txt @@ -4,10 +4,10 @@ cxx_link(dragonfly base dragonfly_lib) add_library(dragonfly_lib command_registry.cc common.cc config_flags.cc conn_context.cc db_slice.cc debugcmd.cc dragonfly_listener.cc dragonfly_connection.cc engine_shard_set.cc generic_family.cc - main_service.cc memcache_parser.cc + list_family.cc main_service.cc memcache_parser.cc redis_parser.cc reply_builder.cc string_family.cc transaction.cc) -cxx_link(dragonfly_lib dfly_core uring_fiber_lib +cxx_link(dragonfly_lib dfly_core redis_lib uring_fiber_lib fibers_ext strings_lib http_server_lib tls_lib) add_library(dfly_test_lib test_utils.cc) diff --git a/server/common.cc b/server/common.cc index f1e2af236..b646870a9 100644 --- a/server/common.cc +++ b/server/common.cc @@ -19,6 +19,7 @@ string WrongNumArgsError(std::string_view cmd) { } const char kSyntaxErr[] = "syntax error"; +const char kWrongTypeErr[] = "-WRONGTYPE Operation against a key holding the wrong kind of value"; const char kInvalidIntErr[] = "value is not an integer or out of range"; const char kUintErr[] = "value is out of range, must be positive"; const char kDbIndOutOfRangeErr[] = "DB index is out of range"; diff --git a/server/common_types.h b/server/common_types.h index 5134326df..2ea0477e8 100644 --- a/server/common_types.h +++ b/server/common_types.h @@ -12,6 +12,8 @@ namespace dfly { +enum class ListDir : uint8_t { LEFT, RIGHT }; + enum class Protocol : uint8_t { MEMCACHE = 1, REDIS = 2 diff --git a/server/db_slice.cc b/server/db_slice.cc index 34cbfeeb6..9cd452ea5 100644 --- a/server/db_slice.cc +++ b/server/db_slice.cc @@ -42,7 +42,7 @@ void DbSlice::Reserve(DbIndex db_ind, size_t key_size) { db->main_table.reserve(key_size); } -auto DbSlice::Find(DbIndex db_index, std::string_view key) const -> OpResult { +auto DbSlice::Find(DbIndex db_index, std::string_view key, unsigned obj_type) const -> OpResult { auto [it, expire_it] = FindExt(db_index, key); if (it == MainIterator{}) diff --git a/server/db_slice.h b/server/db_slice.h index eb7f79b1b..fc3dbec55 100644 --- a/server/db_slice.h +++ b/server/db_slice.h @@ -42,7 +42,7 @@ class DbSlice { return now_ms_; } - OpResult Find(DbIndex db_index, std::string_view key) const; + OpResult Find(DbIndex db_index, std::string_view key, unsigned obj_type) const; // Returns (value, expire) dict entries if key exists, null if it does not exist or has expired. std::pair FindExt(DbIndex db_ind, std::string_view key) const; diff --git a/server/engine_shard_set.cc b/server/engine_shard_set.cc index 966e5c0e7..aecbad860 100644 --- a/server/engine_shard_set.cc +++ b/server/engine_shard_set.cc @@ -32,11 +32,14 @@ EngineShard::EngineShard(util::ProactorBase* pb) // absl::GetCurrentTimeNanos() returns current time since the Unix Epoch. shard->db_slice().UpdateExpireClock(absl::GetCurrentTimeNanos() / 1000000); }); + + tmp_str = sdsempty(); } EngineShard::~EngineShard() { queue_.Shutdown(); fiber_q_.join(); + sdsfree(tmp_str); if (periodic_task_) { ProactorBase::me()->CancelPeriodic(periodic_task_); } diff --git a/server/engine_shard_set.h b/server/engine_shard_set.h index 23cf8ff5a..b9a4c7c79 100644 --- a/server/engine_shard_set.h +++ b/server/engine_shard_set.h @@ -4,6 +4,10 @@ #pragma once +extern "C" { + #include "redis/sds.h" +} + #include #include "core/tx_queue.h" @@ -58,6 +62,8 @@ class EngineShard { return txq_.Insert(trans); } + sds tmp_str; + private: EngineShard(util::ProactorBase* pb); diff --git a/server/error.h b/server/error.h index aec842f9b..e302ce68b 100644 --- a/server/error.h +++ b/server/error.h @@ -11,6 +11,7 @@ namespace dfly { std::string WrongNumArgsError(std::string_view cmd); extern const char kSyntaxErr[]; +extern const char kWrongTypeErr[]; extern const char kInvalidIntErr[]; extern const char kUintErr[]; extern const char kDbIndOutOfRangeErr[]; diff --git a/server/list_family.cc b/server/list_family.cc new file mode 100644 index 000000000..4456f1597 --- /dev/null +++ b/server/list_family.cc @@ -0,0 +1,282 @@ +// Copyright 2021, Roman Gershman. All rights reserved. +// See LICENSE for licensing terms. +// +#include "server/list_family.h" + +extern "C" { +#include "redis/object.h" +#include "redis/sds.h" +} + +#include + +#include "base/logging.h" +#include "server/command_registry.h" +#include "server/conn_context.h" +#include "server/engine_shard_set.h" +#include "server/error.h" +#include "server/transaction.h" + +/** + * The number of entries allowed per internal list node can be specified + * as a fixed maximum size or a maximum number of elements. + * For a fixed maximum size, use -5 through -1, meaning: + * -5: max size: 64 Kb <-- not recommended for normal workloads + * -4: max size: 32 Kb <-- not recommended + * -3: max size: 16 Kb <-- probably not recommended + * -2: max size: 8 Kb <-- good + * -1: max size: 4 Kb <-- good + * Positive numbers mean store up to _exactly_ that number of elements + * per list node. + * The highest performing option is usually -2 (8 Kb size) or -1 (4 Kb size), + * but if your use case is unique, adjust the settings as necessary. + * + */ +DEFINE_int32(list_max_listpack_size, -2, "Maximum ziplist size, default is 8kb"); + +/** + * Lists may also be compressed. + * Compress depth is the number of quicklist ziplist nodes from *each* side of + * the list to *exclude* from compression. The head and tail of the list + * are always uncompressed for fast push/pop operations. Settings are: + * 0: disable all list compression + * 1: depth 1 means "don't start compressing until after 1 node into the list, + * going from either the head or tail" + * So: [head]->node->node->...->node->[tail] + * [head], [tail] will always be uncompressed; inner nodes will compress. + * 2: [head]->[next]->node->node->...->node->[prev]->[tail] + * 2 here means: don't compress head or head->next or tail->prev or tail, + * but compress all nodes between them. + * 3: [head]->[next]->[next]->node->node->...->node->[prev]->[prev]->[tail] + * etc. + * + */ + +DEFINE_int32(list_compress_depth, 0, "Compress depth of the list. Default is no compression"); + +namespace dfly { + +using namespace std; +namespace { + +quicklist* GetQL(const MainValue& mv) { + DCHECK(mv.robj); + robj* o = (robj*)mv.robj; + return (quicklist*)o->ptr; +} + +void* listPopSaver(unsigned char* data, size_t sz) { + return createStringObject((char*)data, sz); +} + +OpResult ListPop(DbIndex db_ind, const MainValue& mv, ListDir dir) { + VLOG(1) << "Pop db_indx " << db_ind; + + if (mv.ObjType() != OBJ_LIST) + return OpStatus::WRONG_TYPE; + + long long vlong; + robj* value = NULL; + + int ql_where = (dir == ListDir::LEFT) ? QUICKLIST_HEAD : QUICKLIST_TAIL; + quicklist* ql = GetQL(mv); + + // Empty list automatically removes the key (see below). + CHECK_EQ(1, + quicklistPopCustom(ql, ql_where, (unsigned char**)&value, NULL, &vlong, listPopSaver)); + string res; + if (value) { + DCHECK(value->encoding == OBJ_ENCODING_EMBSTR || value->encoding == OBJ_ENCODING_RAW); + sds s = (sds)(value->ptr); + res = string{s, sdslen(s)}; + decrRefCount(value); + } else { + res = absl::StrCat(vlong); + } + + return res; +} + +} // namespace + +void ListFamily::LPush(CmdArgList args, ConnectionContext* cntx) { + return PushGeneric(ListDir::LEFT, std::move(args), cntx); +} + +void ListFamily::LPop(CmdArgList args, ConnectionContext* cntx) { + return PopGeneric(ListDir::LEFT, std::move(args), cntx); +} + +void ListFamily::RPush(CmdArgList args, ConnectionContext* cntx) { + return PushGeneric(ListDir::RIGHT, std::move(args), cntx); +} + +void ListFamily::RPop(CmdArgList args, ConnectionContext* cntx) { + return PopGeneric(ListDir::RIGHT, std::move(args), cntx); +} + +void ListFamily::LLen(CmdArgList args, ConnectionContext* cntx) { + auto key = ArgS(args, 1); + auto cb = [&](Transaction* t, EngineShard* shard) { + return OpLen(OpArgs{shard, t->db_index()}, key); + }; + OpResult result = cntx->transaction->ScheduleSingleHopT(std::move(cb)); + if (result) { + cntx->SendLong(result.value()); + } else if (result.status() == OpStatus::KEY_NOTFOUND) { + cntx->SendLong(0); + } else { + cntx->SendError(result.status()); + } +} + +void ListFamily::LIndex(CmdArgList args, ConnectionContext* cntx) { + std::string_view key = ArgS(args, 1); + std::string_view index_str = ArgS(args, 2); + int32_t index; + if (!absl::SimpleAtoi(index_str, &index)) { + cntx->SendError(kInvalidIntErr); + return; + } + + auto cb = [&](Transaction* t, EngineShard* shard) { + return OpIndex(OpArgs{shard, t->db_index()}, key, index); + }; + OpResult result = cntx->transaction->ScheduleSingleHopT(std::move(cb)); + if (result) { + cntx->SendBulkString(result.value()); + } else { + cntx->SendNull(); + } +} + +void ListFamily::PushGeneric(ListDir dir, const CmdArgList& args, ConnectionContext* cntx) { + std::string_view key = ArgS(args, 1); + vector vals(args.size() - 2); + for (size_t i = 2; i < args.size(); ++i) { + vals[i - 2] = ArgS(args, i); + } + absl::Span span{vals.data(), vals.size()}; + auto cb = [&](Transaction* t, EngineShard* shard) { + return OpPush(OpArgs{shard, t->db_index()}, key, dir, span); + }; + + OpResult result = cntx->transaction->ScheduleSingleHopT(std::move(cb)); + switch (result.status()) { + case OpStatus::KEY_NOTFOUND: + return cntx->SendNull(); + case OpStatus::WRONG_TYPE: + return cntx->SendError(kWrongTypeErr); + default:; + } + + return cntx->SendLong(result.value()); +} + +void ListFamily::PopGeneric(ListDir dir, const CmdArgList& args, ConnectionContext* cntx) { + std::string_view key = ArgS(args, 1); + + auto cb = [&](Transaction* t, EngineShard* shard) { + return OpPop(OpArgs{shard, t->db_index()}, key, dir); + }; + + OpResult result = cntx->transaction->ScheduleSingleHopT(std::move(cb)); + + switch (result.status()) { + case OpStatus::KEY_NOTFOUND: + return cntx->SendNull(); + case OpStatus::WRONG_TYPE: + return cntx->SendError(kWrongTypeErr); + default:; + } + + return cntx->SendBulkString(result.value()); +} + +OpResult ListFamily::OpPush(const OpArgs& op_args, std::string_view key, ListDir dir, + const absl::Span& vals) { + EngineShard* es = op_args.shard; + auto [it, new_key] = es->db_slice().AddOrFind(op_args.db_ind, key); + quicklist* ql; + + if (new_key) { + robj* o = createQuicklistObject(); + ql = (quicklist*)o->ptr; + quicklistSetOptions(ql, FLAGS_list_max_listpack_size, FLAGS_list_compress_depth); + it->second.robj = o; + it->second.obj_type = OBJ_LIST; + } else { + if (it->second.ObjType() != OBJ_LIST) + return OpStatus::WRONG_TYPE; + ql = GetQL(it->second); + } + + // Left push is LIST_HEAD. + int pos = (dir == ListDir::LEFT) ? QUICKLIST_HEAD : QUICKLIST_TAIL; + + for (auto v : vals) { + es->tmp_str = sdscpylen(es->tmp_str, v.data(), v.size()); + quicklistPush(ql, es->tmp_str, sdslen(es->tmp_str), pos); + } + + return quicklistCount(ql); +} + +OpResult ListFamily::OpPop(const OpArgs& op_args, string_view key, ListDir dir) { + auto& db_slice = op_args.shard->db_slice(); + auto [it, expire] = db_slice.FindExt(op_args.db_ind, key); + if (it == MainIterator{}) + return OpStatus::KEY_NOTFOUND; + + OpResult res = ListPop(op_args.db_ind, it->second, dir); + if (res) { + quicklist* ql = GetQL(it->second); + if (quicklistCount(ql) == 0) { + CHECK(db_slice.Del(op_args.db_ind, it)); + } + } + return res; +} + +OpResult ListFamily::OpLen(const OpArgs& op_args, std::string_view key) { + auto res = op_args.shard->db_slice().Find(op_args.db_ind, key, OBJ_LIST); + if (!res) + return res.status(); + + quicklist* ql = GetQL(res.value()->second); + + return quicklistCount(ql); +} + +OpResult ListFamily::OpIndex(const OpArgs& op_args, std::string_view key, long index) { + auto res = op_args.shard->db_slice().Find(op_args.db_ind, key, OBJ_LIST); + if (!res) + return res.status(); + quicklist* ql = GetQL(res.value()->second); + quicklistEntry entry; + quicklistIter* iter = quicklistGetIteratorEntryAtIdx(ql, index, &entry); + + if (!iter) + return OpStatus::KEY_NOTFOUND; + + if (entry.value) { + return string{reinterpret_cast(entry.value), entry.sz}; + } else { + return absl::StrCat(entry.longval); + } +} + +using CI = CommandId; + +#define HFUNC(x) SetHandler(&ListFamily::x) + +void ListFamily::Register(CommandRegistry* registry) { + *registry << CI{"LPUSH", CO::WRITE | CO::FAST | CO::DENYOOM, -3, 1, 1, 1}.HFUNC(LPush) + << CI{"LPOP", CO::WRITE | CO::FAST | CO::DENYOOM, 2, 1, 1, 1}.HFUNC(LPop) + << CI{"RPUSH", CO::WRITE | CO::FAST | CO::DENYOOM, -3, 1, 1, 1}.HFUNC(RPush) + << CI{"RPOP", CO::WRITE | CO::FAST | CO::DENYOOM, 2, 1, 1, 1}.HFUNC(RPop) + << CI{"LLEN", CO::READONLY | CO::FAST, 2, 1, 1, 1}.HFUNC(LLen) + << CI{"LINDEX", CO::READONLY, 3, 1, 1, 1}.HFUNC(LIndex); +} + +} // namespace dfly diff --git a/server/list_family.h b/server/list_family.h new file mode 100644 index 000000000..151c66e06 --- /dev/null +++ b/server/list_family.h @@ -0,0 +1,38 @@ +// Copyright 2021, Roman Gershman. All rights reserved. +// See LICENSE for licensing terms. +// + +#pragma once + +#include "core/op_status.h" +#include "server/common_types.h" + +namespace dfly { + +class ConnectionContext; +class CommandRegistry; +class EngineShard; + +class ListFamily { + public: + static void Register(CommandRegistry* registry); + + private: + static void LPush(CmdArgList args, ConnectionContext* cntx); + static void RPush(CmdArgList args, ConnectionContext* cntx); + static void LPop(CmdArgList args, ConnectionContext* cntx); + static void RPop(CmdArgList args, ConnectionContext* cntx); + static void LLen(CmdArgList args, ConnectionContext* cntx); + static void LIndex(CmdArgList args, ConnectionContext* cntx); + + static void PopGeneric(ListDir dir, const CmdArgList& args, ConnectionContext* cntx); + static void PushGeneric(ListDir dir, const CmdArgList& args, ConnectionContext* cntx); + + static OpResult OpPush(const OpArgs& op_args, std::string_view key, ListDir dir, + const absl::Span& vals); + static OpResult OpPop(const OpArgs& op_args, std::string_view key, ListDir dir); + static OpResult OpLen(const OpArgs& op_args, std::string_view key); + static OpResult OpIndex(const OpArgs& op_args, std::string_view key, long index); +}; + +} // namespace dfly diff --git a/server/main_service.cc b/server/main_service.cc index 813591108..5994038dc 100644 --- a/server/main_service.cc +++ b/server/main_service.cc @@ -15,6 +15,7 @@ #include "server/debugcmd.h" #include "server/error.h" #include "server/generic_family.h" +#include "server/list_family.h" #include "server/string_family.h" #include "server/transaction.h" #include "util/metrics/metrics.h" @@ -207,6 +208,7 @@ void Service::RegisterCommands() { StringFamily::Register(®istry_); GenericFamily::Register(®istry_); + ListFamily::Register(®istry_); } } // namespace dfly diff --git a/server/string_family.cc b/server/string_family.cc index c6bbfc16d..077427c32 100644 --- a/server/string_family.cc +++ b/server/string_family.cc @@ -4,6 +4,10 @@ #include "server/string_family.h" +extern "C" { +#include "redis/object.h" +} + #include #include "base/logging.h" @@ -46,6 +50,8 @@ OpResult SetCmd::Set(const SetParams& params, std::string_view key, std::s return OpStatus::SKIPPED; if (params.prev_val) { + if (it->second.ObjType() != OBJ_STRING) + return OpStatus::WRONG_TYPE; params.prev_val->emplace(it->second.str); } @@ -68,7 +74,12 @@ OpResult SetCmd::SetExisting(DbIndex db_ind, std::string_view value, uint6 db_slice_->Expire(db_ind, dest, expire_at_ms); } + if (dest->second.robj) { + decrRefCountVoid(dest->second.robj); + dest->second.robj = nullptr; + } dest->second = value; + dest->second.obj_type = OBJ_STRING; return OpStatus::OK; } @@ -139,7 +150,7 @@ void StringFamily::Get(CmdArgList args, ConnectionContext* cntx) { std::string_view key = ArgS(args, 1); auto cb = [&](Transaction* t, EngineShard* shard) -> OpResult { - OpResult it_res = shard->db_slice().Find(cntx->db_index(), key); + OpResult it_res = shard->db_slice().Find(cntx->db_index(), key, OBJ_STRING); if (!it_res.ok()) return it_res.status(); @@ -156,8 +167,14 @@ void StringFamily::Get(CmdArgList args, ConnectionContext* cntx) { DVLOG(1) << "GET " << trans->DebugId() << ": " << key << " " << result.value(); cntx->SendGetReply(key, 0, result.value()); } else { - DVLOG(1) << "GET " << key << " nil"; - cntx->SendGetNotFound(); + switch (result.status()) { + case OpStatus::WRONG_TYPE: + cntx->SendError(kWrongTypeErr); + break; + default: + DVLOG(1) << "GET " << key << " nil"; + cntx->SendGetNotFound(); + } } } @@ -253,9 +270,9 @@ auto StringFamily::OpMGet(const Transaction* t, EngineShard* shard) -> MGetRespo auto& db_slice = shard->db_slice(); for (size_t i = 0; i < args.size(); ++i) { - OpResult de_res = db_slice.Find(0, args[i]); - if (de_res.ok()) { - response[i] = de_res.value()->second.str; + OpResult it_res = db_slice.Find(t->db_index(), args[i], OBJ_STRING); + if (it_res.ok()) { + response[i] = it_res.value()->second.str; } } @@ -266,7 +283,7 @@ OpStatus StringFamily::OpMSet(const Transaction* t, EngineShard* es) { ArgSlice largs = t->ShardArgsInShard(es->shard_id()); CHECK(!largs.empty() && largs.size() % 2 == 0); - SetCmd::SetParams params{0}; + SetCmd::SetParams params{t->db_index()}; SetCmd sg(&es->db_slice()); for (size_t i = 0; i < largs.size(); i += 2) { DVLOG(1) << "MSet " << largs[i] << ":" << largs[i + 1]; diff --git a/server/table.h b/server/table.h index a88b3381e..53d00a05a 100644 --- a/server/table.h +++ b/server/table.h @@ -10,6 +10,8 @@ namespace dfly { struct MainValue { std::string str; + void* robj = nullptr; + uint8_t obj_type = 0; MainValue() = default; MainValue(std::string_view s) : str(s) { @@ -23,6 +25,10 @@ struct MainValue { has_expire_ = b; } + unsigned ObjType() const { + return obj_type; + } + private: bool has_expire_ = false; };