fix(search_family): Support boolean and nullable types in indexes (#4314)

* fix(search_family): Support boolean and nullable types in indexes

fixes dragonflydb#4107, dragonflydb#4129

Signed-off-by: Stepan Bagritsevich <stefan@dragonflydb.io>

* refactor: address comments

Signed-off-by: Stepan Bagritsevich <stefan@dragonflydb.io>

---------

Signed-off-by: Stepan Bagritsevich <stefan@dragonflydb.io>
This commit is contained in:
Stepan Bagritsevich 2024-12-24 10:52:39 +04:00 committed by GitHub
parent 01f24da2b6
commit 3c7e31240f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 191 additions and 76 deletions

View file

@ -9,6 +9,7 @@
#include "server/search/doc_accessors.h"
#include <absl/functional/any_invocable.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_join.h>
@ -72,10 +73,31 @@ FieldValue ExtractSortableValue(const search::Schema& schema, string_view key, s
FieldValue ExtractSortableValueFromJson(const search::Schema& schema, string_view key,
const JsonType& json) {
if (json.is_null()) {
return std::monostate{};
}
auto json_as_string = json.to_string();
return ExtractSortableValue(schema, key, json_as_string);
}
/* Returns true if json elements were successfully processed. */
template <typename Callback>
bool ProcessJsonElements(const std::vector<JsonType>& json_elements, Callback&& cb) {
auto process = [&cb](const auto& json_range) -> bool {
for (const auto& json : json_range) {
if (!json.is_null() && !cb(json)) {
return false;
}
}
return true;
};
if (!json_elements[0].is_array()) {
return process(json_elements);
}
return json_elements.size() == 1 && process(json_elements[0].array_range());
}
} // namespace
SearchDocData BaseAccessor::Serialize(const search::Schema& schema,
@ -127,6 +149,10 @@ std::optional<BaseAccessor::NumsList> BaseAccessor::GetNumbers(
return nums_list;
}
std::optional<BaseAccessor::StringList> BaseAccessor::GetTags(std::string_view active_field) const {
return GetStrings(active_field);
}
std::optional<BaseAccessor::StringList> ListPackAccessor::GetStrings(
string_view active_field) const {
auto strsv = container_utils::LpFind(lp_, active_field, intbuf_[0].data());
@ -192,8 +218,17 @@ struct JsonAccessor::JsonPathContainer {
variant<json::Path, jsoncons::jsonpath::jsonpath_expression<JsonType>> val;
};
std::optional<BaseAccessor::StringList> JsonAccessor::GetStrings(string_view active_field) const {
auto* path = GetPath(active_field);
std::optional<BaseAccessor::StringList> JsonAccessor::GetStrings(std::string_view field) const {
return GetStrings(field, false);
}
std::optional<BaseAccessor::StringList> JsonAccessor::GetTags(std::string_view active_field) const {
return GetStrings(active_field, true);
}
std::optional<BaseAccessor::StringList> JsonAccessor::GetStrings(std::string_view field,
bool accept_boolean_values) const {
auto* path = GetPath(field);
if (!path)
return search::EmptyAccessResult<StringList>();
@ -201,8 +236,18 @@ std::optional<BaseAccessor::StringList> JsonAccessor::GetStrings(string_view act
if (path_res.empty())
return search::EmptyAccessResult<StringList>();
auto is_convertible_to_string = [](bool accept_boolean_values) -> bool (*)(const JsonType& json) {
if (accept_boolean_values) {
return [](const JsonType& json) -> bool { return json.is_string() || json.is_bool(); };
} else {
return [](const JsonType& json) -> bool { return json.is_string(); };
}
}(accept_boolean_values);
if (path_res.size() == 1 && !path_res[0].is_array()) {
if (!path_res[0].is_string())
if (path_res[0].is_null())
return StringList{};
if (!is_convertible_to_string(path_res[0]))
return std::nullopt;
buf_ = path_res[0].as_string();
@ -213,33 +258,21 @@ std::optional<BaseAccessor::StringList> JsonAccessor::GetStrings(string_view act
// First, grow buffer and compute string sizes
vector<size_t> sizes;
sizes.reserve(path_res.size());
// Returns true if json element is convertiable to string
auto add_json_element_to_buf = [&](const JsonType& json) -> bool {
if (!is_convertible_to_string(json))
return false;
auto add_json_to_buf = [&](const JsonType& json) {
size_t start = buf_.size();
buf_ += json.as_string();
sizes.push_back(buf_.size() - start);
return true;
};
if (!path_res[0].is_array()) {
sizes.reserve(path_res.size());
for (const auto& element : path_res) {
if (!element.is_string())
return std::nullopt;
add_json_to_buf(element);
}
} else {
if (path_res.size() > 1) {
return std::nullopt;
}
sizes.reserve(path_res[0].size());
for (const auto& element : path_res[0].array_range()) {
if (!element.is_string())
return std::nullopt;
add_json_to_buf(element);
}
if (!ProcessJsonElements(path_res, std::move(add_json_element_to_buf))) {
return std::nullopt;
}
// Reposition start pointers to the most recent allocation of buf
@ -260,7 +293,7 @@ std::optional<BaseAccessor::VectorInfo> JsonAccessor::GetVector(string_view acti
return VectorInfo{};
auto res = path->Evaluate(json_);
if (res.empty())
if (res.empty() || res[0].is_null())
return VectorInfo{};
if (!res[0].is_array())
@ -290,24 +323,18 @@ std::optional<BaseAccessor::NumsList> JsonAccessor::GetNumbers(string_view activ
return search::EmptyAccessResult<NumsList>();
NumsList nums_list;
if (!path_res[0].is_array()) {
nums_list.reserve(path_res.size());
for (const auto& element : path_res) {
if (!element.is_number())
return std::nullopt;
nums_list.push_back(element.as<double>());
}
} else {
if (path_res.size() > 1) {
return std::nullopt;
}
nums_list.reserve(path_res.size());
nums_list.reserve(path_res[0].size());
for (const auto& element : path_res[0].array_range()) {
if (!element.is_number())
return std::nullopt;
nums_list.push_back(element.as<double>());
}
// Returns true if json element is convertiable to number
auto add_json_element = [&](const JsonType& json) -> bool {
if (!json.is_number())
return false;
nums_list.push_back(json.as<double>());
return true;
};
if (!ProcessJsonElements(path_res, std::move(add_json_element))) {
return std::nullopt;
}
return nums_list;
}

View file

@ -40,8 +40,9 @@ struct BaseAccessor : public search::DocumentAccessor {
virtual SearchDocData SerializeDocument(const search::Schema& schema) const;
// Default implementation uses GetStrings
virtual std::optional<VectorInfo> GetVector(std::string_view active_field) const;
virtual std::optional<NumsList> GetNumbers(std::string_view active_field) const;
virtual std::optional<VectorInfo> GetVector(std::string_view active_field) const override;
virtual std::optional<NumsList> GetNumbers(std::string_view active_field) const override;
virtual std::optional<StringList> GetTags(std::string_view active_field) const override;
};
// Accessor for hashes stored with listpack
@ -81,6 +82,7 @@ struct JsonAccessor : public BaseAccessor {
std::optional<StringList> GetStrings(std::string_view field) const override;
std::optional<VectorInfo> GetVector(std::string_view field) const override;
std::optional<NumsList> GetNumbers(std::string_view active_field) const override;
std::optional<StringList> GetTags(std::string_view active_field) const override;
// The JsonAccessor works with structured types and not plain strings, so an overload is needed
SearchDocData Serialize(const search::Schema& schema,
@ -91,6 +93,9 @@ struct JsonAccessor : public BaseAccessor {
static void RemoveFieldFromCache(std::string_view field);
private:
/* If accept_boolean_values is true, then json boolean values are converted to strings */
std::optional<StringList> GetStrings(std::string_view field, bool accept_boolean_values) const;
/// Parses `field` into a JSON path. Caches the results internally.
JsonPathContainer* GetPath(std::string_view field) const;

View file

@ -1298,9 +1298,15 @@ TEST_F(SearchFamilyTest, WrongFieldTypeHardJson) {
Run({"JSON.SET", "j1", ".", R"({"data":1,"name":"doc_with_int"})"});
Run({"JSON.SET", "j2", ".", R"({"data":"1","name":"doc_with_int_as_string"})"});
Run({"JSON.SET", "j3", ".", R"({"data":"string","name":"doc_with_string"})"});
Run({"JSON.SET", "j4", ".", R"({"name":"no_data"})"});
Run({"JSON.SET", "j5", ".", R"({"data":[5,4,3],"name":"doc_with_vector"})"});
Run({"JSON.SET", "j6", ".", R"({"data":"[5,4,3]","name":"doc_with_vector_as_string"})"});
Run({"JSON.SET", "j4", ".",
R"({"data":["first", "second", "third"],"name":"doc_with_strings"})"});
Run({"JSON.SET", "j5", ".", R"({"name":"no_data"})"});
Run({"JSON.SET", "j6", ".", R"({"data":[5,4,3],"name":"doc_with_vector"})"});
Run({"JSON.SET", "j7", ".", R"({"data":"[5,4,3]","name":"doc_with_vector_as_string"})"});
Run({"JSON.SET", "j8", ".", R"({"data":null,"name":"doc_with_null"})"});
Run({"JSON.SET", "j9", ".", R"({"data":[null, null, null],"name":"doc_with_nulls"})"});
Run({"JSON.SET", "j10", ".", R"({"data":true,"name":"doc_with_boolean"})"});
Run({"JSON.SET", "j11", ".", R"({"data":[true, false, true],"name":"doc_with_booleans"})"});
auto resp = Run({"FT.CREATE", "i1", "ON", "JSON", "SCHEMA", "$.data", "AS", "data", "NUMERIC"});
EXPECT_EQ(resp, "OK");
@ -1328,25 +1334,25 @@ TEST_F(SearchFamilyTest, WrongFieldTypeHardJson) {
EXPECT_EQ(resp, "OK");
resp = Run({"FT.SEARCH", "i1", "*"});
EXPECT_THAT(resp, AreDocIds("j1", "j4", "j5"));
EXPECT_THAT(resp, AreDocIds("j1", "j5", "j6", "j8", "j9"));
resp = Run({"FT.SEARCH", "i2", "*"});
EXPECT_THAT(resp, AreDocIds("j1", "j4", "j5"));
EXPECT_THAT(resp, AreDocIds("j1", "j5", "j6", "j8", "j9"));
resp = Run({"FT.SEARCH", "i3", "*"});
EXPECT_THAT(resp, AreDocIds("j2", "j3", "j6", "j4"));
EXPECT_THAT(resp, AreDocIds("j2", "j3", "j4", "j5", "j7", "j8", "j9", "j10", "j11"));
resp = Run({"FT.SEARCH", "i4", "*"});
EXPECT_THAT(resp, AreDocIds("j2", "j3", "j6", "j4"));
EXPECT_THAT(resp, AreDocIds("j2", "j3", "j4", "j5", "j7", "j8", "j9", "j10", "j11"));
resp = Run({"FT.SEARCH", "i5", "*"});
EXPECT_THAT(resp, AreDocIds("j4", "j2", "j3", "j6"));
EXPECT_THAT(resp, AreDocIds("j2", "j3", "j4", "j5", "j7", "j8", "j9"));
resp = Run({"FT.SEARCH", "i6", "*"});
EXPECT_THAT(resp, AreDocIds("j4", "j2", "j3", "j6"));
EXPECT_THAT(resp, AreDocIds("j2", "j3", "j4", "j5", "j7", "j8", "j9"));
resp = Run({"FT.SEARCH", "i7", "*"});
EXPECT_THAT(resp, AreDocIds("j4", "j5"));
EXPECT_THAT(resp, AreDocIds("j5", "j6", "j8"));
}
TEST_F(SearchFamilyTest, WrongFieldTypeHardHash) {
@ -1417,6 +1423,12 @@ TEST_F(SearchFamilyTest, WrongVectorFieldType) {
Run({"JSON.SET", "j6", ".", R"({"name":"doc_with_no_field"})"});
Run({"JSON.SET", "j7", ".",
R"({"vector_field": [999999999999999999999999999999999999999, -999999999999999999999999999999999999999, 500000000000000000000000000000000000000], "name": "doc_with_out_of_range_values"})"});
Run({"JSON.SET", "j8", ".", R"({"vector_field":null, "name": "doc_with_null"})"});
Run({"JSON.SET", "j9", ".", R"({"vector_field":[null, null, null], "name": "doc_with_nulls"})"});
Run({"JSON.SET", "j10", ".", R"({"vector_field":true, "name": "doc_with_boolean"})"});
Run({"JSON.SET", "j11", ".",
R"({"vector_field":[true, false, true], "name": "doc_with_booleans"})"});
Run({"JSON.SET", "j12", ".", R"({"vector_field":1, "name": "doc_with_int"})"});
auto resp =
Run({"FT.CREATE", "index", "ON", "JSON", "SCHEMA", "$.vector_field", "AS", "vector_field",
@ -1424,7 +1436,7 @@ TEST_F(SearchFamilyTest, WrongVectorFieldType) {
EXPECT_EQ(resp, "OK");
resp = Run({"FT.SEARCH", "index", "*"});
EXPECT_THAT(resp, AreDocIds("j6", "j7", "j1", "j4"));
EXPECT_THAT(resp, AreDocIds("j6", "j7", "j1", "j4", "j8"));
}
#ifndef SANITIZERS