mirror of
https://github.com/dragonflydb/dragonfly.git
synced 2025-05-11 02:15:45 +02:00
parent
d876bcd5cb
commit
e71f083f34
10 changed files with 138 additions and 40 deletions
|
@ -39,10 +39,13 @@ string ToLower(string_view word) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get all words from text as matched by the uni-algo library (una::views::word_only)
|
// Get all words from text as matched by the uni-algo library (una::views::word_only)
|
||||||
absl::flat_hash_set<std::string> TokenizeWords(std::string_view text) {
|
absl::flat_hash_set<std::string> TokenizeWords(std::string_view text,
|
||||||
|
const TextIndex::StopWords& stopwords) {
|
||||||
absl::flat_hash_set<std::string> words;
|
absl::flat_hash_set<std::string> words;
|
||||||
for (std::string_view word : una::views::word_only::utf8(text))
|
for (std::string_view word : una::views::word_only::utf8(text)) {
|
||||||
words.insert(una::cases::to_lowercase_utf8(word));
|
if (std::string word_lc = una::cases::to_lowercase_utf8(word); !stopwords.contains(word_lc))
|
||||||
|
words.insert(std::move(word_lc));
|
||||||
|
}
|
||||||
return words;
|
return words;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -166,7 +169,7 @@ template struct BaseStringIndex<CompressedSortedSet>;
|
||||||
template struct BaseStringIndex<SortedVector>;
|
template struct BaseStringIndex<SortedVector>;
|
||||||
|
|
||||||
absl::flat_hash_set<std::string> TextIndex::Tokenize(std::string_view value) const {
|
absl::flat_hash_set<std::string> TextIndex::Tokenize(std::string_view value) const {
|
||||||
return TokenizeWords(value);
|
return TokenizeWords(value, *stopwords_);
|
||||||
}
|
}
|
||||||
|
|
||||||
absl::flat_hash_set<std::string> TagIndex::Tokenize(std::string_view value) const {
|
absl::flat_hash_set<std::string> TagIndex::Tokenize(std::string_view value) const {
|
||||||
|
|
|
@ -87,10 +87,16 @@ template <typename C> struct BaseStringIndex : public BaseIndex {
|
||||||
// Index for text fields.
|
// Index for text fields.
|
||||||
// Hashmap based lookup per word.
|
// Hashmap based lookup per word.
|
||||||
struct TextIndex : public BaseStringIndex<CompressedSortedSet> {
|
struct TextIndex : public BaseStringIndex<CompressedSortedSet> {
|
||||||
TextIndex(PMR_NS::memory_resource* mr) : BaseStringIndex(mr, false) {
|
using StopWords = absl::flat_hash_set<std::string>;
|
||||||
|
|
||||||
|
TextIndex(PMR_NS::memory_resource* mr, const StopWords* stopwords)
|
||||||
|
: BaseStringIndex(mr, false), stopwords_{stopwords} {
|
||||||
}
|
}
|
||||||
|
|
||||||
absl::flat_hash_set<std::string> Tokenize(std::string_view value) const override;
|
absl::flat_hash_set<std::string> Tokenize(std::string_view value) const override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
const StopWords* stopwords_;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Index for text fields.
|
// Index for text fields.
|
||||||
|
|
|
@ -13,6 +13,7 @@
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
#include <variant>
|
#include <variant>
|
||||||
|
|
||||||
|
#include "absl/container/flat_hash_set.h"
|
||||||
#include "base/logging.h"
|
#include "base/logging.h"
|
||||||
#include "core/overloaded.h"
|
#include "core/overloaded.h"
|
||||||
#include "core/search/ast_expr.h"
|
#include "core/search/ast_expr.h"
|
||||||
|
@ -454,8 +455,18 @@ string_view Schema::LookupAlias(string_view alias) const {
|
||||||
return alias;
|
return alias;
|
||||||
}
|
}
|
||||||
|
|
||||||
FieldIndices::FieldIndices(Schema schema, PMR_NS::memory_resource* mr)
|
IndicesOptions::IndicesOptions() {
|
||||||
: schema_{std::move(schema)}, all_ids_{}, indices_{} {
|
static absl::flat_hash_set<std::string> kDefaultStopwords{
|
||||||
|
"a", "is", "the", "an", "and", "are", "as", "at", "be", "but", "by",
|
||||||
|
"for", "if", "in", "into", "it", "no", "not", "of", "on", "or", "such",
|
||||||
|
"that", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"};
|
||||||
|
|
||||||
|
stopwords = kDefaultStopwords;
|
||||||
|
}
|
||||||
|
|
||||||
|
FieldIndices::FieldIndices(const Schema& schema, const IndicesOptions& options,
|
||||||
|
PMR_NS::memory_resource* mr)
|
||||||
|
: schema_{schema}, options_{options} {
|
||||||
CreateIndices(mr);
|
CreateIndices(mr);
|
||||||
CreateSortIndices(mr);
|
CreateSortIndices(mr);
|
||||||
}
|
}
|
||||||
|
@ -467,7 +478,7 @@ void FieldIndices::CreateIndices(PMR_NS::memory_resource* mr) {
|
||||||
|
|
||||||
switch (field_info.type) {
|
switch (field_info.type) {
|
||||||
case SchemaField::TEXT:
|
case SchemaField::TEXT:
|
||||||
indices_[field_ident] = make_unique<TextIndex>(mr);
|
indices_[field_ident] = make_unique<TextIndex>(mr, &options_.stopwords);
|
||||||
break;
|
break;
|
||||||
case SchemaField::NUMERIC:
|
case SchemaField::NUMERIC:
|
||||||
indices_[field_ident] = make_unique<NumericIndex>(mr);
|
indices_[field_ident] = make_unique<NumericIndex>(mr);
|
||||||
|
@ -546,7 +557,7 @@ BaseSortIndex* FieldIndices::GetSortIndex(string_view field) const {
|
||||||
|
|
||||||
std::vector<TextIndex*> FieldIndices::GetAllTextIndices() const {
|
std::vector<TextIndex*> FieldIndices::GetAllTextIndices() const {
|
||||||
vector<TextIndex*> out;
|
vector<TextIndex*> out;
|
||||||
for (auto& [field_name, field_info] : schema_.fields) {
|
for (const auto& [field_name, field_info] : schema_.fields) {
|
||||||
if (field_info.type != SchemaField::TEXT || (field_info.flags & SchemaField::NOINDEX) > 0)
|
if (field_info.type != SchemaField::TEXT || (field_info.flags & SchemaField::NOINDEX) > 0)
|
||||||
continue;
|
continue;
|
||||||
auto* index = dynamic_cast<TextIndex*>(GetIndex(field_name));
|
auto* index = dynamic_cast<TextIndex*>(GetIndex(field_name));
|
||||||
|
|
|
@ -5,6 +5,7 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <absl/container/flat_hash_map.h>
|
#include <absl/container/flat_hash_map.h>
|
||||||
|
#include <absl/container/flat_hash_set.h>
|
||||||
|
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
@ -61,11 +62,20 @@ struct Schema {
|
||||||
std::string_view LookupAlias(std::string_view alias) const;
|
std::string_view LookupAlias(std::string_view alias) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct IndicesOptions {
|
||||||
|
IndicesOptions();
|
||||||
|
explicit IndicesOptions(absl::flat_hash_set<std::string> stopwords)
|
||||||
|
: stopwords{std::move(stopwords)} {
|
||||||
|
}
|
||||||
|
|
||||||
|
absl::flat_hash_set<std::string> stopwords;
|
||||||
|
};
|
||||||
|
|
||||||
// Collection of indices for all fields in schema
|
// Collection of indices for all fields in schema
|
||||||
class FieldIndices {
|
class FieldIndices {
|
||||||
public:
|
public:
|
||||||
// Create indices based on schema
|
// Create indices based on schema and options. Both must outlive the indices
|
||||||
FieldIndices(Schema schema, PMR_NS::memory_resource* mr);
|
FieldIndices(const Schema& schema, const IndicesOptions& options, PMR_NS::memory_resource* mr);
|
||||||
|
|
||||||
void Add(DocId doc, DocumentAccessor* access);
|
void Add(DocId doc, DocumentAccessor* access);
|
||||||
void Remove(DocId doc, DocumentAccessor* access);
|
void Remove(DocId doc, DocumentAccessor* access);
|
||||||
|
@ -84,8 +94,8 @@ class FieldIndices {
|
||||||
void CreateIndices(PMR_NS::memory_resource* mr);
|
void CreateIndices(PMR_NS::memory_resource* mr);
|
||||||
void CreateSortIndices(PMR_NS::memory_resource* mr);
|
void CreateSortIndices(PMR_NS::memory_resource* mr);
|
||||||
|
|
||||||
private:
|
const Schema& schema_;
|
||||||
Schema schema_;
|
const IndicesOptions& options_;
|
||||||
std::vector<DocId> all_ids_;
|
std::vector<DocId> all_ids_;
|
||||||
absl::flat_hash_map<std::string, std::unique_ptr<BaseIndex>> indices_;
|
absl::flat_hash_map<std::string, std::unique_ptr<BaseIndex>> indices_;
|
||||||
absl::flat_hash_map<std::string, std::unique_ptr<BaseSortIndex>> sort_indices_;
|
absl::flat_hash_map<std::string, std::unique_ptr<BaseSortIndex>> sort_indices_;
|
||||||
|
|
|
@ -66,6 +66,8 @@ struct MockedDocument : public DocumentAccessor {
|
||||||
Map fields_{};
|
Map fields_{};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
IndicesOptions kEmptyOptions{{}};
|
||||||
|
|
||||||
Schema MakeSimpleSchema(initializer_list<pair<string_view, SchemaField::FieldType>> ilist) {
|
Schema MakeSimpleSchema(initializer_list<pair<string_view, SchemaField::FieldType>> ilist) {
|
||||||
Schema schema;
|
Schema schema;
|
||||||
for (auto [name, type] : ilist) {
|
for (auto [name, type] : ilist) {
|
||||||
|
@ -105,7 +107,7 @@ class SearchTest : public ::testing::Test {
|
||||||
bool Check() {
|
bool Check() {
|
||||||
absl::Cleanup cl{[this] { entries_.clear(); }};
|
absl::Cleanup cl{[this] { entries_.clear(); }};
|
||||||
|
|
||||||
FieldIndices index{schema_, PMR_NS::get_default_resource()};
|
FieldIndices index{schema_, kEmptyOptions, PMR_NS::get_default_resource()};
|
||||||
|
|
||||||
shuffle(entries_.begin(), entries_.end(), default_random_engine{});
|
shuffle(entries_.begin(), entries_.end(), default_random_engine{});
|
||||||
for (DocId i = 0; i < entries_.size(); i++)
|
for (DocId i = 0; i < entries_.size(); i++)
|
||||||
|
@ -372,6 +374,36 @@ TEST_F(SearchTest, IntegerTerms) {
|
||||||
EXPECT_TRUE(Check()) << GetError();
|
EXPECT_TRUE(Check()) << GetError();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(SearchTest, StopWords) {
|
||||||
|
auto schema = MakeSimpleSchema({{"title", SchemaField::TEXT}});
|
||||||
|
IndicesOptions options{{"some", "words", "are", "left", "out"}};
|
||||||
|
|
||||||
|
FieldIndices indices{schema, options, PMR_NS::get_default_resource()};
|
||||||
|
SearchAlgorithm algo{};
|
||||||
|
QueryParams params;
|
||||||
|
|
||||||
|
vector<string> documents = {"some words left out", //
|
||||||
|
"some can be found", //
|
||||||
|
"words are never matched", //
|
||||||
|
"explicitly found!"};
|
||||||
|
for (size_t i = 0; i < documents.size(); i++) {
|
||||||
|
MockedDocument doc{{{"title", documents[i]}}};
|
||||||
|
indices.Add(i, &doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// words is a stopword
|
||||||
|
algo.Init("words", ¶ms);
|
||||||
|
EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre());
|
||||||
|
|
||||||
|
// some is a stopword
|
||||||
|
algo.Init("some", ¶ms);
|
||||||
|
EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre());
|
||||||
|
|
||||||
|
// found is not a stopword
|
||||||
|
algo.Init("found", ¶ms);
|
||||||
|
EXPECT_THAT(algo.Search(&indices).ids, testing::UnorderedElementsAre(1, 3));
|
||||||
|
}
|
||||||
|
|
||||||
std::string ToBytes(absl::Span<const float> vec) {
|
std::string ToBytes(absl::Span<const float> vec) {
|
||||||
return string{reinterpret_cast<const char*>(vec.data()), sizeof(float) * vec.size()};
|
return string{reinterpret_cast<const char*>(vec.data()), sizeof(float) * vec.size()};
|
||||||
}
|
}
|
||||||
|
@ -380,7 +412,7 @@ TEST_F(SearchTest, Errors) {
|
||||||
auto schema = MakeSimpleSchema(
|
auto schema = MakeSimpleSchema(
|
||||||
{{"score", SchemaField::NUMERIC}, {"even", SchemaField::TAG}, {"pos", SchemaField::VECTOR}});
|
{{"score", SchemaField::NUMERIC}, {"even", SchemaField::TAG}, {"pos", SchemaField::VECTOR}});
|
||||||
schema.fields["pos"].special_params = SchemaField::VectorParams{false, 1};
|
schema.fields["pos"].special_params = SchemaField::VectorParams{false, 1};
|
||||||
FieldIndices indices{schema, PMR_NS::get_default_resource()};
|
FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource()};
|
||||||
|
|
||||||
SearchAlgorithm algo{};
|
SearchAlgorithm algo{};
|
||||||
QueryParams params;
|
QueryParams params;
|
||||||
|
@ -404,7 +436,7 @@ class KnnTest : public SearchTest, public testing::WithParamInterface<bool /* hn
|
||||||
TEST_P(KnnTest, Simple1D) {
|
TEST_P(KnnTest, Simple1D) {
|
||||||
auto schema = MakeSimpleSchema({{"even", SchemaField::TAG}, {"pos", SchemaField::VECTOR}});
|
auto schema = MakeSimpleSchema({{"even", SchemaField::TAG}, {"pos", SchemaField::VECTOR}});
|
||||||
schema.fields["pos"].special_params = SchemaField::VectorParams{GetParam(), 1};
|
schema.fields["pos"].special_params = SchemaField::VectorParams{GetParam(), 1};
|
||||||
FieldIndices indices{schema, PMR_NS::get_default_resource()};
|
FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource()};
|
||||||
|
|
||||||
// Place points on a straight line
|
// Place points on a straight line
|
||||||
for (size_t i = 0; i < 100; i++) {
|
for (size_t i = 0; i < 100; i++) {
|
||||||
|
@ -461,7 +493,7 @@ TEST_P(KnnTest, Simple2D) {
|
||||||
|
|
||||||
auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
|
auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
|
||||||
schema.fields["pos"].special_params = SchemaField::VectorParams{GetParam(), 2};
|
schema.fields["pos"].special_params = SchemaField::VectorParams{GetParam(), 2};
|
||||||
FieldIndices indices{schema, PMR_NS::get_default_resource()};
|
FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource()};
|
||||||
|
|
||||||
for (size_t i = 0; i < ABSL_ARRAYSIZE(kTestCoords); i++) {
|
for (size_t i = 0; i < ABSL_ARRAYSIZE(kTestCoords); i++) {
|
||||||
string coords = ToBytes({kTestCoords[i].first, kTestCoords[i].second});
|
string coords = ToBytes({kTestCoords[i].first, kTestCoords[i].second});
|
||||||
|
@ -523,7 +555,7 @@ TEST_P(KnnTest, Cosine) {
|
||||||
auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
|
auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
|
||||||
schema.fields["pos"].special_params =
|
schema.fields["pos"].special_params =
|
||||||
SchemaField::VectorParams{GetParam(), 2, VectorSimilarity::COSINE};
|
SchemaField::VectorParams{GetParam(), 2, VectorSimilarity::COSINE};
|
||||||
FieldIndices indices{schema, PMR_NS::get_default_resource()};
|
FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource()};
|
||||||
|
|
||||||
for (size_t i = 0; i < ABSL_ARRAYSIZE(kTestCoords); i++) {
|
for (size_t i = 0; i < ABSL_ARRAYSIZE(kTestCoords); i++) {
|
||||||
string coords = ToBytes({kTestCoords[i].first, kTestCoords[i].second});
|
string coords = ToBytes({kTestCoords[i].first, kTestCoords[i].second});
|
||||||
|
@ -567,7 +599,7 @@ TEST_P(KnnTest, AddRemove) {
|
||||||
auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
|
auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
|
||||||
schema.fields["pos"].special_params =
|
schema.fields["pos"].special_params =
|
||||||
SchemaField::VectorParams{GetParam(), 1, VectorSimilarity::L2};
|
SchemaField::VectorParams{GetParam(), 1, VectorSimilarity::L2};
|
||||||
FieldIndices indices{schema, PMR_NS::get_default_resource()};
|
FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource()};
|
||||||
|
|
||||||
vector<MockedDocument> documents(10);
|
vector<MockedDocument> documents(10);
|
||||||
for (size_t i = 0; i < 10; i++) {
|
for (size_t i = 0; i < 10; i++) {
|
||||||
|
@ -615,7 +647,7 @@ TEST_P(KnnTest, AutoResize) {
|
||||||
auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
|
auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
|
||||||
schema.fields["pos"].special_params =
|
schema.fields["pos"].special_params =
|
||||||
SchemaField::VectorParams{GetParam(), 1, VectorSimilarity::L2, kInitialCapacity};
|
SchemaField::VectorParams{GetParam(), 1, VectorSimilarity::L2, kInitialCapacity};
|
||||||
FieldIndices indices{schema, PMR_NS::get_default_resource()};
|
FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource()};
|
||||||
|
|
||||||
for (size_t i = 0; i < 100; i++) {
|
for (size_t i = 0; i < 100; i++) {
|
||||||
MockedDocument doc{Map{{"pos", ToBytes({float(i)})}}};
|
MockedDocument doc{Map{{"pos", ToBytes({float(i)})}}};
|
||||||
|
@ -634,7 +666,7 @@ static void BM_VectorSearch(benchmark::State& state) {
|
||||||
|
|
||||||
auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
|
auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
|
||||||
schema.fields["pos"].special_params = SchemaField::VectorParams{false, ndims};
|
schema.fields["pos"].special_params = SchemaField::VectorParams{false, ndims};
|
||||||
FieldIndices indices{schema, PMR_NS::get_default_resource()};
|
FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource()};
|
||||||
|
|
||||||
auto random_vec = [ndims]() {
|
auto random_vec = [ndims]() {
|
||||||
vector<float> coords;
|
vector<float> coords;
|
||||||
|
|
|
@ -8,6 +8,7 @@
|
||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
|
#include "absl/strings/str_cat.h"
|
||||||
#include "base/logging.h"
|
#include "base/logging.h"
|
||||||
#include "core/overloaded.h"
|
#include "core/overloaded.h"
|
||||||
#include "core/search/indices.h"
|
#include "core/search/indices.h"
|
||||||
|
@ -89,6 +90,11 @@ string DocIndexInfo::BuildRestoreCommand() const {
|
||||||
if (!base_index.prefix.empty())
|
if (!base_index.prefix.empty())
|
||||||
absl::StrAppend(&out, " PREFIX", " 1 ", base_index.prefix);
|
absl::StrAppend(&out, " PREFIX", " 1 ", base_index.prefix);
|
||||||
|
|
||||||
|
// STOPWORDS
|
||||||
|
absl::StrAppend(&out, " STOPWORDS ", base_index.options.stopwords.size());
|
||||||
|
for (const auto& sw : base_index.options.stopwords)
|
||||||
|
absl::StrAppend(&out, " ", sw);
|
||||||
|
|
||||||
absl::StrAppend(&out, " SCHEMA");
|
absl::StrAppend(&out, " SCHEMA");
|
||||||
for (const auto& [fident, finfo] : base_index.schema.fields) {
|
for (const auto& [fident, finfo] : base_index.schema.fields) {
|
||||||
// Store field name, alias and type
|
// Store field name, alias and type
|
||||||
|
@ -170,36 +176,35 @@ bool DocIndex::Matches(string_view key, unsigned obj_code) const {
|
||||||
return obj_code == GetObjCode() && key.rfind(prefix, 0) == 0;
|
return obj_code == GetObjCode() && key.rfind(prefix, 0) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
ShardDocIndex::ShardDocIndex(shared_ptr<DocIndex> index)
|
ShardDocIndex::ShardDocIndex(shared_ptr<const DocIndex> index)
|
||||||
: base_{std::move(index)}, indices_{{}, nullptr}, key_index_{} {
|
: base_{std::move(index)}, key_index_{} {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShardDocIndex::Rebuild(const OpArgs& op_args, PMR_NS::memory_resource* mr) {
|
void ShardDocIndex::Rebuild(const OpArgs& op_args, PMR_NS::memory_resource* mr) {
|
||||||
key_index_ = DocKeyIndex{};
|
key_index_ = DocKeyIndex{};
|
||||||
indices_ = search::FieldIndices{base_->schema, mr};
|
indices_.emplace(base_->schema, base_->options, mr);
|
||||||
|
|
||||||
auto cb = [this](string_view key, BaseAccessor* doc) { indices_.Add(key_index_.Add(key), doc); };
|
auto cb = [this](string_view key, BaseAccessor* doc) { indices_->Add(key_index_.Add(key), doc); };
|
||||||
TraverseAllMatching(*base_, op_args, cb);
|
TraverseAllMatching(*base_, op_args, cb);
|
||||||
|
|
||||||
was_built_ = true;
|
|
||||||
VLOG(1) << "Indexed " << key_index_.Size() << " docs on " << base_->prefix;
|
VLOG(1) << "Indexed " << key_index_.Size() << " docs on " << base_->prefix;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShardDocIndex::AddDoc(string_view key, const DbContext& db_cntx, const PrimeValue& pv) {
|
void ShardDocIndex::AddDoc(string_view key, const DbContext& db_cntx, const PrimeValue& pv) {
|
||||||
if (!was_built_)
|
if (!indices_)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
auto accessor = GetAccessor(db_cntx, pv);
|
auto accessor = GetAccessor(db_cntx, pv);
|
||||||
indices_.Add(key_index_.Add(key), accessor.get());
|
indices_->Add(key_index_.Add(key), accessor.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShardDocIndex::RemoveDoc(string_view key, const DbContext& db_cntx, const PrimeValue& pv) {
|
void ShardDocIndex::RemoveDoc(string_view key, const DbContext& db_cntx, const PrimeValue& pv) {
|
||||||
if (!was_built_)
|
if (!indices_)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
auto accessor = GetAccessor(db_cntx, pv);
|
auto accessor = GetAccessor(db_cntx, pv);
|
||||||
DocId id = key_index_.Remove(key);
|
DocId id = key_index_.Remove(key);
|
||||||
indices_.Remove(id, accessor.get());
|
indices_->Remove(id, accessor.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ShardDocIndex::Matches(string_view key, unsigned obj_code) const {
|
bool ShardDocIndex::Matches(string_view key, unsigned obj_code) const {
|
||||||
|
@ -209,7 +214,7 @@ bool ShardDocIndex::Matches(string_view key, unsigned obj_code) const {
|
||||||
SearchResult ShardDocIndex::Search(const OpArgs& op_args, const SearchParams& params,
|
SearchResult ShardDocIndex::Search(const OpArgs& op_args, const SearchParams& params,
|
||||||
search::SearchAlgorithm* search_algo) const {
|
search::SearchAlgorithm* search_algo) const {
|
||||||
auto& db_slice = op_args.GetDbSlice();
|
auto& db_slice = op_args.GetDbSlice();
|
||||||
auto search_results = search_algo->Search(&indices_, params.limit_offset + params.limit_total);
|
auto search_results = search_algo->Search(&*indices_, params.limit_offset + params.limit_total);
|
||||||
|
|
||||||
if (!search_results.error.empty())
|
if (!search_results.error.empty())
|
||||||
return SearchResult{facade::ErrorReply{std::move(search_results.error)}};
|
return SearchResult{facade::ErrorReply{std::move(search_results.error)}};
|
||||||
|
@ -253,7 +258,7 @@ vector<SearchDocData> ShardDocIndex::SearchForAggregator(
|
||||||
const OpArgs& op_args, const AggregateParams& params,
|
const OpArgs& op_args, const AggregateParams& params,
|
||||||
search::SearchAlgorithm* search_algo) const {
|
search::SearchAlgorithm* search_algo) const {
|
||||||
auto& db_slice = op_args.GetDbSlice();
|
auto& db_slice = op_args.GetDbSlice();
|
||||||
auto search_results = search_algo->Search(&indices_);
|
auto search_results = search_algo->Search(&*indices_);
|
||||||
|
|
||||||
if (!search_results.error.empty())
|
if (!search_results.error.empty())
|
||||||
return {};
|
return {};
|
||||||
|
@ -267,7 +272,7 @@ vector<SearchDocData> ShardDocIndex::SearchForAggregator(
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
auto accessor = GetAccessor(op_args.db_cntx, (*it)->second);
|
auto accessor = GetAccessor(op_args.db_cntx, (*it)->second);
|
||||||
auto extracted = indices_.ExtractStoredValues(doc);
|
auto extracted = indices_->ExtractStoredValues(doc);
|
||||||
|
|
||||||
SearchDocData loaded;
|
SearchDocData loaded;
|
||||||
if (params.load_fields.ShouldReturnAllFields()) {
|
if (params.load_fields.ShouldReturnAllFields()) {
|
||||||
|
@ -290,7 +295,7 @@ DocIndexInfo ShardDocIndex::GetInfo() const {
|
||||||
}
|
}
|
||||||
|
|
||||||
io::Result<StringVec, ErrorReply> ShardDocIndex::GetTagVals(string_view field) const {
|
io::Result<StringVec, ErrorReply> ShardDocIndex::GetTagVals(string_view field) const {
|
||||||
search::BaseIndex* base_index = indices_.GetIndex(field);
|
search::BaseIndex* base_index = indices_->GetIndex(field);
|
||||||
if (base_index == nullptr) {
|
if (base_index == nullptr) {
|
||||||
return make_unexpected(ErrorReply{"-No such field"});
|
return make_unexpected(ErrorReply{"-No such field"});
|
||||||
}
|
}
|
||||||
|
@ -312,8 +317,8 @@ ShardDocIndex* ShardDocIndices::GetIndex(string_view name) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShardDocIndices::InitIndex(const OpArgs& op_args, std::string_view name,
|
void ShardDocIndices::InitIndex(const OpArgs& op_args, std::string_view name,
|
||||||
shared_ptr<DocIndex> index_ptr) {
|
shared_ptr<const DocIndex> index_ptr) {
|
||||||
auto shard_index = make_unique<ShardDocIndex>(index_ptr);
|
auto shard_index = make_unique<ShardDocIndex>(std::move(index_ptr));
|
||||||
auto [it, _] = indices_.emplace(name, std::move(shard_index));
|
auto [it, _] = indices_.emplace(name, std::move(shard_index));
|
||||||
|
|
||||||
// Don't build while loading, shutting down, etc.
|
// Don't build while loading, shutting down, etc.
|
||||||
|
|
|
@ -120,6 +120,7 @@ struct DocIndex {
|
||||||
bool Matches(std::string_view key, unsigned obj_code) const;
|
bool Matches(std::string_view key, unsigned obj_code) const;
|
||||||
|
|
||||||
search::Schema schema;
|
search::Schema schema;
|
||||||
|
search::IndicesOptions options{};
|
||||||
std::string prefix{};
|
std::string prefix{};
|
||||||
DataType type{HASH};
|
DataType type{HASH};
|
||||||
};
|
};
|
||||||
|
@ -156,7 +157,7 @@ class ShardDocIndex {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// Index must be rebuilt at least once after initialization
|
// Index must be rebuilt at least once after initialization
|
||||||
ShardDocIndex(std::shared_ptr<DocIndex> index);
|
ShardDocIndex(std::shared_ptr<const DocIndex> index);
|
||||||
|
|
||||||
// Perform search on all indexed documents and return results.
|
// Perform search on all indexed documents and return results.
|
||||||
SearchResult Search(const OpArgs& op_args, const SearchParams& params,
|
SearchResult Search(const OpArgs& op_args, const SearchParams& params,
|
||||||
|
@ -182,9 +183,8 @@ class ShardDocIndex {
|
||||||
void Rebuild(const OpArgs& op_args, PMR_NS::memory_resource* mr);
|
void Rebuild(const OpArgs& op_args, PMR_NS::memory_resource* mr);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool was_built_ = false;
|
|
||||||
std::shared_ptr<const DocIndex> base_;
|
std::shared_ptr<const DocIndex> base_;
|
||||||
search::FieldIndices indices_;
|
std::optional<search::FieldIndices> indices_;
|
||||||
DocKeyIndex key_index_;
|
DocKeyIndex key_index_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -198,7 +198,8 @@ class ShardDocIndices {
|
||||||
|
|
||||||
// Init index: create shard local state for given index with given name.
|
// Init index: create shard local state for given index with given name.
|
||||||
// Build if instance is in active state.
|
// Build if instance is in active state.
|
||||||
void InitIndex(const OpArgs& op_args, std::string_view name, std::shared_ptr<DocIndex> index);
|
void InitIndex(const OpArgs& op_args, std::string_view name,
|
||||||
|
std::shared_ptr<const DocIndex> index);
|
||||||
|
|
||||||
// Drop index, return true if it existed and was dropped
|
// Drop index, return true if it existed and was dropped
|
||||||
bool DropIndex(std::string_view name);
|
bool DropIndex(std::string_view name);
|
||||||
|
|
|
@ -485,6 +485,14 @@ void SearchFamily::FtCreate(CmdArgList args, ConnectionContext* cntx) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// STOPWORDS count [words...]
|
||||||
|
if (parser.Check("STOPWORDS")) {
|
||||||
|
index.options.stopwords.clear();
|
||||||
|
for (size_t num = parser.Next<size_t>(); num > 0; num--)
|
||||||
|
index.options.stopwords.emplace(parser.Next());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// SCHEMA
|
// SCHEMA
|
||||||
if (parser.Check("SCHEMA")) {
|
if (parser.Check("SCHEMA")) {
|
||||||
auto schema = ParseSchemaOrReply(index.type, parser.Tail(), cntx);
|
auto schema = ParseSchemaOrReply(index.type, parser.Tail(), cntx);
|
||||||
|
|
|
@ -604,6 +604,22 @@ TEST_F(SearchFamilyTest, TestReturn) {
|
||||||
EXPECT_THAT(resp, MatchEntry("k0", "vec_return", "20"));
|
EXPECT_THAT(resp, MatchEntry("k0", "vec_return", "20"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(SearchFamilyTest, TestStopWords) {
|
||||||
|
Run({"ft.create", "i1", "STOPWORDS", "3", "red", "green", "blue", "SCHEMA", "title", "TEXT"});
|
||||||
|
|
||||||
|
Run({"hset", "d:1", "title", "ReD? parrot flies away"});
|
||||||
|
Run({"hset", "d:2", "title", "GrEEn crocodile eats you"});
|
||||||
|
Run({"hset", "d:3", "title", "BLUe. Whale surfes the sea"});
|
||||||
|
|
||||||
|
EXPECT_THAT(Run({"ft.search", "i1", "red"}), kNoResults);
|
||||||
|
EXPECT_THAT(Run({"ft.search", "i1", "green"}), kNoResults);
|
||||||
|
EXPECT_THAT(Run({"ft.search", "i1", "blue"}), kNoResults);
|
||||||
|
|
||||||
|
EXPECT_THAT(Run({"ft.search", "i1", "parrot"}), AreDocIds("d:1"));
|
||||||
|
EXPECT_THAT(Run({"ft.search", "i1", "crocodile"}), AreDocIds("d:2"));
|
||||||
|
EXPECT_THAT(Run({"ft.search", "i1", "whale"}), AreDocIds("d:3"));
|
||||||
|
}
|
||||||
|
|
||||||
TEST_F(SearchFamilyTest, SimpleUpdates) {
|
TEST_F(SearchFamilyTest, SimpleUpdates) {
|
||||||
EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text", "visits", "numeric"}), "OK");
|
EXPECT_EQ(Run({"ft.create", "i1", "schema", "title", "text", "visits", "numeric"}), "OK");
|
||||||
|
|
||||||
|
|
|
@ -388,6 +388,7 @@ async def test_index_persistence(df_server):
|
||||||
i1 = client.ft("i1")
|
i1 = client.ft("i1")
|
||||||
await i1.create_index(
|
await i1.create_index(
|
||||||
fix_schema_naming(IndexType.JSON, SCHEMA_1),
|
fix_schema_naming(IndexType.JSON, SCHEMA_1),
|
||||||
|
stopwords=["interesting", "stopwords"],
|
||||||
definition=IndexDefinition(index_type=IndexType.JSON, prefix=["blog-"]),
|
definition=IndexDefinition(index_type=IndexType.JSON, prefix=["blog-"]),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -470,6 +471,11 @@ async def test_index_persistence(df_server):
|
||||||
"age"
|
"age"
|
||||||
] == "199"
|
] == "199"
|
||||||
|
|
||||||
|
# Check stopwords were loaded
|
||||||
|
await client.json().set("blog-sw1", ".", {"title": "some stopwords"})
|
||||||
|
assert (await i1.search("some")).total == 1
|
||||||
|
assert (await i1.search("stopwords")).total == 0
|
||||||
|
|
||||||
await i1.dropindex()
|
await i1.dropindex()
|
||||||
await i2.dropindex()
|
await i2.dropindex()
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue