chore: basic syntax parsing added for suffix and infix search

This commit is contained in:
Volodymyr Yavdoshenko 2025-05-09 15:39:20 +03:00
parent 561c0a4733
commit 01e72683b6
No known key found for this signature in database
GPG key ID: 24BC74845F4F4064
6 changed files with 139 additions and 60 deletions

View file

@ -20,7 +20,12 @@ AstTermNode::AstTermNode(string term) : term{std::move(term)} {
} }
AstPrefixNode::AstPrefixNode(string prefix) : prefix{std::move(prefix)} { AstPrefixNode::AstPrefixNode(string prefix) : prefix{std::move(prefix)} {
this->prefix.pop_back(); }
// Constructs a suffix node by moving the given text into the `suffix` member.
// No validation or trimming happens here; the text is stored exactly as passed in.
// NOTE(review): the scanner rules for SUFFIX appear to drop the leading '*' before
// this constructor is reached — confirm against the lexer.
AstSuffixNode::AstSuffixNode(string suffix) : suffix{std::move(suffix)} {
}
AstInfixNode::AstInfixNode(string infix) : infix{std::move(infix)} {
} }
AstRangeNode::AstRangeNode(double lo, bool lo_excl, double hi, bool hi_excl) AstRangeNode::AstRangeNode(double lo, bool lo_excl, double hi, bool hi_excl)

View file

@ -38,6 +38,18 @@ struct AstPrefixNode {
std::string prefix; std::string prefix;
}; };
// AST node carrying the text of a suffix pattern.
// NOTE(review): presumably the counterpart of AstPrefixNode for "*text" queries — confirm.
struct AstSuffixNode {
// Explicit so a plain std::string does not convert to a node implicitly.
explicit AstSuffixNode(std::string suffix);
// Pattern text supplied to the constructor.
std::string suffix;
};
// AST node carrying the text of an infix pattern.
// NOTE(review): presumably represents "*text*" queries — confirm.
struct AstInfixNode {
// Explicit so a plain std::string does not convert to a node implicitly.
explicit AstInfixNode(std::string infix);
// Pattern text supplied to the constructor.
std::string infix;
};
// Matches numeric range // Matches numeric range
struct AstRangeNode { struct AstRangeNode {
AstRangeNode(double lo, bool lo_excl, double hi, bool hi_excl); AstRangeNode(double lo, bool lo_excl, double hi, bool hi_excl);
@ -73,7 +85,7 @@ struct AstFieldNode {
// Stores a list of tags for a tag query // Stores a list of tags for a tag query
struct AstTagsNode { struct AstTagsNode {
using TagValue = std::variant<AstTermNode, AstPrefixNode>; using TagValue = std::variant<AstTermNode, AstPrefixNode, AstSuffixNode, AstInfixNode>;
struct TagValueProxy struct TagValueProxy
: public AstTagsNode::TagValue { // bison needs it to be default constructible : public AstTagsNode::TagValue { // bison needs it to be default constructible
@ -83,6 +95,10 @@ struct AstTagsNode {
} }
TagValueProxy(AstTermNode tv) : AstTagsNode::TagValue(std::move(tv)) { TagValueProxy(AstTermNode tv) : AstTagsNode::TagValue(std::move(tv)) {
} }
// Converting constructor: stores a suffix node as the active TagValue alternative.
// Deliberately not `explicit`, so an AstSuffixNode converts to the proxy implicitly
// (presumably relied on by the grammar actions — confirm).
TagValueProxy(AstSuffixNode tv) : AstTagsNode::TagValue(std::move(tv)) {
}
// Converting constructor: stores an infix node as the active TagValue alternative.
// Deliberately not `explicit`, so an AstInfixNode converts to the proxy implicitly
// (presumably relied on by the grammar actions — confirm).
TagValueProxy(AstInfixNode tv) : AstTagsNode::TagValue(std::move(tv)) {
}
}; };
AstTagsNode(TagValue); AstTagsNode(TagValue);
@ -111,8 +127,9 @@ struct AstKnnNode {
std::optional<float> ef_runtime; std::optional<float> ef_runtime;
}; };
using NodeVariants = std::variant<std::monostate, AstStarNode, AstStarFieldNode, AstTermNode, using NodeVariants =
AstPrefixNode, AstRangeNode, AstNegateNode, AstLogicalNode, std::variant<std::monostate, AstStarNode, AstStarFieldNode, AstTermNode, AstPrefixNode,
AstSuffixNode, AstInfixNode, AstRangeNode, AstNegateNode, AstLogicalNode,
AstFieldNode, AstTagsNode, AstKnnNode>; AstFieldNode, AstTagsNode, AstKnnNode>;
struct AstNode : public NodeVariants { struct AstNode : public NodeVariants {

View file

@ -26,17 +26,19 @@
using dfly::search::Parser; using dfly::search::Parser;
using namespace std; using namespace std;
// Kind of tag token recognised by the scanner. make_Tag uses it to decide which
// '*' characters to drop from the matched text and which parser token to emit.
enum class TagType {
  PREFIX,   // "text*"  -> trailing '*' dropped, emitted as PREFIX
  SUFFIX,   // "*text"  -> leading '*' dropped, emitted as SUFFIX
  INFIX,    // "*text*" -> both '*' dropped, emitted as INFIX
  REGULAR,  // "text"   -> nothing dropped, emitted as TAG_VAL
};
Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& loc); Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& loc);
Parser::symbol_type make_TagVal(string_view src, bool is_prefix, const Parser::location_type& loc); Parser::symbol_type make_Tag(string_view src, TagType type, const Parser::location_type& loc);
%} %}
dq \" dq \"
sq \' sq \'
esc_chars ['"\?\\abfnrtv] esc_chars ['"\?\\abfnrtv]
esc_seq \\{esc_chars} esc_seq \\{esc_chars}
term_char \w term_ch \w
tag_val_char {term_char}|\\[,.<>{}\[\]\\\"\':;!@#$%^&*()\-+=~\/ ] tag_val_ch {term_ch}|\\[,.<>{}\[\]\\\"\':;!@#$%^&*()\-+=~\/ ]
asterisk_char \* astrsk_ch \*
%{ %{
@ -73,13 +75,17 @@ asterisk_char \*
{dq}([^"]|{esc_seq})*{dq} return make_StringLit(matched_view(1, 1), loc()); {dq}([^"]|{esc_seq})*{dq} return make_StringLit(matched_view(1, 1), loc());
{sq}([^']|{esc_seq})*{sq} return make_StringLit(matched_view(1, 1), loc()); {sq}([^']|{esc_seq})*{sq} return make_StringLit(matched_view(1, 1), loc());
"$"{term_char}+ return ParseParam(str(), loc()); "$"{term_ch}+ return ParseParam(str(), loc());
"@"{term_char}+ return Parser::make_FIELD(str(), loc()); "@"{term_ch}+ return Parser::make_FIELD(str(), loc());
{term_char}+{asterisk_char} return Parser::make_PREFIX(str(), loc()); {term_ch}+{astrsk_ch} return Parser::make_PREFIX(string{matched_view(0, 1)}, loc());
{astrsk_ch}{term_ch}+ return Parser::make_SUFFIX(string{matched_view(1, 0)}, loc());
{astrsk_ch}{term_ch}+{astrsk_ch} return Parser::make_INFIX(string{matched_view(1, 1)}, loc());
{term_char}+ return Parser::make_TERM(str(), loc()); {term_ch}+ return Parser::make_TERM(str(), loc());
{tag_val_char}+{asterisk_char} return make_TagVal(str(), true, loc()); {tag_val_ch}+{astrsk_ch} return make_Tag(str(), TagType::PREFIX, loc());
{tag_val_char}+ return make_TagVal(str(), false, loc()); {astrsk_ch}{tag_val_ch}+ return make_Tag(str(), TagType::SUFFIX, loc());
{astrsk_ch}{tag_val_ch}+{astrsk_ch} return make_Tag(str(), TagType::INFIX, loc());
{tag_val_ch}+ return make_Tag(str(), TagType::REGULAR, loc());
<<EOF>> return Parser::make_YYEOF(loc()); <<EOF>> return Parser::make_YYEOF(loc());
%% %%
@ -92,14 +98,20 @@ Parser::symbol_type make_StringLit(string_view src, const Parser::location_type&
return Parser::make_TERM(res, loc); return Parser::make_TERM(res, loc);
} }
Parser::symbol_type make_TagVal(string_view src, bool is_prefix, const Parser::location_type& loc) { Parser::symbol_type make_Tag(string_view src, TagType type, const Parser::location_type& loc) {
string res; string res;
res.reserve(src.size()); res.reserve(src.size());
bool escaped = false; // Determine processing boundaries
size_t len = is_prefix ? src.size() - 1 : src.size(); // Exclude the '*' at the end for prefix size_t start = (type == TagType::SUFFIX || type == TagType::INFIX) ? 1 : 0;
size_t end = src.size();
if (type == TagType::PREFIX || type == TagType::INFIX) {
end--; // Skip the last '*' character
}
for (size_t i = 0; i < len; ++i) { // Handle escaping
bool escaped = false;
for (size_t i = start; i < end; ++i) {
if (escaped) { if (escaped) {
escaped = false; escaped = false;
} else if (src[i] == '\\') { } else if (src[i] == '\\') {
@ -109,11 +121,16 @@ Parser::symbol_type make_TagVal(string_view src, bool is_prefix, const Parser::l
res.push_back(src[i]); res.push_back(src[i]);
} }
// Add '*' back for prefix // Return the appropriate token type
if (is_prefix) { switch (type) {
res.push_back('*'); case TagType::PREFIX:
return Parser::make_PREFIX(res, loc); return Parser::make_PREFIX(res, loc);
} case TagType::SUFFIX:
return Parser::make_SUFFIX(res, loc);
case TagType::INFIX:
return Parser::make_INFIX(res, loc);
case TagType::REGULAR:
default:
return Parser::make_TAG_VAL(res, loc); return Parser::make_TAG_VAL(res, loc);
}
} }

View file

@ -67,7 +67,7 @@ double toDouble(string_view src);
// Needed 0 at the end to satisfy bison 3.5.1 // Needed 0 at the end to satisfy bison 3.5.1
%token YYEOF 0 %token YYEOF 0
%token <std::string> TERM "term" TAG_VAL "tag_val" PARAM "param" FIELD "field" PREFIX "prefix" %token <std::string> TERM "term" TAG_VAL "tag_val" PARAM "param" FIELD "field" PREFIX "prefix" SUFFIX "suffix" INFIX "infix"
%precedence TERM TAG_VAL %precedence TERM TAG_VAL
%left OR_OP %left OR_OP
@ -138,6 +138,8 @@ search_unary_expr:
| NOT_OP search_unary_expr { $$ = AstNegateNode(std::move($2)); } | NOT_OP search_unary_expr { $$ = AstNegateNode(std::move($2)); }
| TERM { $$ = AstTermNode(std::move($1)); } | TERM { $$ = AstTermNode(std::move($1)); }
| PREFIX { $$ = AstPrefixNode(std::move($1)); } | PREFIX { $$ = AstPrefixNode(std::move($1)); }
| SUFFIX { $$ = AstSuffixNode(std::move($1)); }
| INFIX { $$ = AstInfixNode(std::move($1)); }
| UINT32 { $$ = AstTermNode(std::move($1)); } | UINT32 { $$ = AstTermNode(std::move($1)); }
| FIELD COLON field_cond { $$ = AstFieldNode(std::move($1), std::move($3)); } | FIELD COLON field_cond { $$ = AstFieldNode(std::move($1), std::move($3)); }
@ -188,6 +190,8 @@ tag_list:
tag_list_element: tag_list_element:
TERM { $$ = AstTermNode(std::move($1)); } TERM { $$ = AstTermNode(std::move($1)); }
| PREFIX { $$ = AstPrefixNode(std::move($1)); } | PREFIX { $$ = AstPrefixNode(std::move($1)); }
| SUFFIX { $$ = AstSuffixNode(std::move($1)); }
| INFIX { $$ = AstInfixNode(std::move($1)); }
| UINT32 { $$ = AstTermNode(std::move($1)); } | UINT32 { $$ = AstTermNode(std::move($1)); }
| DOUBLE { $$ = AstTermNode(std::move($1)); } | DOUBLE { $$ = AstTermNode(std::move($1)); }
| TAG_VAL { $$ = AstTermNode(std::move($1)); } | TAG_VAL { $$ = AstTermNode(std::move($1)); }

View file

@ -120,6 +120,12 @@ struct ProfileBuilder {
void operator()(std::string* out, const AstPrefixNode& node) const { void operator()(std::string* out, const AstPrefixNode& node) const {
out->append(node.prefix); out->append(node.prefix);
} }
// Appends the suffix node's pattern text to *out; `out` must be non-null.
void operator()(std::string* out, const AstSuffixNode& node) const {
out->append(node.suffix);
}
// Appends the infix node's pattern text to *out; `out` must be non-null.
void operator()(std::string* out, const AstInfixNode& node) const {
out->append(node.infix);
}
void operator()(std::string* out, const AstTermNode& node) const { void operator()(std::string* out, const AstTermNode& node) const {
out->append(node.term); out->append(node.term);
} }
@ -131,6 +137,8 @@ struct ProfileBuilder {
[](monostate) -> string { return ""s; }, [](monostate) -> string { return ""s; },
[](const AstTermNode& n) { return absl::StrCat("Term{", n.term, "}"); }, [](const AstTermNode& n) { return absl::StrCat("Term{", n.term, "}"); },
[](const AstPrefixNode& n) { return absl::StrCat("Prefix{", n.prefix, "}"); }, [](const AstPrefixNode& n) { return absl::StrCat("Prefix{", n.prefix, "}"); },
[](const AstSuffixNode& n) { return absl::StrCat("Suffix{", n.suffix, "}"); },
[](const AstInfixNode& n) { return absl::StrCat("Infix{", n.infix, "}"); },
[](const AstRangeNode& n) { return absl::StrCat("Range{", n.lo, "<>", n.hi, "}"); }, [](const AstRangeNode& n) { return absl::StrCat("Range{", n.lo, "<>", n.hi, "}"); },
[](const AstLogicalNode& n) { [](const AstLogicalNode& n) {
auto op = n.op == AstLogicalNode::AND ? "and" : "or"; auto op = n.op == AstLogicalNode::AND ? "and" : "or";
@ -268,6 +276,18 @@ struct BasicSearch {
return result; return result;
} }
// Placeholder for collecting the entries of `index` that match `suffix`.
// Neither argument is read yet: every call returns a default-constructed IndexResult
// (presumably an empty match set — confirm against IndexResult).
template <typename C>
IndexResult CollectSuffixMatches(BaseStringIndex<C>* index, std::string_view suffix) {
// TODO: Implement suffix matching over the string index (this helper is also used for tag values)
return IndexResult{};
}
// Placeholder for collecting the entries of `index` that match `infix`.
// Neither argument is read yet: every call returns a default-constructed IndexResult
// (presumably an empty match set — confirm against IndexResult).
template <typename C>
IndexResult CollectInfixMatches(BaseStringIndex<C>* index, std::string_view infix) {
// TODO: Implement infix matching over the string index (this helper is also used for tag values)
return IndexResult{};
}
IndexResult Search(monostate, string_view) { IndexResult Search(monostate, string_view) {
return vector<DocId>{}; return vector<DocId>{};
} }
@ -346,6 +366,16 @@ struct BasicSearch {
return UnifyResults(GetSubResults(indices, mapping), LogicOp::OR); return UnifyResults(GetSubResults(indices, mapping), LogicOp::OR);
} }
// [suffix]: not implemented yet. `node` and `active_field` are ignored and a
// default-constructed IndexResult is returned, so suffix queries parse but the
// search itself is still a stub.
IndexResult Search(const AstSuffixNode& node, string_view active_field) {
// TODO: Implement full text search for suffix
return IndexResult{};
}
// [infix]: not implemented yet. `node` and `active_field` are ignored and a
// default-constructed IndexResult is returned, so infix queries parse but the
// search itself is still a stub.
IndexResult Search(const AstInfixNode& node, string_view active_field) {
// TODO: Implement full text search for infix
return IndexResult{};
}
// [range]: access field's numeric index // [range]: access field's numeric index
IndexResult Search(const AstRangeNode& node, string_view active_field) { IndexResult Search(const AstRangeNode& node, string_view active_field) {
DCHECK(!active_field.empty()); DCHECK(!active_field.empty());
@ -392,6 +422,12 @@ struct BasicSearch {
}, },
[tag_index, this](const AstPrefixNode& prefix) { [tag_index, this](const AstPrefixNode& prefix) {
return CollectPrefixMatches(tag_index, prefix.prefix); return CollectPrefixMatches(tag_index, prefix.prefix);
},
[tag_index, this](const AstSuffixNode& suffix) {
return CollectSuffixMatches(tag_index, suffix.suffix);
},
[tag_index, this](const AstInfixNode& infix) {
return CollectInfixMatches(tag_index, infix.infix);
}}; }};
auto mapping = [ov](const auto& tag) { return visit(ov, tag); }; auto mapping = [ov](const auto& tag) { return visit(ov, tag); };
return UnifyResults(GetSubResults(node.tags, mapping), LogicOp::OR); return UnifyResults(GetSubResults(node.tags, mapping), LogicOp::OR);

View file

@ -155,7 +155,7 @@ TEST_F(SearchParserTest, Scanner) {
// Prefix simple // Prefix simple
SetInput("pre*"); SetInput("pre*");
NEXT_EQ(TOK_PREFIX, string, "pre*"); NEXT_EQ(TOK_PREFIX, string, "pre");
// TODO: uncomment when we support escaped terms // TODO: uncomment when we support escaped terms
// Prefix escaped (redis doesn't support quoted prefix matches) // Prefix escaped (redis doesn't support quoted prefix matches)
@ -167,7 +167,7 @@ TEST_F(SearchParserTest, Scanner) {
NEXT_EQ(TOK_FIELD, string, "@color"); NEXT_EQ(TOK_FIELD, string, "@color");
NEXT_TOK(TOK_COLON); NEXT_TOK(TOK_COLON);
NEXT_TOK(TOK_LCURLBR); NEXT_TOK(TOK_LCURLBR);
NEXT_EQ(TOK_PREFIX, string, "prefix*"); NEXT_EQ(TOK_PREFIX, string, "prefix");
NEXT_TOK(TOK_RCURLBR); NEXT_TOK(TOK_RCURLBR);
// Prefix escaped star // Prefix escaped star
@ -196,28 +196,28 @@ TEST_F(SearchParserTest, EscapedTagPrefixes) {
NEXT_EQ(TOK_FIELD, string, "@name"); NEXT_EQ(TOK_FIELD, string, "@name");
NEXT_TOK(TOK_COLON); NEXT_TOK(TOK_COLON);
NEXT_TOK(TOK_LCURLBR); NEXT_TOK(TOK_LCURLBR);
NEXT_EQ(TOK_PREFIX, string, "escape-err*"); NEXT_EQ(TOK_PREFIX, string, "escape-err");
NEXT_TOK(TOK_RCURLBR); NEXT_TOK(TOK_RCURLBR);
SetInput("@name:{escape\\+pre*}"); SetInput("@name:{escape\\+pre*}");
NEXT_EQ(TOK_FIELD, string, "@name"); NEXT_EQ(TOK_FIELD, string, "@name");
NEXT_TOK(TOK_COLON); NEXT_TOK(TOK_COLON);
NEXT_TOK(TOK_LCURLBR); NEXT_TOK(TOK_LCURLBR);
NEXT_EQ(TOK_PREFIX, string, "escape+pre*"); NEXT_EQ(TOK_PREFIX, string, "escape+pre");
NEXT_TOK(TOK_RCURLBR); NEXT_TOK(TOK_RCURLBR);
SetInput("@name:{escape\\.pre*}"); SetInput("@name:{escape\\.pre*}");
NEXT_EQ(TOK_FIELD, string, "@name"); NEXT_EQ(TOK_FIELD, string, "@name");
NEXT_TOK(TOK_COLON); NEXT_TOK(TOK_COLON);
NEXT_TOK(TOK_LCURLBR); NEXT_TOK(TOK_LCURLBR);
NEXT_EQ(TOK_PREFIX, string, "escape.pre*"); NEXT_EQ(TOK_PREFIX, string, "escape.pre");
NEXT_TOK(TOK_RCURLBR); NEXT_TOK(TOK_RCURLBR);
SetInput("@name:{complex\\-escape\\+with\\.many\\*chars*}"); SetInput("@name:{complex\\-escape\\+with\\.many\\*chars*}");
NEXT_EQ(TOK_FIELD, string, "@name"); NEXT_EQ(TOK_FIELD, string, "@name");
NEXT_TOK(TOK_COLON); NEXT_TOK(TOK_COLON);
NEXT_TOK(TOK_LCURLBR); NEXT_TOK(TOK_LCURLBR);
NEXT_EQ(TOK_PREFIX, string, "complex-escape+with.many*chars*"); NEXT_EQ(TOK_PREFIX, string, "complex-escape+with.many*chars");
NEXT_TOK(TOK_RCURLBR); NEXT_TOK(TOK_RCURLBR);
} }