chore: basic syntax parsing added for suffix and infix search

This commit is contained in:
Volodymyr Yavdoshenko 2025-05-09 15:39:20 +03:00
parent 561c0a4733
commit 01e72683b6
No known key found for this signature in database
GPG key ID: 24BC74845F4F4064
6 changed files with 139 additions and 60 deletions

View file

@ -20,7 +20,12 @@ AstTermNode::AstTermNode(string term) : term{std::move(term)} {
}
// Builds a prefix-match node from the text of a PREFIX token.
// The scanner rules that emit PREFIX already drop the trailing '*' before the token reaches the
// parser, so the text is stored unchanged. Removing one more character here would cut off the
// last real character of the prefix (e.g. "pre" would be stored as "pr").
AstPrefixNode::AstPrefixNode(string prefix) : prefix{std::move(prefix)} {
}
// Builds a suffix-match node; the argument is moved into the `suffix` member as-is.
AstSuffixNode::AstSuffixNode(string suffix) : suffix(std::move(suffix)) {
}
// Builds an infix-match node; the argument is moved into the `infix` member as-is.
AstInfixNode::AstInfixNode(string infix) : infix(std::move(infix)) {
}
AstRangeNode::AstRangeNode(double lo, bool lo_excl, double hi, bool hi_excl)

View file

@ -38,6 +38,18 @@ struct AstPrefixNode {
std::string prefix;
};
// AST node for a suffix query (written `*text` in the query language).
struct AstSuffixNode {
  // Text the value must end with; the scanner hands it over without the leading '*'.
  std::string suffix;

  // Explicit so that a plain string never turns into a suffix node implicitly.
  explicit AstSuffixNode(std::string suffix);
};
// AST node for an infix query (written `*text*` in the query language).
struct AstInfixNode {
  // Text the value must contain; the scanner hands it over without the surrounding '*'.
  std::string infix;

  // Explicit so that a plain string never turns into an infix node implicitly.
  explicit AstInfixNode(std::string infix);
};
// Matches numeric range
struct AstRangeNode {
AstRangeNode(double lo, bool lo_excl, double hi, bool hi_excl);
@ -73,7 +85,7 @@ struct AstFieldNode {
// Stores a list of tags for a tag query
struct AstTagsNode {
using TagValue = std::variant<AstTermNode, AstPrefixNode>;
using TagValue = std::variant<AstTermNode, AstPrefixNode, AstSuffixNode, AstInfixNode>;
struct TagValueProxy
: public AstTagsNode::TagValue { // bison needs it to be default constructible
@ -83,6 +95,10 @@ struct AstTagsNode {
}
TagValueProxy(AstTermNode tv) : AstTagsNode::TagValue(std::move(tv)) {
}
TagValueProxy(AstSuffixNode tv) : AstTagsNode::TagValue(std::move(tv)) {
}
TagValueProxy(AstInfixNode tv) : AstTagsNode::TagValue(std::move(tv)) {
}
};
AstTagsNode(TagValue);
@ -111,9 +127,10 @@ struct AstKnnNode {
std::optional<float> ef_runtime;
};
using NodeVariants = std::variant<std::monostate, AstStarNode, AstStarFieldNode, AstTermNode,
AstPrefixNode, AstRangeNode, AstNegateNode, AstLogicalNode,
AstFieldNode, AstTagsNode, AstKnnNode>;
using NodeVariants =
std::variant<std::monostate, AstStarNode, AstStarFieldNode, AstTermNode, AstPrefixNode,
AstSuffixNode, AstInfixNode, AstRangeNode, AstNegateNode, AstLogicalNode,
AstFieldNode, AstTagsNode, AstKnnNode>;
struct AstNode : public NodeVariants {
using variant::variant;

View file

@ -26,17 +26,19 @@
using dfly::search::Parser;
using namespace std;
// Kind of tag value recognised by the scanner, decided by where '*' wildcards sit around it.
enum class TagType {
  PREFIX,   // value*   -> emitted as a PREFIX token
  SUFFIX,   // *value   -> emitted as a SUFFIX token
  INFIX,    // *value*  -> emitted as an INFIX token
  REGULAR,  // value    -> emitted as a TAG_VAL token
};
Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& loc);
Parser::symbol_type make_TagVal(string_view src, bool is_prefix, const Parser::location_type& loc);
Parser::symbol_type make_Tag(string_view src, TagType type, const Parser::location_type& loc);
%}
dq \"
sq \'
esc_chars ['"\?\\abfnrtv]
esc_seq \\{esc_chars}
term_char \w
tag_val_char {term_char}|\\[,.<>{}\[\]\\\"\':;!@#$%^&*()\-+=~\/ ]
asterisk_char \*
dq \"
sq \'
esc_chars ['"\?\\abfnrtv]
esc_seq \\{esc_chars}
term_ch \w
tag_val_ch {term_ch}|\\[,.<>{}\[\]\\\"\':;!@#$%^&*()\-+=~\/ ]
astrsk_ch \*
%{
@ -67,21 +69,25 @@ asterisk_char \*
"AS" return Parser::make_AS (loc());
"EF_RUNTIME" return Parser::make_EF_RUNTIME (loc());
[0-9]{1,9} return Parser::make_UINT32(str(), loc());
[+-]?(([0-9]*[.])?[0-9]+|inf) return Parser::make_DOUBLE(str(), loc());
[0-9]{1,9} return Parser::make_UINT32(str(), loc());
[+-]?(([0-9]*[.])?[0-9]+|inf) return Parser::make_DOUBLE(str(), loc());
{dq}([^"]|{esc_seq})*{dq} return make_StringLit(matched_view(1, 1), loc());
{sq}([^']|{esc_seq})*{sq} return make_StringLit(matched_view(1, 1), loc());
{dq}([^"]|{esc_seq})*{dq} return make_StringLit(matched_view(1, 1), loc());
{sq}([^']|{esc_seq})*{sq} return make_StringLit(matched_view(1, 1), loc());
"$"{term_char}+ return ParseParam(str(), loc());
"@"{term_char}+ return Parser::make_FIELD(str(), loc());
{term_char}+{asterisk_char} return Parser::make_PREFIX(str(), loc());
"$"{term_ch}+ return ParseParam(str(), loc());
"@"{term_ch}+ return Parser::make_FIELD(str(), loc());
{term_ch}+{astrsk_ch} return Parser::make_PREFIX(string{matched_view(0, 1)}, loc());
{astrsk_ch}{term_ch}+ return Parser::make_SUFFIX(string{matched_view(1, 0)}, loc());
{astrsk_ch}{term_ch}+{astrsk_ch} return Parser::make_INFIX(string{matched_view(1, 1)}, loc());
{term_char}+ return Parser::make_TERM(str(), loc());
{tag_val_char}+{asterisk_char} return make_TagVal(str(), true, loc());
{tag_val_char}+ return make_TagVal(str(), false, loc());
{term_ch}+ return Parser::make_TERM(str(), loc());
{tag_val_ch}+{astrsk_ch} return make_Tag(str(), TagType::PREFIX, loc());
{astrsk_ch}{tag_val_ch}+ return make_Tag(str(), TagType::SUFFIX, loc());
{astrsk_ch}{tag_val_ch}+{astrsk_ch} return make_Tag(str(), TagType::INFIX, loc());
{tag_val_ch}+ return make_Tag(str(), TagType::REGULAR, loc());
<<EOF>> return Parser::make_YYEOF(loc());
<<EOF>> return Parser::make_YYEOF(loc());
%%
Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& loc) {
@ -92,14 +98,20 @@ Parser::symbol_type make_StringLit(string_view src, const Parser::location_type&
return Parser::make_TERM(res, loc);
}
Parser::symbol_type make_TagVal(string_view src, bool is_prefix, const Parser::location_type& loc) {
Parser::symbol_type make_Tag(string_view src, TagType type, const Parser::location_type& loc) {
string res;
res.reserve(src.size());
bool escaped = false;
size_t len = is_prefix ? src.size() - 1 : src.size(); // Exclude the '*' at the end for prefix
// Determine processing boundaries
size_t start = (type == TagType::SUFFIX || type == TagType::INFIX) ? 1 : 0;
size_t end = src.size();
if (type == TagType::PREFIX || type == TagType::INFIX) {
end--; // Skip the last '*' character
}
for (size_t i = 0; i < len; ++i) {
// Handle escaping
bool escaped = false;
for (size_t i = start; i < end; ++i) {
if (escaped) {
escaped = false;
} else if (src[i] == '\\') {
@ -109,11 +121,16 @@ Parser::symbol_type make_TagVal(string_view src, bool is_prefix, const Parser::l
res.push_back(src[i]);
}
// Add '*' back for prefix
if (is_prefix) {
res.push_back('*');
return Parser::make_PREFIX(res, loc);
// Return the appropriate token type
switch (type) {
case TagType::PREFIX:
return Parser::make_PREFIX(res, loc);
case TagType::SUFFIX:
return Parser::make_SUFFIX(res, loc);
case TagType::INFIX:
return Parser::make_INFIX(res, loc);
case TagType::REGULAR:
default:
return Parser::make_TAG_VAL(res, loc);
}
return Parser::make_TAG_VAL(res, loc);
}

View file

@ -67,7 +67,7 @@ double toDouble(string_view src);
// Needed 0 at the end to satisfy bison 3.5.1
%token YYEOF 0
%token <std::string> TERM "term" TAG_VAL "tag_val" PARAM "param" FIELD "field" PREFIX "prefix"
%token <std::string> TERM "term" TAG_VAL "tag_val" PARAM "param" FIELD "field" PREFIX "prefix" SUFFIX "suffix" INFIX "infix"
%precedence TERM TAG_VAL
%left OR_OP
@ -134,24 +134,26 @@ search_or_expr:
| search_expr OR_OP search_unary_expr { $$ = AstLogicalNode(std::move($1), std::move($3), AstLogicalNode::OR); }
search_unary_expr:
LPAREN search_expr RPAREN { $$ = std::move($2); }
LPAREN search_expr RPAREN { $$ = std::move($2); }
| NOT_OP search_unary_expr { $$ = AstNegateNode(std::move($2)); }
| TERM { $$ = AstTermNode(std::move($1)); }
| TERM { $$ = AstTermNode(std::move($1)); }
| PREFIX { $$ = AstPrefixNode(std::move($1)); }
| UINT32 { $$ = AstTermNode(std::move($1)); }
| SUFFIX { $$ = AstSuffixNode(std::move($1)); }
| INFIX { $$ = AstInfixNode(std::move($1)); }
| UINT32 { $$ = AstTermNode(std::move($1)); }
| FIELD COLON field_cond { $$ = AstFieldNode(std::move($1), std::move($3)); }
field_cond:
TERM { $$ = AstTermNode(std::move($1)); }
| UINT32 { $$ = AstTermNode(std::move($1)); }
| STAR { $$ = AstStarFieldNode(); }
TERM { $$ = AstTermNode(std::move($1)); }
| UINT32 { $$ = AstTermNode(std::move($1)); }
| STAR { $$ = AstStarFieldNode(); }
| NOT_OP field_cond { $$ = AstNegateNode(std::move($2)); }
| LPAREN field_cond_expr RPAREN { $$ = std::move($2); }
| LBRACKET numeric_filter_expr RBRACKET { $$ = std::move($2); }
| LCURLBR tag_list RCURLBR { $$ = std::move($2); }
numeric_filter_expr:
opt_lparen generic_number opt_lparen generic_number { $$ = AstRangeNode($2, $1, $4, $3); }
opt_lparen generic_number opt_lparen generic_number { $$ = AstRangeNode($2, $1, $4, $3); }
| opt_lparen generic_number COMMA opt_lparen generic_number { $$ = AstRangeNode($2, $1, $5, $4); }
generic_number:
@ -163,9 +165,9 @@ opt_lparen:
| LPAREN { $$ = true; }
field_cond_expr:
field_unary_expr { $$ = std::move($1); }
| field_and_expr { $$ = std::move($1); }
| field_or_expr { $$ = std::move($1); }
field_unary_expr { $$ = std::move($1); }
| field_and_expr { $$ = std::move($1); }
| field_or_expr { $$ = std::move($1); }
field_and_expr:
field_unary_expr field_unary_expr %prec AND_OP { $$ = AstLogicalNode(std::move($1), std::move($2), AstLogicalNode::AND); }
@ -176,21 +178,23 @@ field_or_expr:
| field_cond_expr OR_OP field_and_expr { $$ = AstLogicalNode(std::move($1), std::move($3), AstLogicalNode::OR); }
field_unary_expr:
LPAREN field_cond_expr RPAREN { $$ = std::move($2); }
| NOT_OP field_unary_expr { $$ = AstNegateNode(std::move($2)); }
| TERM { $$ = AstTermNode(std::move($1)); }
| UINT32 { $$ = AstTermNode(std::move($1)); }
LPAREN field_cond_expr RPAREN { $$ = std::move($2); }
| NOT_OP field_unary_expr { $$ = AstNegateNode(std::move($2)); }
| TERM { $$ = AstTermNode(std::move($1)); }
| UINT32 { $$ = AstTermNode(std::move($1)); }
tag_list:
tag_list_element { $$ = AstTagsNode(std::move($1)); }
tag_list_element { $$ = AstTagsNode(std::move($1)); }
| tag_list OR_OP tag_list_element { $$ = AstTagsNode(std::move($1), std::move($3)); }
tag_list_element:
TERM { $$ = AstTermNode(std::move($1)); }
TERM { $$ = AstTermNode(std::move($1)); }
| PREFIX { $$ = AstPrefixNode(std::move($1)); }
| UINT32 { $$ = AstTermNode(std::move($1)); }
| DOUBLE { $$ = AstTermNode(std::move($1)); }
| TAG_VAL { $$ = AstTermNode(std::move($1)); }
| SUFFIX { $$ = AstSuffixNode(std::move($1)); }
| INFIX { $$ = AstInfixNode(std::move($1)); }
| UINT32 { $$ = AstTermNode(std::move($1)); }
| DOUBLE { $$ = AstTermNode(std::move($1)); }
| TAG_VAL { $$ = AstTermNode(std::move($1)); }
%%

View file

@ -120,6 +120,12 @@ struct ProfileBuilder {
// Writes the prefix node's stored text to the end of *out.
void operator()(std::string* out, const AstPrefixNode& node) const {
  *out += node.prefix;
}
// Writes the suffix node's stored text to the end of *out.
void operator()(std::string* out, const AstSuffixNode& node) const {
  *out += node.suffix;
}
// Writes the infix node's stored text to the end of *out.
void operator()(std::string* out, const AstInfixNode& node) const {
  *out += node.infix;
}
// Writes the term node's stored text to the end of *out.
void operator()(std::string* out, const AstTermNode& node) const {
  *out += node.term;
}
@ -131,6 +137,8 @@ struct ProfileBuilder {
[](monostate) -> string { return ""s; },
[](const AstTermNode& n) { return absl::StrCat("Term{", n.term, "}"); },
[](const AstPrefixNode& n) { return absl::StrCat("Prefix{", n.prefix, "}"); },
[](const AstSuffixNode& n) { return absl::StrCat("Suffix{", n.suffix, "}"); },
[](const AstInfixNode& n) { return absl::StrCat("Infix{", n.infix, "}"); },
[](const AstRangeNode& n) { return absl::StrCat("Range{", n.lo, "<>", n.hi, "}"); },
[](const AstLogicalNode& n) {
auto op = n.op == AstLogicalNode::AND ? "and" : "or";
@ -268,6 +276,18 @@ struct BasicSearch {
return result;
}
// Placeholder for suffix matching against a string index.
// Both arguments are currently ignored and a default-constructed IndexResult is returned,
// so a suffix query routed here yields whatever an empty IndexResult represents
// (presumably "no documents" - confirm against IndexResult).
template <typename C>
IndexResult CollectSuffixMatches(BaseStringIndex<C>* index, std::string_view suffix) {
// TODO: Implement suffix matching over the entries of `index`
return IndexResult{};
}
// Placeholder for infix (substring) matching against a string index.
// Both arguments are currently ignored and a default-constructed IndexResult is returned,
// so an infix query routed here yields whatever an empty IndexResult represents
// (presumably "no documents" - confirm against IndexResult).
template <typename C>
IndexResult CollectInfixMatches(BaseStringIndex<C>* index, std::string_view infix) {
// TODO: Implement infix matching over the entries of `index`
return IndexResult{};
}
// Empty query node: produces an empty list of document ids. The field argument is unused.
IndexResult Search(monostate, string_view) {
return vector<DocId>{};
}
@ -346,6 +366,16 @@ struct BasicSearch {
return UnifyResults(GetSubResults(indices, mapping), LogicOp::OR);
}
// *suffix: not implemented yet. The node and the active field are ignored and a
// default-constructed IndexResult is returned, so the query parses but does no index lookup.
IndexResult Search(const AstSuffixNode& node, string_view active_field) {
// TODO: Implement full text search for suffix
return IndexResult{};
}
// *infix*: not implemented yet. The node and the active field are ignored and a
// default-constructed IndexResult is returned, so the query parses but does no index lookup.
IndexResult Search(const AstInfixNode& node, string_view active_field) {
// TODO: Implement full text search for infix
return IndexResult{};
}
// [range]: access field's numeric index
IndexResult Search(const AstRangeNode& node, string_view active_field) {
DCHECK(!active_field.empty());
@ -392,6 +422,12 @@ struct BasicSearch {
},
[tag_index, this](const AstPrefixNode& prefix) {
return CollectPrefixMatches(tag_index, prefix.prefix);
},
[tag_index, this](const AstSuffixNode& suffix) {
return CollectSuffixMatches(tag_index, suffix.suffix);
},
[tag_index, this](const AstInfixNode& infix) {
return CollectInfixMatches(tag_index, infix.infix);
}};
auto mapping = [ov](const auto& tag) { return visit(ov, tag); };
return UnifyResults(GetSubResults(node.tags, mapping), LogicOp::OR);

View file

@ -155,7 +155,7 @@ TEST_F(SearchParserTest, Scanner) {
// Prefix simple
SetInput("pre*");
NEXT_EQ(TOK_PREFIX, string, "pre*");
NEXT_EQ(TOK_PREFIX, string, "pre");
// TODO: uncomment when we support escaped terms
// Prefix escaped (redis doesn't support quoted prefix matches)
@ -167,7 +167,7 @@ TEST_F(SearchParserTest, Scanner) {
NEXT_EQ(TOK_FIELD, string, "@color");
NEXT_TOK(TOK_COLON);
NEXT_TOK(TOK_LCURLBR);
NEXT_EQ(TOK_PREFIX, string, "prefix*");
NEXT_EQ(TOK_PREFIX, string, "prefix");
NEXT_TOK(TOK_RCURLBR);
// Prefix escaped star
@ -196,28 +196,28 @@ TEST_F(SearchParserTest, EscapedTagPrefixes) {
NEXT_EQ(TOK_FIELD, string, "@name");
NEXT_TOK(TOK_COLON);
NEXT_TOK(TOK_LCURLBR);
NEXT_EQ(TOK_PREFIX, string, "escape-err*");
NEXT_EQ(TOK_PREFIX, string, "escape-err");
NEXT_TOK(TOK_RCURLBR);
SetInput("@name:{escape\\+pre*}");
NEXT_EQ(TOK_FIELD, string, "@name");
NEXT_TOK(TOK_COLON);
NEXT_TOK(TOK_LCURLBR);
NEXT_EQ(TOK_PREFIX, string, "escape+pre*");
NEXT_EQ(TOK_PREFIX, string, "escape+pre");
NEXT_TOK(TOK_RCURLBR);
SetInput("@name:{escape\\.pre*}");
NEXT_EQ(TOK_FIELD, string, "@name");
NEXT_TOK(TOK_COLON);
NEXT_TOK(TOK_LCURLBR);
NEXT_EQ(TOK_PREFIX, string, "escape.pre*");
NEXT_EQ(TOK_PREFIX, string, "escape.pre");
NEXT_TOK(TOK_RCURLBR);
SetInput("@name:{complex\\-escape\\+with\\.many\\*chars*}");
NEXT_EQ(TOK_FIELD, string, "@name");
NEXT_TOK(TOK_COLON);
NEXT_TOK(TOK_LCURLBR);
NEXT_EQ(TOK_PREFIX, string, "complex-escape+with.many*chars*");
NEXT_EQ(TOK_PREFIX, string, "complex-escape+with.many*chars");
NEXT_TOK(TOK_RCURLBR);
}