From 01e72683b610554bb5ef24eff8011b7849e72878 Mon Sep 17 00:00:00 2001 From: Volodymyr Yavdoshenko Date: Fri, 9 May 2025 15:39:20 +0300 Subject: [PATCH] chore: basic syntax parsing added for suffix and infix search --- src/core/search/ast_expr.cc | 7 ++- src/core/search/ast_expr.h | 25 +++++++-- src/core/search/lexer.lex | 75 ++++++++++++++++----------- src/core/search/parser.y | 44 +++++++++------- src/core/search/search.cc | 36 +++++++++++++ src/core/search/search_parser_test.cc | 12 ++--- 6 files changed, 139 insertions(+), 60 deletions(-) diff --git a/src/core/search/ast_expr.cc b/src/core/search/ast_expr.cc index 6fd9894fc..3c7fc6669 100644 --- a/src/core/search/ast_expr.cc +++ b/src/core/search/ast_expr.cc @@ -20,7 +20,12 @@ AstTermNode::AstTermNode(string term) : term{std::move(term)} { } AstPrefixNode::AstPrefixNode(string prefix) : prefix{std::move(prefix)} { - this->prefix.pop_back(); +} + +AstSuffixNode::AstSuffixNode(string suffix) : suffix{std::move(suffix)} { +} + +AstInfixNode::AstInfixNode(string infix) : infix{std::move(infix)} { } AstRangeNode::AstRangeNode(double lo, bool lo_excl, double hi, bool hi_excl) diff --git a/src/core/search/ast_expr.h b/src/core/search/ast_expr.h index 6f1980982..4df1fc5c9 100644 --- a/src/core/search/ast_expr.h +++ b/src/core/search/ast_expr.h @@ -38,6 +38,18 @@ struct AstPrefixNode { std::string prefix; }; +struct AstSuffixNode { + explicit AstSuffixNode(std::string suffix); + + std::string suffix; +}; + +struct AstInfixNode { + explicit AstInfixNode(std::string infix); + + std::string infix; +}; + // Matches numeric range struct AstRangeNode { AstRangeNode(double lo, bool lo_excl, double hi, bool hi_excl); @@ -73,7 +85,7 @@ struct AstFieldNode { // Stores a list of tags for a tag query struct AstTagsNode { - using TagValue = std::variant; + using TagValue = std::variant; struct TagValueProxy : public AstTagsNode::TagValue { // bison needs it to be default constructible @@ -83,6 +95,10 @@ struct AstTagsNode { } TagValueProxy(AstTermNode tv) : AstTagsNode::TagValue(std::move(tv)) { } + TagValueProxy(AstSuffixNode tv) : AstTagsNode::TagValue(std::move(tv)) { + } + TagValueProxy(AstInfixNode tv) : AstTagsNode::TagValue(std::move(tv)) { + } }; AstTagsNode(TagValue); @@ -111,9 +127,10 @@ struct AstKnnNode { std::optional ef_runtime; }; -using NodeVariants = std::variant; +using NodeVariants = + std::variant; struct AstNode : public NodeVariants { using variant::variant; diff --git a/src/core/search/lexer.lex b/src/core/search/lexer.lex index 9c33e54ae..8cc33a9ee 100644 --- a/src/core/search/lexer.lex +++ b/src/core/search/lexer.lex @@ -26,17 +26,19 @@ using dfly::search::Parser; using namespace std; + enum class TagType { PREFIX, SUFFIX, INFIX, REGULAR }; + Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& loc); - Parser::symbol_type make_TagVal(string_view src, bool is_prefix, const Parser::location_type& loc); + Parser::symbol_type make_Tag(string_view src, TagType type, const Parser::location_type& loc); %} -dq \" -sq \' -esc_chars ['"\?\\abfnrtv] -esc_seq \\{esc_chars} -term_char \w -tag_val_char {term_char}|\\[,.<>{}\[\]\\\"\':;!@#$%^&*()\-+=~\/ ] -asterisk_char \* +dq \" +sq \' +esc_chars ['"\?\\abfnrtv] +esc_seq \\{esc_chars} +term_ch \w +tag_val_ch {term_ch}|\\[,.<>{}\[\]\\\"\':;!@#$%^&*()\-+=~\/ ] +astrsk_ch \* %{ @@ -67,21 +69,25 @@ asterisk_char \* "AS" return Parser::make_AS (loc()); "EF_RUNTIME" return Parser::make_EF_RUNTIME (loc()); -[0-9]{1,9} return Parser::make_UINT32(str(), loc()); -[+-]?(([0-9]*[.])?[0-9]+|inf) return Parser::make_DOUBLE(str(), loc()); +[0-9]{1,9} return Parser::make_UINT32(str(), loc()); +[+-]?(([0-9]*[.])?[0-9]+|inf) return Parser::make_DOUBLE(str(), loc()); -{dq}([^"]|{esc_seq})*{dq} return make_StringLit(matched_view(1, 1), loc()); -{sq}([^']|{esc_seq})*{sq} return make_StringLit(matched_view(1, 1), loc()); +{dq}([^"]|{esc_seq})*{dq} return make_StringLit(matched_view(1, 1), loc()); +{sq}([^']|{esc_seq})*{sq} return make_StringLit(matched_view(1, 1), loc()); -"$"{term_char}+ return ParseParam(str(), loc()); -"@"{term_char}+ return Parser::make_FIELD(str(), loc()); -{term_char}+{asterisk_char} return Parser::make_PREFIX(str(), loc()); +"$"{term_ch}+ return ParseParam(str(), loc()); +"@"{term_ch}+ return Parser::make_FIELD(str(), loc()); +{term_ch}+{astrsk_ch} return Parser::make_PREFIX(string{matched_view(0, 1)}, loc()); +{astrsk_ch}{term_ch}+ return Parser::make_SUFFIX(string{matched_view(1, 0)}, loc()); +{astrsk_ch}{term_ch}+{astrsk_ch} return Parser::make_INFIX(string{matched_view(1, 1)}, loc()); -{term_char}+ return Parser::make_TERM(str(), loc()); -{tag_val_char}+{asterisk_char} return make_TagVal(str(), true, loc()); -{tag_val_char}+ return make_TagVal(str(), false, loc()); +{term_ch}+ return Parser::make_TERM(str(), loc()); +{tag_val_ch}+{astrsk_ch} return make_Tag(str(), TagType::PREFIX, loc()); +{astrsk_ch}{tag_val_ch}+ return make_Tag(str(), TagType::SUFFIX, loc()); +{astrsk_ch}{tag_val_ch}+{astrsk_ch} return make_Tag(str(), TagType::INFIX, loc()); +{tag_val_ch}+ return make_Tag(str(), TagType::REGULAR, loc()); -<> return Parser::make_YYEOF(loc()); +<> return Parser::make_YYEOF(loc()); %% Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& loc) { @@ -92,14 +98,20 @@ Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& return Parser::make_TERM(res, loc); } -Parser::symbol_type make_TagVal(string_view src, bool is_prefix, const Parser::location_type& loc) { +Parser::symbol_type make_Tag(string_view src, TagType type, const Parser::location_type& loc) { string res; res.reserve(src.size()); - bool escaped = false; - size_t len = is_prefix ? src.size() - 1 : src.size(); // Exclude the '*' at the end for prefix + // Determine processing boundaries + size_t start = (type == TagType::SUFFIX || type == TagType::INFIX) ? 1 : 0; + size_t end = src.size(); + if (type == TagType::PREFIX || type == TagType::INFIX) { + end--; // Skip the last '*' character + } - for (size_t i = 0; i < len; ++i) { + // Handle escaping + bool escaped = false; + for (size_t i = start; i < end; ++i) { if (escaped) { escaped = false; } else if (src[i] == '\\') { @@ -109,11 +121,16 @@ Parser::symbol_type make_TagVal(string_view src, bool is_prefix, const Parser::l res.push_back(src[i]); } - // Add '*' back for prefix - if (is_prefix) { - res.push_back('*'); - return Parser::make_PREFIX(res, loc); + // Return the appropriate token type + switch (type) { + case TagType::PREFIX: + return Parser::make_PREFIX(res, loc); + case TagType::SUFFIX: + return Parser::make_SUFFIX(res, loc); + case TagType::INFIX: + return Parser::make_INFIX(res, loc); + case TagType::REGULAR: + default: + return Parser::make_TAG_VAL(res, loc); } - - return Parser::make_TAG_VAL(res, loc); } diff --git a/src/core/search/parser.y b/src/core/search/parser.y index 9da2ddb13..8bd2f796f 100644 --- a/src/core/search/parser.y +++ b/src/core/search/parser.y @@ -67,7 +67,7 @@ double toDouble(string_view src); // Needed 0 at the end to satisfy bison 3.5.1 %token YYEOF 0 -%token TERM "term" TAG_VAL "tag_val" PARAM "param" FIELD "field" PREFIX "prefix" +%token TERM "term" TAG_VAL "tag_val" PARAM "param" FIELD "field" PREFIX "prefix" SUFFIX "suffix" INFIX "infix" %precedence TERM TAG_VAL %left OR_OP @@ -134,24 +134,26 @@ search_or_expr: | search_expr OR_OP search_unary_expr { $$ = AstLogicalNode(std::move($1), std::move($3), AstLogicalNode::OR); } search_unary_expr: - LPAREN search_expr RPAREN { $$ = std::move($2); } + LPAREN search_expr RPAREN { $$ = std::move($2); } | NOT_OP search_unary_expr { $$ = AstNegateNode(std::move($2)); } - | TERM { $$ = AstTermNode(std::move($1)); } + | TERM { $$ = AstTermNode(std::move($1)); } | PREFIX { $$ = AstPrefixNode(std::move($1)); } - | UINT32 { $$ = AstTermNode(std::move($1)); } + | SUFFIX { $$ = AstSuffixNode(std::move($1)); } + | INFIX { $$ = AstInfixNode(std::move($1)); } + | UINT32 { $$ = AstTermNode(std::move($1)); } | FIELD COLON field_cond { $$ = AstFieldNode(std::move($1), std::move($3)); } field_cond: - TERM { $$ = AstTermNode(std::move($1)); } - | UINT32 { $$ = AstTermNode(std::move($1)); } - | STAR { $$ = AstStarFieldNode(); } + TERM { $$ = AstTermNode(std::move($1)); } + | UINT32 { $$ = AstTermNode(std::move($1)); } + | STAR { $$ = AstStarFieldNode(); } | NOT_OP field_cond { $$ = AstNegateNode(std::move($2)); } | LPAREN field_cond_expr RPAREN { $$ = std::move($2); } | LBRACKET numeric_filter_expr RBRACKET { $$ = std::move($2); } | LCURLBR tag_list RCURLBR { $$ = std::move($2); } numeric_filter_expr: - opt_lparen generic_number opt_lparen generic_number { $$ = AstRangeNode($2, $1, $4, $3); } + opt_lparen generic_number opt_lparen generic_number { $$ = AstRangeNode($2, $1, $4, $3); } | opt_lparen generic_number COMMA opt_lparen generic_number { $$ = AstRangeNode($2, $1, $5, $4); } generic_number: @@ -163,9 +165,9 @@ opt_lparen: | LPAREN { $$ = true; } field_cond_expr: - field_unary_expr { $$ = std::move($1); } - | field_and_expr { $$ = std::move($1); } - | field_or_expr { $$ = std::move($1); } + field_unary_expr { $$ = std::move($1); } + | field_and_expr { $$ = std::move($1); } + | field_or_expr { $$ = std::move($1); } field_and_expr: field_unary_expr field_unary_expr %prec AND_OP { $$ = AstLogicalNode(std::move($1), std::move($2), AstLogicalNode::AND); } @@ -176,21 +178,23 @@ field_or_expr: | field_cond_expr OR_OP field_and_expr { $$ = AstLogicalNode(std::move($1), std::move($3), AstLogicalNode::OR); } field_unary_expr: - LPAREN field_cond_expr RPAREN { $$ = std::move($2); } - | NOT_OP field_unary_expr { $$ = AstNegateNode(std::move($2)); } - | TERM { $$ = AstTermNode(std::move($1)); } - | UINT32 { $$ = AstTermNode(std::move($1)); } + LPAREN field_cond_expr RPAREN { $$ = std::move($2); } + | NOT_OP field_unary_expr { $$ = AstNegateNode(std::move($2)); } + | TERM { $$ = AstTermNode(std::move($1)); } + | UINT32 { $$ = AstTermNode(std::move($1)); } tag_list: - tag_list_element { $$ = AstTagsNode(std::move($1)); } + tag_list_element { $$ = AstTagsNode(std::move($1)); } | tag_list OR_OP tag_list_element { $$ = AstTagsNode(std::move($1), std::move($3)); } tag_list_element: - TERM { $$ = AstTermNode(std::move($1)); } + TERM { $$ = AstTermNode(std::move($1)); } | PREFIX { $$ = AstPrefixNode(std::move($1)); } - | UINT32 { $$ = AstTermNode(std::move($1)); } - | DOUBLE { $$ = AstTermNode(std::move($1)); } - | TAG_VAL { $$ = AstTermNode(std::move($1)); } + | SUFFIX { $$ = AstSuffixNode(std::move($1)); } + | INFIX { $$ = AstInfixNode(std::move($1)); } + | UINT32 { $$ = AstTermNode(std::move($1)); } + | DOUBLE { $$ = AstTermNode(std::move($1)); } + | TAG_VAL { $$ = AstTermNode(std::move($1)); } %% diff --git a/src/core/search/search.cc b/src/core/search/search.cc index 4d50d71fd..66a14648b 100644 --- a/src/core/search/search.cc +++ b/src/core/search/search.cc @@ -120,6 +120,12 @@ struct ProfileBuilder { void operator()(std::string* out, const AstPrefixNode& node) const { out->append(node.prefix); } + void operator()(std::string* out, const AstSuffixNode& node) const { + out->append(node.suffix); + } + void operator()(std::string* out, const AstInfixNode& node) const { + out->append(node.infix); + } void operator()(std::string* out, const AstTermNode& node) const { out->append(node.term); } @@ -131,6 +137,8 @@ struct ProfileBuilder { [](monostate) -> string { return ""s; }, [](const AstTermNode& n) { return absl::StrCat("Term{", n.term, "}"); }, [](const AstPrefixNode& n) { return absl::StrCat("Prefix{", n.prefix, "}"); }, + [](const AstSuffixNode& n) { return absl::StrCat("Suffix{", n.suffix, "}"); }, + [](const AstInfixNode& n) { return absl::StrCat("Infix{", n.infix, "}"); }, [](const AstRangeNode& n) { return absl::StrCat("Range{", n.lo, "<>", n.hi, "}"); }, [](const AstLogicalNode& n) { auto op = n.op == AstLogicalNode::AND ? "and" : "or"; @@ -268,6 +276,18 @@ struct BasicSearch { return result; } + template + IndexResult CollectSuffixMatches(BaseStringIndex* index, std::string_view suffix) { + // TODO: Implement full text search for suffix + return IndexResult{}; + } + + template + IndexResult CollectInfixMatches(BaseStringIndex* index, std::string_view infix) { + // TODO: Implement full text search for infix + return IndexResult{}; + } + IndexResult Search(monostate, string_view) { return vector{}; } @@ -346,6 +366,16 @@ struct BasicSearch { return UnifyResults(GetSubResults(indices, mapping), LogicOp::OR); } + IndexResult Search(const AstSuffixNode& node, string_view active_field) { + // TODO: Implement full text search for suffix + return IndexResult{}; + } + + IndexResult Search(const AstInfixNode& node, string_view active_field) { + // TODO: Implement full text search for infix + return IndexResult{}; + } + // [range]: access field's numeric index IndexResult Search(const AstRangeNode& node, string_view active_field) { DCHECK(!active_field.empty()); @@ -392,6 +422,12 @@ struct BasicSearch { }, [tag_index, this](const AstPrefixNode& prefix) { return CollectPrefixMatches(tag_index, prefix.prefix); + }, + [tag_index, this](const AstSuffixNode& suffix) { + return CollectSuffixMatches(tag_index, suffix.suffix); + }, + [tag_index, this](const AstInfixNode& infix) { + return CollectInfixMatches(tag_index, infix.infix); }}; auto mapping = [ov](const auto& tag) { return visit(ov, tag); }; return UnifyResults(GetSubResults(node.tags, mapping), LogicOp::OR); diff --git a/src/core/search/search_parser_test.cc b/src/core/search/search_parser_test.cc index 73049e599..8affceb5a 100644 --- a/src/core/search/search_parser_test.cc +++ b/src/core/search/search_parser_test.cc @@ -155,7 +155,7 @@ TEST_F(SearchParserTest, Scanner) { // Prefix simple SetInput("pre*"); - NEXT_EQ(TOK_PREFIX, string, "pre*"); + NEXT_EQ(TOK_PREFIX, string, "pre"); // TODO: uncomment when we support escaped terms // Prefix escaped (redis doesn't support quoted prefix matches) @@ -167,7 +167,7 @@ TEST_F(SearchParserTest, Scanner) { NEXT_EQ(TOK_FIELD, string, "@color"); NEXT_TOK(TOK_COLON); NEXT_TOK(TOK_LCURLBR); - NEXT_EQ(TOK_PREFIX, string, "prefix*"); + NEXT_EQ(TOK_PREFIX, string, "prefix"); NEXT_TOK(TOK_RCURLBR); // Prefix escaped star @@ -196,28 +196,28 @@ TEST_F(SearchParserTest, EscapedTagPrefixes) { NEXT_EQ(TOK_FIELD, string, "@name"); NEXT_TOK(TOK_COLON); NEXT_TOK(TOK_LCURLBR); - NEXT_EQ(TOK_PREFIX, string, "escape-err*"); + NEXT_EQ(TOK_PREFIX, string, "escape-err"); NEXT_TOK(TOK_RCURLBR); SetInput("@name:{escape\\+pre*}"); NEXT_EQ(TOK_FIELD, string, "@name"); NEXT_TOK(TOK_COLON); NEXT_TOK(TOK_LCURLBR); - NEXT_EQ(TOK_PREFIX, string, "escape+pre*"); + NEXT_EQ(TOK_PREFIX, string, "escape+pre"); NEXT_TOK(TOK_RCURLBR); SetInput("@name:{escape\\.pre*}"); NEXT_EQ(TOK_FIELD, string, "@name"); NEXT_TOK(TOK_COLON); NEXT_TOK(TOK_LCURLBR); - NEXT_EQ(TOK_PREFIX, string, "escape.pre*"); + NEXT_EQ(TOK_PREFIX, string, "escape.pre"); NEXT_TOK(TOK_RCURLBR); SetInput("@name:{complex\\-escape\\+with\\.many\\*chars*}"); NEXT_EQ(TOK_FIELD, string, "@name"); NEXT_TOK(TOK_COLON); NEXT_TOK(TOK_LCURLBR); - NEXT_EQ(TOK_PREFIX, string, "complex-escape+with.many*chars*"); + NEXT_EQ(TOK_PREFIX, string, "complex-escape+with.many*chars"); NEXT_TOK(TOK_RCURLBR); }