# Patch summary (leading commentary only; everything from the first "diff --git" onward is the
# patch text itself, left unchanged).
#
# Moves the search query lexer from flex to RE/flex and extends the scanner, grammar and tests:
#   * src/CMakeLists.txt: removes the flex-based gen_flex(), registers a `reflex` third-party
#     project (RE-flex v3.3.2 archive), sets REFLEX to ${THIRD_PARTY_LIB_DIR}/reflex/bin/reflex,
#     and re-adds gen_flex() further down so that it runs ${REFLEX} with --unicode,
#     --header-file, --bison-complete and --bison-locations, declares both ${name}.cc and
#     ${name}.h as outputs, and depends on reflex_project.
#   * src/core/search/CMakeLists.txt: links query_parser against TRDP::reflex.
#   * src/core/search/ast_expr.h (new file): an empty dfly::search::AstExpr class plus an
#     operator<< that writes "ast" to the stream.
#   * src/core/search/lexer.lex: replaces the flex %option lines with RE/flex %o options, skips
#     [[:space:]]+, adds the "*", "~", ":" and "=>" tokens and the "$"/"@"-prefixed PARAM/FIELD
#     tokens, matches TERM with {term_char}+ instead of [[:alnum:]]+, and passes
#     matched_view(1, 1) to make_StringLit, which no longer strips the quotes itself.
#   * src/core/search/parser.y: takes the driver via `%parse-param { QueryDriver *driver }`,
#     routes yylex to driver->scanner()->Lex, declares the new tokens, and replaces the
#     input/bool_expr rules with query/search_expr.
#   * src/core/search/query_driver.h: SetInput() takes the string by value, stores it in
#     cur_str_ and hands that to scanner()->in(); Lex() forwards to scanner()->Lex(); an empty
#     Add(AstExpr) is added.
#   * src/core/search/scanner.h: Scanner derives from Lexer, includes core/search/lexer.h unless
#     DFLY_LEXER_CC is defined, and gains the matched_view() and loc() helpers.
#   * src/core/search/search_parser_test.cc: enables scanner()->set_debug(1) in the fixture, adds
#     a Parse() helper, makes the token-type check in NEXT_EQ an ASSERT_EQ, and adds
#     PARAM/FIELD/COLON, Cyrillic-term and Parse test cases.
#
# NOTE(review) - items to confirm against the upstream patch before applying:
#   * This copy appears to have lost its line breaks and its <...> spans: bare "#include "
#     directives, "<>" where an end-of-file rule is expected, "std::unique_ptr scanner_",
#     "tok.value.as()", and "CONFIGURE_COMMAND /configure" (presumably a placeholder in front of
#     /configure went missing). Recover these from the original patch, not by hand.
#   * ast_expr.h declares operator<< inside namespace std. Adding declarations to std is only
#     permitted in a few specific cases; declaring it in dfly::search would presumably still be
#     found through argument-dependent lookup - confirm and consider moving it.
#   * Scanner::matched_view() subtracts skip_left + skip_right from matcher().size() with no
#     bounds check. The only non-default call visible here is matched_view(1, 1) on the
#     {dq}{str_char}*{dq} rule, which always matches at least the two quotes.
#   * search_parser_test.cc turns set_debug(1) on while lexer.lex keeps "%o debug" commented
#     out - confirm the debug call is meant to be committed.
#   * gen_flex() shells out to `mkdir -p`; `${CMAKE_COMMAND} -E make_directory` may be the more
#     portable spelling (the removed version already used ${CMAKE_COMMAND} -E remove).
#   * The search_expr action prints each TERM to cout; presumably temporary scaffolding.
#   * Comment typo in the added lexer.lex %top block: "Our lexer need to know" -> "needs".
#
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a7c0dd4eb..b84b5b04a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -37,31 +37,20 @@ function(gen_bison name) set_source_files_properties(${name}.cc ${name}_base.h PROPERTIES GENERATED TRUE) endfunction() -function(gen_flex name) - GET_FILENAME_COMPONENT(_in ${name}.lex ABSOLUTE) - cur_gen_dir(gen_dir) - # set(lib_name "${name}_flex") - - set(full_path_cc ${gen_dir}/${name}.cc) - ADD_CUSTOM_COMMAND( - OUTPUT ${full_path_cc} - COMMAND mkdir -p ${gen_dir} - COMMAND ${CMAKE_COMMAND} -E remove ${gen_dir}/${name}.ih - COMMAND flex -o ${gen_dir}/${name}.cc --c++ ${_in} - DEPENDS ${_in} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - COMMENT "Generating lexer from ${name}.lex" VERBATIM) - - set_source_files_properties(${gen_dir}/${name}.h ${gen_dir}/${name}.cc ${gen_dir}/${name}_base.h - PROPERTIES GENERATED TRUE) -endfunction() - add_third_party( dconv URL https://github.com/google/double-conversion/archive/refs/tags/v3.2.0.tar.gz LIB libdouble-conversion.a ) +add_third_party( + reflex + URL https://github.com/Genivia/RE-flex/archive/refs/tags/v3.3.2.tar.gz + CONFIGURE_COMMAND /configure --disable-avx --prefix=${THIRD_PARTY_LIB_DIR}/reflex +) + +set(REFLEX "${THIRD_PARTY_LIB_DIR}/reflex/bin/reflex") + add_third_party( jsoncons URL https://github.com/danielaparker/jsoncons/archive/refs/tags/v0.168.7.tar.gz @@ -111,6 +100,25 @@ else(ENABLE_GIT_VERSION) set(PRJ_BUILD_TIME "bigbang") endif(ENABLE_GIT_VERSION) + +function(gen_flex name) + GET_FILENAME_COMPONENT(_in ${name}.lex ABSOLUTE) + cur_gen_dir(gen_dir) + + ADD_CUSTOM_COMMAND( + OUTPUT ${gen_dir}/${name}.cc ${gen_dir}/${name}.h + COMMAND mkdir -p ${gen_dir} + + COMMAND ${REFLEX} -o ${gen_dir}/${name}.cc --unicode --header-file=${gen_dir}/${name}.h + --bison-complete --bison-locations ${_in} + DEPENDS ${_in} reflex_project + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMENT "Generating lexer from ${name}.lex" VERBATIM) + + 
set_source_files_properties(${gen_dir}/${name}.h ${gen_dir}/${name}.cc + PROPERTIES GENERATED TRUE) +endfunction() + # the output file resides in the build directory. configure_file(server/version.cc.in "${CMAKE_CURRENT_SOURCE_DIR}/server/version.cc" @ONLY) diff --git a/src/core/search/CMakeLists.txt b/src/core/search/CMakeLists.txt index a2de29810..cced83d0b 100644 --- a/src/core/search/CMakeLists.txt +++ b/src/core/search/CMakeLists.txt @@ -6,5 +6,5 @@ gen_bison(parser) cur_gen_dir(gen_dir) add_library(query_parser query_driver.cc ${gen_dir}/parser.cc ${gen_dir}/lexer.cc) -target_link_libraries(query_parser base absl::strings) +target_link_libraries(query_parser base absl::strings TRDP::reflex) cxx_test(search_parser_test query_parser) diff --git a/src/core/search/ast_expr.h b/src/core/search/ast_expr.h new file mode 100644 index 000000000..492fbbf52 --- /dev/null +++ b/src/core/search/ast_expr.h @@ -0,0 +1,25 @@ +// Copyright 2023, DragonflyDB authors. All rights reserved. +// See LICENSE for licensing terms. +// + +#pragma once + +#include + +namespace dfly { + +namespace search { + +class AstExpr {}; + +} // namespace search +} // namespace dfly + +namespace std { + +inline std::ostream& operator<<(std::ostream& os, const dfly::search::AstExpr& ast) { + os << "ast"; + return os; +} + +} // namespace std diff --git a/src/core/search/lexer.lex b/src/core/search/lexer.lex index 03f9158f0..890e600a9 100644 --- a/src/core/search/lexer.lex +++ b/src/core/search/lexer.lex @@ -1,21 +1,24 @@ +%top{ + // Our lexer need to know about Parser::symbol_type + #include "core/search/parser.hh" +} -/* Seems that flex does not have unicode support. - TODO: to consider https://en.wikipedia.org/wiki/RE/flex in the future. -*/ %{ #include #include - #include "base/logging.h" - #include "core/search/query_driver.h" - // Define main lexer function. 
QueryDriver is the shared state between scanner and parser - #undef YY_DECL - #define YY_DECL auto dfly::search::Scanner::ParserLex(QueryDriver& driver) -> Parser::symbol_type + #include "base/logging.h" + + #define DFLY_LEXER_CC 1 + #include "core/search/scanner.h" + #undef DFLY_LEXER_CC %} -%option noyywrap nounput noinput batch debug -%option yyclass="dfly::Scanner" -%option c++ +%o bison-cc-namespace="dfly.search" bison-cc-parser="Parser" +%o namespace="dfly.search" +%o class="Scanner" lex="Lex" +%o nodefault batch +/* %o debug */ /* Declarations before lexer implementation. */ %{ @@ -32,35 +35,38 @@ dq \" esc_chars ['"\?\\abfnrtv] esc_seq \\{esc_chars} str_char ([^"]|{esc_seq}) +term_char [_]|\w + %{ // Code run each time a pattern is matched. - # define YY_USER_ACTION loc.columns (yyleng); %} %% %{ - // A handy shortcut to the location held by the driver. - auto& loc = driver.location; - // Code run each time yylex is called. - loc.step (); + // Code run each time lex() is called. 
%} -{blank}+ loc.step (); +[[:space:]]+ // skip white space -\n loc.lines (yyleng); loc.step (); +"(" return Parser::make_LPAREN (loc()); +")" return Parser::make_RPAREN (loc()); +"*" return Parser::make_STAR (loc()); +"~" return Parser::make_NOT_OP (loc()); +":" return Parser::make_COLON (loc()); +"=>" return Parser::make_ARROW (loc()); -"(" return Parser::make_LPAREN (loc); -")" return Parser::make_RPAREN (loc); +-?[0-9]+ return make_INT64(matched_view(), loc()); --?[0-9]+ return make_INT64(Matched(), loc); +{dq}{str_char}*{dq} return make_StringLit(matched_view(1, 1), loc()); -{dq}{str_char}*{dq} return make_StringLit(string_view{YYText(), size_t(YYLeng())}, loc); +"$"{term_char}+ return Parser::make_PARAM(str(), loc()); +"@"{term_char}+ return Parser::make_FIELD(str(), loc()); -[[:alnum:]]+ return Parser::make_TERM(Matched(), loc); +{term_char}+ return Parser::make_TERM(str(), loc()); -<> return Parser::make_YYEOF(loc); +<> return Parser::make_YYEOF(loc()); %% Parser::symbol_type @@ -74,11 +80,6 @@ make_INT64 (string_view str, const Parser::location_type& loc) } Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& loc) { - DCHECK_GE(src.size(), 2u); - - // Remove quotes - src.remove_prefix(1); - src.remove_suffix(1); string res; if (!absl::CUnescape(src, &res)) { throw Parser::syntax_error (loc, "bad escaped string: " + string(src)); diff --git a/src/core/search/parser.y b/src/core/search/parser.y index 63a816ff8..f7d346f01 100644 --- a/src/core/search/parser.y +++ b/src/core/search/parser.y @@ -13,6 +13,8 @@ // Added to header file before parser declaration. 
%code requires { + #include "core/search/ast_expr.h" + namespace dfly { namespace search { class QueryDriver; @@ -24,11 +26,12 @@ %code { #include "core/search/query_driver.h" -#define yylex driver.scanner()->ParserLex +#define yylex driver->scanner()->Lex + +using namespace std; } -// Only for parser -%param { QueryDriver& driver } +%parse-param { QueryDriver *driver } %locations @@ -40,36 +43,35 @@ %token LPAREN "(" RPAREN ")" + STAR "*" + ARROW "=>" + COLON ":" + NOT_OP "~" ; %token YYEOF -%token TERM "term" +%token TERM "term" PARAM "param" FIELD "field" + %token INT64 "int64" -%nterm bool_expr +%nterm search_expr %printer { yyo << $$; } <*>; %% -%start input; -input: - %empty - | bool_expr - { - std::cout << $1 << std::endl; - } +query: + search_expr + | query search_expr ; -bool_expr: TERM { - std::cout << $1 << std::endl; -} | TERM bool_expr { - std::cout << $1 << std::endl; +search_expr: TERM { + cout << $1 << endl; } %% void -dfly::search::Parser::error(const location_type& l, const std::string& m) +dfly::search::Parser::error(const location_type& l, const string& m) { - std::cerr << l << ": " << m << '\n'; + cerr << l << ": " << m << '\n'; } diff --git a/src/core/search/query_driver.h b/src/core/search/query_driver.h index f07f55f2c..0b13d1fbd 100644 --- a/src/core/search/query_driver.h +++ b/src/core/search/query_driver.h @@ -5,8 +5,8 @@ #pragma once #include -#include +#include "core/search/ast_expr.h" #include "core/search/parser.hh" #include "core/search/scanner.h" @@ -23,19 +23,22 @@ class QueryDriver { return scanner_.get(); } - void SetInput(const std::string& str) { - istr_.str(str); - scanner()->switch_streams(&istr_); + void SetInput(std::string str) { + cur_str_ = std::move(str); + scanner()->in(cur_str_); } Parser::symbol_type Lex() { - return scanner()->ParserLex(*this); + return scanner()->Lex(); } Parser::location_type location; + void Add(AstExpr) { + } + private: - std::istringstream istr_; + std::string cur_str_; std::unique_ptr 
scanner_; }; diff --git a/src/core/search/scanner.h b/src/core/search/scanner.h index 7a9e2a2f8..19f8f3688 100644 --- a/src/core/search/scanner.h +++ b/src/core/search/scanner.h @@ -4,27 +4,30 @@ #pragma once -#if !defined(yyFlexLexerOnce) -#include +// We should not include lexer.h when compiling from lexer.cc file because it already +// includes lexer.h +#ifndef DFLY_LEXER_CC +#include "core/search/lexer.h" #endif -#include "core/search/parser.hh" - namespace dfly { namespace search { -class QueryDriver; - -class Scanner : public yyFlexLexer { +class Scanner : public Lexer { public: Scanner() { } - Parser::symbol_type ParserLex(QueryDriver& drv); + Parser::symbol_type Lex(); private: - std::string Matched() const { - return std::string(YYText(), YYLeng()); + std::string_view matched_view(size_t skip_left = 0, size_t skip_right = 0) const { + std::string_view res(matcher().begin() + skip_left, matcher().size() - skip_left - skip_right); + return res; + } + + dfly::search::location loc() { + return location(); } }; diff --git a/src/core/search/search_parser_test.cc b/src/core/search/search_parser_test.cc index c386ab0ad..04b222167 100644 --- a/src/core/search/search_parser_test.cc +++ b/src/core/search/search_parser_test.cc @@ -14,7 +14,7 @@ using namespace std; class SearchParserTest : public ::testing::Test { protected: SearchParserTest() { - // query_driver_.scanner()->set_debug(1); + query_driver_.scanner()->set_debug(1); } void SetInput(const std::string& str) { @@ -25,6 +25,12 @@ class SearchParserTest : public ::testing::Test { return query_driver_.Lex(); } + int Parse(const std::string& str) { + query_driver_.SetInput(str); + + return Parser(&query_driver_)(); + } + QueryDriver query_driver_; }; @@ -32,7 +38,7 @@ class SearchParserTest : public ::testing::Test { #define NEXT_EQ(tok_enum, type, val) \ { \ auto tok = Lex(); \ - EXPECT_EQ(tok.type_get(), Parser::token::tok_enum); \ + ASSERT_EQ(tok.type_get(), Parser::token::tok_enum); \ EXPECT_EQ(val, 
tok.value.as()); \ } @@ -60,6 +66,20 @@ TEST_F(SearchParserTest, Scanner) { SetInput(R"( "hello\"world" )"); NEXT_EQ(TOK_TERM, string, R"(hello"world)"); + + SetInput(" $param @field:hello"); + NEXT_EQ(TOK_PARAM, string, "$param"); + NEXT_EQ(TOK_FIELD, string, "@field"); + NEXT_TOK(TOK_COLON); + NEXT_EQ(TOK_TERM, string, "hello"); + + SetInput("почтальон Печкин"); + NEXT_EQ(TOK_TERM, string, "почтальон"); + NEXT_EQ(TOK_TERM, string, "Печкин"); +} + +TEST_F(SearchParserTest, Parse) { + EXPECT_EQ(0, Parse(" foo ")); } } // namespace search