feat: Add unicode support and replace flex with reflex. (#1143)

Also, add a basic test for the Parser.

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
Roman Gershman 2023-04-26 17:12:00 +02:00 committed by GitHub
parent 0b13eaa943
commit bf6ee50920
8 changed files with 147 additions and 85 deletions
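
For orientation, here is a minimal sketch of how the pieces introduced by this commit are driven end to end. It mirrors the Parse() helper added to search_parser_test.cc further down; the standalone main() is only an illustration, but the header path, the QueryDriver/Parser names, the pointer parse-param, and the "0 on success" convention are all taken from the diff below.

// Sketch: feed a query to the reflex-based scanner and run the bison parser over it.
#include "core/search/query_driver.h"

int main() {
  dfly::search::QueryDriver driver;
  driver.SetInput("foo bar");                     // query text handed to the scanner
  int rc = dfly::search::Parser(&driver)();       // returns 0 when the input parses
  return rc;
}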

View file

@@ -37,31 +37,20 @@ function(gen_bison name)
   set_source_files_properties(${name}.cc ${name}_base.h PROPERTIES GENERATED TRUE)
 endfunction()
-function(gen_flex name)
-  GET_FILENAME_COMPONENT(_in ${name}.lex ABSOLUTE)
-  cur_gen_dir(gen_dir)
-  # set(lib_name "${name}_flex")
-  set(full_path_cc ${gen_dir}/${name}.cc)
-  ADD_CUSTOM_COMMAND(
-    OUTPUT ${full_path_cc}
-    COMMAND mkdir -p ${gen_dir}
-    COMMAND ${CMAKE_COMMAND} -E remove ${gen_dir}/${name}.ih
-    COMMAND flex -o ${gen_dir}/${name}.cc --c++ ${_in}
-    DEPENDS ${_in}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-    COMMENT "Generating lexer from ${name}.lex" VERBATIM)
-  set_source_files_properties(${gen_dir}/${name}.h ${gen_dir}/${name}.cc ${gen_dir}/${name}_base.h
-    PROPERTIES GENERATED TRUE)
-endfunction()
 add_third_party(
   dconv
   URL https://github.com/google/double-conversion/archive/refs/tags/v3.2.0.tar.gz
   LIB libdouble-conversion.a
 )
+add_third_party(
+  reflex
+  URL https://github.com/Genivia/RE-flex/archive/refs/tags/v3.3.2.tar.gz
+  CONFIGURE_COMMAND <SOURCE_DIR>/configure --disable-avx --prefix=${THIRD_PARTY_LIB_DIR}/reflex
+)
+set(REFLEX "${THIRD_PARTY_LIB_DIR}/reflex/bin/reflex")
 add_third_party(
   jsoncons
   URL https://github.com/danielaparker/jsoncons/archive/refs/tags/v0.168.7.tar.gz
@@ -111,6 +100,25 @@ else(ENABLE_GIT_VERSION)
   set(PRJ_BUILD_TIME "bigbang")
 endif(ENABLE_GIT_VERSION)
+function(gen_flex name)
+  GET_FILENAME_COMPONENT(_in ${name}.lex ABSOLUTE)
+  cur_gen_dir(gen_dir)
+  ADD_CUSTOM_COMMAND(
+    OUTPUT ${gen_dir}/${name}.cc ${gen_dir}/${name}.h
+    COMMAND mkdir -p ${gen_dir}
+    COMMAND ${REFLEX} -o ${gen_dir}/${name}.cc --unicode --header-file=${gen_dir}/${name}.h
+            --bison-complete --bison-locations ${_in}
+    DEPENDS ${_in} reflex_project
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMENT "Generating lexer from ${name}.lex" VERBATIM)
+  set_source_files_properties(${gen_dir}/${name}.h ${gen_dir}/${name}.cc
+    PROPERTIES GENERATED TRUE)
+endfunction()
 # the output file resides in the build directory.
 configure_file(server/version.cc.in "${CMAKE_CURRENT_SOURCE_DIR}/server/version.cc" @ONLY)

View file

@@ -6,5 +6,5 @@ gen_bison(parser)
 cur_gen_dir(gen_dir)
 add_library(query_parser query_driver.cc ${gen_dir}/parser.cc ${gen_dir}/lexer.cc)
-target_link_libraries(query_parser base absl::strings)
+target_link_libraries(query_parser base absl::strings TRDP::reflex)
 cxx_test(search_parser_test query_parser)

View file

@@ -0,0 +1,25 @@
+// Copyright 2023, DragonflyDB authors. All rights reserved.
+// See LICENSE for licensing terms.
+//
+#pragma once
+
+#include <ostream>
+
+namespace dfly {
+namespace search {
+
+class AstExpr {};
+
+} // namespace search
+} // namespace dfly
+
+namespace std {
+
+inline std::ostream& operator<<(std::ostream& os, const dfly::search::AstExpr& ast) {
+  os << "ast";
+  return os;
+}
+
+} // namespace std

View file

@@ -1,21 +1,24 @@
-/* Seems that flex does not have unicode support.
-   TODO: to consider https://en.wikipedia.org/wiki/RE/flex in the future.
-*/
+%top{
+  // Our lexer need to know about Parser::symbol_type
+  #include "core/search/parser.hh"
+}
 %{
 #include <absl/strings/escaping.h>
 #include <absl/strings/numbers.h>
-#include "base/logging.h"
-#include "core/search/query_driver.h"
-// Define main lexer function. QueryDriver is the shared state between scanner and parser
-#undef YY_DECL
-#define YY_DECL auto dfly::search::Scanner::ParserLex(QueryDriver& driver) -> Parser::symbol_type
+#include "base/logging.h"
+#define DFLY_LEXER_CC 1
+#include "core/search/scanner.h"
+#undef DFLY_LEXER_CC
 %}
-%option noyywrap nounput noinput batch debug
-%option yyclass="dfly::Scanner"
-%option c++
+%o bison-cc-namespace="dfly.search" bison-cc-parser="Parser"
+%o namespace="dfly.search"
+%o class="Scanner" lex="Lex"
+%o nodefault batch
+/* %o debug */
 /* Declarations before lexer implementation. */
 %{
@@ -32,35 +35,38 @@ dq \"
 esc_chars ['"\?\\abfnrtv]
 esc_seq \\{esc_chars}
 str_char ([^"]|{esc_seq})
+term_char [_]|\w
 %{
   // Code run each time a pattern is matched.
-  # define YY_USER_ACTION loc.columns (yyleng);
 %}
 %%
 %{
-  // A handy shortcut to the location held by the driver.
-  auto& loc = driver.location;
-  // Code run each time yylex is called.
-  loc.step ();
+  // Code run each time lex() is called.
 %}
-{blank}+             loc.step ();
-\n                   loc.lines (yyleng); loc.step ();
-"("                  return Parser::make_LPAREN (loc);
-")"                  return Parser::make_RPAREN (loc);
--?[0-9]+             return make_INT64(Matched(), loc);
-{dq}{str_char}*{dq}  return make_StringLit(string_view{YYText(), size_t(YYLeng())}, loc);
-[[:alnum:]]+         return Parser::make_TERM(Matched(), loc);
-<<EOF>>              return Parser::make_YYEOF(loc);
+[[:space:]]+         // skip white space
+"("                  return Parser::make_LPAREN (loc());
+")"                  return Parser::make_RPAREN (loc());
+"*"                  return Parser::make_STAR (loc());
+"~"                  return Parser::make_NOT_OP (loc());
+":"                  return Parser::make_COLON (loc());
+"=>"                 return Parser::make_ARROW (loc());
+-?[0-9]+             return make_INT64(matched_view(), loc());
+{dq}{str_char}*{dq}  return make_StringLit(matched_view(1, 1), loc());
+"$"{term_char}+      return Parser::make_PARAM(str(), loc());
+"@"{term_char}+      return Parser::make_FIELD(str(), loc());
+{term_char}+         return Parser::make_TERM(str(), loc());
+<<EOF>>              return Parser::make_YYEOF(loc());
 %%
 Parser::symbol_type
@@ -74,11 +80,6 @@ make_INT64 (string_view str, const Parser::location_type& loc)
 }
 Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& loc) {
-  DCHECK_GE(src.size(), 2u);
-  // Remove quotes
-  src.remove_prefix(1);
-  src.remove_suffix(1);
   string res;
   if (!absl::CUnescape(src, &res)) {
     throw Parser::syntax_error (loc, "bad escaped string: " + string(src));

View file

@@ -13,6 +13,8 @@
 // Added to header file before parser declaration.
 %code requires {
+  #include "core/search/ast_expr.h"
+
   namespace dfly {
   namespace search {
     class QueryDriver;
@@ -24,11 +26,12 @@
 %code {
 #include "core/search/query_driver.h"
-#define yylex driver.scanner()->ParserLex
+#define yylex driver->scanner()->Lex
+
+using namespace std;
 }
-// Only for parser
-%param { QueryDriver& driver }
+%parse-param { QueryDriver *driver }
 %locations
@@ -40,36 +43,35 @@
 %token
   LPAREN "("
   RPAREN ")"
+  STAR "*"
+  ARROW "=>"
+  COLON ":"
+  NOT_OP "~"
 ;
 %token YYEOF
-%token <std::string> TERM "term"
+%token <std::string> TERM "term" PARAM "param" FIELD "field"
 %token <int64_t> INT64 "int64"
-%nterm <int> bool_expr
+%nterm <AstExpr> search_expr
 %printer { yyo << $$; } <*>;
 %%
-%start input;
-input:
-  %empty
-  | bool_expr
-  {
-    std::cout << $1 << std::endl;
-  }
+query:
+  search_expr
+  | query search_expr
   ;
-bool_expr: TERM {
-  std::cout << $1 << std::endl;
-} | TERM bool_expr {
-  std::cout << $1 << std::endl;
-}
+search_expr: TERM {
+  cout << $1 << endl;
+}
 %%
 void
-dfly::search::Parser::error(const location_type& l, const std::string& m)
+dfly::search::Parser::error(const location_type& l, const string& m)
 {
-  std::cerr << l << ": " << m << '\n';
+  cerr << l << ": " << m << '\n';
 }

View file

@@ -5,8 +5,8 @@
 #pragma once
 #include <memory>
-#include <sstream>
+#include "core/search/ast_expr.h"
 #include "core/search/parser.hh"
 #include "core/search/scanner.h"
@@ -23,19 +23,22 @@ class QueryDriver {
     return scanner_.get();
   }
-  void SetInput(const std::string& str) {
-    istr_.str(str);
-    scanner()->switch_streams(&istr_);
+  void SetInput(std::string str) {
+    cur_str_ = std::move(str);
+    scanner()->in(cur_str_);
   }
   Parser::symbol_type Lex() {
-    return scanner()->ParserLex(*this);
+    return scanner()->Lex();
   }
   Parser::location_type location;
+
+  void Add(AstExpr) {
+  }
  private:
-  std::istringstream istr_;
+  std::string cur_str_;
   std::unique_ptr<Scanner> scanner_;
 };
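
The new SetInput() stores the query in cur_str_ and points the reflex scanner at that buffer via in(), presumably so the text stays alive for the whole scan. A small usage sketch follows; the standalone main() is an illustration, and the TOK_YYEOF name assumes the same TOK_ token prefix that the test macros later in this commit rely on.

// Sketch: token-by-token scanning through QueryDriver::Lex().
#include <iostream>

#include "core/search/query_driver.h"

int main() {
  using dfly::search::Parser;

  dfly::search::QueryDriver driver;
  driver.SetInput("@field:hello");  // kept alive in cur_str_ while the scanner reads it

  // Count tokens until the end-of-input symbol is returned.
  int ntokens = 0;
  while (driver.Lex().type_get() != Parser::token::TOK_YYEOF)
    ++ntokens;
  std::cout << "tokens: " << ntokens << '\n';  // expected: 3 (FIELD, COLON, TERM)
  return 0;
}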

View file

@@ -4,27 +4,30 @@
 #pragma once
-#if !defined(yyFlexLexerOnce)
-#include <FlexLexer.h>
+// We should not include lexer.h when compiling from lexer.cc file because it already
+// includes lexer.h
+#ifndef DFLY_LEXER_CC
+#include "core/search/lexer.h"
 #endif
-#include "core/search/parser.hh"
 namespace dfly {
 namespace search {
-class QueryDriver;
-class Scanner : public yyFlexLexer {
+class Scanner : public Lexer {
  public:
  Scanner() {
  }
-  Parser::symbol_type ParserLex(QueryDriver& drv);
+  Parser::symbol_type Lex();
 private:
-  std::string Matched() const {
-    return std::string(YYText(), YYLeng());
-  }
+  std::string_view matched_view(size_t skip_left = 0, size_t skip_right = 0) const {
+    std::string_view res(matcher().begin() + skip_left, matcher().size() - skip_left - skip_right);
+    return res;
+  }
+
+  dfly::search::location loc() {
+    return location();
+  }
 };
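
matched_view(skip_left, skip_right) trims a fixed number of characters from each side of the current reflex match; the lexer uses matched_view(1, 1) to drop the surrounding quotes of a string literal before make_StringLit() unescapes the body. Below is a standalone illustration of that trimming on a plain std::string_view; trim_sides is a hypothetical stand-in for the real matcher-based helper.

// Illustration only: what matched_view(1, 1) yields for a quoted string match.
#include <cassert>
#include <string_view>

static std::string_view trim_sides(std::string_view match, size_t skip_left, size_t skip_right) {
  // Drop skip_left characters from the front and skip_right from the back of the match.
  return std::string_view(match.data() + skip_left, match.size() - skip_left - skip_right);
}

int main() {
  std::string_view matched = "\"hello\\\"world\"";        // what {dq}{str_char}*{dq} matches
  assert(trim_sides(matched, 1, 1) == "hello\\\"world");  // quotes stripped, escapes still raw
  return 0;
}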

View file

@@ -14,7 +14,7 @@ using namespace std;
 class SearchParserTest : public ::testing::Test {
  protected:
   SearchParserTest() {
-    // query_driver_.scanner()->set_debug(1);
+    query_driver_.scanner()->set_debug(1);
   }
   void SetInput(const std::string& str) {
@@ -25,6 +25,12 @@ class SearchParserTest : public ::testing::Test {
     return query_driver_.Lex();
   }
+
+  int Parse(const std::string& str) {
+    query_driver_.SetInput(str);
+    return Parser(&query_driver_)();
+  }
+
   QueryDriver query_driver_;
 };
@@ -32,7 +38,7 @@ class SearchParserTest : public ::testing::Test {
 #define NEXT_EQ(tok_enum, type, val) \
   { \
     auto tok = Lex(); \
-    EXPECT_EQ(tok.type_get(), Parser::token::tok_enum); \
+    ASSERT_EQ(tok.type_get(), Parser::token::tok_enum); \
     EXPECT_EQ(val, tok.value.as<type>()); \
   }
@@ -60,6 +66,20 @@ TEST_F(SearchParserTest, Scanner) {
   SetInput(R"( "hello\"world" )");
   NEXT_EQ(TOK_TERM, string, R"(hello"world)");
+
+  SetInput(" $param @field:hello");
+  NEXT_EQ(TOK_PARAM, string, "$param");
+  NEXT_EQ(TOK_FIELD, string, "@field");
+  NEXT_TOK(TOK_COLON);
+  NEXT_EQ(TOK_TERM, string, "hello");
+
+  SetInput("почтальон Печкин");
+  NEXT_EQ(TOK_TERM, string, "почтальон");
+  NEXT_EQ(TOK_TERM, string, "Печкин");
+}
+
+TEST_F(SearchParserTest, Parse) {
+  EXPECT_EQ(0, Parse(" foo "));
 }
 } // namespace search