mirror of
https://github.com/dragonflydb/dragonfly.git
synced 2025-05-11 02:15:45 +02:00
feat: Add unicode support and replace flex with reflex. (#1143)
Also, add basic test with Parser. Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
parent
0b13eaa943
commit
bf6ee50920
8 changed files with 147 additions and 85 deletions
|
@ -37,31 +37,20 @@ function(gen_bison name)
|
|||
set_source_files_properties(${name}.cc ${name}_base.h PROPERTIES GENERATED TRUE)
|
||||
endfunction()
|
||||
|
||||
# gen_flex(<name>) -- flex-based variant.
#
# Registers a custom command that runs `flex --c++` on ${name}.lex and writes the
# generated scanner to ${name}.cc inside the directory returned by cur_gen_dir().
# Any ${name}.ih file in that directory is removed before flex runs.
# Only the .cc file is declared as OUTPUT of the command; ${name}.h and
# ${name}_base.h are merely marked GENERATED below.
function(gen_flex name)
|
||||
GET_FILENAME_COMPONENT(_in ${name}.lex ABSOLUTE)
|
||||
cur_gen_dir(gen_dir)
|
||||
# set(lib_name "${name}_flex")
|
||||
|
||||
set(full_path_cc ${gen_dir}/${name}.cc)
|
||||
ADD_CUSTOM_COMMAND(
|
||||
OUTPUT ${full_path_cc}
|
||||
COMMAND mkdir -p ${gen_dir}
|
||||
COMMAND ${CMAKE_COMMAND} -E remove ${gen_dir}/${name}.ih
|
||||
COMMAND flex -o ${gen_dir}/${name}.cc --c++ ${_in}
|
||||
DEPENDS ${_in}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
COMMENT "Generating lexer from ${name}.lex" VERBATIM)
|
||||
|
||||
set_source_files_properties(${gen_dir}/${name}.h ${gen_dir}/${name}.cc ${gen_dir}/${name}_base.h
|
||||
PROPERTIES GENERATED TRUE)
|
||||
endfunction()
|
||||
|
||||
add_third_party(
|
||||
dconv
|
||||
URL https://github.com/google/double-conversion/archive/refs/tags/v3.2.0.tar.gz
|
||||
LIB libdouble-conversion.a
|
||||
)
|
||||
|
||||
# RE/flex lexer generator, fetched from the v3.3.2 release tarball and configured
# with an install prefix of ${THIRD_PARTY_LIB_DIR}/reflex.
# NOTE(review): --disable-avx presumably turns off AVX-specific code in the RE/flex
# build -- confirm against the RE/flex configure options.
add_third_party(
|
||||
reflex
|
||||
URL https://github.com/Genivia/RE-flex/archive/refs/tags/v3.3.2.tar.gz
|
||||
CONFIGURE_COMMAND <SOURCE_DIR>/configure --disable-avx --prefix=${THIRD_PARTY_LIB_DIR}/reflex
|
||||
)
|
||||
|
||||
# Path of the reflex executable under the install prefix configured above.
set(REFLEX "${THIRD_PARTY_LIB_DIR}/reflex/bin/reflex")
|
||||
|
||||
add_third_party(
|
||||
jsoncons
|
||||
URL https://github.com/danielaparker/jsoncons/archive/refs/tags/v0.168.7.tar.gz
|
||||
|
@ -111,6 +100,25 @@ else(ENABLE_GIT_VERSION)
|
|||
set(PRJ_BUILD_TIME "bigbang")
|
||||
endif(ENABLE_GIT_VERSION)
|
||||
|
||||
|
||||
# gen_flex(<name>)
#
# Registers a custom command that runs the RE/flex binary (${REFLEX}) on
# ${name}.lex and emits two files into the directory returned by cur_gen_dir():
#   ${name}.cc - the generated scanner implementation
#   ${name}.h  - the generated scanner header (--header-file)
# The scanner is generated with --unicode, --bison-complete and --bison-locations.
#
# The command depends on the .lex input and on `reflex_project`
# (NOTE(review): presumably the target created by add_third_party(reflex ...) --
# confirm), so the generator is built before it is invoked.
#
# The output directory is created through CMake's own command mode instead of a
# shell `mkdir -p`, so this step does not rely on a POSIX utility being present.
function(gen_flex name)
  get_filename_component(_in ${name}.lex ABSOLUTE)
  cur_gen_dir(gen_dir)

  add_custom_command(
    OUTPUT ${gen_dir}/${name}.cc ${gen_dir}/${name}.h
    COMMAND ${CMAKE_COMMAND} -E make_directory ${gen_dir}
    COMMAND ${REFLEX} -o ${gen_dir}/${name}.cc --unicode --header-file=${gen_dir}/${name}.h
            --bison-complete --bison-locations ${_in}
    DEPENDS ${_in} reflex_project
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
    COMMENT "Generating lexer from ${name}.lex" VERBATIM)

  set_source_files_properties(${gen_dir}/${name}.h ${gen_dir}/${name}.cc
                              PROPERTIES GENERATED TRUE)
endfunction()
|
||||
|
||||
# NOTE: version.cc is generated into the source tree (CMAKE_CURRENT_SOURCE_DIR), not the build directory.
|
||||
configure_file(server/version.cc.in "${CMAKE_CURRENT_SOURCE_DIR}/server/version.cc" @ONLY)
|
||||
|
||||
|
|
|
@ -6,5 +6,5 @@ gen_bison(parser)
|
|||
cur_gen_dir(gen_dir)
|
||||
|
||||
add_library(query_parser query_driver.cc ${gen_dir}/parser.cc ${gen_dir}/lexer.cc)
|
||||
target_link_libraries(query_parser base absl::strings)
|
||||
target_link_libraries(query_parser base absl::strings TRDP::reflex)
|
||||
cxx_test(search_parser_test query_parser)
|
||||
|
|
25
src/core/search/ast_expr.h
Normal file
25
src/core/search/ast_expr.h
Normal file
|
@ -0,0 +1,25 @@
|
|||
// Copyright 2023, DragonflyDB authors. All rights reserved.
|
||||
// See LICENSE for licensing terms.
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ostream>
|
||||
|
||||
namespace dfly {

namespace search {

// Node type for the search query AST. Currently an empty class; it is used as the
// semantic value type of grammar non-terminals in the bison parser.
class AstExpr {};

// Writes a textual representation of an AstExpr to `os` and returns `os`.
// Every expression is currently printed as the literal "ast".
//
// The overload lives in the same namespace as AstExpr so that argument-dependent
// lookup finds it for any `stream << expr` expression. It must not be declared
// inside namespace std: adding new function overloads there is undefined behavior.
inline std::ostream& operator<<(std::ostream& os, const AstExpr& /*ast*/) {
  return os << "ast";
}

}  // namespace search
}  // namespace dfly
|
|
@ -1,21 +1,24 @@
|
|||
%top{
|
||||
// Our lexer needs to know about Parser::symbol_type
|
||||
#include "core/search/parser.hh"
|
||||
}
|
||||
|
||||
/* Seems that flex does not have unicode support.
|
||||
TODO: to consider https://en.wikipedia.org/wiki/RE/flex in the future.
|
||||
*/
|
||||
%{
|
||||
#include <absl/strings/escaping.h>
|
||||
#include <absl/strings/numbers.h>
|
||||
#include "base/logging.h"
|
||||
#include "core/search/query_driver.h"
|
||||
|
||||
// Define main lexer function. QueryDriver is the shared state between scanner and parser
|
||||
#undef YY_DECL
|
||||
#define YY_DECL auto dfly::search::Scanner::ParserLex(QueryDriver& driver) -> Parser::symbol_type
|
||||
#include "base/logging.h"
|
||||
|
||||
#define DFLY_LEXER_CC 1
|
||||
#include "core/search/scanner.h"
|
||||
#undef DFLY_LEXER_CC
|
||||
%}
|
||||
|
||||
%option noyywrap nounput noinput batch debug
|
||||
%option yyclass="dfly::Scanner"
|
||||
%option c++
|
||||
%o bison-cc-namespace="dfly.search" bison-cc-parser="Parser"
|
||||
%o namespace="dfly.search"
|
||||
%o class="Scanner" lex="Lex"
|
||||
%o nodefault batch
|
||||
/* %o debug */
|
||||
|
||||
/* Declarations before lexer implementation. */
|
||||
%{
|
||||
|
@ -32,35 +35,38 @@ dq \"
|
|||
esc_chars ['"\?\\abfnrtv]
|
||||
esc_seq \\{esc_chars}
|
||||
str_char ([^"]|{esc_seq})
|
||||
term_char [_]|\w
|
||||
|
||||
|
||||
%{
|
||||
// Code run each time a pattern is matched.
|
||||
# define YY_USER_ACTION loc.columns (yyleng);
|
||||
%}
|
||||
|
||||
%%
|
||||
|
||||
%{
|
||||
// A handy shortcut to the location held by the driver.
|
||||
auto& loc = driver.location;
|
||||
// Code run each time yylex is called.
|
||||
loc.step ();
|
||||
// Code run each time lex() is called.
|
||||
%}
|
||||
|
||||
{blank}+ loc.step ();
|
||||
[[:space:]]+ // skip white space
|
||||
|
||||
\n loc.lines (yyleng); loc.step ();
|
||||
"(" return Parser::make_LPAREN (loc());
|
||||
")" return Parser::make_RPAREN (loc());
|
||||
"*" return Parser::make_STAR (loc());
|
||||
"~" return Parser::make_NOT_OP (loc());
|
||||
":" return Parser::make_COLON (loc());
|
||||
"=>" return Parser::make_ARROW (loc());
|
||||
|
||||
"(" return Parser::make_LPAREN (loc);
|
||||
")" return Parser::make_RPAREN (loc);
|
||||
-?[0-9]+ return make_INT64(matched_view(), loc());
|
||||
|
||||
-?[0-9]+ return make_INT64(Matched(), loc);
|
||||
{dq}{str_char}*{dq} return make_StringLit(matched_view(1, 1), loc());
|
||||
|
||||
{dq}{str_char}*{dq} return make_StringLit(string_view{YYText(), size_t(YYLeng())}, loc);
|
||||
"$"{term_char}+ return Parser::make_PARAM(str(), loc());
|
||||
"@"{term_char}+ return Parser::make_FIELD(str(), loc());
|
||||
|
||||
[[:alnum:]]+ return Parser::make_TERM(Matched(), loc);
|
||||
{term_char}+ return Parser::make_TERM(str(), loc());
|
||||
|
||||
<<EOF>> return Parser::make_YYEOF(loc);
|
||||
<<EOF>> return Parser::make_YYEOF(loc());
|
||||
%%
|
||||
|
||||
Parser::symbol_type
|
||||
|
@ -74,11 +80,6 @@ make_INT64 (string_view str, const Parser::location_type& loc)
|
|||
}
|
||||
|
||||
Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& loc) {
|
||||
DCHECK_GE(src.size(), 2u);
|
||||
|
||||
// Remove quotes
|
||||
src.remove_prefix(1);
|
||||
src.remove_suffix(1);
|
||||
string res;
|
||||
if (!absl::CUnescape(src, &res)) {
|
||||
throw Parser::syntax_error (loc, "bad escaped string: " + string(src));
|
||||
|
|
|
@ -13,6 +13,8 @@
|
|||
|
||||
// Added to header file before parser declaration.
|
||||
%code requires {
|
||||
#include "core/search/ast_expr.h"
|
||||
|
||||
namespace dfly {
|
||||
namespace search {
|
||||
class QueryDriver;
|
||||
|
@ -24,11 +26,12 @@
|
|||
%code {
|
||||
#include "core/search/query_driver.h"
|
||||
|
||||
#define yylex driver.scanner()->ParserLex
|
||||
#define yylex driver->scanner()->Lex
|
||||
|
||||
using namespace std;
|
||||
}
|
||||
|
||||
// Only for parser
|
||||
%param { QueryDriver& driver }
|
||||
%parse-param { QueryDriver *driver }
|
||||
|
||||
%locations
|
||||
|
||||
|
@ -40,36 +43,35 @@
|
|||
%token
|
||||
LPAREN "("
|
||||
RPAREN ")"
|
||||
STAR "*"
|
||||
ARROW "=>"
|
||||
COLON ":"
|
||||
NOT_OP "~"
|
||||
;
|
||||
|
||||
%token YYEOF
|
||||
%token <std::string> TERM "term"
|
||||
%token <std::string> TERM "term" PARAM "param" FIELD "field"
|
||||
|
||||
%token <int64_t> INT64 "int64"
|
||||
%nterm <int> bool_expr
|
||||
%nterm <AstExpr> search_expr
|
||||
|
||||
%printer { yyo << $$; } <*>;
|
||||
|
||||
%%
|
||||
%start input;
|
||||
|
||||
input:
|
||||
%empty
|
||||
| bool_expr
|
||||
{
|
||||
std::cout << $1 << std::endl;
|
||||
}
|
||||
query:
|
||||
search_expr
|
||||
| query search_expr
|
||||
;
|
||||
|
||||
bool_expr: TERM {
|
||||
std::cout << $1 << std::endl;
|
||||
} | TERM bool_expr {
|
||||
std::cout << $1 << std::endl;
|
||||
search_expr: TERM {
|
||||
cout << $1 << endl;
|
||||
}
|
||||
|
||||
%%
|
||||
|
||||
void
|
||||
dfly::search::Parser::error(const location_type& l, const std::string& m)
|
||||
dfly::search::Parser::error(const location_type& l, const string& m)
|
||||
{
|
||||
std::cerr << l << ": " << m << '\n';
|
||||
cerr << l << ": " << m << '\n';
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
|
||||
#include "core/search/ast_expr.h"
|
||||
#include "core/search/parser.hh"
|
||||
#include "core/search/scanner.h"
|
||||
|
||||
|
@ -23,19 +23,22 @@ class QueryDriver {
|
|||
return scanner_.get();
|
||||
}
|
||||
|
||||
void SetInput(const std::string& str) {
|
||||
istr_.str(str);
|
||||
scanner()->switch_streams(&istr_);
|
||||
void SetInput(std::string str) {
|
||||
cur_str_ = std::move(str);
|
||||
scanner()->in(cur_str_);
|
||||
}
|
||||
|
||||
Parser::symbol_type Lex() {
|
||||
return scanner()->ParserLex(*this);
|
||||
return scanner()->Lex();
|
||||
}
|
||||
|
||||
Parser::location_type location;
|
||||
|
||||
void Add(AstExpr) {
|
||||
}
|
||||
|
||||
private:
|
||||
std::istringstream istr_;
|
||||
std::string cur_str_;
|
||||
std::unique_ptr<Scanner> scanner_;
|
||||
};
|
||||
|
||||
|
|
|
@ -4,27 +4,30 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#if !defined(yyFlexLexerOnce)
|
||||
#include <FlexLexer.h>
|
||||
// We should not include lexer.h when compiling from lexer.cc file because it already
|
||||
// includes lexer.h
|
||||
#ifndef DFLY_LEXER_CC
|
||||
#include "core/search/lexer.h"
|
||||
#endif
|
||||
|
||||
#include "core/search/parser.hh"
|
||||
|
||||
namespace dfly {
|
||||
namespace search {
|
||||
|
||||
class QueryDriver;
|
||||
|
||||
class Scanner : public yyFlexLexer {
|
||||
class Scanner : public Lexer {
|
||||
public:
|
||||
Scanner() {
|
||||
}
|
||||
|
||||
Parser::symbol_type ParserLex(QueryDriver& drv);
|
||||
Parser::symbol_type Lex();
|
||||
|
||||
private:
|
||||
std::string Matched() const {
|
||||
return std::string(YYText(), YYLeng());
|
||||
std::string_view matched_view(size_t skip_left = 0, size_t skip_right = 0) const {
|
||||
std::string_view res(matcher().begin() + skip_left, matcher().size() - skip_left - skip_right);
|
||||
return res;
|
||||
}
|
||||
|
||||
dfly::search::location loc() {
|
||||
return location();
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ using namespace std;
|
|||
class SearchParserTest : public ::testing::Test {
|
||||
protected:
|
||||
SearchParserTest() {
|
||||
// query_driver_.scanner()->set_debug(1);
|
||||
query_driver_.scanner()->set_debug(1);
|
||||
}
|
||||
|
||||
void SetInput(const std::string& str) {
|
||||
|
@ -25,6 +25,12 @@ class SearchParserTest : public ::testing::Test {
|
|||
return query_driver_.Lex();
|
||||
}
|
||||
|
||||
int Parse(const std::string& str) {
|
||||
query_driver_.SetInput(str);
|
||||
|
||||
return Parser(&query_driver_)();
|
||||
}
|
||||
|
||||
QueryDriver query_driver_;
|
||||
};
|
||||
|
||||
|
@ -32,7 +38,7 @@ class SearchParserTest : public ::testing::Test {
|
|||
#define NEXT_EQ(tok_enum, type, val) \
|
||||
{ \
|
||||
auto tok = Lex(); \
|
||||
EXPECT_EQ(tok.type_get(), Parser::token::tok_enum); \
|
||||
ASSERT_EQ(tok.type_get(), Parser::token::tok_enum); \
|
||||
EXPECT_EQ(val, tok.value.as<type>()); \
|
||||
}
|
||||
|
||||
|
@ -60,6 +66,20 @@ TEST_F(SearchParserTest, Scanner) {
|
|||
|
||||
SetInput(R"( "hello\"world" )");
|
||||
NEXT_EQ(TOK_TERM, string, R"(hello"world)");
|
||||
|
||||
SetInput(" $param @field:hello");
|
||||
NEXT_EQ(TOK_PARAM, string, "$param");
|
||||
NEXT_EQ(TOK_FIELD, string, "@field");
|
||||
NEXT_TOK(TOK_COLON);
|
||||
NEXT_EQ(TOK_TERM, string, "hello");
|
||||
|
||||
SetInput("почтальон Печкин");
|
||||
NEXT_EQ(TOK_TERM, string, "почтальон");
|
||||
NEXT_EQ(TOK_TERM, string, "Печкин");
|
||||
}
|
||||
|
||||
// Smoke test for the parser: a single bare term surrounded by whitespace is
// expected to make Parse() return 0.
TEST_F(SearchParserTest, Parse) {
|
||||
EXPECT_EQ(0, Parse(" foo "));
|
||||
}
|
||||
|
||||
} // namespace search
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue