Mirror of https://github.com/dragonflydb/dragonfly.git
feat: Add unicode support and replace flex with reflex (#1143)

Also, add a basic test with Parser.

Signed-off-by: Roman Gershman <roman@dragonflydb.io>

parent 0b13eaa943
commit bf6ee50920

8 changed files with 147 additions and 85 deletions
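For orientation, here is a minimal usage sketch (not part of the commit) of how the pieces touched below fit together, based on the QueryDriver, parser, and test changes in this diff. The query strings are illustrative only.

// Hypothetical usage sketch, assuming the in-repo headers changed below; it
// mirrors the SetInput()/Lex()/Parse() flow exercised by the test at the
// bottom of this diff.
#include <iostream>

#include "core/search/query_driver.h"

int main() {
  dfly::search::QueryDriver driver;

  // SetInput() now stores the query string and feeds it to the reflex scanner
  // via scanner()->in(...); Lex() returns bison-complete symbols, and unicode
  // terms such as "почтальон" tokenize because reflex runs with --unicode.
  driver.SetInput("@field:hello почтальон");
  auto tok = driver.Lex();  // first symbol, e.g. a FIELD token for "@field"
  (void)tok;

  // The parser now takes the driver as a pointer (%parse-param), so a parse
  // run is constructed with &driver, exactly as in the new Parse() helper.
  driver.SetInput(" foo ");
  int rc = dfly::search::Parser(&driver)();
  std::cout << "parse rc=" << rc << std::endl;  // 0 on success
  return 0;
}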
@@ -37,31 +37,20 @@ function(gen_bison name)
   set_source_files_properties(${name}.cc ${name}_base.h PROPERTIES GENERATED TRUE)
 endfunction()
 
-function(gen_flex name)
-  GET_FILENAME_COMPONENT(_in ${name}.lex ABSOLUTE)
-  cur_gen_dir(gen_dir)
-  # set(lib_name "${name}_flex")
-
-  set(full_path_cc ${gen_dir}/${name}.cc)
-  ADD_CUSTOM_COMMAND(
-    OUTPUT ${full_path_cc}
-    COMMAND mkdir -p ${gen_dir}
-    COMMAND ${CMAKE_COMMAND} -E remove ${gen_dir}/${name}.ih
-    COMMAND flex -o ${gen_dir}/${name}.cc --c++ ${_in}
-    DEPENDS ${_in}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-    COMMENT "Generating lexer from ${name}.lex" VERBATIM)
-
-  set_source_files_properties(${gen_dir}/${name}.h ${gen_dir}/${name}.cc ${gen_dir}/${name}_base.h
-    PROPERTIES GENERATED TRUE)
-endfunction()
-
 add_third_party(
   dconv
   URL https://github.com/google/double-conversion/archive/refs/tags/v3.2.0.tar.gz
   LIB libdouble-conversion.a
 )
 
+add_third_party(
+  reflex
+  URL https://github.com/Genivia/RE-flex/archive/refs/tags/v3.3.2.tar.gz
+  CONFIGURE_COMMAND <SOURCE_DIR>/configure --disable-avx --prefix=${THIRD_PARTY_LIB_DIR}/reflex
+)
+
+set(REFLEX "${THIRD_PARTY_LIB_DIR}/reflex/bin/reflex")
+
 add_third_party(
   jsoncons
   URL https://github.com/danielaparker/jsoncons/archive/refs/tags/v0.168.7.tar.gz
@@ -111,6 +100,25 @@ else(ENABLE_GIT_VERSION)
   set(PRJ_BUILD_TIME "bigbang")
 endif(ENABLE_GIT_VERSION)
 
+
+function(gen_flex name)
+  GET_FILENAME_COMPONENT(_in ${name}.lex ABSOLUTE)
+  cur_gen_dir(gen_dir)
+
+  ADD_CUSTOM_COMMAND(
+    OUTPUT ${gen_dir}/${name}.cc ${gen_dir}/${name}.h
+    COMMAND mkdir -p ${gen_dir}
+
+    COMMAND ${REFLEX} -o ${gen_dir}/${name}.cc --unicode --header-file=${gen_dir}/${name}.h
+            --bison-complete --bison-locations ${_in}
+    DEPENDS ${_in} reflex_project
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    COMMENT "Generating lexer from ${name}.lex" VERBATIM)
+
+  set_source_files_properties(${gen_dir}/${name}.h ${gen_dir}/${name}.cc
+    PROPERTIES GENERATED TRUE)
+endfunction()
+
 # the output file resides in the build directory.
 configure_file(server/version.cc.in "${CMAKE_CURRENT_SOURCE_DIR}/server/version.cc" @ONLY)
 
@@ -6,5 +6,5 @@ gen_bison(parser)
 cur_gen_dir(gen_dir)
 
 add_library(query_parser query_driver.cc ${gen_dir}/parser.cc ${gen_dir}/lexer.cc)
-target_link_libraries(query_parser base absl::strings)
+target_link_libraries(query_parser base absl::strings TRDP::reflex)
 cxx_test(search_parser_test query_parser)
src/core/search/ast_expr.h (new file, 25 lines)

@@ -0,0 +1,25 @@
+// Copyright 2023, DragonflyDB authors. All rights reserved.
+// See LICENSE for licensing terms.
+//
+
+#pragma once
+
+#include <ostream>
+
+namespace dfly {
+
+namespace search {
+
+class AstExpr {};
+
+}  // namespace search
+}  // namespace dfly
+
+namespace std {
+
+inline std::ostream& operator<<(std::ostream& os, const dfly::search::AstExpr& ast) {
+  os << "ast";
+  return os;
+}
+
+}  // namespace std
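The std::ostream overload above exists so that the parser's `%printer { yyo << $$; }` directive (further down in this diff) keeps compiling once `search_expr` carries an AstExpr value. A minimal sketch, not part of the commit, using only the header added here:

// Exercises the AstExpr stub and its operator<< from ast_expr.h; for now the
// class is empty and the overload just prints "ast".
#include <iostream>

#include "core/search/ast_expr.h"

int main() {
  dfly::search::AstExpr ast;
  std::cout << ast << std::endl;  // prints "ast"
  return 0;
}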
@@ -1,21 +1,24 @@
+%top{
+  // Our lexer need to know about Parser::symbol_type
+  #include "core/search/parser.hh"
+}
+
-/* Seems that flex does not have unicode support.
-   TODO: to consider https://en.wikipedia.org/wiki/RE/flex in the future.
-*/
 %{
 #include <absl/strings/escaping.h>
 #include <absl/strings/numbers.h>
-#include "base/logging.h"
-#include "core/search/query_driver.h"
-
-// Define main lexer function. QueryDriver is the shared state between scanner and parser
-#undef YY_DECL
-#define YY_DECL auto dfly::search::Scanner::ParserLex(QueryDriver& driver) -> Parser::symbol_type
+#include "base/logging.h"
+#define DFLY_LEXER_CC 1
+#include "core/search/scanner.h"
+#undef DFLY_LEXER_CC
 %}
 
-%option noyywrap nounput noinput batch debug
-%option yyclass="dfly::Scanner"
-%option c++
+%o bison-cc-namespace="dfly.search" bison-cc-parser="Parser"
+%o namespace="dfly.search"
+%o class="Scanner" lex="Lex"
+%o nodefault batch
+/* %o debug */
 
 /* Declarations before lexer implementation. */
 %{

@@ -32,35 +35,38 @@ dq \"
 esc_chars ['"\?\\abfnrtv]
 esc_seq   \\{esc_chars}
 str_char  ([^"]|{esc_seq})
+term_char [_]|\w
 
 
 %{
   // Code run each time a pattern is matched.
-  # define YY_USER_ACTION loc.columns (yyleng);
 %}
 
 %%
 
 %{
-  // A handy shortcut to the location held by the driver.
-  auto& loc = driver.location;
-  // Code run each time yylex is called.
-  loc.step ();
+  // Code run each time lex() is called.
 %}
 
-{blank}+             loc.step ();
-\n                   loc.lines (yyleng); loc.step ();
-
-"("                  return Parser::make_LPAREN (loc);
-")"                  return Parser::make_RPAREN (loc);
-
--?[0-9]+             return make_INT64(Matched(), loc);
-
-{dq}{str_char}*{dq}  return make_StringLit(string_view{YYText(), size_t(YYLeng())}, loc);
-
-[[:alnum:]]+         return Parser::make_TERM(Matched(), loc);
-
-<<EOF>>              return Parser::make_YYEOF(loc);
+[[:space:]]+         // skip white space
+
+"("                  return Parser::make_LPAREN (loc());
+")"                  return Parser::make_RPAREN (loc());
+"*"                  return Parser::make_STAR (loc());
+"~"                  return Parser::make_NOT_OP (loc());
+":"                  return Parser::make_COLON (loc());
+"=>"                 return Parser::make_ARROW (loc());
+
+-?[0-9]+             return make_INT64(matched_view(), loc());
+
+{dq}{str_char}*{dq}  return make_StringLit(matched_view(1, 1), loc());
+
+"$"{term_char}+      return Parser::make_PARAM(str(), loc());
+"@"{term_char}+      return Parser::make_FIELD(str(), loc());
+
+{term_char}+         return Parser::make_TERM(str(), loc());
+
+<<EOF>>              return Parser::make_YYEOF(loc());
 %%
 
 Parser::symbol_type

@@ -74,11 +80,6 @@ make_INT64 (string_view str, const Parser::location_type& loc)
 }
 
 Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& loc) {
-  DCHECK_GE(src.size(), 2u);
-
-  // Remove quotes
-  src.remove_prefix(1);
-  src.remove_suffix(1);
   string res;
   if (!absl::CUnescape(src, &res)) {
     throw Parser::syntax_error (loc, "bad escaped string: " + string(src));
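Quote handling moved from make_StringLit into the lexer rule itself: `matched_view(1, 1)` already trims the surrounding double quotes, so the DCHECK/remove_prefix/remove_suffix sequence is gone and only the unescaping step remains. A standalone sketch of that remaining step (the input literal is illustrative):

// Standalone sketch of the unescape step kept in make_StringLit: the lexer now
// hands over the text between the quotes, and absl::CUnescape resolves \" etc.
#include <iostream>
#include <string>

#include <absl/strings/escaping.h>
#include <absl/strings/string_view.h>

int main() {
  absl::string_view body = R"(hello\"world)";  // quotes already stripped by the rule
  std::string res;
  if (!absl::CUnescape(body, &res)) {
    std::cerr << "bad escaped string: " << body << std::endl;
    return 1;
  }
  std::cout << res << std::endl;  // hello"world
  return 0;
}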
@@ -13,6 +13,8 @@
 
 // Added to header file before parser declaration.
 %code requires {
+  #include "core/search/ast_expr.h"
+
   namespace dfly {
   namespace search {
   class QueryDriver;

@@ -24,11 +26,12 @@
 %code {
 #include "core/search/query_driver.h"
 
-#define yylex driver.scanner()->ParserLex
+#define yylex driver->scanner()->Lex
+
+using namespace std;
 }
 
-// Only for parser
-%param { QueryDriver& driver }
+%parse-param { QueryDriver *driver }
 
 %locations
 

@@ -40,36 +43,35 @@
 %token
   LPAREN "("
   RPAREN ")"
+  STAR "*"
+  ARROW "=>"
+  COLON ":"
+  NOT_OP "~"
 ;
 
 %token YYEOF
-%token <std::string> TERM "term"
+%token <std::string> TERM "term" PARAM "param" FIELD "field"
 
 %token <int64_t> INT64 "int64"
-%nterm <int> bool_expr
+%nterm <AstExpr> search_expr
 
 %printer { yyo << $$; } <*>;
 
 %%
-%start input;
 
-input:
-  %empty
-  | bool_expr
-  {
-    std::cout << $1 << std::endl;
-  }
+query:
+  search_expr
+  | query search_expr
   ;
 
-bool_expr: TERM {
-  std::cout << $1 << std::endl;
-} | TERM bool_expr {
-  std::cout << $1 << std::endl;
+search_expr: TERM {
+  cout << $1 << endl;
 }
 
 %%
 
 void
-dfly::search::Parser::error(const location_type& l, const std::string& m)
+dfly::search::Parser::error(const location_type& l, const string& m)
 {
-  std::cerr << l << ": " << m << '\n';
+  cerr << l << ": " << m << '\n';
 }
 
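With the grammar above, a query is now one or more search expressions (`query: search_expr | query search_expr`), while the new FIELD, PARAM, and operator tokens are declared but do not yet have grammar rules in this commit; `search_expr` still reduces only a single TERM. A hedged sketch of what should already parse, reusing the driver from earlier in the diff:

// Illustrative only: multi-term input reduces through the new query rule; the
// expected return value of 0 matches the Parse test added below.
#include <iostream>

#include "core/search/query_driver.h"

int main() {
  dfly::search::QueryDriver driver;
  driver.SetInput(" foo bar ");              // two TERM tokens
  int rc = dfly::search::Parser(&driver)();  // each TERM is echoed via cout
  std::cout << "rc=" << rc << std::endl;     // 0 expected on success
  return 0;
}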
@@ -5,8 +5,8 @@
 #pragma once
 
 #include <memory>
-#include <sstream>
 
+#include "core/search/ast_expr.h"
 #include "core/search/parser.hh"
 #include "core/search/scanner.h"
 

@@ -23,19 +23,22 @@ class QueryDriver {
     return scanner_.get();
   }
 
-  void SetInput(const std::string& str) {
-    istr_.str(str);
-    scanner()->switch_streams(&istr_);
+  void SetInput(std::string str) {
+    cur_str_ = std::move(str);
+    scanner()->in(cur_str_);
   }
 
   Parser::symbol_type Lex() {
-    return scanner()->ParserLex(*this);
+    return scanner()->Lex();
   }
 
   Parser::location_type location;
 
+  void Add(AstExpr) {
+  }
+
  private:
-  std::istringstream istr_;
+  std::string cur_str_;
   std::unique_ptr<Scanner> scanner_;
 };
 
@@ -4,27 +4,30 @@
 
 #pragma once
 
-#if !defined(yyFlexLexerOnce)
-#include <FlexLexer.h>
+// We should not include lexer.h when compiling from lexer.cc file because it already
+// includes lexer.h
+#ifndef DFLY_LEXER_CC
+#include "core/search/lexer.h"
 #endif
 
-#include "core/search/parser.hh"
-
 namespace dfly {
 namespace search {
 
-class QueryDriver;
-
-class Scanner : public yyFlexLexer {
+class Scanner : public Lexer {
  public:
   Scanner() {
   }
 
-  Parser::symbol_type ParserLex(QueryDriver& drv);
+  Parser::symbol_type Lex();
 
  private:
-  std::string Matched() const {
-    return std::string(YYText(), YYLeng());
+  std::string_view matched_view(size_t skip_left = 0, size_t skip_right = 0) const {
+    std::string_view res(matcher().begin() + skip_left, matcher().size() - skip_left - skip_right);
+    return res;
+  }
+
+  dfly::search::location loc() {
+    return location();
   }
 };
 
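matched_view() replaces the old Matched() helper: instead of copying YYText()/YYLeng() into a std::string, it returns a view over the reflex match buffer and can trim a prefix and suffix, which is how the string-literal rule drops its surrounding quotes. A standalone equivalent for illustration, using a plain buffer in place of matcher().begin():

// Same arithmetic as the member function above, just over an explicit buffer.
#include <cassert>
#include <cstddef>
#include <string_view>

std::string_view matched_view(const char* begin, std::size_t size,
                              std::size_t skip_left = 0, std::size_t skip_right = 0) {
  return std::string_view(begin + skip_left, size - skip_left - skip_right);
}

int main() {
  const char* match = "\"hello\"";  // what the lexer matched, quotes included
  assert(matched_view(match, 7) == "\"hello\"");
  assert(matched_view(match, 7, 1, 1) == "hello");  // quotes trimmed
  return 0;
}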
@@ -14,7 +14,7 @@ using namespace std;
 class SearchParserTest : public ::testing::Test {
  protected:
   SearchParserTest() {
-    // query_driver_.scanner()->set_debug(1);
+    query_driver_.scanner()->set_debug(1);
   }
 
   void SetInput(const std::string& str) {

@@ -25,6 +25,12 @@ class SearchParserTest : public ::testing::Test {
     return query_driver_.Lex();
   }
 
+  int Parse(const std::string& str) {
+    query_driver_.SetInput(str);
+
+    return Parser(&query_driver_)();
+  }
+
   QueryDriver query_driver_;
 };
 

@@ -32,7 +38,7 @@ class SearchParserTest : public ::testing::Test {
 #define NEXT_EQ(tok_enum, type, val) \
   { \
     auto tok = Lex(); \
-    EXPECT_EQ(tok.type_get(), Parser::token::tok_enum); \
+    ASSERT_EQ(tok.type_get(), Parser::token::tok_enum); \
     EXPECT_EQ(val, tok.value.as<type>()); \
   }
 

@@ -60,6 +66,20 @@ TEST_F(SearchParserTest, Scanner) {
 
   SetInput(R"( "hello\"world" )");
   NEXT_EQ(TOK_TERM, string, R"(hello"world)");
+
+  SetInput(" $param @field:hello");
+  NEXT_EQ(TOK_PARAM, string, "$param");
+  NEXT_EQ(TOK_FIELD, string, "@field");
+  NEXT_TOK(TOK_COLON);
+  NEXT_EQ(TOK_TERM, string, "hello");
+
+  SetInput("почтальон Печкин");
+  NEXT_EQ(TOK_TERM, string, "почтальон");
+  NEXT_EQ(TOK_TERM, string, "Печкин");
+}
+
+TEST_F(SearchParserTest, Parse) {
+  EXPECT_EQ(0, Parse(" foo "));
 }
 
 } // namespace search