mirror of
https://github.com/dragonflydb/dragonfly.git
synced 2025-05-11 02:15:45 +02:00
chore: support more token types in the lexer (#1134)
1. Support integers 2. Support string literals 3. Add more test coverage. Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
parent
8749d736dd
commit
ce5db032fc
7 changed files with 91 additions and 28 deletions
|
@ -6,5 +6,5 @@ gen_bison(parser)
|
||||||
cur_gen_dir(gen_dir)
|
cur_gen_dir(gen_dir)
|
||||||
|
|
||||||
add_library(query_parser query_driver.cc ${gen_dir}/parser.cc ${gen_dir}/lexer.cc)
|
add_library(query_parser query_driver.cc ${gen_dir}/parser.cc ${gen_dir}/lexer.cc)
|
||||||
target_link_libraries(query_parser glog)
|
target_link_libraries(query_parser base absl::strings)
|
||||||
cxx_test(search_parser_test query_parser)
|
cxx_test(search_parser_test query_parser)
|
||||||
|
|
|
@ -1,5 +1,11 @@
|
||||||
|
|
||||||
|
/* It seems that flex does not have Unicode support.
|
||||||
|
TODO: consider https://en.wikipedia.org/wiki/RE/flex in the future.
|
||||||
|
*/
|
||||||
%{
|
%{
|
||||||
#include <climits>
|
#include <absl/strings/escaping.h>
|
||||||
|
#include <absl/strings/numbers.h>
|
||||||
|
#include "base/logging.h"
|
||||||
#include "core/search/query_driver.h"
|
#include "core/search/query_driver.h"
|
||||||
|
|
||||||
// Define main lexer function. QueryDriver is the shared state between scanner and parser
|
// Define main lexer function. QueryDriver is the shared state between scanner and parser
|
||||||
|
@ -15,12 +21,17 @@
|
||||||
%{
|
%{
|
||||||
// A number symbol corresponding to the value in S.
|
// A number symbol corresponding to the value in S.
|
||||||
using dfly::search::Parser;
|
using dfly::search::Parser;
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
Parser::symbol_type make_NUMBER (const std::string &s, const Parser::location_type& loc);
|
Parser::symbol_type make_INT64 (string_view, const Parser::location_type& loc);
|
||||||
|
Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& loc);
|
||||||
%}
|
%}
|
||||||
|
|
||||||
int [0-9]+
|
|
||||||
blank [ \t\r]
|
blank [ \t\r]
|
||||||
|
dq \"
|
||||||
|
esc_chars ['"\?\\abfnrtv]
|
||||||
|
esc_seq \\{esc_chars}
|
||||||
|
str_char ([^"]|{esc_seq})
|
||||||
|
|
||||||
%{
|
%{
|
||||||
// Code run each time a pattern is matched.
|
// Code run each time a pattern is matched.
|
||||||
|
@ -43,18 +54,34 @@ blank [ \t\r]
|
||||||
"(" return Parser::make_LPAREN (loc);
|
"(" return Parser::make_LPAREN (loc);
|
||||||
")" return Parser::make_RPAREN (loc);
|
")" return Parser::make_RPAREN (loc);
|
||||||
|
|
||||||
{int} return make_NUMBER (yytext, loc);
|
-?[0-9]+ return make_INT64(Matched(), loc);
|
||||||
[^ \t\r]+ return Parser::make_TERM (yytext, loc);
|
|
||||||
|
{dq}{str_char}*{dq} return make_StringLit(string_view{YYText(), size_t(YYLeng())}, loc);
|
||||||
|
|
||||||
|
[[:alnum:]]+ return Parser::make_TERM(Matched(), loc);
|
||||||
|
|
||||||
<<EOF>> return Parser::make_YYEOF(loc);
|
<<EOF>> return Parser::make_YYEOF(loc);
|
||||||
%%
|
%%
|
||||||
|
|
||||||
Parser::symbol_type
|
Parser::symbol_type
|
||||||
make_NUMBER (const std::string &s, const Parser::location_type& loc)
|
make_INT64 (string_view str, const Parser::location_type& loc)
|
||||||
{
|
{
|
||||||
errno = 0;
|
int64_t val = 0;
|
||||||
long n = strtol (s.c_str(), NULL, 10);
|
if (!absl::SimpleAtoi(str, &val))
|
||||||
if (! (INT_MIN <= n && n <= INT_MAX && errno != ERANGE))
|
throw Parser::syntax_error (loc, "not an integer or out of range: " + string(str));
|
||||||
throw Parser::syntax_error (loc, "integer is out of range: " + s);
|
|
||||||
return Parser::make_NUMBER ((int) n, loc);
|
return Parser::make_INT64(val, loc);
|
||||||
|
}
|
||||||
|
|
||||||
|
Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& loc) {
|
||||||
|
DCHECK_GE(src.size(), 2u);
|
||||||
|
|
||||||
|
// Remove quotes
|
||||||
|
src.remove_prefix(1);
|
||||||
|
src.remove_suffix(1);
|
||||||
|
string res;
|
||||||
|
if (!absl::CUnescape(src, &res)) {
|
||||||
|
throw Parser::syntax_error (loc, "bad escaped string: " + string(src));
|
||||||
|
}
|
||||||
|
return Parser::make_TERM(res, loc);
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,7 +44,7 @@
|
||||||
|
|
||||||
%token YYEOF
|
%token YYEOF
|
||||||
%token <std::string> TERM "term"
|
%token <std::string> TERM "term"
|
||||||
%token <int> NUMBER "number"
|
%token <int64_t> INT64 "int64"
|
||||||
%nterm <int> bool_expr
|
%nterm <int> bool_expr
|
||||||
|
|
||||||
%printer { yyo << $$; } <*>;
|
%printer { yyo << $$; } <*>;
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
// Copyright 2023, Roman Gershman. All rights reserved.
|
// Copyright 2023, DragonflyDB authors. All rights reserved.
|
||||||
// See LICENSE for licensing terms.
|
// See LICENSE for licensing terms.
|
||||||
//
|
//
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
// Copyright 2023, Roman Gershman. All rights reserved.
|
// Copyright 2023, DragonflyDB authors. All rights reserved.
|
||||||
// See LICENSE for licensing terms.
|
// See LICENSE for licensing terms.
|
||||||
//
|
//
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
#include "core/search/parser.hh"
|
#include "core/search/parser.hh"
|
||||||
#include "core/search/scanner.h"
|
#include "core/search/scanner.h"
|
||||||
|
@ -22,9 +23,19 @@ class QueryDriver {
|
||||||
return scanner_.get();
|
return scanner_.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void SetInput(const std::string& str) {
|
||||||
|
istr_.str(str);
|
||||||
|
scanner()->switch_streams(&istr_);
|
||||||
|
}
|
||||||
|
|
||||||
|
Parser::symbol_type Lex() {
|
||||||
|
return scanner()->ParserLex(*this);
|
||||||
|
}
|
||||||
|
|
||||||
Parser::location_type location;
|
Parser::location_type location;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
std::istringstream istr_;
|
||||||
std::unique_ptr<Scanner> scanner_;
|
std::unique_ptr<Scanner> scanner_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
// Copyright 2023, Roman Gershman. All rights reserved.
|
// Copyright 2023, DragonflyDB authors. All rights reserved.
|
||||||
// See LICENSE for licensing terms.
|
// See LICENSE for licensing terms.
|
||||||
//
|
//
|
||||||
|
|
||||||
|
@ -22,8 +22,9 @@ class Scanner : public yyFlexLexer {
|
||||||
|
|
||||||
Parser::symbol_type ParserLex(QueryDriver& drv);
|
Parser::symbol_type ParserLex(QueryDriver& drv);
|
||||||
|
|
||||||
std::string matched() {
|
private:
|
||||||
return yytext;
|
std::string Matched() const {
|
||||||
|
return std::string(YYText(), YYLeng());
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
// Copyright 2023, Roman Gershman. All rights reserved.
|
// Copyright 2023, DragonflyDB authors. All rights reserved.
|
||||||
// See LICENSE for licensing terms.
|
// See LICENSE for licensing terms.
|
||||||
//
|
//
|
||||||
|
|
||||||
#include "base/gtest.h"
|
#include "base/gtest.h"
|
||||||
|
#include "base/logging.h"
|
||||||
#include "core/search/query_driver.h"
|
#include "core/search/query_driver.h"
|
||||||
|
|
||||||
namespace dfly {
|
namespace dfly {
|
||||||
|
@ -12,30 +14,52 @@ using namespace std;
|
||||||
class SearchParserTest : public ::testing::Test {
|
class SearchParserTest : public ::testing::Test {
|
||||||
protected:
|
protected:
|
||||||
SearchParserTest() {
|
SearchParserTest() {
|
||||||
|
// query_driver_.scanner()->set_debug(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void SetInput(const std::string& str) {
|
void SetInput(const std::string& str) {
|
||||||
istr_.str(str);
|
query_driver_.SetInput(str);
|
||||||
query_driver_.scanner()->switch_streams(&istr_);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Parser::symbol_type Lex() {
|
Parser::symbol_type Lex() {
|
||||||
return query_driver_.scanner()->ParserLex(query_driver_);
|
return query_driver_.Lex();
|
||||||
}
|
}
|
||||||
|
|
||||||
QueryDriver query_driver_;
|
QueryDriver query_driver_;
|
||||||
|
|
||||||
std::istringstream istr_;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Tokens are not assignable, so we cannot reuse them. These macros reduce the boilerplate.
|
||||||
|
#define NEXT_EQ(tok_enum, type, val) \
|
||||||
|
{ \
|
||||||
|
auto tok = Lex(); \
|
||||||
|
EXPECT_EQ(tok.type_get(), Parser::token::tok_enum); \
|
||||||
|
EXPECT_EQ(val, tok.value.as<type>()); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define NEXT_TOK(tok_enum) \
|
||||||
|
{ \
|
||||||
|
auto tok = Lex(); \
|
||||||
|
ASSERT_EQ(tok.type_get(), Parser::token::tok_enum); \
|
||||||
|
}
|
||||||
|
|
||||||
TEST_F(SearchParserTest, Scanner) {
|
TEST_F(SearchParserTest, Scanner) {
|
||||||
SetInput("ab cd");
|
SetInput("ab cd");
|
||||||
Parser::symbol_type tok = Lex();
|
|
||||||
|
|
||||||
// 3.5.1 does not have name() method.
|
// 3.5.1 does not have name() method.
|
||||||
// EXPECT_STREQ("term", tok.name());
|
// EXPECT_STREQ("term", tok.name());
|
||||||
EXPECT_EQ(tok.type_get(), Parser::token::TOK_TERM);
|
|
||||||
EXPECT_EQ("ab", tok.value.as<string>());
|
NEXT_EQ(TOK_TERM, string, "ab");
|
||||||
|
NEXT_EQ(TOK_TERM, string, "cd");
|
||||||
|
NEXT_TOK(TOK_YYEOF);
|
||||||
|
|
||||||
|
SetInput("(5a 6) ");
|
||||||
|
|
||||||
|
NEXT_TOK(TOK_LPAREN);
|
||||||
|
NEXT_EQ(TOK_TERM, string, "5a");
|
||||||
|
NEXT_EQ(TOK_INT64, int64_t, 6);
|
||||||
|
NEXT_TOK(TOK_RPAREN);
|
||||||
|
|
||||||
|
SetInput(R"( "hello\"world" )");
|
||||||
|
NEXT_EQ(TOK_TERM, string, R"(hello"world)");
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace search
|
} // namespace search
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue