From ce5db032fce2da2155b4203639a5058e9d6960fb Mon Sep 17 00:00:00 2001 From: Roman Gershman Date: Tue, 25 Apr 2023 14:57:24 +0200 Subject: [PATCH] chore: support more token types in the lexer (#1134) 1. Support integers 2. Support string literals 3. Add more test coverage. Signed-off-by: Roman Gershman --- src/core/search/CMakeLists.txt | 2 +- src/core/search/lexer.lex | 49 +++++++++++++++++++++------ src/core/search/parser.y | 2 +- src/core/search/query_driver.cc | 2 +- src/core/search/query_driver.h | 13 ++++++- src/core/search/scanner.h | 7 ++-- src/core/search/search_parser_test.cc | 44 ++++++++++++++++++------ 7 files changed, 91 insertions(+), 28 deletions(-) diff --git a/src/core/search/CMakeLists.txt b/src/core/search/CMakeLists.txt index 1b0b17640..a2de29810 100644 --- a/src/core/search/CMakeLists.txt +++ b/src/core/search/CMakeLists.txt @@ -6,5 +6,5 @@ gen_bison(parser) cur_gen_dir(gen_dir) add_library(query_parser query_driver.cc ${gen_dir}/parser.cc ${gen_dir}/lexer.cc) -target_link_libraries(query_parser glog) +target_link_libraries(query_parser base absl::strings) cxx_test(search_parser_test query_parser) diff --git a/src/core/search/lexer.lex b/src/core/search/lexer.lex index 4d8e88be6..03f9158f0 100644 --- a/src/core/search/lexer.lex +++ b/src/core/search/lexer.lex @@ -1,5 +1,11 @@ + +/* Seems that flex does not have unicode support. + TODO: to consider https://en.wikipedia.org/wiki/RE/flex in the future. +*/ %{ - #include + #include + #include + #include "base/logging.h" #include "core/search/query_driver.h" // Define main lexer function. QueryDriver is the shared state between scanner and parser @@ -15,12 +21,17 @@ %{ // A number symbol corresponding to the value in S. using dfly::search::Parser; + using namespace std; - Parser::symbol_type make_NUMBER (const std::string &s, const Parser::location_type& loc); + Parser::symbol_type make_INT64 (string_view, const Parser::location_type& loc); + Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& loc); %} -int [0-9]+ blank [ \t\r] +dq \" +esc_chars ['"\?\\abfnrtv] +esc_seq \\{esc_chars} +str_char ([^"]|{esc_seq}) %{ // Code run each time a pattern is matched. @@ -43,18 +54,34 @@ blank [ \t\r] "(" return Parser::make_LPAREN (loc); ")" return Parser::make_RPAREN (loc); -{int} return make_NUMBER (yytext, loc); -[^ \t\r]+ return Parser::make_TERM (yytext, loc); +-?[0-9]+ return make_INT64(Matched(), loc); + +{dq}{str_char}*{dq} return make_StringLit(string_view{YYText(), size_t(YYLeng())}, loc); + +[[:alnum:]]+ return Parser::make_TERM(Matched(), loc); <> return Parser::make_YYEOF(loc); %% Parser::symbol_type -make_NUMBER (const std::string &s, const Parser::location_type& loc) +make_INT64 (string_view str, const Parser::location_type& loc) { - errno = 0; - long n = strtol (s.c_str(), NULL, 10); - if (! (INT_MIN <= n && n <= INT_MAX && errno != ERANGE)) - throw Parser::syntax_error (loc, "integer is out of range: " + s); - return Parser::make_NUMBER ((int) n, loc); + int64_t val = 0; + if (!absl::SimpleAtoi(str, &val)) + throw Parser::syntax_error (loc, "not an integer or out of range: " + string(str)); + + return Parser::make_INT64(val, loc); +} + +Parser::symbol_type make_StringLit(string_view src, const Parser::location_type& loc) { + DCHECK_GE(src.size(), 2u); + + // Remove quotes + src.remove_prefix(1); + src.remove_suffix(1); + string res; + if (!absl::CUnescape(src, &res)) { + throw Parser::syntax_error (loc, "bad escaped string: " + string(src)); + } + return Parser::make_TERM(res, loc); } diff --git a/src/core/search/parser.y b/src/core/search/parser.y index 8f3e04908..63a816ff8 100644 --- a/src/core/search/parser.y +++ b/src/core/search/parser.y @@ -44,7 +44,7 @@ %token YYEOF %token TERM "term" -%token NUMBER "number" +%token INT64 "int64" %nterm bool_expr %printer { yyo << $$; } <*>; diff --git a/src/core/search/query_driver.cc b/src/core/search/query_driver.cc index ea080c9b6..da95697a9 100644 --- a/src/core/search/query_driver.cc +++ b/src/core/search/query_driver.cc @@ -1,4 +1,4 @@ -// Copyright 2023, Roman Gershman. All rights reserved. +// Copyright 2023, DragonflyDB authors. All rights reserved. // See LICENSE for licensing terms. // diff --git a/src/core/search/query_driver.h b/src/core/search/query_driver.h index ff3f7ee83..f07f55f2c 100644 --- a/src/core/search/query_driver.h +++ b/src/core/search/query_driver.h @@ -1,10 +1,11 @@ -// Copyright 2023, Roman Gershman. All rights reserved. +// Copyright 2023, DragonflyDB authors. All rights reserved. // See LICENSE for licensing terms. // #pragma once #include +#include #include "core/search/parser.hh" #include "core/search/scanner.h" @@ -22,9 +23,19 @@ class QueryDriver { return scanner_.get(); } + void SetInput(const std::string& str) { + istr_.str(str); + scanner()->switch_streams(&istr_); + } + + Parser::symbol_type Lex() { + return scanner()->ParserLex(*this); + } + Parser::location_type location; private: + std::istringstream istr_; std::unique_ptr scanner_; }; diff --git a/src/core/search/scanner.h b/src/core/search/scanner.h index f6cbfc723..7a9e2a2f8 100644 --- a/src/core/search/scanner.h +++ b/src/core/search/scanner.h @@ -1,4 +1,4 @@ -// Copyright 2023, Roman Gershman. All rights reserved. +// Copyright 2023, DragonflyDB authors. All rights reserved. // See LICENSE for licensing terms. // @@ -22,8 +22,9 @@ class Scanner : public yyFlexLexer { Parser::symbol_type ParserLex(QueryDriver& drv); - std::string matched() { - return yytext; + private: + std::string Matched() const { + return std::string(YYText(), YYLeng()); } }; diff --git a/src/core/search/search_parser_test.cc b/src/core/search/search_parser_test.cc index 828154a77..c386ab0ad 100644 --- a/src/core/search/search_parser_test.cc +++ b/src/core/search/search_parser_test.cc @@ -1,7 +1,9 @@ -// Copyright 2023, Roman Gershman. All rights reserved. +// Copyright 2023, DragonflyDB authors. All rights reserved. // See LICENSE for licensing terms. // + #include "base/gtest.h" +#include "base/logging.h" #include "core/search/query_driver.h" namespace dfly { @@ -12,30 +14,52 @@ using namespace std; class SearchParserTest : public ::testing::Test { protected: SearchParserTest() { + // query_driver_.scanner()->set_debug(1); } void SetInput(const std::string& str) { - istr_.str(str); - query_driver_.scanner()->switch_streams(&istr_); + query_driver_.SetInput(str); } Parser::symbol_type Lex() { - return query_driver_.scanner()->ParserLex(query_driver_); + return query_driver_.Lex(); } QueryDriver query_driver_; - - std::istringstream istr_; }; +// tokens are not assignable, so we can not reuse them. This macros reduce the boilerplate. +#define NEXT_EQ(tok_enum, type, val) \ + { \ + auto tok = Lex(); \ + EXPECT_EQ(tok.type_get(), Parser::token::tok_enum); \ + EXPECT_EQ(val, tok.value.as()); \ + } + +#define NEXT_TOK(tok_enum) \ + { \ + auto tok = Lex(); \ + ASSERT_EQ(tok.type_get(), Parser::token::tok_enum); \ + } + TEST_F(SearchParserTest, Scanner) { SetInput("ab cd"); - Parser::symbol_type tok = Lex(); - // 3.5.1 does not have name() method. // EXPECT_STREQ("term", tok.name()); - EXPECT_EQ(tok.type_get(), Parser::token::TOK_TERM); - EXPECT_EQ("ab", tok.value.as()); + + NEXT_EQ(TOK_TERM, string, "ab"); + NEXT_EQ(TOK_TERM, string, "cd"); + NEXT_TOK(TOK_YYEOF); + + SetInput("(5a 6) "); + + NEXT_TOK(TOK_LPAREN); + NEXT_EQ(TOK_TERM, string, "5a"); + NEXT_EQ(TOK_INT64, int64_t, 6); + NEXT_TOK(TOK_RPAREN); + + SetInput(R"( "hello\"world" )"); + NEXT_EQ(TOK_TERM, string, R"(hello"world)"); } } // namespace search