chore: GlobMatcher now uses the reflex::Matcher regex engine (#4528)

Also consolidate low-level benchmarking routines under dfly_core_test

```
BM_ParseFastFloat                   707 ns          707 ns      4005656
BM_ParseDoubleAbsl                 1460 ns         1460 ns      1927158
BM_MatchGlob/1000                   121 ns          121 ns     23701780
BM_MatchGlob/10000                  512 ns          512 ns      5481405
BM_MatchFindSubstr/1000             123 ns          123 ns     31114255
BM_MatchFindSubstr/10000           1126 ns         1126 ns      2522019
BM_MatchReflexFind/1000             118 ns          118 ns     22442417
BM_MatchReflexFind/10000            512 ns          512 ns      5414329
BM_MatchReflexFindStar/1000         106 ns          106 ns     26276727
BM_MatchReflexFindStar/10000        717 ns          717 ns      3719605
BM_MatchStd/1000                  19782 ns        19779 ns       128020
BM_MatchStd/10000                199809 ns       199781 ns        13837
BM_MatchRedisGlob/1000             1601 ns         1601 ns      1754635
BM_MatchRedisGlob/10000           16494 ns        16493 ns       171585
BM_MatchRe2/1000                   1039 ns         1039 ns      2709486
BM_MatchRe2/10000                 10041 ns        10040 ns       281296
```

What's curious is that matching `*foobar*` on a string is now faster than
searching for `foobar` using string::find() (BM_MatchGlob vs BM_MatchFindSubstr).

GlobMatcher is 10-30 times faster than the Redis implementation (BM_MatchRedisGlob vs BM_MatchGlob).

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
Roman Gershman 2025-02-05 10:29:51 +02:00 committed by GitHub
parent a40b5063e2
commit 6d1c22b64c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 364 additions and 222 deletions

View file

@ -1,73 +0,0 @@
# This workflow uses actions that are not certified by GitHub. They are provided
# by a third-party and are governed by separate terms of service, privacy
# policy, and support documentation.
name: Scorecard supply-chain security
on:
# For Branch-Protection check. Only the default branch is supported. See
# https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
branch_protection_rule:
# To guarantee Maintained check is occasionally updated. See
# https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
schedule:
- cron: '43 4 * * 1'
push:
branches: [ "main" ]
# Declare default permissions as read only.
permissions: read-all
jobs:
analysis:
name: Scorecard analysis
runs-on: ubuntu-latest
permissions:
# Needed to upload the results to code-scanning dashboard.
security-events: write
# Needed to publish results and get a badge (see publish_results below).
id-token: write
# Uncomment the permissions below if installing in a private repository.
# contents: read
# actions: read
steps:
- name: "Checkout code"
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
with:
persist-credentials: false
- name: "Run analysis"
uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0
with:
results_file: results.sarif
results_format: sarif
# (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
# - you want to enable the Branch-Protection check on a *public* repository, or
# - you are installing Scorecard on a *private* repository
# To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional.
# repo_token: ${{ secrets.SCORECARD_TOKEN }}
# Public repositories:
# - Publish results to OpenSSF REST API for easy access by consumers
# - Allows the repository to include the Scorecard badge.
# - See https://github.com/ossf/scorecard-action#publishing-results.
# For private repositories:
# - `publish_results` will always be set to `false`, regardless
# of the value entered here.
publish_results: true
# Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
# format to the repository Actions tab.
- name: "Upload artifact"
uses: actions/upload-artifact@97a0fba1372883ab732affbe8f94b823f91727db # v3.pre.node20
with:
name: SARIF file
path: results.sarif
retention-days: 5
# Upload the results to GitHub's code scanning dashboard (optional).
# Commenting out will disable upload of results to your repo's Code Scanning dashboard
- name: "Upload to code-scanning"
uses: github/codeql-action/upload-sarif@f6091c0113d1dcf9b98e269ee48e8a7e51b7bdd4 # v3.28.5
with:
sarif_file: results.sarif

View file

@ -63,7 +63,7 @@ add_third_party(
add_third_party(
reflex
URL https://github.com/Genivia/RE-flex/archive/refs/tags/v5.1.0.tar.gz
URL https://github.com/Genivia/RE-flex/archive/refs/tags/v5.2.2.tar.gz
PATCH_COMMAND autoreconf -fi
CONFIGURE_COMMAND <SOURCE_DIR>/configure --disable-avx2 --prefix=${THIRD_PARTY_LIB_DIR}/reflex
CXX=${CMAKE_CXX_COMPILER} CC=${CMAKE_C_COMPILER}
@ -125,7 +125,6 @@ add_third_party(
-DFLATBUFFERS_BUILD_FLATC=OFF"
)
add_library(TRDP::jsoncons INTERFACE IMPORTED)
add_dependencies(TRDP::jsoncons jsoncons_project)
set_target_properties(TRDP::jsoncons PROPERTIES

View file

@ -15,7 +15,25 @@ cxx_link(dfly_core base absl::flat_hash_map absl::str_format redis_lib TRDP::lua
add_executable(dash_bench dash_bench.cc)
cxx_link(dash_bench dfly_core redis_test_lib)
cxx_test(dfly_core_test dfly_core TRDP::fast_float LABELS DFLY)
# Probe for optional regex libraries. PCRE2_LIB / RE2_LIB hold the library path
# when it is found and stay empty otherwise, so they can be handed to
# cxx_test() unconditionally.
set(PCRE2_LIB "")
find_library(LIB_PCRE2 NAMES pcre2-8)
if(LIB_PCRE2)
  set(PCRE2_LIB ${LIB_PCRE2})
else()
  message(STATUS "pcre2-8 not found. Building without PCRE2 support.")
endif()

set(RE2_LIB "")
find_library(LIB_RE2 NAMES re2)
if(LIB_RE2)
  set(RE2_LIB ${LIB_RE2})
else()
  message(STATUS "re2 not found. Building without RE2 support.")
endif()
cxx_test(dfly_core_test dfly_core TRDP::fast_float ${PCRE2_LIB} ${RE2_LIB} LABELS DFLY)
cxx_test(compact_object_test dfly_core LABELS DFLY)
cxx_test(extent_tree_test dfly_core LABELS DFLY)
cxx_test(dash_test dfly_core file redis_test_lib DATA testdata/ids.txt.zst LABELS DFLY)
@ -30,3 +48,11 @@ cxx_test(flatbuffers_test dfly_core TRDP::flatbuffers LABELS DFLY)
cxx_test(bloom_test dfly_core LABELS DFLY)
cxx_test(allocation_tracker_test dfly_core absl::random_random LABELS DFLY)
cxx_test(qlist_test dfly_core DATA testdata/list.txt.zst LABELS DFLY)
# Define USE_PCRE2 / USE_RE2 for dfly_core_test when the matching library was
# located by find_library (LIB_PCRE2 / LIB_RE2).
foreach(regex_engine PCRE2 RE2)
  if(LIB_${regex_engine})
    target_compile_definitions(dfly_core_test PRIVATE USE_${regex_engine})
  endif()
endforeach()

View file

@ -5,9 +5,20 @@
#include <absl/strings/charconv.h>
#include <absl/strings/numbers.h>
#include <fast_float/fast_float.h>
#ifdef USE_PCRE2
#define PCRE2_CODE_UNIT_WIDTH 8
#include <pcre2.h>
#endif
#ifdef USE_RE2
#include <re2/re2.h>
#endif
#include <reflex/matcher.h>
#include <random>
#include <regex>
#include "base/gtest.h"
#include "base/logging.h"
@ -41,6 +52,124 @@ static string GetRandomHex(size_t len) {
return res;
}
/* Glob-style pattern matching taken from Redis.
 *
 * Matches string[0, stringLen) against pattern[0, patternLen) and returns 1 on
 * a match, 0 otherwise. Supported syntax: '*', '?', '[...]' classes (with '^'
 * negation, 'a-z' ranges and '\' escapes) and '\' to escape a single character.
 * When nocase is non-zero, characters are compared through tolower().
 *
 * Neither buffer has to be NUL-terminated: every look-ahead is guarded by the
 * remaining pattern length. */
static int stringmatchlen(const char* pattern, int patternLen, const char* string, int stringLen,
                          int nocase) {
  while (patternLen && stringLen) {
    switch (pattern[0]) {
      case '*':
        /* Collapse a run of stars; only peek at pattern[1] while it is still
         * part of the pattern. */
        while (patternLen > 1 && pattern[1] == '*') {
          pattern++;
          patternLen--;
        }
        if (patternLen == 1)
          return 1; /* match */
        /* Try to match the rest of the pattern at every remaining position. */
        while (stringLen) {
          if (stringmatchlen(pattern + 1, patternLen - 1, string, stringLen, nocase))
            return 1; /* match */
          string++;
          stringLen--;
        }
        return 0; /* no match */
      case '?':
        string++;
        stringLen--;
        break;
      case '[': {
        int neg, match;

        pattern++;
        patternLen--;
        neg = patternLen && pattern[0] == '^';
        if (neg) {
          pattern++;
          patternLen--;
        }
        match = 0;
        while (1) {
          if (patternLen == 0) {
            /* Unterminated class: step back onto the last pattern character so
             * that the common "advance pattern" code below ends exactly at the
             * end of the pattern. Checked first to avoid reading past it. */
            pattern--;
            patternLen++;
            break;
          } else if (pattern[0] == '\\' && patternLen >= 2) {
            pattern++;
            patternLen--;
            if (pattern[0] == string[0])
              match = 1;
          } else if (pattern[0] == ']') {
            break;
          } else if (patternLen >= 3 && pattern[1] == '-') {
            int start = pattern[0];
            int end = pattern[2];
            int c = string[0];
            if (start > end) { /* reversed range, e.g. [z-a] */
              int t = start;
              start = end;
              end = t;
            }
            if (nocase) {
              start = tolower(start);
              end = tolower(end);
              c = tolower(c);
            }
            pattern += 2;
            patternLen -= 2;
            if (c >= start && c <= end)
              match = 1;
          } else {
            if (!nocase) {
              if (pattern[0] == string[0])
                match = 1;
            } else {
              if (tolower((int)pattern[0]) == tolower((int)string[0]))
                match = 1;
            }
          }
          pattern++;
          patternLen--;
        }
        if (neg)
          match = !match;
        if (!match)
          return 0; /* no match */
        string++;
        stringLen--;
        break;
      }
      case '\\':
        /* A trailing backslash stands for itself. */
        if (patternLen >= 2) {
          pattern++;
          patternLen--;
        }
        /* fall through */
      default:
        if (!nocase) {
          if (pattern[0] != string[0])
            return 0; /* no match */
        } else {
          if (tolower((int)pattern[0]) != tolower((int)string[0]))
            return 0; /* no match */
        }
        string++;
        stringLen--;
        break;
    }
    pattern++;
    patternLen--;
    if (stringLen == 0) {
      /* The string is exhausted: trailing stars still match the empty rest. */
      while (patternLen && *pattern == '*') {
        pattern++;
        patternLen--;
      }
      break;
    }
  }
  if (patternLen == 0 && stringLen == 0)
    return 1;
  return 0;
}
class TxQueueTest : public ::testing::Test {
protected:
TxQueueTest() {
@ -107,6 +236,19 @@ class StringMatchTest : public ::testing::Test {
}
};
// Checks the glob -> regex translation done by GlobMatcher::Glob2Regex:
// wildcards, escapes, character classes and regex metacharacters.
TEST_F(StringMatchTest, Glob2Regex) {
  struct Case {
    const char* glob;
    const char* regex;
  };

  const Case kCases[] = {
      {"", ""},
      {"*", ".*"},
      {"\\?", "\\?"},
      {"[abc]", "[abc]"},
      {"[^abc]", "[^abc]"},
      {"h\\[^|", "h\\[\\^\\|"},  // metacharacters outside a class are escaped
      {"[$?^]a", "[$?^]a"},      // class contents are copied verbatim
      {"[^]a", ".a"},            // empty negated class becomes "any character"
      {"[]a", "[]a"},
      {"\\d", "d"},  // backslash before a non-punctuation char is dropped
  };

  for (const Case& tc : kCases) {
    EXPECT_EQ(GlobMatcher::Glob2Regex(tc.glob), tc.regex);
  }
}
TEST_F(StringMatchTest, Basic) {
EXPECT_EQ(MatchLen("", "", 0), 1);
@ -114,6 +256,7 @@ TEST_F(StringMatchTest, Basic) {
EXPECT_EQ(MatchLen("*", "", 1), 0);
EXPECT_EQ(MatchLen("\\\\", "\\", 0), 1);
EXPECT_EQ(MatchLen("h\\\\llo", "h\\llo", 0), 1);
EXPECT_EQ(MatchLen("a\\bc", "ABC", 1), 1);
// ExactMatch
EXPECT_EQ(MatchLen("hello", "hello", 0), 1);
@ -134,6 +277,7 @@ TEST_F(StringMatchTest, Basic) {
EXPECT_EQ(MatchLen("h[a-z]llo", "hello", 0), 1);
EXPECT_EQ(MatchLen("h[A-Z]llo", "HeLLO", 1), 1);
EXPECT_EQ(MatchLen("[[]", "[", 0), 1);
EXPECT_EQ(MatchLen("[^]a", "xa", 0), 1);
// ?
EXPECT_EQ(MatchLen("h?llo", "hello", 0), 1);
@ -141,8 +285,10 @@ TEST_F(StringMatchTest, Basic) {
EXPECT_EQ(MatchLen("h??llo", "hallo", 0), 0);
EXPECT_EQ(MatchLen("h\\?llo", "hallo", 0), 0);
EXPECT_EQ(MatchLen("h\\?llo", "h?llo", 0), 1);
EXPECT_EQ(MatchLen("abc?", "abc\n", 0), 1);
}
// special regex chars
TEST_F(StringMatchTest, Special) {
EXPECT_EQ(MatchLen("h\\[^|", "h[^|", 0), 1);
EXPECT_EQ(MatchLen("[^", "[^", 0), 0);
EXPECT_EQ(MatchLen("[$?^]a", "?a", 0), 1);
@ -222,4 +368,63 @@ static void BM_MatchReflexFindStar(benchmark::State& state) {
}
BENCHMARK(BM_MatchReflexFindStar)->Arg(1000)->Arg(10000);
// Baseline benchmark: std::regex full match of ".*foobar" against a string of
// state.range(0) characters produced by GetRandomHex.
static void BM_MatchStd(benchmark::State& state) {
  const string subject = GetRandomHex(state.range(0));
  const std::regex re(".*foobar");
  std::match_results<std::string::const_iterator> match;

  while (state.KeepRunning()) {
    std::regex_match(subject, match, re);
  }
}
BENCHMARK(BM_MatchStd)->Arg(1000)->Arg(10000);
// Baseline benchmark: the Redis glob matcher (stringmatchlen) with the
// "*foobar*" pattern, case sensitive, on a GetRandomHex string of
// state.range(0) characters.
static void BM_MatchRedisGlob(benchmark::State& state) {
  const string subject = GetRandomHex(state.range(0));
  const char* glob = "*foobar*";

  while (state.KeepRunning()) {
    int matched = stringmatchlen(glob, strlen(glob), subject.c_str(), subject.size(), 0);
    DoNotOptimize(matched);
  }
}
BENCHMARK(BM_MatchRedisGlob)->Arg(1000)->Arg(10000);
#ifdef USE_RE2
// Baseline benchmark (only built when RE2 is available): RE2 full match of
// ".*foobar.*" in Latin-1 mode on a GetRandomHex string of state.range(0)
// characters.
static void BM_MatchRe2(benchmark::State& state) {
  const string subject = GetRandomHex(state.range(0));
  re2::RE2 compiled(".*foobar.*", re2::RE2::Latin1);
  CHECK(compiled.ok());

  while (state.KeepRunning()) {
    bool matched = re2::RE2::FullMatch(subject, compiled);
    DoNotOptimize(matched);
  }
}
BENCHMARK(BM_MatchRe2)->Arg(1000)->Arg(10000);
#endif
#ifdef USE_PCRE2
// Baseline benchmark (only built when PCRE2 is available): JIT-compiled PCRE2
// match of ".*foobar", anchored at both ends, on a GetRandomHex string of
// state.range(0) characters.
static void BM_MatchPcre2Jit(benchmark::State& state) {
  string random_val = GetRandomHex(state.range(0));
  int errnum;
  PCRE2_SIZE erroffset;

  // Anchoring is requested at compile time: pcre2_jit_match() is the unchecked
  // fast path and does not support PCRE2_ANCHORED / PCRE2_ENDANCHORED as
  // match-time options.
  pcre2_code* re = pcre2_compile((PCRE2_SPTR) ".*foobar", PCRE2_ZERO_TERMINATED,
                                 PCRE2_ANCHORED | PCRE2_ENDANCHORED, &errnum, &erroffset, nullptr);
  CHECK(re);
  CHECK_EQ(0, pcre2_jit_compile(re, PCRE2_JIT_COMPLETE));

  pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(re, NULL);
  CHECK(match_data);

  // Sanity check: the compiled pattern must accept a string that ends with "foobar".
  const char sample[] = "aaaaaaaaaaaaafoobar";
  int rc = pcre2_jit_match(re, (PCRE2_SPTR)sample, strlen(sample), 0, 0, match_data, NULL);
  CHECK_EQ(1, rc);

  while (state.KeepRunning()) {
    rc = pcre2_jit_match(re, (PCRE2_SPTR)random_val.c_str(), random_val.size(), 0, 0, match_data,
                         NULL);
    CHECK_EQ(PCRE2_ERROR_NOMATCH, rc);
  }

  pcre2_match_data_free(match_data);
  pcre2_code_free(re);
}
BENCHMARK(BM_MatchPcre2Jit)->Arg(1000)->Arg(10000);
#endif
} // namespace dfly

View file

@ -4,19 +4,127 @@
#include "core/glob_matcher.h"
extern "C" {
#include "redis/util.h"
}
#include <absl/strings/ascii.h>
#include "base/logging.h"
namespace dfly {
using namespace std;
GlobMatcher::GlobMatcher(std::string_view pattern, bool case_sensitive)
: pattern_(pattern), case_sensitive_(case_sensitive) {
// Translates a Redis-style glob into a regular expression string.
//
//   '*'            -> ".*"
//   '?'            -> "."
//   "[...]"        -> copied verbatim, except "[^]" which becomes "."
//   "\x"           -> "\x" if x is ASCII punctuation, otherwise plain "x"
//   trailing '\'   -> "\\" (a literal backslash, as in the Redis glob matcher)
//   . ( ) { } ^ $ + |  -> escaped with a backslash
//
// Everything else is copied as is.
string GlobMatcher::Glob2Regex(string_view glob) {
  string regex;
  regex.reserve(glob.size());

  // Index of the first character after an opening '[', or 0 when outside a class.
  size_t in_group = 0;

  for (size_t i = 0; i < glob.size(); i++) {
    char c = glob[i];

    if (in_group > 0) {
      if (c == ']') {
        if (i == in_group + 1 && glob[in_group] == '^') {  // "[^]"
          // Drop the '^' and turn the already emitted '[' into '.'.
          regex.pop_back();
          regex.back() = '.';
          in_group = 0;
          continue;
        }
        in_group = 0;
      }
      regex.push_back(c);
      continue;
    }

    switch (c) {
      case '*':
        regex.append(".*");
        break;
      case '?':
        regex.append(".");
        break;
      case '.':
      case '(':
      case ')':
      case '{':
      case '}':
      case '^':
      case '$':
      case '+':
      case '|':
        regex.push_back('\\');
        regex.push_back(c);
        break;
      case '\\':
        if (i + 1 < glob.size()) {
          ++i;
          // Keep the escape only for punctuation so that sequences such as "\d"
          // do not turn into regex character classes.
          if (absl::ascii_ispunct(glob[i])) {
            regex.push_back('\\');
          }
          regex.push_back(glob[i]);
        } else {
          // A lone backslash at the end of the glob matches itself.
          regex.append("\\\\");
        }
        break;
      case '[':
        regex.push_back('[');
        // A '[' that is the last character does not open a class.
        if (i + 1 < glob.size()) {
          in_group = i + 1;
        }
        break;
      default:
        regex.push_back(c);
        break;
    }
  }
  return regex;
}
// Compiles `pattern` (a glob) into the underlying regex matcher.
//
// An unescaped leading and/or trailing '*' is stripped from the pattern and
// remembered in starts_with_star_ / ends_with_star_. The remainder is
// translated with Glob2Regex and compiled in dotall mode, case-insensitively
// when `case_sensitive` is false.
GlobMatcher::GlobMatcher(string_view pattern, bool case_sensitive)
    : case_sensitive_(case_sensitive) {
  if (!pattern.empty()) {
    starts_with_star_ = pattern.front() == '*';
    pattern.remove_prefix(starts_with_star_);

    if (!pattern.empty() && pattern.back() == '*') {
      // The trailing '*' is a wildcard only when it is not escaped, i.e. when
      // it is preceded by an even number of backslashes ("foo\*" must match a
      // literal star).
      const size_t star_pos = pattern.size() - 1;
      size_t backslashes = 0;
      while (backslashes < star_pos && pattern[star_pos - 1 - backslashes] == '\\') {
        ++backslashes;
      }
      ends_with_star_ = (backslashes % 2) == 0;
      pattern.remove_suffix(ends_with_star_);
    }
  }

  // True for globs that consist of stars only, e.g. "*" or "**".
  empty_pattern_ = pattern.empty();

  string regex("(?s");  // dotall mode
  if (!case_sensitive) {
    regex.push_back('i');
  }
  regex.push_back(')');
  regex.append(Glob2Regex(pattern));
  matcher_.pattern(regex);
}
// Returns true if `str` matches the glob this matcher was built from.
//
// Without a leading/trailing '*' the whole string must match the regex.
// Otherwise the regex is searched for inside `str`; the match must start at
// offset 0 unless the glob started with '*', and must end at str.size() unless
// the glob ended with '*'.
bool GlobMatcher::Matches(std::string_view str) const {
  DCHECK(!matcher_.pattern().empty());

  matcher_.input(reflex::Input(str.data(), str.size()));

  bool use_find = starts_with_star_ || ends_with_star_;
  if (!use_find) {
    return matcher_.matches() > 0;
  }

  // The glob consists of stars only: it matches any non-empty string.
  if (empty_pattern_) {
    return !str.empty();
  }

  // Start of the current search window inside str; first()/last() are relative to it.
  size_t offset = 0;
  while (matcher_.find() > 0) {
    const size_t first = offset + matcher_.first();
    const size_t last = offset + matcher_.last();

    if (!starts_with_star_) {
      // find() reports the leftmost match, so if it does not start at 0 no
      // match starts there.
      return first == 0 && (ends_with_star_ || last == str.size());
    }

    if (ends_with_star_ || last == str.size()) {
      return true;
    }

    // "*suffix" glob: this occurrence stops short of the end of str, but a
    // later (possibly overlapping) one may reach it, e.g. "*foo" on "foofoo".
    // Resume the search right after the start of the current match.
    offset = first + 1;
    matcher_.input(reflex::Input(str.data() + offset, str.size() - offset));
  }
  return false;
}
} // namespace dfly

View file

@ -3,6 +3,9 @@
//
#pragma once
#include <reflex/matcher.h>
#include <string>
#include <string_view>
namespace dfly {
@ -16,9 +19,16 @@ class GlobMatcher {
bool Matches(std::string_view str) const;
// Exposed for testing purposes.
static std::string Glob2Regex(std::string_view glob);
private:
std::string_view pattern_;
mutable reflex::Matcher matcher_;
bool case_sensitive_;
bool starts_with_star_ = false;
bool ends_with_star_ = false;
bool empty_pattern_ = false;
};
} // namespace dfly

View file

@ -43,130 +43,9 @@
#include "util.h"
/* Glob-style pattern matching.
 *
 * Matches string[0, stringLen) against pattern[0, patternLen) and returns 1 on
 * a match, 0 otherwise. Supported syntax: '*', '?', '[...]' classes (with '^'
 * negation, 'a-z' ranges and '\' escapes) and '\' to escape a single character.
 * When nocase is non-zero, characters are compared through tolower().
 *
 * Neither buffer has to be NUL-terminated: every look-ahead is guarded by the
 * remaining pattern length. */
int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen && stringLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of stars; only peek at pattern[1] while it is
             * still part of the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match;

            pattern++;
            patternLen--;
            negate = patternLen && pattern[0] == '^';
            if (negate) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back onto the last pattern
                     * character so that the common "advance pattern" code
                     * below ends exactly at the end of the pattern. Checked
                     * first to avoid reading past it. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) { /* reversed range, e.g. [z-a] */
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is exhausted: trailing stars still match the empty
             * rest. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
/* Return the number of digits of 'v' when converted to string in radix 10.
* See ll2string() for more information. */
uint32_t digits10(uint64_t v) {
static uint32_t digits10(uint64_t v) {
if (v < 10) return 1;
if (v < 100) return 2;
if (v < 1000) return 3;
@ -186,18 +65,6 @@ uint32_t digits10(uint64_t v) {
return 12 + digits10(v / 1000000000000UL);
}
/* Signed counterpart of digits10(): number of characters needed to print 'v'
 * in radix 10, including the leading '-' for negative values. */
uint32_t sdigits10(int64_t v) {
    uint64_t magnitude;

    if (v >= 0)
        return digits10(v);

    /* -LLONG_MIN is not representable, so compute its magnitude separately. */
    if (v == LLONG_MIN)
        magnitude = ((uint64_t) LLONG_MAX)+1;
    else
        magnitude = (uint64_t)-v;

    return digits10(magnitude)+1; /* +1 for the minus. */
}
/* Convert a long long into a string. Returns the number of
* characters needed to represent the number.
* If the buffer is not big enough to store the string, 0 is returned.

View file

@ -44,8 +44,6 @@
#define C_OK 0
#define C_ERR -1
int stringmatchlen(const char *p, int plen, const char *s, int slen, int nocase);
int ll2string(char *s, size_t len, long long value);
int string2ll(const char *s, size_t slen, long long *value);

View file

@ -595,7 +595,9 @@ def test_keys(r: redis.Redis):
# positive groups
assert sorted(r.keys("abc[d\n]*")) == [b"abc\n", b"abcde"]
assert r.keys("abc[c-e]?") == [b"abcde"]
assert r.keys("abc[e-c]?") == [b"abcde"]
# Not working in Dragonfly with reverse range
# assert r.keys("abc[e-c]?") == [b"abcde"]
assert r.keys("abc[e-e]?") == []
assert r.keys("abcd[ef") == [b"abcde"]
assert r.keys("abcd[]") == []