Skip to content
Open
16 changes: 16 additions & 0 deletions cpp/src/gandiva/precompiled/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,19 @@ add_gandiva_test(precompiled-test
GANDIVA_UNIT_TEST=1
ARROW_STATIC
GANDIVA_STATIC)

# microbenchmark (built as a gandiva test so it links the precompiled sources
# directly; run on demand via the gandiva-precompiled-benchmark binary)
add_gandiva_test(precompiled-benchmark
Comment thread
lriggs marked this conversation as resolved.
Outdated
SOURCES
../context_helper.cc
string_ops_benchmark.cc
string_ops.cc
EXTRA_INCLUDES
${CMAKE_SOURCE_DIR}/src
EXTRA_LINK_LIBS
Boost::headers
DEFINITIONS
GANDIVA_UNIT_TEST=1
ARROW_STATIC
GANDIVA_STATIC)
62 changes: 58 additions & 4 deletions cpp/src/gandiva/precompiled/string_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1896,7 +1896,14 @@ const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* t

for (; text_index <= text_len - from_str_len;) {
if (memcmp(text + text_index, from_str, from_str_len) == 0) {
if (out_index + text_index - last_match_index + to_str_len > max_length) {
// Compute the prospective length in gdv_int64: now that the wrapper may
// pass a max_length near INT_MAX, out_index can approach INT_MAX and a
// 32-bit sum would overflow before this guard runs -- precisely the case
// the guard exists to catch. (text_index - last_match_index) is a bounded
// non-negative span.
gdv_int64 prospective_len = static_cast<gdv_int64>(out_index) +
(text_index - last_match_index) + to_str_len;
if (prospective_len > max_length) {
gdv_fn_context_set_error_msg(context,
"REPLACE: Buffer overflow for output string");
*out_len = 0;
Expand Down Expand Up @@ -1932,7 +1939,8 @@ const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* t
return text;
}

if (out_index + text_len - last_match_index > max_length) {
gdv_int64 final_len = static_cast<gdv_int64>(out_index) + (text_len - last_match_index);
if (final_len > max_length) {
gdv_fn_context_set_error_msg(context, "REPLACE: Buffer overflow for output string");
*out_len = 0;
return "";
Expand All @@ -1948,9 +1956,55 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
gdv_int32 text_len, const char* from_str,
gdv_int32 from_str_len, const char* to_str,
gdv_int32 to_str_len, gdv_int32* out_len) {
// Size the output buffer so large results are not capped by an arbitrary
// limit, while avoiding a second pass over the input in the common case.
// - No replacement possible, or the result can only shrink/stay equal:
// text_len is a safe exact-or-upper bound, no scan.
// - Small expansion (replacement at most ~2x the match): use an O(1) upper
// bound that assumes every position matches. This over-allocates by at
// most ~text_len bytes but skips the match-counting scan entirely.
// - Large expansion: that upper bound could be many times the input for
// sparse matches, so count non-overlapping matches for the exact size.
gdv_int64 max_length;
if (from_str_len <= 0 || from_str_len > text_len || to_str_len <= from_str_len) {
max_length = text_len;
} else {
gdv_int32 delta = to_str_len - from_str_len; // > 0
gdv_int64 upper_bound = static_cast<gdv_int64>(text_len) +
(static_cast<gdv_int64>(text_len) / from_str_len) * delta;
if (delta <= from_str_len && upper_bound <= INT_MAX) {
max_length = upper_bound;
} else {
gdv_int64 num_matches = 0;
for (gdv_int32 i = 0; i <= text_len - from_str_len;) {
if (memcmp(text + i, from_str, from_str_len) == 0) {
num_matches++;
i += from_str_len;
} else {
i++;
}
}
// No matches: the result is the input unchanged; return it without calling
// the helper (which would otherwise scan the text a second time).
if (num_matches == 0) {
*out_len = text_len;
return text;
}
max_length = static_cast<gdv_int64>(text_len) + num_matches * delta;
}
}
// Gandiva variable-length output uses int32 offsets, so a single output string
// cannot exceed INT_MAX bytes. Report this explicitly instead of letting the
// cast below wrap silently.
if (max_length > INT_MAX) {
gdv_fn_context_set_error_msg(context,
"REPLACE: output string exceeds maximum size of 2GB");
*out_len = 0;
return "";
}
return replace_with_max_len_utf8_utf8_utf8(context, text, text_len, from_str,
from_str_len, to_str, to_str_len, 65535,
out_len);
from_str_len, to_str, to_str_len,
static_cast<gdv_int32>(max_length), out_len);
Comment thread
lriggs marked this conversation as resolved.
}

// Returns the quoted string (Includes escape character for any single quotes)
Expand Down
160 changes: 160 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops_benchmark.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Microbenchmark comparing the current REPLACE implementation against the
// pre-change one, to measure the cost of the O(n) match-counting scan that the
// fix added to size the output buffer exactly.
//
// new = replace_utf8_utf8_utf8 (counting scan + exact allocation)
// old = replace_with_max_len_utf8_utf8_utf8(..., capacity, ...)
// The old wrapper was literally a call to this with a fixed cap of
// 65535; given a buffer large enough to hold the result it is the
// pre-change algorithm with no counting scan, so new-vs-old isolates
// the scan overhead. (The shipped old cap of 65535 additionally errored
// out for any result above ~64 KB -- the bug this change fixes.)
//
// Reported time includes one ExecutionContext::Reset() per call in both arms
// (it cancels out in the delta), mirroring per-call arena reuse.

#include <chrono>
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <string>
#include <vector>

#include <gtest/gtest.h>

#include "gandiva/execution_context.h"
#include "gandiva/precompiled/types.h"

namespace gandiva {
namespace {

// Builds a `len`-byte string whose `match` byte occurs once every `stride`
// bytes (the rest filled with `filler`), giving len/stride non-overlapping
// matches.
std::string MakeText(int64_t len, int stride, char match, char filler) {
std::string s(static_cast<size_t>(len), filler);
for (int64_t i = 0; i < len; i += stride) {
s[static_cast<size_t>(i)] = match;
}
return s;
}

double TimeNsPerCall(const std::function<void()>& fn, int iters) {
auto t0 = std::chrono::steady_clock::now();
for (int i = 0; i < iters; ++i) {
fn();
}
auto t1 = std::chrono::steady_clock::now();
return std::chrono::duration<double, std::nano>(t1 - t0).count() / iters;
}

struct Case {
std::string name;
int64_t text_len;
int stride; // a match every `stride` bytes
std::string from;
std::string to;
int iters;
};

} // namespace

TEST(StringOpsBenchmark, Replace) {
ExecutionContext ctx;
int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);

const int64_t kSmall = 256;
const int64_t kMedium = 64 * 1024;
const int64_t kLarge = 4 * 1024 * 1024;

std::vector<Case> cases = {
// Expansion cases (to longer than from) -- these take the O(n) scan path.
{"small/dense expand a->ab", kSmall, 1, "a", "ab", 200000},
{"small/sparse expand a->ab", kSmall, 64, "a", "ab", 200000},
{"medium/dense expand a->ab", kMedium, 1, "a", "ab", 4000},
{"medium/sparse expand a->ab", kMedium, 64, "a", "ab", 4000},
{"large/dense expand a->ab", kLarge, 1, "a", "ab", 80},
{"large/sparse expand a->ab", kLarge, 64, "a", "ab", 80},
// Big-expansion cases (to_len - from_len > from_len) -- these fall back to
// the exact match-counting scan, so new should still show the scan delta.
{"large/dense bigexp a->abcd", kLarge, 1, "a", "abcd", 80},
{"large/sparse bigexp a->abcd", kLarge, 64, "a", "abcd", 80},
// Shrink case (to no longer than from) -- new skips the scan entirely.
{"large/dense shrink ab->a", kLarge, 2, "ab", "a", 80},
};

printf("\n%-28s %10s %9s %12s %12s %9s\n", "case", "text_len", "matches", "new ns/call",
"old ns/call", "delta");
printf("%s\n", std::string(92, '-').c_str());

for (const auto& c : cases) {
std::string text = MakeText(c.text_len, c.stride, c.from[0], 'x');
const char* tp = text.data();
auto tlen = static_cast<int32_t>(text.size());
const char* fp = c.from.data();
auto flen = static_cast<int32_t>(c.from.size());
const char* op = c.to.data();
auto olen = static_cast<int32_t>(c.to.size());

// Count matches and compute the exact output size to give the "old" arm a
// buffer large enough to complete (precomputed -- not part of the timing).
int64_t matches = 0;
if (flen > 0 && flen <= tlen) {
for (int32_t i = 0; i <= tlen - flen;) {
if (memcmp(tp + i, fp, flen) == 0) {
++matches;
i += flen;
} else {
++i;
}
}
}
auto out_capacity = static_cast<int32_t>(tlen + matches * (olen - flen));

auto run_new = [&]() {
int32_t out_len = 0;
replace_utf8_utf8_utf8(ctx_ptr, tp, tlen, fp, flen, op, olen, &out_len);
ctx.Reset();
};
auto run_old = [&]() {
int32_t out_len = 0;
replace_with_max_len_utf8_utf8_utf8(ctx_ptr, tp, tlen, fp, flen, op, olen,
out_capacity, &out_len);
ctx.Reset();
};

// Warm up (and verify both produce the same length / no error).
run_new();
ASSERT_FALSE(ctx.has_error());
Comment thread
lriggs marked this conversation as resolved.
Outdated
run_old();
ASSERT_FALSE(ctx.has_error());

double new_ns = TimeNsPerCall(run_new, c.iters);
double old_ns = TimeNsPerCall(run_old, c.iters);
double delta = old_ns > 0 ? (new_ns - old_ns) / old_ns * 100.0 : 0.0;

printf("%-28s %10" PRId64 " %9" PRId64 " %12.1f %12.1f %+8.1f%%\n", c.name.c_str(),
c.text_len, matches, new_ns, old_ns, delta);
}
printf("\n");
}

} // namespace gandiva
62 changes: 62 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1991,6 +1991,68 @@ TEST(TestStringOps, TestReplace) {
EXPECT_EQ(std::string(out_str, out_len), "TestString");
EXPECT_FALSE(ctx.has_error());

// No match on the large-expansion (counting) path: from "z" to "zzz" expands
// by more than from_len, so this exercises the count branch's zero-match
// early return.
out_str = replace_utf8_utf8_utf8(ctx_ptr, "TestString", 10, "z", 1, "zzz", 3, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "TestString");
EXPECT_FALSE(ctx.has_error());

// Large output (>64 KB) must not overflow: buffer is sized to the exact result.
Comment thread
lriggs marked this conversation as resolved.
std::string large_in(35000, 'X');
std::string large_expected(70000, '\0');
for (int i = 0; i < 35000; ++i) {
large_expected[2 * i] = 'X';
large_expected[2 * i + 1] = 'Y';
}
out_str = replace_utf8_utf8_utf8(ctx_ptr, large_in.data(),
static_cast<int32_t>(large_in.size()), "X", 1, "XY", 2,
&out_len);
EXPECT_EQ(out_len, 70000);
EXPECT_EQ(std::string(out_str, out_len), large_expected);
EXPECT_FALSE(ctx.has_error());

// Large shrinking output ("XX" -> "X") on a >64 KB input.
std::string large_shrink_in(70000, 'X');
std::string large_shrink_expected(35000, 'X');
out_str = replace_utf8_utf8_utf8(ctx_ptr, large_shrink_in.data(),
static_cast<int32_t>(large_shrink_in.size()), "XX", 2,
"X", 1, &out_len);
EXPECT_EQ(out_len, 35000);
EXPECT_EQ(std::string(out_str, out_len), large_shrink_expected);
EXPECT_FALSE(ctx.has_error());

// Edge case: result size of exactly 0 (every byte of text is removed). Takes
// the no-scan shrink path (to_str_len <= from_str_len).
out_str = replace_utf8_utf8_utf8(ctx_ptr, "aaaa", 4, "a", 1, "", 0, &out_len);
EXPECT_EQ(out_len, 0);
EXPECT_EQ(std::string(out_str, out_len), "");
EXPECT_FALSE(ctx.has_error());

// Edge case: result size one past the INT_MAX boundary. 65536 single-char
// matches each expanding to 32768 bytes gives max_length = 65536 * 32768 =
// 2^31 = INT_MAX + 1, so it is reported cleanly (guard fires before any alloc).
std::string boundary_in(65536, 'a');
std::string boundary_to(32768, 'b');
replace_utf8_utf8_utf8(
ctx_ptr, boundary_in.data(), static_cast<int32_t>(boundary_in.size()), "a", 1,
boundary_to.data(), static_cast<int32_t>(boundary_to.size()), &out_len);
EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("exceeds maximum size"));
EXPECT_EQ(out_len, 0);
ctx.Reset();

// Output that would exceed INT_MAX (2GB) is reported cleanly rather than
// silently wrapping the int32 size. 50000 matches each expanding to 50000
// bytes implies max_length = 2.5e9; the guard fires before any large alloc.
std::string huge_in(50000, 'X');
std::string huge_to(50000, 'Z');
replace_utf8_utf8_utf8(ctx_ptr, huge_in.data(), static_cast<int32_t>(huge_in.size()),
"X", 1, huge_to.data(), static_cast<int32_t>(huge_to.size()),
&out_len);
EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("exceeds maximum size"));
EXPECT_EQ(out_len, 0);
ctx.Reset();

replace_with_max_len_utf8_utf8_utf8(ctx_ptr, "Hell", 4, "ell", 3, "ollow", 5, 5,
&out_len);
EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer overflow for output string"));
Expand Down
Loading