Skip to content
Open
36 changes: 34 additions & 2 deletions cpp/src/gandiva/precompiled/string_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1914,9 +1914,41 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
gdv_int32 text_len, const char* from_str,
gdv_int32 from_str_len, const char* to_str,
gdv_int32 to_str_len, gdv_int32* out_len) {
// Size the output buffer to the exact result, so large results are not capped
// by an arbitrary limit. When the replacement is no longer than the matched
// text, the result can only shrink or stay the same, so text_len is a safe
// bound and we can skip scanning. Otherwise count non-overlapping matches to
// get the exact expanded size.
gdv_int64 max_length;
if (to_str_len <= from_str_len) {
max_length = text_len;
} else {
gdv_int64 num_matches = 0;
if (from_str_len > 0 && from_str_len <= text_len) {
for (gdv_int32 i = 0; i <= text_len - from_str_len;) {
if (memcmp(text + i, from_str, from_str_len) == 0) {
num_matches++;
i += from_str_len;
} else {
i++;
}
}
}
max_length =
Comment thread
lriggs marked this conversation as resolved.
Outdated
static_cast<gdv_int64>(text_len) + num_matches * (to_str_len - from_str_len);
}
// Gandiva variable-length output uses int32 offsets, so a single output string
// cannot exceed INT_MAX bytes. Report this explicitly instead of letting the
// cast below wrap silently.
if (max_length > INT_MAX) {
gdv_fn_context_set_error_msg(context,
"REPLACE: output string exceeds maximum size of 2GB");
*out_len = 0;
return "";
}
return replace_with_max_len_utf8_utf8_utf8(context, text, text_len, from_str,
from_str_len, to_str, to_str_len, 65535,
out_len);
from_str_len, to_str, to_str_len,
static_cast<gdv_int32>(max_length), out_len);
Comment thread
lriggs marked this conversation as resolved.
}

// Returns the quoted string (Includes escape character for any single quotes)
Expand Down
56 changes: 56 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1971,6 +1971,62 @@ TEST(TestStringOps, TestReplace) {
EXPECT_EQ(std::string(out_str, out_len), "TestString");
EXPECT_FALSE(ctx.has_error());

// Large output (>64 KB) must not overflow: buffer is sized to the exact result.
Comment thread
lriggs marked this conversation as resolved.
std::string large_in(35000, 'X');
std::string large_expected(70000, '\0');
for (int i = 0; i < 35000; ++i) {
large_expected[2 * i] = 'X';
large_expected[2 * i + 1] = 'Y';
}
out_str = replace_utf8_utf8_utf8(ctx_ptr, large_in.data(),
static_cast<int32_t>(large_in.size()), "X", 1, "XY", 2,
&out_len);
EXPECT_EQ(out_len, 70000);
EXPECT_EQ(std::string(out_str, out_len), large_expected);
EXPECT_FALSE(ctx.has_error());

// Large shrinking output ("XX" -> "X") on a >64 KB input.
std::string large_shrink_in(70000, 'X');
std::string large_shrink_expected(35000, 'X');
out_str = replace_utf8_utf8_utf8(ctx_ptr, large_shrink_in.data(),
static_cast<int32_t>(large_shrink_in.size()), "XX", 2,
"X", 1, &out_len);
EXPECT_EQ(out_len, 35000);
EXPECT_EQ(std::string(out_str, out_len), large_shrink_expected);
EXPECT_FALSE(ctx.has_error());

// Edge case: result size of exactly 0 (every byte of text is removed). Takes
// the no-scan shrink path (to_str_len <= from_str_len).
out_str = replace_utf8_utf8_utf8(ctx_ptr, "aaaa", 4, "a", 1, "", 0, &out_len);
EXPECT_EQ(out_len, 0);
EXPECT_EQ(std::string(out_str, out_len), "");
EXPECT_FALSE(ctx.has_error());

// Edge case: result size one past the INT_MAX boundary. 65536 single-char
// matches each expanding to 32768 bytes gives max_length = 65536 * 32768 =
// 2^31 = INT_MAX + 1, so it is reported cleanly (guard fires before any alloc).
std::string boundary_in(65536, 'a');
std::string boundary_to(32768, 'b');
replace_utf8_utf8_utf8(ctx_ptr, boundary_in.data(),
static_cast<int32_t>(boundary_in.size()), "a", 1,
boundary_to.data(), static_cast<int32_t>(boundary_to.size()),
&out_len);
EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("exceeds maximum size"));
EXPECT_EQ(out_len, 0);
ctx.Reset();

// Output that would exceed INT_MAX (2GB) is reported cleanly rather than
// silently wrapping the int32 size. 50000 matches each expanding to 50000
// bytes implies max_length = 2.5e9; the guard fires before any large alloc.
std::string huge_in(50000, 'X');
std::string huge_to(50000, 'Z');
replace_utf8_utf8_utf8(ctx_ptr, huge_in.data(), static_cast<int32_t>(huge_in.size()),
"X", 1, huge_to.data(), static_cast<int32_t>(huge_to.size()),
&out_len);
EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("exceeds maximum size"));
EXPECT_EQ(out_len, 0);
ctx.Reset();

replace_with_max_len_utf8_utf8_utf8(ctx_ptr, "Hell", 4, "ell", 3, "ollow", 5, 5,
&out_len);
EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Buffer overflow for output string"));
Expand Down
Loading