Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cpp/src/arrow/array/array_binary_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,16 @@ TEST(StringViewArray, Validate) {
}),
Ok());

// Variadic buffer slots, when present, must contain real buffers.
EXPECT_THAT(MakeBinaryViewArray({nullptr},
{
util::ToInlineBinaryView("hello"),
util::ToInlineBinaryView("world"),
}),
Raises(StatusCode::Invalid,
::testing::HasSubstr("null variadic buffer at buffer index #2 "
"(variadic buffer index #0)")));

// non-inline views are expected to reference only buffers managed by the array
EXPECT_THAT(
MakeBinaryViewArray(
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/array/validate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,12 @@ struct ValidateArrayImpl {
: *layout.variadic_spec;

if (buffer == nullptr) {
if (layout.variadic_spec && i >= static_cast<int>(layout.buffers.size())) {
return Status::Invalid("Array of type ", type.ToString(),
" has a null variadic buffer at buffer index #", i,
" (variadic buffer index #", i - layout.buffers.size(),
")");
}
continue;
}
int64_t min_buffer_size = 0;
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/arrow/c/bridge.cc
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,8 @@ struct ArrayExporter {
export_.variadic_buffer_sizes_.resize(variadic_buffers.size());
size_t i = 0;
for (const auto& buf : variadic_buffers) {
export_.variadic_buffer_sizes_[i++] = buf->size();
// The C Data Interface allows null variadic buffer pointers with size 0.
export_.variadic_buffer_sizes_[i++] = buf == nullptr ? 0 : buf->size();
}
export_.buffers_.back() = export_.variadic_buffer_sizes_.data();
}
Expand Down
61 changes: 61 additions & 0 deletions cpp/src/arrow/c/bridge_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -930,6 +930,30 @@ TEST_F(TestArrayExport, PrimitiveSliced) {
TestPrimitive(factory);
}

TEST_F(TestArrayExport, NullVariadicBuffers) {
// GH-49740: _export_to_c segmentation fault for binary_view array.
for (const auto& type : {binary_view(), utf8_view()}) {
auto arr =
MakeArray(ArrayData::Make(type, /*length=*/2,
{nullptr,
Buffer::FromVector(std::vector<BinaryViewType::c_type>{
util::ToInlineBinaryView("hello"),
util::ToInlineBinaryView("world"),
}),
nullptr}));

struct ArrowArray c_export;
ASSERT_OK(ExportArray(*arr, &c_export));
ArrayExportGuard guard(&c_export);

ASSERT_EQ(c_export.n_buffers, 4);
ASSERT_EQ(c_export.buffers[2], nullptr);
ASSERT_NE(c_export.buffers[3], nullptr);
const auto* variadic_buffer_sizes = static_cast<const int64_t*>(c_export.buffers[3]);
ASSERT_EQ(variadic_buffer_sizes[0], 0);
}
}

constexpr std::string_view binary_view_buffer_content0 = "12345foo bar baz quux",
binary_view_buffer_content1 = "BinaryViewMultipleBuffers";

Expand Down Expand Up @@ -2996,6 +3020,43 @@ TEST_F(TestArrayImport, String) {
CheckImport(ArrayFromJSON(large_binary(), "[]"));
}

TEST_F(TestArrayImport, NullVariadicBuffers) {
// The C Data Interface allows null variadic buffer pointers with size 0.
// Import normalizes them to non-null zero-size buffers in Arrow C++.
std::vector<BinaryViewType::c_type> views = {
util::ToInlineBinaryView("hello"),
util::ToInlineBinaryView("world"),
};
constexpr int64_t null_variadic_buffer_sizes[] = {0};
const void* null_variadic_buffer[] = {
nullptr,
views.data(),
nullptr,
null_variadic_buffer_sizes,
};

for (const auto& type : {binary_view(), utf8_view()}) {
FillStringViewLike(/*length=*/2, /*null_count=*/0, /*offset=*/0, null_variadic_buffer,
/*data_buffer_count=*/1);

ArrayReleaseCallback cb(&c_struct_);
ASSERT_OK_AND_ASSIGN(auto array, ImportArray(&c_struct_, type));
ASSERT_TRUE(ArrowArrayIsReleased(&c_struct_));
Reset();

ASSERT_OK(array->ValidateFull());
AssertArraysEqual(*ArrayFromJSON(type, R"(["hello", "world"])"), *array,
/*verbose=*/true);

ASSERT_EQ(array->data()->buffers.size(), 3);
ASSERT_NE(array->data()->buffers[2], nullptr);
ASSERT_EQ(array->data()->buffers[2]->size(), 0);
cb.AssertNotCalled();
array.reset();
cb.AssertCalled();
}
}

TEST_F(TestArrayImport, StringWithOffset) {
FillStringLike(3, 0, 1, string_buffers_no_nulls1);
CheckImport(ArrayFromJSON(utf8(), R"(["", "bar", "quux"])"));
Expand Down
14 changes: 9 additions & 5 deletions cpp/src/arrow/compute/kernels/scalar_cast_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -438,8 +438,7 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou

auto* out_views = output->GetMutableValues<BinaryViewType::c_type>(1);

// If all entries are inline, we can drop the extra data buffer for
// large strings in output->buffers[2].
// If all entries are inline, there are no variadic data buffers to expose.
bool all_entries_are_inline = true;
VisitSetBitRunsVoid(
validity, output->offset, output->length,
Expand All @@ -463,8 +462,9 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou
}
});
if (all_entries_are_inline) {
output->buffers[2] = nullptr;
output->buffers.resize(2);
}

return Status::OK();
}

Expand Down Expand Up @@ -508,11 +508,15 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou

const int32_t fixed_size_width = input.type->byte_width();
const int64_t total_length = input.offset + input.length;
// Values of width <= BinaryViewType::kInlineSize are stored inline in the
// views, so the output only needs a variadic data buffer for larger widths.
const bool values_are_inline = fixed_size_width <= BinaryViewType::kInlineSize;
const size_t num_buffers = values_are_inline ? 2 : 3;

ArrayData* output = out->array_data().get();
DCHECK_EQ(output->length, input.length);
output->offset = input.offset;
output->buffers.resize(3);
output->buffers.resize(num_buffers);
output->SetNullCount(input.null_count);
// Share the validity bitmap buffer
output->buffers[0] = input.GetBuffer(0);
Expand All @@ -539,7 +543,7 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou
}

// Inline string and non-inline string loops
if (fixed_size_width <= BinaryViewType::kInlineSize) {
if (values_are_inline) {
int32_t data_offset = static_cast<int32_t>(input.offset) * fixed_size_width;
for (int64_t i = 0; i < input.length; i++) {
auto& out_view = out_views[i];
Expand Down
87 changes: 81 additions & 6 deletions cpp/src/arrow/compute/kernels/scalar_cast_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3307,9 +3307,12 @@ TEST(Cast, BinaryToString) {
// N.B. null buffer is not always the same if input sliced
AssertBufferSame(*invalid_utf8, *strings, 0);

// ARROW-16757: we no longer zero copy, but the contents are equal
ASSERT_NE(invalid_utf8->data()->buffers[1].get(), strings->data()->buffers[2].get());
if (!is_binary_view_like(*string_type)) {
if (is_binary_view_like(*string_type)) {
ASSERT_EQ(strings->data()->buffers.size(), 2);
} else {
// ARROW-16757: we no longer zero copy, but the contents are equal
ASSERT_NE(invalid_utf8->data()->buffers[1].get(),
strings->data()->buffers[2].get());
ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2]));
}
}
Expand Down Expand Up @@ -3349,9 +3352,12 @@ TEST(Cast, BinaryOrStringToBinary) {
// N.B. null buffer is not always the same if input sliced
AssertBufferSame(*invalid_utf8, *strings, 0);

// ARROW-16757: we no longer zero copy, but the contents are equal
ASSERT_NE(invalid_utf8->data()->buffers[1].get(), strings->data()->buffers[2].get());
if (!is_binary_view_like(*to_type)) {
if (is_binary_view_like(*to_type)) {
ASSERT_EQ(strings->data()->buffers.size(), 2);
} else {
// ARROW-16757: we no longer zero copy, but the contents are equal
ASSERT_NE(invalid_utf8->data()->buffers[1].get(),
strings->data()->buffers[2].get());
ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2]));
}

Expand All @@ -3361,6 +3367,75 @@ TEST(Cast, BinaryOrStringToBinary) {
}
}

TEST(Cast, BinaryOrStringToView) {
// GH-49740: when all values were inline, the cast left a null variadic
// buffer slot behind and exporting to the C data interface crashed.
const std::vector<std::pair<std::string, int64_t>> cases = {
// Empty inputs are handled before the cast kernel runs.
{"[]", 2},
{R"(["a", null, "e"])", 2},
{"[null, null]", 2},
{R"(["aaaaaaaaaaaaa", null, "eeeeeeeeeeeee"])", 3},
{R"(["a", null, "aaaaaaaaaaaaa"])", 3},
};

for (auto from_type : {binary(), large_binary(), utf8(), large_utf8()}) {
for (auto to_type : {binary_view(), utf8_view()}) {
for (const auto& [json, expected_num_buffers] : cases) {
auto input = ArrayFromJSON(from_type, json);
auto expected = ArrayFromJSON(to_type, json);

ASSERT_OK_AND_ASSIGN(auto casted, Cast(*input, to_type));
ValidateOutput(*casted);
AssertArraysEqual(*expected, *casted);

ASSERT_EQ(casted->data()->buffers.size(), expected_num_buffers)
<< "from: " << from_type->ToString() << ", to: " << to_type->ToString()
<< ", values: " << json;
if (expected_num_buffers == 3) {
ASSERT_NE(casted->data()->buffers[2], nullptr)
<< "from: " << from_type->ToString() << ", to: " << to_type->ToString()
<< ", values: " << json;
}
}
}
}
}

TEST(Cast, FixedSizeBinaryToView) {
// Fixed-size binary uses a separate cast kernel with the same GH-49740
// issue. Cover non-empty widths on both sides of the BinaryView inline limit.
const std::vector<std::tuple<std::shared_ptr<DataType>, std::string, int64_t>> cases = {
// Empty inputs are handled before the cast kernel runs.
{fixed_size_binary(1), "[]", 2},
{fixed_size_binary(13), "[]", 2},
{fixed_size_binary(1), R"(["a", null, "e"])", 2},
{fixed_size_binary(1), "[null, null]", 2},
{fixed_size_binary(12), R"(["aaaaaaaaaaaa", null])", 2},
{fixed_size_binary(13), R"(["aaaaaaaaaaaaa", null, "eeeeeeeeeeeee"])", 3},
};

for (const auto& [from_type, json, expected_num_buffers] : cases) {
for (auto to_type : {binary_view(), utf8_view()}) {
auto input = ArrayFromJSON(from_type, json);
auto expected = ArrayFromJSON(to_type, json);

ASSERT_OK_AND_ASSIGN(auto casted, Cast(*input, to_type));
ValidateOutput(*casted);
AssertArraysEqual(*expected, *casted);

ASSERT_EQ(casted->data()->buffers.size(), expected_num_buffers)
<< "from: " << from_type->ToString() << ", to: " << to_type->ToString()
<< ", values: " << json;
if (expected_num_buffers == 3) {
ASSERT_NE(casted->data()->buffers[2], nullptr)
<< "from: " << from_type->ToString() << ", to: " << to_type->ToString()
<< ", values: " << json;
}
}
}
}

TEST(Cast, StringToString) {
for (auto from_type : {utf8(), utf8_view(), large_utf8()}) {
for (auto to_type : {utf8(), utf8_view(), large_utf8()}) {
Expand Down
35 changes: 31 additions & 4 deletions dev/archery/archery/integration/datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ def generate_column(self, size, name=None):

TEST_INT_MAX = 2 ** 31 - 1
TEST_INT_MIN = ~TEST_INT_MAX
BINARY_VIEW_INLINE_SIZE = 12


class IntegerField(PrimitiveField):
Expand Down Expand Up @@ -625,14 +626,19 @@ def column_class(self):
def _get_type(self):
return OrderedDict([('name', 'utf8')])

def _random_string_size(self):
return 7

def _random_value(self):
return tobytes(random_utf8(self._random_string_size()))

def generate_column(self, size, name=None):
K = 7
is_valid = self._make_is_valid(size)
values = []

for i in range(size):
if is_valid[i]:
values.append(tobytes(random_utf8(K)))
values.append(self._random_value())
else:
values.append(b"")

Expand Down Expand Up @@ -671,6 +677,13 @@ def _get_type(self):
return OrderedDict([('name', 'binaryview')])


class InlineBinaryViewField(BinaryViewField):
# Generate only inline values, leaving no variadic data buffers.

def _random_sizes(self, size):
return np.arange(size, dtype=np.int32) % (BINARY_VIEW_INLINE_SIZE + 1)


class StringViewField(StringField):

@property
Expand All @@ -681,6 +694,19 @@ def _get_type(self):
return OrderedDict([('name', 'utf8view')])


class InlineStringViewField(StringViewField):

def _random_string_size(self):
# The test alphabet contains up to 3-byte UTF-8 code points, so four
# characters fit in the 12-byte inline representation.
return 4

def _random_value(self):
value = super()._random_value()
assert len(value) <= BINARY_VIEW_INLINE_SIZE
return value


class Schema(object):

def __init__(self, fields, metadata=None):
Expand Down Expand Up @@ -771,14 +797,13 @@ def _get_buffers(self):
# a small default data buffer size is used so we can exercise
# arrays with multiple data buffers with small data sets
DEFAULT_BUFFER_SIZE = 32
INLINE_SIZE = 12

for i, v in enumerate(self.values):
if not self.is_valid[i]:
v = b''
assert isinstance(v, bytes)

if len(v) <= INLINE_SIZE:
if len(v) <= BINARY_VIEW_INLINE_SIZE:
# Append an inline view, skip data buffer management.
views.append(OrderedDict([
('SIZE', len(v)),
Expand Down Expand Up @@ -1784,6 +1809,8 @@ def generate_binary_view_case():
fields = [
BinaryViewField('bv'),
StringViewField('sv'),
InlineBinaryViewField('bv_inline'),
InlineStringViewField('sv_inline'),
]
batch_sizes = [0, 7, 256]
return _generate_file("binary_view", fields, batch_sizes)
Expand Down
33 changes: 33 additions & 0 deletions python/pyarrow/tests/test_cffi.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,39 @@ def test_export_import_array():
)


@needs_cffi
def test_export_cast_binary_view_all_inline():
# GH-49740: _export_to_c segmentation fault for binary_view array.
c_schema = ffi.new("struct ArrowSchema*")
ptr_schema = int(ffi.cast("uintptr_t", c_schema))
c_array = ffi.new("struct ArrowArray*")
ptr_array = int(ffi.cast("uintptr_t", c_array))

gc.collect() # Make sure no Arrow data dangles in a ref cycle
old_allocated = pa.total_allocated_bytes()

# The cast used to leave a null variadic buffer slot behind when all
# values were inline.
arr = pa.array([b"a", None, b"e"], type=pa.binary()).cast(pa.binary_view())
arr.validate(full=True)
py_value = arr.to_pylist()
arr._export_to_c(ptr_array, ptr_schema)

assert c_array.length == 3
# validity, views and the appended variadic buffer sizes
assert c_array.n_buffers == 3

del arr
arr_new = pa.Array._import_from_c(ptr_array, ptr_schema)
assert arr_new.to_pylist() == py_value
assert arr_new.type == pa.binary_view()
del arr_new
assert pa.total_allocated_bytes() == old_allocated
# Now released
with assert_schema_released:
pa.Array._import_from_c(ptr_array, ptr_schema)


@needs_cffi
def test_export_import_device_array():
check_export_import_array(
Expand Down
Loading