Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,24 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
if type is not None and type.id == _Type_EXTENSION:
extension_type = type
type = type.storage_type
# GH-49644: when building a fixed_shape_tensor from a sequence of arrays,
# the converter only sees the flat storage type, so validate the
# tensor-specific constraints here where the type is still known.
if (isinstance(extension_type, FixedShapeTensorType)
and isinstance(obj, (list, tuple))):
Comment thread
rok marked this conversation as resolved.
if extension_type.permutation is not None:
raise NotImplementedError(
"Converting a sequence of arrays to a fixed_shape_tensor with "
"a permutation is not supported; use "
"FixedShapeTensorArray.from_numpy_ndarray instead")
Comment thread
rok marked this conversation as resolved.
Outdated
if np is not None:
expected_shape = tuple(extension_type.shape)
for element in obj:
if (isinstance(element, np.ndarray) and element.ndim >= 2
and tuple(element.shape) != expected_shape):
raise ValueError(
f"Cannot convert array of shape {element.shape} to a "
f"fixed_shape_tensor of shape {expected_shape}")
Comment thread
rok marked this conversation as resolved.
Outdated

if from_pandas is None:
c_from_pandas = False
Expand Down
25 changes: 22 additions & 3 deletions python/pyarrow/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -908,13 +908,32 @@ class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {

Status AppendNdarray(PyObject* value) {
PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(value);
if (PyArray_NDIM(ndarray) != 1) {
return Status::Invalid("Can only convert 1-dimensional array values");
}
if (PyArray_ISBYTESWAPPED(ndarray)) {
// TODO
return Status::NotImplemented("Byte-swapped arrays not supported");
}
OwnedRef flattened;
if (PyArray_NDIM(ndarray) != 1) {
Comment thread
aboderinsamuel marked this conversation as resolved.
// GH-49644: a fixed-size list (e.g. fixed-shape-tensor storage) can be
// built from a multi-dimensional array, always flattened in C order
// regardless of the input's memory layout.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would remove the comment here. Maybe only mention the 0-dimensional case we are catching here, in short.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, replaced it with a short note about the 0-D / variable-sized-list case.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I missed this comment before. @AlenkaF how do you feel about allowing 0-dimensional cases through? I know they are useless, but allowing them could be helpful as it removes edge cases.

E.g., your application returns arrays whose dimensions are determined dynamically, and sometimes those come out as 0-dimensional, which you consider semantically correct. Instead of having to fight with PyArrow and introduce a workaround, you just pass the 0-dim array and you're done. What do you think?

if (PyArray_NDIM(ndarray) < 2 || this->list_type_->id() != Type::FIXED_SIZE_LIST) {
return Status::Invalid(
"Can only convert 1-dimensional array values to a variable-sized list");
}
Comment thread
rok marked this conversation as resolved.
Outdated
Comment thread
aboderinsamuel marked this conversation as resolved.
// Get an aligned, C-contiguous array (copying only if needed), then view
// it as 1-D so its values can be read directly in C order.
Comment thread
rok marked this conversation as resolved.
Outdated
PyObject* contiguous =
PyArray_CheckFromAny(value, nullptr, /*min_depth=*/0, /*max_depth=*/0,
NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_ALIGNED, nullptr);
RETURN_IF_PYERROR();
flattened.reset(
PyArray_Ravel(reinterpret_cast<PyArrayObject*>(contiguous), NPY_CORDER));
Comment thread
rok marked this conversation as resolved.
Py_DECREF(contiguous);
RETURN_IF_PYERROR();
value = flattened.obj();
ndarray = reinterpret_cast<PyArrayObject*>(value);
}
const int64_t size = PyArray_SIZE(ndarray);
RETURN_NOT_OK(AppendTo(this->list_type_, size));
RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size));
Expand Down
30 changes: 30 additions & 0 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2924,6 +2924,36 @@ def test_array_from_invalid_dim_raises():
pa.array(arr0d)


@pytest.mark.numpy
def test_fixed_size_list_from_multidim_ndarray():
# GH-49644: a fixed-size list can be built from multi-dimensional ndarray
# elements by flattening them in C order.
Comment thread
rok marked this conversation as resolved.
Outdated
arr = pa.array([np.array([[1, 2, 3]], dtype=np.int64),
np.array([[4, 5, 6]], dtype=np.int64)],
type=pa.list_(pa.int64(), 3))
assert arr.type == pa.list_(pa.int64(), 3)
assert arr.to_pylist() == [[1, 2, 3], [4, 5, 6]]
Comment thread
aboderinsamuel marked this conversation as resolved.

# A non-trivial 2D shape confirms values are flattened in C (row-major) order
arr = pa.array([np.array([[1, 2], [3, 4]], dtype=np.int64)],
type=pa.list_(pa.int64(), 4))
assert arr.to_pylist() == [[1, 2, 3, 4]]

# The flattened length must still match the fixed size
with pytest.raises(pa.lib.ArrowInvalid):
pa.array([np.array([[1, 2], [3, 4]], dtype=np.int64)],
type=pa.list_(pa.int64(), 3))

# Variable-sized lists still require 1-dimensional values
with pytest.raises(pa.lib.ArrowInvalid, match="1-dimensional"):
Comment thread
rok marked this conversation as resolved.
Outdated
pa.array([np.array([[1, 2, 3]], dtype=np.int64)],
type=pa.list_(pa.int64()))

# 0-dimensional arrays are still rejected (not flattened to length 1)
with pytest.raises(pa.lib.ArrowInvalid, match="1-dimensional"):
pa.array([np.array(1, dtype=np.int64)], type=pa.list_(pa.int64(), 1))
Comment thread
rok marked this conversation as resolved.
Outdated


@pytest.mark.numpy
def test_array_from_strided_bool():
# ARROW-6325
Expand Down
50 changes: 50 additions & 0 deletions python/pyarrow/tests/test_extension_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -1730,6 +1730,56 @@ def test_tensor_array_from_numpy(np_type_str):
pa.FixedShapeTensorArray.from_numpy_ndarray(arr, dim_names=[0, 1])


@pytest.mark.numpy
@pytest.mark.parametrize("np_type_str", ("int8", "int64", "float32"))
def test_tensor_array_from_list_of_ndarrays(np_type_str):
# GH-49644: build a fixed-shape-tensor array from a list of individual
# (multi-dimensional) ndarrays, not only from a single stacked ndarray.
Comment thread
rok marked this conversation as resolved.
Outdated
np_dtype = np.dtype(np_type_str)
tensor_type = pa.fixed_shape_tensor(pa.from_numpy_dtype(np_dtype), (2, 3))

elements = [
np.arange(6, dtype=np_dtype).reshape(2, 3),
np.arange(6, 12, dtype=np_dtype).reshape(2, 3),
]
result = pa.array(elements, type=tensor_type)
assert isinstance(result, pa.FixedShapeTensorArray)
assert result.type == tensor_type
assert len(result) == 2

# Must match the existing from_numpy_ndarray path on the same data
expected = pa.FixedShapeTensorArray.from_numpy_ndarray(np.stack(elements))
assert result.storage.equals(expected.storage)

# Each element round-trips back to the original ndarray (with its shape)
for scalar, original in zip(result, elements):
np.testing.assert_array_equal(scalar.to_numpy(), original)

# Higher-dimensional tensors work too
tensor_3d = pa.fixed_shape_tensor(pa.from_numpy_dtype(np_dtype), (2, 2, 3))
elements_3d = [np.arange(12, dtype=np_dtype).reshape(2, 2, 3)]
result_3d = pa.array(elements_3d, type=tensor_3d)
assert result_3d.type == tensor_3d
np.testing.assert_array_equal(result_3d[0].to_numpy(), elements_3d[0])

# None elements are allowed
result_with_null = pa.array([elements[0], None], type=tensor_type)
assert result_with_null.null_count == 1
assert result_with_null[1].as_py() is None

Comment thread
rok marked this conversation as resolved.
# A multi-dimensional element whose shape doesn't match the tensor shape is
# rejected, even when the total number of elements is the same (GH-49644).
with pytest.raises(ValueError, match="shape"):
pa.array([np.arange(6, dtype=np_dtype).reshape(3, 2)], type=tensor_type)

# Permuted tensor types can't be built from a sequence (the flatten would
# store the wrong layout), so they're rejected for now.
permuted_type = pa.fixed_shape_tensor(
pa.from_numpy_dtype(np_dtype), (2, 3), permutation=[1, 0])
with pytest.raises(NotImplementedError, match="permutation"):
pa.array(elements, type=permuted_type)


@pytest.mark.numpy
@pytest.mark.parametrize("tensor_type", (
pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]),
Expand Down
Loading