From f51fb4a445815ba070d515d0346aad43d5612011 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 29 Jun 2026 15:06:49 -0700 Subject: [PATCH] feat(evolution): support v3 column default values in UpdateSchema (4/4) AddColumn / AddRequiredColumn now accept an optional default value, used as both the initial-default and write-default of the new column; a non-null default also lets a required column be added (or an added column be made required) without AllowIncompatibleChanges(). UpdateColumnDefault sets or clears the write-default. Defaults are cast to the column type (rejecting uncastable or out-of-range values) and preserved across rename / doc / type updates and nested field-id reassignment. Part 4 of the v3 column-default-values work (POC #731), built on #746. --- .../test/resources/TableMetadataV3Valid.json | 123 ++++++++++ src/iceberg/test/update_schema_test.cc | 228 ++++++++++++++++++ src/iceberg/update/update_schema.cc | 157 +++++++++--- src/iceberg/update/update_schema.h | 56 ++++- src/iceberg/util/type_util.cc | 19 +- 5 files changed, 535 insertions(+), 48 deletions(-) create mode 100644 src/iceberg/test/resources/TableMetadataV3Valid.json diff --git a/src/iceberg/test/resources/TableMetadataV3Valid.json b/src/iceberg/test/resources/TableMetadataV3Valid.json new file mode 100644 index 000000000..712d4b5bd --- /dev/null +++ b/src/iceberg/test/resources/TableMetadataV3Valid.json @@ -0,0 +1,123 @@ +{ + "format-version": 3, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "next-row-id": 0, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 1, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + } + ] + }, + { + "type": "struct", + "schema-id": 1, + "identifier-field-ids": [ + 1, + 2 + ], + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "last-partition-id": 1000, + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": 3055729675574597004, + "snapshots": [ + { + "snapshot-id": 3051729675574597004, + "timestamp-ms": 1515100955770, + "sequence-number": 0, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/1.avro" + }, + { + "snapshot-id": 3055729675574597004, + "parent-snapshot-id": 3051729675574597004, + "timestamp-ms": 1555100955770, + "sequence-number": 1, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/2.avro", + "schema-id": 1 + } + ], + "snapshot-log": [ + { + "snapshot-id": 3051729675574597004, + "timestamp-ms": 1515100955770 + }, + { + "snapshot-id": 3055729675574597004, + "timestamp-ms": 1555100955770 + } + ], + "metadata-log": [] +} diff --git a/src/iceberg/test/update_schema_test.cc b/src/iceberg/test/update_schema_test.cc index 07872e69a..60f84faee 100644 --- a/src/iceberg/test/update_schema_test.cc +++ b/src/iceberg/test/update_schema_test.cc @@ -19,11 +19,13 @@ #include "iceberg/update/update_schema.h" +#include #include #include #include +#include "iceberg/expression/literal.h" #include "iceberg/schema.h" #include "iceberg/schema_field.h" #include "iceberg/test/matchers.h" @@ -82,6 +84,232 @@ TEST_F(UpdateSchemaTest, AddRequiredColumnWithAllowIncompatible) { EXPECT_EQ(new_field.doc(), "A required string column"); } +/// Default values require a v3 table for Apply() to validate successfully. +class UpdateSchemaDefaultValueTest : public UpdateSchemaTest { + protected: + std::string MetadataResource() const override { return "TableMetadataV3Valid.json"; } +}; + +TEST_F(UpdateSchemaTest, AddColumnWithDefaultValueRequiresV3) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("new_col", int32(), "An integer column", Literal::Int(42)); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + EXPECT_THAT(result, HasErrorMessage("is not supported until v3")); +} + +TEST_F(UpdateSchemaDefaultValueTest, AddColumnWithDefaultValue) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("new_col", int32(), "An integer column", Literal::Int(42)); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ICEBERG_UNWRAP_OR_FAIL(auto new_field_opt, result.schema->FindFieldByName("new_col")); + ASSERT_TRUE(new_field_opt.has_value()); + + const auto& new_field = new_field_opt->get(); + ASSERT_NE(new_field.initial_default(), nullptr); + EXPECT_EQ(*new_field.initial_default(), Literal::Int(42)); + ASSERT_NE(new_field.write_default(), nullptr); + EXPECT_EQ(*new_field.write_default(), Literal::Int(42)); +} + +TEST_F(UpdateSchemaDefaultValueTest, AddRequiredColumnWithDefaultValue) { + // A required column with a default does not need AllowIncompatibleChanges(): + // old rows read the initial-default instead of null. + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddRequiredColumn("required_col", string(), "A required string column", + Literal::String("n/a")); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ICEBERG_UNWRAP_OR_FAIL(auto new_field_opt, + result.schema->FindFieldByName("required_col")); + ASSERT_TRUE(new_field_opt.has_value()); + + const auto& new_field = new_field_opt->get(); + EXPECT_FALSE(new_field.optional()); + ASSERT_NE(new_field.initial_default(), nullptr); + EXPECT_EQ(*new_field.initial_default(), Literal::String("n/a")); + ASSERT_NE(new_field.write_default(), nullptr); + EXPECT_EQ(*new_field.write_default(), Literal::String("n/a")); +} + +TEST_F(UpdateSchemaDefaultValueTest, AddColumnWithMismatchedDefaultValueFails) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("new_col", int32(), "An integer column", Literal::String("oops")); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot cast default value")); +} + +TEST_F(UpdateSchemaDefaultValueTest, AddColumnWithNarrowingDefaultValueFails) { + // CastTo signals narrowing with AboveMax/BelowMin sentinels; they must not be + // stored as defaults. + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("new_col", int32(), "An integer column", + Literal::Long(std::numeric_limits::max())); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot cast default value")); +} + +TEST_F(UpdateSchemaDefaultValueTest, UpdateColumnDefault) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("new_col", int32(), "An integer column", Literal::Int(42)) + .UpdateColumnDefault("new_col", Literal::Int(7)); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ICEBERG_UNWRAP_OR_FAIL(auto new_field_opt, result.schema->FindFieldByName("new_col")); + ASSERT_TRUE(new_field_opt.has_value()); + + const auto& new_field = new_field_opt->get(); + // initial-default is fixed at column addition; write-default is updated. + ASSERT_NE(new_field.initial_default(), nullptr); + EXPECT_EQ(*new_field.initial_default(), Literal::Int(42)); + ASSERT_NE(new_field.write_default(), nullptr); + EXPECT_EQ(*new_field.write_default(), Literal::Int(7)); +} + +TEST_F(UpdateSchemaDefaultValueTest, UpdateColumnDefaultOnExistingColumn) { + // Updating the write-default of a pre-existing column must survive Apply(). + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->UpdateColumnDefault("x", Literal::Long(0)); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ICEBERG_UNWRAP_OR_FAIL(auto field_opt, result.schema->FindFieldByName("x")); + ASSERT_TRUE(field_opt.has_value()); + + const auto& field = field_opt->get(); + EXPECT_EQ(field.initial_default(), nullptr); + ASSERT_NE(field.write_default(), nullptr); + EXPECT_EQ(*field.write_default(), Literal::Long(0)); +} + +TEST_F(UpdateSchemaDefaultValueTest, UpdateColumnDefaultClearsWithNullopt) { + // Passing std::nullopt removes the write-default (Java parity with null). + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("new_col", int32(), "An integer column", Literal::Int(42)) + .UpdateColumnDefault("new_col", std::nullopt); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ICEBERG_UNWRAP_OR_FAIL(auto field_opt, result.schema->FindFieldByName("new_col")); + ASSERT_TRUE(field_opt.has_value()); + + const auto& field = field_opt->get(); + // initial-default stays; write-default is cleared. + ASSERT_NE(field.initial_default(), nullptr); + EXPECT_EQ(*field.initial_default(), Literal::Int(42)); + EXPECT_EQ(field.write_default(), nullptr); +} + +TEST_F(UpdateSchemaDefaultValueTest, AddNestedColumnPreservesNestedDefaults) { + // The added column's type gets fresh field ids; defaults on its nested fields must + // survive the reassignment. + auto nested_type = std::make_shared(std::vector{ + SchemaField(/*field_id=*/100, "inner", int32(), /*optional=*/false, /*doc=*/{}, + std::make_shared(Literal::Int(5)), + std::make_shared(Literal::Int(9)))}); + + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("outer", nested_type, "A nested column"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ICEBERG_UNWRAP_OR_FAIL(auto outer_opt, result.schema->FindFieldByName("outer")); + ASSERT_TRUE(outer_opt.has_value()); + + const auto& outer_struct = + internal::checked_cast(*outer_opt->get().type()); + ASSERT_EQ(outer_struct.fields().size(), 1); + const SchemaField& inner = outer_struct.fields()[0]; + ASSERT_NE(inner.initial_default(), nullptr); + EXPECT_EQ(*inner.initial_default(), Literal::Int(5)); + ASSERT_NE(inner.write_default(), nullptr); + EXPECT_EQ(*inner.write_default(), Literal::Int(9)); +} + +TEST_F(UpdateSchemaDefaultValueTest, UpdateColumnDefaultCastsToColumnType) { + // An int default for a long column is cast to the column type, not rejected. + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->UpdateColumnDefault("x", Literal::Int(5)); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ICEBERG_UNWRAP_OR_FAIL(auto field_opt, result.schema->FindFieldByName("x")); + ASSERT_TRUE(field_opt.has_value()); + + const auto& field = field_opt->get(); + ASSERT_NE(field.write_default(), nullptr); + EXPECT_EQ(*field.write_default(), Literal::Long(5)); +} + +TEST_F(UpdateSchemaDefaultValueTest, RequireColumnAddedWithDefault) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("new_col", int32(), "An integer column", Literal::Int(42)) + .RequireColumn("new_col"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ICEBERG_UNWRAP_OR_FAIL(auto new_field_opt, result.schema->FindFieldByName("new_col")); + ASSERT_TRUE(new_field_opt.has_value()); + EXPECT_FALSE(new_field_opt->get().optional()); +} + +TEST_F(UpdateSchemaDefaultValueTest, UpdateColumnDocPreservesDefaultValues) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("new_col", int32(), "An integer column", Literal::Int(42)) + .UpdateColumnDoc("new_col", "updated doc"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ICEBERG_UNWRAP_OR_FAIL(auto field_opt, result.schema->FindFieldByName("new_col")); + ASSERT_TRUE(field_opt.has_value()); + + const auto& field = field_opt->get(); + EXPECT_EQ(field.doc(), "updated doc"); + ASSERT_NE(field.initial_default(), nullptr); + EXPECT_EQ(*field.initial_default(), Literal::Int(42)); + ASSERT_NE(field.write_default(), nullptr); + EXPECT_EQ(*field.write_default(), Literal::Int(42)); +} + +TEST_F(UpdateSchemaDefaultValueTest, UpdateColumnTypePromotesDefaultValues) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("new_col", int32(), "An integer column", Literal::Int(42)) + .UpdateColumn("new_col", int64()); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ICEBERG_UNWRAP_OR_FAIL(auto field_opt, result.schema->FindFieldByName("new_col")); + ASSERT_TRUE(field_opt.has_value()); + + const auto& field = field_opt->get(); + EXPECT_EQ(field.type(), int64()); + ASSERT_NE(field.initial_default(), nullptr); + EXPECT_EQ(*field.initial_default(), Literal::Long(42)); + ASSERT_NE(field.write_default(), nullptr); + EXPECT_EQ(*field.write_default(), Literal::Long(42)); +} + +TEST_F(UpdateSchemaDefaultValueTest, UpdateColumnTypePromotesDecimalDefault) { + // decimal(9,2) -> decimal(18,2) is an allowed precision widening. Literal::CastTo + // does not cast between decimal types, so the default must still be promoted (the + // unscaled value is unchanged). + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update + ->AddColumn("new_col", decimal(9, 2), "A decimal column", + Literal::Decimal(1234, 9, 2)) + .UpdateColumn("new_col", decimal(18, 2)); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ICEBERG_UNWRAP_OR_FAIL(auto field_opt, result.schema->FindFieldByName("new_col")); + ASSERT_TRUE(field_opt.has_value()); + + const auto& field = field_opt->get(); + EXPECT_EQ(field.type()->ToString(), decimal(18, 2)->ToString()); + ASSERT_NE(field.initial_default(), nullptr); + EXPECT_EQ(*field.initial_default(), Literal::Decimal(1234, 18, 2)); + ASSERT_NE(field.write_default(), nullptr); + EXPECT_EQ(*field.write_default(), Literal::Decimal(1234, 18, 2)); +} + TEST_F(UpdateSchemaTest, AddMultipleColumns) { ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); update->AddColumn("col1", int32(), "First column") diff --git a/src/iceberg/update/update_schema.cc b/src/iceberg/update/update_schema.cc index 5c50ee41d..5b09b1314 100644 --- a/src/iceberg/update/update_schema.cc +++ b/src/iceberg/update/update_schema.cc @@ -29,6 +29,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/json_serde_internal.h" #include "iceberg/name_mapping.h" #include "iceberg/schema.h" @@ -39,6 +40,7 @@ #include "iceberg/type.h" #include "iceberg/util/checked_cast.h" #include "iceberg/util/error_collector.h" +#include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/macros.h" #include "iceberg/util/string_util.h" #include "iceberg/util/type_util.h" @@ -213,12 +215,17 @@ class ApplyChangesVisitor { // any child fields that were added. if (update_it != updates_.end()) { + // The updated field already carries the new name/doc/nullability/defaults; only + // its id and (recursively rewritten) type need to be applied. const auto& update_field = update_it->second; return SchemaField(field_id, update_field->name(), std::move(result_type), - update_field->optional(), update_field->doc()); + update_field->optional(), update_field->doc(), + update_field->initial_default(), update_field->write_default()); } else if (result_type != field.type()) { + // A nested type was rewritten by recursion; preserve every other attribute, + // including any default values. return SchemaField(field_id, field.name(), std::move(result_type), field.optional(), - field.doc()); + field.doc(), field.initial_default(), field.write_default()); } else { return field; } @@ -345,39 +352,43 @@ UpdateSchema& UpdateSchema::CaseSensitive(bool case_sensitive) { } UpdateSchema& UpdateSchema::AddColumn(std::string_view name, std::shared_ptr type, - std::string_view doc) { + std::string_view doc, + std::optional default_value) { ICEBERG_BUILDER_CHECK(!name.contains('.'), "Cannot add column with ambiguous name: {}, use " "AddColumn(parent, name, type, doc)", name); - return AddColumnInternal(std::nullopt, name, /*is_optional=*/true, std::move(type), - doc); + return AddColumnInternal(std::nullopt, name, /*is_optional=*/true, std::move(type), doc, + std::move(default_value)); } UpdateSchema& UpdateSchema::AddColumn(std::optional parent, std::string_view name, std::shared_ptr type, - std::string_view doc) { + std::string_view doc, + std::optional default_value) { return AddColumnInternal(std::move(parent), name, /*is_optional=*/true, std::move(type), - doc); + doc, std::move(default_value)); } UpdateSchema& UpdateSchema::AddRequiredColumn(std::string_view name, std::shared_ptr type, - std::string_view doc) { + std::string_view doc, + std::optional default_value) { ICEBERG_BUILDER_CHECK(!name.contains('.'), "Cannot add column with ambiguous name: {}, use " "AddRequiredColumn(parent, name, type, doc)", name); return AddColumnInternal(std::nullopt, name, /*is_optional=*/false, std::move(type), - doc); + doc, std::move(default_value)); } UpdateSchema& UpdateSchema::AddRequiredColumn(std::optional parent, std::string_view name, std::shared_ptr type, - std::string_view doc) { + std::string_view doc, + std::optional default_value) { return AddColumnInternal(std::move(parent), name, /*is_optional=*/false, - std::move(type), doc); + std::move(type), doc, std::move(default_value)); } UpdateSchema& UpdateSchema::UpdateColumn(std::string_view name, @@ -399,8 +410,35 @@ UpdateSchema& UpdateSchema::UpdateColumn(std::string_view name, "Cannot change column type: {}: {} -> {}", name, field.type()->ToString(), new_type->ToString()); + // The SchemaField ctor stores defaults verbatim without coercing them to the field + // type, so an existing default must be promoted to the new type here; otherwise it + // would keep the old type and be rejected when the schema is validated. + auto promote_default = [&](const std::shared_ptr& value) + -> Result> { + if (value == nullptr) { + return nullptr; + } + // IsPromotionAllowed permits widening a decimal to a larger precision at the same + // scale, but Literal::CastTo only handles identical decimal types. Such a promotion + // leaves the unscaled value unchanged, so rebuild the literal with the new type. + if (value->type()->type_id() == TypeId::kDecimal && + new_type->type_id() == TypeId::kDecimal) { + const auto& decimal_type = internal::checked_cast(*new_type); + return std::make_shared( + Literal::Decimal(std::get(value->value()).value(), + decimal_type.precision(), decimal_type.scale())); + } + ICEBERG_ASSIGN_OR_RAISE(Literal promoted, value->CastTo(new_type)); + return std::make_shared(std::move(promoted)); + }; + ICEBERG_BUILDER_ASSIGN_OR_RETURN(std::shared_ptr initial_default, + promote_default(field.initial_default())); + ICEBERG_BUILDER_ASSIGN_OR_RETURN(std::shared_ptr write_default, + promote_default(field.write_default())); + updates_[field_id] = std::make_shared( - field.field_id(), field.name(), new_type, field.optional(), field.doc()); + field.field_id(), field.name(), new_type, field.optional(), field.doc(), + std::move(initial_default), std::move(write_default)); return *this; } @@ -420,9 +458,52 @@ UpdateSchema& UpdateSchema::UpdateColumnDoc(std::string_view name, return *this; } - updates_[field_id] = - std::make_shared(field.field_id(), field.name(), field.type(), - field.optional(), std::string(new_doc)); + updates_[field_id] = std::make_shared( + field.field_id(), field.name(), field.type(), field.optional(), new_doc, + field.initial_default(), field.write_default()); + + return *this; +} + +UpdateSchema& UpdateSchema::UpdateColumnDefault(std::string_view name, + std::optional new_default) { + ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto field_opt, FindFieldForUpdate(name)); + ICEBERG_BUILDER_CHECK(field_opt.has_value(), "Cannot update missing column: {}", name); + + const auto& field = field_opt->get(); + int32_t field_id = field.field_id(); + + ICEBERG_BUILDER_CHECK(!deletes_.contains(field_id), + "Cannot update a column that will be deleted: {}", field.name()); + + // Only the write-default is replaced; the initial-default is fixed when the column is + // added and is preserved here. + auto rebuild = [&](std::shared_ptr write_default) { + return std::make_shared( + field.field_id(), field.name(), field.type(), field.optional(), field.doc(), + field.initial_default(), std::move(write_default)); + }; + + // An empty default clears the column's write-default. + if (!new_default.has_value()) { + updates_[field_id] = rebuild(nullptr); + return *this; + } + + ICEBERG_BUILDER_CHECK(field.type()->is_primitive(), + "Invalid default value for {}: {} (must be null)", *field.type(), + *new_default); + ICEBERG_BUILDER_ASSIGN_OR_RETURN_WITH_ERROR( + Literal typed_default, + new_default->CastTo(internal::checked_pointer_cast(field.type())), + "Cannot cast default value to {}: {}", *field.type(), *new_default); + // CastTo reports narrowing by returning sentinel values instead of failing. + ICEBERG_BUILDER_CHECK(!typed_default.IsNull() && !typed_default.IsAboveMax() && + !typed_default.IsBelowMin(), + "Cannot cast default value to {}: {}", *field.type(), + *new_default); + + updates_[field_id] = rebuild(std::make_shared(std::move(typed_default))); return *this; } @@ -444,8 +525,8 @@ UpdateSchema& UpdateSchema::RenameColumn(std::string_view name, update_it != updates_.end() ? *update_it->second : field; updates_[field_id] = std::make_shared( - base_field.field_id(), std::string(new_name), base_field.type(), - base_field.optional(), base_field.doc()); + base_field.field_id(), new_name, base_field.type(), base_field.optional(), + base_field.doc(), base_field.initial_default(), base_field.write_default()); auto it = std::ranges::find(identifier_field_names_, name); if (it != identifier_field_names_.end()) { @@ -474,9 +555,10 @@ UpdateSchema& UpdateSchema::UpdateColumnRequirementInternal(std::string_view nam return *this; } - // TODO(GuotaoYu): support added column with default value - // bool is_defaulted_add = IsAdded(name) && field.initial_default() != null; - bool is_defaulted_add = false; + // A column added in this update with a default value can be made required: rows + // written before the change read the initial-default instead of null. + bool is_defaulted_add = added_name_to_id_.contains(CaseSensitivityAwareName(name)) && + field.initial_default() != nullptr; ICEBERG_BUILDER_CHECK(is_optional || is_defaulted_add || allow_incompatible_changes_, "Cannot change column nullability: {}: optional -> required", @@ -625,11 +707,11 @@ Result UpdateSchema::Apply() { .updated_props = std::move(updated_props)}; } -// TODO(Guotao Yu): v3 default value is not yet supported UpdateSchema& UpdateSchema::AddColumnInternal(std::optional parent, std::string_view name, bool is_optional, std::shared_ptr type, - std::string_view doc) { + std::string_view doc, + std::optional default_value) { int32_t parent_id = kTableRootId; std::string full_name; @@ -683,7 +765,7 @@ UpdateSchema& UpdateSchema::AddColumnInternal(std::optional pa } ICEBERG_BUILDER_CHECK( - is_optional || allow_incompatible_changes_, + is_optional || default_value.has_value() || allow_incompatible_changes_, "Incompatible change: cannot add required column without a default value: {}", full_name); @@ -697,11 +779,32 @@ UpdateSchema& UpdateSchema::AddColumnInternal(std::optional pa AssignFreshIdVisitor id_assigner([this]() { return AssignNewColumnId(); }); auto type_with_fresh_ids = id_assigner.Visit(type); - auto new_field = std::make_shared(new_id, std::string(name), - std::move(type_with_fresh_ids), - is_optional, std::string(doc)); + std::shared_ptr initial_default; + std::shared_ptr write_default; + if (default_value.has_value()) { + ICEBERG_BUILDER_CHECK(type_with_fresh_ids->is_primitive(), + "Invalid default value for {}: {} (must be null)", + *type_with_fresh_ids, *default_value); + ICEBERG_BUILDER_ASSIGN_OR_RETURN_WITH_ERROR( + Literal typed_default, + default_value->CastTo( + internal::checked_pointer_cast(type_with_fresh_ids)), + "Cannot cast default value to {}: {}", *type_with_fresh_ids, *default_value); + // CastTo reports narrowing by returning sentinel values instead of failing. + ICEBERG_BUILDER_CHECK(!typed_default.IsNull() && !typed_default.IsAboveMax() && + !typed_default.IsBelowMin(), + "Cannot cast default value to {}: {}", *type_with_fresh_ids, + *default_value); + // A new column's default applies both to rows written before it existed + // (initial-default) and to writers that omit a value (write-default). + auto shared_default = std::make_shared(std::move(typed_default)); + initial_default = shared_default; + write_default = std::move(shared_default); + } - updates_[new_id] = std::move(new_field); + updates_[new_id] = std::make_shared( + new_id, name, std::move(type_with_fresh_ids), is_optional, doc, + std::move(initial_default), std::move(write_default)); parent_to_added_ids_[parent_id].push_back(new_id); return *this; diff --git a/src/iceberg/update/update_schema.h b/src/iceberg/update/update_schema.h index 2be3732a0..e2e7fce1f 100644 --- a/src/iceberg/update/update_schema.h +++ b/src/iceberg/update/update_schema.h @@ -30,6 +30,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/iceberg_export.h" #include "iceberg/result.h" #include "iceberg/type_fwd.h" @@ -41,10 +42,6 @@ namespace iceberg { /// /// When committing, these changes will be applied to the current table metadata. /// Commit conflicts will not be resolved and will result in a CommitFailed error. -/// -/// TODO(Guotao Yu): Add support for V3 default values when adding columns. Currently, all -/// added columns use null as the default value, but Iceberg V3 supports custom -/// default values for new columns. class ICEBERG_EXPORT UpdateSchema : public PendingUpdate { public: static Result> Make( @@ -78,15 +75,19 @@ class ICEBERG_EXPORT UpdateSchema : public PendingUpdate { /// If type is a nested type, its field IDs are reassigned when added to the /// existing schema. /// - /// The added column will be optional with a null default value. + /// Without a default value, the added column is optional with a null default. + /// With a default value (v3+), it is used as both the `initial-default` and the + /// `write-default` of the new column. /// /// \param name Name for the new column. /// \param type Type for the new column. /// \param doc Documentation string for the new column. + /// \param default_value Optional default value for the new column (v3+). /// \return Reference to this for method chaining. /// \note InvalidArgument will be reported if name contains ".". UpdateSchema& AddColumn(std::string_view name, std::shared_ptr type, - std::string_view doc = ""); + std::string_view doc = "", + std::optional default_value = std::nullopt); /// \brief Add a new optional column to a nested struct with documentation. /// @@ -102,22 +103,28 @@ class ICEBERG_EXPORT UpdateSchema : public PendingUpdate { /// If type is a nested type, its field IDs are reassigned when added to the /// existing schema. /// - /// The added column will be optional with a null default value. + /// Without a default value, the added column is optional with a null default. + /// With a default value (v3+), it is used as both the `initial-default` and the + /// `write-default` of the new column. /// /// \param parent Name of the parent struct to which the column will be added. /// \param name Name for the new column. /// \param type Type for the new column. /// \param doc Documentation string for the new column. + /// \param default_value Optional default value for the new column (v3+). /// \return Reference to this for method chaining. /// \note InvalidArgument will be reported if parent doesn't identify a struct. UpdateSchema& AddColumn(std::optional parent, std::string_view name, - std::shared_ptr type, std::string_view doc = ""); + std::shared_ptr type, std::string_view doc = "", + std::optional default_value = std::nullopt); /// \brief Add a new required top-level column with documentation. /// /// Adding a required column without a default is an incompatible change that can /// break reading older data. To suppress exceptions thrown when an incompatible - /// change is detected, call AllowIncompatibleChanges(). + /// change is detected, call AllowIncompatibleChanges() or provide a default value + /// (v3+), which is used as both the `initial-default` and the `write-default` of + /// the new column. /// /// Because "." may be interpreted as a column path separator or may be used in /// field names, it is not allowed in names passed to this method. To add to nested @@ -130,16 +137,20 @@ class ICEBERG_EXPORT UpdateSchema : public PendingUpdate { /// \param name Name for the new column. /// \param type Type for the new column. /// \param doc Documentation string for the new column. + /// \param default_value Optional default value for the new column (v3+). /// \return Reference to this for method chaining. /// \note InvalidArgument will be reported if name contains ".". UpdateSchema& AddRequiredColumn(std::string_view name, std::shared_ptr type, - std::string_view doc = ""); + std::string_view doc = "", + std::optional default_value = std::nullopt); /// \brief Add a new required column to a nested struct with documentation. /// /// Adding a required column without a default is an incompatible change that can /// break reading older data. To suppress exceptions thrown when an incompatible - /// change is detected, call AllowIncompatibleChanges(). + /// change is detected, call AllowIncompatibleChanges() or provide a default value + /// (v3+), which is used as both the `initial-default` and the `write-default` of + /// the new column. /// /// The parent name is used to find the parent using Schema::FindFieldByName(). If /// the parent name is null or empty, the new column will be added to the root as a @@ -157,11 +168,13 @@ class ICEBERG_EXPORT UpdateSchema : public PendingUpdate { /// \param name Name for the new column. /// \param type Type for the new column. /// \param doc Documentation string for the new column. + /// \param default_value Optional default value for the new column (v3+). /// \return Reference to this for method chaining. /// \note InvalidArgument will be reported if parent doesn't identify a struct. UpdateSchema& AddRequiredColumn(std::optional parent, std::string_view name, std::shared_ptr type, - std::string_view doc = ""); + std::string_view doc = "", + std::optional default_value = std::nullopt); /// \brief Rename a column in the schema. /// @@ -210,6 +223,21 @@ class ICEBERG_EXPORT UpdateSchema : public PendingUpdate { /// the column will be deleted. UpdateSchema& UpdateColumnDoc(std::string_view name, std::string_view new_doc); + /// \brief Update the `write-default` value for a column (v3+). + /// + /// The name is used to find the column to update using Schema::FindFieldByName(). + /// The column's `initial-default` is not changed: it is fixed when the column is + /// added and applies to rows that predate the column. + /// + /// \param name Name of the column to update the default value for. + /// \param new_default Replacement `write-default` value for the column, or + /// `std::nullopt` to clear it. + /// \return Reference to this for method chaining. + /// \note InvalidArgument will be reported if name doesn't identify a column in the + /// schema or if the column will be deleted. + UpdateSchema& UpdateColumnDefault(std::string_view name, + std::optional new_default); + /// \brief Update a column to be optional. /// /// \param name Name of the column to mark optional. @@ -364,10 +392,12 @@ class ICEBERG_EXPORT UpdateSchema : public PendingUpdate { /// \param is_optional Whether the column is optional. /// \param type Type for the new column. /// \param doc Optional documentation string. + /// \param default_value Optional default value for the new column (v3+). /// \return Reference to this for method chaining. UpdateSchema& AddColumnInternal(std::optional parent, std::string_view name, bool is_optional, - std::shared_ptr type, std::string_view doc); + std::shared_ptr type, std::string_view doc, + std::optional default_value); /// \brief Internal implementation for updating column requirement (optional/required). /// diff --git a/src/iceberg/util/type_util.cc b/src/iceberg/util/type_util.cc index 1f1e02747..6656c5636 100644 --- a/src/iceberg/util/type_util.cc +++ b/src/iceberg/util/type_util.cc @@ -385,9 +385,9 @@ std::shared_ptr AssignFreshIdVisitor::Visit(const StructType& type) std::vector fresh_fields; for (size_t i = 0; i < type.fields().size(); ++i) { const auto& field = type.fields()[i]; - fresh_fields.emplace_back(fresh_ids[i], std::string(field.name()), - Visit(field.type()), field.optional(), - std::string(field.doc())); + fresh_fields.emplace_back( + fresh_ids[i], std::string(field.name()), Visit(field.type()), field.optional(), + std::string(field.doc()), field.initial_default(), field.write_default()); } return std::make_shared(std::move(fresh_fields)); } @@ -397,7 +397,8 @@ std::shared_ptr AssignFreshIdVisitor::Visit(const ListType& type) cons int32_t fresh_id = next_id_(); SchemaField fresh_elem_field(fresh_id, std::string(elem_field.name()), Visit(elem_field.type()), elem_field.optional(), - std::string(elem_field.doc())); + std::string(elem_field.doc()), + elem_field.initial_default(), elem_field.write_default()); return std::make_shared(std::move(fresh_elem_field)); } @@ -410,10 +411,12 @@ std::shared_ptr AssignFreshIdVisitor::Visit(const MapType& type) const SchemaField fresh_key_field(fresh_key_id, std::string(key_field.name()), Visit(key_field.type()), key_field.optional(), - std::string(key_field.doc())); - SchemaField fresh_value_field(fresh_value_id, std::string(value_field.name()), - Visit(value_field.type()), value_field.optional(), - std::string(value_field.doc())); + std::string(key_field.doc()), key_field.initial_default(), + key_field.write_default()); + SchemaField fresh_value_field( + fresh_value_id, std::string(value_field.name()), Visit(value_field.type()), + value_field.optional(), std::string(value_field.doc()), + value_field.initial_default(), value_field.write_default()); return std::make_shared(std::move(fresh_key_field), std::move(fresh_value_field)); }