-
Notifications
You must be signed in to change notification settings - Fork 113
feat(schema): represent, serialize and validate v3 column default values (1/4) #746
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
f663e0e
511d9c8
9690529
fe3b348
d15a0fe
180a9f9
2052e4f
54d9636
925611a
0b4d452
b252b72
36a76ca
7f49ebc
04ee5c6
0163d35
26b1f1b
462d2d9
c58050f
41c2bd1
bda2d78
38bbdb0
6fe4640
2efca23
70e029b
d022953
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,6 +27,8 @@ | |
| #include <nlohmann/json.hpp> | ||
|
|
||
| #include "iceberg/constants.h" | ||
| #include "iceberg/expression/json_serde_internal.h" | ||
| #include "iceberg/expression/literal.h" | ||
| #include "iceberg/json_serde_internal.h" | ||
| #include "iceberg/name_mapping.h" | ||
| #include "iceberg/partition_field.h" | ||
|
|
@@ -307,6 +309,15 @@ nlohmann::json ToJson(const SchemaField& field) { | |
| if (!field.doc().empty()) { | ||
| json[kDoc] = field.doc(); | ||
| } | ||
| // Defaults are validated to be primitive literals matching the field type, so | ||
| // single-value serialization cannot fail here. | ||
| if (field.initial_default().has_value()) { | ||
| ICEBERG_ASSIGN_OR_THROW(json[kInitialDefault], | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Though it cannot not fail, we shouldn't use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks! Merged that PR. |
||
| ToJson(field.initial_default()->get())); | ||
| } | ||
| if (field.write_default().has_value()) { | ||
| ICEBERG_ASSIGN_OR_THROW(json[kWriteDefault], ToJson(field.write_default()->get())); | ||
| } | ||
| return json; | ||
| } | ||
|
|
||
|
|
@@ -319,7 +330,6 @@ nlohmann::json ToJson(const Type& type) { | |
| nlohmann::json fields_json = nlohmann::json::array(); | ||
| for (const auto& field : struct_type.fields()) { | ||
| fields_json.push_back(ToJson(field)); | ||
| // TODO(gangwu): add default values | ||
| } | ||
| json[kFields] = fields_json; | ||
| return json; | ||
|
|
@@ -561,9 +571,27 @@ Result<std::unique_ptr<SchemaField>> FieldFromJson(const nlohmann::json& json) { | |
| ICEBERG_ASSIGN_OR_RAISE(auto name, GetJsonValue<std::string>(json, kName)); | ||
| ICEBERG_ASSIGN_OR_RAISE(auto required, GetJsonValue<bool>(json, kRequired)); | ||
| ICEBERG_ASSIGN_OR_RAISE(auto doc, GetJsonValueOrDefault<std::string>(json, kDoc)); | ||
| ICEBERG_ASSIGN_OR_RAISE(std::optional<nlohmann::json> initial_default_json, | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not use |
||
| GetJsonValueOptional<nlohmann::json>(json, kInitialDefault)); | ||
| ICEBERG_ASSIGN_OR_RAISE(std::optional<nlohmann::json> write_default_json, | ||
| GetJsonValueOptional<nlohmann::json>(json, kWriteDefault)); | ||
|
|
||
| std::shared_ptr<const Literal> initial_default; | ||
| if (initial_default_json.has_value()) { | ||
| ICEBERG_ASSIGN_OR_RAISE(Literal literal, | ||
| LiteralFromJson(*initial_default_json, type.get())); | ||
|
wgtmac marked this conversation as resolved.
|
||
| initial_default = std::make_shared<const Literal>(std::move(literal)); | ||
| } | ||
| std::shared_ptr<const Literal> write_default; | ||
| if (write_default_json.has_value()) { | ||
| ICEBERG_ASSIGN_OR_RAISE(Literal literal, | ||
| LiteralFromJson(*write_default_json, type.get())); | ||
| write_default = std::make_shared<const Literal>(std::move(literal)); | ||
| } | ||
|
huan233usc marked this conversation as resolved.
|
||
|
|
||
| return std::make_unique<SchemaField>(field_id, std::move(name), std::move(type), | ||
| !required, doc); | ||
| !required, doc, std::move(initial_default), | ||
| std::move(write_default)); | ||
| } | ||
|
|
||
| Result<std::unique_ptr<Schema>> SchemaFromJson(const nlohmann::json& json) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,19 +21,26 @@ | |
|
|
||
| #include <format> | ||
| #include <string_view> | ||
| #include <utility> | ||
|
|
||
| #include "iceberg/expression/literal.h" | ||
| #include "iceberg/type.h" | ||
| #include "iceberg/util/formatter.h" // IWYU pragma: keep | ||
| #include "iceberg/util/macros.h" | ||
|
|
||
| namespace iceberg { | ||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This always create a new literal even when their types are the same which is unlikely?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done, remove the normalization method
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we return
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
| SchemaField::SchemaField(int32_t field_id, std::string_view name, | ||
| std::shared_ptr<Type> type, bool optional, std::string_view doc) | ||
| std::shared_ptr<Type> type, bool optional, std::string_view doc, | ||
| std::shared_ptr<const Literal> initial_default, | ||
| std::shared_ptr<const Literal> write_default) | ||
| : field_id_(field_id), | ||
| name_(name), | ||
| type_(std::move(type)), | ||
| optional_(optional), | ||
| doc_(doc) {} | ||
| doc_(doc), | ||
| initial_default_(std::move(initial_default)), | ||
| write_default_(std::move(write_default)) {} | ||
|
|
||
| SchemaField SchemaField::MakeOptional(int32_t field_id, std::string_view name, | ||
| std::shared_ptr<Type> type, std::string_view doc) { | ||
|
|
@@ -55,13 +62,65 @@ bool SchemaField::optional() const { return optional_; } | |
|
|
||
| std::string_view SchemaField::doc() const { return doc_; } | ||
|
|
||
| std::optional<std::reference_wrapper<const Literal>> SchemaField::initial_default() | ||
| const { | ||
| if (initial_default_ == nullptr) { | ||
| return std::nullopt; | ||
| } | ||
| return std::cref(*initial_default_); | ||
| } | ||
|
|
||
| std::optional<std::reference_wrapper<const Literal>> SchemaField::write_default() const { | ||
| if (write_default_ == nullptr) { | ||
| return std::nullopt; | ||
| } | ||
| return std::cref(*write_default_); | ||
| } | ||
|
|
||
| const std::shared_ptr<const Literal>& SchemaField::initial_default_ptr() const { | ||
| return initial_default_; | ||
| } | ||
|
|
||
| const std::shared_ptr<const Literal>& SchemaField::write_default_ptr() const { | ||
| return write_default_; | ||
| } | ||
|
|
||
| namespace { | ||
|
|
||
| Status ValidateDefault(const SchemaField& field, const Literal& value, | ||
| std::string_view kind) { | ||
| if (value.IsNull() || value.IsAboveMax() || value.IsBelowMin()) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This rejects explicit null defaults. The spec allows optional field defaults to be null and says they may be explicitly set. Java mostly models a null default as absence, but C++ now has a present-null |
||
| return InvalidSchema("Invalid {} value for {}: must be a non-null value", kind, | ||
| field.name()); | ||
| } | ||
| if (field.type() == nullptr || !field.type()->is_primitive()) { | ||
| return InvalidSchema( | ||
| "Invalid {} value for {}: default values are only supported for primitive types", | ||
|
wgtmac marked this conversation as resolved.
|
||
| kind, field.name()); | ||
| } | ||
| if (*value.type() != *field.type()) { | ||
|
wgtmac marked this conversation as resolved.
|
||
| return InvalidSchema("{} of field {} has type {} but expected {}", kind, field.name(), | ||
| *value.type(), *field.type()); | ||
| } | ||
| return {}; | ||
| } | ||
|
|
||
| } // namespace | ||
|
|
||
| Status SchemaField::Validate() const { | ||
| if (name_.empty()) [[unlikely]] { | ||
| return InvalidSchema("SchemaField cannot have empty name"); | ||
| } | ||
| if (type_ == nullptr) [[unlikely]] { | ||
| return InvalidSchema("SchemaField cannot have null type"); | ||
| } | ||
| if (initial_default_ != nullptr) { | ||
| ICEBERG_RETURN_UNEXPECTED( | ||
| ValidateDefault(*this, *initial_default_, "initial-default")); | ||
| } | ||
| if (write_default_ != nullptr) { | ||
| ICEBERG_RETURN_UNEXPECTED(ValidateDefault(*this, *write_default_, "write-default")); | ||
| } | ||
| return {}; | ||
| } | ||
|
|
||
|
|
@@ -72,9 +131,23 @@ std::string SchemaField::ToString() const { | |
| return result; | ||
| } | ||
|
|
||
| namespace { | ||
|
|
||
| bool DefaultEquals(const std::shared_ptr<const Literal>& lhs, | ||
| const std::shared_ptr<const Literal>& rhs) { | ||
| if (lhs == nullptr || rhs == nullptr) { | ||
| return lhs == rhs; | ||
| } | ||
| return *lhs == *rhs; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just a reminder that
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or add a overload |
||
| } | ||
|
|
||
| } // namespace | ||
|
|
||
| bool SchemaField::Equals(const SchemaField& other) const { | ||
| return field_id_ == other.field_id_ && name_ == other.name_ && *type_ == *other.type_ && | ||
| optional_ == other.optional_; | ||
| optional_ == other.optional_ && | ||
| DefaultEquals(initial_default_, other.initial_default_) && | ||
| DefaultEquals(write_default_, other.write_default_); | ||
| } | ||
|
|
||
| } // namespace iceberg | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,7 +24,9 @@ | |
| /// type (e.g. a struct). | ||
|
|
||
| #include <cstdint> | ||
| #include <functional> | ||
| #include <memory> | ||
| #include <optional> | ||
| #include <string> | ||
| #include <string_view> | ||
|
|
||
|
|
@@ -46,8 +48,14 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { | |
| /// \param[in] type The field type. | ||
| /// \param[in] optional Whether values of this field are required or nullable. | ||
| /// \param[in] doc Optional documentation string for the field. | ||
| /// \param[in] initial_default The v3 `initial-default` value, or null if absent. The | ||
| /// field shares ownership of the (immutable) value. | ||
| /// \param[in] write_default The v3 `write-default` value, or null if absent. The field | ||
| /// shares ownership of the (immutable) value. | ||
| SchemaField(int32_t field_id, std::string_view name, std::shared_ptr<Type> type, | ||
| bool optional, std::string_view doc = {}); | ||
| bool optional, std::string_view doc = {}, | ||
| std::shared_ptr<const Literal> initial_default = nullptr, | ||
| std::shared_ptr<const Literal> write_default = nullptr); | ||
|
|
||
| /// \brief Construct an optional (nullable) field. | ||
| static SchemaField MakeOptional(int32_t field_id, std::string_view name, | ||
|
|
@@ -71,6 +79,32 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { | |
| /// \brief Get the field documentation. | ||
| std::string_view doc() const; | ||
|
|
||
| /// \brief Get the default value for this field used when reading rows written | ||
| /// before the field existed (v3 `initial-default`). Empty if absent. | ||
| /// | ||
| /// The returned reference is a non-owning view into a value owned by this field; | ||
| /// it remains valid for the lifetime of this SchemaField. | ||
| [[nodiscard]] std::optional<std::reference_wrapper<const Literal>> initial_default() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should remove |
||
| const; | ||
|
|
||
| /// \brief Get the default value for this field used when a writer does not | ||
| /// supply a value (v3 `write-default`). Empty if absent. | ||
| /// | ||
| /// The returned reference is a non-owning view into a value owned by this field; | ||
| /// it remains valid for the lifetime of this SchemaField. | ||
| [[nodiscard]] std::optional<std::reference_wrapper<const Literal>> write_default() | ||
| const; | ||
|
|
||
| /// \brief Get the shared owning pointer to the `initial-default` value, or null if | ||
| /// absent. Prefer initial_default() for reading; this exists so a rebuilt field can | ||
| /// share the (immutable) value rather than copy it. | ||
| [[nodiscard]] const std::shared_ptr<const Literal>& initial_default_ptr() const; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks odd to provide both |
||
|
|
||
| /// \brief Get the shared owning pointer to the `write-default` value, or null if | ||
| /// absent. Prefer write_default() for reading; this exists so a rebuilt field can | ||
| /// share the (immutable) value rather than copy it. | ||
| [[nodiscard]] const std::shared_ptr<const Literal>& write_default_ptr() const; | ||
|
|
||
| [[nodiscard]] std::string ToString() const override; | ||
|
|
||
| Status Validate() const; | ||
|
|
@@ -100,6 +134,11 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { | |
| std::shared_ptr<Type> type_; | ||
| bool optional_; | ||
| std::string doc_; | ||
| // Default values are owned by this field and never mutated after being set; copies | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not a strong opinion but the comment looks too verbose. |
||
| // of the field share the same payload (reference-counted) instead of deep-copying, | ||
| // like `type_` above. Sharing is unobservable because the payload is immutable. | ||
| std::shared_ptr<const Literal> initial_default_; | ||
| std::shared_ptr<const Literal> write_default_; | ||
|
huan233usc marked this conversation as resolved.
|
||
| }; | ||
|
|
||
| } // namespace iceberg | ||
Uh oh!
There was an error while loading. Please reload this page.