From c44abfdd2371f1e9cc330121fa75890cce58fdd4 Mon Sep 17 00:00:00 2001 From: Vinish Reddy Date: Tue, 30 Jun 2026 18:54:57 -0700 Subject: [PATCH 1/2] fix: Handle map/array nested leaves in Avro column-stats value navigation Collecting column stats for a primitive nested inside a MAP or ARRAY (e.g. `my_map.key_value.value`, `my_array.list.element`) crashed the MOR inline log-append path at table version 9: IllegalStateException: Cannot get field from schema type: MAP at HoodieSchema.getField at AvroRecordContext.getFieldValueFromIndexedRecord at HoodieAvroIndexedRecord.getColumnValueAsJava at HoodieTableMetadataUtil.collectColumnRangeFieldValueV2 at HoodieInlineLogAppendHandle.collectColumnStats #17694 taught the schema-side navigator (HoodieSchema.getNestedField) and the base-file (Parquet) path to resolve the Parquet-style `.key_value.key`, `.key_value.value` and `.list.element` synthetic accessors, so such leaves pass isColumnTypeSupported. The record value-side navigator (AvroRecordContext.getFieldValueFromIndexedRecord), used by the column-stats V2 collection path (collectColumnRangeFieldValueV2 -> getColumnValueAsJava), was never updated: it assumes every path segment is a RECORD field and calls HoodieSchema.getField on the MAP/ARRAY schema, which throws. Make the value navigator return null when a path segment cannot be resolved as a plain RECORD field (a MAP/ARRAY intermediate or a null intermediate), instead of throwing. This mirrors HoodieAvroUtils.getNestedFieldVal, which is why the V1 path at table version 6 already tolerated these paths. A map/array leaf is multi-valued per record and has no single value to fold into a min/max, so returning null (no stats from the record path) is correct; statistics for such leaves are still collected from the base-file (Parquet) footer path added in #17694. Also guard the intermediate downcast so a non-record value (java Map/List) degrades to null rather than throwing ClassCastException. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../apache/hudi/avro/AvroRecordContext.java | 10 +++- .../hudi/avro/TestAvroRecordContext.java | 49 ++++++++++++++++++- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/AvroRecordContext.java b/hudi-common/src/main/java/org/apache/hudi/avro/AvroRecordContext.java index fceeeaf84b97..dfc25910037c 100644 --- a/hudi-common/src/main/java/org/apache/hudi/avro/AvroRecordContext.java +++ b/hudi-common/src/main/java/org/apache/hudi/avro/AvroRecordContext.java @@ -79,6 +79,14 @@ public static Object getFieldValueFromIndexedRecord( String[] path = fieldName.split("\\."); for (int i = 0; i < path.length; i++) { currentSchema = currentSchema.getNonNullType(); + // Value navigation here can only descend through RECORD fields. Column-stats field paths + // that traverse a MAP (".key_value.key" / ".key_value.value") or ARRAY (".list.element") + // synthetic accessor, or that hit a null intermediate value, cannot be resolved to a single + // value and yield null instead of throwing. This mirrors HoodieAvroUtils.getNestedFieldVal; + // statistics for such nested leaves are still collected from the base-file (Parquet) path. + if (currentRecord == null || !currentSchema.hasFields()) { + return null; + } Option fieldOpt = currentSchema.getField(path[i]); if (fieldOpt.isEmpty()) { return null; @@ -89,7 +97,7 @@ public static Object getFieldValueFromIndexedRecord( return value; } currentSchema = field.schema(); - currentRecord = (IndexedRecord) value; + currentRecord = value instanceof IndexedRecord ? (IndexedRecord) value : null; } return null; } diff --git a/hudi-common/src/test/java/org/apache/hudi/avro/TestAvroRecordContext.java b/hudi-common/src/test/java/org/apache/hudi/avro/TestAvroRecordContext.java index 04c7ae3c2bda..1e4a03deb416 100644 --- a/hudi-common/src/test/java/org/apache/hudi/avro/TestAvroRecordContext.java +++ b/hudi-common/src/test/java/org/apache/hudi/avro/TestAvroRecordContext.java @@ -28,6 +28,9 @@ import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; import java.util.stream.Stream; import static org.apache.hudi.avro.AvroRecordContext.getFieldValueFromIndexedRecord; @@ -94,11 +97,53 @@ void testGetFieldValueNested() { @Test void testGetFieldValueErrorCases() { GenericRecord record = buildRecord(); - // a union that is not [null, T] does not support field lookups - assertThrows(IllegalStateException.class, () -> getFieldValueFromIndexedRecord(record, "multi.sub")); + // a union that is not [null, T] cannot be navigated into; instead of throwing, the value + // navigator returns null so callers (e.g. column-stats collection) degrade gracefully, + // matching HoodieAvroUtils.getNestedFieldVal + assertNull(getFieldValueFromIndexedRecord(record, "multi.sub")); + // an empty field name is still rejected up front assertThrows(IllegalArgumentException.class, () -> getFieldValueFromIndexedRecord(record, "")); } + private static final Schema COMPLEX_SCHEMA = new Schema.Parser().parse( + "{\"type\":\"record\",\"name\":\"complex\",\"fields\":[" + + "{\"name\":\"id\",\"type\":\"int\"}," + + "{\"name\":\"str_map\",\"type\":[\"null\",{\"type\":\"map\",\"values\":\"string\"}],\"default\":null}," + + "{\"name\":\"int_array\",\"type\":[\"null\",{\"type\":\"array\",\"items\":\"int\"}],\"default\":null}," + + "{\"name\":\"rec_map\",\"type\":[\"null\",{\"type\":\"map\",\"values\":{\"type\":\"record\"," + + "\"name\":\"inner\",\"fields\":[{\"name\":\"x\",\"type\":\"int\"}]}}],\"default\":null}]}"); + + @Test + void testGetFieldValueMapAndArrayLeavesReturnNull() { + GenericRecord record = new GenericData.Record(COMPLEX_SCHEMA); + record.put("id", 7); + Map strMap = new HashMap<>(); + strMap.put(new Utf8("a"), new Utf8("v1")); + record.put("str_map", strMap); + record.put("int_array", Arrays.asList(3, 1, 2)); + // rec_map left null + + // top-level scalar still resolves normally + assertEquals(7, getFieldValueFromIndexedRecord(record, "id")); + + // Parquet-style synthetic accessors that traverse a MAP (".key_value.key/value") or an + // ARRAY (".list.element") cannot be resolved to a single value and must return null instead + // of throwing. Regression: these previously threw + // IllegalStateException "Cannot get field from schema type: MAP" during MOR log-append + // column-stats collection. Such nested leaves still get statistics from the base-file path. + assertNull(getFieldValueFromIndexedRecord(record, "str_map.key_value.key")); + assertNull(getFieldValueFromIndexedRecord(record, "str_map.key_value.value")); + assertNull(getFieldValueFromIndexedRecord(record, "int_array.list.element")); + // a deep path descending through a MAP into a record field also degrades to null + assertNull(getFieldValueFromIndexedRecord(record, "rec_map.key_value.value.x")); + + // and null when the complex field itself is absent/null + GenericRecord empty = new GenericData.Record(COMPLEX_SCHEMA); + empty.put("id", 0); + assertNull(getFieldValueFromIndexedRecord(empty, "str_map.key_value.value")); + assertNull(getFieldValueFromIndexedRecord(empty, "int_array.list.element")); + } + @Test void testGetFieldValueAcrossEqualSchemaInstances() { // records from different files carry equal but distinct schema instances; both must intern to From 1a78da72a105dbd9493204674222bc3bbac4090f Mon Sep 17 00:00:00 2001 From: Vinish Reddy Date: Wed, 1 Jul 2026 16:24:56 -0700 Subject: [PATCH 2/2] Rename test constant COMPLEX_SCHEMA to MAP_AND_ARRAY_SCHEMA for clarity Co-Authored-By: Claude Opus 4.8 (1M context) --- .../java/org/apache/hudi/avro/TestAvroRecordContext.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hudi-common/src/test/java/org/apache/hudi/avro/TestAvroRecordContext.java b/hudi-common/src/test/java/org/apache/hudi/avro/TestAvroRecordContext.java index 1e4a03deb416..a1ef2e1046b3 100644 --- a/hudi-common/src/test/java/org/apache/hudi/avro/TestAvroRecordContext.java +++ b/hudi-common/src/test/java/org/apache/hudi/avro/TestAvroRecordContext.java @@ -105,7 +105,7 @@ void testGetFieldValueErrorCases() { assertThrows(IllegalArgumentException.class, () -> getFieldValueFromIndexedRecord(record, "")); } - private static final Schema COMPLEX_SCHEMA = new Schema.Parser().parse( + private static final Schema MAP_AND_ARRAY_SCHEMA = new Schema.Parser().parse( "{\"type\":\"record\",\"name\":\"complex\",\"fields\":[" + "{\"name\":\"id\",\"type\":\"int\"}," + "{\"name\":\"str_map\",\"type\":[\"null\",{\"type\":\"map\",\"values\":\"string\"}],\"default\":null}," @@ -115,7 +115,7 @@ void testGetFieldValueErrorCases() { @Test void testGetFieldValueMapAndArrayLeavesReturnNull() { - GenericRecord record = new GenericData.Record(COMPLEX_SCHEMA); + GenericRecord record = new GenericData.Record(MAP_AND_ARRAY_SCHEMA); record.put("id", 7); Map strMap = new HashMap<>(); strMap.put(new Utf8("a"), new Utf8("v1")); @@ -138,7 +138,7 @@ void testGetFieldValueMapAndArrayLeavesReturnNull() { assertNull(getFieldValueFromIndexedRecord(record, "rec_map.key_value.value.x")); // and null when the complex field itself is absent/null - GenericRecord empty = new GenericData.Record(COMPLEX_SCHEMA); + GenericRecord empty = new GenericData.Record(MAP_AND_ARRAY_SCHEMA); empty.put("id", 0); assertNull(getFieldValueFromIndexedRecord(empty, "str_map.key_value.value")); assertNull(getFieldValueFromIndexedRecord(empty, "int_array.list.element"));