apache · voonhous · Jul 2, 2026 · Jul 2, 2026 · Jul 3, 2026 · Jul 3, 2026
diff --git a/...ient/hudi-flink-client/src/main/java/org/apache/hudi/table/format/FlinkRecordContext.java b/...ient/hudi-flink-client/src/main/java/org/apache/hudi/table/format/FlinkRecordContext.java
@@ -25,7 +25,6 @@
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieOperation;
 import org.apache.hudi.common.model.HoodieRecord;
-import org.apache.hudi.common.schema.HoodieAvroSchemaCache;
 import org.apache.hudi.common.schema.HoodieSchema;
 import org.apache.hudi.common.schema.HoodieSchemaField;
 import org.apache.hudi.common.table.HoodieTableConfig;
@@ -126,7 +125,7 @@ public RowData getDeleteRow(String recordKey) {
   @Override
   public RowData convertAvroRecord(IndexedRecord avroRecord) {
     Schema recordSchema = avroRecord.getSchema();
-    AvroToRowDataConverters.AvroToRowDataConverter converter = RowDataQueryContexts.fromSchema(HoodieAvroSchemaCache.intern(recordSchema), utcTimezone).getAvroToRowDataConverter();
+    AvroToRowDataConverters.AvroToRowDataConverter converter = RowDataQueryContexts.fromSchema(HoodieSchema.fromAvroSchema(recordSchema), utcTimezone).getAvroToRowDataConverter();
     RowData rowData = (RowData) converter.convert(avroRecord);
     Schema.Field operationField = recordSchema.getField(HoodieRecord.OPERATION_METADATA_FIELD);
     if (operationField != null) {

diff --git a/...di-spark-client/src/main/scala/org/apache/hudi/SparkFileFormatInternalRecordContext.scala b/...di-spark-client/src/main/scala/org/apache/hudi/SparkFileFormatInternalRecordContext.scala
@@ -21,7 +21,7 @@ package org.apache.hudi
 
 import org.apache.avro.generic.{GenericRecord, IndexedRecord}
 import org.apache.hudi.common.engine.RecordContext
-import org.apache.hudi.common.schema.{HoodieAvroSchemaCache, HoodieSchema}
+import org.apache.hudi.common.schema.HoodieSchema
 import org.apache.hudi.common.table.HoodieTableConfig
 import org.apache.spark.sql.HoodieInternalRowUtils
 import org.apache.spark.sql.avro.{HoodieAvroDeserializer, HoodieAvroSerializer}
@@ -47,7 +47,7 @@ trait SparkFileFormatInternalRecordContext extends BaseSparkInternalRecordContex
    * @return An [[InternalRow]].
    */
   override def convertAvroRecord(avroRecord: IndexedRecord): InternalRow = {
-    val schema = HoodieAvroSchemaCache.intern(avroRecord.getSchema)
+    val schema = HoodieSchema.fromAvroSchema(avroRecord.getSchema)
     val structType = HoodieInternalRowUtils.getCachedSchema(schema)
     val deserializer = deserializerMap.getOrElseUpdate(schema, {
       sparkAdapter.createAvroDeserializer(schema, structType)

diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/AvroRecordContext.java b/hudi-common/src/main/java/org/apache/hudi/avro/AvroRecordContext.java
@@ -24,7 +24,6 @@
 import org.apache.hudi.common.model.HoodieEmptyRecord;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
-import org.apache.hudi.common.schema.HoodieAvroSchemaCache;
 import org.apache.hudi.common.schema.HoodieSchema;
 import org.apache.hudi.common.schema.HoodieSchemaField;
 import org.apache.hudi.common.table.HoodieTableConfig;
@@ -71,10 +70,10 @@ public AvroRecordContext() {
   public static Object getFieldValueFromIndexedRecord(
       IndexedRecord record,
       String fieldName) {
-    // Interning returns the canonical wrapper for this schema, whose lazily built field list and
-    // field map survive across calls, so the per-record cost is a cache hit instead of an
-    // O(schema width) wrapper rebuild.
-    HoodieSchema currentSchema = HoodieAvroSchemaCache.intern(record.getSchema());
+    // fromAvroSchema returns the canonical wrapper for this schema, whose lazily built field
+    // list and field map survive across calls, so the per-record cost is a cache hit instead
+    // of an O(schema width) wrapper rebuild.
+    HoodieSchema currentSchema = HoodieSchema.fromAvroSchema(record.getSchema());
     IndexedRecord currentRecord = record;
     String[] path = fieldName.split("\\.");
     for (int i = 0; i < path.length; i++) {

diff --git a/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java b/hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java
@@ -19,7 +19,6 @@
 package org.apache.hudi.avro;
 
 import org.apache.hudi.common.model.HoodieRecord;
-import org.apache.hudi.common.schema.HoodieAvroSchemaCache;
 import org.apache.hudi.common.schema.HoodieSchema;
 import org.apache.hudi.common.schema.HoodieSchemaUtils;
 import org.apache.hudi.common.util.DateTimeUtils;
@@ -835,7 +834,7 @@ public static Object[] getRecordColumnValues(HoodieRecord record,
                                                Schema schema,
                                                boolean consistentLogicalTimestampEnabled) {
     try {
-      GenericRecord genericRecord = (GenericRecord) (record.toIndexedRecord(HoodieAvroSchemaCache.intern(schema), new Properties()).get()).getData();
+      GenericRecord genericRecord = (GenericRecord) (record.toIndexedRecord(HoodieSchema.fromAvroSchema(schema), new Properties()).get()).getData();
       List<Object> list = new ArrayList<>();
       for (String col : columns) {
         list.add(HoodieAvroUtils.getNestedFieldVal(genericRecord, col, true, consistentLogicalTimestampEnabled));

diff --git a/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java
@@ -20,7 +20,7 @@
 
 import org.apache.hudi.avro.MercifulJsonConverter;
 import org.apache.hudi.common.model.HoodieRecordPayload;
-import org.apache.hudi.common.schema.HoodieAvroSchemaCache;
+import org.apache.hudi.common.schema.HoodieSchema;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.io.util.FileIOUtils;
@@ -65,7 +65,7 @@ public Option<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Sche
   @Override
   public Option<IndexedRecord> getInsertValue(Schema schema) throws IOException {
     MercifulJsonConverter jsonConverter = new MercifulJsonConverter();
-    return Option.of(jsonConverter.convert(getJsonData(), HoodieAvroSchemaCache.intern(schema)));
+    return Option.of(jsonConverter.convert(getJsonData(), HoodieSchema.fromAvroSchema(schema)));
   }
 
   private String getJsonData() throws IOException {

diff --git a/hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieAvroSchemaCache.java b/hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieAvroSchemaCache.java
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java b/hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java
@@ -27,6 +27,8 @@
 import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.internal.schema.HoodieSchemaException;
 
+import com.github.benmanes.caffeine.cache.Cache;
+import com.github.benmanes.caffeine.cache.Caffeine;
 import lombok.Getter;
 import org.apache.avro.JsonProperties;
 import org.apache.avro.LogicalType;
@@ -94,6 +96,12 @@
 public class HoodieSchema implements Serializable {
   private static final long serialVersionUID = 1L;
 
+  // Identity fast path in front of the content-keyed HoodieSchemaCache, backing fromAvroSchema:
+  // weakKeys() makes Caffeine compare keys by identity, so records of one file that share the
+  // same live Schema instance resolve in a single lookup with no wrapper allocation or dispatch.
+  private static final Cache<Schema, HoodieSchema> AVRO_SCHEMA_CACHE =
+      Caffeine.newBuilder().weakKeys().maximumSize(1024).build();
+
   /**
    * Constant representing a null JSON value, equivalent to JsonProperties.NULL_VALUE.
    * This provides compatibility with Avro's JsonProperties while maintaining Hudi's API.
@@ -338,13 +346,42 @@ private HoodieSchema(Schema avroSchema, List<HoodieSchemaField> fields) {
   /**
    * Factory method to create HoodieSchema from an Avro schema.
    *
+   * <p>Returns the canonical instance for the given schema, converting and interning it on
+   * first use: distinct Avro schema instances with identical serialized content converge on
+   * one shared wrapper through {@link HoodieSchemaCache}, with a weak identity fast path for
+   * the per-record hot path where all records of a file share the same live {@link Schema}
+   * instance.
+   *
+   * <p>Interning is never lossy: the canonicalization key covers the schema's full content,
+   * including doc strings and aliases, which Avro equality ignores. Schemas that differ only
+   * in docs or aliases stay distinct wrappers (even though they are {@code equals()}), so
+   * metadata consumed downstream (e.g. catalog sync column comments, alias-based field
+   * matching) is always preserved.
+   *
+   * <p>Because the canonical wrapper may have been created from a content-identical but
+   * different Avro schema instance, {@code fromAvroSchema(s).getAvroSchema()} does not
+   * necessarily return {@code s} itself. Canonical instances are shared: neither the wrapper
+   * nor its underlying Avro schema may be mutated.
+   *
    * @param avroSchema the Avro schema to wrap
-   * @return new HoodieSchema instance
+   * @return the canonical HoodieSchema instance, or null if avroSchema is null
    */
   public static HoodieSchema fromAvroSchema(Schema avroSchema) {
     if (avroSchema == null) {
       return null;
     }
+    HoodieSchema cached = AVRO_SCHEMA_CACHE.getIfPresent(avroSchema);
+    if (cached != null) {
+      return cached;
+    }
+    // Lookup-then-put rather than a computing get: construction may re-enter fromAvroSchema for
+    // subschemas (e.g. Variant validation), which a Caffeine loader must not do; racing builds meet in intern.
+    HoodieSchema interned = HoodieSchemaCache.intern(buildFromAvroSchema(avroSchema));
+    AVRO_SCHEMA_CACHE.put(avroSchema, interned);
+    return interned;
+  }
+
+  private static HoodieSchema buildFromAvroSchema(Schema avroSchema) {
     LogicalType logicalType = avroSchema.getLogicalType();
     if (logicalType != null) {
       if (logicalType instanceof LogicalTypes.Decimal) {
@@ -1586,6 +1623,14 @@ public String toString(boolean pretty) {
     return avroSchema.toString(pretty);
   }
 
+  /**
+   * Equality delegates to Avro {@link Schema#equals}, which IGNORES doc strings and aliases:
+   * two schemas differing only in that metadata are equal. Consumers that care about docs or
+   * aliases (e.g. catalog sync comments, alias-based field matching) must not rely on
+   * equality or equals()-keyed maps to tell such schemas apart. Canonicalization in
+   * {@link HoodieSchemaCache} deliberately keys on the full serialized content instead, so
+   * interning never conflates them.
+   */
   @Override
   public boolean equals(Object obj) {
     if (this == obj) {
@@ -1644,7 +1689,10 @@ public HoodieSchema parse(String jsonSchema) {
 
       try {
         Schema avroSchema = avroParser.parse(jsonSchema);
-        return fromAvroSchema(avroSchema);
+        // Return a fresh, owned instance rather than the interned canonical one: parsed schemas
+        // are frequently mutated by callers (e.g. addProp in client-init callbacks), and mutating
+        // a shared interned instance would corrupt it for every other holder of the same schema.
+        return buildFromAvroSchema(avroSchema);
       } catch (IllegalArgumentException e) {
         throw new HoodieAvroSchemaException("Invalid schema string format", e);
       } catch (Exception e) {
@@ -1664,7 +1712,8 @@ public HoodieSchema parse(InputStream inputStream) {
 
       try {
         Schema avroSchema = avroParser.parse(inputStream);
-        return fromAvroSchema(avroSchema);
+        // See parse(String): return an owned, mutable instance, not the interned canonical one.
+        return buildFromAvroSchema(avroSchema);
       } catch (IOException e) {
         throw new HoodieIOException("Failed to parse schema from InputStream", e);
       } catch (IllegalArgumentException e) {

diff --git a/hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchemaCache.java b/hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchemaCache.java
@@ -20,6 +20,7 @@
 
 import com.github.benmanes.caffeine.cache.Caffeine;
 import com.github.benmanes.caffeine.cache.LoadingCache;
+import org.apache.avro.AvroRuntimeException;
 
 /**
  * A global cache for HoodieSchema instances to ensure that there is only one
@@ -28,21 +29,77 @@
  * <p>This is a global cache which works for a JVM lifecycle.
  * A collection of schema instances are maintained.
  *
+ * <p>This content-keyed pool is the canonicalization mechanism behind
+ * {@link HoodieSchema#fromAvroSchema}, and can also be used directly to intern schemas
+ * produced without an Avro source (builders, converters).
+ *
+ * <p>Interning is never lossy: entries are keyed on the schema's full serialized content,
+ * NOT on {@link HoodieSchema#equals}. Avro equality (which HoodieSchema equality delegates
+ * to) ignores doc strings and aliases, so keying on it would collapse schemas that differ
+ * only in that metadata and silently drop it -- docs drive catalog sync column comments and
+ * aliases drive schema-evolution field matching. Schemas that differ in docs or aliases
+ * intern to distinct canonical instances even though they are {@code equals()}.
+ *
  * <p>NOTE: The schema which is used frequently should be cached through this cache.
  */
 public class HoodieSchemaCache {
 
   // Ensure that there is only one variable instance of the same schema within an entire JVM lifetime
-  private static final LoadingCache<HoodieSchema, HoodieSchema> SCHEMA_CACHE =
-      Caffeine.newBuilder().weakValues().maximumSize(1024).build(k -> k);
+  private static final LoadingCache<SchemaContentKey, HoodieSchema> SCHEMA_CACHE =
+      Caffeine.newBuilder().weakValues().maximumSize(1024).build(key -> key.schema);
 
   /**
    * Get schema variable from global cache. If not found, put it into the cache and then return it.
    *
+   * <p>Two schemas converge on one canonical instance only when both their wrapper class and
+   * their full serialized form (including doc strings and aliases) are identical. Each call
+   * serializes the schema to JSON to build the lookup key, so the cost grows with schema size.
+   *
+   * <p>A schema that is valid in memory but cannot be serialized to JSON -- e.g. two distinct
+   * nested records that share a name -- has no content key, so it is returned as-is, uncached.
+   * Canonicalization is only a de-duplication optimization, so skipping it stays correct.
+   *
    * @param schema schema to get
    * @return if found, return the exist schema variable, otherwise return the param itself.
    */
   public static HoodieSchema intern(HoodieSchema schema) {
-    return SCHEMA_CACHE.get(schema);
+    final SchemaContentKey contentKey;
+    try {
+      contentKey = new SchemaContentKey(schema);
+    } catch (AvroRuntimeException e) {
+      // Serialization failed, so no content key exists; hand back the input rather than fail the caller.
+      return schema;
+    }
+    return SCHEMA_CACHE.get(contentKey);
+  }
+
+  /**
+   * Lookup key built from the schema's serialized JSON, which covers the doc strings and aliases
+   * that Avro equality ignores, plus the wrapper class so that a logical-type subclass (e.g.
+   * {@link HoodieSchema.Decimal}) never collapses onto a plain wrapper and breaks downcasts.
+   * NOTE(review): the key references {@code schema} strongly; confirm weakValues() can still evict.
+   */
+  private static final class SchemaContentKey {
+    private final HoodieSchema schema;
+    private final String contentJson;
+
+    SchemaContentKey(HoodieSchema schema) {
+      this.schema = schema;
+      this.contentJson = schema.getAvroSchema().toString();
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      if (obj instanceof SchemaContentKey) {
+        SchemaContentKey other = (SchemaContentKey) obj;
+        return contentJson.equals(other.contentJson) && schema.getClass() == other.schema.getClass();
+      }
+      return false;
+    }
+
+    @Override
+    public int hashCode() {
+      return contentJson.hashCode();
+    }
   }
 }