-
Notifications
You must be signed in to change notification settings - Fork 2.5k
refactor(common): Move Avro schema interning inside HoodieSchema #19144
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 1 commit
bfdaa64
16c1f37
32713d4
851dd74
e4aa735
68ba597
09aba74
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,6 +34,7 @@ | |
| import java.util.Collections; | ||
| import java.util.Deque; | ||
| import java.util.HashMap; | ||
| import java.util.HashSet; | ||
| import java.util.Iterator; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
|
|
@@ -56,6 +57,13 @@ | |
| * <ol> | ||
| * <li>Compatibility checks ignore schema name, unless schema is held inside | ||
| * a union</li> | ||
| * <li>Memoization does not depend on schema instance identity. Avro keys its memo on | ||
| * identity, assuming each subschema occurrence is a distinct instance; that assumption | ||
| * does not hold for {@link HoodieSchema}, whose navigation returns fresh (or, in the | ||
| * future, canonical interned) wrappers. Instead, recursion is detected with an | ||
| * in-progress set keyed on value-equal schema pairs, and only compatible results are | ||
| * memoized so every occurrence of an incompatible subschema reports its own field | ||
| * path.</li> | ||
| * </ol> | ||
| */ | ||
| @NoArgsConstructor(access = AccessLevel.PACKAGE) | ||
|
|
@@ -136,8 +144,9 @@ public static HoodieSchemaField lookupWriterField(final HoodieSchema writerSchem | |
| /** | ||
| * Reader/writer schema pair that can be used as a key in a hash map. | ||
| * <p> | ||
| * This reader/writer pair differentiates Schema objects based on their system | ||
| * hash code. | ||
| * Schemas are compared by value equality, never by instance identity: {@link HoodieSchema} | ||
| * wrappers may be freshly allocated per navigation call or interned to canonical instances, | ||
| * so instance identity carries no meaning for the checker. | ||
| */ | ||
| @RequiredArgsConstructor | ||
| private static final class ReaderWriter { | ||
|
|
@@ -149,7 +158,7 @@ private static final class ReaderWriter { | |
| */ | ||
| @Override | ||
| public int hashCode() { | ||
| return System.identityHashCode(mReader) ^ System.identityHashCode(mWriter); | ||
| return Objects.hash(mReader, mWriter); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -161,8 +170,7 @@ public boolean equals(Object obj) { | |
| return false; | ||
| } | ||
| final ReaderWriter that = (ReaderWriter) obj; | ||
| // Use pointer comparison here: | ||
| return (this.mReader == that.mReader) && (this.mWriter == that.mWriter); | ||
| return this.mReader.equals(that.mReader) && this.mWriter.equals(that.mWriter); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -182,7 +190,12 @@ public String toString() { | |
| * </p> | ||
| */ | ||
| private static final class ReaderWriterCompatibilityChecker { | ||
| private final Map<ReaderWriter, SchemaCompatibilityResult> mMemoizeMap = new HashMap<>(); | ||
| // Compatible results only: they carry no location state, so they are safe to reuse | ||
| // when the same value pair recurs at a different field path. | ||
| private final Map<ReaderWriter, SchemaCompatibilityResult> memoizedCompatibleResults = new HashMap<>(); | ||
| // Value pairs currently being computed; re-entering one means a named type was reached | ||
| // through itself, i.e. genuine recursion. | ||
| private final Set<ReaderWriter> inProgressPairs = new HashSet<>(); | ||
| private final boolean checkNaming; | ||
|
|
||
| public ReaderWriterCompatibilityChecker(boolean checkNaming) { | ||
|
|
@@ -210,7 +223,7 @@ public SchemaCompatibilityResult getCompatibility(final HoodieSchema reader, fin | |
| /** | ||
| * Reports the compatibility of a reader/writer schema pair. | ||
| * <p> | ||
| * Memorizes the compatibility results. | ||
| * Memoizes compatible results and breaks recursion on value-equal pairs. | ||
| * </p> | ||
| * | ||
| * @param reader Reader schema to test. | ||
|
|
@@ -224,18 +237,21 @@ private SchemaCompatibilityResult getCompatibility(final HoodieSchema reader, | |
| final Deque<LocationInfo> locations) { | ||
| log.debug("Checking compatibility of reader {} with writer {}", reader, writer); | ||
| final ReaderWriter pair = new ReaderWriter(reader, writer); | ||
| SchemaCompatibilityResult result = mMemoizeMap.get(pair); | ||
| if (result != null) { | ||
| if (result.getCompatibilityType() == SchemaCompatibilityType.RECURSION_IN_PROGRESS) { | ||
| // Break the recursion here. | ||
| // schemas are compatible unless proven incompatible: | ||
| result = SchemaCompatibilityResult.compatible(); | ||
| } | ||
| } else { | ||
| // Mark this reader/writer pair as "in progress": | ||
| mMemoizeMap.put(pair, SchemaCompatibilityResult.recursionInProgress()); | ||
| result = calculateCompatibility(reader, writer, locations); | ||
| mMemoizeMap.put(pair, result); | ||
| final SchemaCompatibilityResult memoized = memoizedCompatibleResults.get(pair); | ||
| if (memoized != null) { | ||
| return memoized; | ||
| } | ||
| if (!inProgressPairs.add(pair)) { | ||
| // Break the recursion here. | ||
| // schemas are compatible unless proven incompatible: | ||
| return SchemaCompatibilityResult.compatible(); | ||
| } | ||
| SchemaCompatibilityResult result = calculateCompatibility(reader, writer, locations); | ||
| inProgressPairs.remove(pair); | ||
| if (result.getCompatibilityType() == SchemaCompatibilityType.COMPATIBLE) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤖 Since incompatible results are intentionally not memoized (to report each occurrence path), a subschema that is shared across many field paths gets fully recomputed at every path. For a schema where an incompatible named type is reachable through nested diamond-shaped sharing, the number of recomputations (and reported incompatibility entries) can grow multiplicatively with nesting depth. Non-recursive shared types don't hit
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The recomputation is output-bound: the contract is to report every occurrence path, and in a diamond the path count is itself 2^d, so anything that doesn't under-report does at least that much work. Compatible children of a shared subtree still memo-hit, so a revisit only walks the incompatible spine. Upstream Avro bounds this corner only by under-reporting (first path wins) -- the bug the four-path test pins. Pre-#18967 code had the same profile as this PR: fresh wrappers meant the identity memo never hit either. Measured the corner (named type shared at 2 fields per level, incompatible leaf): Linear in output, ~1us per reported path, and this runs once per schema validation, not per record. Real schemas share a type at 2-3 fields, a level or two deep. A (pair, locationPath) set wouldn't bound it: every (pair, path) in a diamond is distinct, so it never dedups. If this corner ever got real, the fix is memoizing incompatible results with subtree-relative locations re-anchored on reuse (noted on #19143) -- bounds the walk, still emits every path.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Incompatible pairs recompute per occurrence, so a shared named type referenced k times per level over d levels costs k^d subtree re-checks and accumulates k^d |
||
| // Incompatible results embed the field path they were found at, so they are | ||
| // recomputed per occurrence; memoizing them would report only the first path. | ||
| memoizedCompatibleResults.put(pair, result); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Memoizing COMPATIBLE results keyed only on the value pair (reader, writer) can mask a union-scope name mismatch and flip the verdict to COMPATIBLE.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch, confirmed -- reproduced it in a regression test (plain-field occurrence memoized COMPATIBLE first, union occurrence of the same pair then skipped the name check). Fixed as you suggested: the memo key now carries whether names are validated at that location, computed by the same predicate
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Verified issue: a COMPATIBLE result computed while an ancestor pair is still in |
||
| } | ||
| return result; | ||
| } | ||
|
|
@@ -590,12 +606,7 @@ private static List<String> asList(Deque<LocationInfo> deque) { | |
| * Identifies the type of schema compatibility result. | ||
| */ | ||
| public enum SchemaCompatibilityType { | ||
| COMPATIBLE, INCOMPATIBLE, | ||
|
|
||
| /** | ||
| * Used internally to tag a reader/writer schema pair and prevent recursion. | ||
| */ | ||
| RECURSION_IN_PROGRESS | ||
| COMPATIBLE, INCOMPATIBLE | ||
| } | ||
|
|
||
| public enum SchemaIncompatibilityType { | ||
|
|
@@ -634,11 +645,9 @@ public SchemaCompatibilityResult mergedWith(SchemaCompatibilityResult toMerge) { | |
| SchemaCompatibilityType compatibilityType; | ||
| // the below fields are only valid if INCOMPATIBLE | ||
| List<Incompatibility> incompatibilities; | ||
| // cached objects for stateless details | ||
| // cached object for stateless details | ||
| private static final SchemaCompatibilityResult COMPATIBLE = new SchemaCompatibilityResult( | ||
| SchemaCompatibilityType.COMPATIBLE, Collections.emptyList()); | ||
| private static final SchemaCompatibilityResult RECURSION_IN_PROGRESS = new SchemaCompatibilityResult( | ||
| SchemaCompatibilityType.RECURSION_IN_PROGRESS, Collections.emptyList()); | ||
|
|
||
| /** | ||
| * Returns a details object representing a compatible schema pair. | ||
|
|
@@ -650,17 +659,6 @@ public static SchemaCompatibilityResult compatible() { | |
| return COMPATIBLE; | ||
| } | ||
|
|
||
| /** | ||
| * Returns a details object representing a state indicating that recursion is in | ||
| * progress. | ||
| * | ||
| * @return a SchemaCompatibilityDetails object with RECURSION_IN_PROGRESS | ||
| * SchemaCompatibilityType, and no other state. | ||
| */ | ||
| public static SchemaCompatibilityResult recursionInProgress() { | ||
| return RECURSION_IN_PROGRESS; | ||
| } | ||
|
|
||
| /** | ||
| * Returns a details object representing an incompatible schema pair, including | ||
| * error details. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Values in this map are always value-equal to
compatible()(mergedWithonly yields COMPATIBLE from two COMPATIBLE inputs whose lists are empty), so this can be aSet<ReaderWriter>withreturn SchemaCompatibilityResult.compatible()on hit -- less garbage, and the compatible-only invariant becomes structural instead of comment-enforced.