-
Notifications
You must be signed in to change notification settings - Fork 2.5k
fix(streamer): write non-partitioned sample-writes table for record-size estimation #19115
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,12 +32,19 @@ | |
| import org.apache.hudi.utilities.config.HoodieStreamerConfig; | ||
| import org.apache.hudi.utilities.streamer.SparkSampleWritesUtils; | ||
|
|
||
| import org.apache.hadoop.fs.FileStatus; | ||
| import org.apache.hadoop.fs.FileSystem; | ||
| import org.apache.hadoop.fs.Path; | ||
| import org.apache.spark.api.java.JavaRDD; | ||
| import org.junit.jupiter.api.AfterEach; | ||
| import org.junit.jupiter.api.BeforeEach; | ||
| import org.junit.jupiter.api.Test; | ||
|
|
||
| import java.io.IOException; | ||
| import java.util.ArrayList; | ||
| import java.util.Arrays; | ||
| import java.util.List; | ||
| import java.util.stream.IntStream; | ||
|
|
||
| import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| import static org.junit.jupiter.api.Assertions.assertFalse; | ||
|
|
@@ -86,7 +93,7 @@ public void skipOverwriteRecordSizeEstimateWhenTimelineNonEmpty() throws Excepti | |
| } | ||
|
|
||
| @Test | ||
| public void overwriteRecordSizeEstimateForEmptyTable() { | ||
| void overwriteRecordSizeEstimateForEmptyTable() throws IOException { | ||
| int originalRecordSize = 100; | ||
| TypedProperties props = new TypedProperties(); | ||
| props.put(HoodieStreamerConfig.SAMPLE_WRITES_ENABLED.key(), "true"); | ||
|
|
@@ -99,9 +106,63 @@ public void overwriteRecordSizeEstimateForEmptyTable() { | |
| .build(); | ||
|
|
||
| String commitTime = HoodieTestDataGenerator.getCommitTimeAtUTC(1); | ||
| // Input spans multiple source partitions; the sample-writes table must still be non-partitioned. | ||
| JavaRDD<HoodieRecord> records = jsc().parallelize(dataGen.generateInserts(commitTime, 2000), 2); | ||
| Option<HoodieWriteConfig> writeConfigOpt = SparkSampleWritesUtils.getWriteConfigWithRecordSizeEstimate(jsc(), Option.of(records), originalWriteConfig); | ||
| assertTrue(writeConfigOpt.isPresent()); | ||
| assertEquals(779.0, writeConfigOpt.get().getCopyOnWriteRecordSizeEstimate(), 10.0); | ||
| assertEquals(337.0, writeConfigOpt.get().getCopyOnWriteRecordSizeEstimate(), 10.0); | ||
| assertSampleWritesNonPartitioned(); | ||
| } | ||
|
|
||
| @Test | ||
| void sampleWritesAreNonPartitionedEvenForManyPartitionInput() throws IOException { | ||
| int recordsPerPartition = 50; | ||
| String[] partitionPaths = IntStream.range(0, 20) | ||
| .mapToObj(i -> String.format("year=2024/month=01/day=%02d", i + 1)) | ||
| .toArray(String[]::new); | ||
| HoodieTestDataGenerator manyPartitionGen = new HoodieTestDataGenerator(partitionPaths); | ||
|
|
||
| TypedProperties props = new TypedProperties(); | ||
| props.put(HoodieStreamerConfig.SAMPLE_WRITES_ENABLED.key(), "true"); | ||
| HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder() | ||
| .withProperties(props) | ||
| .forTable("foo") | ||
| .withPath(basePath()) | ||
| .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) | ||
| .build(); | ||
|
|
||
| String commitTime = HoodieTestDataGenerator.getCommitTimeAtUTC(1); | ||
| List<HoodieRecord> allRecords = new ArrayList<>(); | ||
| for (String partition : partitionPaths) { | ||
| allRecords.addAll(manyPartitionGen.generateInsertsForPartition(commitTime, recordsPerPartition, partition)); | ||
| } | ||
| JavaRDD<HoodieRecord> records = jsc().parallelize(allRecords, 4); | ||
|
|
||
| Option<HoodieWriteConfig> writeConfigOpt = SparkSampleWritesUtils.getWriteConfigWithRecordSizeEstimate(jsc(), Option.of(records), writeConfig); | ||
| assertTrue(writeConfigOpt.isPresent(), "Sample write should produce a record-size estimate."); | ||
| assertSampleWritesNonPartitioned(); | ||
| } | ||
|
|
||
| /** | ||
| * Fails if the sample-writes folder contains any source-partition subdirectory, i.e. the | ||
| * sample write was not flattened into a single non-partitioned file. | ||
| */ | ||
| private void assertSampleWritesNonPartitioned() throws IOException { | ||
| Path sampleWritesPath = new Path(basePath(), ".hoodie/.aux/.sample_writes"); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤖 nit: could you use
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤖 nit: could you build this from the
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 |
||
| FileSystem fs = sampleWritesPath.getFileSystem(jsc().hadoopConfiguration()); | ||
| assertTrue(fs.exists(sampleWritesPath), "Sample-writes folder should exist after a sample write."); | ||
| FileStatus[] runs = fs.listStatus(sampleWritesPath); | ||
| assertTrue(runs.length > 0, "Sample-writes folder should contain at least one run."); | ||
| for (FileStatus run : runs) { | ||
| List<String> partitionDirs = new ArrayList<>(); | ||
| for (FileStatus entry : fs.listStatus(run.getPath())) { | ||
| if (entry.isDirectory() && !entry.getPath().getName().equals(".hoodie")) { | ||
| partitionDirs.add(entry.getPath().getName()); | ||
| } | ||
| } | ||
| assertTrue(partitionDirs.isEmpty(), | ||
| "Sample-writes run at " + run.getPath() + " should have no source partition subdirectories, but found: " | ||
| + Arrays.toString(partitionDirs.toArray())); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤖 nit:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 |
||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🤖 Collapsing the sample to a single non-partitioned file maximizes parquet footer/dictionary amortization, which is why the estimate drops so much (~779 → ~337). But the real target table is partitioned, so its per-record on-disk size sits somewhere between these depending on how many records each partition actually accumulates per commit. Since this estimate drives
copyOnWriteRecordSizeEstimate(file bin-packing), could the single-file layout under-estimate and lead to larger-than-target files for sources whose partitions stay sparse? Curious whether you considered keeping a bounded number of partitions rather than fully flattening to one.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Impact is bounded to the first commit: getWriteConfigWithRecordSizeEstimate early-returns once the timeline is non-empty (SparkSampleWritesUtils.java:69), and from the next commit on AverageRecordSizeEstimator.averageBytesPerRecord recomputes from real partitioned commit stats, using this config only as the empty-timeline fallback (AverageRecordSizeEstimator.java:70-105). That estimator also subtracts a per-file metadata estimate and skips below-threshold commits (AverageRecordSizeEstimator.java:86,90,93), so the target semantic is data bytes per record, not footer overhead - which is what the single-file sample now measures. A sparse-partition under-estimate self-corrects after the first commit, and a bounded-partition layout would partly reintroduce the footer overhead this removes.