Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,32 @@ public class HoodieTTLConfig extends HoodieConfig {
+ "partition count and this value. When a table enables partition ttl for the first time, there "
+ "may be a large number of historical partitions, so a higher value than the default may be desired.");

public static final ConfigProperty<String> EVENT_TIME_FORMAT = ConfigProperty
.key(PARTITION_TTL_STRATEGY_PARAM_PREFIX + "event.time.format")
.defaultValue("yyyy-MM-dd")
.markAdvanced()
.sinceVersion("1.3.0")
.withDocumentation("Used by KEEP_BY_EVENT_TIME. Date-time pattern for the event time encoded in the partition path. "
+ "A '/' in the pattern means the time spans multiple path segments. Examples: 'yyyy-MM-dd' (default), "
+ "'yyyyMMdd', 'yyyy-MM-dd/HH', 'yyyyMMdd/HH'.");

public static final ConfigProperty<Integer> EVENT_TIME_SEGMENT_START_INDEX = ConfigProperty
.key(PARTITION_TTL_STRATEGY_PARAM_PREFIX + "event.time.segment.start.index")
.defaultValue(0)
.markAdvanced()
.sinceVersion("1.3.0")
.withDocumentation("Used by KEEP_BY_EVENT_TIME. 0-based index of the first path segment that carries the event time. "
+ "Defaults to 0 for pure time partitions like 'dt=2026-04-24'. Set to a higher value when non-time prefix segments exist, "
+ "e.g. 1 for 'region=us/20260424/05'.");

public static final ConfigProperty<Boolean> EVENT_TIME_DELETE_HIVE_DEFAULT_PARTITION = ConfigProperty
.key(PARTITION_TTL_STRATEGY_PARAM_PREFIX + "event.time.delete.hive.default.partition")
.defaultValue(false)
.markAdvanced()
.sinceVersion("1.3.0")
.withDocumentation("When true, KEEP_BY_EVENT_TIME treats partitions containing __HIVE_DEFAULT_PARTITION__ as expired and removes them. "
+ "Defaults to false so such partitions are skipped (with a WARN log) and the user keeps explicit control over their lifecycle.");

public static class Builder {
private final HoodieTTLConfig ttlConfig = new HoodieTTLConfig();

Expand Down Expand Up @@ -127,6 +153,21 @@ public HoodieTTLConfig.Builder withTTLStrategyType(PartitionTTLStrategyType ttlS
return this;
}

public HoodieTTLConfig.Builder withEventTimeFormat(String format) {
ttlConfig.setValue(EVENT_TIME_FORMAT, format);
return this;
}

public HoodieTTLConfig.Builder withEventTimeSegmentStartIndex(int timeSegStartIndex) {
ttlConfig.setValue(EVENT_TIME_SEGMENT_START_INDEX, Integer.toString(timeSegStartIndex));
return this;
}

public HoodieTTLConfig.Builder withEventTimeDeleteHiveDefaultPartition(boolean deleteHiveDefaultPartition) {
ttlConfig.setValue(EVENT_TIME_DELETE_HIVE_DEFAULT_PARTITION, Boolean.toString(deleteHiveDefaultPartition));
return this;
}

public HoodieTTLConfig.Builder fromProperties(Properties props) {
this.ttlConfig.getProps().putAll(props);
return this;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3062,6 +3062,18 @@ public Integer getPartitionTTLStatsMaxParallelism() {
return getInt(HoodieTTLConfig.STATS_MAX_PARALLELISM);
}

public String getPartitionTTLEventTimeFormat() {
return getStringOrDefault(HoodieTTLConfig.EVENT_TIME_FORMAT);
}

public int getPartitionTTLEventTimeSegmentStartIndex() {
return getIntOrDefault(HoodieTTLConfig.EVENT_TIME_SEGMENT_START_INDEX);
}

public boolean shouldPartitionTTLEventTimeDeleteHiveDefaultPartition() {
return getBooleanOrDefault(HoodieTTLConfig.EVENT_TIME_DELETE_HIVE_DEFAULT_PARTITION);
Comment thread
wangxianghu marked this conversation as resolved.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤖 nit: boolean getters in Hudi typically use an is/should prefix — could you rename this to shouldPartitionTTLEventTimeDeleteHiveDefaultPartition() (or a shorter variant)? get doesn't signal the return type at the call site the way a predicate prefix does.

⚠️ AI-generated; verify before applying. React 👍/👎 to flag quality.

}

public boolean isSecondaryIndexEnabled() {
return metadataConfig.isSecondaryIndexEnabled();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ public static String getPartitionTTLStrategyFromType(PartitionTTLStrategyType ty
return KeepByTimeStrategy.class.getName();
case KEEP_BY_CREATION_TIME:
return KeepByCreationTimeStrategy.class.getName();
case KEEP_BY_EVENT_TIME:
return KeepByEventTimeStrategy.class.getName();
default:
throw new HoodieException("Unsupported PartitionTTLStrategy Type " + type);
}
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@
*/
public enum PartitionTTLStrategyType {
KEEP_BY_TIME("org.apache.hudi.table.action.ttl.strategy.KeepByTimeStrategy"),
KEEP_BY_CREATION_TIME("org.apache.hudi.table.action.ttl.strategy.KeepByCreationTimeStrategy");
KEEP_BY_CREATION_TIME("org.apache.hudi.table.action.ttl.strategy.KeepByCreationTimeStrategy"),
KEEP_BY_EVENT_TIME("org.apache.hudi.table.action.ttl.strategy.KeepByEventTimeStrategy");

@Getter
private final String className;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hudi.table.action.ttl.strategy;

import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;

import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.util.concurrent.TimeUnit;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
* Tests for {@link KeepByEventTimeStrategy}'s pure parsing logic. Exercises the static helpers
* directly so we don't need to spin up a HoodieTable / write client.
*/
public class TestKeepByEventTimeStrategy {

@Test
public void segmentCountCountsSlashes() {
assertEquals(1, KeepByEventTimeStrategy.segmentCount("yyyy-MM-dd"));
assertEquals(1, KeepByEventTimeStrategy.segmentCount("yyyyMMdd"));
assertEquals(2, KeepByEventTimeStrategy.segmentCount("yyyy-MM-dd/HH"));
assertEquals(2, KeepByEventTimeStrategy.segmentCount("yyyyMMdd/HH"));
}

// path, format, startIdx, hiveStyle, expectedExpired
//
// Full matrix mirroring the class-level JavaDoc: 4 formats x 4 position shapes
// (time only / prefix+time / time+suffix / prefix+time+suffix) x hive vs non-hive.
// Cutoff is "2026-04-30 00:00 UTC" so any 2026-04-22 path expires; the two
// "not expired" rows pin down the strict-less-than boundary.
@ParameterizedTest
@CsvSource({
// -------- Day, format=yyyy-MM-dd --------
// time only
"'dt=2026-04-22', 'yyyy-MM-dd', 0, true, true",
"'dt=2026-04-30', 'yyyy-MM-dd', 0, true, false",
"'2026-04-22', 'yyyy-MM-dd', 0, false, true",
"'2026-04-30', 'yyyy-MM-dd', 0, false, false",
// prefix + time
"'region=us/dt=2026-04-22', 'yyyy-MM-dd', 1, true, true",
"'eventType=login/dt=2026-04-30', 'yyyy-MM-dd', 1, true, false",
"'region=us/2026-04-22', 'yyyy-MM-dd', 1, false, true",
// time + suffix
"'dt=2026-04-22/source=app', 'yyyy-MM-dd', 0, true, true",
"'2026-04-22/source=app', 'yyyy-MM-dd', 0, false, true",
// prefix + time + suffix
"'region=us/dt=2026-04-22/source=app', 'yyyy-MM-dd', 1, true, true",
"'region=us/2026-04-22/source=app', 'yyyy-MM-dd', 1, false, true",

// -------- Day, format=yyyyMMdd --------
// time only
"'dt=20260422', 'yyyyMMdd', 0, true, true",
"'dt=20260430', 'yyyyMMdd', 0, true, false",
"'20260422', 'yyyyMMdd', 0, false, true",
// prefix + time
"'region=us/dt=20260422', 'yyyyMMdd', 1, true, true",
"'region=us/20260422', 'yyyyMMdd', 1, false, true",
// time + suffix
"'dt=20260422/source=app', 'yyyyMMdd', 0, true, true",
"'20260422/source=app', 'yyyyMMdd', 0, false, true",
// prefix + time + suffix
"'region=us/dt=20260422/source=app', 'yyyyMMdd', 1, true, true",
"'region=us/20260422/source=app', 'yyyyMMdd', 1, false, true",

// -------- Hour, format=yyyy-MM-dd/HH --------
// time only
"'dt=2026-04-22/hh=06', 'yyyy-MM-dd/HH', 0, true, true",
"'2026-04-22/06', 'yyyy-MM-dd/HH', 0, false, true",
// prefix + time
"'region=us/dt=2026-04-22/hh=06', 'yyyy-MM-dd/HH', 1, true, true",
"'region=us/2026-04-22/06', 'yyyy-MM-dd/HH', 1, false, true",
// time + suffix
"'dt=2026-04-22/hh=06/source=app', 'yyyy-MM-dd/HH', 0, true, true",
"'2026-04-22/06/source=app', 'yyyy-MM-dd/HH', 0, false, true",
// prefix + time + suffix
"'region=us/dt=2026-04-22/hh=06/source=app', 'yyyy-MM-dd/HH', 1, true, true",
"'region=us/2026-04-22/06/source=app', 'yyyy-MM-dd/HH', 1, false, true",

// -------- Hour, format=yyyyMMdd/HH --------
// time only
"'dt=20260422/hh=06', 'yyyyMMdd/HH', 0, true, true",
"'20260422/06', 'yyyyMMdd/HH', 0, false, true",
// prefix + time
"'eventType=login/dt=20260422/hh=06', 'yyyyMMdd/HH', 1, true, true",
"'login/20260422/06', 'yyyyMMdd/HH', 1, false, true",
// time + suffix
"'dt=20260422/hh=06/eventType=login', 'yyyyMMdd/HH', 0, true, true",
"'20260422/06/source=app', 'yyyyMMdd/HH', 0, false, true",
// prefix + time + suffix
"'region=us/dt=20260422/hh=06/source=app', 'yyyyMMdd/HH', 1, true, true",
"'region=us/20260422/06/source=app', 'yyyyMMdd/HH', 1, false, true",
})
public void parsesPartitionAndCompares(String path,
String format,
int startIdx,
boolean hiveStyle,
boolean expectedExpired) {
// Cutoff is "2026-04-30 00:00 UTC" so 2026-04-22 is expired (strictly before) while 2026-04-30 is not.
long cutoff = LocalDate.of(2026, 4, 30).atStartOfDay(ZoneOffset.UTC).toInstant().toEpochMilli();
DateTimeFormatter formatter = DateTimeFormatter.ofPattern(format).withZone(ZoneOffset.UTC);
int segCount = KeepByEventTimeStrategy.segmentCount(format);

boolean expired = KeepByEventTimeStrategy.isPartitionExpiredByEventTime(
path, formatter, startIdx, segCount, cutoff, false, hiveStyle);

assertEquals(expectedExpired, expired, "path=" + path + " format=" + format);
}

@Test
public void parseFailureThrows() {
// A partition whose time segment can't be parsed has no defined lifetime under event-time
// semantics, so the strategy fails fast and asks the user to switch to commit-time TTL.
DateTimeFormatter f = DateTimeFormatter.ofPattern("yyyy-MM-dd").withZone(ZoneOffset.UTC);
assertThrows(IllegalArgumentException.class, () ->
KeepByEventTimeStrategy.isPartitionExpiredByEventTime(
"dt=not-a-date", f, 0, 1, Long.MAX_VALUE, false, true));
}

@Test
public void shorterPathThanFormatThrows() {
// Partition layout is fixed per table — a mismatch is a configuration error, not a per-row skip.
DateTimeFormatter f = DateTimeFormatter.ofPattern("yyyy-MM-dd/HH").withZone(ZoneOffset.UTC);
assertThrows(IllegalArgumentException.class, () ->
KeepByEventTimeStrategy.isPartitionExpiredByEventTime(
"dt=2026-04-22", f, 0, 2, Long.MAX_VALUE, false, true));
}

@Test
public void startIndexOutOfRangeThrows() {
DateTimeFormatter f = DateTimeFormatter.ofPattern("yyyy-MM-dd").withZone(ZoneOffset.UTC);
assertThrows(IllegalArgumentException.class, () ->
KeepByEventTimeStrategy.isPartitionExpiredByEventTime(
"dt=2026-04-22", f, 5, 1, Long.MAX_VALUE, false, true));
}

@Test
public void hiveStyleSegmentWithoutEqualsThrows() {
// Table is hive-style but the segment lacks 'field=' prefix -> misconfiguration, fail fast.
DateTimeFormatter f = DateTimeFormatter.ofPattern("yyyy-MM-dd").withZone(ZoneOffset.UTC);
assertThrows(IllegalArgumentException.class, () ->
KeepByEventTimeStrategy.isPartitionExpiredByEventTime(
"2026-04-22", f, 0, 1, Long.MAX_VALUE, false, /*hiveStyle*/ true));
}

@Test
public void nonHiveStyleSegmentWithEqualsThrows() {
// hiveStyle=false: segment is taken verbatim, so '=' becomes part of the time string and
// fails parsing -> hard error, same as any other unparseable partition.
DateTimeFormatter f = DateTimeFormatter.ofPattern("yyyy-MM-dd").withZone(ZoneOffset.UTC);
assertThrows(IllegalArgumentException.class, () ->
KeepByEventTimeStrategy.isPartitionExpiredByEventTime(
"dt=2026-04-22", f, 0, 1, Long.MAX_VALUE, false, /*hiveStyle*/ false));
}

@Test
public void hiveDefaultPartitionSkippedWhenSwitchOff() {
DateTimeFormatter f = DateTimeFormatter.ofPattern("yyyy-MM-dd").withZone(ZoneOffset.UTC);
boolean expired = KeepByEventTimeStrategy.isPartitionExpiredByEventTime(
"dt=__HIVE_DEFAULT_PARTITION__", f, 0, 1, Long.MAX_VALUE, false, true);
assertFalse(expired);
}

@Test
public void hiveDefaultPartitionDeletedWhenSwitchOn() {
DateTimeFormatter f = DateTimeFormatter.ofPattern("yyyy-MM-dd").withZone(ZoneOffset.UTC);
boolean expired = KeepByEventTimeStrategy.isPartitionExpiredByEventTime(
"dt=__HIVE_DEFAULT_PARTITION__", f, 0, 1, /*cutoff*/ 0, /*deleteHiveDefault*/ true, /*hiveStyle*/ true);
assertTrue(expired);
}

@Test
public void wholeTimeBlockDefaultPartitionDeletedWhenSwitchOn() {
// Multi-segment format where the entire time block is the default marker -> treated as
// "the default partition" and respects the delete switch (just like the single-segment case).
DateTimeFormatter f = DateTimeFormatter.ofPattern("yyyy-MM-dd/HH").withZone(ZoneOffset.UTC);
boolean expired = KeepByEventTimeStrategy.isPartitionExpiredByEventTime(
"dt=__HIVE_DEFAULT_PARTITION__/hh=__HIVE_DEFAULT_PARTITION__",
f, 0, 2, /*cutoff*/ 0, /*deleteHiveDefault*/ true, /*hiveStyle*/ true);
assertTrue(expired);
}

@Test
public void partialDefaultPartitionTimeBlockFollowsSwitch() {
// dt=2026-06-28/hh=__HIVE_DEFAULT_PARTITION__: the date column was recorded but the hour
// column was NULL, so the event time has no place on the hour axis. We treat this exactly
// like a fully-null time block — the explicit delete switch decides. The cutoff is a recent
// date precisely to nail down that this is NOT a "the date is fresh, keep it" path.
DateTimeFormatter f = DateTimeFormatter.ofPattern("yyyy-MM-dd/HH").withZone(ZoneOffset.UTC);
long cutoff = LocalDate.of(2026, 4, 30).atStartOfDay(ZoneOffset.UTC).toInstant().toEpochMilli();

// switch on -> expire (even though dt looks fresh, event time is undefined)
assertTrue(KeepByEventTimeStrategy.isPartitionExpiredByEventTime(
"dt=2026-06-28/hh=__HIVE_DEFAULT_PARTITION__",
f, 0, 2, cutoff, /*deleteHiveDefault*/ true, /*hiveStyle*/ true));
// switch off -> skip with WARN (user retains explicit control)
assertFalse(KeepByEventTimeStrategy.isPartitionExpiredByEventTime(
"dt=2026-06-28/hh=__HIVE_DEFAULT_PARTITION__",
f, 0, 2, cutoff, /*deleteHiveDefault*/ false, /*hiveStyle*/ true));
}

@Test
public void resolveCutoffMillisInterpretsInstantInUtc() {
// The instant string '20260430120000000' must be read as 2026-04-30T12:00:00Z regardless of the
// JVM default zone. With ttl=0 the cutoff equals that exact instant; this pins down the contract
// that resolveCutoffMillis and the partition formatter both speak UTC.
long expected = LocalDateTime.of(2026, 4, 30, 12, 0).toInstant(ZoneOffset.UTC).toEpochMilli();
assertEquals(expected, KeepByEventTimeStrategy.resolveCutoffMillis("20260430120000000", 0));

// ttl=1d shifts back exactly 24h, again with no dependence on the JVM zone.
long oneDay = TimeUnit.DAYS.toMillis(1);
assertEquals(expected - oneDay,
KeepByEventTimeStrategy.resolveCutoffMillis("20260430120000000", oneDay));
}

@Test
public void hourBoundaryRespected() {
// Cutoff is 2026-04-22 12:00 UTC. 11:00 expired, 12:00 not expired.
long cutoff = LocalDateTime.of(2026, 4, 22, 12, 0).toInstant(ZoneOffset.UTC).toEpochMilli();
DateTimeFormatter f = DateTimeFormatter.ofPattern("yyyy-MM-dd/HH").withZone(ZoneOffset.UTC);

assertTrue(KeepByEventTimeStrategy.isPartitionExpiredByEventTime(
"2026-04-22/11", f, 0, 2, cutoff, false, /*hiveStyle*/ false));
assertFalse(KeepByEventTimeStrategy.isPartitionExpiredByEventTime(
"2026-04-22/12", f, 0, 2, cutoff, false, /*hiveStyle*/ false));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,16 @@ public void resolvesKeepByCreationTimeFromType() {
PartitionTTLStrategyType.getPartitionTTLStrategyClassName(config));
}

@Test
public void resolvesKeepByEventTimeFromType() {
HoodieConfig config = new HoodieConfig();
config.setValue(HoodieTTLConfig.PARTITION_TTL_STRATEGY_TYPE,
PartitionTTLStrategyType.KEEP_BY_EVENT_TIME.name());

assertEquals(PartitionTTLStrategyType.KEEP_BY_EVENT_TIME.getClassName(),
PartitionTTLStrategyType.getPartitionTTLStrategyClassName(config));
}

@Test
public void throwsOnUnknownType() {
HoodieConfig config = new HoodieConfig();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
public class SparkPartitionTTLActionExecutor<T>
extends BaseSparkCommitActionExecutor<T> {

private static final Logger LOG = LoggerFactory.getLogger(ConsistentBucketBulkInsertDataInternalWriterHelper.class);
private static final Logger LOG = LoggerFactory.getLogger(SparkPartitionTTLActionExecutor.class);

public SparkPartitionTTLActionExecutor(HoodieEngineContext context, HoodieWriteConfig config, HoodieTable table,
String instantTime) {
Expand Down
Loading
Loading