From b8fe69c06599ebe9ec481fd36367abe144337dff Mon Sep 17 00:00:00 2001 From: Lina Jodoin Date: Tue, 2 Jun 2026 16:09:43 -0700 Subject: [PATCH 1/8] [Scheduler] Add percentage-based dialup for creation and migration (#10348) ## What changed? - Gates CHASM scheduler creation and migration against a new rollout percentage, based on a stable hash of namespace + business ID. - Defaults to 100%, to preserve existing rollouts (rollout tooling will be updated to also set these keys) - Added a RolloutApplies helper, since we'll likely want to do this for other features going forward ## Why? - We want finely-grained control for rollouts even within a namespace. ## How did you test it? - [ ] built - [ ] run locally and tested manually - [ ] covered by existing tests - [x] added new unit test(s) - [x] added new functional test(s) (cherry picked from commit d4e49580a08161c73017b55df3edff0250cf098a) --- common/dynamicconfig/constants.go | 17 +++ common/dynamicconfig/rollout.go | 27 ++++ common/dynamicconfig/rollout_test.go | 41 ++++++ service/frontend/service.go | 14 +- service/frontend/workflow_handler.go | 30 +++-- .../frontend/workflow_handler_second_test.go | 3 +- service/worker/scheduler/fx.go | 37 ++--- service/worker/scheduler/workflow.go | 16 ++- service/worker/scheduler/workflow_test.go | 53 +++++++- tests/schedule_migration_test.go | 126 ++++++++++++++++++ tests/schedule_test.go | 78 +++++++++++ tests/testcore/dynamic_config_overrides.go | 7 + tests/testcore/utils.go | 27 ++++ 13 files changed, 438 insertions(+), 38 deletions(-) create mode 100644 common/dynamicconfig/rollout.go create mode 100644 common/dynamicconfig/rollout_test.go diff --git a/common/dynamicconfig/constants.go b/common/dynamicconfig/constants.go index ae75bb80e84..d052cfc87fa 100644 --- a/common/dynamicconfig/constants.go +++ b/common/dynamicconfig/constants.go @@ -2945,6 +2945,14 @@ Requires service restart to take effect.`, instead of the existing (V1) implementation.`, ) + CHASMSchedulerCreationRolloutPercent = NewNamespaceIntSetting( + "history.chasmSchedulerCreationRolloutPercent", + 0, + `CHASMSchedulerCreationRolloutPercent is the per-namespace percentage of new schedules that will be +created on the CHASM (V2) implementation. This setting is only consulted when EnableCHASMSchedulerCreation is true and +is re-evaluated on every CreateSchedule RPC.`, + ) + EnableCHASMSchedulerRouting = NewNamespaceBoolSetting( "history.enableCHASMSchedulerRouting", true, @@ -2959,6 +2967,15 @@ first (with fallback to V1), excluding CreateSchedule.`, to the CHASM (V2) implementation on active scheduler workflows.`, ) + CHASMSchedulerMigrationRolloutPercent = NewNamespaceIntSetting( + "history.chasmSchedulerMigrationRolloutPercent", + 0, + `CHASMSchedulerMigrationRolloutPercent is the per-namespace percentage of V1 schedules that will be +migrated to the CHASM (V2) implementation This setting is only consulted when +EnableCHASMSchedulerMigration is true. The decision is re-evaluated when a +scheduler workflow starts or continues-as-new.`, + ) + EnableCHASMSchedulerSentinels = NewNamespaceBoolSetting( "history.enableCHASMSchedulerSentinels", true, diff --git a/common/dynamicconfig/rollout.go b/common/dynamicconfig/rollout.go new file mode 100644 index 00000000000..09f8241de40 --- /dev/null +++ b/common/dynamicconfig/rollout.go @@ -0,0 +1,27 @@ +package dynamicconfig + +import ( + "math" + + "github.com/dgryski/go-farm" +) + +// RolloutAccepts reports whether key falls within the given percentage [0,100] +// using a stable hash, so dialing the percent up is monotonic: a key accepted +// at percent P is accepted at every percent >= P. +// +// The hash algorithm and the bytes of key are a load-bearing rollout key: +// changing either re-shuffles every cohort and breaks monotonicity for +// in-flight rollouts. Callers are responsible for constructing key (including +// any separator bytes between fields) and must not change that construction +// once a rollout is in progress. +func RolloutAccepts(key []byte, percent int) bool { + if percent >= 100 { + return true + } + if percent <= 0 { + return false + } + threshold := uint32(float64(percent) / 100.0 * float64(math.MaxUint32)) + return farm.Fingerprint32(key) < threshold +} diff --git a/common/dynamicconfig/rollout_test.go b/common/dynamicconfig/rollout_test.go new file mode 100644 index 00000000000..b34c3dcc6d1 --- /dev/null +++ b/common/dynamicconfig/rollout_test.go @@ -0,0 +1,41 @@ +package dynamicconfig + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" +) + +func rolloutKey(namespace, id string) []byte { + return fmt.Appendf(nil, "%s\x00%s", namespace, id) +} + +func TestRolloutAccepts_Boundaries(t *testing.T) { + t.Parallel() + + key := rolloutKey("ns", "id") + require.True(t, RolloutAccepts(key, 100)) + require.True(t, RolloutAccepts(key, 101)) // clamped + require.False(t, RolloutAccepts(key, 0)) + require.False(t, RolloutAccepts(key, -5)) // clamped +} + +func TestRolloutAccepts_Monotonic(t *testing.T) { + t.Parallel() + + // An ID accepted at percent P must also be accepted at every percent >= P. + for i := range 500 { + key := rolloutKey("ns", fmt.Sprintf("id-%d", i)) + firstAcceptedAt := -1 + for pct := 0; pct <= 100; pct++ { + if RolloutAccepts(key, pct) { + if firstAcceptedAt < 0 { + firstAcceptedAt = pct + } + } else if firstAcceptedAt >= 0 { + t.Fatalf("id %d accepted at pct=%d but rejected at pct=%d", i, firstAcceptedAt, pct) + } + } + } +} diff --git a/service/frontend/service.go b/service/frontend/service.go index dacde0b073d..f114bb59136 100644 --- a/service/frontend/service.go +++ b/service/frontend/service.go @@ -169,6 +169,9 @@ type Config struct { EnableChasm dynamicconfig.BoolPropertyFnWithNamespaceFilter // Enable creation of new schedules on CHASM (V2) engine EnableCHASMSchedulerCreation dynamicconfig.BoolPropertyFnWithNamespaceFilter + // Per-namespace percentage [0-100] of new schedules routed to CHASM when + // EnableCHASMSchedulerCreation is true. Default 0. + CHASMSchedulerCreationRolloutPercent dynamicconfig.IntPropertyFnWithNamespaceFilter // Enable CHASM-first routing for schedule RPCs other than CreateSchedule EnableCHASMSchedulerRouting dynamicconfig.BoolPropertyFnWithNamespaceFilter // Enables ID-space collision sentinels, and must be enabled and propagated in @@ -351,11 +354,12 @@ func NewConfig( MaxFairnessWeightOverrideConfigLimit: dynamicconfig.MatchingMaxFairnessKeyWeightOverrides.Get(dc), - EnableSchedules: dynamicconfig.FrontendEnableSchedules.Get(dc), - EnableChasm: dynamicconfig.EnableChasm.Get(dc), - EnableCHASMSchedulerCreation: dynamicconfig.EnableCHASMSchedulerCreation.Get(dc), - EnableCHASMSchedulerRouting: dynamicconfig.EnableCHASMSchedulerRouting.Get(dc), - EnableCHASMSchedulerSentinels: dynamicconfig.EnableCHASMSchedulerSentinels.Get(dc), + EnableSchedules: dynamicconfig.FrontendEnableSchedules.Get(dc), + EnableChasm: dynamicconfig.EnableChasm.Get(dc), + EnableCHASMSchedulerCreation: dynamicconfig.EnableCHASMSchedulerCreation.Get(dc), + CHASMSchedulerCreationRolloutPercent: dynamicconfig.CHASMSchedulerCreationRolloutPercent.Get(dc), + EnableCHASMSchedulerRouting: dynamicconfig.EnableCHASMSchedulerRouting.Get(dc), + EnableCHASMSchedulerSentinels: dynamicconfig.EnableCHASMSchedulerSentinels.Get(dc), // [cleanup-wv-pre-release] EnableDeployments: dynamicconfig.EnableDeployments.Get(dc), diff --git a/service/frontend/workflow_handler.go b/service/frontend/workflow_handler.go index d94f5a947ca..e75fb04d5b1 100644 --- a/service/frontend/workflow_handler.go +++ b/service/frontend/workflow_handler.go @@ -3748,7 +3748,7 @@ func (wh *WorkflowHandler) CreateSchedule( namespaceName := namespace.Name(request.Namespace) - useChasmScheduler := wh.chasmSchedulerCreationEnabled(ctx, namespaceName.String()) + useChasmScheduler := wh.chasmSchedulerCreationEnabled(ctx, namespaceName.String(), request.ScheduleId) wh.logger.Debug("Received CreateSchedule", tag.ScheduleID(request.ScheduleId), tag.WorkflowNamespace(namespaceName.String()), @@ -3778,18 +3778,32 @@ func (wh *WorkflowHandler) CreateSchedule( } // chasmSchedulerCreationEnabled returns true when CreateSchedule should create on -// the CHASM scheduler. -func (wh *WorkflowHandler) chasmSchedulerCreationEnabled(ctx context.Context, namespaceName string) bool { - return (headers.IsExperimentRequested(ctx, ChasmSchedulerExperiment) && - wh.config.IsExperimentAllowed(ChasmSchedulerExperiment, namespaceName)) || - wh.config.EnableCHASMSchedulerCreation(namespaceName) +// the CHASM scheduler. The binary EnableCHASMSchedulerCreation flag is gated by a +// per-namespace percentage rollout keyed on scheduleID; the experiment header path +// is unconditional. +func (wh *WorkflowHandler) chasmSchedulerCreationEnabled(ctx context.Context, namespaceName, scheduleID string) bool { + if headers.IsExperimentRequested(ctx, ChasmSchedulerExperiment) && + wh.config.IsExperimentAllowed(ChasmSchedulerExperiment, namespaceName) { + return true + } + if !wh.config.EnableCHASMSchedulerCreation(namespaceName) { + return false + } + key := fmt.Appendf(nil, "%s\x00%s", namespaceName, scheduleID) + return dynamicconfig.RolloutAccepts(key, wh.config.CHASMSchedulerCreationRolloutPercent(namespaceName)) } // chasmSchedulerEnabled returns true when schedule RPCs should route to CHASM // first. Handlers must be capable of falling back to V1 codepaths for schedules -// that haven't been migrated to CHASM. +// that haven't been migrated to CHASM. Routing follows the binary creation flag +// (without the percentage gate) because schedules already created on CHASM at any +// rollout percent must be looked up there. func (wh *WorkflowHandler) chasmSchedulerEnabled(ctx context.Context, namespaceName string) bool { - return wh.chasmSchedulerCreationEnabled(ctx, namespaceName) || + if headers.IsExperimentRequested(ctx, ChasmSchedulerExperiment) && + wh.config.IsExperimentAllowed(ChasmSchedulerExperiment, namespaceName) { + return true + } + return wh.config.EnableCHASMSchedulerCreation(namespaceName) || wh.config.EnableCHASMSchedulerRouting(namespaceName) } diff --git a/service/frontend/workflow_handler_second_test.go b/service/frontend/workflow_handler_second_test.go index 4efdbbbd593..ae7bb336ca3 100644 --- a/service/frontend/workflow_handler_second_test.go +++ b/service/frontend/workflow_handler_second_test.go @@ -63,6 +63,7 @@ func TestCHASMSchedulerRoutingAndCreationGates(t *testing.T) { config := NewConfig(dc.NewNoopCollection(), 1) config.EnableCHASMSchedulerCreation = dc.GetBoolPropertyFnFilteredByNamespace(tc.enableCreation) + config.CHASMSchedulerCreationRolloutPercent = dc.GetIntPropertyFnFilteredByNamespace(100) config.EnableCHASMSchedulerRouting = dc.GetBoolPropertyFnFilteredByNamespace(tc.enableRouting) config.AllowedExperiments = dc.GetTypedPropertyFnFilteredByNamespace(tc.allowedExp) @@ -73,7 +74,7 @@ func TestCHASMSchedulerRoutingAndCreationGates(t *testing.T) { ctx = metadata.NewIncomingContext(ctx, metadata.Pairs(headers.ExperimentHeaderName, ChasmSchedulerExperiment)) } - require.Equal(t, tc.expectCreation, wh.chasmSchedulerCreationEnabled(ctx, "test-namespace")) + require.Equal(t, tc.expectCreation, wh.chasmSchedulerCreationEnabled(ctx, "test-namespace", "test-schedule")) require.Equal(t, tc.expectRouting, wh.chasmSchedulerEnabled(ctx, "test-namespace")) }) } diff --git a/service/worker/scheduler/fx.go b/service/worker/scheduler/fx.go index 3389361a852..13e49f233c5 100644 --- a/service/worker/scheduler/fx.go +++ b/service/worker/scheduler/fx.go @@ -49,13 +49,14 @@ var VisibilityListQueryChasm = fmt.Sprintf( type ( workerComponent struct { - specBuilder *SpecBuilder // workflow dep - activityDeps activityDeps - enabledForNs dynamicconfig.BoolPropertyFnWithNamespaceFilter - enableCHASMMigration dynamicconfig.BoolPropertyFnWithNamespaceFilter - globalNSStartWorkflowRPS dynamicconfig.TypedSubscribableWithNamespaceFilter[float64] - maxBlobSize dynamicconfig.IntPropertyFnWithNamespaceFilter - localActivitySleepLimit dynamicconfig.DurationPropertyFnWithNamespaceFilter + specBuilder *SpecBuilder // workflow dep + activityDeps activityDeps + enabledForNs dynamicconfig.BoolPropertyFnWithNamespaceFilter + enableCHASMMigration dynamicconfig.BoolPropertyFnWithNamespaceFilter + chasmMigrationRolloutPercent dynamicconfig.IntPropertyFnWithNamespaceFilter + globalNSStartWorkflowRPS dynamicconfig.TypedSubscribableWithNamespaceFilter[float64] + maxBlobSize dynamicconfig.IntPropertyFnWithNamespaceFilter + localActivitySleepLimit dynamicconfig.DurationPropertyFnWithNamespaceFilter } activityDeps struct { @@ -85,13 +86,14 @@ func NewResult( ) fxResult { return fxResult{ Component: &workerComponent{ - specBuilder: specBuilder, - activityDeps: params, - enabledForNs: dynamicconfig.WorkerEnableScheduler.Get(dc), - enableCHASMMigration: dynamicconfig.EnableCHASMSchedulerMigration.Get(dc), - globalNSStartWorkflowRPS: dynamicconfig.SchedulerNamespaceStartWorkflowRPS.Subscribe(dc), - maxBlobSize: dynamicconfig.BlobSizeLimitError.Get(dc), - localActivitySleepLimit: dynamicconfig.SchedulerLocalActivitySleepLimit.Get(dc), + specBuilder: specBuilder, + activityDeps: params, + enabledForNs: dynamicconfig.WorkerEnableScheduler.Get(dc), + enableCHASMMigration: dynamicconfig.EnableCHASMSchedulerMigration.Get(dc), + chasmMigrationRolloutPercent: dynamicconfig.CHASMSchedulerMigrationRolloutPercent.Get(dc), + globalNSStartWorkflowRPS: dynamicconfig.SchedulerNamespaceStartWorkflowRPS.Subscribe(dc), + maxBlobSize: dynamicconfig.BlobSizeLimitError.Get(dc), + localActivitySleepLimit: dynamicconfig.SchedulerLocalActivitySleepLimit.Get(dc), }, } } @@ -103,8 +105,13 @@ func (s *workerComponent) DedicatedWorkerOptions(ns *namespace.Namespace) *worke } func (s *workerComponent) Register(registry sdkworker.Registry, ns *namespace.Namespace, details workercommon.RegistrationDetails) func() { - enableMigration := s.enableCHASMMigration(ns.Name().String()) + nsName := ns.Name().String() wfFunc := func(ctx workflow.Context, args *schedulespb.StartScheduleArgs) error { + key := fmt.Appendf(nil, "%s\x00%s", nsName, args.State.ScheduleId) + enableMigration := func() bool { + return s.enableCHASMMigration(nsName) && + dynamicconfig.RolloutAccepts(key, s.chasmMigrationRolloutPercent(nsName)) + } return schedulerWorkflowWithSpecBuilder(ctx, args, s.specBuilder, enableMigration) } registry.RegisterWorkflowWithOptions(wfFunc, workflow.RegisterOptions{Name: WorkflowType}) diff --git a/service/worker/scheduler/workflow.go b/service/worker/scheduler/workflow.go index 6c235d587cc..38023b72f1e 100644 --- a/service/worker/scheduler/workflow.go +++ b/service/worker/scheduler/workflow.go @@ -112,9 +112,11 @@ type ( // SpecBuilder is technically a non-deterministic dependency, but it's safe as // long as we only call methods on cspec inside of SideEffect (or in a query // without modifying state). - specBuilder *SpecBuilder - cspec *CompiledSpec - enableCHASMMigration bool + specBuilder *SpecBuilder + cspec *CompiledSpec + // enableCHASMMigration is re-evaluated every iteration inside the + // "tweakables" MutableSideEffect. + enableCHASMMigration func() bool tweakables TweakablePolicies @@ -224,10 +226,10 @@ var ( ) func SchedulerWorkflow(ctx workflow.Context, args *schedulespb.StartScheduleArgs) error { - return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), false) + return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), func() bool { return false }) } -func schedulerWorkflowWithSpecBuilder(ctx workflow.Context, args *schedulespb.StartScheduleArgs, specBuilder *SpecBuilder, enableCHASMMigration bool) error { +func schedulerWorkflowWithSpecBuilder(ctx workflow.Context, args *schedulespb.StartScheduleArgs, specBuilder *SpecBuilder, enableCHASMMigration func() bool) error { scheduler := &scheduler{ StartScheduleArgs: args, ctx: ctx, @@ -1265,10 +1267,10 @@ func (s *scheduler) checkConflict(token int64) error { func (s *scheduler) updateTweakables() { // Use MutableSideEffect so that we can change the defaults without breaking determinism. - enableCHASMMigration := s.enableCHASMMigration get := func(ctx workflow.Context) any { p := CurrentTweakablePolicies - p.EnableCHASMMigration = enableCHASMMigration + // Re-evaluates migration dynamic config each iteration. + p.EnableCHASMMigration = s.enableCHASMMigration() return p } eq := func(a, b any) bool { return a.(TweakablePolicies) == b.(TweakablePolicies) } diff --git a/service/worker/scheduler/workflow_test.go b/service/worker/scheduler/workflow_test.go index 2d6d43bbc2e..1311709ec4d 100644 --- a/service/worker/scheduler/workflow_test.go +++ b/service/worker/scheduler/workflow_test.go @@ -2476,7 +2476,7 @@ func (s *workflowSuite) TestMigrateDynamicConfig() { s.env.SetStartTime(baseStartTime) s.env.ExecuteWorkflow(func(ctx workflow.Context, args *schedulespb.StartScheduleArgs) error { - return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), true) + return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), func() bool { return true }) }, &schedulespb.StartScheduleArgs{ Schedule: &schedulepb.Schedule{ Spec: &schedulepb.ScheduleSpec{ @@ -2499,6 +2499,55 @@ func (s *workflowSuite) TestMigrateDynamicConfig() { s.NoError(s.env.GetWorkflowError()) } +// TestMigrateDynamicConfigFlipsMidRun verifies that the enableCHASMMigration +// closure is re-evaluated every iteration: the first iteration sees migration +// disabled and does nothing, then the DC flips on, and a later iteration picks +// it up and migrates without needing continue-as-new. +func (s *workflowSuite) TestMigrateDynamicConfigFlipsMidRun() { + enabled := false + migrateCalls := 0 + s.env.OnActivity(new(activities).MigrateScheduleToChasm, mock.Anything, mock.Anything).Return( + func(context.Context, *schedulerpb.CreateFromMigrationStateRequest) error { + migrateCalls++ + return nil + }) + + prevTweakables := CurrentTweakablePolicies + // Enough iterations to span the flip; allow short intervals so the loop + // advances quickly under test time. + CurrentTweakablePolicies.IterationsBeforeContinueAsNew = 10 + defer func() { CurrentTweakablePolicies = prevTweakables }() + + // Flip the closure between iteration 1 and iteration 2 (1h interval). + s.env.RegisterDelayedCallback(func() { + enabled = true + }, 30*time.Minute) + + s.env.SetStartTime(baseStartTime) + s.env.ExecuteWorkflow(func(ctx workflow.Context, args *schedulespb.StartScheduleArgs) error { + return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), func() bool { return enabled }) + }, &schedulespb.StartScheduleArgs{ + Schedule: &schedulepb.Schedule{ + Spec: &schedulepb.ScheduleSpec{ + Interval: []*schedulepb.IntervalSpec{{ + Interval: durationpb.New(1 * time.Hour), + }}, + }, + Action: s.defaultAction("myid"), + }, + State: &schedulespb.InternalState{ + Namespace: "myns", + NamespaceId: "mynsid", + ScheduleId: "myschedule", + ConflictToken: InitialConflictToken, + }, + }) + + s.True(s.env.IsWorkflowCompleted()) + s.Require().NoError(s.env.GetWorkflowError(), "workflow should complete after the dynamic flip triggers migration") + s.Equal(1, migrateCalls, "migration should fire exactly once, after the DC flips") +} + func (s *workflowSuite) TestMigrateDynamicConfigFailure() { // Enable migration by threading enableCHASMMigration=true through the closure (race-safe), // but activity fails. @@ -2515,7 +2564,7 @@ func (s *workflowSuite) TestMigrateDynamicConfigFailure() { s.env.SetStartTime(baseStartTime) s.env.ExecuteWorkflow(func(ctx workflow.Context, args *schedulespb.StartScheduleArgs) error { - return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), true) + return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), func() bool { return true }) }, &schedulespb.StartScheduleArgs{ Schedule: &schedulepb.Schedule{ Spec: &schedulepb.ScheduleSpec{ diff --git a/tests/schedule_migration_test.go b/tests/schedule_migration_test.go index 3250c639118..5bc46c0a375 100644 --- a/tests/schedule_migration_test.go +++ b/tests/schedule_migration_test.go @@ -1840,6 +1840,132 @@ func TestScheduleMigration_StaleRunningDoesNotSkipPending(t *testing.T) { "stale running entry must not cause the pending start to be dropped under SKIP overlap policy") } +// TestScheduleMigrationRolloutPercent verifies that +// CHASMSchedulerMigrationRolloutPercent gates per-schedule migration once +// EnableCHASMSchedulerMigration is on: at 50%, two V1 schedules whose IDs +// bucket on opposite sides of the rollout end up on different stacks — one +// migrates and the other stays on V1. +func (s *ScheduleMigrationTestSuite) TestScheduleMigrationRolloutPercent() { + t := s.T() + env := testcore.NewEnv( + t, + testcore.WithWorkerService("scheduler operations"), + testcore.WithDynamicConfig(dynamicconfig.EnableChasm, true), + testcore.WithDynamicConfig(dynamicconfig.EnableCHASMSchedulerMigration, true), + testcore.WithDynamicConfig(dynamicconfig.CHASMSchedulerMigrationRolloutPercent, 50), + ) + + ctx := testcore.NewContext() + nsName := env.Namespace().String() + nsID := env.NamespaceID().String() + + migrateSID, stayV1SID := testcore.PickRolloutSplit(t, nsName, 50) + + mkSchedule := func() *schedulepb.Schedule { + return &schedulepb.Schedule{ + Spec: &schedulepb.ScheduleSpec{ + Interval: []*schedulepb.IntervalSpec{{Interval: durationpb.New(1 * time.Hour)}}, + }, + Action: &schedulepb.ScheduleAction{ + Action: &schedulepb.ScheduleAction_StartWorkflow{ + StartWorkflow: &workflowpb.NewWorkflowExecutionInfo{ + WorkflowId: testcore.RandomizeStr("wid"), + WorkflowType: &commonpb.WorkflowType{Name: testcore.RandomizeStr("wt")}, + TaskQueue: &taskqueuepb.TaskQueue{Name: testcore.RandomizeStr("tq"), Kind: enumspb.TASK_QUEUE_KIND_NORMAL}, + }, + }, + }, + } + } + + // Start each schedule directly as a V1 workflow, mirroring the pattern in + // TestScheduleMigrationDynamicConfig. + startV1 := func(sid string) { + startArgs := &schedulespb.StartScheduleArgs{ + Schedule: mkSchedule(), + State: &schedulespb.InternalState{ + Namespace: nsName, + NamespaceId: nsID, + ScheduleId: sid, + ConflictToken: scheduler.InitialConflictToken, + }, + } + inputPayloads, err := sdk.PreferProtoDataConverter.ToPayloads(startArgs) + require.NoError(t, err) + startReq := &workflowservice.StartWorkflowExecutionRequest{ + Namespace: nsName, + WorkflowId: scheduler.WorkflowIDPrefix + sid, + WorkflowType: &commonpb.WorkflowType{Name: scheduler.WorkflowType}, + TaskQueue: &taskqueuepb.TaskQueue{Name: primitives.PerNSWorkerTaskQueue}, + Input: inputPayloads, + Identity: "test", + RequestId: uuid.NewString(), + WorkflowIdReusePolicy: enumspb.WORKFLOW_ID_REUSE_POLICY_ALLOW_DUPLICATE, + WorkflowIdConflictPolicy: enumspb.WORKFLOW_ID_CONFLICT_POLICY_FAIL, + } + _, err = env.GetTestCluster().HistoryClient().StartWorkflowExecution( + ctx, + common.CreateHistoryStartWorkflowRequest(nsID, startReq, nil, nil, time.Now().UTC()), + ) + require.NoError(t, err) + } + startV1(migrateSID) + startV1(stayV1SID) + + v1Status := func(sid string) enumspb.WorkflowExecutionStatus { + desc, err := env.GetTestCluster().HistoryClient().DescribeWorkflowExecution( + ctx, + &historyservice.DescribeWorkflowExecutionRequest{ + NamespaceId: nsID, + Request: &workflowservice.DescribeWorkflowExecutionRequest{ + Namespace: nsName, + Execution: &commonpb.WorkflowExecution{WorkflowId: scheduler.WorkflowIDPrefix + sid}, + }, + }, + ) + if err != nil { + return enumspb.WORKFLOW_EXECUTION_STATUS_UNSPECIFIED + } + return desc.GetWorkflowExecutionInfo().GetStatus() + } + + // The accepted schedule's V1 workflow should complete (migrate to CHASM). + require.Eventually(t, func() bool { + return v1Status(migrateSID) == enumspb.WORKFLOW_EXECUTION_STATUS_COMPLETED + }, 30*time.Second, 500*time.Millisecond, "schedule %q should migrate to CHASM", migrateSID) + + // And its CHASM-side description should be present. + _, err := env.GetTestCluster().SchedulerClient().DescribeSchedule(ctx, &schedulerpb.DescribeScheduleRequest{ + NamespaceId: nsID, + FrontendRequest: &workflowservice.DescribeScheduleRequest{Namespace: nsName, ScheduleId: migrateSID}, + }) + require.NoError(t, err) + + // The rejected schedule's V1 workflow should remain running, and the CHASM + // handler should still report NotFound. Re-checked over a window to ensure + // the negative state is stable rather than racing a delayed migration. + require.Never(t, func() bool { + return v1Status(stayV1SID) == enumspb.WORKFLOW_EXECUTION_STATUS_COMPLETED + }, 10*time.Second, 1*time.Second, "schedule %q must not migrate while outside the rollout cohort", stayV1SID) + + _, err = env.GetTestCluster().SchedulerClient().DescribeSchedule(ctx, &schedulerpb.DescribeScheduleRequest{ + NamespaceId: nsID, + FrontendRequest: &workflowservice.DescribeScheduleRequest{Namespace: nsName, ScheduleId: stayV1SID}, + }) + var notFoundErr *serviceerror.NotFound + require.ErrorAs(t, err, ¬FoundErr, + "schedule %q rejected by the rollout must not appear on the CHASM handler", stayV1SID) + + // And the schedule should still be reachable through the public frontend. + require.Eventually(t, func() bool { + _, err := env.FrontendClient().DescribeSchedule(ctx, &workflowservice.DescribeScheduleRequest{ + Namespace: nsName, + ScheduleId: stayV1SID, + }) + return err == nil + }, 15*time.Second, 250*time.Millisecond, "frontend DescribeSchedule should succeed for V1 schedule %q", stayV1SID) +} + func TestScheduleMigration_NoRunningWorkflows_GeneratorStarts(t *testing.T) { // Use a very short idle time so the schedule closes quickly once the generator // determines there's nothing to do (empty spec → no next wakeup time → idle). diff --git a/tests/schedule_test.go b/tests/schedule_test.go index 116c42515cf..583f6ace119 100644 --- a/tests/schedule_test.go +++ b/tests/schedule_test.go @@ -3381,6 +3381,84 @@ func testUpdateScheduleBlobSizeLimit(t *testing.T, newContext contextFactory) { require.ErrorAs(t, err, &invalidArgBlob) } +// TestScheduleCreationRolloutPercent verifies that +// CHASMSchedulerCreationRolloutPercent acts as a per-schedule sampling gate +// after EnableCHASMSchedulerCreation is on: at 50%, two schedules whose IDs +// bucket on opposite sides of the rollout land on different stacks. +func TestScheduleCreationRolloutPercent(t *testing.T) { + opts := append(scheduleCommonOpts(t), + // V1 worker is needed because at 50% rollout some schedules land on V1. + testcore.WithWorkerService("V1 scheduler"), + testcore.WithDynamicConfig(dynamicconfig.EnableCHASMSchedulerCreation, true), + testcore.WithDynamicConfig(dynamicconfig.CHASMSchedulerCreationRolloutPercent, 50), + ) + s := testcore.NewEnv(t, opts...) + ctx := s.Context() + nsName := s.Namespace().String() + nsID := s.NamespaceID().String() + + chasmSID, v1SID := testcore.PickRolloutSplit(t, nsName, 50) + + mkSchedule := func() *schedulepb.Schedule { + return &schedulepb.Schedule{ + Spec: &schedulepb.ScheduleSpec{ + Interval: []*schedulepb.IntervalSpec{{Interval: durationpb.New(1 * time.Hour)}}, + }, + Action: &schedulepb.ScheduleAction{ + Action: &schedulepb.ScheduleAction_StartWorkflow{ + StartWorkflow: &workflowpb.NewWorkflowExecutionInfo{ + WorkflowId: testcore.RandomizeStr("wid"), + WorkflowType: &commonpb.WorkflowType{Name: testcore.RandomizeStr("wt")}, + TaskQueue: &taskqueuepb.TaskQueue{Name: s.WorkerTaskQueue(), Kind: enumspb.TASK_QUEUE_KIND_NORMAL}, + }, + }, + }, + } + } + + for _, sid := range []string{chasmSID, v1SID} { + _, err := s.FrontendClient().CreateSchedule(ctx, &workflowservice.CreateScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + Schedule: mkSchedule(), + Identity: testcore.RandomizeStr("identity"), + RequestId: uuid.NewString(), + }) + require.NoError(t, err) + } + + // A direct CHASM DescribeSchedule succeeds for CHASM-backed schedules and + // returns NotFound for V1-backed schedules (whose CHASM key is a sentinel). + describeOnCHASM := func(sid string) error { + _, err := s.GetTestCluster().SchedulerClient().DescribeSchedule(ctx, &schedulerpb.DescribeScheduleRequest{ + NamespaceId: nsID, + FrontendRequest: &workflowservice.DescribeScheduleRequest{Namespace: nsName, ScheduleId: sid}, + }) + return err + } + + require.Eventually(t, func() bool { return describeOnCHASM(chasmSID) == nil }, 15*time.Second, 250*time.Millisecond, + "schedule %q bucketed into CHASM should be describable via the CHASM handler", chasmSID) + + var notFoundErr *serviceerror.NotFound + require.ErrorAs(t, describeOnCHASM(v1SID), ¬FoundErr, + "schedule %q bucketed into V1 should not be present on the CHASM handler", v1SID) + + // Both schedules must remain describable through the public frontend + // regardless of which stack they live on — the V1 schedule in particular + // must round-trip through the frontend's fallback path. The V1 workflow + // takes a moment to be queryable after creation, so poll. + for _, sid := range []string{chasmSID, v1SID} { + require.Eventually(t, func() bool { + _, err := s.FrontendClient().DescribeSchedule(ctx, &workflowservice.DescribeScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + }) + return err == nil + }, 15*time.Second, 250*time.Millisecond, "frontend DescribeSchedule should succeed for %q", sid) + } +} + // testSingleDateScheduleCloses verifies that a CHASM schedule configured with // a single calendar date closes after its one workflow completes. func testSingleDateScheduleCloses(t *testing.T, newContext contextFactory) { diff --git a/tests/testcore/dynamic_config_overrides.go b/tests/testcore/dynamic_config_overrides.go index 0b5b2868058..6fccde14de9 100644 --- a/tests/testcore/dynamic_config_overrides.go +++ b/tests/testcore/dynamic_config_overrides.go @@ -67,5 +67,12 @@ var ( dynamicconfig.RefreshNexusEndpointsMinWait.Key(): 1 * time.Millisecond, nexusoperations.RecordCancelRequestCompletionEvents.Key(): true, nexusoperations.UseSystemCallbackURL.Key(): true, + + // CHASM scheduler rollout percents default to 0 in production; in tests we + // dial them to 100 so existing tests that only flip the binary enable flag + // continue to route every schedule through CHASM. Tests that want to + // exercise the percent gate can override per-test. + dynamicconfig.CHASMSchedulerCreationRolloutPercent.Key(): 100, + dynamicconfig.CHASMSchedulerMigrationRolloutPercent.Key(): 100, } ) diff --git a/tests/testcore/utils.go b/tests/testcore/utils.go index 608e05221d9..c462c583c79 100644 --- a/tests/testcore/utils.go +++ b/tests/testcore/utils.go @@ -3,6 +3,7 @@ package testcore import ( "fmt" "regexp" + "testing" "github.com/google/uuid" "github.com/stretchr/testify/require" @@ -13,11 +14,37 @@ import ( historyspb "go.temporal.io/server/api/history/v1" "go.temporal.io/server/api/historyservice/v1" replicationspb "go.temporal.io/server/api/replication/v1" + "go.temporal.io/server/common/dynamicconfig" "go.temporal.io/server/common/payloads" "go.temporal.io/server/common/persistence/versionhistory" "google.golang.org/protobuf/proto" ) +// PickRolloutSplit returns two distinct business IDs in the given namespace +// such that the first is accepted by RolloutAccepts at percent and the second +// is rejected. Fails the test if no split is found. +func PickRolloutSplit(t *testing.T, namespace string, percent int) (accepted, rejected string) { + t.Helper() + base := RandomizeStr("rollout") + for i := range 1000 { + if accepted != "" && rejected != "" { + break + } + id := fmt.Sprintf("%s-%d", base, i) + key := fmt.Appendf(nil, "%s\x00%s", namespace, id) + if dynamicconfig.RolloutAccepts(key, percent) { + if accepted == "" { + accepted = id + } + } else if rejected == "" { + rejected = id + } + } + require.NotEmpty(t, accepted, "could not find an ID accepted by rollout at %d%%", percent) + require.NotEmpty(t, rejected, "could not find an ID rejected by rollout at %d%%", percent) + return accepted, rejected +} + // TODO (alex): move this to functional_test_base.go as methods. func RandomizeStr(id string) string { From a48e1861a76851499a597274db3ef77515e530ed Mon Sep 17 00:00:00 2001 From: David Porter Date: Wed, 3 Jun 2026 00:02:52 -0700 Subject: [PATCH 2/8] Adds scheduleIdleCloseTime (#10439) ## What changed? This adds ScheduleCloseTime, the intent being it helps us identify scheduler problems through automatic scanners ## Why? There's a few cases where it's possible for the scheduler state-machines to transition to invalid states if bugs are present. Looking for the boundary at when we expect schedulers to fire helps us look for these invalid states. ## How did you test it? - [ ] built - [X] run locally and tested manually - [ ] covered by existing tests - [X] added new unit test(s) - [ ] added new functional test(s) ## Potential risks Should be fairly minimal, as again, this isn't a significant increase in the number of writes to visibility (cherry picked from commit 32984de6eda44988a722158ab2083f2c21036462) --- .../gen/schedulerpb/v1/message.pb.go | 78 +++++++++++-------- chasm/lib/scheduler/generator_tasks.go | 6 ++ chasm/lib/scheduler/generator_tasks_test.go | 41 ++++++++++ chasm/lib/scheduler/library.go | 1 + chasm/lib/scheduler/proto/v1/message.proto | 6 ++ chasm/lib/scheduler/scheduler.go | 5 ++ chasm/lib/scheduler/scheduler_test.go | 48 +++++++++++- 7 files changed, 152 insertions(+), 33 deletions(-) diff --git a/chasm/lib/scheduler/gen/schedulerpb/v1/message.pb.go b/chasm/lib/scheduler/gen/schedulerpb/v1/message.pb.go index ac27ad2e34b..81f66a4e995 100644 --- a/chasm/lib/scheduler/gen/schedulerpb/v1/message.pb.go +++ b/chasm/lib/scheduler/gen/schedulerpb/v1/message.pb.go @@ -49,8 +49,13 @@ type SchedulerState struct { // Set when a migration to workflow-backed scheduler (V1) is pending. // Unpause operations are blocked while this is set. WorkflowMigration *WorkflowMigrationState `protobuf:"bytes,11,opt,name=workflow_migration,json=workflowMigration,proto3" json:"workflow_migration,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + // The time at which the idle timer is scheduled to fire and close this + // schedule. Set when the schedule goes idle; cleared when it has work again. + // Surfaced as the ScheduleIdleCloseTime search attribute for stuck-schedule + // detection. + IdleCloseTime *timestamppb.Timestamp `protobuf:"bytes,12,opt,name=idle_close_time,json=idleCloseTime,proto3" json:"idle_close_time,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *SchedulerState) Reset() { @@ -146,6 +151,13 @@ func (x *SchedulerState) GetWorkflowMigration() *WorkflowMigrationState { return nil } +func (x *SchedulerState) GetIdleCloseTime() *timestamppb.Timestamp { + if x != nil { + return x.IdleCloseTime + } + return nil +} + // WorkflowMigrationState tracks the state of an in-progress V2-to-V1 migration. type WorkflowMigrationState struct { state protoimpl.MessageState `protogen:"open.v1"` @@ -700,7 +712,7 @@ var File_temporal_server_chasm_lib_scheduler_proto_v1_message_proto protoreflect const file_temporal_server_chasm_lib_scheduler_proto_v1_message_proto_rawDesc = "" + "\n" + - ":temporal/server/chasm/lib/scheduler/proto/v1/message.proto\x12,temporal.server.chasm.lib.scheduler.proto.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a$temporal/api/common/v1/message.proto\x1a%temporal/api/failure/v1/message.proto\x1a&temporal/api/schedule/v1/message.proto\x1a-temporal/server/api/schedule/v1/message.proto\"\xbe\x03\n" + + ":temporal/server/chasm/lib/scheduler/proto/v1/message.proto\x12,temporal.server.chasm.lib.scheduler.proto.v1\x1a\x1fgoogle/protobuf/timestamp.proto\x1a$temporal/api/common/v1/message.proto\x1a%temporal/api/failure/v1/message.proto\x1a&temporal/api/schedule/v1/message.proto\x1a-temporal/server/api/schedule/v1/message.proto\"\x82\x04\n" + "\x0eSchedulerState\x12>\n" + "\bschedule\x18\x02 \x01(\v2\".temporal.api.schedule.v1.ScheduleR\bschedule\x12:\n" + "\x04info\x18\x03 \x01(\v2&.temporal.api.schedule.v1.ScheduleInfoR\x04info\x12\x1c\n" + @@ -712,7 +724,8 @@ const file_temporal_server_chasm_lib_scheduler_proto_v1_message_proto_rawDesc = "\x06closed\x18\t \x01(\bR\x06closed\x12\x1a\n" + "\bsentinel\x18\n" + " \x01(\bR\bsentinel\x12s\n" + - "\x12workflow_migration\x18\v \x01(\v2D.temporal.server.chasm.lib.scheduler.proto.v1.WorkflowMigrationStateR\x11workflowMigration\"z\n" + + "\x12workflow_migration\x18\v \x01(\v2D.temporal.server.chasm.lib.scheduler.proto.v1.WorkflowMigrationStateR\x11workflowMigration\x12B\n" + + "\x0fidle_close_time\x18\f \x01(\v2\x1a.google.protobuf.TimestampR\ridleCloseTime\"z\n" + "\x16WorkflowMigrationState\x120\n" + "\x14pre_migration_paused\x18\x01 \x01(\bR\x12preMigrationPaused\x12.\n" + "\x13pre_migration_notes\x18\x02 \x01(\tR\x11preMigrationNotes\"\xa8\x01\n" + @@ -798,34 +811,35 @@ var file_temporal_server_chasm_lib_scheduler_proto_v1_message_proto_depIdxs = [] 12, // 0: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerState.schedule:type_name -> temporal.api.schedule.v1.Schedule 13, // 1: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerState.info:type_name -> temporal.api.schedule.v1.ScheduleInfo 1, // 2: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerState.workflow_migration:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.WorkflowMigrationState - 14, // 3: temporal.server.chasm.lib.scheduler.proto.v1.GeneratorState.last_processed_time:type_name -> google.protobuf.Timestamp - 14, // 4: temporal.server.chasm.lib.scheduler.proto.v1.GeneratorState.future_action_times:type_name -> google.protobuf.Timestamp - 15, // 5: temporal.server.chasm.lib.scheduler.proto.v1.InvokerState.buffered_starts:type_name -> temporal.server.api.schedule.v1.BufferedStart - 16, // 6: temporal.server.chasm.lib.scheduler.proto.v1.InvokerState.cancel_workflows:type_name -> temporal.api.common.v1.WorkflowExecution - 16, // 7: temporal.server.chasm.lib.scheduler.proto.v1.InvokerState.terminate_workflows:type_name -> temporal.api.common.v1.WorkflowExecution - 14, // 8: temporal.server.chasm.lib.scheduler.proto.v1.InvokerState.last_processed_time:type_name -> google.protobuf.Timestamp - 17, // 9: temporal.server.chasm.lib.scheduler.proto.v1.BackfillerState.backfill_request:type_name -> temporal.api.schedule.v1.BackfillRequest - 18, // 10: temporal.server.chasm.lib.scheduler.proto.v1.BackfillerState.trigger_request:type_name -> temporal.api.schedule.v1.TriggerImmediatelyRequest - 14, // 11: temporal.server.chasm.lib.scheduler.proto.v1.BackfillerState.last_processed_time:type_name -> google.protobuf.Timestamp - 19, // 12: temporal.server.chasm.lib.scheduler.proto.v1.LastCompletionResult.success:type_name -> temporal.api.common.v1.Payload - 20, // 13: temporal.server.chasm.lib.scheduler.proto.v1.LastCompletionResult.failure:type_name -> temporal.api.failure.v1.Failure - 0, // 14: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.scheduler_state:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.SchedulerState - 2, // 15: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.generator_state:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.GeneratorState - 3, // 16: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.invoker_state:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.InvokerState - 9, // 17: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.backfillers:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.BackfillersEntry - 5, // 18: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.last_completion_result:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.LastCompletionResult - 10, // 19: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.search_attributes:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.SearchAttributesEntry - 11, // 20: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.memo:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.MemoEntry - 8, // 21: temporal.server.chasm.lib.scheduler.proto.v1.EventLog.events:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.Event - 14, // 22: temporal.server.chasm.lib.scheduler.proto.v1.Event.time:type_name -> google.protobuf.Timestamp - 4, // 23: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.BackfillersEntry.value:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.BackfillerState - 19, // 24: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.SearchAttributesEntry.value:type_name -> temporal.api.common.v1.Payload - 19, // 25: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.MemoEntry.value:type_name -> temporal.api.common.v1.Payload - 26, // [26:26] is the sub-list for method output_type - 26, // [26:26] is the sub-list for method input_type - 26, // [26:26] is the sub-list for extension type_name - 26, // [26:26] is the sub-list for extension extendee - 0, // [0:26] is the sub-list for field type_name + 14, // 3: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerState.idle_close_time:type_name -> google.protobuf.Timestamp + 14, // 4: temporal.server.chasm.lib.scheduler.proto.v1.GeneratorState.last_processed_time:type_name -> google.protobuf.Timestamp + 14, // 5: temporal.server.chasm.lib.scheduler.proto.v1.GeneratorState.future_action_times:type_name -> google.protobuf.Timestamp + 15, // 6: temporal.server.chasm.lib.scheduler.proto.v1.InvokerState.buffered_starts:type_name -> temporal.server.api.schedule.v1.BufferedStart + 16, // 7: temporal.server.chasm.lib.scheduler.proto.v1.InvokerState.cancel_workflows:type_name -> temporal.api.common.v1.WorkflowExecution + 16, // 8: temporal.server.chasm.lib.scheduler.proto.v1.InvokerState.terminate_workflows:type_name -> temporal.api.common.v1.WorkflowExecution + 14, // 9: temporal.server.chasm.lib.scheduler.proto.v1.InvokerState.last_processed_time:type_name -> google.protobuf.Timestamp + 17, // 10: temporal.server.chasm.lib.scheduler.proto.v1.BackfillerState.backfill_request:type_name -> temporal.api.schedule.v1.BackfillRequest + 18, // 11: temporal.server.chasm.lib.scheduler.proto.v1.BackfillerState.trigger_request:type_name -> temporal.api.schedule.v1.TriggerImmediatelyRequest + 14, // 12: temporal.server.chasm.lib.scheduler.proto.v1.BackfillerState.last_processed_time:type_name -> google.protobuf.Timestamp + 19, // 13: temporal.server.chasm.lib.scheduler.proto.v1.LastCompletionResult.success:type_name -> temporal.api.common.v1.Payload + 20, // 14: temporal.server.chasm.lib.scheduler.proto.v1.LastCompletionResult.failure:type_name -> temporal.api.failure.v1.Failure + 0, // 15: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.scheduler_state:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.SchedulerState + 2, // 16: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.generator_state:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.GeneratorState + 3, // 17: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.invoker_state:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.InvokerState + 9, // 18: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.backfillers:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.BackfillersEntry + 5, // 19: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.last_completion_result:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.LastCompletionResult + 10, // 20: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.search_attributes:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.SearchAttributesEntry + 11, // 21: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.memo:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.MemoEntry + 8, // 22: temporal.server.chasm.lib.scheduler.proto.v1.EventLog.events:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.Event + 14, // 23: temporal.server.chasm.lib.scheduler.proto.v1.Event.time:type_name -> google.protobuf.Timestamp + 4, // 24: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.BackfillersEntry.value:type_name -> temporal.server.chasm.lib.scheduler.proto.v1.BackfillerState + 19, // 25: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.SearchAttributesEntry.value:type_name -> temporal.api.common.v1.Payload + 19, // 26: temporal.server.chasm.lib.scheduler.proto.v1.SchedulerMigrationState.MemoEntry.value:type_name -> temporal.api.common.v1.Payload + 27, // [27:27] is the sub-list for method output_type + 27, // [27:27] is the sub-list for method input_type + 27, // [27:27] is the sub-list for extension type_name + 27, // [27:27] is the sub-list for extension extendee + 0, // [0:27] is the sub-list for field type_name } func init() { file_temporal_server_chasm_lib_scheduler_proto_v1_message_proto_init() } diff --git a/chasm/lib/scheduler/generator_tasks.go b/chasm/lib/scheduler/generator_tasks.go index 2644f59ca05..7f25afb86b6 100644 --- a/chasm/lib/scheduler/generator_tasks.go +++ b/chasm/lib/scheduler/generator_tasks.go @@ -130,9 +130,15 @@ func (g *GeneratorTaskHandler) Execute( }, &schedulerpb.SchedulerIdleTask{ IdleTimeTotal: durationpb.New(idleTimeTotal), }) + // Record the idle-close deadline so it can be surfaced as the + // ScheduleIdleCloseTime search attribute for stuck-schedule detection. + scheduler.IdleCloseTime = timestamppb.New(idleExpiration) return nil } + // The schedule has work again, so it's no longer pending an idle close. + scheduler.IdleCloseTime = nil + // No more tasks if we're paused. if scheduler.Schedule.State.Paused { return nil diff --git a/chasm/lib/scheduler/generator_tasks_test.go b/chasm/lib/scheduler/generator_tasks_test.go index 21f18a527cc..117126e5fe1 100644 --- a/chasm/lib/scheduler/generator_tasks_test.go +++ b/chasm/lib/scheduler/generator_tasks_test.go @@ -134,6 +134,47 @@ func TestGeneratorTask_UpdateFutureActionTimes_LimitedActions(t *testing.T) { require.Len(t, generator.FutureActionTimes, 2) } +func TestGeneratorTask_Idle_SetsIdleCloseTime(t *testing.T) { + env := newTestEnv(t) + handler := newGeneratorHandler(env) + + ctx := env.MutableContext() + sched := env.Scheduler + generator := sched.Generator.Get(ctx) + + // Exhaust the schedule's actions so it goes idle (no more work to do). + sched.Schedule.State.LimitedActions = true + sched.Schedule.State.RemainingActions = 0 + + err := handler.Execute(ctx, generator, chasm.TaskAttributes{}, &schedulerpb.GeneratorTask{}) + require.NoError(t, err) + + // The idle-close deadline must be recorded so it can be surfaced as the + // ScheduleIdleCloseTime search attribute. + require.NotNil(t, sched.IdleCloseTime, "expected IdleCloseTime to be set once idle") + require.True(t, sched.IdleCloseTime.AsTime().After(ctx.Now(generator)), + "expected idle-close deadline in the future, got %v", sched.IdleCloseTime.AsTime()) +} + +func TestGeneratorTask_NonIdle_ClearsIdleCloseTime(t *testing.T) { + env := newTestEnv(t) + handler := newGeneratorHandler(env) + + ctx := env.MutableContext() + sched := env.Scheduler + generator := sched.Generator.Get(ctx) + + // Simulate a stale deadline left over from a prior idle period. + sched.IdleCloseTime = timestamppb.New(ctx.Now(generator).Add(7 * 24 * time.Hour)) + + // A normal execution generates future actions, so the schedule is not idle. + err := handler.Execute(ctx, generator, chasm.TaskAttributes{}, &schedulerpb.GeneratorTask{}) + require.NoError(t, err) + + require.NotEmpty(t, generator.FutureActionTimes) + require.Nil(t, sched.IdleCloseTime, "expected IdleCloseTime to be cleared when the schedule has work") +} + func TestGeneratorTask_UpdateFutureActionTimes_SkipsBeforeUpdateTime(t *testing.T) { env := newTestEnv(t) handler := newGeneratorHandler(env) diff --git a/chasm/lib/scheduler/library.go b/chasm/lib/scheduler/library.go index cd575ba8242..a0c6855d37e 100644 --- a/chasm/lib/scheduler/library.go +++ b/chasm/lib/scheduler/library.go @@ -62,6 +62,7 @@ func (l *Library) Components() []*chasm.RegistrableComponent { chasm.WithSearchAttributes( executionStatusSearchAttribute, scheduleNextActionTimeSearchAttribute, + scheduleIdleCloseTimeSearchAttribute, ), ), chasm.NewRegistrableComponent[*Generator]("generator"), diff --git a/chasm/lib/scheduler/proto/v1/message.proto b/chasm/lib/scheduler/proto/v1/message.proto index 8591f802b2a..1663daa23ea 100644 --- a/chasm/lib/scheduler/proto/v1/message.proto +++ b/chasm/lib/scheduler/proto/v1/message.proto @@ -36,6 +36,12 @@ message SchedulerState { // Set when a migration to workflow-backed scheduler (V1) is pending. // Unpause operations are blocked while this is set. WorkflowMigrationState workflow_migration = 11; + + // The time at which the idle timer is scheduled to fire and close this + // schedule. Set when the schedule goes idle; cleared when it has work again. + // Surfaced as the ScheduleIdleCloseTime search attribute for stuck-schedule + // detection. + google.protobuf.Timestamp idle_close_time = 12; } // WorkflowMigrationState tracks the state of an in-progress V2-to-V1 migration. diff --git a/chasm/lib/scheduler/scheduler.go b/chasm/lib/scheduler/scheduler.go index 72bf4150da4..37a910a939a 100644 --- a/chasm/lib/scheduler/scheduler.go +++ b/chasm/lib/scheduler/scheduler.go @@ -69,10 +69,12 @@ var ( ) const ScheduleNextActionTimeName = "ScheduleNextActionTime" +const ScheduleIdleCloseTimeName = "ScheduleIdleCloseTime" var ( executionStatusSearchAttribute = chasm.NewSearchAttributeKeyword("ExecutionStatus", chasm.SearchAttributeFieldLowCardinalityKeyword01) scheduleNextActionTimeSearchAttribute = chasm.NewSearchAttributeDateTime(ScheduleNextActionTimeName, chasm.SearchAttributeFieldDateTime01) + scheduleIdleCloseTimeSearchAttribute = chasm.NewSearchAttributeDateTime(ScheduleIdleCloseTimeName, chasm.SearchAttributeFieldDateTime02) ) var initialSerializedConflictToken = serializeConflictToken(scheduler.InitialConflictToken) @@ -922,6 +924,9 @@ func (s *Scheduler) SearchAttributes(ctx chasm.Context) []chasm.SearchAttributeK if gen := s.Generator.Get(ctx); len(gen.FutureActionTimes) > 0 { out = append(out, scheduleNextActionTimeSearchAttribute.Value(gen.FutureActionTimes[0].AsTime())) } + if s.IdleCloseTime != nil { + out = append(out, scheduleIdleCloseTimeSearchAttribute.Value(s.IdleCloseTime.AsTime())) + } } return out } diff --git a/chasm/lib/scheduler/scheduler_test.go b/chasm/lib/scheduler/scheduler_test.go index 811da88df7c..a2e17a43c38 100644 --- a/chasm/lib/scheduler/scheduler_test.go +++ b/chasm/lib/scheduler/scheduler_test.go @@ -412,6 +412,51 @@ func TestSearchAttributes_NextActionTime(t *testing.T) { }) } +func TestSearchAttributes_IdleCloseTime(t *testing.T) { + + t.Run("idle schedule emits IdleCloseTime", func(t *testing.T) { + sched, ctx, _ := setupSchedulerForTest(t) + + closeTime := time.Now().Add(7 * 24 * time.Hour).UTC().Truncate(time.Second) + sched.IdleCloseTime = timestamppb.New(closeTime) + + sas := sched.SearchAttributes(ctx) + val, ok := findSearchAttribute(t, sas, scheduler.ScheduleIdleCloseTimeName) + require.True(t, ok, "expected %s to be present", scheduler.ScheduleIdleCloseTimeName) + require.True(t, closeTime.Equal(val.(time.Time)), "want %v, got %v", closeTime, val) + }) + + t.Run("schedule with no idle deadline does not emit", func(t *testing.T) { + sched, ctx, _ := setupSchedulerForTest(t) + + sched.IdleCloseTime = nil + + sas := sched.SearchAttributes(ctx) + _, has := findSearchAttribute(t, sas, scheduler.ScheduleIdleCloseTimeName) + require.False(t, has, "expected %s to be absent when not idle", scheduler.ScheduleIdleCloseTimeName) + }) + + t.Run("closed does not emit IdleCloseTime", func(t *testing.T) { + sched, ctx, _ := setupSchedulerForTest(t) + + sched.Closed = true + // A deadline left over from before close must not leak through. + sched.IdleCloseTime = timestamppb.New(time.Now().Add(7 * 24 * time.Hour)) + + sas := sched.SearchAttributes(ctx) + _, has := findSearchAttribute(t, sas, scheduler.ScheduleIdleCloseTimeName) + require.False(t, has, "expected %s to be absent once closed", scheduler.ScheduleIdleCloseTimeName) + }) + + t.Run("sentinel does not emit", func(t *testing.T) { + sentinel, ctx, _ := setupSentinelForTest(t) + + sas := sentinel.SearchAttributes(ctx) + _, has := findSearchAttribute(t, sas, scheduler.ScheduleIdleCloseTimeName) + require.False(t, has) + }) +} + func findSearchAttribute(t *testing.T, sas []chasm.SearchAttributeKeyValue, alias string) (any, bool) { t.Helper() for _, sa := range sas { @@ -435,8 +480,9 @@ func TestSearchAttributes_RoundTripThroughCloseTransaction(t *testing.T) { nextAction := time.Now().Add(time.Hour).UTC().Truncate(time.Second) generator := sched.Generator.Get(ctx) generator.FutureActionTimes = []*timestamppb.Timestamp{timestamppb.New(nextAction)} + sched.IdleCloseTime = timestamppb.New(time.Now().Add(7 * 24 * time.Hour)) require.NoError(t, node.SetRootComponent(sched)) _, err := node.CloseTransaction() - require.NoError(t, err, "CloseTransaction should accept TemporalScheduleNextActionTime") + require.NoError(t, err, "CloseTransaction should accept the scheduler search attributes") } From a1d91db104cb4a9629f969cc20abdd51b083e942 Mon Sep 17 00:00:00 2001 From: Lina Jodoin Date: Thu, 4 Jun 2026 09:15:21 -0700 Subject: [PATCH 3/8] [Scheduler] Plumb Tweakables through Scheduler's CHASM context values, use in EventLog (#10517) ## What changed? - Tweakables is now available for access within any Scheduler function that has a CHASM context. - EventLog now uses the Tweakables from CHASM context to determine dynamic config values. ## Why? - It was looking real ugly to plumb through those numbers and that dynamic config everywhere else throughout. ## How did you test it? - [ ] built - [ ] run locally and tested manually - [ ] covered by existing tests - [x] added new unit test(s) - [ ] added new functional test(s) (cherry picked from commit 6fc519f7c3e66f5f093e658baf4711d39d51295f) --- chasm/lib/scheduler/config.go | 25 +++++++++++++++++++ chasm/lib/scheduler/eventlog.go | 11 ++++++--- chasm/lib/scheduler/eventlog_test.go | 34 ++++++++++++++++++++------ chasm/lib/scheduler/generator_tasks.go | 3 +-- chasm/lib/scheduler/helper_test.go | 3 +++ chasm/lib/scheduler/invoker.go | 4 +++ chasm/lib/scheduler/library.go | 6 +++++ 7 files changed, 73 insertions(+), 13 deletions(-) diff --git a/chasm/lib/scheduler/config.go b/chasm/lib/scheduler/config.go index df40ab4739b..3d57c24bee3 100644 --- a/chasm/lib/scheduler/config.go +++ b/chasm/lib/scheduler/config.go @@ -3,6 +3,7 @@ package scheduler import ( "time" + "go.temporal.io/server/chasm" "go.temporal.io/server/common/backoff" "go.temporal.io/server/common/dynamicconfig" ) @@ -28,6 +29,30 @@ type ( } ) +// tweakablesCtxKey keys the namespace-filtered Tweakables accessor that scheduler +// components read via the CHASM context (registered in Library.Components). +type tweakablesCtxKeyType struct{} + +var tweakablesCtxKey = tweakablesCtxKeyType{} + +// tweakablesFromContext returns the scheduler Tweakables for the context's namespace, +// falling back to DefaultTweakables when no config is registered. +func tweakablesFromContext(ctx chasm.Context) Tweakables { + if fn, ok := ctx.Value(tweakablesCtxKey).(dynamicconfig.TypedPropertyFnWithNamespaceFilter[Tweakables]); ok && fn != nil { + return fn(ctx.NamespaceEntry().Name().String()) + } + return DefaultTweakables +} + +// contextValues builds the CHASM context values exposed to scheduler components. +func (c *Config) contextValues() map[any]any { + var tweakables dynamicconfig.TypedPropertyFnWithNamespaceFilter[Tweakables] + if c != nil { + tweakables = c.Tweakables + } + return map[any]any{tweakablesCtxKey: tweakables} +} + var ( CurrentTweakables = dynamicconfig.NewNamespaceTypedSetting( "scheduler.tweakables", diff --git a/chasm/lib/scheduler/eventlog.go b/chasm/lib/scheduler/eventlog.go index 8a97d996cc1..91e35007aa7 100644 --- a/chasm/lib/scheduler/eventlog.go +++ b/chasm/lib/scheduler/eventlog.go @@ -30,10 +30,13 @@ func (e *EventLog) LifecycleState(ctx chasm.Context) chasm.LifecycleState { return chasm.LifecycleStateRunning } -// LogEvent appends an event with the given message. Messages longer than -// maxMessageLen bytes are truncated at a UTF-8 rune boundary; once the log -// has more than maxEntries entries, the earliest entries are dropped. -func (e *EventLog) LogEvent(ctx chasm.MutableContext, msg string, maxEntries, maxMessageLen int) { +// LogEvent appends an event with the given message. Messages longer than the +// configured maximum length are truncated at a UTF-8 rune boundary; once the +// log exceeds the configured maximum entries, the earliest entries are dropped. +func (e *EventLog) LogEvent(ctx chasm.MutableContext, msg string) { + tw := tweakablesFromContext(ctx) + maxEntries, maxMessageLen := tw.EventLogMaxEntries, tw.EventLogMaxMessageLen + if len(msg) > maxMessageLen { // Back off to the nearest UTF-8 rune boundary so we don't split a // multibyte rune. diff --git a/chasm/lib/scheduler/eventlog_test.go b/chasm/lib/scheduler/eventlog_test.go index 64397077018..c6913e8f1e7 100644 --- a/chasm/lib/scheduler/eventlog_test.go +++ b/chasm/lib/scheduler/eventlog_test.go @@ -9,12 +9,14 @@ import ( "github.com/stretchr/testify/require" enumspb "go.temporal.io/api/enums/v1" schedulepb "go.temporal.io/api/schedule/v1" + "go.temporal.io/server/chasm" + "go.temporal.io/server/chasm/lib/scheduler" "google.golang.org/protobuf/types/known/timestamppb" ) -const ( - testEventLogMaxEntries = 30 - testEventLogMaxMessageLen = 1000 +var ( + testEventLogMaxEntries = scheduler.DefaultTweakables.EventLogMaxEntries + testEventLogMaxMessageLen = scheduler.DefaultTweakables.EventLogMaxMessageLen ) func TestEventLog_Accumulates(t *testing.T) { @@ -24,7 +26,7 @@ func TestEventLog_Accumulates(t *testing.T) { messages := []string{"first", "second", "third"} for _, m := range messages { - eventLog.LogEvent(ctx, m, testEventLogMaxEntries, testEventLogMaxMessageLen) + eventLog.LogEvent(ctx, m) } require.Len(t, eventLog.Events, len(messages)) @@ -40,7 +42,7 @@ func TestEventLog_TruncatesLongMessages(t *testing.T) { eventLog.Events = nil long := strings.Repeat("x", testEventLogMaxMessageLen+50) - eventLog.LogEvent(ctx, long, testEventLogMaxEntries, testEventLogMaxMessageLen) + eventLog.LogEvent(ctx, long) require.Len(t, eventLog.Events, 1) require.Len(t, eventLog.Events[0].Message, testEventLogMaxMessageLen) @@ -55,7 +57,7 @@ func TestEventLog_DropsEarliestWhenFull(t *testing.T) { const overflow = 5 total := testEventLogMaxEntries + overflow for i := range total { - eventLog.LogEvent(ctx, fmt.Sprintf("event-%d", i), testEventLogMaxEntries, testEventLogMaxMessageLen) + eventLog.LogEvent(ctx, fmt.Sprintf("event-%d", i)) } require.Len(t, eventLog.Events, testEventLogMaxEntries) @@ -65,6 +67,24 @@ func TestEventLog_DropsEarliestWhenFull(t *testing.T) { require.Equal(t, fmt.Sprintf("event-%d", total-1), eventLog.Events[len(eventLog.Events)-1].Message) } +// TestEventLog_NoConfigFallsBackToDefaults checks that LogEvent applies the +// default retention limits instead of panicking when no config is reachable via +// the context, as in tdbg's registration-only setup. +func TestEventLog_NoConfigFallsBackToDefaults(t *testing.T) { + ctx := &chasm.MockMutableContext{} + eventLog := scheduler.NewEventLog(ctx) + + require.NotPanics(t, func() { + eventLog.LogEvent(ctx, "logged without a registered config") + }) + require.Len(t, eventLog.Events, 1) + + long := strings.Repeat("x", scheduler.DefaultTweakables.EventLogMaxMessageLen+50) + eventLog.LogEvent(ctx, long) + require.Len(t, eventLog.Events, 2) + require.Len(t, eventLog.Events[1].Message, scheduler.DefaultTweakables.EventLogMaxMessageLen) +} + func TestEventLog_EachComponentHasOwn(t *testing.T) { sched, ctx, _ := setupSchedulerForTest(t) @@ -86,7 +106,7 @@ func TestEventLog_EachComponentHasOwn(t *testing.T) { sched.Generator.Get(ctx).EventLog.Get(ctx).Events = nil backfiller.EventLog.Get(ctx).Events = nil - sched.EventLog.Get(ctx).LogEvent(ctx, "scheduler-event", testEventLogMaxEntries, testEventLogMaxMessageLen) + sched.EventLog.Get(ctx).LogEvent(ctx, "scheduler-event") require.Len(t, sched.EventLog.Get(ctx).Events, 1) require.Empty(t, sched.Generator.Get(ctx).EventLog.Get(ctx).Events) require.Empty(t, backfiller.EventLog.Get(ctx).Events) diff --git a/chasm/lib/scheduler/generator_tasks.go b/chasm/lib/scheduler/generator_tasks.go index 7f25afb86b6..3d2172faad5 100644 --- a/chasm/lib/scheduler/generator_tasks.go +++ b/chasm/lib/scheduler/generator_tasks.go @@ -154,8 +154,7 @@ func (g *GeneratorTaskHandler) logSchedule(ctx chasm.MutableContext, logger log. spec := jsonStringer{sched.Schedule.Spec} policies := jsonStringer{sched.Schedule.Policies} - tw := g.config.Tweakables(sched.Namespace) - generator.EventLog.Get(ctx).LogEvent(ctx, fmt.Sprintf("%s:\nSpec: %s\nPolicies: %s\n", msg, spec, policies), tw.EventLogMaxEntries, tw.EventLogMaxMessageLen) + generator.EventLog.Get(ctx).LogEvent(ctx, fmt.Sprintf("%s:\nSpec: %s\nPolicies: %s\n", msg, spec, policies)) logger.Info(msg, tag.Stringer("spec", spec), tag.Stringer("policies", policies)) diff --git a/chasm/lib/scheduler/helper_test.go b/chasm/lib/scheduler/helper_test.go index f28ff945cec..ac38659ebce 100644 --- a/chasm/lib/scheduler/helper_test.go +++ b/chasm/lib/scheduler/helper_test.go @@ -88,6 +88,7 @@ func newTestLibrary(logger log.Logger, specProcessor scheduler.SpecProcessor) *s SpecProcessor: specProcessor, } return scheduler.NewLibrary( + config, nil, scheduler.NewSchedulerIdleTaskHandler(scheduler.SchedulerIdleTaskHandlerOptions{ Config: config, @@ -209,6 +210,7 @@ func newTestEnv(t *testing.T, opts ...testEnvOption) *testEnv { HandleGetCurrentVersion: func() int64 { return 1 }, HandleGetWorkflowKey: tv.Any().WorkflowKey, HandleIsWorkflow: func() bool { return false }, + HandleGetNamespaceEntry: tv.Namespace, HandleCurrentVersionedTransition: func() *persistencespb.VersionedTransition { return &persistencespb.VersionedTransition{ NamespaceFailoverVersion: 1, @@ -346,6 +348,7 @@ func setupTestInfra(t *testing.T, specProcessor scheduler.SpecProcessor) *testIn nodeBackend.HandleGetCurrentVersion = func() int64 { return 1 } nodeBackend.HandleGetWorkflowKey = tv.Any().WorkflowKey nodeBackend.HandleIsWorkflow = func() bool { return false } + nodeBackend.HandleGetNamespaceEntry = tv.Namespace nodeBackend.HandleCurrentVersionedTransition = func() *persistencespb.VersionedTransition { return &persistencespb.VersionedTransition{ NamespaceFailoverVersion: 1, diff --git a/chasm/lib/scheduler/invoker.go b/chasm/lib/scheduler/invoker.go index 39090c53140..a7181993bbc 100644 --- a/chasm/lib/scheduler/invoker.go +++ b/chasm/lib/scheduler/invoker.go @@ -1,6 +1,7 @@ package scheduler import ( + "fmt" "slices" "time" @@ -49,6 +50,9 @@ func newInvokerWithState(ctx chasm.MutableContext, state *schedulerpb.InvokerSta // immediately kicking off a processing task. func (i *Invoker) EnqueueBufferedStarts(ctx chasm.MutableContext, starts []*schedulespb.BufferedStart) { i.BufferedStarts = append(i.BufferedStarts, starts...) + if len(starts) > 0 { + i.EventLog.Get(ctx).LogEvent(ctx, fmt.Sprintf("enqueued %d buffered start(s)", len(starts))) + } i.addTasks(ctx) } diff --git a/chasm/lib/scheduler/library.go b/chasm/lib/scheduler/library.go index a0c6855d37e..e200b52cb12 100644 --- a/chasm/lib/scheduler/library.go +++ b/chasm/lib/scheduler/library.go @@ -10,6 +10,7 @@ type ( Library struct { chasm.UnimplementedLibrary + config *Config handler *handler SchedulerIdleTaskHandler *SchedulerIdleTaskHandler @@ -29,6 +30,7 @@ func NewNilLibrary() *Library { } func NewLibrary( + config *Config, handler *handler, SchedulerIdleTaskHandler *SchedulerIdleTaskHandler, SchedulerCallbacksTaskHandler *SchedulerCallbacksTaskHandler, @@ -39,6 +41,7 @@ func NewLibrary( MigrateToWorkflowTaskHandler *SchedulerMigrateToWorkflowTaskHandler, ) *Library { return &Library{ + config: config, handler: handler, SchedulerIdleTaskHandler: SchedulerIdleTaskHandler, SchedulerCallbacksTaskHandler: SchedulerCallbacksTaskHandler, @@ -64,6 +67,9 @@ func (l *Library) Components() []*chasm.RegistrableComponent { scheduleNextActionTimeSearchAttribute, scheduleIdleCloseTimeSearchAttribute, ), + // Exposes Tweakables to scheduler components via the CHASM context + // (see tweakablesFromContext). + chasm.WithContextValues(l.config.contextValues()), ), chasm.NewRegistrableComponent[*Generator]("generator"), chasm.NewRegistrableComponent[*Invoker]("invoker"), From 709df67a4b9476c7ddabbe7f63cef829f0c6430e Mon Sep 17 00:00:00 2001 From: Lina Jodoin Date: Thu, 4 Jun 2026 10:42:58 -0700 Subject: [PATCH 4/8] [Scheduler] Rework and instrument CHASM scheduler tasks, add a bunch of coverage (#10369) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reworks CHASM scheduler's tasks to harden them against being silently dropped, and fixes several edge-case bugs we've recently hit. Most of these only surface in combination, so the fixes land together. The logging here is just a few breadcrumbs — once the EventLog PR merges I'll do a full logging/eventlogging pass, so don't read too much into the individual log statements. ## What changed ### Generator - `GeneratorTask` now keeps running while paused (advances the HWM, updates `FutureActionTimes`, drops actions at the source) instead of stopping — an improvement over V1. - New `isManualOnly` / `isHeldOpen` helpers carve out schedules with no automated wakeup source (empty spec, manual-only, pending backfill) so the idle timer can't silently close them. ### Idle task - Reworked around `isHeldOpen` plus an `.After()` deadline compare, so sub-precision drift no longer drops tasks that should still fire. - Fixes a non-monotonicity bug in `recentActions` (a closing workflow could regress observed time). ### Invoker - `ExecuteTask` captures the framework clock at read time and threads it through `applyBackoff`, keeping `BackoffTime` consistent with `LastProcessedTime` (no more wall-clock drift in tests/replay). - `ProcessBufferTask` checks the catchup window before consuming a `LimitedActions` slot. - `recordExecuteResult` is idempotent per `RequestId`: a duplicate `CompletedStart` / `RetryableStart` whose live entry already has a `RunId` is dropped, without stomping `RunId`/`StartTime`/`HasCallback` or rewinding `Attempt`/`BackoffTime`. - `addTasks` is consolidated into a single-pass classifier; the broadened re-arm gate fixes the stranded-retry bug. ### Backfiller - `hasMoreAllowAllBackfills` was a misreading of the original code and is gone — any pending backfill (not just `ALLOW_ALL`) holds a paused schedule open while idling out. - On completion, the backfiller revives the Generator so it can re-evaluate idle/close eligibility. ## Tests This PR carries only the minimal tests that go red→green with the fixes: - **Functional** (`tests/schedule_test.go`): `TestPausedDropsCatchup` and `TestFutureActionTimesAdvanceWhilePaused` — both fail on `main`; the always-on Generator (advancing the HWM while paused) is what turns them green. - **Unit** (`chasm/lib/scheduler/`): targeted guards for each invoker fix — idempotency, framework-clock, eligibility boundary, re-arm, and catchup-before-limit — plus the modifications to pre-existing idle/sentinel tests the rework requires. Additional regression testing will live in a separate PR. ## Risks This touches a fair amount of scheduler surface area, but the fixes are individually validated and the diff is now mostly the rework itself. (cherry picked from commit e9ae575bfd00e0a530fa47bb1793edd17a2e45d3) --- chasm/lib/scheduler/backfiller_tasks.go | 11 +- chasm/lib/scheduler/export_test.go | 13 ++ chasm/lib/scheduler/generator_tasks.go | 29 ++- chasm/lib/scheduler/helper_test.go | 4 +- chasm/lib/scheduler/invoker.go | 93 ++++++---- .../scheduler/invoker_execute_task_test.go | 163 +++++++++++++++++ .../invoker_process_buffer_task_test.go | 99 +++++++++++ chasm/lib/scheduler/invoker_tasks.go | 88 ++++----- chasm/lib/scheduler/scheduler.go | 85 ++++----- .../scheduler/scheduler_idle_tasks_test.go | 106 ++++++----- chasm/lib/scheduler/scheduler_tasks.go | 54 ++++-- chasm/lib/scheduler/sentinel_test.go | 42 ++--- docs/architecture/schedules.md | 71 +++++--- tests/schedule_test.go | 167 ++++++++++++++++++ 14 files changed, 754 insertions(+), 271 deletions(-) diff --git a/chasm/lib/scheduler/backfiller_tasks.go b/chasm/lib/scheduler/backfiller_tasks.go index fa5380b1fa3..dd1484c18a8 100644 --- a/chasm/lib/scheduler/backfiller_tasks.go +++ b/chasm/lib/scheduler/backfiller_tasks.go @@ -45,15 +45,12 @@ func NewBackfillerTaskHandler(opts BackfillerTaskHandlerOptions) *BackfillerTask } func (b *BackfillerTaskHandler) Validate( - ctx chasm.Context, + _ chasm.Context, backfiller *Backfiller, attrs chasm.TaskAttributes, _ *schedulerpb.BackfillerTask, ) (bool, error) { - return validateTaskHighWaterMark( - backfiller.GetLastProcessedTime(), - attrs.ScheduledTime, - ) + return validateTaskHighWaterMark(backfiller.GetLastProcessedTime(), attrs.ScheduledTime) } func (b *BackfillerTaskHandler) Execute( @@ -110,6 +107,10 @@ func (b *BackfillerTaskHandler) Execute( logger.Debug("backfill complete, deleting Backfiller", tag.String("backfill-id", backfiller.GetBackfillId())) delete(scheduler.Backfillers, backfiller.GetBackfillId()) + + // Revive the Generator so it can re-evaluate idle/close eligibility now + // that this backfiller is gone. + scheduler.Generator.Get(ctx).Generate(ctx) return nil } diff --git a/chasm/lib/scheduler/export_test.go b/chasm/lib/scheduler/export_test.go index 0995ae1a056..f8522fc53d3 100644 --- a/chasm/lib/scheduler/export_test.go +++ b/chasm/lib/scheduler/export_test.go @@ -37,3 +37,16 @@ func (s *Scheduler) RecordCompletedAction( func (i *Invoker) RunningWorkflowID(requestID string) string { return i.runningWorkflowID(requestID) } + +// RecordExecuteResult exposes recordExecuteResult so tests can pin the +// per-RequestId idempotency guard against concurrent ExecuteTasks. +func (i *Invoker) RecordExecuteResult( + ctx chasm.MutableContext, + completed []*schedulespb.BufferedStart, + retryable []*schedulespb.BufferedStart, +) (newlyStarted, droppedDuplicates int) { + return i.recordExecuteResult(ctx, &executeResult{ + CompletedStarts: completed, + RetryableStarts: retryable, + }) +} diff --git a/chasm/lib/scheduler/generator_tasks.go b/chasm/lib/scheduler/generator_tasks.go index 3d2172faad5..ddc9dfcfb7b 100644 --- a/chasm/lib/scheduler/generator_tasks.go +++ b/chasm/lib/scheduler/generator_tasks.go @@ -84,6 +84,8 @@ func (g *GeneratorTaskHandler) Execute( t2 = t1 } + // Generate BufferedStarts and determine the next HWM. Actions are skipped when + // they can't be taken (paused, or limited and without any remaining actions). result, err := g.SpecProcessor.ProcessTimeRange( scheduler, t1, t2, @@ -116,12 +118,19 @@ func (g *GeneratorTaskHandler) Execute( generator.LastProcessedTime = timestamppb.New(result.LastActionTime) generator.UpdateFutureActionTimes(ctx, g.specBuilder) - // Check if the schedule has gone idle. + // Schedule the next timer task. Three outcomes are possible: + // - isIdle: the schedule is done; arm the idle task to close it. + // - NextWakeupTime is set: arm the next generator tick. + // - Neither: Hold open without a task. This requires both that + // isHeldOpen is true (paused or backfill pending) AND that no spec + // wakeup is available, e.g. a paused manual-only schedule. IdleTime=0 + // also lands here. An external trigger (Patch.Unpause, Update, or a + // BackfillerTask's completion-time Generate call) revives us. idleTimeTotal := g.config.Tweakables(scheduler.Namespace).IdleTime idleExpiration, isIdle := scheduler.getIdleExpiration(ctx, idleTimeTotal, result.NextWakeupTime) if isIdle { // Schedule is complete, no need for another buffer task. We keep the schedule's - // backing mutable state explicitly open for a the idle period, during which the + // backing mutable state explicitly open for the idle period, during which the // customer can describe/modify/restart the schedule. // // Once the idle timer expires, we close the component. @@ -133,19 +142,21 @@ func (g *GeneratorTaskHandler) Execute( // Record the idle-close deadline so it can be surfaced as the // ScheduleIdleCloseTime search attribute for stuck-schedule detection. scheduler.IdleCloseTime = timestamppb.New(idleExpiration) + return nil } - // The schedule has work again, so it's no longer pending an idle close. + // Not idle: the schedule has work again (or is being held open), so it's + // no longer pending an idle close. scheduler.IdleCloseTime = nil - // No more tasks if we're paused. - if scheduler.Schedule.State.Paused { - return nil + if !result.NextWakeupTime.IsZero() { + // Keep the generator task perpetually scheduled. When paused, the next + // fire will simply advance the HWM without appending actions (handled in + // ProcessTimeRange). + generator.scheduleTask(ctx, result.NextWakeupTime) } - - // Another buffering task is added if we aren't completely out of actions or paused. - generator.scheduleTask(ctx, result.NextWakeupTime) + // else: hold open without a task; see the comment block above. return nil } diff --git a/chasm/lib/scheduler/helper_test.go b/chasm/lib/scheduler/helper_test.go index ac38659ebce..32a4e52dc5f 100644 --- a/chasm/lib/scheduler/helper_test.go +++ b/chasm/lib/scheduler/helper_test.go @@ -91,7 +91,9 @@ func newTestLibrary(logger log.Logger, specProcessor scheduler.SpecProcessor) *s config, nil, scheduler.NewSchedulerIdleTaskHandler(scheduler.SchedulerIdleTaskHandlerOptions{ - Config: config, + Config: config, + MetricsHandler: metrics.NoopMetricsHandler, + BaseLogger: logger, }), scheduler.NewSchedulerCallbacksTaskHandler(scheduler.SchedulerCallbacksTaskHandlerOptions{ Config: config, diff --git a/chasm/lib/scheduler/invoker.go b/chasm/lib/scheduler/invoker.go index a7181993bbc..9c568a8b3a7 100644 --- a/chasm/lib/scheduler/invoker.go +++ b/chasm/lib/scheduler/invoker.go @@ -95,8 +95,9 @@ func (i *Invoker) recordProcessBufferResult(ctx chasm.MutableContext, result *pr start.Attempt = 1 } else if start.Attempt == 0 { // Start was processed but deferred (e.g., BUFFER_ONE policy with running workflow). - // Mark as deferred (-1) to distinguish from newly-enqueued starts. This prevents - // processingDeadline() from scheduling an immediate task for deferred starts. + // Mark as deferred (-1) to distinguish from newly-enqueued starts so addTasks + // won't schedule an immediate ProcessBuffer task for them - they wait on + // recordCompletedAction to re-enable. start.Attempt = -1 } @@ -109,15 +110,9 @@ func (i *Invoker) recordProcessBufferResult(ctx chasm.MutableContext, result *pr i.TerminateWorkflows = append(i.GetTerminateWorkflows(), result.terminateWorkflows...) i.LastProcessedTime = timestamppb.New(ctx.Now(i)) - // Only schedule new tasks if this processBuffer call actually did something. - // This prevents duplicate task scheduling when multiple ProcessBuffer tasks - // run in the same transaction (e.g., from multiple backfillers). - if len(result.startWorkflows) > 0 || - len(result.discardStarts) > 0 || - len(result.cancelWorkflows) > 0 || - len(result.terminateWorkflows) > 0 { - i.addTasks(ctx) - } + // Re-arm tasks if this call changed state, or if the LastProcessedTime advance + // just unblocked backed-off starts. + i.addTasks(ctx) } type executeResult struct { @@ -147,8 +142,11 @@ func (e *executeResult) Append(o executeResult) executeResult { } // recordExecuteResult updates the Invoker's internal state with the results of a -// completed InvokerExecuteTask. Tasks to continue execution are added, if needed. -func (i *Invoker) recordExecuteResult(ctx chasm.MutableContext, result *executeResult) { +// completed InvokerExecuteTask. It returns the number of *new* actions recorded +// (starts that transitioned from "no RunId" to "has RunId" in this call) and +// the number of completed results that were dropped because they were previously +// recorded. +func (i *Invoker) recordExecuteResult(ctx chasm.MutableContext, result *executeResult) (newlyStarted, droppedDuplicates int) { completed := make(map[string]*schedulespb.BufferedStart) // request ID -> BufferedStart with RunId/StartTime failed := make(map[string]bool) // request ID -> is present retryable := make(map[string]*schedulespb.BufferedStart) // request ID -> *BufferedStart @@ -182,12 +180,19 @@ func (i *Invoker) recordExecuteResult(ctx chasm.MutableContext, result *executeR return terminated[we.RunId] }) - // Update BufferedStarts with results. + // Update BufferedStarts with results, dropping duplicates. for _, start := range i.GetBufferedStarts() { + if start.RunId != "" { + if _, isDuplicate := completed[start.RequestId]; isDuplicate { + droppedDuplicates++ + } + continue + } if completedStart, ok := completed[start.RequestId]; ok { start.RunId = completedStart.GetRunId() start.StartTime = completedStart.GetStartTime() start.HasCallback = true + newlyStarted++ } if retry, ok := retryable[start.RequestId]; ok { start.Attempt++ @@ -195,8 +200,8 @@ func (i *Invoker) recordExecuteResult(ctx chasm.MutableContext, result *executeR } } - // Add tasks if other actions are backing off or still pending execution. i.addTasks(ctx) + return newlyStarted, droppedDuplicates } // runningWorkflowID returns the workflow ID associated with the given @@ -262,40 +267,59 @@ func (i *Invoker) recordCompletedAction( // addTasks adds both ProcessBuffer and Execute tasks as needed. It should be // called when completing processing/executing tasks, to drive backoff/retry. func (i *Invoker) addTasks(ctx chasm.MutableContext) { - totalStarts := len(i.GetBufferedStarts()) - eligibleStarts := len(i.getEligibleBufferedStarts()) - - // Add a ProcessBuffer pure task whenever there are BufferedStarts that are - // backing off, or are still pending initial processing. - if (totalStarts - eligibleStarts) > 0 { + // If we have Attempt = 0 starts, generate a ProcessBufferTask immediately. If we + // have starts that are backing off, add a timer task for the earliest backoff time. + if i.hasUnprocessedStarts() { ctx.AddTask(i, chasm.TaskAttributes{ - ScheduledTime: i.processingDeadline(), + ScheduledTime: chasm.TaskScheduledTimeImmediate, + }, &schedulerpb.InvokerProcessBufferTask{}) + } else if deadline := i.nextBackoffDeadline(); !deadline.IsZero() { + ctx.AddTask(i, chasm.TaskAttributes{ + ScheduledTime: deadline, }, &schedulerpb.InvokerProcessBufferTask{}) } - // Add an Execute side effect task whenever there are any eligible actions - // pending execution. - if len(i.GetCancelWorkflows()) > 0 || len(i.GetTerminateWorkflows()) > 0 || eligibleStarts > 0 { + // Execute drains work that's ready now: pending cancels/terminates, and + // starts that are past their backoff. + if len(i.GetCancelWorkflows()) > 0 || + len(i.GetTerminateWorkflows()) > 0 || + len(i.getEligibleBufferedStarts()) > 0 { ctx.AddTask(i, chasm.TaskAttributes{}, &schedulerpb.InvokerExecuteTask{}) } } -// processingDeadline returns the earliest possible time that the BufferedStarts -// queue should be processed, taking into account starts that have not yet been -// attempted, as well as those that are pending backoff to retry. If the buffer -// is empty, the return value will be Time's zero value. -func (i *Invoker) processingDeadline() time.Time { - var deadline time.Time +// hasUnprocessedStarts reports whether any BufferedStart is still awaiting its +// initial ProcessBuffer pass (Attempt == 0). +func (i *Invoker) hasUnprocessedStarts() bool { for _, start := range i.GetBufferedStarts() { if start.GetAttempt() == 0 { - // Return zero time to schedule an immediate task for unprocessed starts. - return chasm.TaskScheduledTimeImmediate + return true } + } + return false +} + +// nextBackoffDeadline returns the earliest BackoffTime among starts that are +// retrying, or the zero time if none are. +func (i *Invoker) nextBackoffDeadline() time.Time { + var deadline time.Time + lastProcessedTime := i.LastProcessedTime.AsTime() + for _, start := range i.GetBufferedStarts() { backoff := start.GetBackoffTime().AsTime() + // We only care about starts that are retrying. + if start.GetAttempt() <= 0 || + start.GetRunId() != "" || + start.GetCompleted() != nil || + // Backed-off starts will be selected by getEligibleBufferedStarts and kick off + // an Execute task, instead. + start.BackoffTime.AsTime().Before(lastProcessedTime) { + continue + } if deadline.IsZero() || backoff.Before(deadline) { deadline = backoff } } + return deadline } @@ -303,10 +327,11 @@ func (i *Invoker) processingDeadline() time.Time { // execution (Attempt > 0), haven't been started yet (no RunId), and aren't // presently backing off, based on last processed time. func (i *Invoker) getEligibleBufferedStarts() []*schedulespb.BufferedStart { + lastProcessed := i.GetLastProcessedTime().AsTime() return util.FilterSlice(i.GetBufferedStarts(), func(start *schedulespb.BufferedStart) bool { return start.Attempt > 0 && start.GetRunId() == "" && - start.BackoffTime.AsTime().Before(i.GetLastProcessedTime().AsTime()) + !start.GetBackoffTime().AsTime().After(lastProcessed) }) } diff --git a/chasm/lib/scheduler/invoker_execute_task_test.go b/chasm/lib/scheduler/invoker_execute_task_test.go index 34732f6b5a3..f1456d9a1ce 100644 --- a/chasm/lib/scheduler/invoker_execute_task_test.go +++ b/chasm/lib/scheduler/invoker_execute_task_test.go @@ -3,6 +3,7 @@ package scheduler_test import ( "fmt" "testing" + "time" "github.com/stretchr/testify/require" commonpb "go.temporal.io/api/common/v1" @@ -403,6 +404,168 @@ func TestExecuteTask_ExceedsMaxActionsPerExecution(t *testing.T) { }) } +// Two concurrent ExecuteTasks can both clone the invoker before either +// commits, both fire StartWorkflow for the same RequestId, and both observe +// success (server dedupes by RequestId). The losing commit must not +// double-count, stomp the winner's RunId/StartTime/HasCallback, or rewind +// Attempt/BackoffTime on the already-running entry. +func TestExecuteTask_RecordResultIdempotentOnRace(t *testing.T) { + env := newTestEnv(t) + ctx := env.MutableContext() + invoker := env.Scheduler.Invoker.Get(ctx) + + startTime := timestamppb.New(env.TimeSource.Now()) + winning := "winning-run" + invoker.BufferedStarts = []*schedulespb.BufferedStart{{ + NominalTime: startTime, + ActualTime: startTime, + DesiredTime: startTime, + RequestId: "req", + WorkflowId: "wf", + Attempt: 1, + RunId: winning, + StartTime: startTime, + HasCallback: true, + }} + invoker.LastProcessedTime = timestamppb.New(env.TimeSource.Now()) + + loserStartTime := timestamppb.New(env.TimeSource.Now().Add(time.Second)) + loser := []*schedulespb.BufferedStart{{ + RequestId: "req", + RunId: "loser-run", + StartTime: loserStartTime, + }} + + newlyStarted, droppedDuplicates := invoker.RecordExecuteResult(ctx, loser, nil) + require.Equal(t, 0, newlyStarted, "duplicate RunId-set start must not be counted") + require.Equal(t, 1, droppedDuplicates, "the dropped completion must be reported for observability") + live := invoker.BufferedStarts[0] + require.Equal(t, winning, live.RunId, "live RunId must not be stomped") + require.Equal(t, startTime.AsTime(), live.StartTime.AsTime(), "live StartTime must not be stomped") + require.True(t, live.HasCallback, "live HasCallback must not be cleared") + + // First-mover case: a CompletedStart for a fresh RequestId increments + // newlyStarted and writes through to the live entry. + invoker.BufferedStarts = append(invoker.BufferedStarts, &schedulespb.BufferedStart{ + NominalTime: startTime, + ActualTime: startTime, + DesiredTime: startTime, + RequestId: "req2", + WorkflowId: "wf2", + Attempt: 1, + }) + first := []*schedulespb.BufferedStart{{ + RequestId: "req2", + RunId: "first-run", + StartTime: startTime, + }} + newlyStarted, droppedDuplicates = invoker.RecordExecuteResult(ctx, first, nil) + require.Equal(t, 1, newlyStarted, "first-time RunId assignment must be counted") + require.Equal(t, 0, droppedDuplicates, "no duplicate was dropped") + freshlyStarted := invoker.BufferedStarts[1] + require.Equal(t, "first-run", freshlyStarted.RunId) + require.Equal(t, startTime.AsTime(), freshlyStarted.StartTime.AsTime()) + require.True(t, freshlyStarted.HasCallback, "first-time RunId assignment must set HasCallback") +} + +// A RetryableStart for a request whose live BufferedStart already has RunId +// set must not bump Attempt or set BackoffTime - the same RunId guard that +// protects the completed branch must also protect the retryable branch, +// otherwise stale retry metadata persists on an already-running entry. +func TestExecuteTask_RecordResultIdempotentOnRetryableRace(t *testing.T) { + env := newTestEnv(t) + ctx := env.MutableContext() + invoker := env.Scheduler.Invoker.Get(ctx) + + startTime := timestamppb.New(env.TimeSource.Now()) + invoker.BufferedStarts = []*schedulespb.BufferedStart{{ + NominalTime: startTime, + ActualTime: startTime, + DesiredTime: startTime, + RequestId: "req", + WorkflowId: "wf", + Attempt: 1, + RunId: "winning-run", + StartTime: startTime, + HasCallback: true, + }} + invoker.LastProcessedTime = timestamppb.New(env.TimeSource.Now()) + + // A losing concurrent Execute saw the start as eligible, its RPC failed + // retryably, and applyBackoff produced a RetryableStart entry. + loserBackoff := timestamppb.New(env.TimeSource.Now().Add(time.Minute)) + retryable := []*schedulespb.BufferedStart{{ + RequestId: "req", + BackoffTime: loserBackoff, + }} + + newlyStarted, droppedDuplicates := invoker.RecordExecuteResult(ctx, nil, retryable) + require.Equal(t, 0, newlyStarted) + require.Equal(t, 0, droppedDuplicates, "retryable drops aren't counted as duplicates - only completed-side drops are") + live := invoker.BufferedStarts[0] + require.Equal(t, int64(1), live.Attempt, "Attempt must not be incremented on a started entry") + require.Nil(t, live.BackoffTime, "BackoffTime must not be set on a started entry") +} + +// A start whose BackoffTime exactly equals LastProcessedTime must be eligible. +// Regression for the strict-Before check, which excluded equal-time entries +// and stranded retries that landed precisely at the HWM boundary. +func TestExecuteTask_Validate_BackoffEqualToLPTIsEligible(t *testing.T) { + env := newInvokerExecuteTestEnv(t) + ctx := env.MutableContext() + invoker := env.Scheduler.Invoker.Get(ctx) + + now := env.TimeSource.Now() + invoker.LastProcessedTime = timestamppb.New(now) + invoker.BufferedStarts = []*schedulespb.BufferedStart{{ + RequestId: "boundary", + Attempt: 2, + BackoffTime: timestamppb.New(now), + }} + + valid, err := env.handler.Validate(ctx, invoker, chasm.TaskAttributes{}, &schedulerpb.InvokerExecuteTask{}) + require.NoError(t, err) + require.True(t, valid, "BackoffTime == LastProcessedTime must be eligible (<=, not strict <)") +} + +// BackoffTime must be derived from the framework clock (chasm.Context.Now), +// not wall-clock time.Now. Test by advancing TimeSource so it diverges from +// the wall clock, then asserting BackoffTime falls within the framework +// clock's frame. +func TestExecuteTask_BackoffUsesFrameworkClock(t *testing.T) { + env := newInvokerExecuteTestEnv(t) + frameworkNow := env.TimeSource.Now().Add(24 * time.Hour) + env.TimeSource.Update(frameworkNow) + + startTime := timestamppb.New(env.TimeSource.Now()) + env.mockFrontendClient.EXPECT(). + StartWorkflowExecution(gomock.Any(), gomock.Any()). + Times(1). + Return(nil, serviceerror.NewDeadlineExceeded("transient")) + + runExecuteTestCase(t, env, &executeTestCase{ + InitialBufferedStarts: []*schedulespb.BufferedStart{{ + NominalTime: startTime, + ActualTime: startTime, + DesiredTime: startTime, + RequestId: "retry", + OverlapPolicy: enumspb.SCHEDULE_OVERLAP_POLICY_ALLOW_ALL, + Attempt: 1, + }}, + ExpectedBufferedStarts: 1, + ValidateInvoker: func(t *testing.T, invoker *scheduler.Invoker, env *invokerExecuteTestEnv) { + backoff := invoker.BufferedStarts[0].BackoffTime.AsTime() + require.True(t, backoff.After(frameworkNow), + "BackoffTime %v must be after framework clock %v", backoff, frameworkNow) + // time.Now() + any plausible delay is far before frameworkNow (TimeSource + // was advanced by 24h). If BackoffTime were derived from wall clock, + // this assertion would fail. + require.True(t, backoff.After(time.Now().Add(time.Hour)), + "BackoffTime %v looks derived from wall clock, not framework clock", backoff) + }, + }) +} + type startWorkflowExecutionRequestIDMatcher struct { RequestID string } diff --git a/chasm/lib/scheduler/invoker_process_buffer_task_test.go b/chasm/lib/scheduler/invoker_process_buffer_task_test.go index 9178a44667d..acfee763c53 100644 --- a/chasm/lib/scheduler/invoker_process_buffer_task_test.go +++ b/chasm/lib/scheduler/invoker_process_buffer_task_test.go @@ -13,9 +13,76 @@ import ( "go.temporal.io/server/chasm/lib/scheduler/gen/schedulerpb/v1" "go.temporal.io/server/common/metrics" "go.temporal.io/server/common/util" + "go.temporal.io/server/service/history/tasks" "google.golang.org/protobuf/types/known/timestamppb" ) +// A buffer of only deferred starts (Attempt=-1) must NOT emit a +// ProcessBufferTask. Deferred starts wait on completion events, not on a +// wall-clock deadline, so emitting an immediate ProcessBufferTask would +// spin-loop on a no-op processBuffer call. +func TestInvoker_AddTasks_AllDeferredEmitsNothing(t *testing.T) { + env := newTestEnv(t) + ctx := env.MutableContext() + invoker := env.Scheduler.Invoker.Get(ctx) + + now := env.TimeSource.Now() + invoker.LastProcessedTime = timestamppb.New(now) + invoker.BufferedStarts = []*schedulespb.BufferedStart{{ + RequestId: "deferred", + WorkflowId: "wf-deferred", + Attempt: -1, + ActualTime: timestamppb.New(now), + }} + + env.NodeBackend.TasksByCategory = nil + invoker.EnqueueBufferedStarts(ctx, nil) + require.NoError(t, env.CloseTransaction()) + + require.False(t, env.HasTask(&tasks.ChasmTask{}, chasm.TaskScheduledTimeImmediate), + "all-deferred buffer must not emit any side-effect task") + require.False(t, env.HasTask(&tasks.ChasmTaskPure{}, chasm.TaskScheduledTimeImmediate), + "all-deferred buffer must not emit an immediate ProcessBufferTask") +} + +// After ProcessBuffer fires for a backed-off start whose BackoffTime has just +// elapsed, the HWM advance must re-arm processing so the retry actually runs. +// Regression for the addTasks gate that previously suppressed re-arm when +// processBuffer found nothing to process (no Attempt==0 starts). +func TestProcessBufferTask_RearmsBackedOffRetry(t *testing.T) { + env := newTestEnv(t) + ctx := env.MutableContext() + invoker := env.Scheduler.Invoker.Get(ctx) + + now := env.TimeSource.Now() + // Set LPT to the past so the start (with BackoffTime at now) becomes + // eligible only after the HWM advance in this run. + invoker.LastProcessedTime = timestamppb.New(now.Add(-time.Minute)) + invoker.BufferedStarts = []*schedulespb.BufferedStart{{ + NominalTime: timestamppb.New(now), + ActualTime: timestamppb.New(now), + DesiredTime: timestamppb.New(now), + RequestId: "retry-ready", + OverlapPolicy: enumspb.SCHEDULE_OVERLAP_POLICY_ALLOW_ALL, + Attempt: 2, + BackoffTime: timestamppb.New(now), + }} + + err := newProcessBufferHandler(env).Execute(ctx, invoker, chasm.TaskAttributes{}, &schedulerpb.InvokerProcessBufferTask{}) + require.NoError(t, err) + require.NoError(t, env.CloseTransaction()) + + // Precondition for re-arm: the HWM advance in this Execute must have made + // the previously-backing-off start eligible (BackoffTime <= LPT). + require.False(t, invoker.BufferedStarts[0].BackoffTime.AsTime().After(invoker.LastProcessedTime.AsTime()), + "setup invariant: start must be eligible after HWM advance") + + // addTasks must have emitted an immediate side-effect ChasmTask carrying + // the InvokerExecuteTask payload so the retry actually fires. + require.True(t, env.HasTask(&tasks.ChasmTask{}, chasm.TaskScheduledTimeImmediate), + "expected an immediate side-effect task to re-arm the backed-off retry") +} + func newProcessBufferHandler(env *testEnv) *scheduler.InvokerProcessBufferTaskHandler { return scheduler.NewInvokerProcessBufferTaskHandler(scheduler.InvokerTaskHandlerOptions{ Config: defaultConfig(), @@ -315,6 +382,38 @@ func TestProcessBufferTask_NeedsTerminate(t *testing.T) { }) } +// Past-catchup automated starts must drop WITHOUT consuming a LimitedActions +// slot. Regression for the order-of-checks bug where useScheduledAction(true) +// fired before the catchup-window check, decrementing RemainingActions for +// starts that never ran. +// +// Not reachable via the public API today: backfill/TriggerImmediately starts +// are Manual=true (see backfiller_tasks.go) and bypass the catchup-window +// check. The bug only manifests for automated Generator starts that age past +// their catchup window while processing is stalled. +func TestProcessBufferTask_MissedCatchupPreservesRemainingActions(t *testing.T) { + env := newTestEnv(t) + env.Scheduler.Schedule.State.LimitedActions = true + env.Scheduler.Schedule.State.RemainingActions = 3 + + now := env.TimeSource.Now() + startTime := timestamppb.New(now.Add(-defaultCatchupWindow * 2)) + runProcessBufferTestCase(t, env, &processBufferTestCase{ + InitialBufferedStarts: []*schedulespb.BufferedStart{{ + NominalTime: startTime, + ActualTime: startTime, + DesiredTime: startTime, + Manual: false, + RequestId: "expired", + OverlapPolicy: enumspb.SCHEDULE_OVERLAP_POLICY_ALLOW_ALL, + }}, + ExpectedBufferedStarts: 0, + ExpectedMissedCatchupWindow: 1, + }) + require.Equal(t, int64(3), env.Scheduler.Schedule.State.RemainingActions, + "RemainingActions must not be consumed by a start that was dropped for missing the catchup window") +} + // A buffered start with an overlap policy to cancel other workflows is processed. func TestProcessBufferTask_NeedsCancel(t *testing.T) { env := newTestEnv(t) diff --git a/chasm/lib/scheduler/invoker_tasks.go b/chasm/lib/scheduler/invoker_tasks.go index 19d5bc17309..84b3b1f935b 100644 --- a/chasm/lib/scheduler/invoker_tasks.go +++ b/chasm/lib/scheduler/invoker_tasks.go @@ -10,7 +10,6 @@ import ( commonpb "go.temporal.io/api/common/v1" enumspb "go.temporal.io/api/enums/v1" - schedulepb "go.temporal.io/api/schedule/v1" "go.temporal.io/api/serviceerror" "go.temporal.io/api/workflowservice/v1" "go.temporal.io/server/api/historyservice/v1" @@ -120,10 +119,10 @@ func (h *InvokerExecuteTaskHandler) Validate( // If another execute task already happened to kick everything off, we don't need // this one. eligibleStarts := invoker.getEligibleBufferedStarts() - valid := len(invoker.GetTerminateWorkflows())+ - len(invoker.GetCancelWorkflows())+ - len(eligibleStarts) > 0 - return valid, nil + if len(invoker.GetTerminateWorkflows())+len(invoker.GetCancelWorkflows())+len(eligibleStarts) > 0 { + return true, nil + } + return false, nil } func (h *InvokerExecuteTaskHandler) Execute( @@ -138,6 +137,7 @@ func (h *InvokerExecuteTaskHandler) Execute( var scheduler *Scheduler var lastCompletionState *schedulerpb.LastCompletionResult var callback *commonpb.Callback + var now time.Time // Read and deep copy returned components, since we'll continue to access them // outside of this function (outside of the MS lock). @@ -166,6 +166,10 @@ func (h *InvokerExecuteTaskHandler) Execute( } callback = common.CloneProto(cb) + // Captured here for applyBackoff (runs outside this closure): BackoffTime + // must be framework-clock to match LastProcessedTime and task deadlines. + now = ctx.Now(i) + return struct{}{}, nil }, nil, @@ -186,8 +190,7 @@ func (h *InvokerExecuteTaskHandler) Execute( ictx := h.newInvokerTaskHandlerContext(ctx, scheduler) result = result.Append(h.terminateWorkflows(ictx, logger, metricsHandler, scheduler, invoker.GetTerminateWorkflows())) result = result.Append(h.cancelWorkflows(ictx, logger, metricsHandler, scheduler, invoker.GetCancelWorkflows())) - sres, startResults := h.startWorkflows(ictx, logger, metricsHandler, scheduler, invoker, lastCompletionState, callback) - result = result.Append(sres) + result = result.Append(h.startWorkflows(ictx, logger, metricsHandler, scheduler, invoker, lastCompletionState, callback, now)) // Record action results on the Invoker (internal state), as well as the // Scheduler (user-facing metrics). @@ -196,10 +199,10 @@ func (h *InvokerExecuteTaskHandler) Execute( invokerRef, func(i *Invoker, ctx chasm.MutableContext, _ any) (chasm.NoValue, error) { s := i.Scheduler.Get(ctx) - - i.recordExecuteResult(ctx, &result) - s.recordActionResult(&schedulerActionResult{actionCount: int64(len(startResults))}) - + // Use newlyStarted (not len(result.CompletedStarts)) so a concurrent + // ExecuteTask's duplicate StartWorkflow can't inflate ActionCount. + newlyStarted, _ := i.recordExecuteResult(ctx, &result) + s.recordActionResult(&schedulerActionResult{actionCount: int64(newlyStarted)}) return nil, nil }, nil, @@ -306,7 +309,8 @@ func (h *InvokerExecuteTaskHandler) startWorkflows( invoker *Invoker, lastCompletionState *schedulerpb.LastCompletionResult, callback *commonpb.Callback, -) (result executeResult, startResults []*schedulepb.ScheduleActionResult) { + now time.Time, +) (result executeResult) { metricsWithTag := metricsHandler.WithTags( metrics.StringTag(metrics.ScheduleActionTypeTag, metrics.ScheduleActionStartWorkflow)) @@ -335,7 +339,7 @@ func (h *InvokerExecuteTaskHandler) startWorkflows( // Run all starts concurrently. newCtx := ctx.Clone() wg.Go(func() { - startResult, err := h.startWorkflow(newCtx, metricsHandler, scheduler, start, lastCompletionState, callback) + err := h.startWorkflow(newCtx, metricsHandler, scheduler, start, lastCompletionState, callback) resultMutex.Lock() defer resultMutex.Unlock() @@ -350,8 +354,7 @@ func (h *InvokerExecuteTaskHandler) startWorkflows( } if isRetryableError(err) { - // Apply backoff to start and retry. - h.applyBackoff(start, err) + h.applyBackoff(start, err, now) result.RetryableStarts = append(result.RetryableStarts, start) } else { // Drop the start from the buffer. @@ -363,7 +366,6 @@ func (h *InvokerExecuteTaskHandler) startWorkflows( metricsWithTag.Counter(metrics.ScheduleActionSuccess.Name()).Record(1) result.CompletedStarts = append(result.CompletedStarts, start) - startResults = append(startResults, startResult) }) } @@ -372,15 +374,12 @@ func (h *InvokerExecuteTaskHandler) startWorkflows( } func (h *InvokerProcessBufferTaskHandler) Validate( - ctx chasm.Context, + _ chasm.Context, invoker *Invoker, attrs chasm.TaskAttributes, _ *schedulerpb.InvokerProcessBufferTask, ) (bool, error) { - return validateTaskHighWaterMark( - invoker.GetLastProcessedTime(), - attrs.ScheduledTime, - ) + return validateTaskHighWaterMark(invoker.GetLastProcessedTime(), attrs.ScheduledTime) } func (h *InvokerProcessBufferTaskHandler) Execute( @@ -447,23 +446,23 @@ func (h *InvokerProcessBufferTaskHandler) processBuffer( // Update result metrics. result.overlapSkipped = action.OverlapSkipped - // Add starting workflows to result, trim others. + // Add starting workflows to result, trim others. Catchup-window expiry is + // checked before useScheduledAction so that a start past its catchup + // window doesn't consume a LimitedActions slot. for _, start := range readyStarts { - // Ensure we can take more actions. Manual actions are always allowed. - if !start.Manual && !scheduler.useScheduledAction(true) { - // Drop buffered automated actions while paused. + if ctx.Now(invoker).After(h.startWorkflowDeadline(ctx, scheduler, start)) { + result.missedCatchupWindow++ result.discardStarts = append(result.discardStarts, start) continue } - if ctx.Now(invoker).After(h.startWorkflowDeadline(ctx, scheduler, start)) { - // Drop expired starts. - result.missedCatchupWindow++ + // Ensure we can take more actions. Manual actions are always allowed. + if !start.Manual && !scheduler.useScheduledAction(true) { + // Drop buffered automated actions while paused or out of actions. result.discardStarts = append(result.discardStarts, start) continue } - // Append for immediate execution. keepStarts[start.GetRequestId()] = struct{}{} result.startWorkflows = append(result.startWorkflows, start) } @@ -484,22 +483,22 @@ func (h *InvokerProcessBufferTaskHandler) processBuffer( } // applyBackoff updates start's BackoffTime based on err and the retry policy. -func (h *InvokerExecuteTaskHandler) applyBackoff(start *schedulespb.BufferedStart, err error) { +// `now` is the framework clock captured at task start - using time.Now would +// diverge from the LastProcessedTime/eligibility comparison in tests. +func (h *InvokerExecuteTaskHandler) applyBackoff(start *schedulespb.BufferedStart, err error, now time.Time) { if err == nil { return } var delay time.Duration if rateLimitDelay, ok := isRateLimitedError(err); ok { - // If we have the rate limiter's delay, use that. delay = rateLimitDelay } else { - // Otherwise, use the backoff policy. Elapsed time is left at 0 because we bound - // on number of attempts. + // Elapsed time is left at 0 because we bound on number of attempts. delay = h.config.RetryPolicy().ComputeNextDelay(0, int(start.Attempt), nil) } - start.BackoffTime = timestamppb.New(time.Now().Add(delay)) + start.BackoffTime = timestamppb.New(now.Add(delay)) } // startWorkflowDeadline returns the latest time at which a buffered workflow @@ -536,21 +535,21 @@ func (h *InvokerExecuteTaskHandler) startWorkflow( start *schedulespb.BufferedStart, lastCompletionState *schedulerpb.LastCompletionResult, callback *commonpb.Callback, -) (*schedulepb.ScheduleActionResult, error) { +) error { requestSpec := scheduler.GetSchedule().GetAction().GetStartWorkflow() if start.Attempt >= InvokerMaxStartAttempts { - return nil, errRetryLimitExceeded + return errRetryLimitExceeded } // Get rate limiter permission once per buffered start, on the first attempt only. if start.Attempt == 1 { delay, err := h.getRateLimiterPermission() if err != nil { - return nil, err + return err } if delay > 0 { - return nil, newRateLimitedError(delay) + return newRateLimitedError(delay) } } @@ -590,7 +589,7 @@ func (h *InvokerExecuteTaskHandler) startWorkflow( result, err := h.frontendClient.StartWorkflowExecution(ctx, request) if err != nil { - return nil, err + return err } actualStartTime := time.Now() @@ -608,16 +607,7 @@ func (h *InvokerExecuteTaskHandler) startWorkflow( Timer(metrics.ScheduleActionDelay.Name()). Record(actualStartTime.Sub(desiredTime.AsTime())) } - - return &schedulepb.ScheduleActionResult{ - ScheduleTime: start.ActualTime, - ActualTime: timestamppb.New(actualStartTime), - StartWorkflowResult: &commonpb.WorkflowExecution{ - WorkflowId: start.WorkflowId, - RunId: result.RunId, - }, - StartWorkflowStatus: result.Status, // usually should be RUNNING - }, nil + return nil } func (h *InvokerExecuteTaskHandler) terminateWorkflow( diff --git a/chasm/lib/scheduler/scheduler.go b/chasm/lib/scheduler/scheduler.go index 37a910a939a..c774461de07 100644 --- a/chasm/lib/scheduler/scheduler.go +++ b/chasm/lib/scheduler/scheduler.go @@ -465,60 +465,64 @@ func (s *Scheduler) updateConflictToken() { // value is used for calculating the retention time (how long an idle schedule // lives after becoming idle). func (s *Scheduler) getLastEventTime(ctx chasm.Context) time.Time { - invoker := s.Invoker.Get(ctx) - recentActions := invoker.recentActions() - latest := util.MaxTime( s.Info.GetCreateTime().AsTime(), s.Info.GetUpdateTime().AsTime(), ) - if len(recentActions) > 0 { - latest = util.MaxTime(latest, recentActions[len(recentActions)-1].GetActualTime().AsTime()) + + // The recentActions list is unsorted. + for _, a := range s.Invoker.Get(ctx).recentActions() { + latest = util.MaxTime(latest, a.GetActualTime().AsTime()) } + return latest } -// getIdleExpiration returns an idle close time and the boolean value of 'true' -// for when a schedule is idle (pending soft delete). +// isHeldOpen reports whether the schedule must stay open regardless of having +// no current work: +// - Paused: customer expects to resume. +// - Pending backfill: must drain (any pending backfill blocks idle). +// - Sentinels are exempt . +// +// Manual-only schedules (empty spec, but active) idle out like any other - +// matching V1's RetentionTime semantics, where lastEventTime is advanced by +// manual triggers via recentActions. +func (s *Scheduler) isHeldOpen() bool { + if s.IsSentinel() { + return false + } + return s.Schedule.GetState().GetPaused() || + s.hasMoreBackfills() +} + +// idleDeadline returns when the schedule becomes eligible to close. Sentinels +// anchor on CreateTime - they have no Invoker (and thus no recentActions). +func (s *Scheduler) idleDeadline(ctx chasm.Context, idleTime time.Duration) time.Time { + if s.IsSentinel() { + return s.Info.GetCreateTime().AsTime().Add(idleTime) + } + return s.getLastEventTime(ctx).Add(idleTime) +} + +// getIdleExpiration returns an idle close time and whether the schedule is +// idle. Generator-only: nextWakeup must reflect the spec's next pending wakeup +// so the predicate can skip arming idle when work remains. Validate calls +// isHeldOpen and idleDeadline directly. func (s *Scheduler) getIdleExpiration( ctx chasm.Context, idleTime time.Duration, nextWakeup time.Time, ) (time.Time, bool) { - // The idle timer to close off the component is started only for schedules with - // no more work to do. Paused schedules are held open indefinitely. if idleTime == 0 || - s.GetSchedule().GetState().GetPaused() || - (!nextWakeup.IsZero() && s.useScheduledAction(false)) || - s.hasMoreAllowAllBackfills(ctx) { + s.isHeldOpen() || + (!nextWakeup.IsZero() && s.useScheduledAction(false)) { return time.Time{}, false } + return s.idleDeadline(ctx, idleTime), true +} - if s.IsSentinel() { - return s.Info.GetCreateTime().AsTime().Add(idleTime), true - } - return s.getLastEventTime(ctx).Add(idleTime), true -} - -func (s *Scheduler) hasMoreAllowAllBackfills(ctx chasm.Context) bool { - for _, field := range s.Backfillers { - backfiller := field.Get(ctx) - var policy enumspb.ScheduleOverlapPolicy - switch request := backfiller.GetRequest().(type) { - case *schedulerpb.BackfillerState_BackfillRequest: - policy = request.BackfillRequest.OverlapPolicy - case *schedulerpb.BackfillerState_TriggerRequest: - policy = request.TriggerRequest.OverlapPolicy - default: - return false - } - - if enumspb.SCHEDULE_OVERLAP_POLICY_ALLOW_ALL == s.resolveOverlapPolicy(policy) { - return true - } - } - - return false +func (s *Scheduler) hasMoreBackfills() bool { + return len(s.Backfillers) > 0 } type schedulerActionResult struct { @@ -841,8 +845,7 @@ func (s *Scheduler) Update( s.updateConflictToken() // Since the spec may have been updated, kick off the generator. - generator := s.Generator.Get(ctx) - generator.Generate(ctx) + s.Generator.Get(ctx).Generate(ctx) return &schedulerpb.UpdateScheduleResponse{ FrontendResponse: &workflowservice.UpdateScheduleResponse{}, @@ -873,7 +876,6 @@ func (s *Scheduler) Patch( } s.Schedule.State.Paused = false s.Schedule.State.Notes = req.FrontendRequest.Patch.Unpause - s.Generator.Get(ctx).Generate(ctx) } if err := s.handlePatch(ctx, req.FrontendRequest.Patch); err != nil { @@ -883,6 +885,9 @@ func (s *Scheduler) Patch( s.Info.UpdateTime = timestamppb.New(ctx.Now(s)) s.updateConflictToken() + // Since the paused state may have been updated, kick off the generator. + s.Generator.Get(ctx).Generate(ctx) + return &schedulerpb.PatchScheduleResponse{ FrontendResponse: &workflowservice.PatchScheduleResponse{}, }, nil diff --git a/chasm/lib/scheduler/scheduler_idle_tasks_test.go b/chasm/lib/scheduler/scheduler_idle_tasks_test.go index 075e838b719..b63fb71d8a5 100644 --- a/chasm/lib/scheduler/scheduler_idle_tasks_test.go +++ b/chasm/lib/scheduler/scheduler_idle_tasks_test.go @@ -8,18 +8,33 @@ import ( "go.temporal.io/server/chasm" "go.temporal.io/server/chasm/lib/scheduler" "go.temporal.io/server/chasm/lib/scheduler/gen/schedulerpb/v1" + "go.temporal.io/server/common/log" + "go.temporal.io/server/common/metrics" "google.golang.org/protobuf/types/known/durationpb" "google.golang.org/protobuf/types/known/timestamppb" ) +func newIdleHandler(idleTime time.Duration) *scheduler.SchedulerIdleTaskHandler { + return scheduler.NewSchedulerIdleTaskHandler(scheduler.SchedulerIdleTaskHandlerOptions{ + Config: &scheduler.Config{ + Tweakables: func(_ string) scheduler.Tweakables { + t := scheduler.DefaultTweakables + t.IdleTime = idleTime + return t + }, + }, + MetricsHandler: metrics.NoopMetricsHandler, + BaseLogger: log.NewTestLogger(), + }) +} + type idleValidateTestCase struct { - configIdleTime time.Duration - taskIdleTimeTotal time.Duration - scheduledTime time.Time - schedulerClosed bool - idleMatchesScheduledTime bool - setupScheduler func(*scheduler.Scheduler, chasm.Context) - expectedValid bool + configIdleTime time.Duration + taskIdleTimeTotal time.Duration + scheduledTime time.Time + schedulerClosed bool + setupScheduler func(*scheduler.Scheduler, chasm.Context) + expectedValid bool } func runIdleValidateTestCase(t *testing.T, env *testEnv, c *idleValidateTestCase) { @@ -27,60 +42,36 @@ func runIdleValidateTestCase(t *testing.T, env *testEnv, c *idleValidateTestCase sched := env.Scheduler sched.Closed = c.schedulerClosed - if c.setupScheduler != nil { c.setupScheduler(sched, ctx) } - config := &scheduler.Config{ - Tweakables: func(_ string) scheduler.Tweakables { - tweakables := scheduler.DefaultTweakables - tweakables.IdleTime = c.configIdleTime - return tweakables - }, - } - - handler := scheduler.NewSchedulerIdleTaskHandler(scheduler.SchedulerIdleTaskHandlerOptions{ - Config: config, - }) - - task := &schedulerpb.SchedulerIdleTask{ - IdleTimeTotal: durationpb.New(c.taskIdleTimeTotal), - } - - scheduledTime := c.scheduledTime - if c.idleMatchesScheduledTime { - lastEventTime := scheduledTime.Add(-c.configIdleTime) - sched.Info.UpdateTime = timestamppb.New(lastEventTime) - sched.Info.CreateTime = timestamppb.New(lastEventTime) - } - - taskAttrs := chasm.TaskAttributes{ - ScheduledTime: scheduledTime, - } + handler := newIdleHandler(c.configIdleTime) + task := &schedulerpb.SchedulerIdleTask{IdleTimeTotal: durationpb.New(c.taskIdleTimeTotal)} + taskAttrs := chasm.TaskAttributes{ScheduledTime: c.scheduledTime} isValid, err := handler.Validate(ctx, sched, taskAttrs, task) require.NoError(t, err) require.Equal(t, c.expectedValid, isValid) } +// anchorLastEventTo backdates Info.UpdateTime/CreateTime so that +// idleDeadline = anchor + idleTime; pairs with scheduledTime = anchor + idleTime +// to make Validate's expiration check resolve to "stable". +func anchorLastEventTo(sched *scheduler.Scheduler, anchor time.Time) { + sched.Info.UpdateTime = timestamppb.New(anchor) + sched.Info.CreateTime = timestamppb.New(anchor) +} + func TestIdleTask_Execute(t *testing.T) { env := newTestEnv(t) ctx := env.MutableContext() sched := env.Scheduler - handler := scheduler.NewSchedulerIdleTaskHandler(scheduler.SchedulerIdleTaskHandlerOptions{ - Config: defaultConfig(), - }) - - // Verify scheduler starts open. + handler := newIdleHandler(10 * time.Minute) require.False(t, sched.Closed) - - // Execute the idle task. err := handler.Execute(ctx, sched, chasm.TaskAttributes{}, &schedulerpb.SchedulerIdleTask{}) require.NoError(t, err) - - // Verify scheduler is now closed. require.True(t, sched.Closed) } @@ -91,8 +82,7 @@ func TestIdleTask_Validate_SchedulerNotIdle(t *testing.T) { configIdleTime: 10 * time.Minute, taskIdleTimeTotal: 10 * time.Minute, scheduledTime: now, - setupScheduler: func(sched *scheduler.Scheduler, ctx chasm.Context) { - // Make scheduler not idle by setting it as paused. + setupScheduler: func(sched *scheduler.Scheduler, _ chasm.Context) { sched.Schedule.State.Paused = true }, expectedValid: false, @@ -103,11 +93,13 @@ func TestIdleTask_Validate_ValidIdleTask(t *testing.T) { env := newTestEnv(t) now := env.TimeSource.Now() runIdleValidateTestCase(t, env, &idleValidateTestCase{ - configIdleTime: 10 * time.Minute, - taskIdleTimeTotal: 10 * time.Minute, - scheduledTime: now, - idleMatchesScheduledTime: true, - expectedValid: true, + configIdleTime: 10 * time.Minute, + taskIdleTimeTotal: 10 * time.Minute, + scheduledTime: now, + setupScheduler: func(sched *scheduler.Scheduler, _ chasm.Context) { + anchorLastEventTo(sched, now.Add(-10*time.Minute)) + }, + expectedValid: true, }) } @@ -115,11 +107,13 @@ func TestIdleTask_Validate_SchedulerAlreadyClosed(t *testing.T) { env := newTestEnv(t) now := env.TimeSource.Now() runIdleValidateTestCase(t, env, &idleValidateTestCase{ - configIdleTime: 10 * time.Minute, - taskIdleTimeTotal: 10 * time.Minute, - scheduledTime: now, - schedulerClosed: true, - idleMatchesScheduledTime: true, - expectedValid: false, // Should return !scheduler.Closed (false when closed). + configIdleTime: 10 * time.Minute, + taskIdleTimeTotal: 10 * time.Minute, + scheduledTime: now, + schedulerClosed: true, + setupScheduler: func(sched *scheduler.Scheduler, _ chasm.Context) { + anchorLastEventTo(sched, now.Add(-10*time.Minute)) + }, + expectedValid: false, }) } diff --git a/chasm/lib/scheduler/scheduler_tasks.go b/chasm/lib/scheduler/scheduler_tasks.go index 69dcbc1676e..7e84fed84cc 100644 --- a/chasm/lib/scheduler/scheduler_tasks.go +++ b/chasm/lib/scheduler/scheduler_tasks.go @@ -4,7 +4,6 @@ import ( "context" "errors" "fmt" - "time" commonpb "go.temporal.io/api/common/v1" enumspb "go.temporal.io/api/enums/v1" @@ -16,6 +15,9 @@ import ( "go.temporal.io/server/chasm" "go.temporal.io/server/chasm/lib/scheduler/gen/schedulerpb/v1" "go.temporal.io/server/common" + "go.temporal.io/server/common/log" + "go.temporal.io/server/common/log/tag" + "go.temporal.io/server/common/metrics" "go.temporal.io/server/common/resource" "go.uber.org/fx" "google.golang.org/protobuf/types/known/timestamppb" @@ -24,17 +26,23 @@ import ( type SchedulerIdleTaskHandlerOptions struct { fx.In - Config *Config + Config *Config + MetricsHandler metrics.Handler + BaseLogger log.Logger } type SchedulerIdleTaskHandler struct { chasm.PureTaskHandlerBase - config *Config + config *Config + metricsHandler metrics.Handler + baseLogger log.Logger } func NewSchedulerIdleTaskHandler(opts SchedulerIdleTaskHandlerOptions) *SchedulerIdleTaskHandler { return &SchedulerIdleTaskHandler{ - config: opts.Config, + config: opts.Config, + metricsHandler: opts.MetricsHandler, + baseLogger: opts.BaseLogger, } } @@ -48,30 +56,40 @@ func (r *SchedulerIdleTaskHandler) Execute( return nil } -// Validate returns true when all three conditions hold: -// 1. The schedule is still idle — no new actions have been queued since this -// task was created (getIdleExpiration returns isIdle=true). -// 2. The idle expiration has not shifted — getIdleExpiration recomputes -// getLastEventTime()+idleTimeTotal and it must equal the task's scheduled -// time. If a workflow start has advanced getLastEventTime() since the task -// was written, the expiration no longer matches and the task is dropped. -// 3. The scheduler is not already closed (idempotency guard). +// Validate returns true (fire) when the schedule should still close. False +// drops for: already-closed (idempotency), held-open by state, or a deadline +// that shifted later than ScheduledTime (the Generator will have re-armed at +// the new deadline). It deliberately does not re-derive "is the spec +// exhausted" - that's an arm-time concern. func (r *SchedulerIdleTaskHandler) Validate( ctx chasm.Context, scheduler *Scheduler, taskAttrs chasm.TaskAttributes, task *schedulerpb.SchedulerIdleTask, ) (bool, error) { - idleTimeTotal := task.IdleTimeTotal.AsDuration() - idleExpiration, isIdle := scheduler.getIdleExpiration(ctx, idleTimeTotal, time.Time{}) + if scheduler.Closed { + return false, nil + } + if scheduler.isHeldOpen() { + return false, nil + } - // If the scheduler has since woken up, or its idle expiration time changed, this - // task must be obsolete. - if !isIdle || idleExpiration.Compare(taskAttrs.ScheduledTime) != 0 { + // Use After (not strict equality) so sub-precision drift doesn't drop tasks + // that should still fire. + idleExpiration := scheduler.idleDeadline(ctx, task.IdleTimeTotal.AsDuration()) + if idleExpiration.After(taskAttrs.ScheduledTime) { return false, nil } - return !scheduler.Closed, nil + // Deadline moved earlier - shouldn't happen if getLastEventTime is monotonic. + // Fire (closing the schedule is the safe call) but log so a real regression + // surfaces. + if idleExpiration.Before(taskAttrs.ScheduledTime) { + newTaggedLogger(r.baseLogger, scheduler).Warn("idle deadline regressed", + tag.Timestamp(idleExpiration), + tag.NewTimeTag("scheduled-time", taskAttrs.ScheduledTime)) + } + return true, nil } type SchedulerCallbacksTaskHandlerOptions struct { diff --git a/chasm/lib/scheduler/sentinel_test.go b/chasm/lib/scheduler/sentinel_test.go index b5e31d77000..c00fd034fe3 100644 --- a/chasm/lib/scheduler/sentinel_test.go +++ b/chasm/lib/scheduler/sentinel_test.go @@ -2,7 +2,6 @@ package scheduler_test import ( "testing" - "time" "github.com/stretchr/testify/require" "go.temporal.io/server/chasm" @@ -18,8 +17,7 @@ func TestNewSentinel(t *testing.T) { require.NotNil(t, sentinel.Info.GetCreateTime()) require.False(t, sentinel.Info.CreateTime.AsTime().IsZero()) - // Sentinels should have no Visibility component, which prevents them from - // appearing in ListSchedules results. + // Sentinels have no Visibility component, which keeps them out of ListSchedules. _, ok := sentinel.Visibility.TryGet(ctx) require.False(t, ok) } @@ -27,13 +25,8 @@ func TestNewSentinel(t *testing.T) { func TestSentinelIdleTask_Validate_Valid(t *testing.T) { sentinel, ctx, _ := setupSentinelForTest(t) - executor := scheduler.NewSchedulerIdleTaskHandler(scheduler.SchedulerIdleTaskHandlerOptions{ - Config: defaultConfig(), - }) - - task := &schedulerpb.SchedulerIdleTask{ - IdleTimeTotal: durationpb.New(scheduler.SentinelIdleTime), - } + executor := newIdleHandler(scheduler.SentinelIdleTime) + task := &schedulerpb.SchedulerIdleTask{IdleTimeTotal: durationpb.New(scheduler.SentinelIdleTime)} taskAttrs := chasm.TaskAttributes{ ScheduledTime: sentinel.Info.CreateTime.AsTime().Add(scheduler.SentinelIdleTime), } @@ -47,13 +40,8 @@ func TestSentinelIdleTask_Validate_InvalidAfterClosed(t *testing.T) { sentinel, ctx, _ := setupSentinelForTest(t) sentinel.Closed = true - executor := scheduler.NewSchedulerIdleTaskHandler(scheduler.SchedulerIdleTaskHandlerOptions{ - Config: defaultConfig(), - }) - - task := &schedulerpb.SchedulerIdleTask{ - IdleTimeTotal: durationpb.New(scheduler.SentinelIdleTime), - } + executor := newIdleHandler(scheduler.SentinelIdleTime) + task := &schedulerpb.SchedulerIdleTask{IdleTimeTotal: durationpb.New(scheduler.SentinelIdleTime)} taskAttrs := chasm.TaskAttributes{ ScheduledTime: sentinel.Info.CreateTime.AsTime().Add(scheduler.SentinelIdleTime), } @@ -63,18 +51,15 @@ func TestSentinelIdleTask_Validate_InvalidAfterClosed(t *testing.T) { require.False(t, isValid) } -func TestSentinelIdleTask_Validate_MismatchedScheduledTime(t *testing.T) { +// Task armed before the natural sentinel deadline must drop: idleDeadline +// (CreateTime + SentinelIdleTime) is after ScheduledTime (CreateTime). +func TestSentinelIdleTask_Validate_ExpirationShiftedLater(t *testing.T) { sentinel, ctx, _ := setupSentinelForTest(t) - executor := scheduler.NewSchedulerIdleTaskHandler(scheduler.SchedulerIdleTaskHandlerOptions{ - Config: defaultConfig(), - }) - - task := &schedulerpb.SchedulerIdleTask{ - IdleTimeTotal: durationpb.New(scheduler.SentinelIdleTime), - } + executor := newIdleHandler(scheduler.SentinelIdleTime) + task := &schedulerpb.SchedulerIdleTask{IdleTimeTotal: durationpb.New(scheduler.SentinelIdleTime)} taskAttrs := chasm.TaskAttributes{ - ScheduledTime: sentinel.Info.CreateTime.AsTime().Add(99 * time.Hour), + ScheduledTime: sentinel.Info.CreateTime.AsTime(), } isValid, err := executor.Validate(ctx, sentinel, taskAttrs, task) @@ -85,10 +70,7 @@ func TestSentinelIdleTask_Validate_MismatchedScheduledTime(t *testing.T) { func TestSentinelIdleTask_Execute(t *testing.T) { sentinel, ctx, _ := setupSentinelForTest(t) - executor := scheduler.NewSchedulerIdleTaskHandler(scheduler.SchedulerIdleTaskHandlerOptions{ - Config: defaultConfig(), - }) - + executor := newIdleHandler(scheduler.SentinelIdleTime) require.False(t, sentinel.Closed) err := executor.Execute(ctx, sentinel, chasm.TaskAttributes{}, &schedulerpb.SchedulerIdleTask{}) require.NoError(t, err) diff --git a/docs/architecture/schedules.md b/docs/architecture/schedules.md index 34d923ad9e2..c65df4647d0 100644 --- a/docs/architecture/schedules.md +++ b/docs/architecture/schedules.md @@ -5,10 +5,10 @@ This page documents the internal architecture behind Temporal's [Schedules featu ### Introduction -Scheduler is implemented as a [CHASM](https://github.com/temporalio/temporal/blob/main/docs/architecture/chasm.md) library, with all related implementation code located in [`chasm/lib/scheduler`](https://github.com/temporalio/temporal/tree/main/chasm/lib/scheduler). Every schedule is backed by an execution whose root component is a [`Scheduler`](https://github.com/temporalio/temporal/blob/main/chasm/lib/scheduler/scheduler.go#L35), which parents several other key components: +Scheduler is implemented as a [CHASM](https://github.com/temporalio/temporal/blob/main/docs/architecture/chasm.md) library, with all related implementation code located in [`chasm/lib/scheduler`](https://github.com/temporalio/temporal/tree/main/chasm/lib/scheduler). Every schedule is backed by an execution whose root component is a [`Scheduler`](https://github.com/temporalio/temporal/blob/main/chasm/lib/scheduler/scheduler.go), which parents several other key components: -* [**Generator**](https://github.com/temporalio/temporal/blob/main/chasm/lib/scheduler/generator.go#L11): buffers actions in accordance with the schedule's specification (one per `Scheduler`). -* [**Invoker**](https://github.com/temporalio/temporal/blob/main/chasm/lib/scheduler/invoker.go#L16): executes buffered actions and handles retry logic (one per `Scheduler`). +* [**Generator**](https://github.com/temporalio/temporal/blob/main/chasm/lib/scheduler/generator.go): buffers actions in accordance with the schedule's specification (one per `Scheduler`). +* [**Invoker**](https://github.com/temporalio/temporal/blob/main/chasm/lib/scheduler/invoker.go): executes buffered actions and handles retry logic (one per `Scheduler`). * [**Backfiller**](https://github.com/temporalio/temporal/blob/main/chasm/lib/scheduler/backfiller.go): buffers actions from manually requested backfills and trigger requests (one `Backfiller` per pending request). Only the `Invoker` makes service calls with side effect tasks; the rest of the scheduler is driven through pure timer tasks. The `Scheduler` component itself implements required CHASM interfaces for lifecycle management and visibility, request-reply handler functions, and helper functions for mutating state common to the entire scheduler tree. @@ -50,7 +50,11 @@ classDiagram ### Generator (Automated Actions) -The Generator component buffers automated actions according to the schedule's specification. This is driven through `GeneratorTask`, a timer task that fires, and re-schedules itself, on the schedule's interval until the schedule is complete. Generator is solely responsible for buffering automated actions, not driving those actions to execution; that work is handed off to the `Invoker`. The following high-level sequence diagram indicates the Generator's contribution towards firing an action: +The Generator component buffers automated actions according to the schedule's specification. This is driven through `GeneratorTask`, a timer task that fires, and re-schedules itself, on the schedule's interval until the schedule is complete. Generator is solely responsible for buffering automated actions, not driving those actions to execution; that work is handed off to the `Invoker`. + +The Generator runs continuously rather than stopping while paused: a paused fire still advances the high water mark but produces no `BufferedStart`s. This guarantees that `FutureActionTimes` remains accurate, including in visibility, even for paused schedules. + +The following high-level sequence diagram indicates the Generator's contribution towards firing an action: ```mermaid sequenceDiagram @@ -84,7 +88,7 @@ sequenceDiagram #### Tasks `GeneratorTask`: Drives the automated actions loop. When a schedule is first created (or updated), a `GeneratorTask` is scheduled for immediate execution. `GeneratorTask` will reschedule itself for subsequent execution at the point in time where the next automated action is scheduled to fire. If the schedule has completed (by exceeding its lifetime, or action count), `GeneratorTask` won't reschedule itself, and will instead schedule the Scheduler's `IdleTask`. The idle task keeps the schedule open for a retention period before closing it, ensuring the schedule doesn't immediately disappear after completion. -`GeneratorTask`'s primary responsibility is to call `EnqueueBufferedStarts` on the `Invoker`, handing off actions for execution (see also *Specification Processor* in this document): +`GeneratorTask`'s primary responsibility is to call `EnqueueBufferedStarts` on the `Invoker`, handing off actions for execution (see also *Specification Processor* in this document). After processing, three terminal outcomes are possible: arm the idle task to close the schedule, reschedule for the next spec wakeup, or hold open without a task (paused with no spec wakeup, or a paused schedule with a pending backfiller - revived by external events like `Patch.Unpause` or `BackfillerTask` completion). ```mermaid flowchart TD @@ -97,8 +101,9 @@ flowchart TD G[Update LastProcessedTime] H{Is scheduler idle?} I[Schedule IdleTask] - J{Is scheduler paused?} + J{Spec has next wakeup?} K[Reschedule GeneratorTask] + M[Hold open - no task] L{{Finish executing}} A --> B @@ -113,21 +118,20 @@ flowchart TD H -->|Yes| I H -->|No| J I --> L - J -->|Yes| L - J -->|No| K + J -->|Yes| K + J -->|No| M K --> L + M --> L ``` *Figure: `GeneratorTask`'s decision tree for buffering starts (trivial branches omitted).* -Note that when the schedule is paused, `GeneratorTask` finishes without rescheduling itself. Whenever the schedule is updated (including being resumed from a paused state), a new `GeneratorTask` is scheduled for immediate execution, restarting the loop. - #### Notes - At time of writing, `FutureActionTimes` is also maintained in Generator's state. This field is a cached view of speculated action times, and is returned as part of describe responses, but is not involved in actual start time calculations, action buffering, or even as assured guarantees of execution (e.g., a listed time may be dropped for having missed catchup window). ### Backfiller (Manual Actions) -A Backfiller component is created for every backfill requested through the API (including "trigger immediately" requests). Similar to the `GeneratorTask`, the `BackfillerTask` is responsible for driving a backfill to completion by enqueueing buffered actions into the Invoker. Each backfiller acts as a distinct buffer per backfill request, and will avoid [monopolizing the Invoker's shared `BufferedStarts` field](https://github.com/temporalio/temporal/blob/658b80da2ed66355fe78650788ac33814ffd1561/chasm/lib/scheduler/backfiller_tasks.go#L233-L234), instead backing off and retrying while waiting for the buffer to drain. +A Backfiller component is created for every backfill requested through the API (including "trigger immediately" requests). Similar to the `GeneratorTask`, the `BackfillerTask` is responsible for driving a backfill to completion by enqueueing buffered actions into the Invoker. Each backfiller acts as a distinct buffer per backfill request, and will avoid monopolizing the Invoker's shared `BufferedStarts` field (see [`BackfillerTask.Execute`](https://github.com/temporalio/temporal/blob/main/chasm/lib/scheduler/backfiller_tasks.go) for the buffer-full backoff), instead backing off and retrying while waiting for the buffer to drain. The Backfiller component fits into the scheduler sequence diagram similarly to the Generator: @@ -193,7 +197,7 @@ The Invoker component is responsible for driving buffered starts to execution, m #### State - `BufferedStarts`: Invoker manages the scheduler's buffered starts/actions through their entire lifecycle (see *BufferedStart Lifecycle* below). The Invoker function `EnqueueBufferedStarts` is used to schedule an action for execution by adding it to this list, setting `Attempt` to `0`. Once added to the list, the `ProcessBufferTask` will evaluate the schedule's overlap policy, as well as other scheduler state, before bumping `Attempt` to `1`, which makes it eligible for `ExecuteTask` to drive to completion. Entries remain in `BufferedStarts` after starting (with `RunId` set) and after completion (with `Completed` set), providing computed views for running workflows and recent actions. - - Each `BufferedStart` entry is itself stateful, maintaining its own `Attempt` and `BackoffTime`. Only once `Attempt` is past `0` will it be eligible for execution (see [`getEligibleBufferedStarts`](https://github.com/temporalio/temporal/blob/85635674d7c55a8679b715b118aaf14c973266eb/chasm/lib/scheduler/invoker.go#L276)). + - Each `BufferedStart` entry is itself stateful, maintaining its own `Attempt` and `BackoffTime`. Only once `Attempt` is past `0` will it be eligible for execution (see `getEligibleBufferedStarts` in [`invoker.go`](https://github.com/temporalio/temporal/blob/main/chasm/lib/scheduler/invoker.go)). - `CancelWorkflows`: Workflows that will be cancelled due to overlap policy. The `ProcessBufferTask` adds to this list, and the `ExecuteTask` consumes the list. - `TerminateWorkflows`: Similar to `CancelWorkflows`. - `LastProcessedTime`: A high water mark used to evaluate when to fire tasks that are pending a retry after execution failure. @@ -208,10 +212,9 @@ flowchart TD C{Need terminate or cancel?} D[Queue terminate/cancel requests] E{Starts ready for execution?} - F{Is action manual?} - G{Is schedule paused?} - H{Scheduled actions remaining?} I{Is start expired?} + F{Is action manual?} + N{Paused or no actions remaining?} J[Discard start] K[Mark for execution] L([Schedule ExecuteTask]) @@ -222,24 +225,22 @@ flowchart TD C -->|Yes| D C -->|No| E D --> E - E -->|Yes| F + E -->|Yes| I E -->|No| L - F -->|Yes| I - F -->|No| G - G -->|Yes| J - G -->|No| H - H -->|No| J - H -->|Yes| I I -->|Yes| J - I -->|No| K + I -->|No| F + F -->|Yes| K + F -->|No| N + N -->|Yes| J + N -->|No| K J --> E K --> E L --> M ``` -*Figure: `ProcessBufferTask`'s decision tree (trivial branches omitted).* +*Figure: `ProcessBufferTask`'s decision tree (trivial branches omitted). The catchup-window check runs before consuming a `LimitedActions` slot - otherwise an expired start would burn a slot it never actually used.* -`ExecuteTask`: Drives all [eligible buffered starts](https://github.com/temporalio/temporal/blob/85635674d7c55a8679b715b118aaf14c973266eb/chasm/lib/scheduler/invoker.go#L276) and pending cancel/terminate requests to execution by making service calls. `ExecuteTask` reschedules itself whenever a service call fails and must be retried, as well as when it wants to "checkpoint" a batch of completed executions by flushing to mutable state. +`ExecuteTask`: Drives all eligible buffered starts (via `getEligibleBufferedStarts` in [`invoker.go`](https://github.com/temporalio/temporal/blob/main/chasm/lib/scheduler/invoker.go)) and pending cancel/terminate requests to execution by making service calls. On retryable failures, the start's `BackoffTime` is set and `addTasks` arms a future `ProcessBufferTask` at that backoff. There's no direct "reschedule ExecuteTask after backoff" path: `ProcessBufferTask` fires at the backoff deadline, re-promotes the eligible start, then emits a fresh `ExecuteTask`. ```mermaid flowchart TD @@ -256,8 +257,8 @@ flowchart TD K[Apply backoff for retry] L[Drop start] M([Record execute results]) - N{Starts pending retry?} - O([Reschedule ExecuteTask]) + N{Any buffered starts left?} + O([addTasks: ProcessBuffer at backoff, or Execute immediate]) P{{Finish executing}} A --> B @@ -341,7 +342,7 @@ The `BufferedStarts` queue allows scheduler to compute the following `Info` bloc ### Completion Callbacks -In order to determine when an action started by a schedule has completed, Scheduler makes use of Nexus completion callbacks. When the Invoker starts an action, a Nexus callback token [is generated](https://github.com/temporalio/temporal/blob/main/chasm/nexus_completion.go#L26), and set as part of the [`StartWorkflowExecution` request](https://github.com/temporalio/temporal/blob/main/chasm/lib/scheduler/invoker_tasks.go#L551). Once the action has completed, the callback mechanism will eventually trigger the [scheduler's completion handler](https://github.com/temporalio/temporal/blob/main/chasm/lib/scheduler/scheduler.go#L419). +In order to determine when an action started by a schedule has completed, Scheduler makes use of Nexus completion callbacks. When the Invoker starts an action, a Nexus callback token is generated (see `GenerateNexusCallback` in [`nexus_completion.go`](https://github.com/temporalio/temporal/blob/main/chasm/nexus_completion.go)), and set as part of the `StartWorkflowExecution` request (see `startWorkflow` in [`invoker_tasks.go`](https://github.com/temporalio/temporal/blob/main/chasm/lib/scheduler/invoker_tasks.go)). Once the action has completed, the callback mechanism will eventually trigger the scheduler's completion handler (see `HandleNexusCompletion` in [`scheduler.go`](https://github.com/temporalio/temporal/blob/main/chasm/lib/scheduler/scheduler.go)). The completion handler has several responsibilities: - Records the completed action's payloads (success/failure) to the scheduler's `LastCompletionResult`. @@ -350,9 +351,21 @@ The completion handler has several responsibilities: - Records the action's close time, to measure the latency between "last action completed" and "next action started" when multiple starts are buffered sequentially and pending execution. - Applies retention to keep only the most recent completed actions in `BufferedStarts`. - Schedules another `ProcessBufferTask`, as a completion may make a buffered start eligible for execution. +- Schedules another `GeneratorTask`, to handle a potentially updated idle close time. Nexus completion callbacks are the only way that Scheduler can find out about an action's completion (there is no additional synchronous path). When the Invoker's `ProcessBufferTask` evaluates running workflow status, the computed view of running workflows (derived from `BufferedStarts` entries with `RunId` set but not completed) is used instead of making additional describe calls. +### Idle Task (Closing) + +`SchedulerIdleTask` is a pure task armed by the `Generator` when it determines the schedule has no more work to do (no future spec wakeups, no pending backfills, no remaining limited actions, and not paused). When the task fires, it sets `Scheduler.Closed = true`, ending the schedule's lifetime. Between arming and firing the task waits `IdleTime` (a tweakable, default 7 days), which is the retention window during which a customer can still describe/modify the schedule. + +`Validate` enforces three drop conditions, each tagged with a `reason` for the `ScheduleIdleTaskInvalidated` counter: +- `closed` - the schedule is already closed (idempotency guard). +- `held_open` - the schedule transitioned into a state that disqualifies idle close: paused or has a pending backfiller. +- `expiration_shift` - the recomputed idle deadline (anchor + `IdleTime`) has moved later than the task's `ScheduledTime`, meaning the Generator has since armed a new task at the correct deadline. + +Manual-only schedules (no automated wakeup source, e.g. TriggerImmediately-only) idle out like any other schedule when unpaused - `lastEventTime` is advanced by manual triggers via `recentActions`, matching V1's `RetentionTime` semantics. Sentinels are exempt from `held_open` and anchor their idle deadline on `CreateTime` directly - they exist to reserve a schedule ID and must close regardless of their otherwise-paused/empty state. + ### Specification Processor The Specification Processor (`SpecProcessor` in code) is responsible for looping through the action times derived from the schedule's specification for a given range of time. It is used by both the `Generator` and the `Backfiller` to generate their `BufferedStart` structs. @@ -391,4 +404,4 @@ flowchart TD K --> L ``` -*Figure: `SpecProcessor.ProcessTimeRange`'s decision tree (trivial branches omitted).* \ No newline at end of file +*Figure: `SpecProcessor.ProcessTimeRange`'s decision tree (trivial branches omitted).* diff --git a/tests/schedule_test.go b/tests/schedule_test.go index 583f6ace119..0d49109ac15 100644 --- a/tests/schedule_test.go +++ b/tests/schedule_test.go @@ -92,6 +92,8 @@ func TestScheduleCHASM(t *testing.T) { t.Run("TestStateSizeBytesReported", func(t *testing.T) { testStateSizeBytesReported(t, newContext) }) t.Run("TestSingleDateScheduleCloses", func(t *testing.T) { testSingleDateScheduleCloses(t, newContext) }) t.Run("TestMultiDateScheduleCloses", func(t *testing.T) { testMultiDateScheduleCloses(t, newContext) }) + t.Run("TestPausedDropsCatchup", func(t *testing.T) { testPausedDropsCatchup(t, newContext) }) + t.Run("TestFutureActionTimesAdvanceWhilePaused", func(t *testing.T) { testFutureActionTimesAdvanceWhilePaused(t, newContext) }) t.Run("TestNextActionTimeVisibility", func(t *testing.T) { testScheduleNextActionTimeVisibility(t, newContext) }) } @@ -3632,6 +3634,171 @@ func testMultiDateScheduleCloses(t *testing.T, newContext contextFactory) { }, 15*time.Second, 200*time.Millisecond, "schedule should have closed") } +// testFutureActionTimesAdvanceWhilePaused verifies that ListSchedules returns +// up-to-date FutureActionTimes for a paused schedule. CHASM's always-on +// Generator advances the high water mark and rebuilds FutureActionTimes +// against the spec even when paused, so listed times stay in the future and +// never roll into the past. The legacy V1 scheduler workflow does not advance +// while paused, so its projected times would freeze at pause time and +// eventually all sit in the past - hence this test is registered CHASM-only. +func testFutureActionTimesAdvanceWhilePaused(t *testing.T, newContext contextFactory) { + s := testcore.NewEnv(t, scheduleCommonOpts(t)...) + + sid := testcore.RandomizeStr("sched-future-actions-paused") + wid := testcore.RandomizeStr("sched-future-actions-paused-wf") + wt := testcore.RandomizeStr("sched-future-actions-paused-wt") + + s.SdkWorker().RegisterWorkflowWithOptions( + func(ctx workflow.Context) error { return nil }, + workflow.RegisterOptions{Name: wt}, + ) + + ctx := newContext(s.Context()) + _, err := s.FrontendClient().CreateSchedule(ctx, &workflowservice.CreateScheduleRequest{ + Namespace: s.Namespace().String(), + ScheduleId: sid, + Schedule: &schedulepb.Schedule{ + Spec: &schedulepb.ScheduleSpec{ + Interval: []*schedulepb.IntervalSpec{{Interval: durationpb.New(1 * time.Second)}}, + }, + State: &schedulepb.ScheduleState{Paused: true}, + Action: &schedulepb.ScheduleAction{ + Action: &schedulepb.ScheduleAction_StartWorkflow{ + StartWorkflow: &workflowpb.NewWorkflowExecutionInfo{ + WorkflowId: wid, + WorkflowType: &commonpb.WorkflowType{Name: wt}, + TaskQueue: &taskqueuepb.TaskQueue{Name: s.WorkerTaskQueue(), Kind: enumspb.TASK_QUEUE_KIND_NORMAL}, + WorkflowIdReusePolicy: enumspb.WORKFLOW_ID_REUSE_POLICY_ALLOW_DUPLICATE, + }, + }, + }, + }, + Identity: "test", + RequestId: uuid.NewString(), + }) + s.NoError(err) + + // Wait for visibility to surface an initial FutureActionTimes projection. + initial := getScheduleEntryFromVisibility(s, sid, newContext, func(ent *schedulepb.ScheduleListEntry) bool { + return len(ent.Info.FutureActionTimes) > 0 + }) + initialFirst := initial.Info.FutureActionTimes[0].AsTime() + + // While still paused, the earliest projected time must advance past the + // initial value: the Generator keeps ticking and republishing the + // projection, even though no workflows fire. + getScheduleEntryFromVisibility(s, sid, newContext, func(ent *schedulepb.ScheduleListEntry) bool { + return len(ent.Info.FutureActionTimes) > 0 && + ent.Info.FutureActionTimes[0].AsTime().After(initialFirst) + }) +} + +// testPausedDropsCatchup verifies that an action scheduled by the spec during +// a paused window is NOT replayed when the schedule is unpaused. The test +// uses a one-shot calendar entry near in the future so the assertion is +// binary (runs == 0 vs runs == 1) and structural - no fixed-time sleeps and +// no count-based bound checks. +// +// CHASM-only: with the always-on Generator fix, CHASM's HWM advances while +// paused so unpause doesn't see the past spec time in the [HWM, now] window. +// V1's scheduler workflow sleeps while paused (HWM is frozen), so V1 would +// legitimately replay - that's a divergent backend behavior, not what this +// test verifies. +func testPausedDropsCatchup(t *testing.T, newContext contextFactory) { + shortIdleTime := 3 * time.Second + tweakables := chasmscheduler.DefaultTweakables + tweakables.IdleTime = shortIdleTime + s := testcore.NewEnv(t, append(scheduleCommonOpts(t), + testcore.WithDynamicConfig(chasmscheduler.CurrentTweakables, tweakables), + )...) + + sid := testcore.RandomizeStr("sched-paused-drops-catchup") + wid := testcore.RandomizeStr("sched-paused-drops-catchup-wf") + wt := testcore.RandomizeStr("sched-paused-drops-catchup-wt") + + var runs atomic.Int32 + s.SdkWorker().RegisterWorkflowWithOptions(func(ctx workflow.Context) error { + runs.Add(1) + return nil + }, workflow.RegisterOptions{Name: wt}) + + // Single calendar entry a few seconds in the future. While paused, its + // time will pass. Offset must exceed worst-case CreateSchedule latency + // under parallel load so the entry is still genuinely in the future when + // the server materializes the schedule - otherwise the test passes for + // the wrong reason (HWM already past the entry at create time). + fireAt := time.Now().Add(10 * time.Second).UTC() + ctx := newContext(s.Context()) + _, err := s.FrontendClient().CreateSchedule(ctx, &workflowservice.CreateScheduleRequest{ + Namespace: s.Namespace().String(), + ScheduleId: sid, + Schedule: &schedulepb.Schedule{ + Spec: &schedulepb.ScheduleSpec{ + Calendar: []*schedulepb.CalendarSpec{{ + Year: fmt.Sprintf("%d", fireAt.Year()), + Month: fireAt.Month().String(), + DayOfMonth: fmt.Sprintf("%d", fireAt.Day()), + Hour: fmt.Sprintf("%d", fireAt.Hour()), + Minute: fmt.Sprintf("%d", fireAt.Minute()), + Second: fmt.Sprintf("%d", fireAt.Second()), + }}, + }, + State: &schedulepb.ScheduleState{ + Paused: true, + }, + Action: &schedulepb.ScheduleAction{ + Action: &schedulepb.ScheduleAction_StartWorkflow{ + StartWorkflow: &workflowpb.NewWorkflowExecutionInfo{ + WorkflowId: wid, + WorkflowType: &commonpb.WorkflowType{Name: wt}, + TaskQueue: &taskqueuepb.TaskQueue{Name: s.WorkerTaskQueue(), Kind: enumspb.TASK_QUEUE_KIND_NORMAL}, + WorkflowIdReusePolicy: enumspb.WORKFLOW_ID_REUSE_POLICY_ALLOW_DUPLICATE, + }, + }, + }, + }, + Identity: "test", + RequestId: uuid.NewString(), + }) + s.NoError(err) + + s.Eventually(func() bool { + desc, err := s.FrontendClient().DescribeSchedule(ctx, &workflowservice.DescribeScheduleRequest{ + Namespace: s.Namespace().String(), + ScheduleId: sid, + }) + return err == nil && len(desc.Info.FutureActionTimes) == 0 + }, 15*time.Second, 200*time.Millisecond, + "FutureActionTimes should empty out once the only calendar date passes (proves HWM advanced past it while paused)") + + _, err = s.FrontendClient().PatchSchedule(ctx, &workflowservice.PatchScheduleRequest{ + Namespace: s.Namespace().String(), + ScheduleId: sid, + Patch: &schedulepb.SchedulePatch{Unpause: "drops-catchup-test"}, + Identity: "test", + RequestId: uuid.NewString(), + }) + s.NoError(err) + + // Wait for close - also ensures any post-unpause processing has completed + // before the runs assertion below. + s.Eventually(func() bool { + _, err := s.FrontendClient().DescribeSchedule(ctx, &workflowservice.DescribeScheduleRequest{ + Namespace: s.Namespace().String(), + ScheduleId: sid, + }) + return err != nil + }, 30*time.Second, 200*time.Millisecond, + "schedule should close from idle after unpause (no future actions, no replay)") + + // Quiet window: catches an in-flight SDK worker that started a catch-up + // workflow before close, whose runs.Add() may not have landed yet at the + // moment DescribeSchedule first errored. + s.Never(func() bool { return runs.Load() > 0 }, + 2*time.Second, 100*time.Millisecond, + "paused-window calendar entry must not be replayed on unpause") +} + // testScheduleNextActionTimeVisibility asserts that the // TemporalScheduleNextActionTime search attribute is published to visibility. func testScheduleNextActionTimeVisibility(t *testing.T, newContext contextFactory) { From 52749abb13ed94e170f67a303a1e05166e8ccfa9 Mon Sep 17 00:00:00 2001 From: Lina Jodoin Date: Fri, 5 Jun 2026 10:08:30 -0700 Subject: [PATCH 5/8] [Scheduler] Log decisions and task events via the EventLog (#10530) ## What changed? - Added internal EventLog entries for task processing and engine decision points. ## Why? - Gives us a nice audit log of what's changing with scheduler at specific timestamps, akin to V1's event history. ## How did you test it? - [ ] built - [ ] run locally and tested manually - [ ] covered by existing tests - [ ] added new unit test(s) - [ ] added new functional test(s) ## Potential risks - Every write requries access to a CHASM mutableContext, which implies a write will already occur on said component (this is how CHASM's dirty-state tracking works). Additionally, all event logs have strict upper bounds in length and individual item sizes. (cherry picked from commit 9e389ce5962f191a996f414d0c70324c6c9ccd87) --- chasm/lib/scheduler/backfiller.go | 4 ++++ chasm/lib/scheduler/backfiller_tasks.go | 3 ++- chasm/lib/scheduler/generator.go | 3 +++ chasm/lib/scheduler/generator_tasks.go | 7 +++++++ chasm/lib/scheduler/invoker.go | 27 +++++++++++++++++++++++++ chasm/lib/scheduler/invoker_tasks.go | 2 ++ chasm/lib/scheduler/scheduler.go | 6 ++++++ chasm/lib/scheduler/scheduler_tasks.go | 4 ++++ 8 files changed, 55 insertions(+), 1 deletion(-) diff --git a/chasm/lib/scheduler/backfiller.go b/chasm/lib/scheduler/backfiller.go index 9693b999892..e132596730d 100644 --- a/chasm/lib/scheduler/backfiller.go +++ b/chasm/lib/scheduler/backfiller.go @@ -1,6 +1,7 @@ package scheduler import ( + "fmt" "time" schedulespb "go.temporal.io/server/api/schedule/v1" @@ -45,6 +46,7 @@ func addBackfiller( scheduler.Backfillers = make(chasm.Map[string, *Backfiller]) } scheduler.Backfillers[id] = chasm.NewComponentField(ctx, backfiller) + scheduler.EventLog.Get(ctx).LogEvent(ctx, fmt.Sprintf("added backfiller: %s", id)) return backfiller } @@ -60,6 +62,8 @@ func newBackfillerWithState(ctx chasm.MutableContext, state *schedulerpb.Backfil // scheduleTask schedules a BackfillerTask at the given time. func (b *Backfiller) scheduleTask(ctx chasm.MutableContext, scheduledTime time.Time) { + b.EventLog.Get(ctx).LogEvent(ctx, + fmt.Sprintf("scheduled backfillerTask for %s", scheduledTime.Format(time.RFC3339))) ctx.AddTask(b, chasm.TaskAttributes{ ScheduledTime: scheduledTime, }, &schedulerpb.BackfillerTask{}) diff --git a/chasm/lib/scheduler/backfiller_tasks.go b/chasm/lib/scheduler/backfiller_tasks.go index dd1484c18a8..525c595db44 100644 --- a/chasm/lib/scheduler/backfiller_tasks.go +++ b/chasm/lib/scheduler/backfiller_tasks.go @@ -63,9 +63,10 @@ func (b *BackfillerTaskHandler) Execute( scheduler := backfiller.Scheduler.Get(ctx) logger := newTaggedLogger(b.baseLogger, scheduler) - invoker := scheduler.Invoker.Get(ctx) + backfiller.EventLog.Get(ctx).LogEvent(ctx, "backfillerTask executed") + // If the buffer is already full, don't move the watermark at all, just back off // and retry. tweakables := b.config.Tweakables(scheduler.Namespace) diff --git a/chasm/lib/scheduler/generator.go b/chasm/lib/scheduler/generator.go index 5f771e49152..5b49162d0ec 100644 --- a/chasm/lib/scheduler/generator.go +++ b/chasm/lib/scheduler/generator.go @@ -1,6 +1,7 @@ package scheduler import ( + "fmt" "time" "go.temporal.io/server/chasm" @@ -49,6 +50,8 @@ func (g *Generator) Generate(ctx chasm.MutableContext) { // scheduleTask schedules a GeneratorTask at the given time. func (g *Generator) scheduleTask(ctx chasm.MutableContext, scheduledTime time.Time) { + g.EventLog.Get(ctx).LogEvent(ctx, + fmt.Sprintf("scheduled generatorTask for %s", scheduledTime.Format(time.RFC3339))) ctx.AddTask(g, chasm.TaskAttributes{ ScheduledTime: scheduledTime, }, &schedulerpb.GeneratorTask{}) diff --git a/chasm/lib/scheduler/generator_tasks.go b/chasm/lib/scheduler/generator_tasks.go index ddc9dfcfb7b..222851e49e7 100644 --- a/chasm/lib/scheduler/generator_tasks.go +++ b/chasm/lib/scheduler/generator_tasks.go @@ -2,6 +2,7 @@ package scheduler import ( "fmt" + "time" "go.temporal.io/server/chasm" "go.temporal.io/server/chasm/lib/scheduler/gen/schedulerpb/v1" @@ -59,6 +60,8 @@ func (g *GeneratorTaskHandler) Execute( now := ctx.Now(generator) + generator.EventLog.Get(ctx).LogEvent(ctx, "generatorTask executed") + // If we have no last processed time, this is a new schedule. if generator.LastProcessedTime == nil { createdAt := timestamppb.New(now) @@ -103,6 +106,8 @@ func (g *GeneratorTaskHandler) Execute( // Emit metrics and update state for any dropped actions. if result.DroppedCount > 0 { + generator.EventLog.Get(ctx).LogEvent(ctx, + fmt.Sprintf("buffer overrun, dropped %d actions", result.DroppedCount)) logger.Warn("Buffer overrun, dropping actions", tag.Int64("dropped-count", result.DroppedCount)) metricsHandler.Counter(metrics.ScheduleBufferOverruns.Name()).Record(result.DroppedCount) @@ -134,6 +139,8 @@ func (g *GeneratorTaskHandler) Execute( // customer can describe/modify/restart the schedule. // // Once the idle timer expires, we close the component. + generator.EventLog.Get(ctx).LogEvent(ctx, + fmt.Sprintf("scheduled idle task for %s", idleExpiration.Format(time.RFC3339))) ctx.AddTask(scheduler, chasm.TaskAttributes{ ScheduledTime: idleExpiration, }, &schedulerpb.SchedulerIdleTask{ diff --git a/chasm/lib/scheduler/invoker.go b/chasm/lib/scheduler/invoker.go index 9c568a8b3a7..41f51c69f0a 100644 --- a/chasm/lib/scheduler/invoker.go +++ b/chasm/lib/scheduler/invoker.go @@ -85,6 +85,8 @@ func (i *Invoker) recordProcessBufferResult(ctx chasm.MutableContext, result *pr // Drop discarded starts, and update requested starts for execution. var starts []*schedulespb.BufferedStart + readiedStarts := 0 + deferredStarts := 0 for _, start := range i.GetBufferedStarts() { if discards[start.RequestId] { continue @@ -93,17 +95,24 @@ func (i *Invoker) recordProcessBufferResult(ctx chasm.MutableContext, result *pr // Starts ready for execution are set to their first attempt. if ready[start.RequestId] && start.Attempt < 1 { start.Attempt = 1 + readiedStarts++ } else if start.Attempt == 0 { // Start was processed but deferred (e.g., BUFFER_ONE policy with running workflow). // Mark as deferred (-1) to distinguish from newly-enqueued starts so addTasks // won't schedule an immediate ProcessBuffer task for them - they wait on // recordCompletedAction to re-enable. start.Attempt = -1 + deferredStarts++ } starts = append(starts, start) } + if readiedStarts > 0 || deferredStarts > 0 { + i.EventLog.Get(ctx).LogEvent(ctx, + fmt.Sprintf("recordProcessBufferResult readied %d starts, deferred %d starts", readiedStarts, deferredStarts)) + } + // Update internal state. i.BufferedStarts = starts i.CancelWorkflows = append(i.GetCancelWorkflows(), result.cancelWorkflows...) @@ -170,13 +179,18 @@ func (i *Invoker) recordExecuteResult(ctx chasm.MutableContext, result *executeR } // Remove failed (non-retryable) starts from the buffer. + removedStarts := 0 + retriedStarts := 0 i.BufferedStarts = slices.DeleteFunc(i.GetBufferedStarts(), func(start *schedulespb.BufferedStart) bool { + removedStarts++ return failed[start.RequestId] }) i.CancelWorkflows = slices.DeleteFunc(i.GetCancelWorkflows(), func(we *commonpb.WorkflowExecution) bool { + removedStarts++ return canceled[we.RunId] }) i.TerminateWorkflows = slices.DeleteFunc(i.GetTerminateWorkflows(), func(we *commonpb.WorkflowExecution) bool { + removedStarts++ return terminated[we.RunId] }) @@ -197,9 +211,16 @@ func (i *Invoker) recordExecuteResult(ctx chasm.MutableContext, result *executeR if retry, ok := retryable[start.RequestId]; ok { start.Attempt++ start.BackoffTime = retry.GetBackoffTime() + retriedStarts++ } } + i.EventLog.Get(ctx).LogEvent(ctx, + fmt.Sprintf("recordExecuteResult kicked off %d starts, removed %d starts, retried %d starts", + newlyStarted, + removedStarts, + retriedStarts)) + i.addTasks(ctx) return newlyStarted, droppedDuplicates } @@ -225,6 +246,8 @@ func (i *Invoker) recordCompletedAction( completed *schedulespb.CompletedResult, requestID string, ) (scheduleTime time.Time) { + i.EventLog.Get(ctx).LogEvent(ctx, fmt.Sprintf("recording completed action: %s", requestID)) + // Find the BufferedStart and mark it as completed. for _, start := range i.BufferedStarts { if start.GetRequestId() == requestID { @@ -270,10 +293,13 @@ func (i *Invoker) addTasks(ctx chasm.MutableContext) { // If we have Attempt = 0 starts, generate a ProcessBufferTask immediately. If we // have starts that are backing off, add a timer task for the earliest backoff time. if i.hasUnprocessedStarts() { + i.EventLog.Get(ctx).LogEvent(ctx, "scheduled processBufferTask immediately") ctx.AddTask(i, chasm.TaskAttributes{ ScheduledTime: chasm.TaskScheduledTimeImmediate, }, &schedulerpb.InvokerProcessBufferTask{}) } else if deadline := i.nextBackoffDeadline(); !deadline.IsZero() { + i.EventLog.Get(ctx).LogEvent(ctx, + fmt.Sprintf("scheduled processBufferTask for %s", deadline.Format(time.RFC3339))) ctx.AddTask(i, chasm.TaskAttributes{ ScheduledTime: deadline, }, &schedulerpb.InvokerProcessBufferTask{}) @@ -284,6 +310,7 @@ func (i *Invoker) addTasks(ctx chasm.MutableContext) { if len(i.GetCancelWorkflows()) > 0 || len(i.GetTerminateWorkflows()) > 0 || len(i.getEligibleBufferedStarts()) > 0 { + i.EventLog.Get(ctx).LogEvent(ctx, "scheduled executeTask") ctx.AddTask(i, chasm.TaskAttributes{}, &schedulerpb.InvokerExecuteTask{}) } } diff --git a/chasm/lib/scheduler/invoker_tasks.go b/chasm/lib/scheduler/invoker_tasks.go index 84b3b1f935b..b62ee8a1c81 100644 --- a/chasm/lib/scheduler/invoker_tasks.go +++ b/chasm/lib/scheduler/invoker_tasks.go @@ -390,6 +390,8 @@ func (h *InvokerProcessBufferTaskHandler) Execute( ) error { scheduler := invoker.Scheduler.Get(ctx) + invoker.EventLog.Get(ctx).LogEvent(ctx, "processBufferTask executed") + // Make sure we have something to start. executionInfo := scheduler.Schedule.GetAction().GetStartWorkflow() if executionInfo == nil { diff --git a/chasm/lib/scheduler/scheduler.go b/chasm/lib/scheduler/scheduler.go index c774461de07..0de9856704d 100644 --- a/chasm/lib/scheduler/scheduler.go +++ b/chasm/lib/scheduler/scheduler.go @@ -168,6 +168,7 @@ func NewSentinel( Info: &schedulepb.ScheduleInfo{}, }, cacheConflictToken: scheduler.InitialConflictToken, + EventLog: chasm.NewComponentField(ctx, NewEventLog(ctx)), } now := ctx.Now(s) s.Info.CreateTime = timestamppb.New(now) @@ -787,6 +788,8 @@ func (s *Scheduler) MigrateToWorkflow( s.Schedule.State.Paused = true s.Schedule.State.Notes = "paused for migration to workflow-backed scheduler" + s.EventLog.Get(ctx).LogEvent(ctx, "started migration to V1") + // Schedule a side-effect task to export state and start the V1 workflow. ctx.AddTask(s, chasm.TaskAttributes{}, &schedulerpb.SchedulerMigrateToWorkflowTask{}) @@ -843,6 +846,7 @@ func (s *Scheduler) Update( s.Info.UpdateTime = timestamppb.New(ctx.Now(s)) s.updateConflictToken() + s.EventLog.Get(ctx).LogEvent(ctx, "updated via API") // Since the spec may have been updated, kick off the generator. s.Generator.Get(ctx).Generate(ctx) @@ -869,6 +873,7 @@ func (s *Scheduler) Patch( if req.FrontendRequest.Patch.Pause != "" { s.Schedule.State.Paused = true s.Schedule.State.Notes = req.FrontendRequest.Patch.Pause + s.EventLog.Get(ctx).LogEvent(ctx, fmt.Sprintf("paused via API: %s", req.FrontendRequest.Patch.Pause)) } if req.FrontendRequest.Patch.Unpause != "" { if s.WorkflowMigration != nil { @@ -876,6 +881,7 @@ func (s *Scheduler) Patch( } s.Schedule.State.Paused = false s.Schedule.State.Notes = req.FrontendRequest.Patch.Unpause + s.EventLog.Get(ctx).LogEvent(ctx, fmt.Sprintf("unpaused via API: %s", req.FrontendRequest.Patch.Unpause)) } if err := s.handlePatch(ctx, req.FrontendRequest.Patch); err != nil { diff --git a/chasm/lib/scheduler/scheduler_tasks.go b/chasm/lib/scheduler/scheduler_tasks.go index 7e84fed84cc..e8688f231de 100644 --- a/chasm/lib/scheduler/scheduler_tasks.go +++ b/chasm/lib/scheduler/scheduler_tasks.go @@ -52,6 +52,7 @@ func (r *SchedulerIdleTaskHandler) Execute( _ chasm.TaskAttributes, _ *schedulerpb.SchedulerIdleTask, ) error { + scheduler.EventLog.Get(ctx).LogEvent(ctx, "schedule closed from idle timer") scheduler.Closed = true return nil } @@ -189,6 +190,9 @@ func (r *SchedulerCallbacksTaskHandler) Execute( } } + s.EventLog.Get(ctx).LogEvent(ctx, + fmt.Sprintf("attached callbacks to %d already-running workflow(s)", len(results))) + // Now that running workflow state has been refreshed, scheduler tasks can be // fired. invoker.addTasks(ctx) From 52cb067bc63aaced7f4b271e6c98355b7466104e Mon Sep 17 00:00:00 2001 From: Lina Jodoin Date: Fri, 5 Jun 2026 13:28:36 -0700 Subject: [PATCH 6/8] [Scheduler] Add a new flag to allow only migration of schedules without running workflows (#10505) ## What changed? - Added a new dynamic config flag to control whether V1 schedules will migrate with running workflows. ## Why? - When V1 schedules migrate to V2 with running workflows, V2 attaches a completion callback to the running workflow (as V2 exclusively uses Nexus completion callbacks to monitor workflow status). Certain third-party SDKs are known to have issues with one of the events written to the history of the running workflow, which can cause them to panic (Coinbase SDK). This allows us to dial up migration for these customers without triggering that edge case. ## How did you test it? - [ ] built - [ ] run locally and tested manually - [ ] covered by existing tests - [ ] added new unit test(s) - [x] added new functional test(s) ## Potential risks - Slower migration cadence, potentially (cherry picked from commit a046c79c6432b9f853bfe5fc931c6bb9a20b9bb7) --- common/dynamicconfig/constants.go | 8 ++ service/worker/scheduler/fx.go | 7 +- service/worker/scheduler/workflow.go | 24 ++-- service/worker/scheduler/workflow_test.go | 42 +++++-- tests/schedule_migration_test.go | 142 +++++++++++++++++++++- 5 files changed, 201 insertions(+), 22 deletions(-) diff --git a/common/dynamicconfig/constants.go b/common/dynamicconfig/constants.go index d052cfc87fa..976f7711837 100644 --- a/common/dynamicconfig/constants.go +++ b/common/dynamicconfig/constants.go @@ -2976,6 +2976,14 @@ EnableCHASMSchedulerMigration is true. The decision is re-evaluated when a scheduler workflow starts or continues-as-new.`, ) + EnableCHASMSchedulerMigrationWithRunningWorkflows = NewNamespaceBoolSetting( + "history.enableCHASMSchedulerMigrationWithRunningWorkflows", + false, + `EnableCHASMSchedulerMigrationWithRunningWorkflows, when set to false, prevents schedules with +running workflows from being migrated. This works around a known bug in 3P SDKs involving updating +existing workflows to attach callbacks.`, + ) + EnableCHASMSchedulerSentinels = NewNamespaceBoolSetting( "history.enableCHASMSchedulerSentinels", true, diff --git a/service/worker/scheduler/fx.go b/service/worker/scheduler/fx.go index 13e49f233c5..fd8d755342a 100644 --- a/service/worker/scheduler/fx.go +++ b/service/worker/scheduler/fx.go @@ -54,6 +54,7 @@ type ( enabledForNs dynamicconfig.BoolPropertyFnWithNamespaceFilter enableCHASMMigration dynamicconfig.BoolPropertyFnWithNamespaceFilter chasmMigrationRolloutPercent dynamicconfig.IntPropertyFnWithNamespaceFilter + migrateWithRunningWorkflows dynamicconfig.BoolPropertyFnWithNamespaceFilter globalNSStartWorkflowRPS dynamicconfig.TypedSubscribableWithNamespaceFilter[float64] maxBlobSize dynamicconfig.IntPropertyFnWithNamespaceFilter localActivitySleepLimit dynamicconfig.DurationPropertyFnWithNamespaceFilter @@ -91,6 +92,7 @@ func NewResult( enabledForNs: dynamicconfig.WorkerEnableScheduler.Get(dc), enableCHASMMigration: dynamicconfig.EnableCHASMSchedulerMigration.Get(dc), chasmMigrationRolloutPercent: dynamicconfig.CHASMSchedulerMigrationRolloutPercent.Get(dc), + migrateWithRunningWorkflows: dynamicconfig.EnableCHASMSchedulerMigrationWithRunningWorkflows.Get(dc), globalNSStartWorkflowRPS: dynamicconfig.SchedulerNamespaceStartWorkflowRPS.Subscribe(dc), maxBlobSize: dynamicconfig.BlobSizeLimitError.Get(dc), localActivitySleepLimit: dynamicconfig.SchedulerLocalActivitySleepLimit.Get(dc), @@ -112,7 +114,10 @@ func (s *workerComponent) Register(registry sdkworker.Registry, ns *namespace.Na return s.enableCHASMMigration(nsName) && dynamicconfig.RolloutAccepts(key, s.chasmMigrationRolloutPercent(nsName)) } - return schedulerWorkflowWithSpecBuilder(ctx, args, s.specBuilder, enableMigration) + migrateWithRunningWorkflows := func() bool { + return s.migrateWithRunningWorkflows(nsName) + } + return schedulerWorkflowWithSpecBuilder(ctx, args, s.specBuilder, enableMigration, migrateWithRunningWorkflows) } registry.RegisterWorkflowWithOptions(wfFunc, workflow.RegisterOptions{Name: WorkflowType}) diff --git a/service/worker/scheduler/workflow.go b/service/worker/scheduler/workflow.go index 38023b72f1e..d237197e679 100644 --- a/service/worker/scheduler/workflow.go +++ b/service/worker/scheduler/workflow.go @@ -114,9 +114,10 @@ type ( // without modifying state). specBuilder *SpecBuilder cspec *CompiledSpec - // enableCHASMMigration is re-evaluated every iteration inside the - // "tweakables" MutableSideEffect. - enableCHASMMigration func() bool + // enableCHASMMigration and migrateWithRunningWorkflows are re-evaluated + // every iteration inside the "tweakables" MutableSideEffect. + enableCHASMMigration func() bool + migrateWithRunningWorkflows func() bool tweakables TweakablePolicies @@ -161,7 +162,8 @@ type ( Version SchedulerWorkflowVersion // Used to keep track of schedules version to release new features and for backward compatibility // version 0 corresponds to the schedule version that comes before introducing the Version parameter - EnableCHASMMigration bool // Whether to automatically migrate this schedule to CHASM (V2) + EnableCHASMMigration bool // Whether to automatically migrate this schedule to CHASM (V2) + MigrateWithRunningWorkflows bool // Whether to migrate this schedule to CHASM (V2) while it has running workflows // When introducing a new field with new workflow logic, consider generating a new // history for TestReplays using generate_history.sh. @@ -226,10 +228,11 @@ var ( ) func SchedulerWorkflow(ctx workflow.Context, args *schedulespb.StartScheduleArgs) error { - return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), func() bool { return false }) + disabled := func() bool { return false } + return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), disabled, disabled) } -func schedulerWorkflowWithSpecBuilder(ctx workflow.Context, args *schedulespb.StartScheduleArgs, specBuilder *SpecBuilder, enableCHASMMigration func() bool) error { +func schedulerWorkflowWithSpecBuilder(ctx workflow.Context, args *schedulespb.StartScheduleArgs, specBuilder *SpecBuilder, enableCHASMMigration func() bool, migrateWithRunningWorkflows func() bool) error { scheduler := &scheduler{ StartScheduleArgs: args, ctx: ctx, @@ -239,8 +242,9 @@ func schedulerWorkflowWithSpecBuilder(ctx workflow.Context, args *schedulespb.St "namespace": args.State.Namespace, metrics.ScheduleBackendTag: metrics.ScheduleBackendLegacy, }), - specBuilder: specBuilder, - enableCHASMMigration: enableCHASMMigration, + specBuilder: specBuilder, + enableCHASMMigration: enableCHASMMigration, + migrateWithRunningWorkflows: migrateWithRunningWorkflows, } return scheduler.run() } @@ -324,7 +328,8 @@ func (s *scheduler) run() error { ) } - if s.tweakables.EnableCHASMMigration { + if !s.State.PendingMigration && s.tweakables.EnableCHASMMigration && + (s.tweakables.MigrateWithRunningWorkflows || len(s.Info.RunningWorkflows) == 0) { s.State.PendingMigration = true } if s.State.PendingMigration { @@ -1271,6 +1276,7 @@ func (s *scheduler) updateTweakables() { p := CurrentTweakablePolicies // Re-evaluates migration dynamic config each iteration. p.EnableCHASMMigration = s.enableCHASMMigration() + p.MigrateWithRunningWorkflows = s.migrateWithRunningWorkflows() return p } eq := func(a, b any) bool { return a.(TweakablePolicies) == b.(TweakablePolicies) } diff --git a/service/worker/scheduler/workflow_test.go b/service/worker/scheduler/workflow_test.go index 1311709ec4d..34e40992d88 100644 --- a/service/worker/scheduler/workflow_test.go +++ b/service/worker/scheduler/workflow_test.go @@ -2275,14 +2275,19 @@ func (s *workflowSuite) TestMigrateSuccess() { // Mock MigrateSchedule activity to succeed. s.env.OnActivity(new(activities).MigrateScheduleToChasm, mock.Anything, mock.Anything).Once().Return(nil) - // Send migrate signal after the first iteration. + // Enable migration and request it via signal after the first iteration. + enableMigration := false s.env.RegisterDelayedCallback(func() { + enableMigration = true s.env.SignalWorkflow(SignalNameMigrateToChasm, nil) }, 1*time.Second) CurrentTweakablePolicies.IterationsBeforeContinueAsNew = 100 s.env.SetStartTime(baseStartTime) - s.env.ExecuteWorkflow(SchedulerWorkflow, &schedulespb.StartScheduleArgs{ + s.env.ExecuteWorkflow(func(ctx workflow.Context, args *schedulespb.StartScheduleArgs) error { + return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), + func() bool { return enableMigration }, func() bool { return true }) + }, &schedulespb.StartScheduleArgs{ Schedule: &schedulepb.Schedule{ Spec: &schedulepb.ScheduleSpec{ Interval: []*schedulepb.IntervalSpec{{ @@ -2314,8 +2319,10 @@ func (s *workflowSuite) TestMigrateFailure() { return errors.New("migration failed") }) - // Send migrate signal after the first iteration. + // Enable migration and request it via signal after the first iteration. + enableMigration := false s.env.RegisterDelayedCallback(func() { + enableMigration = true s.env.SignalWorkflow(SignalNameMigrateToChasm, nil) }, 1*time.Second) @@ -2328,7 +2335,10 @@ func (s *workflowSuite) TestMigrateFailure() { CurrentTweakablePolicies.IterationsBeforeContinueAsNew = 100 s.env.SetStartTime(baseStartTime) - s.env.ExecuteWorkflow(SchedulerWorkflow, &schedulespb.StartScheduleArgs{ + s.env.ExecuteWorkflow(func(ctx workflow.Context, args *schedulespb.StartScheduleArgs) error { + return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), + func() bool { return enableMigration }, func() bool { return true }) + }, &schedulespb.StartScheduleArgs{ Schedule: &schedulepb.Schedule{ Spec: &schedulepb.ScheduleSpec{ Interval: []*schedulepb.IntervalSpec{{ @@ -2370,14 +2380,19 @@ func (s *workflowSuite) TestMigrateFailureThenRetrySuccess() { return nil }) - // Send migrate signal after the first iteration. + // Enable migration and request it via signal after the first iteration. + enableMigration := false s.env.RegisterDelayedCallback(func() { + enableMigration = true s.env.SignalWorkflow(SignalNameMigrateToChasm, nil) }, 1*time.Second) CurrentTweakablePolicies.IterationsBeforeContinueAsNew = 100 s.env.SetStartTime(baseStartTime) - s.env.ExecuteWorkflow(SchedulerWorkflow, &schedulespb.StartScheduleArgs{ + s.env.ExecuteWorkflow(func(ctx workflow.Context, args *schedulespb.StartScheduleArgs) error { + return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), + func() bool { return enableMigration }, func() bool { return true }) + }, &schedulespb.StartScheduleArgs{ Schedule: &schedulepb.Schedule{ Spec: &schedulepb.ScheduleSpec{ Interval: []*schedulepb.IntervalSpec{{ @@ -2410,8 +2425,10 @@ func (s *workflowSuite) TestMigrateFailureThenSignal() { return errors.New("migration failed") }) - // Send migrate signal after the first iteration. + // Enable migration and request it via signal after the first iteration. + enableMigration := false s.env.RegisterDelayedCallback(func() { + enableMigration = true s.env.SignalWorkflow(SignalNameMigrateToChasm, nil) }, 1*time.Second) // After migration failure, send a pause patch and verify it's processed, @@ -2434,7 +2451,10 @@ func (s *workflowSuite) TestMigrateFailureThenSignal() { CurrentTweakablePolicies.IterationsBeforeContinueAsNew = 100 s.env.SetStartTime(baseStartTime) - s.env.ExecuteWorkflow(SchedulerWorkflow, &schedulespb.StartScheduleArgs{ + s.env.ExecuteWorkflow(func(ctx workflow.Context, args *schedulespb.StartScheduleArgs) error { + return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), + func() bool { return enableMigration }, func() bool { return true }) + }, &schedulespb.StartScheduleArgs{ Schedule: &schedulepb.Schedule{ Spec: &schedulepb.ScheduleSpec{ Interval: []*schedulepb.IntervalSpec{{ @@ -2476,7 +2496,7 @@ func (s *workflowSuite) TestMigrateDynamicConfig() { s.env.SetStartTime(baseStartTime) s.env.ExecuteWorkflow(func(ctx workflow.Context, args *schedulespb.StartScheduleArgs) error { - return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), func() bool { return true }) + return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), func() bool { return true }, func() bool { return true }) }, &schedulespb.StartScheduleArgs{ Schedule: &schedulepb.Schedule{ Spec: &schedulepb.ScheduleSpec{ @@ -2525,7 +2545,7 @@ func (s *workflowSuite) TestMigrateDynamicConfigFlipsMidRun() { s.env.SetStartTime(baseStartTime) s.env.ExecuteWorkflow(func(ctx workflow.Context, args *schedulespb.StartScheduleArgs) error { - return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), func() bool { return enabled }) + return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), func() bool { return enabled }, func() bool { return true }) }, &schedulespb.StartScheduleArgs{ Schedule: &schedulepb.Schedule{ Spec: &schedulepb.ScheduleSpec{ @@ -2564,7 +2584,7 @@ func (s *workflowSuite) TestMigrateDynamicConfigFailure() { s.env.SetStartTime(baseStartTime) s.env.ExecuteWorkflow(func(ctx workflow.Context, args *schedulespb.StartScheduleArgs) error { - return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), func() bool { return true }) + return schedulerWorkflowWithSpecBuilder(ctx, args, NewSpecBuilder(), func() bool { return true }, func() bool { return true }) }, &schedulespb.StartScheduleArgs{ Schedule: &schedulepb.Schedule{ Spec: &schedulepb.ScheduleSpec{ diff --git a/tests/schedule_migration_test.go b/tests/schedule_migration_test.go index 5bc46c0a375..b2abc212cac 100644 --- a/tests/schedule_migration_test.go +++ b/tests/schedule_migration_test.go @@ -170,6 +170,7 @@ func (s *ScheduleMigrationTestSuite) TestScheduleMigrationV2AlreadyExists() { // and the V1 activity treats that as success (logs warning, returns nil). // The V1 workflow terminates, but the pre-existing V2 schedule retains // its original state -- the V1 state is not applied. + env.OverrideDynamicConfig(dynamicconfig.EnableCHASMSchedulerMigration, true) _, err = env.AdminClient().MigrateSchedule(ctx, &adminservice.MigrateScheduleRequest{ Namespace: nsName, ScheduleId: sid, @@ -398,6 +399,7 @@ func (s *ScheduleMigrationTestSuite) TestScheduleMigrationV1ToV2() { }, 10*time.Second, 500*time.Millisecond) // Issue migration from V1 to V2. + env.OverrideDynamicConfig(dynamicconfig.EnableCHASMSchedulerMigration, true) _, err = env.AdminClient().MigrateSchedule(ctx, &adminservice.MigrateScheduleRequest{ Namespace: nsName, ScheduleId: sid, @@ -1140,6 +1142,7 @@ func (s *ScheduleMigrationTestSuite) TestScheduleMigrationV1ToV2WithClosedV2() { // Issue migration from V1 to V2. The previously deleted CHASM execution // does not block creation of a new one -- StartExecution succeeds because // closed executions allow reuse of the business ID. + env.OverrideDynamicConfig(dynamicconfig.EnableCHASMSchedulerMigration, true) _, err = env.AdminClient().MigrateSchedule(ctx, &adminservice.MigrateScheduleRequest{ Namespace: nsName, ScheduleId: sid, @@ -1259,8 +1262,10 @@ func TestScheduleMigrationV1ToV2NoDuplicateRecentActions(t *testing.T) { return true }, 15*time.Second, 500*time.Millisecond) - // Enable CHASM now so the migration activity can create the V2 schedule. + // Enable CHASM and migration, allowing it for schedules with running workflows. env.OverrideDynamicConfig(dynamicconfig.EnableChasm, true) + env.OverrideDynamicConfig(dynamicconfig.EnableCHASMSchedulerMigration, true) + env.OverrideDynamicConfig(dynamicconfig.EnableCHASMSchedulerMigrationWithRunningWorkflows, true) // Migrate from V1 to V2 while the workflow is still running. _, err = env.AdminClient().MigrateSchedule(ctx, &adminservice.MigrateScheduleRequest{ @@ -1322,6 +1327,141 @@ func TestScheduleMigrationV1ToV2NoDuplicateRecentActions(t *testing.T) { require.NoError(t, err) } +// TestScheduleMigrationDeferredWithRunningWorkflow verifies that +// EnableCHASMSchedulerMigrationWithRunningWorkflows gates automatic migration of a +// V1 schedule that has a running workflow. When the gate is off (default), +// auto-migration is deferred while a workflow is running (the scheduler keeps +// running rather than migrating and completing); when the gate is turned on, +// auto-migration proceeds even with a running workflow. +func TestScheduleMigrationDeferredWithRunningWorkflow(t *testing.T) { + // Create the env without EnableChasm so that CreateSchedule produces a V1 + // (workflow-backed) schedule rather than a CHASM sentinel. + env := testcore.NewEnv( + t, + testcore.WithWorkerService("V1 scheduler"), + testcore.WithDynamicConfig(dynamicconfig.EnableCHASMSchedulerMigrationWithRunningWorkflows, false), + ) + + ctx := testcore.NewContext() + sid := testcore.RandomizeStr("sched-migrate-defer-running") + wid := testcore.RandomizeStr("sched-migrate-defer-running-wf") + wt := testcore.RandomizeStr("sched-migrate-defer-running-wt") + + nsName := env.Namespace().String() + nsID := env.NamespaceID().String() + + // A workflow that blocks until signaled, so it stays running during migration. + resumeSignal := "resume" + workflowFn := func(ctx workflow.Context) error { + ch := workflow.GetSignalChannel(ctx, resumeSignal) + ch.Receive(ctx, nil) + return nil + } + env.SdkWorker().RegisterWorkflowWithOptions(workflowFn, workflow.RegisterOptions{Name: wt}) + + env.OverrideDynamicConfig(dynamicconfig.EnableChasm, false) + + // Create a V1 schedule with a short interval, so the scheduler iterates often + // and re-evaluates the auto-migration decision, and an immediate trigger so it + // starts a workflow. SKIP keeps a single workflow running across intervals. + sched := &schedulepb.Schedule{ + Spec: &schedulepb.ScheduleSpec{ + Interval: []*schedulepb.IntervalSpec{{Interval: durationpb.New(2 * time.Second)}}, + }, + Policies: &schedulepb.SchedulePolicies{ + OverlapPolicy: enumspb.SCHEDULE_OVERLAP_POLICY_SKIP, + }, + Action: &schedulepb.ScheduleAction{ + Action: &schedulepb.ScheduleAction_StartWorkflow{ + StartWorkflow: &workflowpb.NewWorkflowExecutionInfo{ + WorkflowId: wid, + WorkflowType: &commonpb.WorkflowType{Name: wt}, + TaskQueue: &taskqueuepb.TaskQueue{Name: env.WorkerTaskQueue(), Kind: enumspb.TASK_QUEUE_KIND_NORMAL}, + }, + }, + }, + } + _, err := env.FrontendClient().CreateSchedule(ctx, &workflowservice.CreateScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + Schedule: sched, + InitialPatch: &schedulepb.SchedulePatch{ + TriggerImmediately: &schedulepb.TriggerImmediatelyRequest{}, + }, + Identity: "test", + RequestId: uuid.NewString(), + }) + require.NoError(t, err) + + // Wait for the V1 scheduler to start the workflow and record it as running. + var runningWfID string + require.Eventually(t, func() bool { + descResp, err := env.FrontendClient().DescribeSchedule(ctx, &workflowservice.DescribeScheduleRequest{ + Namespace: nsName, + ScheduleId: sid, + }) + if err != nil || len(descResp.GetInfo().GetRecentActions()) == 0 { + return false + } + a := descResp.Info.RecentActions[0] + if a.GetStartWorkflowStatus() != enumspb.WORKFLOW_EXECUTION_STATUS_RUNNING { + return false + } + runningWfID = a.GetStartWorkflowResult().GetWorkflowId() + return true + }, 15*time.Second, 500*time.Millisecond) + + // Enable CHASM and migration; the running-workflow gate stays off, so only the + // running action workflow can defer migration here. + env.OverrideDynamicConfig(dynamicconfig.EnableChasm, true) + env.OverrideDynamicConfig(dynamicconfig.EnableCHASMSchedulerMigration, true) + + // A successful migration completes the V1 scheduler workflow; while migration + // is deferred it stays running. + v1WorkflowID := scheduler.WorkflowIDPrefix + sid + v1Migrated := func() bool { + desc, err := env.GetTestCluster().HistoryClient().DescribeWorkflowExecution( + ctx, + &historyservice.DescribeWorkflowExecutionRequest{ + NamespaceId: nsID, + Request: &workflowservice.DescribeWorkflowExecutionRequest{ + Namespace: nsName, + Execution: &commonpb.WorkflowExecution{WorkflowId: v1WorkflowID}, + }, + }, + ) + return err == nil && desc.GetWorkflowExecutionInfo().GetStatus() == enumspb.WORKFLOW_EXECUTION_STATUS_COMPLETED + } + + // With the gate off, auto-migration is deferred: the scheduler re-evaluates the + // decision every interval but keeps running rather than migrating. + require.Never(t, v1Migrated, 5*time.Second, 500*time.Millisecond, + "migration must be deferred while the schedule has a running workflow and the gate is off") + + // Turn the gate on; auto-migration now proceeds even with the workflow running. + env.OverrideDynamicConfig(dynamicconfig.EnableCHASMSchedulerMigrationWithRunningWorkflows, true) + require.Eventually(t, v1Migrated, 15*time.Second, 500*time.Millisecond, + "migration should proceed once the gate allows running workflows") + + // The V2 schedule should now exist. + _, err = env.GetTestCluster().SchedulerClient().DescribeSchedule( + ctx, + &schedulerpb.DescribeScheduleRequest{ + NamespaceId: nsID, + FrontendRequest: &workflowservice.DescribeScheduleRequest{Namespace: nsName, ScheduleId: sid}, + }, + ) + require.NoError(t, err) + + // Clean up: signal the running workflow to complete. + _, err = env.FrontendClient().SignalWorkflowExecution(ctx, &workflowservice.SignalWorkflowExecutionRequest{ + Namespace: nsName, + WorkflowExecution: &commonpb.WorkflowExecution{WorkflowId: runningWfID}, + SignalName: resumeSignal, + }) + require.NoError(t, err) +} + // TestDeleteScheduleContextMetadata verifies that DeleteSchedule propagates the // correct context metadata (workflow-type, workflow-task-queue) for every // combination of CHASM and V1 state. This metadata is read by saas-temporal's From cdf0e0db34d7f58c7da9c29cb2d6244e63258d68 Mon Sep 17 00:00:00 2001 From: David Porter Date: Fri, 5 Jun 2026 16:12:02 -0700 Subject: [PATCH 7/8] feat: Adding scanner for invariants for schedules v2 (#10406) ## What changed? This introduces an optional scanner for looking through v2 schedules and to perform some invariant checks and metric checks. The theory is that: - We should generally not see schedules with the nextActionTime in the past, and for those that do, they should be paused or be waiting for the workflow to fire. - We should not see workflows that are closed that are still around for a long period of time - We should expect (eventually, after migration and rollout of schedules-v2) all workflows to fit into the category of closed or running. Some of these assertions are possibly a bit premature as my understanding of the system is still ramping up, but it's relatively low risk, all the scanner does is emit a log or metric. ## Why? The intent is to just get a better handle on the schedules feature as it rolls out, owing to its criticality. It's hard to test for all edge-cases ahead of time, so looking for invariants in production can be illuminating. ## How did you test it? - [ ] built - [X] run locally and tested manually - [ ] covered by existing tests - [X] added new unit test(s) - [ ] added new functional test(s) For testing, I intentionally added a bug in the scheduler to hang the schedule after firing and then let it run and added lots of loud log messages so I could see if the scanner was able to detect it, which it was. This does *not* cover the cases where permissions might differ across namespaces, I still need to investigate that a bit more. ## Potential risks Probably fairly low: - A serious bug could cause a hot-loop of continue-as-new workflows if the activities were both misconfigured and unable to start, but otherwise it's unlikely to do much. - A misconfiguration in the rate-limit might cause visibility to get somewhat hammered by the scanner, generating some visibility load. (cherry picked from commit 824d2d061fff80f7979cc651a5db73806f512d82) --- common/dynamicconfig/constants.go | 45 ++ common/metrics/metric_defs.go | 28 +- service/worker/scanner/scanner.go | 75 +++ service/worker/scanner/scanner_test.go | 38 +- .../scanner/scheduleinvariants/invariants.go | 411 +++++++++++++++++ .../scheduleinvariants/invariants_test.go | 432 ++++++++++++++++++ .../scanner/scheduleinvariants/workflows.go | 102 +++++ service/worker/service.go | 8 + 8 files changed, 1112 insertions(+), 27 deletions(-) create mode 100644 service/worker/scanner/scheduleinvariants/invariants.go create mode 100644 service/worker/scanner/scheduleinvariants/invariants_test.go create mode 100644 service/worker/scanner/scheduleinvariants/workflows.go diff --git a/common/dynamicconfig/constants.go b/common/dynamicconfig/constants.go index 976f7711837..523c80f401e 100644 --- a/common/dynamicconfig/constants.go +++ b/common/dynamicconfig/constants.go @@ -604,6 +604,51 @@ is currently processing a task. `BuildIdScavengerVisibilityRPS is the rate limit for visibility calls from the build id scavenger`, ) + ScheduleInvariantsScannerOverdueNextActionTimeEnabled = NewGlobalBoolSetting( + "worker.scheduleInvariantsScannerOverdueNextActionTimeEnabled", + false, + `ScheduleInvariantsScannerOverdueNextActionTimeEnabled enables flagging schedules whose +TemporalScheduleNextActionTime lies further in the past than the tolerance.`, + ) + ScheduleInvariantsScannerStuckOpenEnabled = NewGlobalBoolSetting( + "worker.scheduleInvariantsScannerStuckOpenEnabled", + false, + `ScheduleInvariantsScannerStuckOpenEnabled enables flagging schedules that appear stuck open +long after their CloseTime.`, + ) + ScheduleInvariantsScannerUnknownStateEnabled = NewGlobalBoolSetting( + "worker.scheduleInvariantsScannerUnknownStateEnabled", + false, + `ScheduleInvariantsScannerUnknownStateEnabled enables flagging running, unpaused schedules with +no TemporalScheduleNextActionTime. Ship disabled until TemporalScheduleNextActionTime is known to be +backfilled on legacy schedules.`, + ) + ScheduleInvariantsScannerOverdueNextActionTimeTolerance = NewGlobalDurationSetting( + "worker.scheduleInvariantsScannerOverdueNextActionTimeTolerance", + 10*time.Minute, + `ScheduleInvariantsScannerOverdueNextActionTimeTolerance is how far in the past +TemporalScheduleNextActionTime must be before the schedule is flagged.`, + ) + ScheduleInvariantsScannerVisibilityRPS = NewGlobalFloatSetting( + "worker.scheduleInvariantsScannerVisibilityRPS", + 1.0, + `ScheduleInvariantsScannerVisibilityRPS rate-limits visibility calls from the +schedule-invariants scanner.`, + ) + ScheduleInvariantsScannerScanInterval = NewGlobalDurationSetting( + "worker.scheduleInvariantsScannerScanInterval", + 15*time.Minute, + `ScheduleInvariantsScannerScanInterval is how often each schedule-invariants scanner +activity kicks off a fresh scan pass.`, + ) + ScheduleInvariantsScannerStuckOpenIdleTimeBufferMultiplier = NewGlobalIntSetting( + "worker.scheduleInvariantsScannerStuckOpenIdleTimeBufferMultiplier", + 2, + `ScheduleInvariantsScannerStuckOpenIdleTimeBufferMultiplier multiplies the configured +schedule IdleTime to set how far past a schedule's idle-close deadline it must be before the +stuck-open scanner flags it.`, + ) + // keys for frontend FrontendAllowedExperiments = NewNamespaceTypedSetting( "frontend.allowedExperiments", diff --git a/common/metrics/metric_defs.go b/common/metrics/metric_defs.go index b04c64cae23..6bdbf3b243a 100644 --- a/common/metrics/metric_defs.go +++ b/common/metrics/metric_defs.go @@ -506,6 +506,8 @@ const ( VisibilityArchiverScope = "VisibilityArchiver" // HistoryScavengerScope is scope used by all metrics emitted by worker.history.Scavenger module HistoryScavengerScope = "HistoryScavenger" + // ScheduleInvariantsScannerScope is scope used by metrics emitted by the schedule-invariants scanner + ScheduleInvariantsScannerScope = "ScheduleInvariantsScanner" // ArchiverDeleteHistoryActivityScope is scope used by all metrics emitted by archiver.DeleteHistoryActivity ArchiverDeleteHistoryActivityScope = "ArchiverDeleteHistoryActivity" // ArchiverUploadHistoryActivityScope is scope used by all metrics emitted by archiver.UploadHistoryActivity @@ -1324,17 +1326,21 @@ var ( HistoryWorkflowExecutionCacheLatency = NewTimerDef("history_workflow_execution_cache_latency") HistoryWorkflowExecutionCacheLockHoldDuration = NewTimerDef("history_workflow_execution_cache_lock_hold_duration") - VisibilityArchiverArchiveNonRetryableErrorCount = NewCounterDef("visibility_archiver_archive_non_retryable_error") - VisibilityArchiverArchiveTransientErrorCount = NewCounterDef("visibility_archiver_archive_transient_error") - VisibilityArchiveSuccessCount = NewCounterDef("visibility_archiver_archive_success") - HistoryScavengerSuccessCount = NewCounterDef("scavenger_success") - HistoryScavengerErrorCount = NewCounterDef("scavenger_errors") - HistoryScavengerSkipCount = NewCounterDef("scavenger_skips") - ExecutionsOutstandingCount = NewGaugeDef("executions_outstanding") - ScavengerValidationRequestsCount = NewCounterDef("scavenger_validation_requests") - ScavengerValidationFailuresCount = NewCounterDef("scavenger_validation_failures") - ScavengerValidationSkipsCount = NewCounterDef("scavenger_validation_skips") - AddSearchAttributesFailuresCount = NewCounterDef("add_search_attributes_failures") + VisibilityArchiverArchiveNonRetryableErrorCount = NewCounterDef("visibility_archiver_archive_non_retryable_error") + VisibilityArchiverArchiveTransientErrorCount = NewCounterDef("visibility_archiver_archive_transient_error") + VisibilityArchiveSuccessCount = NewCounterDef("visibility_archiver_archive_success") + HistoryScavengerSuccessCount = NewCounterDef("scavenger_success") + HistoryScavengerErrorCount = NewCounterDef("scavenger_errors") + HistoryScavengerSkipCount = NewCounterDef("scavenger_skips") + ScheduleInvariantsScannerOverdueNextActionTimeCount = NewCounterDef("schedule_invariants_scanner_overdue_next_action_time") + ScheduleInvariantsScannerStuckOpenCount = NewCounterDef("schedule_invariants_scanner_stuck_open") + ScheduleInvariantsScannerUnknownStateCount = NewCounterDef("schedule_invariants_scanner_unknown_state") + ScheduleInvariantsScannerErrorCount = NewCounterDef("schedule_invariants_scanner_errors") + ExecutionsOutstandingCount = NewGaugeDef("executions_outstanding") + ScavengerValidationRequestsCount = NewCounterDef("scavenger_validation_requests") + ScavengerValidationFailuresCount = NewCounterDef("scavenger_validation_failures") + ScavengerValidationSkipsCount = NewCounterDef("scavenger_validation_skips") + AddSearchAttributesFailuresCount = NewCounterDef("add_search_attributes_failures") // Delete Namespace metrics. ReclaimResourcesNamespaceDeleteSuccessCount = NewCounterDef( diff --git a/service/worker/scanner/scanner.go b/service/worker/scanner/scanner.go index 8e4681641f7..2cd1b7b25bb 100644 --- a/service/worker/scanner/scanner.go +++ b/service/worker/scanner/scanner.go @@ -15,6 +15,7 @@ import ( "go.temporal.io/server/api/matchingservice/v1" "go.temporal.io/server/common" "go.temporal.io/server/common/backoff" + "go.temporal.io/server/common/clock" "go.temporal.io/server/common/config" "go.temporal.io/server/common/dynamicconfig" "go.temporal.io/server/common/headers" @@ -26,8 +27,10 @@ import ( "go.temporal.io/server/common/persistence" "go.temporal.io/server/common/persistence/serialization" "go.temporal.io/server/common/persistence/visibility/manager" + "go.temporal.io/server/common/persistence/visibility/store/elasticsearch" "go.temporal.io/server/common/sdk" "go.temporal.io/server/service/worker/scanner/build_ids" + "go.temporal.io/server/service/worker/scanner/scheduleinvariants" ) type ( @@ -71,6 +74,16 @@ type ( RemovableBuildIdDurationSinceDefault dynamicconfig.DurationPropertyFn // BuildIdScavengerVisibilityRPS is the rate limit for visibility calls from the build ID scavenger BuildIdScavengerVisibilityRPS dynamicconfig.FloatPropertyFn + + // Schedule-invariants scanners. Each runs as an independent cron workflow; all three + // require advanced (Elasticsearch) visibility to be configured (otherwise they're skipped). + ScheduleInvariantsScannerOverdueNextActionTimeEnabled dynamicconfig.BoolPropertyFn + ScheduleInvariantsScannerStuckOpenEnabled dynamicconfig.BoolPropertyFn + ScheduleInvariantsScannerUnknownStateEnabled dynamicconfig.BoolPropertyFn + ScheduleInvariantsScannerOverdueNextActionTimeTolerance dynamicconfig.DurationPropertyFn + ScheduleInvariantsScannerVisibilityRPS dynamicconfig.FloatPropertyFn + ScheduleInvariantsScannerScanInterval dynamicconfig.DurationPropertyFn + ScheduleInvariantsScannerStuckOpenIdleTimeBufferMultiplier dynamicconfig.IntPropertyFn } // scannerContext is the context object that gets @@ -209,6 +222,68 @@ func (s *Scanner) Start() error { } } + scheduleInvariantsAnyEnabled := s.context.cfg.ScheduleInvariantsScannerOverdueNextActionTimeEnabled() || + s.context.cfg.ScheduleInvariantsScannerStuckOpenEnabled() || + s.context.cfg.ScheduleInvariantsScannerUnknownStateEnabled() + if scheduleInvariantsAnyEnabled && !s.context.visibilityManager.HasStoreName(elasticsearch.PersistenceName) { + s.context.logger.Info("schedule-invariants scanners are enabled but advanced (Elasticsearch) visibility is not configured; skipping") + } else if scheduleInvariantsAnyEnabled { + scheduleActivities := scheduleinvariants.NewActivities( + s.context.logger, + s.context.metricsHandler, + s.context.metadataManager, + s.context.visibilityManager, + s.context.namespaceRegistry, + s.context.sdkClientFactory, + s.context.currentClusterName, + clock.NewRealTimeSource(), + s.context.cfg.ScheduleInvariantsScannerVisibilityRPS, + s.context.cfg.ScheduleInvariantsScannerOverdueNextActionTimeTolerance, + s.context.cfg.ScheduleInvariantsScannerScanInterval, + s.context.cfg.ScheduleInvariantsScannerStuckOpenIdleTimeBufferMultiplier, + ) + + if s.context.cfg.ScheduleInvariantsScannerOverdueNextActionTimeEnabled() { + s.wg.Add(1) + go s.startWorkflowWithRetry(ctx, scheduleinvariants.OverdueNextActionTimeWFStartOptions, scheduleinvariants.OverdueNextActionTimeWorkflowName) + + work := s.context.sdkClientFactory.NewWorker(s.context.sdkClientFactory.GetSystemClient(), scheduleinvariants.OverdueNextActionTimeTaskQueue, workerOpts) + work.RegisterWorkflowWithOptions(scheduleinvariants.OverdueNextActionTimeWorkflow, workflow.RegisterOptions{Name: scheduleinvariants.OverdueNextActionTimeWorkflowName}) + work.RegisterActivityWithOptions(scheduleActivities.ScanOverdueNextActionTime, activity.RegisterOptions{Name: scheduleinvariants.OverdueNextActionTimeActivityName}) + + // TODO: Nothing is gracefully stopping these workers or listening for fatal errors. + if err := work.Start(); err != nil { + return err + } + } + + if s.context.cfg.ScheduleInvariantsScannerStuckOpenEnabled() { + s.wg.Add(1) + go s.startWorkflowWithRetry(ctx, scheduleinvariants.StuckOpenWFStartOptions, scheduleinvariants.StuckOpenWorkflowName) + + work := s.context.sdkClientFactory.NewWorker(s.context.sdkClientFactory.GetSystemClient(), scheduleinvariants.StuckOpenTaskQueue, workerOpts) + work.RegisterWorkflowWithOptions(scheduleinvariants.StuckOpenWorkflow, workflow.RegisterOptions{Name: scheduleinvariants.StuckOpenWorkflowName}) + work.RegisterActivityWithOptions(scheduleActivities.ScanStuckOpen, activity.RegisterOptions{Name: scheduleinvariants.StuckOpenActivityName}) + + if err := work.Start(); err != nil { + return err + } + } + + if s.context.cfg.ScheduleInvariantsScannerUnknownStateEnabled() { + s.wg.Add(1) + go s.startWorkflowWithRetry(ctx, scheduleinvariants.UnknownStateWFStartOptions, scheduleinvariants.UnknownStateWorkflowName) + + work := s.context.sdkClientFactory.NewWorker(s.context.sdkClientFactory.GetSystemClient(), scheduleinvariants.UnknownStateTaskQueue, workerOpts) + work.RegisterWorkflowWithOptions(scheduleinvariants.UnknownStateWorkflow, workflow.RegisterOptions{Name: scheduleinvariants.UnknownStateWorkflowName}) + work.RegisterActivityWithOptions(scheduleActivities.ScanUnknownState, activity.RegisterOptions{Name: scheduleinvariants.UnknownStateActivityName}) + + if err := work.Start(); err != nil { + return err + } + } + } + // TODO: There's no reason to register all activities and workflows on every task queue. for _, tl := range workerTaskQueueNames { work := s.context.sdkClientFactory.NewWorker(s.context.sdkClientFactory.GetSystemClient(), tl, workerOpts) diff --git a/service/worker/scanner/scanner_test.go b/service/worker/scanner/scanner_test.go index cfd1f6821c7..ef631387877 100644 --- a/service/worker/scanner/scanner_test.go +++ b/service/worker/scanner/scanner_test.go @@ -183,14 +183,17 @@ func (s *scannerTestSuite) TestScannerEnabled() { scanner := New( log.NewNoopLogger(), &Config{ - MaxConcurrentActivityExecutionSize: dynamicconfig.GetIntPropertyFn(1), - MaxConcurrentWorkflowTaskExecutionSize: dynamicconfig.GetIntPropertyFn(1), - MaxConcurrentActivityTaskPollers: dynamicconfig.GetIntPropertyFn(1), - MaxConcurrentWorkflowTaskPollers: dynamicconfig.GetIntPropertyFn(1), - HistoryScannerEnabled: dynamicconfig.GetBoolPropertyFn(c.HistoryScannerEnabled), - BuildIdScavengerEnabled: dynamicconfig.GetBoolPropertyFn(c.BuildIdScavengerEnabled), - ExecutionsScannerEnabled: dynamicconfig.GetBoolPropertyFn(c.ExecutionsScannerEnabled), - TaskQueueScannerEnabled: dynamicconfig.GetBoolPropertyFn(c.TaskQueueScannerEnabled), + MaxConcurrentActivityExecutionSize: dynamicconfig.GetIntPropertyFn(1), + MaxConcurrentWorkflowTaskExecutionSize: dynamicconfig.GetIntPropertyFn(1), + MaxConcurrentActivityTaskPollers: dynamicconfig.GetIntPropertyFn(1), + MaxConcurrentWorkflowTaskPollers: dynamicconfig.GetIntPropertyFn(1), + HistoryScannerEnabled: dynamicconfig.GetBoolPropertyFn(c.HistoryScannerEnabled), + BuildIdScavengerEnabled: dynamicconfig.GetBoolPropertyFn(c.BuildIdScavengerEnabled), + ExecutionsScannerEnabled: dynamicconfig.GetBoolPropertyFn(c.ExecutionsScannerEnabled), + TaskQueueScannerEnabled: dynamicconfig.GetBoolPropertyFn(c.TaskQueueScannerEnabled), + ScheduleInvariantsScannerOverdueNextActionTimeEnabled: dynamicconfig.GetBoolPropertyFn(false), + ScheduleInvariantsScannerStuckOpenEnabled: dynamicconfig.GetBoolPropertyFn(false), + ScheduleInvariantsScannerUnknownStateEnabled: dynamicconfig.GetBoolPropertyFn(false), Persistence: &config.Persistence{ DefaultStore: c.DefaultStore, DataStores: map[string]config.DataStore{ @@ -263,14 +266,17 @@ func (s *scannerTestSuite) TestScannerShutdown() { scanner := New( logger, &Config{ - MaxConcurrentActivityExecutionSize: dynamicconfig.GetIntPropertyFn(1), - MaxConcurrentWorkflowTaskExecutionSize: dynamicconfig.GetIntPropertyFn(1), - MaxConcurrentActivityTaskPollers: dynamicconfig.GetIntPropertyFn(1), - MaxConcurrentWorkflowTaskPollers: dynamicconfig.GetIntPropertyFn(1), - HistoryScannerEnabled: dynamicconfig.GetBoolPropertyFn(true), - ExecutionsScannerEnabled: dynamicconfig.GetBoolPropertyFn(false), - TaskQueueScannerEnabled: dynamicconfig.GetBoolPropertyFn(false), - BuildIdScavengerEnabled: dynamicconfig.GetBoolPropertyFn(false), + MaxConcurrentActivityExecutionSize: dynamicconfig.GetIntPropertyFn(1), + MaxConcurrentWorkflowTaskExecutionSize: dynamicconfig.GetIntPropertyFn(1), + MaxConcurrentActivityTaskPollers: dynamicconfig.GetIntPropertyFn(1), + MaxConcurrentWorkflowTaskPollers: dynamicconfig.GetIntPropertyFn(1), + HistoryScannerEnabled: dynamicconfig.GetBoolPropertyFn(true), + ExecutionsScannerEnabled: dynamicconfig.GetBoolPropertyFn(false), + TaskQueueScannerEnabled: dynamicconfig.GetBoolPropertyFn(false), + BuildIdScavengerEnabled: dynamicconfig.GetBoolPropertyFn(false), + ScheduleInvariantsScannerOverdueNextActionTimeEnabled: dynamicconfig.GetBoolPropertyFn(false), + ScheduleInvariantsScannerStuckOpenEnabled: dynamicconfig.GetBoolPropertyFn(false), + ScheduleInvariantsScannerUnknownStateEnabled: dynamicconfig.GetBoolPropertyFn(false), Persistence: &config.Persistence{ DefaultStore: config.StoreTypeNoSQL, DataStores: map[string]config.DataStore{ diff --git a/service/worker/scanner/scheduleinvariants/invariants.go b/service/worker/scanner/scheduleinvariants/invariants.go new file mode 100644 index 00000000000..f7a68745206 --- /dev/null +++ b/service/worker/scanner/scheduleinvariants/invariants.go @@ -0,0 +1,411 @@ +package scheduleinvariants + +import ( + "context" + "fmt" + "time" + + enumspb "go.temporal.io/api/enums/v1" + "go.temporal.io/api/workflowservice/v1" + "go.temporal.io/sdk/activity" + "go.temporal.io/server/api/visibilityservice/v1" + "go.temporal.io/server/chasm" + "go.temporal.io/server/chasm/lib/scheduler" + "go.temporal.io/server/common" + "go.temporal.io/server/common/clock" + "go.temporal.io/server/common/dynamicconfig" + "go.temporal.io/server/common/log" + "go.temporal.io/server/common/log/tag" + "go.temporal.io/server/common/metrics" + "go.temporal.io/server/common/namespace" + "go.temporal.io/server/common/persistence" + "go.temporal.io/server/common/persistence/visibility/manager" + "go.temporal.io/server/common/quotas" + "go.temporal.io/server/common/sdk" +) + +const ( + // heartbeatInterval is how often the background heartbeat goroutine pings the + // activity. It must be comfortably below the activity's HeartbeatTimeout. + heartbeatInterval = 10 * time.Second + // scheduleListPageSize controls pagination of ListChasmExecutions inside + // ScanOverdueNextActionTime. Each page is also one unit of rate-limited progress. + scheduleListPageSize = 100 + namespaceListPageSize = 100 +) + +// Activities holds shared dependencies for all three schedule-invariants scanner activities. +type Activities struct { + logger log.Logger + metricsHandler metrics.Handler + metadataManager persistence.MetadataManager + visibilityManager manager.VisibilityManager + namespaceRegistry namespace.Registry + sdkClientFactory sdk.ClientFactory + currentClusterName string + timeSource clock.TimeSource + + overdueTolerance dynamicconfig.DurationPropertyFn + // scanInterval is how often each scanner activity kicks off a fresh scan pass. + scanInterval dynamicconfig.DurationPropertyFn + // stuckOpenIdleTimeBufferMultiplier multiplies the configured schedule IdleTime to set + // how far past its idle-close deadline a schedule must be before ScanStuckOpen flags it. + stuckOpenIdleTimeBufferMultiplier dynamicconfig.IntPropertyFn + rateLimiter quotas.RateLimiter +} + +func NewActivities( + logger log.Logger, + metricsHandler metrics.Handler, + metadataManager persistence.MetadataManager, + visibilityManager manager.VisibilityManager, + namespaceRegistry namespace.Registry, + sdkClientFactory sdk.ClientFactory, + currentClusterName string, + timeSource clock.TimeSource, + visibilityRPS dynamicconfig.FloatPropertyFn, + overdueTolerance dynamicconfig.DurationPropertyFn, + scanInterval dynamicconfig.DurationPropertyFn, + stuckOpenIdleTimeBufferMultiplier dynamicconfig.IntPropertyFn, +) *Activities { + return &Activities{ + logger: logger, + metricsHandler: metricsHandler.WithTags(metrics.OperationTag(metrics.ScheduleInvariantsScannerScope)), + metadataManager: metadataManager, + visibilityManager: visibilityManager, + namespaceRegistry: namespaceRegistry, + sdkClientFactory: sdkClientFactory, + currentClusterName: currentClusterName, + timeSource: timeSource, + overdueTolerance: overdueTolerance, + scanInterval: scanInterval, + stuckOpenIdleTimeBufferMultiplier: stuckOpenIdleTimeBufferMultiplier, + rateLimiter: quotas.NewDefaultOutgoingRateLimiter(quotas.RateFn(visibilityRPS)), + } +} + +// ScanOverdueNextActionTime is a long-running activity that periodically lists +// schedules whose TemporalScheduleNextActionTime is in the past beyond the configured +// tolerance, calls DescribeSchedule on each candidate, and filters out schedules that +// are paused or legitimately blocked from firing by a BUFFER_ONE / BUFFER_ALL overlap +// policy with a prior workflow still running. Whatever remains is counted as an +// anomaly per namespace and logged for triage. +func (a *Activities) ScanOverdueNextActionTime(ctx context.Context) error { + return a.runForeverWithInterval(ctx, func(scanCtx context.Context) error { + threshold := a.timeSource.Now().UTC().Add(-a.overdueTolerance()).Format(time.RFC3339Nano) + // the query's intention is: + // show running schedules where ScheduleNextActionTime is in the past (plus a small buffer) + query := fmt.Sprintf( + `%s < "%s" AND TemporalSchedulePaused = false AND ExecutionStatus = "Running"`, + scheduler.ScheduleNextActionTimeName, + threshold, + ) + return a.runOverdueScan(scanCtx, query) + }) +} + +// ScanStuckOpen is a long-running activity that periodically scans for schedules whose +// ScheduleIdleCloseTime deadline has already passed (beyond the configured buffer). +// ScheduleIdleCloseTime is only emitted while a schedule is open, so an overdue value +// means the idle-close task never fired and the schedule is stuck open. Paused schedules +// are excluded since they are intentionally held open and do not idle-close. +func (a *Activities) ScanStuckOpen(ctx context.Context) error { + return a.runForeverWithInterval(ctx, func(scanCtx context.Context) error { + // two weeks ago (IdleTime, scaled by the configured buffer multiplier) + buffer := scheduler.DefaultTweakables.IdleTime * time.Duration(a.stuckOpenIdleTimeBufferMultiplier()) + threshold := a.timeSource.Now().UTC().Add(-buffer).Format(time.RFC3339Nano) + + // select schedules which went idle before two weeks ago + // and which are not paused and still appear to be running + query := fmt.Sprintf( + `%s < "%s" AND TemporalSchedulePaused = false AND ExecutionStatus = "Running"`, + scheduler.ScheduleIdleCloseTimeName, + threshold, + ) + return a.runScan(scanCtx, "stuck_open", query, metrics.ScheduleInvariantsScannerStuckOpenCount.Name()) + }) +} + +// ScanUnknownState is a long-running activity that periodically scans for running, +// unpaused schedules with no TemporalScheduleNextActionTime set. +// +// The ScheduleIdleCloseTime IS NULL clause excludes schedules that have legitimately +// run out of actions and are sitting in the idle/retention window before idle-close: +// those are still Running and unpaused with no next action time, but carry a (future) +// ScheduleIdleCloseTime. Without this clause every exhausted-but-retained schedule would +// be a false positive for up to the retention window, and genuinely stuck-open schedules +// (past ScheduleIdleCloseTime) would be double-counted with ScanStuckOpen. +func (a *Activities) ScanUnknownState(ctx context.Context) error { + return a.runForeverWithInterval(ctx, func(scanCtx context.Context) error { + query := fmt.Sprintf( + `%s IS NULL AND %s IS NULL AND TemporalSchedulePaused = false AND ExecutionStatus = "Running"`, + scheduler.ScheduleNextActionTimeName, + scheduler.ScheduleIdleCloseTimeName, + ) + return a.runScan(scanCtx, "unknown_state", query, metrics.ScheduleInvariantsScannerUnknownStateCount.Name()) + }) +} + +// runForeverWithInterval runs scanFn immediately, then again on every tick of scanInterval, until +// the activity context is canceled or scanFn returns an error. A background goroutine +// emits heartbeats so the parent activity stays alive between scans. All timing flows +// through the injected TimeSource so tests can drive the loop deterministically. +func (a *Activities) runForeverWithInterval(ctx context.Context, scanFn func(context.Context) error) error { + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + go a.heartbeatLoop(ctx) + + if err := scanFn(ctx); err != nil { + return err + } + + ch, timer := a.timeSource.NewTimer(a.scanInterval()) + defer timer.Stop() + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ch: + if err := scanFn(ctx); err != nil { + return err + } + timer.Reset(a.scanInterval()) + } + } +} + +func (a *Activities) heartbeatLoop(ctx context.Context) { + ch, timer := a.timeSource.NewTimer(heartbeatInterval) + defer timer.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ch: + activity.RecordHeartbeat(ctx) + timer.Reset(heartbeatInterval) + } + } +} + +// runScan ties the per-scanner pieces together: list namespaces, fan out a single +// visibility query per namespace, and emit per-namespace metrics. Cluster-wide +// totals are computed by the metrics backend by summing across the namespace tag. +// Used by the count-only scanners (StuckOpen, UnknownState). +func (a *Activities) runScan(ctx context.Context, subScanner, query, metricName string) error { + namespaces, err := a.ListAllNamespaces(ctx) + if err != nil { + return err + } + + for _, nsName := range namespaces { + err := a.forEachNamespace(ctx, nsName, query, func(count int64) { + a.emitCount(metricName, nsName, count) + }) + if err != nil { + a.recordScanError(nsName, subScanner, err) + continue + } + } + return nil +} + +// runOverdueScan lists individual matching schedules per namespace, calls +// DescribeSchedule on each, and filters out schedules that are paused or +// expected to not be able to fire (BUFFER_ONE / BUFFER_ALL waiting on a prior +// running workflow). What remains is counted as an anomaly. +func (a *Activities) runOverdueScan(ctx context.Context, query string) error { + const subScanner = "overdue_next_action_time" + metricName := metrics.ScheduleInvariantsScannerOverdueNextActionTimeCount.Name() + + namespaces, err := a.ListAllNamespaces(ctx) + if err != nil { + return err + } + + for _, nsName := range namespaces { + var nsAnomalies int64 + err := a.forEachScheduleInNamespace(ctx, nsName, query, func(scheduleID string) { + if a.scheduleIsExpectedNotToFire(ctx, nsName, scheduleID) { + return + } + nsAnomalies++ + a.logger.Warn("schedule-invariants - the schedule was expected to fire but doesn't appear to have done so", + tag.WorkflowNamespace(nsName), + tag.ScheduleID(scheduleID)) + }) + if err != nil { + a.recordScanError(nsName, subScanner, err) + continue + } + a.emitCount(metricName, nsName, nsAnomalies) + } + return nil +} + +// ListAllNamespaces returns the names of every namespace that exists in the cluster +// metadata store and is active in the current cluster. Pagination is handled internally. +func (a *Activities) ListAllNamespaces(ctx context.Context) ([]string, error) { + var names []string + var pageToken []byte + for { + nsResp, err := a.metadataManager.ListNamespaces(ctx, &persistence.ListNamespacesRequest{ + PageSize: namespaceListPageSize, + NextPageToken: pageToken, + IncludeDeleted: false, + }) + if err != nil { + return nil, err + } + for _, ns := range nsResp.Namespaces { + nsEntry, err := a.namespaceRegistry.GetNamespaceByID(namespace.ID(ns.Namespace.Info.Id)) + if err != nil { + a.logger.Warn("namespace lookup failed, skipping", + tag.WorkflowNamespaceID(ns.Namespace.Info.Id), tag.Error(err)) + continue + } + // Only the active cluster for this namespace should report; otherwise + // global namespaces would be reported once per cluster. + if !nsEntry.ActiveInCluster(a.currentClusterName) { + continue + } + names = append(names, nsEntry.Name().String()) + } + pageToken = nsResp.NextPageToken + if len(pageToken) == 0 { + break + } + } + return names, nil +} + +// forEachNamespace runs `query` against one namespace (scoped to the Scheduler archetype) +// and passes the resulting count to resultsFilter. Rate-limited by the shared limiter. +// Intended to be called once per namespace returned by ListAllNamespaces. +// +// Note: the parameter is named `nsName` rather than `namespace` to avoid shadowing the +// imported `namespace` package used internally. +func (a *Activities) forEachNamespace( + ctx context.Context, + nsName string, + query string, + resultsFilter func(count int64), +) error { + if err := a.rateLimiter.Wait(ctx); err != nil { + if common.IsContextDeadlineExceededErr(err) { + return context.DeadlineExceeded + } + return err + } + nsID, err := a.namespaceRegistry.GetNamespaceID(namespace.Name(nsName)) + if err != nil { + return err + } + resp, err := a.visibilityManager.CountChasmExecutions(ctx, &visibilityservice.CountChasmExecutionsRequest{ + ArchetypeId: chasm.SchedulerArchetypeID, + NamespaceId: nsID.String(), + Namespace: nsName, + Query: query, + }) + if err != nil { + return err + } + resultsFilter(resp.GetCount()) + return nil +} + +// forEachScheduleInNamespace paginates ListChasmExecutions for the given namespace +// (scoped to the Scheduler archetype) and invokes visit for every schedule entity in +// the result. The rate limiter is consumed per page. +func (a *Activities) forEachScheduleInNamespace( + ctx context.Context, + nsName string, + query string, + visit func(scheduleID string), +) error { + nsID, err := a.namespaceRegistry.GetNamespaceID(namespace.Name(nsName)) + if err != nil { + return err + } + + var pageToken []byte + for { + if err := a.rateLimiter.Wait(ctx); err != nil { + if common.IsContextDeadlineExceededErr(err) { + return context.DeadlineExceeded + } + return err + } + resp, err := a.visibilityManager.ListChasmExecutions(ctx, &visibilityservice.ListChasmExecutionsRequest{ + ArchetypeId: chasm.SchedulerArchetypeID, + NamespaceId: nsID.String(), + Namespace: nsName, + Query: query, + PageSize: scheduleListPageSize, + NextPageToken: pageToken, + }) + if err != nil { + return err + } + for _, exec := range resp.GetExecutions() { + visit(exec.GetBusinessId()) + } + pageToken = resp.GetNextPageToken() + if len(pageToken) == 0 { + return nil + } + } +} + +// scheduleIsExpectedNotToFire returns true when a schedule flagged as "overdue" in +// visibility is actually in a state that legitimately explains the stale NextActionTime: +// either the schedule is paused, or it's holding under BUFFER_ONE / BUFFER_ALL with a +// prior workflow still running. +func (a *Activities) scheduleIsExpectedNotToFire(ctx context.Context, nsName, scheduleID string) bool { + if err := a.rateLimiter.Wait(ctx); err != nil { + a.logger.Warn("rate-limiter wait failed during DescribeSchedule; counting schedule as anomaly", + tag.WorkflowNamespace(nsName), tag.ScheduleID(scheduleID), tag.Error(err)) + return false + } + desc, err := a.sdkClientFactory.GetSystemClient().WorkflowService().DescribeSchedule(ctx, &workflowservice.DescribeScheduleRequest{ + Namespace: nsName, + ScheduleId: scheduleID, + }) + if err != nil { + a.logger.Warn("DescribeSchedule failed", + tag.WorkflowNamespace(nsName), tag.ScheduleID(scheduleID), tag.Error(err)) + return false + } + + if desc.GetSchedule().GetState().GetPaused() { + return true + } + overlap := desc.GetSchedule().GetPolicies().GetOverlapPolicy() + if (overlap == enumspb.SCHEDULE_OVERLAP_POLICY_BUFFER_ONE || overlap == enumspb.SCHEDULE_OVERLAP_POLICY_BUFFER_ALL) && + len(desc.GetInfo().GetRunningWorkflows()) > 0 { + return true + } + return false +} + +func (a *Activities) emitCount(metricName, namespaceTagValue string, count int64) { + if count <= 0 { + return + } + a.metricsHandler. + WithTags(metrics.NamespaceTag(namespaceTagValue)). + Counter(metricName). + Record(count) +} + +func (a *Activities) recordScanError(nsName, subScanner string, err error) { + a.logger.Warn("schedule-invariants scan failed for namespace", + tag.WorkflowNamespace(nsName), + tag.NewStringTag("sub_scanner", subScanner), + tag.Error(err)) + metrics.ScheduleInvariantsScannerErrorCount.With(a.metricsHandler.WithTags( + metrics.NamespaceTag(nsName), + metrics.StringTag("sub_scanner", subScanner), + )).Record(1) +} diff --git a/service/worker/scanner/scheduleinvariants/invariants_test.go b/service/worker/scanner/scheduleinvariants/invariants_test.go new file mode 100644 index 00000000000..dcd72568033 --- /dev/null +++ b/service/worker/scanner/scheduleinvariants/invariants_test.go @@ -0,0 +1,432 @@ +package scheduleinvariants + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/stretchr/testify/require" + commonpb "go.temporal.io/api/common/v1" + enumspb "go.temporal.io/api/enums/v1" + schedulepb "go.temporal.io/api/schedule/v1" + "go.temporal.io/api/workflowservice/v1" + chasmspb "go.temporal.io/server/api/chasm/v1" + persistencespb "go.temporal.io/server/api/persistence/v1" + "go.temporal.io/server/api/visibilityservice/v1" + "go.temporal.io/server/chasm" + "go.temporal.io/server/common/clock" + "go.temporal.io/server/common/dynamicconfig" + "go.temporal.io/server/common/log" + "go.temporal.io/server/common/metrics" + "go.temporal.io/server/common/namespace" + "go.temporal.io/server/common/persistence" + "go.temporal.io/server/common/persistence/visibility/manager" + "go.temporal.io/server/common/quotas" + "go.temporal.io/server/common/sdk" + "go.temporal.io/server/common/testing/mockapi/workflowservicemock/v1" + "go.temporal.io/server/common/testing/mocksdk" + "go.uber.org/mock/gomock" +) + +const testClusterName = "test-cluster" + +type testDeps struct { + ctrl *gomock.Controller + metadataManager *persistence.MockMetadataManager + visibilityManager *manager.MockVisibilityManager + namespaceRegistry *namespace.MockRegistry + sdkClientFactory *sdk.MockClientFactory + sdkClient *mocksdk.MockClient + frontendClient *workflowservicemock.MockWorkflowServiceClient + timeSource *clock.EventTimeSource +} + +func newTestDeps(t *testing.T) *testDeps { + t.Helper() + ctrl := gomock.NewController(t) + d := &testDeps{ + ctrl: ctrl, + metadataManager: persistence.NewMockMetadataManager(ctrl), + visibilityManager: manager.NewMockVisibilityManager(ctrl), + namespaceRegistry: namespace.NewMockRegistry(ctrl), + sdkClientFactory: sdk.NewMockClientFactory(ctrl), + sdkClient: mocksdk.NewMockClient(ctrl), + frontendClient: workflowservicemock.NewMockWorkflowServiceClient(ctrl), + timeSource: clock.NewEventTimeSource(), + } + // The DescribeSchedule path always goes via system client → frontend stub. + d.sdkClientFactory.EXPECT().GetSystemClient().Return(d.sdkClient).AnyTimes() + d.sdkClient.EXPECT().WorkflowService().Return(d.frontendClient).AnyTimes() + return d +} + +func (d *testDeps) newActivities() *Activities { + // A very high RPS rate-limiter so Wait() never blocks under test. + rl := quotas.NewDefaultOutgoingRateLimiter(quotas.RateFn(dynamicconfig.GetFloatPropertyFn(10000.0))) + return &Activities{ + logger: log.NewNoopLogger(), + metricsHandler: metrics.NoopMetricsHandler, + metadataManager: d.metadataManager, + visibilityManager: d.visibilityManager, + namespaceRegistry: d.namespaceRegistry, + sdkClientFactory: d.sdkClientFactory, + currentClusterName: testClusterName, + timeSource: d.timeSource, + overdueTolerance: dynamicconfig.GetDurationPropertyFn(10 * time.Minute), + scanInterval: dynamicconfig.GetDurationPropertyFn(15 * time.Minute), + stuckOpenIdleTimeBufferMultiplier: dynamicconfig.GetIntPropertyFn(2), + rateLimiter: rl, + } +} + +func nsDetail(id, name string) *persistence.GetNamespaceResponse { + return &persistence.GetNamespaceResponse{ + Namespace: &persistencespb.NamespaceDetail{ + Info: &persistencespb.NamespaceInfo{Id: id, Name: name}, + }, + } +} + +func localNS(id, name, activeCluster string) *namespace.Namespace { + return namespace.NewLocalNamespaceForTest( + &persistencespb.NamespaceInfo{Id: id, Name: name}, + nil, + activeCluster, + ) +} + +// globalNS builds a global (replicated) namespace whose active cluster is +// activeCluster. Only global namespaces return false from ActiveInCluster when the +// active cluster doesn't match; local namespaces are always "active" in every cluster. +func globalNS(id, name, activeCluster string) *namespace.Namespace { + return namespace.NewGlobalNamespaceForTest( + &persistencespb.NamespaceInfo{Id: id, Name: name}, + nil, + &persistencespb.NamespaceReplicationConfig{ + ActiveClusterName: activeCluster, + Clusters: []string{activeCluster, "other-cluster"}, + }, + 0, + ) +} + +func TestListAllNamespaces_PaginatesAndFiltersInactive(t *testing.T) { + d := newTestDeps(t) + + d.metadataManager.EXPECT().ListNamespaces(gomock.Any(), &persistence.ListNamespacesRequest{ + PageSize: namespaceListPageSize, + NextPageToken: nil, + IncludeDeleted: false, + }).Return(&persistence.ListNamespacesResponse{ + Namespaces: []*persistence.GetNamespaceResponse{nsDetail("id-1", "ns-1"), nsDetail("id-2", "ns-2")}, + NextPageToken: []byte("page2"), + }, nil) + d.metadataManager.EXPECT().ListNamespaces(gomock.Any(), &persistence.ListNamespacesRequest{ + PageSize: namespaceListPageSize, + NextPageToken: []byte("page2"), + IncludeDeleted: false, + }).Return(&persistence.ListNamespacesResponse{ + Namespaces: []*persistence.GetNamespaceResponse{nsDetail("id-3", "ns-3")}, + NextPageToken: nil, + }, nil) + + d.namespaceRegistry.EXPECT().GetNamespaceByID(namespace.ID("id-1")).Return(globalNS("id-1", "ns-1", testClusterName), nil) + d.namespaceRegistry.EXPECT().GetNamespaceByID(namespace.ID("id-2")).Return(globalNS("id-2", "ns-2", "other-cluster"), nil) + d.namespaceRegistry.EXPECT().GetNamespaceByID(namespace.ID("id-3")).Return(globalNS("id-3", "ns-3", testClusterName), nil) + + names, err := d.newActivities().ListAllNamespaces(context.Background()) + require.NoError(t, err) + require.Equal(t, []string{"ns-1", "ns-3"}, names) +} + +func TestListAllNamespaces_SkipsRegistryLookupFailures(t *testing.T) { + d := newTestDeps(t) + + d.metadataManager.EXPECT().ListNamespaces(gomock.Any(), gomock.Any()).Return(&persistence.ListNamespacesResponse{ + Namespaces: []*persistence.GetNamespaceResponse{ + nsDetail("good", "good-ns"), + nsDetail("bad", "bad-ns"), + }, + NextPageToken: nil, + }, nil) + + d.namespaceRegistry.EXPECT().GetNamespaceByID(namespace.ID("good")).Return(localNS("good", "good-ns", testClusterName), nil) + d.namespaceRegistry.EXPECT().GetNamespaceByID(namespace.ID("bad")).Return(nil, errors.New("not found")) + + names, err := d.newActivities().ListAllNamespaces(context.Background()) + require.NoError(t, err) + require.Equal(t, []string{"good-ns"}, names) +} + +func TestListAllNamespaces_ReturnsErrorOnMetadataFailure(t *testing.T) { + d := newTestDeps(t) + d.metadataManager.EXPECT().ListNamespaces(gomock.Any(), gomock.Any()). + Return(nil, errors.New("boom")) + + _, err := d.newActivities().ListAllNamespaces(context.Background()) + require.Error(t, err) +} + +func TestForEachNamespace_InvokesCallbackWithCount(t *testing.T) { + d := newTestDeps(t) + + d.namespaceRegistry.EXPECT().GetNamespaceID(namespace.Name("ns-1")).Return(namespace.ID("id-1"), nil) + d.visibilityManager.EXPECT().CountChasmExecutions(gomock.Any(), &visibilityservice.CountChasmExecutionsRequest{ + ArchetypeId: chasm.SchedulerArchetypeID, + NamespaceId: "id-1", + Namespace: "ns-1", + Query: "some-query", + }).Return(&visibilityservice.CountChasmExecutionsResponse{Count: 7}, nil) + + var got int64 + err := d.newActivities().forEachNamespace(context.Background(), "ns-1", "some-query", func(count int64) { + got = count + }) + require.NoError(t, err) + require.Equal(t, int64(7), got) +} + +func TestForEachNamespace_PropagatesVisibilityError(t *testing.T) { + d := newTestDeps(t) + + d.namespaceRegistry.EXPECT().GetNamespaceID(namespace.Name("ns-1")).Return(namespace.ID("id-1"), nil) + d.visibilityManager.EXPECT().CountChasmExecutions(gomock.Any(), gomock.Any()). + Return(nil, errors.New("count failed")) + + called := false + err := d.newActivities().forEachNamespace(context.Background(), "ns-1", "q", func(count int64) { + called = true + }) + require.Error(t, err) + require.False(t, called, "callback should not fire on error") +} + +func chasmExec(id string) *chasmspb.VisibilityExecutionInfo { + return &chasmspb.VisibilityExecutionInfo{BusinessId: id} +} + +func TestForEachScheduleInNamespace_PaginatesAndVisitsEachSchedule(t *testing.T) { + d := newTestDeps(t) + + d.namespaceRegistry.EXPECT().GetNamespaceID(namespace.Name("ns-1")).Return(namespace.ID("id-1"), nil) + + d.visibilityManager.EXPECT().ListChasmExecutions(gomock.Any(), &visibilityservice.ListChasmExecutionsRequest{ + ArchetypeId: chasm.SchedulerArchetypeID, + NamespaceId: "id-1", + Namespace: "ns-1", + Query: "q", + PageSize: scheduleListPageSize, + NextPageToken: nil, + }).Return(&visibilityservice.ListChasmExecutionsResponse{ + Executions: []*chasmspb.VisibilityExecutionInfo{chasmExec("sched-1"), chasmExec("sched-2")}, + NextPageToken: []byte("p2"), + }, nil) + d.visibilityManager.EXPECT().ListChasmExecutions(gomock.Any(), &visibilityservice.ListChasmExecutionsRequest{ + ArchetypeId: chasm.SchedulerArchetypeID, + NamespaceId: "id-1", + Namespace: "ns-1", + Query: "q", + PageSize: scheduleListPageSize, + NextPageToken: []byte("p2"), + }).Return(&visibilityservice.ListChasmExecutionsResponse{ + Executions: []*chasmspb.VisibilityExecutionInfo{chasmExec("sched-3")}, + NextPageToken: nil, + }, nil) + + var visited []string + err := d.newActivities().forEachScheduleInNamespace(context.Background(), "ns-1", "q", func(scheduleID string) { + visited = append(visited, scheduleID) + }) + require.NoError(t, err) + require.Equal(t, []string{"sched-1", "sched-2", "sched-3"}, visited) +} + +func TestForEachScheduleInNamespace_StopsOnError(t *testing.T) { + d := newTestDeps(t) + + d.namespaceRegistry.EXPECT().GetNamespaceID(namespace.Name("ns-1")).Return(namespace.ID("id-1"), nil) + d.visibilityManager.EXPECT().ListChasmExecutions(gomock.Any(), gomock.Any()). + Return(nil, errors.New("list failed")) + + err := d.newActivities().forEachScheduleInNamespace(context.Background(), "ns-1", "q", func(string) { + t.Fatal("should not visit any schedule") + }) + require.Error(t, err) +} + +func describeResp(paused bool, overlap enumspb.ScheduleOverlapPolicy, runningCount int) *workflowservice.DescribeScheduleResponse { + resp := &workflowservice.DescribeScheduleResponse{ + Schedule: &schedulepb.Schedule{ + State: &schedulepb.ScheduleState{Paused: paused}, + Policies: &schedulepb.SchedulePolicies{OverlapPolicy: overlap}, + }, + Info: &schedulepb.ScheduleInfo{}, + } + for range runningCount { + resp.Info.RunningWorkflows = append(resp.Info.RunningWorkflows, &commonpb.WorkflowExecution{WorkflowId: "running"}) + } + return resp +} + +func TestScheduleIsExpectedNotToFire(t *testing.T) { + cases := []struct { + name string + resp *workflowservice.DescribeScheduleResponse + err error + want bool + }{ + { + name: "paused", + resp: describeResp(true, enumspb.SCHEDULE_OVERLAP_POLICY_SKIP, 0), + want: true, + }, + { + name: "buffer_one_with_running_workflow", + resp: describeResp(false, enumspb.SCHEDULE_OVERLAP_POLICY_BUFFER_ONE, 1), + want: true, + }, + { + name: "buffer_all_with_running_workflow", + resp: describeResp(false, enumspb.SCHEDULE_OVERLAP_POLICY_BUFFER_ALL, 2), + want: true, + }, + { + name: "buffer_one_no_running_workflow", + resp: describeResp(false, enumspb.SCHEDULE_OVERLAP_POLICY_BUFFER_ONE, 0), + want: false, + }, + { + name: "skip_policy_with_running_workflow", + resp: describeResp(false, enumspb.SCHEDULE_OVERLAP_POLICY_SKIP, 1), + want: false, + }, + { + name: "cancel_other_policy", + resp: describeResp(false, enumspb.SCHEDULE_OVERLAP_POLICY_CANCEL_OTHER, 1), + want: false, + }, + { + name: "describe_error", + err: errors.New("describe failed"), + want: false, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + d := newTestDeps(t) + d.frontendClient.EXPECT().DescribeSchedule(gomock.Any(), &workflowservice.DescribeScheduleRequest{ + Namespace: "ns-1", + ScheduleId: "sched-1", + }).Return(tc.resp, tc.err) + + got := d.newActivities().scheduleIsExpectedNotToFire(context.Background(), "ns-1", "sched-1") + require.Equal(t, tc.want, got) + }) + } +} + +func TestRunOverdueScan_FiltersExpectedNotToFireSchedulesAndCountsRest(t *testing.T) { + d := newTestDeps(t) + + d.metadataManager.EXPECT().ListNamespaces(gomock.Any(), gomock.Any()).Return(&persistence.ListNamespacesResponse{ + Namespaces: []*persistence.GetNamespaceResponse{nsDetail("id-1", "ns-1")}, + NextPageToken: nil, + }, nil) + d.namespaceRegistry.EXPECT().GetNamespaceByID(namespace.ID("id-1")).Return(localNS("id-1", "ns-1", testClusterName), nil) + d.namespaceRegistry.EXPECT().GetNamespaceID(namespace.Name("ns-1")).Return(namespace.ID("id-1"), nil) + + d.visibilityManager.EXPECT().ListChasmExecutions(gomock.Any(), gomock.Any()).Return(&visibilityservice.ListChasmExecutionsResponse{ + Executions: []*chasmspb.VisibilityExecutionInfo{ + chasmExec("sched-paused"), + chasmExec("sched-buffer-waiting"), + chasmExec("sched-actually-overdue"), + }, + NextPageToken: nil, + }, nil) + + d.frontendClient.EXPECT().DescribeSchedule(gomock.Any(), &workflowservice.DescribeScheduleRequest{ + Namespace: "ns-1", ScheduleId: "sched-paused", + }).Return(describeResp(true, enumspb.SCHEDULE_OVERLAP_POLICY_SKIP, 0), nil) + d.frontendClient.EXPECT().DescribeSchedule(gomock.Any(), &workflowservice.DescribeScheduleRequest{ + Namespace: "ns-1", ScheduleId: "sched-buffer-waiting", + }).Return(describeResp(false, enumspb.SCHEDULE_OVERLAP_POLICY_BUFFER_ONE, 1), nil) + d.frontendClient.EXPECT().DescribeSchedule(gomock.Any(), &workflowservice.DescribeScheduleRequest{ + Namespace: "ns-1", ScheduleId: "sched-actually-overdue", + }).Return(describeResp(false, enumspb.SCHEDULE_OVERLAP_POLICY_SKIP, 0), nil) + + err := d.newActivities().runOverdueScan(context.Background(), "q") + require.NoError(t, err) +} + +func TestRunOverdueScan_ContinuesPastPerNamespaceErrors(t *testing.T) { + d := newTestDeps(t) + + d.metadataManager.EXPECT().ListNamespaces(gomock.Any(), gomock.Any()).Return(&persistence.ListNamespacesResponse{ + Namespaces: []*persistence.GetNamespaceResponse{ + nsDetail("id-broken", "ns-broken"), + nsDetail("id-ok", "ns-ok"), + }, + NextPageToken: nil, + }, nil) + d.namespaceRegistry.EXPECT().GetNamespaceByID(namespace.ID("id-broken")).Return(localNS("id-broken", "ns-broken", testClusterName), nil) + d.namespaceRegistry.EXPECT().GetNamespaceByID(namespace.ID("id-ok")).Return(localNS("id-ok", "ns-ok", testClusterName), nil) + + d.namespaceRegistry.EXPECT().GetNamespaceID(namespace.Name("ns-broken")).Return(namespace.ID("id-broken"), nil) + d.visibilityManager.EXPECT().ListChasmExecutions(gomock.Any(), gomock.AssignableToTypeOf(&visibilityservice.ListChasmExecutionsRequest{})). + DoAndReturn(func(_ context.Context, req *visibilityservice.ListChasmExecutionsRequest) (*visibilityservice.ListChasmExecutionsResponse, error) { + if req.Namespace == "ns-broken" { + return nil, errors.New("list failed") + } + return &visibilityservice.ListChasmExecutionsResponse{ + Executions: []*chasmspb.VisibilityExecutionInfo{chasmExec("sched-1")}, + NextPageToken: nil, + }, nil + }).AnyTimes() + + d.namespaceRegistry.EXPECT().GetNamespaceID(namespace.Name("ns-ok")).Return(namespace.ID("id-ok"), nil) + d.frontendClient.EXPECT().DescribeSchedule(gomock.Any(), gomock.Any()). + Return(describeResp(false, enumspb.SCHEDULE_OVERLAP_POLICY_SKIP, 0), nil) + + err := d.newActivities().runOverdueScan(context.Background(), "q") + require.NoError(t, err) +} + +func TestRunScan_EmitsPerNamespaceCounts(t *testing.T) { + d := newTestDeps(t) + + d.metadataManager.EXPECT().ListNamespaces(gomock.Any(), gomock.Any()).Return(&persistence.ListNamespacesResponse{ + Namespaces: []*persistence.GetNamespaceResponse{ + nsDetail("id-1", "ns-1"), + nsDetail("id-2", "ns-2"), + }, + NextPageToken: nil, + }, nil) + d.namespaceRegistry.EXPECT().GetNamespaceByID(namespace.ID("id-1")).Return(localNS("id-1", "ns-1", testClusterName), nil) + d.namespaceRegistry.EXPECT().GetNamespaceByID(namespace.ID("id-2")).Return(localNS("id-2", "ns-2", testClusterName), nil) + + d.namespaceRegistry.EXPECT().GetNamespaceID(namespace.Name("ns-1")).Return(namespace.ID("id-1"), nil) + d.namespaceRegistry.EXPECT().GetNamespaceID(namespace.Name("ns-2")).Return(namespace.ID("id-2"), nil) + d.visibilityManager.EXPECT().CountChasmExecutions(gomock.Any(), gomock.Any()). + DoAndReturn(func(_ context.Context, req *visibilityservice.CountChasmExecutionsRequest) (*visibilityservice.CountChasmExecutionsResponse, error) { + switch req.Namespace { + case "ns-1": + return &visibilityservice.CountChasmExecutionsResponse{Count: 3}, nil + case "ns-2": + return &visibilityservice.CountChasmExecutionsResponse{Count: 0}, nil + } + return &visibilityservice.CountChasmExecutionsResponse{Count: 0}, nil + }).Times(2) + + err := d.newActivities().runScan(context.Background(), "stuck_open", "q", metrics.ScheduleInvariantsScannerStuckOpenCount.Name()) + require.NoError(t, err) +} + +func TestEmitCount_IgnoresZeroAndNegative(t *testing.T) { + d := newTestDeps(t) + a := d.newActivities() + // emitCount is a no-op for count <= 0; mainly we verify it doesn't panic. + a.emitCount("metric", "ns", 0) + a.emitCount("metric", "ns", -1) + a.emitCount("metric", "ns", 5) // exercise positive path +} diff --git a/service/worker/scanner/scheduleinvariants/workflows.go b/service/worker/scanner/scheduleinvariants/workflows.go new file mode 100644 index 00000000000..d029ffe5155 --- /dev/null +++ b/service/worker/scanner/scheduleinvariants/workflows.go @@ -0,0 +1,102 @@ +package scheduleinvariants + +import ( + "time" + + enumspb "go.temporal.io/api/enums/v1" + "go.temporal.io/sdk/client" + "go.temporal.io/sdk/temporal" + "go.temporal.io/sdk/workflow" +) + +const ( + OverdueNextActionTimeWorkflowName = "schedule-invariants-overdue-next-action-time-scanner" + OverdueNextActionTimeActivityName = "scan-schedule-invariants-overdue-next-action-time" + OverdueNextActionTimeWorkflowID = "temporal-sys-schedule-invariants-overdue-next-action-time-scanner" + OverdueNextActionTimeTaskQueue = "temporal-sys-schedule-invariants-overdue-next-action-time-scanner-taskqueue-0" + + StuckOpenWorkflowName = "schedule-invariants-stuck-open-scanner" + StuckOpenActivityName = "scan-schedule-invariants-stuck-open" + StuckOpenWorkflowID = "temporal-sys-schedule-invariants-stuck-open-scanner" + StuckOpenTaskQueue = "temporal-sys-schedule-invariants-stuck-open-scanner-taskqueue-0" + + UnknownStateWorkflowName = "schedule-invariants-unknown-state-scanner" + UnknownStateActivityName = "scan-schedule-invariants-unknown-state" + UnknownStateWorkflowID = "temporal-sys-schedule-invariants-unknown-state-scanner" + UnknownStateTaskQueue = "temporal-sys-schedule-invariants-unknown-state-scanner-taskqueue-0" + + // activityStartToCloseTimeout is "effectively infinite" - the activity runs an + // internal scan ticker forever and is only ever expected to exit on transient + // failures (worker restart, network glitch, etc.). + activityStartToCloseTimeout = 365 * 24 * time.Hour + activityHeartbeatTimeout = 30 * time.Second + // workflowRunTimeout bounds a single workflow run; the workflow continues-as-new + // after each activity exit, so this is just a safety net larger than the + // activity's own timeout plus retry budget. + workflowRunTimeout = 366 * 24 * time.Hour +) + +var ( + // retryPolicy applies to the long-running scan activity: 10 attempts spaced + // 5 minutes apart with no exponential backoff. Once exhausted the workflow + // loops via continue-as-new. + retryPolicy = &temporal.RetryPolicy{ + InitialInterval: 5 * time.Minute, + BackoffCoefficient: 1.0, + MaximumAttempts: 10, + } + + OverdueNextActionTimeWFStartOptions = client.StartWorkflowOptions{ + ID: OverdueNextActionTimeWorkflowID, + TaskQueue: OverdueNextActionTimeTaskQueue, + WorkflowIDReusePolicy: enumspb.WORKFLOW_ID_REUSE_POLICY_ALLOW_DUPLICATE, + WorkflowRunTimeout: workflowRunTimeout, + } + StuckOpenWFStartOptions = client.StartWorkflowOptions{ + ID: StuckOpenWorkflowID, + TaskQueue: StuckOpenTaskQueue, + WorkflowIDReusePolicy: enumspb.WORKFLOW_ID_REUSE_POLICY_ALLOW_DUPLICATE, + WorkflowRunTimeout: workflowRunTimeout, + } + UnknownStateWFStartOptions = client.StartWorkflowOptions{ + ID: UnknownStateWorkflowID, + TaskQueue: UnknownStateTaskQueue, + WorkflowIDReusePolicy: enumspb.WORKFLOW_ID_REUSE_POLICY_ALLOW_DUPLICATE, + WorkflowRunTimeout: workflowRunTimeout, + } +) + +// OverdueNextActionTimeWorkflow scans for schedules whose TemporalScheduleNextActionTime +// is in the past beyond the configured tolerance. The activity runs an internal scan +// ticker; this workflow continues-as-new each time the activity exits. +func OverdueNextActionTimeWorkflow(ctx workflow.Context) error { + return runScanActivity(ctx, OverdueNextActionTimeActivityName, OverdueNextActionTimeWorkflow) +} + +// StuckOpenWorkflow scans for schedules that appear stuck open long after their CloseTime. +func StuckOpenWorkflow(ctx workflow.Context) error { + return runScanActivity(ctx, StuckOpenActivityName, StuckOpenWorkflow) +} + +// UnknownStateWorkflow scans for running, unpaused schedules with no +// TemporalScheduleNextActionTime. +func UnknownStateWorkflow(ctx workflow.Context) error { + return runScanActivity(ctx, UnknownStateActivityName, UnknownStateWorkflow) +} + +// runScanActivity executes the named long-running scan activity. The activity is +// expected to outlive a single workflow run only on the steady-state path; when it +// does eventually exit (retry budget exhausted, worker restart, etc.) the workflow +// continues-as-new so the scan loop is reborn. +func runScanActivity(ctx workflow.Context, activityName string, wfFn any) error { + activityCtx := workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: activityStartToCloseTimeout, + HeartbeatTimeout: activityHeartbeatTimeout, + RetryPolicy: retryPolicy, + }) + // Ignore the activity result intentionally: the activity is expected to exit + // eventually (heartbeat timeout on worker restart, retry budget exhausted, etc.), + // and we want to continue-as-new and rebirth the scan loop in every case. + _ = workflow.ExecuteActivity(activityCtx, activityName).Get(ctx, nil) + return workflow.NewContinueAsNewError(ctx, wfFn) +} diff --git a/service/worker/service.go b/service/worker/service.go index 3db475993d2..78c299af0cf 100644 --- a/service/worker/service.go +++ b/service/worker/service.go @@ -212,6 +212,14 @@ func NewConfig( ExecutionScannerHistoryEventIdValidator: dynamicconfig.ExecutionScannerHistoryEventIdValidator.Get(dc), RemovableBuildIdDurationSinceDefault: dynamicconfig.RemovableBuildIdDurationSinceDefault.Get(dc), BuildIdScavengerVisibilityRPS: dynamicconfig.BuildIdScavengerVisibilityRPS.Get(dc), + + ScheduleInvariantsScannerOverdueNextActionTimeEnabled: dynamicconfig.ScheduleInvariantsScannerOverdueNextActionTimeEnabled.Get(dc), + ScheduleInvariantsScannerStuckOpenEnabled: dynamicconfig.ScheduleInvariantsScannerStuckOpenEnabled.Get(dc), + ScheduleInvariantsScannerUnknownStateEnabled: dynamicconfig.ScheduleInvariantsScannerUnknownStateEnabled.Get(dc), + ScheduleInvariantsScannerOverdueNextActionTimeTolerance: dynamicconfig.ScheduleInvariantsScannerOverdueNextActionTimeTolerance.Get(dc), + ScheduleInvariantsScannerVisibilityRPS: dynamicconfig.ScheduleInvariantsScannerVisibilityRPS.Get(dc), + ScheduleInvariantsScannerScanInterval: dynamicconfig.ScheduleInvariantsScannerScanInterval.Get(dc), + ScheduleInvariantsScannerStuckOpenIdleTimeBufferMultiplier: dynamicconfig.ScheduleInvariantsScannerStuckOpenIdleTimeBufferMultiplier.Get(dc), }, BatcherRPS: dynamicconfig.BatcherRPS.Get(dc), BatcherConcurrency: dynamicconfig.BatcherConcurrency.Get(dc), From 3190edf3c5342377995b436604bc82b2bdb85d91 Mon Sep 17 00:00:00 2001 From: Alex Stanfield <13949480+chaptersix@users.noreply.github.com> Date: Fri, 5 Jun 2026 18:48:04 -0500 Subject: [PATCH 8/8] feat: add scheduler observability metrics for missed actions (#10503) ## Summary Add and improve scheduler metrics to detect missed actions caused by system instability, slow callback delivery, and overlap policy behavior. Changes apply to both v1 (workflow-based) and v2 (CHASM) schedulers. ## What is a missed action? A scheduled action is "missed" when the scheduler decides not to execute an action that was due according to the schedule spec. There are three scenarios where this happens: 1. **Catchup window exceeded.** The scheduler was delayed in processing and the action's scheduled execution time is now older than the configured catchup window. The action is permanently dropped. This can be caused by a previous action still running or by system instability (worker downtime, shard movement, task queue backlog). Tracked by `schedule_missed_catchup_window`. 2. **Overlap policy skip.** A previous action is still running and the overlap policy is SKIP (or BUFFER_ONE with a full buffer). The scheduler intentionally drops the action because the policy says not to start concurrent runs. This is expected behavior, not a system issue -- but a high skip rate may indicate action duration exceeds the schedule interval. Tracked by `schedule_overlap_skipped`. 3. **Buffer overrun.** The internal action buffer is full (bounded by `MaxBufferSize`). New actions are dropped regardless of overlap policy. Tracked by `schedule_buffer_overruns`. Scenarios 1 and 2 are the focus of this PR. Scenario 3 already had metrics. ### Catchup window vs. overlap policy: v1 and v2 differences An action is only ever counted in one of the above scenarios, but the evaluation order and the possible outcomes differ between v1 and v2. Both v1 and v2 have the same two-phase structure: a generation phase that checks the catchup window and buffers actions, followed by a buffer processing phase that evaluates overlap policy and executes actions. The difference is in what happens to buffered actions that can't be executed immediately. **V1 (workflow-based):** 1. `processTimeRange` (generation) checks the catchup window. Actions whose nominal time is older than the window are dropped (`schedule_missed_catchup_window{reason="not_buffered"}`). Surviving actions are added to the buffer. 2. `processBuffer` (invocation) refreshes running action state via `PollMutableState`, then evaluates overlap policy on the entire buffer, including actions deferred on previous ticks. Actions are either started, skipped by overlap policy (`schedule_overlap_skipped`), or deferred (BUFFER_ONE/BUFFER_ALL). V1 has no `reason=buffer_expired` case because it never re-checks the catchup window on buffered actions. Once an action passes the initial catchup window check and enters the buffer, it will never be dropped for being too old. If a BUFFER_ONE schedule has a long-running action, the deferred action waits indefinitely across ticks until the running action completes, at which point it is started regardless of how much time has passed since its original schedule time. **V2 (CHASM):** 1. The generator task checks the catchup window. Same as v1 -- actions past the window are dropped (`schedule_missed_catchup_window{reason="not_buffered"}`). Surviving actions are buffered. 2. The invoker's `ProcessBuffer` task evaluates overlap policy. Actions are either marked for execution, skipped by overlap (`schedule_overlap_skipped`), or deferred (BUFFER_ONE with a running action sets `Attempt = -1`). 3. Actions that survive overlap policy evaluation are checked against the catchup window a second time. If the action was buffered in time but has now expired -- due to the gap between CHASM tasks, overlap policy deferral, or start retries -- it is dropped (`schedule_missed_catchup_window{reason="buffer_expired"}`). This second deadline check is unique to v2. It means a BUFFER_ONE action that waits too long for a slot can be dropped, unlike v1 where it would wait indefinitely. The `action_running` tag on `reason=buffer_expired` distinguishes whether the expiration was caused by a long-running action blocking the slot (`action_running=true`) or by system delay in CHASM task processing (`action_running=false`). For starts deferred by BUFFER_ONE/BUFFER_ALL, `action_running=true` is preserved when the start expires because it waited behind a running action, even if that action has completed by the time the metric is emitted. ## New metrics ### `schedule_overlap_skipped` (counter) **Tags:** `namespace`, `scheduler_backend`, `schedule_overlap_policy` Actions are skipped when the overlap policy prevents starting a new action while a previous one is still running. Today this is only tracked as a running total in `Info.OverlapSkipped` (visible via DescribeSchedule) with no time-series visibility. This counter is emitted for each action dropped by `ProcessBuffer` due to overlap policy. The `schedule_overlap_policy` tag carries the overlap policy that was actually applied when that action was evaluated (e.g., `SCHEDULE_OVERLAP_POLICY_SKIP` or `SCHEDULE_OVERLAP_POLICY_BUFFER_ONE`). One subtlety is that buffered actions do not all resolve overlap policy the same way: - **v1:** normal scheduled actions are buffered with `UNSPECIFIED` and resolved later in `ProcessBuffer` against the schedule's current policy. If `UpdateSchedule` changes the overlap policy while actions are already buffered, those existing buffered scheduled actions will be evaluated under the new policy. - **v2:** the generator resolves overlap policy before buffering and stores the resolved policy on each `BufferedStart`. If `UpdateSchedule` changes the overlap policy later, already-buffered scheduled actions keep the policy they were generated with. This distinction is why the metric is emitted per skipped action using the policy that was actually applied at evaluation time, rather than once per `ProcessBuffer` pass with a single schedule-level tag. **How to use:** - `rate(schedule_overlap_skipped[5m])` shows how often actions are being dropped due to overlap. - Group by `schedule_overlap_policy` to see which policies are causing the most skips. - Compare with `schedule_action_e2e_delay` to see whether skipped actions correlate with actions whose duration exceeds the schedule interval, pushing the total schedule-to-start delay higher. Emitted in v1 (`processBuffer`) and v2 (`InvokerProcessBufferTaskHandler.Execute`). ### `schedule_callback_latency` (timer, v2 only) **Tags:** `namespace`, `scheduler_backend` The v2 scheduler learns about action completions via Nexus completion callbacks. This timer records `time.Since(closeTime)` when `HandleNexusCompletion` processes a callback, measuring the end-to-end delay from action completion to the scheduler receiving the result. This latency includes: callback task creation, callback task queue processing, cross-shard RPC delivery, and handler execution. **Why v2 only:** The v1 and v2 schedulers use fundamentally different mechanisms to discover action completions, and only v2's is worth measuring independently. In v1, completion discovery is **tick-driven**. The scheduler wakes on each scheduled time and does a synchronous `PollMutableState` refresh of all running actions before making overlap decisions. For a SKIP schedule with a 1s interval, the scheduler discovers completions at most 1s after they happen (on the next tick). The long-poll watcher activity exists but is only useful for BUFFER_ONE/BUFFER_ALL where starts sit in the buffer waiting for a completion -- for SKIP, there are no buffered starts and the watcher is irrelevant. The completion discovery latency in v1 is inherently bounded by the schedule interval itself, so there's nothing additional to measure. In v2, completion discovery is **event-driven** via Nexus callbacks. The scheduler has no periodic refresh; it only learns about completions when the callback is delivered. This means callback delivery latency directly gates when the next action can start and when overlap policies are re-evaluated. A slow callback pipeline silently degrades scheduler throughput in a way that has no v1 equivalent. **How to use:** - High values indicate the callback delivery pipeline is slow, which directly delays the next action for schedules using BUFFER_ONE or BUFFER_ALL policies. - Compare with `schedule_action_delay` to understand how much of the total action delay is attributable to callback delivery vs. other factors. - Useful for diagnosing scenarios where v2 schedules appear to "stall" between actions. Emitted in `Scheduler.HandleNexusCompletion`. ## Modified metrics ### `schedule_generate_latency` (timer) -- added to v1, fixed in both Previously v2-only. This timer measures how late the scheduler is in processing a due action: `now - scheduledExecutionTime` (the action's `ActualTime`, after jitter). **Changes:** 1. **Added to v1 scheduler.** The v1 `processTimeRange` now emits this metric, matching the existing v2 behavior. 2. **Emit only for the first action per batch.** When the scheduler catches up over a large time range (e.g., after being down), it processes many due actions in a single pass. Previously every action got its own latency sample against the same `now`. Now only the first action is recorded, representing the actual system delay. 3. **Moved before the catchup window check.** Previously (v2) the metric was emitted after the catchup window check, meaning the worst latencies -- the ones that actually miss the window -- were never recorded. Now it's emitted before, so all latencies including missed actions are captured. **How to use:** - `histogram_quantile(0.99, schedule_generate_latency)` shows worst-case scheduling delay. - A rising trend is an early warning of system instability before actions start being dropped. ### `schedule_missed_catchup_window` (counter) -- added `reason` and `action_running` tags Actions can miss the catchup window at two points in the v2 scheduler: 1. **Generator**: the action's scheduled execution time is older than the catchup window when the generator processes the time range. The action is never buffered. 2. **Invoker (process buffer)**: the action was buffered in time but then sat too long waiting for execution (e.g., due to overlap policy deferral, start retries, or system delay between the gen task and process buffer task). Previously both cases emitted the same untagged counter, making it impossible to distinguish root cause. Now the counter carries two new tags: **`reason` tag:** - `reason=not_buffered` -- the action's scheduled execution time was already past the catchup window when the generator processed the time range. It was never buffered for execution. Present in both v1 and v2. - `reason=buffer_expired` -- the action was buffered in time but expired before execution (e.g., due to overlap deferral, retries, or system delay between CHASM tasks). V2 only. V1 only has the `not_buffered` case because generation and buffer processing happen synchronously in the same workflow task -- there is no gap where a buffered start can sit and expire. **`action_running` tag:** - `action_running=true` -- a previous action was running when the buffered start was deferred, or was still running when the start was dropped. This indicates overlap wait contributed to the missed action. - `action_running=false` -- no action was blocking the start when it expired, so the miss was due to other factors (system delay, scheduler processing time). Present only on `reason=buffer_expired` (v2). Not added to `reason=not_buffered` because: in v2, the generator doesn't have access to the invoker's running action state; in v1, running action state is stale at the point where the catchup window check runs (the refresh happens later in `processBuffer`). **How to use:** - `schedule_missed_catchup_window{reason="not_buffered"}` means the scheduler was too late to process the action. This check is independent of whether a previous action is running. - `schedule_missed_catchup_window{reason="buffer_expired"}` indicates actions were buffered on time but the invoker was delayed in executing them. V2 only. - `schedule_missed_catchup_window{reason="buffer_expired", action_running="true"}` narrows the cause to a long-running previous action blocking execution of buffered starts, including starts that expire after the previous action completes but before they can be executed. For v2 it's possible that completion callbacks are not being delivered in time. - `schedule_missed_catchup_window{reason="buffer_expired", action_running="false"}` means no previous action was blocking execution -- the delay was in CHASM task processing between the generator and invoker. - Comparing the two rates shows whether the bottleneck is in waking the scheduler vs. executing buffered actions. ## Existing metrics (not modified, included for context) ### `schedule_action_delay` (timer) **Tags:** `namespace`, `scheduler_backend`, `schedule_action` Measures the time between when an action became eligible for execution and when it was actually started: `actualStartTime - desiredTime`. `desiredTime` is either: - The action's scheduled execution time (`ActualTime`), for the first action after the scheduler wakes up. - The `CloseTime` of the previous action, for subsequent buffered starts (set in `processWatcherResult` in v1, `recordCompletedAction` in v2). This means that for BUFFER_ONE/BUFFER_ALL, the delay measures only the gap between the previous action completing and the next one starting, not the total wait from the original schedule time. This metric captures the delay from "action eligible" to "action started," which includes buffer processing, rate limiting, and the actual start RPC. However, it does **not** capture time spent waiting for a previous action to complete under BUFFER_ONE/BUFFER_ALL overlap policies, because `DesiredTime` is reset to the previous action's `CloseTime` when it completes (in `processWatcherResult` for v1, `recordCompletedAction` for v2). This means for buffered starts, `schedule_action_delay` measures only the gap between the slot opening and the next action starting -- not the total wall-clock wait from the original schedule time. For the full wall-clock delay including overlap policy wait, use `schedule_action_e2e_delay`. **Relationship to other metrics:** - `schedule_generate_latency` measures a subset: how late the scheduler was in waking up to process the action. This is always <= `schedule_action_delay`. - `schedule_callback_latency` (v2) measures a different leg: how long it took to learn about the previous action's completion. For BUFFER_ONE, high callback latency contributes to high action delay on the next start. - `schedule_action_delay` is the metric most directly visible to users as "how fast did we start after the slot opened." - `schedule_action_e2e_delay` is the metric most directly visible to users as "how late was my scheduled action" -- it includes overlap policy wait time that `schedule_action_delay` excludes. ### `schedule_action_e2e_delay` (timer) -- NEW **Tags:** `namespace`, `scheduler_backend`, `schedule_action` Measures the total wall-clock delay from the action's original schedule time to when it was actually started: `actualStartTime - ActualTime`. Unlike `schedule_action_delay`, this metric always uses the original schedule time as the baseline and is never reset when a previous action completes. This means for BUFFER_ONE/BUFFER_ALL schedules where an action was waiting in the buffer for a previous one to finish, this metric includes the full overlap policy wait time. For the first action after the scheduler wakes (where `DesiredTime` has not been reset), this metric will equal `schedule_action_delay`. **How to use:** - Compare with `schedule_action_delay` to isolate how much delay is caused by overlap policy wait vs. other factors. If `schedule_action_e2e_delay` is high but `schedule_action_delay` is low, the bottleneck is the previous action's duration. - `histogram_quantile(0.99, schedule_action_e2e_delay)` shows worst-case total delay from schedule time to actual start, as experienced by users. Emitted in v1 (`startWorkflow`) and v2 (`InvokerExecuteTaskHandler.startWorkflow`). ## Detecting missed/delayed actions due to system instability These metrics form a pipeline from early warning to confirmed impact. The available signals differ between v1 and v2 due to their architectural differences. ### V1 (workflow-based scheduler) The v1 scheduler runs as a single workflow. Generation and buffer processing happen synchronously in the same workflow task. The scheduler discovers action completions via a synchronous `PollMutableState` refresh on each tick. **Available metrics:** `schedule_generate_latency`, `schedule_action_delay`, `schedule_action_e2e_delay`, `schedule_missed_catchup_window{reason="not_buffered"}`, `schedule_overlap_skipped` **Diagnosis flow:** 1. **Early warning: `schedule_generate_latency`** The first metric to move. Measures how late the scheduler is in processing due actions. Under normal conditions this should be near-zero. A rising trend means the scheduler is delayed in processing -- the worker is under load, matching is slow, or the task queue is backed up. ```promql histogram_quantile(0.99, rate(schedule_generate_latency_bucket{scheduler_backend="legacy"}[5m])) ``` 2. **Confirmed drops: `schedule_missed_catchup_window{reason="not_buffered"}`** Once generate latency exceeds the catchup window, actions are permanently lost. In v1 this is always `reason=not_buffered` -- generation and buffer processing are synchronous, so there is no gap where a buffered start can expire independently. ```promql rate(schedule_missed_catchup_window{scheduler_backend="legacy", reason="not_buffered"}[5m]) ``` 3. **User-facing impact: `schedule_action_delay` and `schedule_action_e2e_delay`** `schedule_action_delay` measures the delay from when the action became eligible (after any overlap wait) to when it started. Absent high `schedule_generate_latency`, a high `schedule_action_delay` points to buffer processing -- rate limiting or the start RPC itself. `schedule_action_e2e_delay` measures total delay from the original schedule time, including overlap policy wait. Absent high `schedule_action_delay`, a high `schedule_action_e2e_delay` indicates the bottleneck is previous action duration, not system instability. 4. **Overlap context: `schedule_overlap_skipped`** Absent high `schedule_callback_latency` (v2), a high skip rate suggests actions are taking longer than the schedule interval (application-level issue). In v2, delayed callback delivery can also cause skips -- the scheduler still sees the previous action as running even though it has completed, because the completion callback hasn't arrived yet. ### V2 (CHASM scheduler) The v2 scheduler splits work across independent CHASM tasks: a generator task buffers actions, and an invoker process-buffer task resolves and executes them. The scheduler learns about action completions via Nexus callbacks, not polling. This creates additional failure modes between the generator and invoker, and in the callback delivery pipeline. **Available metrics:** `schedule_generate_latency`, `schedule_action_delay`, `schedule_action_e2e_delay`, `schedule_missed_catchup_window{reason="not_buffered"|"buffer_expired"}`, `schedule_callback_latency`, `schedule_overlap_skipped` **Diagnosis flow:** 1. **Early warning: `schedule_generate_latency`** Same as v1 -- measures how late the generator task is in processing due actions. ```promql histogram_quantile(0.99, rate(schedule_generate_latency_bucket{scheduler_backend="chasm"}[5m])) ``` 2. **Confirmed drops: `schedule_missed_catchup_window`** The `reason` tag identifies where the drop occurred: - `reason=not_buffered`: The generator was too late to buffer the action. Same as v1 -- the clearest signal that the system is to blame. - `reason=buffer_expired`: The action was buffered on time but expired waiting for execution. The `action_running` tag distinguishes cause: - `action_running=true`: A previous action was still running, or had deferred the buffered start long enough that it expired before execution. Absent high `schedule_callback_latency`, this points to action duration exceeding the catchup window (application-level issue). If `schedule_callback_latency` is also high, delayed callback delivery may be the actual cause -- the previous action completed but the scheduler hasn't learned about it yet. - `action_running=false`: No action was blocking it. The delay was in CHASM task processing between the generator and invoker -- system instability. ```promql # system-caused drops at the generator (strongest signal) rate(schedule_missed_catchup_window{scheduler_backend="chasm", reason="not_buffered"}[5m]) # system-caused drops at the invoker (no action blocking) rate(schedule_missed_catchup_window{scheduler_backend="chasm", reason="buffer_expired", action_running="false"}[5m]) # drops caused by long-running actions rate(schedule_missed_catchup_window{scheduler_backend="chasm", reason="buffer_expired", action_running="true"}[5m]) ``` 3. **Callback pipeline health: `schedule_callback_latency`** For schedules using BUFFER_ONE or BUFFER_ALL, the callback pipeline is on the critical path between one action completing and the next starting. Absent high `schedule_generate_latency`, a high `schedule_callback_latency` indicates the callback delivery pipeline is the bottleneck. A slow callback pipeline means the scheduler doesn't learn about completions promptly, delaying the next buffered action and potentially causing overlap skips or buffer expirations. ```promql histogram_quantile(0.99, rate(schedule_callback_latency_bucket[5m])) ``` 4. **User-facing impact: `schedule_action_delay` and `schedule_action_e2e_delay`** `schedule_action_delay` measures delay from when the action became eligible to when it started. `schedule_action_e2e_delay` measures total delay from the original schedule time. If the latter is high, work backwards: - Is `schedule_generate_latency` high? -> Generator task is delayed (system instability). - Is `schedule_callback_latency` high? -> Completion callbacks are slow (system instability in the callback pipeline). - Is `schedule_missed_catchup_window{reason="buffer_expired", action_running="false"}` firing? -> Invoker task is delayed (system instability between CHASM tasks). - Is `schedule_action_e2e_delay` high but `schedule_action_delay` low? -> Previous action duration is the bottleneck (application-level issue). - Is `schedule_overlap_skipped` spiking? -> Overlap policy is dropping actions because previous actions run too long (application-level issue). 5. **Overlap context: `schedule_overlap_skipped`** Same as v1. Absent high `schedule_callback_latency`, a high skip rate suggests action duration exceeds the schedule interval (application-level issue). If `schedule_callback_latency` is also high, the skips may be caused by delayed completion callbacks rather than actual action duration. ## Detecting missed/delayed actions due to schedule configuration Actions can be missed or delayed due to the schedule's own configuration -- for example, when action duration exceeds the schedule interval. These are not caused by system instability but may still require attention. **Symptoms:** - `schedule_overlap_skipped` is elevated, absent high `schedule_callback_latency` (v2). Actions are being dropped because the previous action hasn't completed by the time the next one is due. - `schedule_action_e2e_delay` is high but `schedule_action_delay` is low. The total delay from schedule time to start is large, but the scheduler started the action promptly once the slot opened. The gap is overlap policy wait. - `schedule_missed_catchup_window{reason="buffer_expired", action_running="true"}` is firing (v2 only), absent high `schedule_callback_latency`. Buffered actions are expiring because they spent too long waiting behind a previous action. **Diagnosis:** 1. Compare `schedule_action_e2e_delay` with `schedule_action_delay`. A large gap between them indicates time spent waiting for overlap policy to allow execution. 2. Check `schedule_overlap_skipped` grouped by `schedule_overlap_policy`. A high rate on SKIP or BUFFER_ONE indicates the schedule interval is shorter than the action duration. 3. For BUFFER_ONE/BUFFER_ALL schedules, check whether `schedule_missed_catchup_window{reason="buffer_expired", action_running="true"}` is firing. This means buffered actions are waiting so long for the slot that they exceed the catchup window. **Resolution:** These issues are addressed by changing the schedule configuration -- increasing the interval, switching to a more permissive overlap policy, or reducing action duration. They do not indicate a problem with the scheduler or the platform. ### `schedule_buffer_overruns` (counter) **Tags:** `namespace`, `scheduler_backend` Counts actions dropped because the internal action buffer is full (bounded by `MaxBufferSize`). Emitted in v1 (`processTimeRange` and signal handling) and v2 (`GeneratorTaskHandler.Execute`). ## Backfills and manual triggers Backfills and manual triggers (TriggerImmediately) generate actions with the `Manual` flag set. Each backfill request can specify its own overlap policy -- if unspecified, it resolves to the schedule's configured policy. Backfill requests can use ALLOW_ALL to run all backfilled actions concurrently. **Not emitted for manual/backfill actions:** - `schedule_generate_latency` -- backfill actions are generated against a historical time range, so `now - ActualTime` is meaningless as a system delay indicator. - `schedule_missed_catchup_window` -- backfill actions skip the catchup window check entirely. The time range is intentionally in the past. - `schedule_action_delay` and `schedule_action_e2e_delay` -- the delay between a historical schedule time and now is not meaningful for backfills. **Emitted for manual/backfill actions:** - `schedule_overlap_skipped` -- backfill actions go through `ProcessBuffer` and are subject to overlap policy. A backfill with SKIP policy against a running action will increment this counter. The `schedule_overlap_policy` tag reflects the backfill's own policy, not the schedule's configured policy. - `schedule_buffer_overruns` -- backfills can fill the action buffer, especially large backfills over wide time ranges. In v1, backfills are limited to half the buffer (`MaxBufferSize/2`) and processed incrementally (`BackfillsPerIteration` per tick). In v2, each backfill request creates a separate `Backfiller` component that generates actions independently. - `schedule_callback_latency` (v2) -- recorded when any action's completion callback arrives, regardless of whether it was manually triggered. **Interaction with regular actions:** Whether backfills contend with regular actions depends on the backfill's overlap policy: - **ALLOW_ALL (typical for backfills):** Actions started with ALLOW_ALL are not tracked as running. This means they do not trigger overlap policy evaluation on regular actions -- a SKIP schedule will continue to start regular actions normally even while ALLOW_ALL backfill actions are running. Buffer contention is the only possible interference: in v1, backfills self-limit to half the buffer (`MaxBufferSize/2`) and add at most `BackfillsPerIteration` (default 10) per tick, with the generator running first each tick. Buffer contention is unlikely unless backfill-started actions from previous ticks accumulate in the buffer faster than they can be executed. - **Non-ALLOW_ALL (SKIP, BUFFER_ONE, etc.):** Actions started with these policies are tracked as running. A backfill-started action that is still running will cause regular actions to be skipped or deferred by overlap policy. This can show up as elevated `schedule_overlap_skipped` or increased `schedule_action_e2e_delay` on regular actions while the backfill is in progress. This is the primary contention scenario -- the backfill's running actions block the schedule's normal execution. ## Glossary - **Scheduled action**: The operation the scheduler executes at each scheduled time. Currently this is always starting a workflow, but the scheduler is designed to support other primitives (e.g., activities) in the future. - **Nominal time** (`NominalTime`): The exact time that matches the schedule spec, before jitter is applied. Used to generate the workflow ID (ensuring deterministic naming) and the request ID. Corresponds to `GetNextTimeResult.Nominal`. - **Actual time** (`ActualTime`): The nominal time with jitter applied. This is the time the action is actually scheduled to execute. Corresponds to `GetNextTimeResult.Next`. Stored on `BufferedStart.ActualTime`. - **Desired time** (`DesiredTime`): The baseline time used to compute `schedule_action_delay`. Initially nil. When a previous action completes, `DesiredTime` on the next pending buffered start is set to the previous action's `CloseTime`. If nil at metric emission time, falls back to `ActualTime`. This reset means `schedule_action_delay` measures the gap from "slot opened" to "action started" for buffered starts, not the total wait from the original schedule time. - **Catchup window**: A configurable duration (default: 365 days, minimum: 10 seconds) that determines how old an action can be before it is permanently dropped. If `now - ActualTime > catchupWindow`, the action is considered too stale to execute. Checked in the generator (both v1 and v2) and again in the v2 invoker for buffered actions. - **Overlap policy**: Determines what happens when a new action is due but a previous action is still running. - **SKIP**: Drop the new action. No concurrent or queued actions. This is the default policy. - **BUFFER_ONE**: Queue at most one pending action. When the running action completes, the queued action starts. If another action comes due while one is already queued, the new one is dropped. - **BUFFER_ALL**: Queue all pending actions. Every due action is queued and will eventually execute in order after the running action completes. - **CANCEL_OTHER**: Cancel the running action and start the new one immediately. - **TERMINATE_OTHER**: Terminate the running action and start the new one immediately. - **Generator**: The phase that iterates over the schedule spec's time range, checks the catchup window, and buffers actions for execution. In v1 this is `processTimeRange`; in v2 this is the `GeneratorTaskHandler`. - **Invoker / buffer processing**: The phase that evaluates overlap policy on the buffer and executes actions. In v1 this is `processBuffer`; in v2 this is the `InvokerProcessBufferTaskHandler` followed by `InvokerExecuteTaskHandler`. - **Deferred action**: A buffered action that was evaluated by `ProcessBuffer` but could not be executed because a previous action is still running under BUFFER_ONE or BUFFER_ALL policy. The action stays in the buffer and is re-evaluated on the next processing pass. In v2, deferred actions are marked with `Attempt = -1` to distinguish them from newly enqueued actions. In v1, deferred actions remain in the buffer with their original state and are re-evaluated on each tick. (cherry picked from commit 71dd8ed69462a0b940d12a47ffcf9245cd65f7e0) --- chasm/lib/scheduler/invoker.go | 14 ++++++--- chasm/lib/scheduler/invoker_tasks.go | 41 +++++++++++++++++++++++-- chasm/lib/scheduler/scheduler.go | 11 +++++++ chasm/lib/scheduler/spec_processor.go | 20 ++++++++---- common/metrics/metric_defs.go | 17 ++++++++++ service/worker/scheduler/buffer.go | 6 +++- service/worker/scheduler/buffer_test.go | 34 +++++++++++++++++++- service/worker/scheduler/workflow.go | 28 +++++++++++++++-- 8 files changed, 153 insertions(+), 18 deletions(-) diff --git a/chasm/lib/scheduler/invoker.go b/chasm/lib/scheduler/invoker.go index 41f51c69f0a..2a12684ecdb 100644 --- a/chasm/lib/scheduler/invoker.go +++ b/chasm/lib/scheduler/invoker.go @@ -65,10 +65,12 @@ type processBufferResult struct { discardStarts []*schedulespb.BufferedStart // Number of buffered starts dropped due to overlap policy during processing. - overlapSkipped int64 + overlapSkipped int64 + overlapSkippedByPolicy map[enumspb.ScheduleOverlapPolicy]int64 - // Nunmber of buffered starts dropped from missing the catchup window. - missedCatchupWindow int64 + // Number of buffered starts dropped from missing the catchup window, + // bucketed by whether a running action contributed to the miss. + missedCatchupByActionRunning map[bool]int64 } // recordProcessBufferResult updates the Invoker's internal state based on result, as well as the @@ -268,8 +270,10 @@ func (i *Invoker) recordCompletedAction( // Update DesiredTime on the first pending start for metrics. DesiredTime is used // to drive action latency between buffered starts (the time it takes between - // completing one start and kicking off the next). We set that on the first start - // pending execution. + // completing one start and kicking off the next). It also signals in processBuffer + // that this start was blocked behind a running action: if DesiredTime (the previous + // action's CloseTime) is past the start's catchup deadline, the previous action's + // duration caused the miss. idx := slices.IndexFunc(i.BufferedStarts, func(start *schedulespb.BufferedStart) bool { return start.Attempt == 0 }) diff --git a/chasm/lib/scheduler/invoker_tasks.go b/chasm/lib/scheduler/invoker_tasks.go index b62ee8a1c81..4069bc3cfc8 100644 --- a/chasm/lib/scheduler/invoker_tasks.go +++ b/chasm/lib/scheduler/invoker_tasks.go @@ -402,10 +402,25 @@ func (h *InvokerProcessBufferTaskHandler) Execute( result := h.processBuffer(ctx, invoker, scheduler) // Update Scheduler metadata. + var totalMissedCatchup int64 + for _, count := range result.missedCatchupByActionRunning { + totalMissedCatchup += count + } scheduler.recordActionResult(&schedulerActionResult{ overlapSkipped: result.overlapSkipped, - missedCatchupWindow: result.missedCatchupWindow, + missedCatchupWindow: totalMissedCatchup, }) + for overlapPolicy, count := range result.overlapSkippedByPolicy { + newTaggedMetricsHandler(h.metricsHandler, scheduler).WithTags( + metrics.StringTag(metrics.ScheduleOverlapPolicyTag, overlapPolicy.String()), + ).Counter(metrics.ScheduleOverlapSkipped.Name()).Record(count) + } + for actionRunning, count := range result.missedCatchupByActionRunning { + newTaggedMetricsHandler(h.metricsHandler, scheduler).WithTags( + metrics.StringTag(metrics.ScheduleMissedReasonTag, metrics.ScheduleMissedReasonBufferExpired), + metrics.StringTag(metrics.ScheduleActionRunningTag, fmt.Sprintf("%t", actionRunning)), + ).Counter(metrics.ScheduleMissedCatchupWindow.Name()).Record(count) + } // Update internal state and create new tasks. invoker.recordProcessBufferResult(ctx, &result) @@ -423,6 +438,7 @@ func (h *InvokerProcessBufferTaskHandler) processBuffer( ) (result processBufferResult) { runningWorkflows := invoker.runningWorkflowExecutions() isRunning := len(runningWorkflows) > 0 + result.missedCatchupByActionRunning = make(map[bool]int64) // Processing completely ignores any BufferedStart that's already executing/backing off. pendingBufferedStarts := util.FilterSlice(invoker.GetBufferedStarts(), func(start *schedulespb.BufferedStart) bool { @@ -447,13 +463,28 @@ func (h *InvokerProcessBufferTaskHandler) processBuffer( // Update result metrics. result.overlapSkipped = action.OverlapSkipped + result.overlapSkippedByPolicy = action.OverlapSkippedByPolicy // Add starting workflows to result, trim others. Catchup-window expiry is // checked before useScheduledAction so that a start past its catchup // window doesn't consume a LimitedActions slot. for _, start := range readyStarts { - if ctx.Now(invoker).After(h.startWorkflowDeadline(ctx, scheduler, start)) { - result.missedCatchupWindow++ + deadline := h.startWorkflowDeadline(ctx, scheduler, start) + if ctx.Now(invoker).After(deadline) { + // Action was buffered in time but expired before execution + // (e.g., due to overlap deferral, retries, or system delay). + // Only emit the metric if the schedule would have run this + // start -- skip paused or action-exhausted schedules. + if start.Manual || scheduler.useScheduledAction(false) { + // Determine if a running action contributed: either one is still + // running, or the previous action's CloseTime (stored in DesiredTime) + // was already past this start's deadline. + // Note: if no prior action completed, DesiredTime is zero-valued, + // so After(deadline) is false, correctly yielding actionRunning=false. + actionRunning := isRunning || + start.GetDesiredTime().AsTime().After(deadline) + result.missedCatchupByActionRunning[actionRunning]++ + } result.discardStarts = append(result.discardStarts, start) continue } @@ -608,6 +639,10 @@ func (h *InvokerExecuteTaskHandler) startWorkflow( metricsHandler. Timer(metrics.ScheduleActionDelay.Name()). Record(actualStartTime.Sub(desiredTime.AsTime())) + // Record total delay from original schedule time, including any overlap policy wait. + metricsHandler. + Timer(metrics.ScheduleActionE2EDelay.Name()). + Record(actualStartTime.Sub(start.ActualTime.AsTime())) } return nil } diff --git a/chasm/lib/scheduler/scheduler.go b/chasm/lib/scheduler/scheduler.go index 0de9856704d..b042b029248 100644 --- a/chasm/lib/scheduler/scheduler.go +++ b/chasm/lib/scheduler/scheduler.go @@ -19,6 +19,7 @@ import ( "go.temporal.io/server/chasm/lib/scheduler/gen/schedulerpb/v1" "go.temporal.io/server/common" "go.temporal.io/server/common/contextutil" + "go.temporal.io/server/common/metrics" "go.temporal.io/server/common/payload" "go.temporal.io/server/common/primitives/timestamp" "go.temporal.io/server/common/util" @@ -568,6 +569,16 @@ func (s *Scheduler) HandleNexusCompletion( return nil } + // Record how long it took for the callback to arrive after the action completed. + // Use ctx.Now instead of time.Since to use a consistent time source across nodes, + // and clamp to zero in case of clock skew. + if closeTime := info.GetCloseTime().AsTime(); !closeTime.IsZero() { + latency := max(0, ctx.Now(s).Sub(closeTime)) + newTaggedMetricsHandler(ctx.MetricsHandler(), s). + Timer(metrics.ScheduleCallbackLatency.Name()). + Record(latency) + } + // Handle last completed/failed status and payloads. // // TODO - also record payload sizes once we have metrics wired into CHASM context. diff --git a/chasm/lib/scheduler/spec_processor.go b/chasm/lib/scheduler/spec_processor.go index 61585781b80..f3b60d48e11 100644 --- a/chasm/lib/scheduler/spec_processor.go +++ b/chasm/lib/scheduler/spec_processor.go @@ -123,6 +123,7 @@ func (s *SpecProcessorImpl) ProcessTimeRange( var err error var bufferedStarts []*schedulespb.BufferedStart var droppedCount int64 + recordedGenerateLatency := false limitReached := false for next, err = s.NextTime(scheduler, start); err == nil && (!next.Next.IsZero() && !next.Next.After(end)); next, err = s.NextTime(scheduler, next.Next) { lastAction = next.Next @@ -138,21 +139,28 @@ func (s *SpecProcessorImpl) ProcessTimeRange( continue } + // Record generate latency only for the first action in the batch to + // avoid inflating the metric when catching up over a large time range. + if !manual && !recordedGenerateLatency { + metricsHandler.Timer(metrics.ScheduleGenerateLatency.Name()). + Record(end.Sub(next.Next)) + recordedGenerateLatency = true + } + if !manual && end.Sub(next.Next) > catchupWindow { s.logger.Info("Schedule missed catchup window", tag.Time("now", end), tag.Time("time", next.Next)) - metricsHandler.Counter(metrics.ScheduleMissedCatchupWindow.Name()).Record(1) + // Action's nominal time was already past the catchup window when + // the generator processed the time range. It was never buffered. + metricsHandler.WithTags( + metrics.StringTag(metrics.ScheduleMissedReasonTag, metrics.ScheduleMissedReasonNotBuffered), + ).Counter(metrics.ScheduleMissedCatchupWindow.Name()).Record(1) scheduler.Info.MissedCatchupWindow++ continue } - if !manual { - metricsHandler.Timer(metrics.ScheduleGenerateLatency.Name()). - Record(end.Sub(next.Next)) - } - if limitReached { droppedCount++ continue diff --git a/common/metrics/metric_defs.go b/common/metrics/metric_defs.go index 6bdbf3b243a..dda31d0ed24 100644 --- a/common/metrics/metric_defs.go +++ b/common/metrics/metric_defs.go @@ -619,6 +619,11 @@ const ( ScheduleBackendChasm = "chasm" ScheduleBackendLegacy = "legacy" ScheduleBackendWorkflow = "workflow" + ScheduleOverlapPolicyTag = "schedule_overlap_policy" + ScheduleMissedReasonTag = "reason" + ScheduleMissedReasonNotBuffered = "not_buffered" + ScheduleMissedReasonBufferExpired = "buffer_expired" + ScheduleActionRunningTag = "action_running" ScheduleMigrationDirectionTag = "schedule_migration_direction" ScheduleMigrationDirectionToChasm = "to_chasm" ScheduleMigrationDirectionToWorkflow = "to_workflow" @@ -1439,6 +1444,10 @@ var ( "schedule_action_delay", WithDescription("Delay between when scheduled actions should/actually happen"), ) + ScheduleActionE2EDelay = NewTimerDef( + "schedule_action_e2e_delay", + WithDescription("End-to-end delay between the action's original schedule time and when it was actually started, including overlap policy wait"), + ) ScheduleGenerateLatency = NewTimerDef( "schedule_generate_latency", WithDescription("Delay between when a scheduled action was due and when the generator buffered it"), @@ -1459,6 +1468,14 @@ var ( "schedule_migration_failed", WithDescription("The number of times a schedule migration fails"), ) + ScheduleOverlapSkipped = NewCounterDef( + "schedule_overlap_skipped", + WithDescription("The number of schedule actions skipped due to overlap policy"), + ) + ScheduleCallbackLatency = NewTimerDef( + "schedule_callback_latency", + WithDescription("Latency between a scheduled action completing and the scheduler receiving the completion callback"), + ) // Worker Versioning WorkerDeploymentCreated = NewCounterDef("worker_deployment_created") diff --git a/service/worker/scheduler/buffer.go b/service/worker/scheduler/buffer.go index ddea9f2a83c..d2a20a2ecdb 100644 --- a/service/worker/scheduler/buffer.go +++ b/service/worker/scheduler/buffer.go @@ -22,7 +22,8 @@ type ( NeedCancel bool NeedTerminate bool // Stats - OverlapSkipped int64 + OverlapSkipped int64 + OverlapSkippedByPolicy map[enumspb.ScheduleOverlapPolicy]int64 } ) @@ -40,6 +41,7 @@ func ProcessBuffer[T Overlappable]( // affect them. var action ProcessBufferResult[T] + action.OverlapSkippedByPolicy = make(map[enumspb.ScheduleOverlapPolicy]int64) var zeroVal T for _, start := range buffer { @@ -64,12 +66,14 @@ func ProcessBuffer[T Overlappable]( case enumspb.SCHEDULE_OVERLAP_POLICY_SKIP: // just skip action.OverlapSkipped++ + action.OverlapSkippedByPolicy[overlapPolicy]++ case enumspb.SCHEDULE_OVERLAP_POLICY_BUFFER_ONE: // allow one (the first one) in the buffer if len(action.NewBuffer) == 0 { action.NewBuffer = append(action.NewBuffer, start) } else { action.OverlapSkipped++ + action.OverlapSkippedByPolicy[overlapPolicy]++ } case enumspb.SCHEDULE_OVERLAP_POLICY_BUFFER_ALL: // always add to buffer diff --git a/service/worker/scheduler/buffer_test.go b/service/worker/scheduler/buffer_test.go index 85750e5cc9b..af48ac60624 100644 --- a/service/worker/scheduler/buffer_test.go +++ b/service/worker/scheduler/buffer_test.go @@ -51,6 +51,10 @@ func (s *processBufferSuite) TestProcessSkipRunning() { s.Empty(action.NewBuffer) s.False(action.NeedCancel) s.False(action.NeedTerminate) + s.Equal(int64(3), action.OverlapSkipped) + s.Equal(map[enumspb.ScheduleOverlapPolicy]int64{ + enumspb.SCHEDULE_OVERLAP_POLICY_SKIP: 3, + }, action.OverlapSkippedByPolicy) } func (s *processBufferSuite) TestProcessSkipNotRunning() { @@ -71,6 +75,10 @@ func (s *processBufferSuite) TestProcessBufferOneRunning() { s.Equal([]int{3}, jobIds(action.NewBuffer)) s.False(action.NeedCancel) s.False(action.NeedTerminate) + s.Equal(int64(2), action.OverlapSkipped) + s.Equal(map[enumspb.ScheduleOverlapPolicy]int64{ + enumspb.SCHEDULE_OVERLAP_POLICY_BUFFER_ONE: 2, + }, action.OverlapSkippedByPolicy) } func (s *processBufferSuite) TestProcessBufferOneNotRunning() { @@ -168,4 +176,28 @@ func (s *processBufferSuite) TestProcessWithResolve() { s.False(action.NeedTerminate) } -// TODO: add test cases for mixed policies +func (s *processBufferSuite) TestProcessMixedPoliciesTracksSkippedByResolvedPolicy() { + buffer := []*job{ + {3, enumspb.SCHEDULE_OVERLAP_POLICY_SKIP}, + {5, enumspb.SCHEDULE_OVERLAP_POLICY_BUFFER_ONE}, + {7, enumspb.SCHEDULE_OVERLAP_POLICY_UNSPECIFIED}, + {9, enumspb.SCHEDULE_OVERLAP_POLICY_UNSPECIFIED}, + } + resolve := func(policy enumspb.ScheduleOverlapPolicy) enumspb.ScheduleOverlapPolicy { + if policy == enumspb.SCHEDULE_OVERLAP_POLICY_UNSPECIFIED { + return enumspb.SCHEDULE_OVERLAP_POLICY_SKIP + } + return policy + } + + action := ProcessBuffer(buffer, true, resolve) + s.Empty(action.OverlappingStarts) + s.Nil(action.NonOverlappingStart) + s.Equal([]int{5}, jobIds(action.NewBuffer)) + s.False(action.NeedCancel) + s.False(action.NeedTerminate) + s.Equal(int64(3), action.OverlapSkipped) + s.Equal(map[enumspb.ScheduleOverlapPolicy]int64{ + enumspb.SCHEDULE_OVERLAP_POLICY_SKIP: 3, + }, action.OverlapSkippedByPolicy) +} diff --git a/service/worker/scheduler/workflow.go b/service/worker/scheduler/workflow.go index d237197e679..b647c415d7b 100644 --- a/service/worker/scheduler/workflow.go +++ b/service/worker/scheduler/workflow.go @@ -685,6 +685,7 @@ func (s *scheduler) processTimeRange( } lastAction := start + recordedGenerateLatency := false var next GetNextTimeResult for next = s.getNextTime(start); !(next.Next.IsZero() || next.Next.After(end)); next = s.getNextTime(next.Next) { if !s.hasMinVersion(BatchAndCacheTimeQueries) && !s.canTakeScheduledAction(manual, false) { @@ -696,9 +697,21 @@ func (s *scheduler) processTimeRange( // hasMinVersion because this condition couldn't happen in previous versions. continue } + // Record generate latency only for the first action in the batch to + // avoid inflating the metric when catching up over a large time range. + if !manual && !recordedGenerateLatency { + s.metrics.Timer(metrics.ScheduleGenerateLatency.Name()).Record(end.Sub(next.Next)) + recordedGenerateLatency = true + } if !manual && end.Sub(next.Next) > catchupWindow { s.logger.Warn("Schedule missed catchup window", "now", end, "time", next.Next) - s.metrics.Counter(metrics.ScheduleMissedCatchupWindow.Name()).Inc(1) + // Action's nominal time was already past the catchup window when + // the scheduler woke up. It was never buffered for execution. + // action_running is not included: the action was never a candidate + // for execution, so whether something is running is irrelevant. + s.metrics.WithTags(map[string]string{ + metrics.ScheduleMissedReasonTag: metrics.ScheduleMissedReasonNotBuffered, + }).Counter(metrics.ScheduleMissedCatchupWindow.Name()).Inc(1) s.Info.MissedCatchupWindow++ continue } @@ -921,7 +934,11 @@ func (s *scheduler) processWatcherResult(id string, f workflow.Future, long bool } } - // Update desired time of next start if it's buffered. This is used for metrics only. + // Update desired time of next start if it's buffered. This is used to + // compute ScheduleActionDelay (time between completing one action and + // starting the next). The legacy path doesn't use deferred starts + // (Attempt == -1) like CHASM, so BufferedStarts[0] is always the next + // pending start. if long && len(s.State.BufferedStarts) > 0 { s.State.BufferedStarts[0].DesiredTime = res.CloseTime } @@ -1354,6 +1371,11 @@ func (s *scheduler) processBuffer() bool { s.State.BufferedStarts = action.NewBuffer s.Info.OverlapSkipped += action.OverlapSkipped + for overlapPolicy, count := range action.OverlapSkippedByPolicy { + s.metrics.WithTags(map[string]string{ + metrics.ScheduleOverlapPolicyTag: overlapPolicy.String(), + }).Counter(metrics.ScheduleOverlapSkipped.Name()).Inc(count) + } // Try starting whatever we're supposed to start now allStarts := action.OverlappingStarts @@ -1501,6 +1523,8 @@ func (s *scheduler) startWorkflow( // record metric only for _scheduled_ actions, not trigger/backfill, otherwise it's not meaningful desiredTime := cmp.Or(start.DesiredTime, start.ActualTime) s.metrics.Timer(metrics.ScheduleActionDelay.Name()).Record(res.RealStartTime.AsTime().Sub(desiredTime.AsTime())) + // Record total delay from original schedule time, including any overlap policy wait. + s.metrics.Timer(metrics.ScheduleActionE2EDelay.Name()).Record(res.RealStartTime.AsTime().Sub(start.ActualTime.AsTime())) } actionResult := &schedulepb.ScheduleActionResult{