Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 45 additions & 1 deletion pkg/controller/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,14 @@ func (c *Controller) advanceCanary(name string, namespace string) {
if err != nil {
c.recordEventWarningf(cd, "%v", err)
if !retriable {
c.rollback(cd, canaryController, meshRouter, scalerReconciler)
// during promotion the canary is the only healthy copy, halt
// instead of rolling back traffic to the unhealthy primary
if cd.Status.Phase == flaggerv1.CanaryPhasePromoting ||
cd.Status.Phase == flaggerv1.CanaryPhaseFinalising {
Comment thread
aryan9600 marked this conversation as resolved.
c.handleFailedPromotion(cd, canaryController, meshRouter, err)
} else {
c.rollback(cd, canaryController, meshRouter, scalerReconciler)
}
}
return
}
Expand Down Expand Up @@ -989,6 +996,43 @@ func (c *Controller) rollback(canary *flaggerv1.Canary, canaryController canary.
c.runPostRolloutHooks(canary, flaggerv1.CanaryPhaseFailed)
}

// handleFailedPromotion marks the rollout as failed when the primary is unhealthy
// during promotion and routes all traffic back to the canary, the only healthy
// copy of the new revision. Traffic may already have been shifted to the primary
// by runPromotionTrafficShift, so it must be moved back explicitly.
func (c *Controller) handleFailedPromotion(canary *flaggerv1.Canary, canaryController canary.Controller,
meshRouter router.Interface, err error) {
c.recordEventWarningf(canary, "Promotion of %s.%s failed, primary not ready: %v",
canary.Spec.TargetRef.Name, canary.Namespace, err)
c.alert(canary, fmt.Sprintf("Promotion failed, primary not ready: %v", err),
false, flaggerv1.SeverityError)

// route all traffic to the canary, off the unhealthy primary
primaryWeight := 0
canaryWeight := c.totalWeight(canary)
if err := meshRouter.SetRoutes(canary, primaryWeight, canaryWeight, false); err != nil {
c.recordEventWarningf(canary, "%v", err)
return
}
c.recorder.SetWeight(canary, primaryWeight, canaryWeight)

// mark as failed while reporting the weight that matches the routing
if err := canaryController.SyncStatus(canary, flaggerv1.CanaryStatus{
Phase: flaggerv1.CanaryPhaseFailed, CanaryWeight: canaryWeight}); err != nil {
c.logger.With("canary", fmt.Sprintf("%s.%s", canary.Name, canary.Namespace)).Errorf("%v", err)
return
}

c.recorder.SetStatus(canary, flaggerv1.CanaryPhaseFailed)
c.recorder.IncFailures(metrics.CanaryMetricLabels{
Name: canary.Spec.TargetRef.Name,
Namespace: canary.Namespace,
DeploymentStrategy: canary.DeploymentStrategy(),
AnalysisStatus: metrics.AnalysisStatusCompleted,
})
c.runPostRolloutHooks(canary, flaggerv1.CanaryPhaseFailed)
}

func (c *Controller) setPhaseInitialized(cd *flaggerv1.Canary, canaryController canary.Controller) error {
if cd.Status.Phase == "" || cd.Status.Phase == flaggerv1.CanaryPhaseInitializing {
cd.Status.Phase = flaggerv1.CanaryPhaseInitialized
Expand Down
33 changes: 33 additions & 0 deletions pkg/controller/scheduler_deployment_fixture_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,39 @@ func (f fixture) makeReady(t *testing.T, name string) {
require.NoError(t, err)
}

// makePrimaryNotReady puts the primary into a stuck rollout (progress deadline exceeded).
func (f fixture) makePrimaryNotReady(t *testing.T) {
primaryName := fmt.Sprintf("%s-primary", f.canary.Spec.TargetRef.Name)
p, err := f.kubeClient.AppsV1().
Deployments("default").
Get(context.TODO(), primaryName, metav1.GetOptions{})
require.NoError(t, err)

p.Status = appsv1.DeploymentStatus{
ObservedGeneration: p.Generation,
Replicas: 1,
UpdatedReplicas: 1,
ReadyReplicas: 0,
AvailableReplicas: 0,
Conditions: []appsv1.DeploymentCondition{
{
Type: appsv1.DeploymentProgressing,
Status: corev1.ConditionFalse,
Reason: "ProgressDeadlineExceeded",
},
{
Type: appsv1.DeploymentAvailable,
Status: corev1.ConditionFalse,
Reason: "MinimumReplicasUnavailable",
LastUpdateTime: metav1.Now(),
},
},
}

_, err = f.kubeClient.AppsV1().Deployments("default").Update(context.TODO(), p, metav1.UpdateOptions{})
require.NoError(t, err)
}

func newDeploymentFixture(c *flaggerv1.Canary) fixture {
if c == nil {
c = newDeploymentTestCanary()
Expand Down
121 changes: 121 additions & 0 deletions pkg/controller/scheduler_deployment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,127 @@ func TestScheduler_DeploymentRollback(t *testing.T) {
assert.Equal(t, flaggerv1.CanaryPhaseFailed, c.Status.Phase)
}

// when the primary fails to become ready during promotion, the healthy canary
// must be kept instead of rolled back to the broken primary (#1898)
func TestScheduler_DeploymentPromotionPrimaryNotReady(t *testing.T) {
mocks := newDeploymentFixture(nil)

// initializing
mocks.ctrl.advanceCanary("podinfo", "default")
mocks.makePrimaryReady(t)

// initialized
mocks.ctrl.advanceCanary("podinfo", "default")

// update
dep2 := newDeploymentTestDeploymentV2()
_, err := mocks.kubeClient.AppsV1().Deployments("default").Update(context.TODO(), dep2, metav1.UpdateOptions{})
require.NoError(t, err)

// detect changes -> progressing, canary scaled up
mocks.ctrl.advanceCanary("podinfo", "default")
mocks.makeCanaryReady(t)
require.NoError(t, assertPhase(mocks.flaggerClient, "podinfo", flaggerv1.CanaryPhaseProgressing))

canaryDep, err := mocks.kubeClient.AppsV1().Deployments("default").Get(context.TODO(), "podinfo", metav1.GetOptions{})
require.NoError(t, err)
require.NotNil(t, canaryDep.Spec.Replicas)
canaryReplicas := *canaryDep.Spec.Replicas
require.Greater(t, canaryReplicas, int32(0))

// simulate: analysis succeeded, spec promoted to primary, now finishing promotion
cd, err := mocks.flaggerClient.FlaggerV1beta1().Canaries("default").Get(context.TODO(), "podinfo", metav1.GetOptions{})
require.NoError(t, err)
err = mocks.deployer.SetStatusPhase(cd, flaggerv1.CanaryPhasePromoting)
require.NoError(t, err)

// known routing state at promotion time (split between primary and canary)
require.NoError(t, mocks.router.SetRoutes(mocks.canary, 50, 50, false))

// the promoted primary fails to roll out
mocks.makePrimaryNotReady(t)

// advance: Flagger observes the primary is stuck
mocks.ctrl.advanceCanary("podinfo", "default")

// the canary must NOT be scaled to zero - it is the only healthy copy serving traffic
canaryDep, err = mocks.kubeClient.AppsV1().Deployments("default").Get(context.TODO(), "podinfo", metav1.GetOptions{})
require.NoError(t, err)
require.NotNil(t, canaryDep.Spec.Replicas)
assert.Equal(t, canaryReplicas, *canaryDep.Spec.Replicas, "canary must not be scaled to zero when promotion fails")

// traffic must be routed to the healthy canary, off the broken primary
primaryWeight, canaryWeight, _, err := mocks.router.GetRoutes(mocks.canary)
require.NoError(t, err)
assert.Equal(t, 0, primaryWeight, "no traffic must remain on the unhealthy primary")
assert.Equal(t, 100, canaryWeight, "all traffic must be routed to the healthy canary")

// the rollout is reported as failed so it stops advancing and alerts
c, err := mocks.flaggerClient.FlaggerV1beta1().Canaries("default").Get(context.TODO(), "podinfo", metav1.GetOptions{})
require.NoError(t, err)
assert.Equal(t, flaggerv1.CanaryPhaseFailed, c.Status.Phase)
assert.Equal(t, 100, c.Status.CanaryWeight, "reported canary weight must match the traffic on the canary")
}

// when the primary fails after promotion traffic has already been shifted to it
// (Finalising phase), traffic must be routed back to the healthy canary (#1898)
func TestScheduler_DeploymentPromotionFailedAfterTrafficShift(t *testing.T) {
mocks := newDeploymentFixture(nil)

// initializing
mocks.ctrl.advanceCanary("podinfo", "default")
mocks.makePrimaryReady(t)

// initialized
mocks.ctrl.advanceCanary("podinfo", "default")

// update
dep2 := newDeploymentTestDeploymentV2()
_, err := mocks.kubeClient.AppsV1().Deployments("default").Update(context.TODO(), dep2, metav1.UpdateOptions{})
require.NoError(t, err)

// detect changes -> progressing, canary scaled up
mocks.ctrl.advanceCanary("podinfo", "default")
mocks.makeCanaryReady(t)

canaryDep, err := mocks.kubeClient.AppsV1().Deployments("default").Get(context.TODO(), "podinfo", metav1.GetOptions{})
require.NoError(t, err)
require.NotNil(t, canaryDep.Spec.Replicas)
canaryReplicas := *canaryDep.Spec.Replicas
require.Greater(t, canaryReplicas, int32(0))

// simulate: promotion already shifted all traffic to the primary (Finalising)
cd, err := mocks.flaggerClient.FlaggerV1beta1().Canaries("default").Get(context.TODO(), "podinfo", metav1.GetOptions{})
require.NoError(t, err)
err = mocks.deployer.SetStatusPhase(cd, flaggerv1.CanaryPhaseFinalising)
require.NoError(t, err)
require.NoError(t, mocks.router.SetRoutes(mocks.canary, 100, 0, false))

// the promoted primary then fails to stay ready
mocks.makePrimaryNotReady(t)

// advance: Flagger observes the primary is stuck
mocks.ctrl.advanceCanary("podinfo", "default")

// traffic must be routed back to the healthy canary, off the broken primary
primaryWeight, canaryWeight, _, err := mocks.router.GetRoutes(mocks.canary)
require.NoError(t, err)
assert.Equal(t, 0, primaryWeight, "no traffic must remain on the unhealthy primary")
assert.Equal(t, 100, canaryWeight, "all traffic must be routed to the healthy canary")

// the canary must not be scaled to zero
canaryDep, err = mocks.kubeClient.AppsV1().Deployments("default").Get(context.TODO(), "podinfo", metav1.GetOptions{})
require.NoError(t, err)
require.NotNil(t, canaryDep.Spec.Replicas)
assert.Equal(t, canaryReplicas, *canaryDep.Spec.Replicas, "canary must not be scaled to zero when promotion fails")

// reported canary weight must match the routing (not zeroed)
c, err := mocks.flaggerClient.FlaggerV1beta1().Canaries("default").Get(context.TODO(), "podinfo", metav1.GetOptions{})
require.NoError(t, err)
assert.Equal(t, flaggerv1.CanaryPhaseFailed, c.Status.Phase)
assert.Equal(t, 100, c.Status.CanaryWeight, "reported canary weight must match the traffic on the canary")
}

func TestScheduler_DeploymentSkipAnalysis(t *testing.T) {
mocks := newDeploymentFixture(nil)
// initializing
Expand Down
Loading