
Commit 9abfc28

[release-1.18] Fix min-scale transition (#16100)
* adjust revisionfailure test image to allow changing the state of the pod
* include test to ensure the min-scale transition works as expected: a previous revision should scale down only when the next revision is ready
* fix linter warnings
* fix revision conditions when initial scale isn't achieved
* give more time for the pods to be running
* log pod states when it fails
* ensure the new revision becomes ready and holds scale while the old revision is scaled down

---------

Co-authored-by: Dave Protasowski <[email protected]>
1 parent 4853ead commit 9abfc28

10 files changed, +476 −103 lines

pkg/apis/serving/v1/revision_lifecycle.go

Lines changed: 4 additions & 2 deletions
@@ -204,8 +204,10 @@ func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodA
 
 	// Mark resource unavailable if we don't have a Service Name and the deployment is ready
 	// This can happen when we have initial scale set to 0
-	if rs.GetCondition(RevisionConditionResourcesAvailable).IsTrue() && ps.ServiceName == "" {
-		rs.MarkResourcesAvailableUnknown(ReasonDeploying, "")
+	if rs.GetCondition(RevisionConditionResourcesAvailable).IsTrue() {
+		if ps.ServiceName == "" || !ps.IsScaleTargetInitialized() {
+			rs.MarkResourcesAvailableUnknown(ReasonDeploying, "")
+		}
 	}
 
 	switch cond.Status {
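
In effect, a Revision whose Deployment already reported ResourcesAvailable can now be flipped back to deploying while its PodAutoscaler has not yet achieved the initial scale target, which is what keeps traffic from shifting away from the previous revision too early. A minimal sketch of the new guard as a standalone predicate (helper name hypothetical; the commit inlines the check above):

	// stillDeploying reports whether a PodAutoscaler whose resources exist
	// should nevertheless keep the revision in a deploying state. Hypothetical
	// helper for illustration only; the diff above inlines this logic.
	func stillDeploying(ps *autoscalingv1alpha1.PodAutoscalerStatus) bool {
		// Either the private service doesn't exist yet (initial scale 0),
		// or the initial scale target hasn't been reached.
		return ps.ServiceName == "" || !ps.IsScaleTargetInitialized()
	}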

pkg/apis/serving/v1/revision_lifecycle_test.go

Lines changed: 38 additions & 0 deletions
@@ -584,6 +584,44 @@ func TestPropagateAutoscalerStatus(t *testing.T) {
 	apistest.CheckConditionSucceeded(r, RevisionConditionResourcesAvailable, t)
 }
 
+func TestPropagateAutoscalerStatus_ScaleTargetNotInitialized(t *testing.T) {
+	r := &RevisionStatus{}
+	r.InitializeConditions()
+	apistest.CheckConditionOngoing(r, RevisionConditionReady, t)
+
+	// PodAutoscaler has no active condition, so we are just coming up.
+	r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
+		Status: duckv1.Status{},
+	})
+
+	apistest.CheckConditionOngoing(r, RevisionConditionActive, t)
+	apistest.CheckConditionOngoing(r, RevisionConditionResourcesAvailable, t)
+
+	// The deployment is created, so we mark resources available.
+	r.MarkResourcesAvailableTrue()
+
+	// PodAutoscaler resources have been created but the initial scale target
+	// has not been achieved.
+	r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
+		ServiceName: "some-service",
+		Status: duckv1.Status{
+			Conditions: duckv1.Conditions{{
+				Type:   autoscalingv1alpha1.PodAutoscalerConditionReady,
+				Status: corev1.ConditionUnknown,
+			}, {
+				Type:   autoscalingv1alpha1.PodAutoscalerConditionScaleTargetInitialized,
+				Status: corev1.ConditionUnknown,
+			}, {
+				Type:   autoscalingv1alpha1.PodAutoscalerConditionSKSReady,
+				Status: corev1.ConditionTrue,
+			}},
+		},
+	})
+
+	// ResourcesAvailable should be reverted back to ongoing.
+	apistest.CheckConditionOngoing(r, RevisionConditionResourcesAvailable, t)
+}
+
 func TestPropagateAutoscalerStatus_NoOverridingResourcesAvailable(t *testing.T) {
 	// Cases to test Ready condition
 	// we fix ScaleTargetInitialized to True

pkg/reconciler/revision/table_test.go

Lines changed: 1 addition & 0 deletions
@@ -283,6 +283,7 @@ func TestReconcile(t *testing.T) {
 			WithRoutingState(v1.RoutingStateActive, fc),
 			MarkRevisionReady, WithRevisionObservedGeneration(1)),
 		pa("foo", "pa-not-ready",
+			WithScaleTargetInitialized,
 			WithPAStatusService("its-not-confidential"),
 			WithBufferedTraffic,
 			WithReachabilityReachable),

pkg/testing/v1/configuration.go

Lines changed: 18 additions & 0 deletions
@@ -138,3 +138,21 @@ func WithConfigRevisionIdleTimeoutSeconds(revisionIdleTimeoutSeconds int64) Conf
 		cfg.Spec.Template.Spec.IdleTimeoutSeconds = ptr.Int64(revisionIdleTimeoutSeconds)
 	}
 }
+
+// WithConfigVolume adds a volume and matching volume mount to the configuration's first container.
+func WithConfigVolume(name, mountPath string, volumeSource corev1.VolumeSource) ConfigOption {
+	return func(c *v1.Configuration) {
+		rt := &c.Spec.Template.Spec
+
+		rt.Containers[0].VolumeMounts = append(rt.Containers[0].VolumeMounts,
+			corev1.VolumeMount{
+				Name:      name,
+				MountPath: mountPath,
+			})
+
+		rt.Volumes = append(rt.Volumes, corev1.Volume{
+			Name:         name,
+			VolumeSource: volumeSource,
+		})
+	}
+}
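
For reference, the min-scale transition test below uses this new option to mount its state ConfigMap into the user container (arguments as they appear in that test):

	rtesting.WithConfigVolume("state", "/etc/config", corev1.VolumeSource{
		ConfigMap: &corev1.ConfigMapVolumeSource{
			LocalObjectReference: corev1.LocalObjectReference{Name: cm.Name},
		},
	})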

test/conformance.go

Lines changed: 1 addition & 0 deletions
@@ -53,6 +53,7 @@ const (
 	Timeout = "timeout"
 	Volumes = "volumes"
 	SlowStart = "slowstart"
+	ConfigMimic = "configmimic"
 
 	// Constants for test image output.
 	PizzaPlanetText1 = "What a spaceport!"
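
ConfigMimic names the test image used by the new e2e test below. Judging from that test, the image derives its startup/readiness/liveness probe results from a mounted ConfigMap whose keys are pod names (with the configuration or revision name acting as a default) and whose values are comma-separated lists drawn from startup,ready,live. The test seeds the map roughly like this, then patches individual pod keys to flip readiness:

	Data: map[string]string{
		// Pods of the first revision pass all probes by default.
		names.Config: "startup,ready,live",
		// Pods of the second revision only pass their startup probe,
		// so they stay unready until the test marks them ready.
		secondRevision: "startup",
	},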

test/e2e/minscale_readiness_test.go

Lines changed: 215 additions & 2 deletions
@@ -30,15 +30,222 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/selection"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	"k8s.io/apimachinery/pkg/util/wait"
+	"knative.dev/pkg/kmeta"
 	"knative.dev/serving/pkg/apis/autoscaling"
+	"knative.dev/serving/pkg/apis/serving"
 	v1 "knative.dev/serving/pkg/apis/serving/v1"
 	"knative.dev/serving/pkg/resources"
 	rtesting "knative.dev/serving/pkg/testing/v1"
 	"knative.dev/serving/test"
 	v1test "knative.dev/serving/test/v1"
 )
 
+func TestMinScaleTransition(t *testing.T) {
+	t.Parallel()
+	ctx := t.Context()
+
+	const (
+		minScale      = 4
+		scalingWindow = autoscaling.WindowMin
+	)
+
+	clients := Setup(t)
+	cmClient := clients.KubeClient.CoreV1().ConfigMaps(test.ServingFlags.TestNamespace)
+
+	names := test.ResourceNames{
+		// Config and Route have different names to avoid false positives.
+		Config: test.ObjectNameForTest(t),
+		Route:  test.ObjectNameForTest(t),
+		Image:  test.ConfigMimic,
+	}
+
+	test.EnsureTearDown(t, clients, &names)
+
+	firstRevision := kmeta.ChildName(names.Config, fmt.Sprintf("-%05d", 1))
+	secondRevision := kmeta.ChildName(names.Config, fmt.Sprintf("-%05d", 2))
+
+	cm := &corev1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      names.Config,
+			Namespace: test.ServingFlags.TestNamespace,
+		},
+		Data: map[string]string{
+			// By default the pods of the configuration are ready.
+			names.Config: "startup,ready,live",
+
+			// By default the second revision doesn't become ready.
+			secondRevision: "startup",
+		},
+	}
+	cm, err := cmClient.Create(t.Context(), cm, metav1.CreateOptions{})
+	if err != nil {
+		t.Fatal("Failed to create ConfigMap:", err)
+	}
+
+	test.EnsureCleanup(t, func() {
+		cmClient.Delete(context.Background(), cm.Name, metav1.DeleteOptions{})
+	})
+
+	t.Log("Creating route")
+	if _, err := v1test.CreateRoute(t, clients, names); err != nil {
+		t.Fatal("Failed to create Route:", err)
+	}
+
+	t.Log("Creating configuration")
+	cfg, err := v1test.CreateConfiguration(t, clients, names,
+		withMinScale(minScale),
+		// Make sure we scale down quickly after panic, before the autoscaler gets killed by chaosduck.
+		withWindow(scalingWindow),
+		rtesting.WithConfigReadinessProbe(&corev1.Probe{
+			ProbeHandler: corev1.ProbeHandler{
+				HTTPGet: &corev1.HTTPGetAction{
+					Path: "/healthz/ready",
+					Port: intstr.FromInt(8080),
+				},
+			},
+		}),
+		rtesting.WithConfigVolume("state", "/etc/config", corev1.VolumeSource{
+			ConfigMap: &corev1.ConfigMapVolumeSource{
+				LocalObjectReference: corev1.LocalObjectReference{
+					Name: cm.Name,
+				},
+			},
+		}),
+	)
+	if err != nil {
+		t.Fatal("Failed to create Configuration:", err)
+	}
+
+	// This waits for the revision to be created.
+	firstRevisionService := PrivateServiceName(t, clients, firstRevision)
+
+	t.Log("Waiting for first revision to become ready")
+	err = v1test.WaitForRevisionState(clients.ServingClient, firstRevision, v1test.IsRevisionReady, "RevisionIsReady")
+	if err != nil {
+		t.Fatalf("The Revision %q did not become ready: %v", firstRevision, err)
+	}
+
+	t.Log("Holding revision at minScale after becoming ready")
+	if lr, ok := ensureDesiredScale(clients, t, firstRevisionService, gte(minScale)); !ok {
+		t.Fatalf("The revision %q observed scale %d < %d after becoming ready", firstRevision, lr, minScale)
+	}
+
+	t.Log("Updating configuration")
+	if _, err := v1test.PatchConfig(t, clients, cfg, rtesting.WithConfigEnv(corev1.EnvVar{Name: "FOO", Value: "BAR"})); err != nil {
+		t.Fatal("Failed to update Configuration:", err)
+	}
+
+	t.Logf("Waiting for %v pods to be created", minScale)
+	var podList *corev1.PodList
+
+	err = wait.PollUntilContextTimeout(ctx, time.Second, time.Minute, true, func(context.Context) (bool, error) {
+		revLabel, err := labels.NewRequirement(serving.RevisionLabelKey, selection.Equals, []string{secondRevision})
+		if err != nil {
+			return false, fmt.Errorf("unable to create rev label: %w", err)
+		}
+
+		pods := clients.KubeClient.CoreV1().Pods(test.ServingFlags.TestNamespace)
+		podList, err = pods.List(ctx, metav1.ListOptions{
+			LabelSelector: revLabel.String(),
+		})
+		if err != nil {
+			return false, err
+		}
+
+		runningPods := 0
+		for _, pod := range podList.Items {
+			if pod.Status.Phase == corev1.PodRunning {
+				runningPods++
+			}
+		}
+
+		return runningPods == minScale, nil
+	})
+
+	if errors.Is(err, context.DeadlineExceeded) {
+		for _, pod := range podList.Items {
+			t.Logf("pod %s is in phase %s", pod.Name, pod.Status.Phase)
+		}
+		t.Fatal("Timed out waiting for pods to be running:", err)
+	} else if err != nil {
+		t.Fatal("Failed waiting for pods to be running:", err)
+	}
+
+	secondRevisionService := PrivateServiceName(t, clients, secondRevision)
+
+	// Mark each of the new pods ready, one at a time; after each pod
+	// (except the last) verify the previous revision still holds its scale.
+	for i, pod := range podList.Items {
+		podName := pod.Name
+
+		t.Logf("Marking revision %s pod %s as ready", secondRevision, podName)
+		markPodAsReadyAndWait(t, clients, cm.Name, podName)
+		t.Logf("Revision %s pod %s is ready", secondRevision, podName)
+
+		t.Logf("Waiting for 2x scaling window %v to pass", 2*scalingWindow)
+
+		// Wait two autoscaling window durations;
+		// scaling decisions are made at the end of the window.
+		time.Sleep(2 * scalingWindow)
+
+		if i >= len(podList.Items)-1 {
+			// After marking the last pod ready, skip checking that the
+			// previous revision hasn't scaled down.
+			break
+		}
+
+		t.Log("Check original revision holding at min scale", minScale)
+		if _, ok := ensureDesiredScale(clients, t, firstRevisionService, gte(minScale)); !ok {
+			t.Fatalf("Revision %q was scaled down prematurely", firstRevision)
+		}
+	}
+
+	t.Log("Check new revision holding at min scale", minScale)
+	if _, ok := ensureDesiredScale(clients, t, secondRevisionService, gte(minScale)); !ok {
+		t.Fatalf("Revision %q was scaled down prematurely", secondRevision)
+	}
+
+	t.Log("Check old revision is scaled down")
+	if _, err := waitForDesiredScale(clients, firstRevisionService, eq(0)); err != nil {
+		t.Fatalf("Revision %q was not scaled down: %s", firstRevision, err)
+	}
+}
+
+func markPodAsReadyAndWait(t *testing.T, clients *test.Clients, cmName, podName string) {
+	ctx := t.Context()
+	coreClient := clients.KubeClient.CoreV1()
+	cmClient := coreClient.ConfigMaps(test.ServingFlags.TestNamespace)
+
+	patch := fmt.Sprintf(`[{"op":"add", "path":"/data/%s", "value": "startup,ready"}]`, podName)
+
+	_, err := cmClient.Patch(ctx, cmName, types.JSONPatchType, []byte(patch), metav1.PatchOptions{})
+	if err != nil {
+		t.Fatal("Failed to patch ConfigMap:", err)
+	}
+
+	if err = wait.PollUntilContextTimeout(ctx, 10*time.Second, 2*time.Minute, true, func(context.Context) (bool, error) {
+		pod, err := clients.KubeClient.CoreV1().Pods(test.ServingFlags.TestNamespace).Get(context.Background(), podName, metav1.GetOptions{})
+		if err != nil {
+			return false, err
+		}
+
+		for _, cond := range pod.Status.Conditions {
+			if cond.Type == corev1.PodReady {
+				return cond.Status == corev1.ConditionTrue, nil
+			}
+		}
+
+		return false, nil
+	}); err != nil {
+		t.Fatalf("Pod %s didn't become ready: %s", podName, err)
+	}
+}
+
 func TestMinScale(t *testing.T) {
 	t.Parallel()
 
@@ -171,6 +378,12 @@ func lt(m int) func(int) bool {
 	}
 }
 
+func eq(m int) func(int) bool {
+	return func(n int) bool {
+		return n == m
+	}
+}
+
 func withMinScale(minScale int) func(cfg *v1.Configuration) {
 	return func(cfg *v1.Configuration) {
 		if cfg.Spec.Template.Annotations == nil {
@@ -214,7 +427,7 @@ func waitForDesiredScale(clients *test.Clients, serviceName string, cond func(in
 	endpoints := clients.KubeClient.CoreV1().Endpoints(test.ServingFlags.TestNamespace)
 
 	// See https://github.com/knative/serving/issues/7727#issuecomment-706772507 for context.
-	return latestReady, wait.PollUntilContextTimeout(context.Background(), 250*time.Millisecond, 3*time.Minute, true, func(context.Context) (bool, error) {
+	return latestReady, wait.PollUntilContextTimeout(context.Background(), time.Second, 3*time.Minute, true, func(context.Context) (bool, error) {
 		endpoint, err := endpoints.Get(context.Background(), serviceName, metav1.GetOptions{})
 		if err != nil {
 			return false, nil //nolint:nilerr
@@ -227,7 +440,7 @@ func ensureDesiredScale(clients *test.Clients, t *testing.T, serviceName string, cond func(in
 func ensureDesiredScale(clients *test.Clients, t *testing.T, serviceName string, cond func(int) bool) (latestReady int, observed bool) {
 	endpoints := clients.KubeClient.CoreV1().Endpoints(test.ServingFlags.TestNamespace)
 
-	err := wait.PollUntilContextTimeout(context.Background(), 250*time.Millisecond, 10*time.Second, true, func(context.Context) (bool, error) {
+	err := wait.PollUntilContextTimeout(context.Background(), time.Second, 30*time.Second, true, func(context.Context) (bool, error) {
 		endpoint, err := endpoints.Get(context.Background(), serviceName, metav1.GetOptions{})
 		if err != nil {
 			return false, nil //nolint:nilerr
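
Together the tuned helpers encode the test's two-phase assertion: ensureDesiredScale polls for a fixed window (now 30s at 1s intervals) to check that a scale predicate holds continuously, while waitForDesiredScale polls for up to 3 minutes until a predicate eventually holds. The transition test composes them like this (condensed from the test above):

	// The old revision must hold at least minScale while the new one rolls out...
	if _, ok := ensureDesiredScale(clients, t, firstRevisionService, gte(minScale)); !ok {
		t.Fatalf("Revision %q was scaled down prematurely", firstRevision)
	}
	// ...and must eventually drain to zero once the new revision is ready.
	if _, err := waitForDesiredScale(clients, firstRevisionService, eq(0)); err != nil {
		t.Fatalf("Revision %q was not scaled down: %s", firstRevision, err)
	}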
