Skip to content

Commit 3879619

Browse files
Merge pull request #30426 from hongkailiu/30269-revert-fix
TRT-2373: Revert "Revert "Merge pull request #30269 from hongkailiu/OTA-1626""
2 parents b46de17 + 51d8436 commit 3879619

File tree

6 files changed

+349
-18
lines changed

6 files changed

+349
-18
lines changed

pkg/defaultmonitortests/types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"github.com/openshift/origin/pkg/monitortests/authentication/requiredsccmonitortests"
99
admupgradestatus "github.com/openshift/origin/pkg/monitortests/cli/adm_upgrade/status"
1010
azuremetrics "github.com/openshift/origin/pkg/monitortests/cloud/azure/metrics"
11+
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/clusterversionchecker"
1112
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/legacycvomonitortests"
1213
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/operatorstateanalyzer"
1314
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/terminationmessagepolicy"
@@ -172,6 +173,7 @@ func newUniversalMonitorTests(info monitortestframework.MonitorTestInitializatio
172173
monitorTestRegistry.AddMonitorTestOrDie("termination-message-policy", "Cluster Version Operator", terminationmessagepolicy.NewAnalyzer())
173174
monitorTestRegistry.AddMonitorTestOrDie("operator-state-analyzer", "Cluster Version Operator", operatorstateanalyzer.NewAnalyzer())
174175
monitorTestRegistry.AddMonitorTestOrDie("required-scc-annotation-checker", "Cluster Version Operator", requiredsccmonitortests.NewAnalyzer())
176+
monitorTestRegistry.AddMonitorTestOrDie("cluster-version-checker", "Cluster Version Operator", clusterversionchecker.NewClusterVersionChecker())
175177

176178
monitorTestRegistry.AddMonitorTestOrDie("etcd-log-analyzer", "etcd", etcdloganalyzer.NewEtcdLogAnalyzer())
177179
monitorTestRegistry.AddMonitorTestOrDie("legacy-etcd-invariants", "etcd", legacyetcdmonitortests.NewLegacyTests())

pkg/monitor/monitorapi/identification_operator.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package monitorapi
22

33
import (
4+
"fmt"
5+
46
configv1 "github.com/openshift/api/config/v1"
57
)
68

@@ -25,3 +27,18 @@ func GetOperatorConditionStatus(interval Interval) *configv1.ClusterOperatorStat
2527
condition.Message = interval.Message.HumanMessage
2628
return condition
2729
}
30+
31+
// GetOperatorConditionHumanMessage constructs a human-readable message from a given ClusterOperatorStatusCondition with a given prefix
32+
func GetOperatorConditionHumanMessage(s *configv1.ClusterOperatorStatusCondition, prefix string) string {
33+
if s == nil {
34+
return ""
35+
}
36+
switch {
37+
case len(s.Reason) > 0 && len(s.Message) > 0:
38+
return fmt.Sprintf("%s%s=%s: %s: %s", prefix, s.Type, s.Status, s.Reason, s.Message)
39+
case len(s.Message) > 0:
40+
return fmt.Sprintf("%s%s=%s: %s", prefix, s.Type, s.Status, s.Message)
41+
default:
42+
return fmt.Sprintf("%s%s=%s", prefix, s.Type, s.Status)
43+
}
44+
}

pkg/monitortests/cli/adm_upgrade/status/monitortest.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -225,23 +225,21 @@ func (w *monitor) noFailures() *junitapi.JUnitTestCase {
225225
}
226226

227227
var failures []string
228-
var total int
229228
for _, snap := range w.ocAdmUpgradeStatus {
230-
total++
231229
if snap.err != nil {
232230
failures = append(failures, fmt.Sprintf("- %s: %v", snap.when.Format(time.RFC3339), snap.err))
233231
}
234232
}
235233

236-
if total == 0 {
234+
if len(w.ocAdmUpgradeStatus) == 0 {
237235
noFailures.SkipMessage = &junitapi.SkipMessage{
238236
Message: "Test skipped because no oc adm upgrade status output was collected",
239237
}
240238
return noFailures
241239
}
242240

243241
// Zero failures is too strict for at least SNO clusters
244-
p := (float32(len(failures)) / float32(total)) * 100
242+
p := (float32(len(failures)) / float32(len(w.ocAdmUpgradeStatus))) * 100
245243
if (!w.isSNO && p > 0) || (w.isSNO && p > 10) {
246244
noFailures.FailureOutput = &junitapi.FailureOutput{
247245
Message: fmt.Sprintf("oc adm upgrade status failed %d times (of %d)", len(failures), len(w.ocAdmUpgradeStatus)),
Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
package clusterversionchecker
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"os"
8+
"path/filepath"
9+
"regexp"
10+
"strings"
11+
"time"
12+
13+
configv1 "github.com/openshift/api/config/v1"
14+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
15+
"k8s.io/apimachinery/pkg/util/sets"
16+
"k8s.io/client-go/kubernetes"
17+
"k8s.io/client-go/rest"
18+
19+
"github.com/openshift/origin/pkg/monitor/monitorapi"
20+
"github.com/openshift/origin/pkg/monitortestframework"
21+
"github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
22+
"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
23+
exutil "github.com/openshift/origin/test/extended/util"
24+
)
25+
26+
// monitor implements a monitor test that watches ClusterVersion conditions
// during a run.
type monitor struct {
	// notSupportedReason is set by PrepareCollection when the cluster is not
	// supported (MicroShift, or more than 250 worker nodes); the later
	// lifecycle phases return it to short-circuit the test.
	notSupportedReason error
	// summary counts observed ClusterVersion condition transitions keyed by
	// "<type>-<status>-<reason>"; it is serialized by WriteContentToStorage.
	summary map[string]int
}
30+
31+
func NewClusterVersionChecker() monitortestframework.MonitorTest {
32+
return &monitor{summary: map[string]int{}}
33+
}
34+
35+
func (w *monitor) PrepareCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
36+
kubeClient, err := kubernetes.NewForConfig(adminRESTConfig)
37+
if err != nil {
38+
return err
39+
}
40+
isMicroShift, err := exutil.IsMicroShiftCluster(kubeClient)
41+
if err != nil {
42+
return fmt.Errorf("unable to determine if cluster is MicroShift: %v", err)
43+
}
44+
if isMicroShift {
45+
w.notSupportedReason = &monitortestframework.NotSupportedError{Reason: "platform MicroShift not supported"}
46+
return w.notSupportedReason
47+
}
48+
49+
nodes, err := kubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{LabelSelector: "node-role.kubernetes.io/worker"})
50+
if err != nil {
51+
return fmt.Errorf("unable to list nodes: %v", err)
52+
}
53+
54+
if s := len(nodes.Items); s > 250 {
55+
w.notSupportedReason = &monitortestframework.NotSupportedError{Reason: fmt.Sprintf("cluster with %d worker nodes (over 250) not supported", s)}
56+
return w.notSupportedReason
57+
}
58+
59+
return nil
60+
}
61+
62+
// StartCollection performs no live collection; it only propagates the
// not-supported reason determined in PrepareCollection (nil when supported).
func (w *monitor) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
	return w.notSupportedReason
}
65+
66+
// CollectData produces no intervals or tests of its own; it only propagates
// the not-supported reason (nil on supported clusters).
func (w *monitor) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
	return nil, nil, w.notSupportedReason
}
69+
70+
// ConstructComputedIntervals computes no additional intervals; it only
// propagates the not-supported reason (nil on supported clusters).
func (w *monitor) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
	return nil, w.notSupportedReason
}
73+
74+
// EvaluateTestsFromConstructedIntervals produces this monitor test's JUnit
// results from the final interval set, or nothing when the cluster is not
// supported.
func (w *monitor) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
	if w.notSupportedReason != nil {
		return nil, w.notSupportedReason
	}
	return w.noFailingUnknownCondition(finalIntervals), nil
}
80+
81+
func (w *monitor) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
82+
outputFile := filepath.Join(storageDir, fmt.Sprintf("cluster-version-checker%s.json", timeSuffix))
83+
data, err := json.Marshal(w.summary)
84+
if err != nil {
85+
return fmt.Errorf("unable to marshal summary: %w", err)
86+
}
87+
if err := os.WriteFile(outputFile, data, 0644); err != nil {
88+
return fmt.Errorf("unable to write summary to %q: %w", outputFile, err)
89+
}
90+
return nil
91+
}
92+
93+
// Cleanup has nothing to tear down for this monitor test.
func (*monitor) Cleanup(ctx context.Context) error {
	return nil
}
96+
97+
func (w *monitor) noFailingUnknownCondition(intervals monitorapi.Intervals) []*junitapi.JUnitTestCase {
98+
var start, stop time.Time
99+
for _, event := range intervals {
100+
if start.IsZero() || event.From.Before(start) {
101+
start = event.From
102+
}
103+
if stop.IsZero() || event.To.After(stop) {
104+
stop = event.To
105+
}
106+
}
107+
duration := stop.Sub(start).Seconds()
108+
109+
noFailures := &junitapi.JUnitTestCase{
110+
Name: "[bz-Cluster Version Operator] cluster version should not report Failing=Unknown during a normal cluster upgrade",
111+
Duration: duration,
112+
}
113+
114+
var failures []string
115+
violations := sets.New[string]()
116+
117+
for _, interval := range intervals {
118+
if interval.Locator.Type != monitorapi.LocatorTypeClusterVersion {
119+
continue
120+
}
121+
cvName, ok := interval.Locator.Keys[monitorapi.LocatorClusterVersionKey]
122+
if !ok || cvName != "version" {
123+
continue
124+
}
125+
126+
c := monitorapi.GetOperatorConditionStatus(interval)
127+
if c == nil {
128+
continue
129+
}
130+
key := fmt.Sprintf("%s-%s-%s", string(c.Type), string(c.Status), c.Reason)
131+
if _, ok := w.summary[key]; ok {
132+
w.summary[key]++
133+
} else {
134+
w.summary[key] = 1
135+
}
136+
// https://github.com/openshift/cluster-version-operator/blob/28a376a13ad1daec926f6174ac37ada2bd32c071/pkg/cvo/status.go#L332-L333
137+
if c.Type == "Failing" && c.Status == configv1.ConditionUnknown && c.Reason == "SlowClusterOperator" {
138+
// This is too hacky, but we do not have API to expose the CO names that took long to upgrade
139+
coNames, err := parseClusterOperatorNames(c.Message)
140+
if err != nil {
141+
failures = append(failures, fmt.Sprintf("failed to parse cluster operator names from message %q: %v", c.Message, err))
142+
continue
143+
}
144+
violations = violations.Union(coNames)
145+
}
146+
}
147+
148+
if len(failures) > 0 {
149+
noFailures.FailureOutput = &junitapi.FailureOutput{
150+
Message: fmt.Sprintf("Checking cluster version failed %d times (of %d intervals) from %s to %s", len(failures), len(intervals), start.Format(time.RFC3339), stop.Format(time.RFC3339)),
151+
Output: strings.Join(failures, "\n"),
152+
}
153+
} else {
154+
noFailures.SystemOut = fmt.Sprintf("Checking cluster version succussfully checked %d intervals from %s to %s", len(intervals), start.Format(time.RFC3339), stop.Format(time.RFC3339))
155+
}
156+
157+
ret := []*junitapi.JUnitTestCase{noFailures}
158+
159+
for _, coName := range platformidentification.KnownOperators.List() {
160+
bzComponent := platformidentification.GetBugzillaComponentForOperator(coName)
161+
if bzComponent == "Unknown" {
162+
bzComponent = coName
163+
}
164+
name := fmt.Sprintf("[bz-%v] clusteroperator/%v must complete version change within limited time", bzComponent, coName)
165+
m := 30
166+
if coName == "machine-config" {
167+
m = 3 * m
168+
}
169+
if !violations.Has(coName) {
170+
ret = append(ret, &junitapi.JUnitTestCase{
171+
Name: name,
172+
})
173+
continue
174+
}
175+
output := fmt.Sprintf("Cluster Operator %s has not completed version change after %d minutes", coName, m)
176+
ret = append(ret, &junitapi.JUnitTestCase{
177+
Name: name,
178+
Duration: duration,
179+
FailureOutput: &junitapi.FailureOutput{
180+
Output: output,
181+
Message: output,
182+
},
183+
})
184+
}
185+
186+
return ret
187+
}
188+
189+
var (
	// we have to modify the keyword here accordingly if CVO changes the message
	// regWaitingLong recognizes the overall "longer than expected" shape of a
	// SlowClusterOperator message.
	regWaitingLong = regexp.MustCompile(`waiting on.*which is longer than expected`)
	// regWaitingMCOOver90m detects that machine-config exceeded 90 minutes.
	regWaitingMCOOver90m = regexp.MustCompile(`machine-config over 90 minutes`)
	// regWaitingCOOver30m captures the comma-separated operator names that
	// exceeded 30 minutes.
	regWaitingCOOver30m = regexp.MustCompile(`.*waiting on (.+) over 30 minutes`)
)
195+
196+
func parseClusterOperatorNames(message string) (sets.Set[string], error) {
197+
if !regWaitingLong.MatchString(message) {
198+
return nil, fmt.Errorf("failed to parse cluster operator names from %q", message)
199+
}
200+
ret := sets.Set[string]{}
201+
if regWaitingMCOOver90m.MatchString(message) {
202+
ret.Insert("machine-config")
203+
}
204+
matches := regWaitingCOOver30m.FindStringSubmatch(message)
205+
if len(matches) > 1 {
206+
coNames := strings.Split(matches[1], ",")
207+
for _, coName := range coNames {
208+
ret.Insert(strings.TrimSpace(coName))
209+
}
210+
}
211+
if len(ret) == 0 {
212+
return nil, fmt.Errorf("failed to parse cluster operator names from %q", message)
213+
}
214+
return ret, nil
215+
}
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
package clusterversionchecker
2+
3+
import (
4+
"fmt"
5+
"testing"
6+
7+
"github.com/google/go-cmp/cmp"
8+
configv1 "github.com/openshift/api/config/v1"
9+
"k8s.io/apimachinery/pkg/util/sets"
10+
11+
"github.com/openshift/origin/pkg/monitor/monitorapi"
12+
)
13+
14+
func Test_parseClusterOperatorNames(t *testing.T) {
15+
t.Parallel()
16+
17+
testCases := []struct {
18+
name string
19+
reason string
20+
message string
21+
expected sets.Set[string]
22+
expectedErr error
23+
}{
24+
{
25+
name: "unexpected",
26+
reason: "reason",
27+
message: "unexpected",
28+
expectedErr: fmt.Errorf("failed to parse cluster operator names from %q", "changed to Some=Unknown: reason: unexpected"),
29+
},
30+
{
31+
name: "legit waiting on",
32+
message: "Working towards 1.2.3: waiting on co-not-timeout",
33+
expectedErr: fmt.Errorf("failed to parse cluster operator names from %q", "changed to Some=Unknown: Working towards 1.2.3: waiting on co-not-timeout"),
34+
},
35+
{
36+
name: "one CO timeout",
37+
reason: "SlowClusterOperator",
38+
message: "waiting on co-timeout over 30 minutes which is longer than expected",
39+
expected: sets.New[string]("co-timeout"),
40+
},
41+
{
42+
name: "mco timeout",
43+
reason: "SlowClusterOperator",
44+
message: "waiting on machine-config over 90 minutes which is longer than expected",
45+
expected: sets.New[string]("machine-config"),
46+
},
47+
{
48+
name: "mco timeout",
49+
reason: "SlowClusterOperator",
50+
message: "waiting on machine-config over 90 minutes which is longer than expected",
51+
expected: sets.New[string]("machine-config"),
52+
},
53+
{
54+
name: "two COs timeout",
55+
reason: "SlowClusterOperator",
56+
message: "waiting on co-timeout, co-bar-timeout over 30 minutes which is longer than expected",
57+
expected: sets.New[string]("co-timeout", "co-bar-timeout"),
58+
},
59+
{
60+
name: "one CO and mco timeout",
61+
reason: "SlowClusterOperator",
62+
message: "waiting on co-timeout over 30 minutes and machine-config over 90 minutes which is longer than expected",
63+
expected: sets.New[string]("machine-config", "co-timeout"),
64+
},
65+
{
66+
name: "three COs timeout",
67+
reason: "SlowClusterOperator",
68+
message: "waiting on co-timeout, co-bar-timeout over 30 minutes and machine-config over 90 minutes which is longer than expected",
69+
expected: sets.New[string]("machine-config", "co-timeout", "co-bar-timeout"),
70+
},
71+
}
72+
73+
for _, tc := range testCases {
74+
t.Run(tc.name, func(t *testing.T) {
75+
t.Parallel()
76+
msg := monitorapi.GetOperatorConditionHumanMessage(&configv1.ClusterOperatorStatusCondition{
77+
Type: "Some",
78+
Status: configv1.ConditionUnknown,
79+
Message: tc.message,
80+
Reason: tc.reason,
81+
}, "changed to ")
82+
actual, actuallErr := parseClusterOperatorNames(msg)
83+
if diff := cmp.Diff(tc.expectedErr, actuallErr, cmp.FilterValues(func(x, y interface{}) bool {
84+
_, ok1 := x.(error)
85+
_, ok2 := y.(error)
86+
return ok1 && ok2
87+
}, cmp.Comparer(func(x, y interface{}) bool {
88+
xe := x.(error)
89+
ye := y.(error)
90+
if xe == nil || ye == nil {
91+
return xe == nil && ye == nil
92+
}
93+
return xe.Error() == ye.Error()
94+
}))); diff != "" {
95+
t.Errorf("unexpected error (-want +got):\n%s", diff)
96+
}
97+
98+
if actuallErr == nil {
99+
if diff := cmp.Diff(tc.expected, actual); diff != "" {
100+
t.Errorf("unexpected result (-want +got):\n%s", diff)
101+
}
102+
}
103+
})
104+
}
105+
}

0 commit comments

Comments
 (0)