Skip to content

Commit

Permalink
Manage TuneD profiles with the same name and different content
Browse files Browse the repository at this point in the history
It is possible to create Tuned CRs with TuneD profiles of the same name.
This is problematic when the duplicate TuneD profiles have different
contents.  This can cause periodic switches between the duplicate TuneD
profiles and will lead to TuneD restarts.

This change makes the operator control loop stop to avoid switching
between the conflicting TuneD profiles of the same name and warns about
this misconfiguration via:
  * Tuned CR status and prometheus metric (nto_invalid_tuned_exist_info)
  * NTOInvalidTunedExist alert
  * ERROR message issued in the operator logs

Other changes:
  * Switch the ProfileStatusCondition condition type to
    StatusCondition.  This is cosmetic only, all fields are preserved.
  * Obsolete pod label-based matching.  Trigger an alert
    (NTOPodLabelsUsed) for this.

Fixes: OCPBUGS-44559.
  • Loading branch information
jmencak committed Dec 2, 2024
1 parent c786628 commit a6c6575
Show file tree
Hide file tree
Showing 20 changed files with 335 additions and 66 deletions.
2 changes: 1 addition & 1 deletion manifests/20-profile.crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ spec:
description: conditions represents the state of the per-node Profile application
type: array
items:
description: ProfileStatusCondition represents a partial state of the per-node Profile application.
description: StatusCondition represents a partial state of the per-node Profile application.
type: object
required:
- lastTransitionTime
Expand Down
44 changes: 43 additions & 1 deletion manifests/20-tuned.crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,14 @@ spec:
preserveUnknownFields: false
scope: Namespaced
versions:
- name: v1
- additionalPrinterColumns:
- jsonPath: .status.conditions[?(@.type=="Valid")].status
name: Valid
type: string
- jsonPath: .metadata.creationTimestamp
name: Age
type: date
name: v1
schema:
openAPIV3Schema:
description: |-
Expand Down Expand Up @@ -161,7 +168,42 @@ spec:
type: object
status:
description: TunedStatus is the status for a Tuned resource.
properties:
conditions:
description: conditions represents the state of the Tuned profile
items:
description: StatusCondition represents a partial state of the per-node
Profile application.
properties:
lastTransitionTime:
description: lastTransitionTime is the time of the last update
to the current status property.
format: date-time
type: string
message:
description: |-
message provides additional information about the current condition.
This is only to be consumed by humans.
type: string
reason:
description: reason is the CamelCase reason for the condition's
current status.
type: string
status:
description: status of the condition, one of True, False, Unknown.
type: string
type:
description: type specifies the aspect reported by this condition.
type: string
required:
- lastTransitionTime
- status
- type
type: object
type: array
type: object
type: object
served: true
storage: true
subresources:
status: {}
19 changes: 19 additions & 0 deletions manifests/30-monitoring.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,25 @@ spec:
for: 30m
labels:
severity: warning
- alert: NTOPodLabelsUsed
annotations:
description: >-
The Node Tuning Operator is using deprecated functionality.
Using pod label matching has been discouraged since OCP 4.4 and this functionality will be removed in future versions.
Please revise and adjust your configuration (Tuned custom resources).
summary: The Node Tuning Operator is using deprecated functionality.
expr: nto_pod_labels_used_info == 1
for: 30m
labels:
severity: warning
- alert: NTOInvalidTunedExist
annotations:
description: Invalid custom Tuned resource exists. View your custom Tuned resources and operator logs for further details.
summary: Invalid custom Tuned resource exists.
expr: nto_invalid_tuned_exist_info == 1
for: 30m
labels:
severity: warning
- alert: NTODegraded
annotations:
description: The Node Tuning Operator is degraded. Review the "node-tuning" ClusterOperator object for further details.
Expand Down
2 changes: 1 addition & 1 deletion manifests/40-rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ rules:
resources: ["tuneds"]
verbs: ["create","get","delete","list","update","watch","patch"]
- apiGroups: ["tuned.openshift.io"]
resources: ["tuneds/finalizers"]
resources: ["tuneds/finalizers","tuneds/status"]
verbs: ["update"]
- apiGroups: ["tuned.openshift.io"]
resources: ["profiles"]
Expand Down
27 changes: 19 additions & 8 deletions pkg/apis/tuned/v1/tuned_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ const (
/////////////////////////////////////////////////////////////////////////////////
// +genclient
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// +kubebuilder:subresource:status

// Tuned is a collection of rules that allows cluster-wide deployment
// of node-level sysctls and more flexibility to add custom tuning
Expand Down Expand Up @@ -134,8 +135,18 @@ type TuneDConfig struct {

// TunedStatus is the status for a Tuned resource.
type TunedStatus struct {
// conditions represents the state of the Tuned profile
// +patchMergeKey=type
// +patchStrategy=merge
// +optional
Conditions []StatusCondition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`
}

const (
// Tuned CR was validated and no problems with it were found.
TunedValid ConditionType = "Valid"
)

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

// TunedList is a list of Tuned resources.
Expand Down Expand Up @@ -192,16 +203,16 @@ type ProfileStatus struct {
// +patchMergeKey=type
// +patchStrategy=merge
// +optional
Conditions []ProfileStatusCondition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`
Conditions []StatusCondition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`
}

// ProfileStatusCondition represents a partial state of the per-node Profile application.
// StatusCondition represents a partial state of the per-node Profile application.
// +k8s:deepcopy-gen=true
type ProfileStatusCondition struct {
type StatusCondition struct {
// type specifies the aspect reported by this condition.
// +kubebuilder:validation:Required
// +required
Type ProfileConditionType `json:"type"`
Type ConditionType `json:"type"`

// status of the condition, one of True, False, Unknown.
// +kubebuilder:validation:Required
Expand All @@ -223,18 +234,18 @@ type ProfileStatusCondition struct {
Message string `json:"message,omitempty"`
}

// ProfileConditionType is an aspect of Tuned daemon profile application state.
type ProfileConditionType string
// ConditionType is an aspect of Tuned daemon profile application state.
type ConditionType string

const (
// ProfileApplied indicates that the Tuned daemon has successfully applied
// the selected profile.
TunedProfileApplied ProfileConditionType = "Applied"
TunedProfileApplied ConditionType = "Applied"

// TunedDegraded indicates the Tuned daemon issued errors during profile
// application. To conclude the profile application was successful,
// both TunedProfileApplied and TunedDegraded need to be queried.
TunedDegraded ProfileConditionType = "Degraded"
TunedDegraded ConditionType = "Degraded"
)

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
Expand Down
19 changes: 13 additions & 6 deletions pkg/apis/tuned/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ const (
profileCalculatedQuery = "nto_profile_calculated_total"
buildInfoQuery = "nto_build_info"
degradedInfoQuery = "nto_degraded_info"
invalidTunedExistQuery = "nto_invalid_tuned_exist_info"

// MetricsPort is the IP port supplied to the HTTP server used for Prometheus,
// and matches what is specified in the corresponding Service and ServiceMonitor.
Expand Down Expand Up @@ -42,6 +43,12 @@ var (
Help: "Indicates whether the Node Tuning Operator is degraded.",
},
)
invalidTunedExist = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: invalidTunedExistQuery,
Help: "Does any invalid/misconfigured custom Tuned resource exist (1) or not (0)?",
},
)
)

func init() {
Expand All @@ -50,6 +57,7 @@ func init() {
profileCalculated,
buildInfo,
degradedState,
invalidTunedExist,
)
}

Expand Down Expand Up @@ -83,3 +91,13 @@ func Degraded(deg bool) {
}
degradedState.Set(0)
}

// InvalidTunedExist indicates whether any invalid/misconfigured custom Tuned resource
// exist or not.
func InvalidTunedExist(enable bool) {
if enable {
invalidTunedExist.Set(1)
return
}
invalidTunedExist.Set(0)
}
22 changes: 16 additions & 6 deletions pkg/operator/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,11 @@ func (c *Controller) sync(key wqKey) error {
// Tuned CR changed, this can affect all profiles, list them and trigger profile updates
klog.V(2).Infof("sync(): Tuned %s", key.name)

err = c.validateTunedCRs()
if err != nil {
return err
}

err = c.enqueueProfileUpdates()
if err != nil {
return err
Expand Down Expand Up @@ -500,6 +505,7 @@ func (c *Controller) syncTunedDefault() (*tunedv1.Tuned, error) {
if err != nil {
return cr, fmt.Errorf("failed to update Tuned %s: %v", crMf.Name, err)
}

return cr, nil
}

Expand Down Expand Up @@ -613,14 +619,18 @@ func (c *Controller) syncProfile(tuned *tunedv1.Tuned, nodeName string) error {
var computed ComputedProfile
if ntoconfig.InHyperShift() {
computed, err = c.pc.calculateProfileHyperShift(nodeName)
if err != nil {
return err
}
} else {
computed, err = c.pc.calculateProfile(nodeName)
if err != nil {
return err
}
}
switch err.(type) {
case nil:
case *DuplicateProfileError:
// Stop. We have TuneD profiles with the same name and different contents.
// Do not spam the logs with this error, it will be reported during Tuned CR
// updates and periodic resync/validation.
return nil
default:
return err
}

metrics.ProfileCalculated(profileMf.Name, computed.TunedProfileName)
Expand Down
Loading

0 comments on commit a6c6575

Please sign in to comment.