Skip to content

Commit

Permalink
Code review enhancements (kubernetes#3)
Browse files Browse the repository at this point in the history
* refactor: multiple code review suggestions

* test: increase coverage by covering added lines

* docs: update readme
  • Loading branch information
bilalcaliskan authored and morpheu committed May 22, 2024
1 parent 57bd57b commit c87bbbb
Show file tree
Hide file tree
Showing 12 changed files with 416 additions and 124 deletions.
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ RUN GOARCH=${TARGETARCH} make bin/node-problem-detector bin/health-checker bin/l
ARG BASEIMAGE
FROM --platform=${TARGETPLATFORM} ${BASEIMAGE}

RUN echo foo

LABEL maintainer="Random Liu <[email protected]>"

RUN clean-install util-linux bash libsystemd-dev
Expand Down
15 changes: 7 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,20 +142,19 @@ For example, to run without auth, use the following config:

### Tainting Nodes
You can enable node tainting feature to the response of permanent node problems. For example, on the file [config/kernel-monitor.json](config/kernel-monitor.json),
set boolean flag `taintEnabled` as `true` if you wish to taint your nodes when `ReadOnlyFileSystem` problem and untaint the node when the
problem is resolved. This feature is disabled by default but you can enable it like below:
put a `TaintConfig` object as following for required `Condition` as you need. You can omit the `TaintConfig` or disable it by setting `enabled` as false. By default, it is disabled and will not be enabled until you need it.
```json
...omitted for brevity...
{
"type": "ReadonlyFilesystem",
"reason": "FilesystemIsNotReadOnly",
"message": "Filesystem is not read-only",
"taintEnabled": true,
"taintKey": "node-problem-detector",
"taintValue": "ReadonlyFilesystem",
"taintEffect": "NoSchedule"
"taintConfig": {
"enabled": false,
"key": "node-problem-detector/read-only-filesystem",
"value": "true",
"effect": "NoSchedule"
}
}
...omitted for brevity...
```

## Build Image
Expand Down
42 changes: 14 additions & 28 deletions config/kernel-monitor.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,23 @@
"type": "KernelDeadlock",
"reason": "KernelHasNoDeadlock",
"message": "kernel has no deadlock",
"taintEnabled": false,
"taintKey": "node-problem-detector",
"taintValue": "KernelDeadlock",
"taintEffect": "NoSchedule"
"taintConfig": {
"enabled": false,
"key": "node-problem-detector/kernel-deadlock",
"value": "true",
"effect": "NoSchedule"
}
},
{
"type": "ReadonlyFilesystem",
"reason": "FilesystemIsNotReadOnly",
"message": "Filesystem is not read-only",
"taintEnabled": false,
"taintKey": "node-problem-detector",
"taintValue": "ReadonlyFilesystem",
"taintEffect": "NoSchedule"
"taintConfig": {
"enabled": false,
"key": "node-problem-detector/read-only-filesystem",
"value": "true",
"effect": "NoSchedule"
}
}
],
"rules": [
Expand Down Expand Up @@ -71,35 +75,17 @@
"reason": "MemoryReadError",
"pattern": "CE memory read error .*"
},
{
"type": "permanent",
"condition": "KernelDeadlock",
"reason": "AUFSUmountHung",
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\.",
"taintEnabled": false,
"taintKey": "node-problem-detector",
"taintValue": "KernelDeadlock",
"taintEffect": "NoSchedule"
},
{
"type": "permanent",
"condition": "KernelDeadlock",
"reason": "DockerHung",
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\.",
"taintEnabled": false,
"taintKey": "node-problem-detector",
"taintValue": "KernelDeadlock",
"taintEffect": "NoSchedule"
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
},
{
"type": "permanent",
"condition": "ReadonlyFilesystem",
"reason": "FilesystemIsReadOnly",
"pattern": "Remounting filesystem read-only",
"taintEnabled": true,
"taintKey": "node-problem-detector",
"taintValue": "ReadonlyFilesystem",
"taintEffect": "NoSchedule"
"pattern": "Remounting filesystem read-only"
}
]
}
38 changes: 15 additions & 23 deletions deployment/node-problem-detector-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,23 @@ data:
"type": "KernelDeadlock",
"reason": "KernelHasNoDeadlock",
"message": "kernel has no deadlock",
"taintEnabled": false,
"taintKey": "node-problem-detector",
"taintValue": "KernelDeadlock",
"taintEffect": "NoSchedule"
"taintConfig": {
"enabled": false,
"key": "node-problem-detector/kernel-deadlock",
"value": "true",
"effect": "NoSchedule"
}
},
{
"type": "ReadonlyFilesystem",
"reason": "FilesystemIsNotReadOnly",
"message": "Filesystem is not read-only",
"taintEnabled": false,
"taintKey": "node-problem-detector",
"taintValue": "ReadonlyFilesystem",
"taintEffect": "NoSchedule"
"taintConfig": {
"enabled": true,
"key": "node-problem-detector/read-only-filesystem",
"value": "true",
"effect": "NoSchedule"
}
}
],
"rules": [
Expand Down Expand Up @@ -62,31 +66,19 @@ data:
"type": "permanent",
"condition": "KernelDeadlock",
"reason": "AUFSUmountHung",
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\.",
"taintEnabled": false,
"taintKey": "node-problem-detector",
"taintValue": "KernelDeadlock",
"taintEffect": "NoSchedule"
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
},
{
"type": "permanent",
"condition": "KernelDeadlock",
"reason": "DockerHung",
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\.",
"taintEnabled": false,
"taintKey": "node-problem-detector",
"taintValue": "KernelDeadlock",
"taintEffect": "NoSchedule"
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
},
{
"type": "permanent",
"condition": "ReadonlyFilesystem",
"reason": "FilesystemIsReadOnly",
"pattern": "Remounting filesystem read-only",
"taintEnabled": false,
"taintKey": "node-problem-detector",
"taintValue": "ReadonlyFilesystem",
"taintEffect": "NoSchedule"
"pattern": "Remounting filesystem read-only"
}
]
}
Expand Down
60 changes: 46 additions & 14 deletions pkg/exporters/k8sexporter/condition/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,22 +163,54 @@ func (c *conditionManager) sync(ctx context.Context) {
for i := range c.conditions {
conditions = append(conditions, problemutil.ConvertToAPICondition(c.conditions[i]))

if c.conditions[i].TaintEnabled && c.conditions[i].Status == types.True {
taintString := fmt.Sprintf("%s=%s:%s", c.conditions[i].TaintKey, c.conditions[i].TaintValue,
c.conditions[i].TaintEffect)
glog.Infof("tainting is enabled and condition status is True, tainting with %s\n", taintString)
if err := c.client.TaintNode(ctx, c.conditions[i]); err != nil {
glog.Errorf("failed to add taint %v to node: %v", taintString, err)
return
condition := c.conditions[i]
if condition.TaintConfig == nil || !condition.TaintConfig.Enabled {
// we are skipping tainting since TaintConfig of our condition is nil or disabled
continue
}

taintStr := fmt.Sprintf("%s=%s:%s", c.conditions[i].TaintConfig.Key, c.conditions[i].TaintConfig.Value,
c.conditions[i].TaintConfig.Effect)

node, err := c.client.GetNode(ctx)
if err != nil {
glog.Errorf("failed to get node: %v", err)
continue
}

taintExists := problemclient.CheckIfTaintAlreadyExists(node, *condition.TaintConfig)

switch condition.Status {
case types.True:
if taintExists {
// we are skipping here since node is already tainted with our TaintConfig
continue
}

glog.Infof("for condition %s, tainting is enabled and status is True, tainting with %s",
condition.Type, taintStr)

if err := c.client.TaintNode(ctx, node, condition); err != nil {
glog.Errorf("failed to add taint %v: %v", taintStr, err)
continue
}
} else if c.conditions[i].TaintEnabled && c.conditions[i].Status == types.False {
taintString := fmt.Sprintf("%s=%s:%s", c.conditions[i].TaintKey, c.conditions[i].TaintValue,
c.conditions[i].TaintEffect)
glog.Infof("tainting is enabled and condition status is False, removing taint %s\n", taintString)
if err := c.client.UntaintNode(ctx, c.conditions[i]); err != nil {
glog.Errorf("failed to remove taint %v from node: %v", taintString, err)
return

glog.Infof("successfully tainted node with %s", taintStr)
case types.False:
if !taintExists {
// we are skipping here since node is not tainted with our TaintConfig
continue
}

glog.Infof("for condition %s, tainting is enabled and condition status is False, removing taint %s",
condition.Type, taintStr)

if err := c.client.UntaintNode(ctx, node, condition); err != nil {
glog.Errorf("failed to remove taint %v: %v", taintStr, err)
continue
}

glog.Infof("successfully removed taint %s from node", taintStr)
}
}
if err := c.client.SetConditions(ctx, conditions); err != nil {
Expand Down
Loading

0 comments on commit c87bbbb

Please sign in to comment.