Skip to content

Commit

Permalink
Merge pull request #40 from NVIDIA/diagnosticcollector
Browse files Browse the repository at this point in the history
Add Diagnostics pkg
  • Loading branch information
ArangoGutierrez authored May 9, 2024
2 parents 6f717c1 + 5f90576 commit eaf93bf
Show file tree
Hide file tree
Showing 6 changed files with 494 additions and 13 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ require (
k8s.io/client-go v0.29.0
k8s.io/klog/v2 v2.110.1
sigs.k8s.io/node-feature-discovery v0.15.0
sigs.k8s.io/yaml v1.3.0
)

require (
Expand Down Expand Up @@ -51,5 +52,4 @@ require (
k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
)
111 changes: 111 additions & 0 deletions pkg/diagnostics/collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/**
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package diagnostics

import (
"context"
"fmt"
"io"
"os"
"path/filepath"

"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
nfdclient "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned"
"sigs.k8s.io/yaml"
)

type Collector interface {
Collect(context.Context) error
}

type Config struct {
Clientset kubernetes.Interface
NfdClient *nfdclient.Clientset

artifactDir string
namespace string

log io.Writer
}

func (c *Config) createFile(fp string) (io.WriteCloser, error) {
outfile, err := os.Create(filepath.Join(c.artifactDir, c.namespace, fp))
if err != nil {
return nil, fmt.Errorf("error creating %v: %w", fp, err)
}
return outfile, nil
}

func (c *Config) writeToFile(w io.Writer, data interface{}) error {
// Marshal data to YAML format
yamlBytes, err := yaml.Marshal(data)
if err != nil {
return fmt.Errorf("error marshalling data: %w", err)
}

// Write marshaled bytes to the provided io.Writer
_, err = w.Write(yamlBytes)
if err != nil {
return fmt.Errorf("error writing to file: %w", err)
}

return nil
}

func (c *Config) outputTo(filename string, objects interface{}) error {
outputfile, err := c.createFile(filename)
if err != nil {
return fmt.Errorf("error creating %v: %w", filename, err)
}
defer outputfile.Close()
if err = c.writeToFile(outputfile, objects); err != nil {
return fmt.Errorf("error writing to %v: %w", filename, err)
}
return nil
}

func (d *Diagnostic) Collect(ctx context.Context) error {
// Create the artifact directory
if err := os.MkdirAll(filepath.Join(d.Config.artifactDir, d.Config.namespace), os.ModePerm); err != nil {
return fmt.Errorf("error creating artifact directory: %w", err)
}

// Redirect stdout and stderr to logs
logFile, err := d.createFile("diagnostic_collector.log")
if err != nil {
return fmt.Errorf("error creating collector log file: %w", err)
}
defer logFile.Close()
d.log = logFile

// configure klog to write to the log file
klog.SetOutput(d.log)

if len(d.collectors) == 0 {
klog.Warning("No collectors to run")
}

// Run the collectors
for _, c := range d.collectors {
if err := c.Collect(ctx); err != nil {
klog.ErrorS(err, "Error running collector")
}
}

return nil
}
58 changes: 58 additions & 0 deletions pkg/diagnostics/extensions.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/**
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package diagnostics

import (
"context"
"fmt"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

type nodeFeatures struct {
*Config
}

type nodeFeatureRules struct {
*Config
}

func (c nodeFeatures) Collect(ctx context.Context) error {
nfs, err := c.NfdClient.NfdV1alpha1().NodeFeatures(c.namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return fmt.Errorf("error collecting %T: %w", c, err)
}

if err := c.outputTo("nodefeatures.yaml", nfs); err != nil {
return err
}

return nil
}

func (c nodeFeatureRules) Collect(ctx context.Context) error {
nfrs, err := c.NfdClient.NfdV1alpha1().NodeFeatureRules().List(ctx, metav1.ListOptions{})
if err != nil {
return fmt.Errorf("error collecting %T: %w", c, err)
}

if err := c.outputTo("nodefeaturerules.yaml", nfrs); err != nil {
return err
}

return nil
}
168 changes: 168 additions & 0 deletions pkg/diagnostics/kubernetes.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
/**
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package diagnostics

import (
"bufio"
"context"
"errors"
"fmt"

v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

type nodes struct {
*Config
}

type namespaces struct {
*Config
}

type pods struct {
*Config
}

type deployments struct {
*Config
}

type daemonsets struct {
*Config
}

type jobs struct {
*Config
}

func (c nodes) Collect(ctx context.Context) error {
nodes, err := c.Clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
if err != nil {
return fmt.Errorf("error collecting %T: %w", c, err)
}

if err := c.outputTo("nodes.yaml", nodes); err != nil {
return err
}

return nil
}

func (c namespaces) Collect(ctx context.Context) error {
namespaces, err := c.Clientset.CoreV1().Namespaces().List(ctx, metav1.ListOptions{})
if err != nil {
return fmt.Errorf("error collecting %T: %w", c, err)
}

if err := c.outputTo("namespaces.yaml", namespaces); err != nil {
return err
}

return nil
}

func (c daemonsets) Collect(ctx context.Context) error {
daemonsets, err := c.Clientset.AppsV1().DaemonSets(c.namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return fmt.Errorf("error collecting %T: %w", c, err)
}

if err := c.outputTo("daemonsets.yaml", daemonsets); err != nil {
return err
}

return nil
}

func (c deployments) Collect(ctx context.Context) error {
deployments, err := c.Clientset.AppsV1().Deployments(c.namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return fmt.Errorf("error collecting %T: %w", c, err)
}

if err := c.outputTo("deployments.yaml", deployments); err != nil {
return err
}

return nil
}

func (c pods) Collect(ctx context.Context) error {
pods, err := c.Config.Clientset.CoreV1().Pods(c.namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return fmt.Errorf("error collecting %T: %w", c, err)
}

if err := c.outputTo("pods.yaml", pods); err != nil {
return err
}

var errs error
for _, pod := range pods.Items {
errs = errors.Join(err, podLogCollector{c.Config, pod.Name}.Collect(ctx))
}

return errs
}

func (c jobs) Collect(ctx context.Context) error {
jobs, err := c.Clientset.BatchV1().Jobs(c.namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return fmt.Errorf("error collecting %T: %w", c, err)
}

if err := c.outputTo("jobs.yaml", jobs); err != nil {
return err
}

return nil
}

type podLogCollector struct {
*Config
name string
}

func (c podLogCollector) Collect(ctx context.Context) error {
podLogFile, err := c.createFile(fmt.Sprintf("%s.log", c.name))
if err != nil {
return fmt.Errorf("error creating podLogFile: %w", err)
}
defer podLogFile.Close()

req := c.Clientset.CoreV1().Pods(c.namespace).GetLogs(c.name, &v1.PodLogOptions{})
podLogs, err := req.Stream(ctx)
if err != nil {
return fmt.Errorf("error getting pod logs: %w", err)
}

buf := bufio.NewScanner(podLogs)
for buf.Scan() {
if _, err := podLogFile.Write(buf.Bytes()); err != nil {
return fmt.Errorf("error writing pod logs: %w", err)
}
if _, err := podLogFile.Write([]byte("\n")); err != nil {
return fmt.Errorf("error writing pod logs: %w", err)
}
}
if err := buf.Err(); err != nil {
return fmt.Errorf("error reading pod log: %w", err)
}

return nil
}
Loading

0 comments on commit eaf93bf

Please sign in to comment.