diff --git a/pkg/diagnostics/collector.go b/pkg/diagnostics/collector.go new file mode 100644 index 00000000..fd3635ee --- /dev/null +++ b/pkg/diagnostics/collector.go @@ -0,0 +1,111 @@ +/** +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package diagnostics + +import ( + "context" + "fmt" + "io" + "os" + "path/filepath" + + "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" + nfdclient "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned" + "sigs.k8s.io/yaml" +) + +type Collector interface { + Collect(context.Context) error +} + +type Config struct { + Clientset kubernetes.Interface + NfdClient *nfdclient.Clientset + + artifactDir string + namespace string + + log io.Writer +} + +func (c *Config) createFile(fp string) (io.WriteCloser, error) { + outfile, err := os.Create(filepath.Join(c.artifactDir, c.namespace, fp)) + if err != nil { + return nil, fmt.Errorf("error creating %v: %w", fp, err) + } + return outfile, nil +} + +func (c *Config) writeToFile(w io.Writer, data interface{}) error { + // Marshal data to YAML format + yamlBytes, err := yaml.Marshal(data) + if err != nil { + return fmt.Errorf("error marshalling data: %w", err) + } + + // Write marshaled bytes to the provided io.Writer + _, err = w.Write(yamlBytes) + if err != nil { + return fmt.Errorf("error writing to file: %w", err) + } + + return nil +} + +func (c *Config) outputTo(filename string, objects interface{}) error { + outputfile, err := c.createFile(filename) + if err != nil { + return fmt.Errorf("error creating %v: %w", filename, err) + } + defer outputfile.Close() + if err = c.writeToFile(outputfile, objects); err != nil { + return fmt.Errorf("error writing to %v: %w", filename, err) + } + return nil +} + +func (d *Diagnostic) Collect(ctx context.Context) error { + // Create the artifact directory + if err := os.MkdirAll(filepath.Join(d.Config.artifactDir, d.Config.namespace), os.ModePerm); err != nil { + return fmt.Errorf("error creating artifact directory: %w", err) + } + + // Redirect stdout and stderr to logs + logFile, err := d.createFile("diagnostic_collector.log") + if err != nil { + return fmt.Errorf("error creating collector log file: %w", err) + } + defer logFile.Close() + d.log = logFile + + // configure klog to write to the log file + klog.SetOutput(d.log) + + if len(d.collectors) == 0 { + klog.Warning("No collectors to run") + } + + // Run the collectors + for _, c := range d.collectors { + if err := c.Collect(ctx); err != nil { + klog.ErrorS(err, "Error running collector") + } + } + + return nil +} diff --git a/pkg/diagnostics/extensions.go b/pkg/diagnostics/extensions.go new file mode 100644 index 00000000..184917b5 --- /dev/null +++ b/pkg/diagnostics/extensions.go @@ -0,0 +1,58 @@ +/** +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package diagnostics + +import ( + "context" + "fmt" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +type nodeFeatures struct { + *Config +} + +type nodeFeatureRules struct { + *Config +} + +func (c nodeFeatures) Collect(ctx context.Context) error { + nfs, err := c.NfdClient.NfdV1alpha1().NodeFeatures(c.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("error collecting %T: %w", c, err) + } + + if err := c.outputTo("nodefeatures.yaml", nfs); err != nil { + return err + } + + return nil +} + +func (c nodeFeatureRules) Collect(ctx context.Context) error { + nfrs, err := c.NfdClient.NfdV1alpha1().NodeFeatureRules().List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("error collecting %T: %w", c, err) + } + + if err := c.outputTo("nodefeaturerules.yaml", nfrs); err != nil { + return err + } + + return nil +} diff --git a/pkg/diagnostics/kubernetes.go b/pkg/diagnostics/kubernetes.go new file mode 100644 index 00000000..6419ed9e --- /dev/null +++ b/pkg/diagnostics/kubernetes.go @@ -0,0 +1,168 @@ +/** +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package diagnostics + +import ( + "bufio" + "context" + "errors" + "fmt" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +type nodes struct { + *Config +} + +type namespaces struct { + *Config +} + +type pods struct { + *Config +} + +type deployments struct { + *Config +} + +type daemonsets struct { + *Config +} + +type jobs struct { + *Config +} + +func (c nodes) Collect(ctx context.Context) error { + nodes, err := c.Clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("error collecting %T: %w", c, err) + } + + if err := c.outputTo("nodes.yaml", nodes); err != nil { + return err + } + + return nil +} + +func (c namespaces) Collect(ctx context.Context) error { + namespaces, err := c.Clientset.CoreV1().Namespaces().List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("error collecting %T: %w", c, err) + } + + if err := c.outputTo("namespaces.yaml", namespaces); err != nil { + return err + } + + return nil +} + +func (c daemonsets) Collect(ctx context.Context) error { + daemonsets, err := c.Clientset.AppsV1().DaemonSets(c.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("error collecting %T: %w", c, err) + } + + if err := c.outputTo("daemonsets.yaml", daemonsets); err != nil { + return err + } + + return nil +} + +func (c deployments) Collect(ctx context.Context) error { + deployments, err := c.Clientset.AppsV1().Deployments(c.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("error collecting %T: %w", c, err) + } + + if err := c.outputTo("deployments.yaml", deployments); err != nil { + return err + } + + return nil +} + +func (c pods) Collect(ctx context.Context) error { + pods, err := c.Config.Clientset.CoreV1().Pods(c.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("error collecting %T: %w", c, err) + } + + if err := c.outputTo("pods.yaml", pods); err != nil { + return err + } + + var errs error + for _, pod := range pods.Items { + errs = errors.Join(err, podLogCollector{c.Config, pod.Name}.Collect(ctx)) + } + + return errs +} + +func (c jobs) Collect(ctx context.Context) error { + jobs, err := c.Clientset.BatchV1().Jobs(c.namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("error collecting %T: %w", c, err) + } + + if err := c.outputTo("jobs.yaml", jobs); err != nil { + return err + } + + return nil +} + +type podLogCollector struct { + *Config + name string +} + +func (c podLogCollector) Collect(ctx context.Context) error { + podLogFile, err := c.createFile(fmt.Sprintf("%s.log", c.name)) + if err != nil { + return fmt.Errorf("error creating podLogFile: %w", err) + } + defer podLogFile.Close() + + req := c.Clientset.CoreV1().Pods(c.namespace).GetLogs(c.name, &v1.PodLogOptions{}) + podLogs, err := req.Stream(ctx) + if err != nil { + return fmt.Errorf("error getting pod logs: %w", err) + } + + buf := bufio.NewScanner(podLogs) + for buf.Scan() { + if _, err := podLogFile.Write(buf.Bytes()); err != nil { + return fmt.Errorf("error writing pod logs: %w", err) + } + if _, err := podLogFile.Write([]byte("\n")); err != nil { + return fmt.Errorf("error writing pod logs: %w", err) + } + } + if err := buf.Err(); err != nil { + return fmt.Errorf("error reading pod log: %w", err) + } + + return nil +} diff --git a/pkg/diagnostics/options.go b/pkg/diagnostics/options.go new file mode 100644 index 00000000..891103e5 --- /dev/null +++ b/pkg/diagnostics/options.go @@ -0,0 +1,119 @@ +/** +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package diagnostics + +import ( + "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" + nfdclient "sigs.k8s.io/node-feature-discovery/pkg/generated/clientset/versioned" +) + +const ( + // the core group + Pods = "pods" + Nodes = "nodes" + Namespaces = "namespaces" + + // the apps group + Deployments = "deployments" + DaemonSets = "daemonsets" + + // the batch group + Jobs = "jobs" + + // Supported extensions + NodeFeature = "nodeFeature" + NodeFeatureRule = "nodeFeatureRule" +) + +type Diagnostic struct { + *Config + collectors []Collector +} + +type Option func(*Diagnostic) + +func WithNamespace(namespace string) func(*Diagnostic) { + return func(d *Diagnostic) { + d.Config.namespace = namespace + } +} + +func WithArtifactDir(artifactDir string) func(*Diagnostic) { + return func(d *Diagnostic) { + d.Config.artifactDir = artifactDir + } +} + +func WithKubernetesClient(clientset kubernetes.Interface) func(*Diagnostic) { + return func(d *Diagnostic) { + d.Clientset = clientset + } +} + +func WithNFDClient(nfdClient *nfdclient.Clientset) func(*Diagnostic) { + return func(d *Diagnostic) { + d.NfdClient = nfdClient + } +} + +func WithObjects(objects ...string) func(*Diagnostic) { + return func(d *Diagnostic) { + seen := make(map[string]bool) + for _, obj := range objects { + if seen[obj] { + continue + } + seen[obj] = true + switch obj { + case Nodes: + d.collectors = append(d.collectors, nodes{Config: d.Config}) + case Namespaces: + d.collectors = append(d.collectors, namespaces{Config: d.Config}) + case Pods: + d.collectors = append(d.collectors, pods{Config: d.Config}) + case Deployments: + d.collectors = append(d.collectors, deployments{Config: d.Config}) + case DaemonSets: + d.collectors = append(d.collectors, daemonsets{Config: d.Config}) + case Jobs: + d.collectors = append(d.collectors, jobs{Config: d.Config}) + case NodeFeature: + d.collectors = append(d.collectors, nodeFeatures{Config: d.Config}) + case NodeFeatureRule: + d.collectors = append(d.collectors, nodeFeatureRules{Config: d.Config}) + default: + klog.Warningf("Unsupported object %s", obj) + continue + } + } + } +} + +func New(opts ...Option) (*Diagnostic, error) { + c := &Config{} + dc := &Diagnostic{ + Config: c, + } + + // use the variadic function to set the options + for _, opt := range opts { + opt(dc) + } + + return dc, nil +}