diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 7924163..89d4030 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -73,7 +73,7 @@ jobs:
                #["io-ior", "ghcr.io/converged-computing/metric-ior:latest", 120],
                ## ["network-chatterbug", "ghcr.io/converged-computing/metric-chatterbug:latest", 120],
                #["app-nekbone", "ghcr.io/converged-computing/metric-nekbone:latest", 120],
-               # ["app-ldms", "ghcr.io/converged-computing/metric-ovis-hpc:latest", 120],
+               ["app-ldms", "ghcr.io/converged-computing/metric-ovis-hpc:latest", 120],
                ["app-amg", "ghcr.io/converged-computing/metric-amg:latest", 120],
                ["app-kripke", "ghcr.io/converged-computing/metric-kripke:latest", 120],
                #["app-pennant", "ghcr.io/converged-computing/metric-pennant:latest", 120],
diff --git a/docs/_static/data/metrics.json b/docs/_static/data/metrics.json
index 4cada3a..61e2464 100644
--- a/docs/_static/data/metrics.json
+++ b/docs/_static/data/metrics.json
@@ -46,5 +46,13 @@
   "type": "",
   "image": "ghcr.io/converged-computing/metric-lammps:latest",
   "url": "https://www.lammps.org/"
+ },
+ {
+  "name": "app-ldms",
+  "description": "provides LDMS, a low-overhead, low-latency framework for collecting, transferring, and storing metric data on a large distributed computer system.",
+  "family": "performance",
+  "type": "",
+  "image": "ghcr.io/converged-computing/metric-ovis-hpc:latest",
+  "url": "https://github.com/ovis-hpc/ovis"
  }
 ]
\ No newline at end of file
diff --git a/docs/make.bat b/docs/make.bat
old mode 100644
new mode 100755
diff --git a/pkg/metrics/app/ldms.go b/pkg/metrics/app/ldms.go
new file mode 100644
index 0000000..7debdd6
--- /dev/null
+++ b/pkg/metrics/app/ldms.go
@@ -0,0 +1,160 @@
+/*
+Copyright 2023 Lawrence Livermore National Security, LLC
+ (c.f. AUTHORS, NOTICE.LLNS, COPYING)
+
+SPDX-License-Identifier: MIT
+*/
+
+package application
+
+import (
+	"fmt"
+
+	api "github.com/converged-computing/metrics-operator/api/v1alpha1"
+	"k8s.io/apimachinery/pkg/util/intstr"
+
+	"github.com/converged-computing/metrics-operator/pkg/metadata"
+	metrics "github.com/converged-computing/metrics-operator/pkg/metrics"
+	"github.com/converged-computing/metrics-operator/pkg/specs"
+)
+
+// LDMS is a single-application metric that starts ldmsd and then runs a
+// command (ldms_ls by default) against it in a loop.
+type LDMS struct {
+	metrics.SingleApplication
+
+	// Custom Options
+	completions int32  // exit once the loop counter reaches this value; 0 disables the check
+	command     string // command run on every iteration
+	rate        int32  // seconds slept between iterations
+}
+
+// Family returns the generic performance family.
+func (m LDMS) Family() string {
+	return metrics.PerformanceFamily
+}
+
+// Url returns the upstream project page.
+func (m LDMS) Url() string {
+	return "https://github.com/ovis-hpc/ovis"
+}
+
+// SetOptions sets custom options / attributes for the metric, letting
+// user-provided values override the defaults.
+func (m *LDMS) SetOptions(metric *api.Metric) {
+	m.ResourceSpec = &metric.Resources
+	m.AttributeSpec = &metric.Attributes
+	m.rate = 10
+
+	// Set user defined values or fall back to defaults
+	m.command = "ldms_ls -h localhost -x sock -p 10444 -l -v"
+	m.Workdir = "/opt"
+
+	command, ok := metric.Options["command"]
+	if ok {
+		m.command = command.StrVal
+	}
+	workdir, ok := metric.Options["workdir"]
+	if ok {
+		m.Workdir = workdir.StrVal
+	}
+	completions, ok := metric.Options["completions"]
+	if ok {
+		m.completions = completions.IntVal
+	}
+	rate, ok := metric.Options["rate"]
+	if ok {
+		m.rate = rate.IntVal
+	}
+	// Primarily sole tenancy
+	m.SetDefaultOptions(metric)
+}
+
+// Options returns the exported options and their current values.
+func (m LDMS) Options() map[string]intstr.IntOrString {
+	return map[string]intstr.IntOrString{
+		"rate":        intstr.FromInt(int(m.rate)),
+		"completions": intstr.FromInt(int(m.completions)),
+		"command":     intstr.FromString(m.command),
+		"workdir":     intstr.FromString(m.Workdir),
+	}
+}
+
+// ListOptions returns the list-valued options; LDMS defines none.
+func (m LDMS) ListOptions() map[string][]intstr.IntOrString {
+	return map[string][]intstr.IntOrString{}
+}
+
+// PrepareContainers fills in the entrypoint script blocks and returns the
+// application container spec built from them.
+func (m LDMS) PrepareContainers(
+	spec *api.MetricSet,
+	metric *metrics.Metric,
+) []*specs.ContainerSpec {
+
+	// Metadata to add to beginning of run
+	meta := metrics.Metadata(spec, metric)
+
+	// Start ldmsd, then run the command in a loop. The exit status is saved
+	// in retval right after the command so the failure check can see it.
+	preBlock := `
+# Setup munge
+mkdir -p /run/munge
+chown -R 0 /var/log/munge /var/lib/munge /etc/munge /run/munge
+# Skip munge for now, not on a cluster
+# ldmsd -x sock:10444 -c /opt/sampler.conf -l /tmp/demo_ldmsd_log -v DEBUG -a munge -r $(pwd)/ldmsd.pid
+ldmsd -x sock:10444 -c /opt/sampler.conf -l /tmp/demo_ldmsd_log -v DEBUG -r $(pwd)/ldmsd.pid
+echo "%s"
+
+i=0
+completions=%d
+echo "%s"
+while true
+  do
+    echo "%s"
+    %s
+    retval=$?
+    if [[ $retval -ne 0 ]]; then
+        echo "%s"
+        exit 0
+    fi
+    if [[ $completions -ne 0 ]] && [[ $i -eq $completions ]]; then
+        echo "%s"
+        exit 0
+    fi
+    sleep %d
+    let i=i+1
+done
+`
+
+	postBlock := `
+echo "%s"
+%s
+`
+	interactive := metadata.Interactive(spec.Spec.Logging.Interactive)
+	preBlock = fmt.Sprintf(
+		preBlock,
+		meta,
+		m.completions,
+		metadata.CollectionStart,
+		metadata.Separator,
+		m.command,
+		metadata.CollectionEnd,
+		metadata.CollectionEnd,
+		m.rate,
+	)
+	postBlock = fmt.Sprintf(postBlock, metadata.CollectionEnd, interactive)
+	return m.ApplicationContainerSpec(preBlock, "", postBlock)
+}
+
+// Register the metric under the app-ldms identifier.
+func init() {
+	app := metrics.BaseMetric{
+		Identifier: "app-ldms",
+		Summary:    "provides LDMS, a low-overhead, low-latency framework for collecting, transferring, and storing metric data on a large distributed computer system.",
+		Container:  "ghcr.io/converged-computing/metric-ovis-hpc:latest",
+	}
+	single := metrics.SingleApplication{BaseMetric: app}
+	ldms := LDMS{SingleApplication: single}
+	metrics.Register(&ldms)
+}
diff --git a/pkg/metrics/application.go b/pkg/metrics/application.go
index e8122c3..681dc6f 100644
--- a/pkg/metrics/application.go
+++ b/pkg/metrics/application.go
@@ -9,11 +9,15 @@ package metrics
 
 import (
 	api "github.com/converged-computing/metrics-operator/api/v1alpha1"
-	"k8s.io/apimachinery/pkg/util/intstr"
+	"github.com/converged-computing/metrics-operator/pkg/specs"
 	jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"
 )
 
 // These are common templates for application metrics
+var (
+	// DefaultEntrypointScript is the path used for the application entrypoint script
+	DefaultEntrypointScript = "/metrics_operator/entrypoint-0.sh"
+)
 
 // SingleApplication is a Metric base for a simple application metric
 // be accessible by other packages (and not conflict with function names)
@@ -21,54 +25,53 @@ type SingleApplication struct {
 	BaseMetric
 }
 
-// Name returns the metric name
-func (m SingleApplication) Name() string {
-	return m.Identifier
-}
-
 func (m SingleApplication) HasSoleTenancy() bool {
 	return false
 }
 
-// Description returns the metric description
-func (m SingleApplication) Description() string {
-	return m.Summary
-}
-
 // Default SingleApplication is generic performance family
 func (m SingleApplication) Family() string {
 	return PerformanceFamily
 }
 
-// Return container resources for the metric container
-func (m SingleApplication) Resources() *api.ContainerResources {
-	return m.ResourceSpec
-}
-func (m SingleApplication) Attributes() *api.ContainerSpec {
-	return m.AttributeSpec
-}
+// ApplicationContainerSpec wraps the pre, command and post blocks in an
+// entrypoint script and returns the single "app" container spec using it.
+func (m *SingleApplication) ApplicationContainerSpec(
+	preBlock string,
+	command string,
+	postBlock string,
+) []*specs.ContainerSpec {
 
-// Validation
-func (m SingleApplication) Validate(spec *api.MetricSet) bool {
-	return true
-}
+	entrypoint := specs.EntrypointScript{
+		Name:    specs.DeriveScriptKey(DefaultEntrypointScript),
+		Path:    DefaultEntrypointScript,
+		Pre:     preBlock,
+		Command: command,
+		Post:    postBlock,
+	}
 
-// Container variables
-func (m SingleApplication) Image() string {
-	return m.Container
-}
-func (m SingleApplication) WorkingDir() string {
-	return m.Workdir
-}
+	return []*specs.ContainerSpec{{
+		JobName:          ReplicatedJobName,
+		Image:            m.Image(),
+		Name:             "app",
+		WorkingDir:       m.Workdir,
+		EntrypointScript: entrypoint,
+		Resources:        m.ResourceSpec,
+		Attributes:       m.AttributeSpec,
+	}}
 
-func (m SingleApplication) ReplicatedJobs(spec *api.MetricSet) ([]jobset.ReplicatedJob, error) {
-	return []jobset.ReplicatedJob{}, nil
 }
 
-func (m SingleApplication) ListOptions() map[string][]intstr.IntOrString {
-	return map[string][]intstr.IntOrString{}
-}
+// ReplicatedJobs returns the single replicated job assembled for the application
+func (m *SingleApplication) ReplicatedJobs(spec *api.MetricSet) ([]*jobset.ReplicatedJob, error) {
+
+	js := []*jobset.ReplicatedJob{}
 
-func (m SingleApplication) SuccessJobs() []string {
-	return []string{}
+	// Generate a replicated job for the application
+	rj, err := AssembleReplicatedJob(spec, true, spec.Spec.Pods, spec.Spec.Pods, "", m.SoleTenancy)
+	if err != nil {
+		return js, err
+	}
+	js = []*jobset.ReplicatedJob{rj}
+	return js, nil
 }
diff --git a/pkg/metrics/base.go b/pkg/metrics/base.go
index a3e4ce4..faa2c17 100644
--- a/pkg/metrics/base.go
+++ b/pkg/metrics/base.go
@@ -25,6 +25,9 @@ type BaseMetric struct {
 	ResourceSpec  *api.ContainerResources
 	AttributeSpec *api.ContainerSpec
 
+	// If we ask for sole tenancy, we assign 1 pod / hostname
+	SoleTenancy bool
+
 	// A metric can have one or more addons
 	Addons map[string]*addons.Addon
 }
@@ -81,6 +84,28 @@ func (m BaseMetric) ReplicatedJobs(set *api.MetricSet) ([]*jobset.ReplicatedJob,
 	return []*jobset.ReplicatedJob{}, nil
 }
 
+// HasSoleTenancy returns the SoleTenancy setting of the metric.
+func (m BaseMetric) HasSoleTenancy() bool {
+	return m.SoleTenancy
+}
+
+// SetDefaultOptions sets options shared across metrics. Only soleTenancy
+// is read: "true"/"yes" enable it, "false"/"no" disable it, and any other
+// value (or an absent option) leaves the current setting untouched.
+// The receiver is a pointer so the assignment reaches the caller's metric.
+func (m *BaseMetric) SetDefaultOptions(metric *api.Metric) {
+	st, ok := metric.Options["soleTenancy"]
+	if !ok {
+		return
+	}
+	switch st.StrVal {
+	case "false", "no":
+		m.SoleTenancy = false
+	case "true", "yes":
+		m.SoleTenancy = true
+	}
+}
+
 // Add registered addons to replicated jobs
 func (m BaseMetric) AddAddons(
 	spec *api.MetricSet,
diff --git a/pkg/metrics/launcher.go b/pkg/metrics/launcher.go
index 071dd60..3350e96 100644
--- a/pkg/metrics/launcher.go
+++ b/pkg/metrics/launcher.go
@@ -40,9 +40,6 @@ type LauncherWorker struct {
 	// A metric can have one or more addons
 	Addons []*api.MetricAddon
 
-	// If we ask for sole tenancy, we assign 1 pod / hostname
-	SoleTenancy bool
-
 	// Scripts
 	WorkerScript   string
 	LauncherScript string
@@ -52,10 +49,6 @@ type LauncherWorker struct {
 	WorkerLetter   string
 }
 
-func (m LauncherWorker) HasSoleTenancy() bool {
-	return m.SoleTenancy
-}
-
 // Name returns the metric name
 func (m LauncherWorker) Name() string {
 	return m.Identifier
diff --git a/script/test.sh b/script/test.sh
old mode 100755
new mode 100644