Skip to content

Commit

Permalink
add back ldms
Browse files Browse the repository at this point in the history
I started a separate branch because I am working in a new vscode and
was afraid I would bork something.

Signed-off-by: vsoch <[email protected]>
  • Loading branch information
vsoch committed Sep 20, 2023
1 parent b8c8043 commit c00a40c
Show file tree
Hide file tree
Showing 8 changed files with 211 additions and 44 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ jobs:
#["io-ior", "ghcr.io/converged-computing/metric-ior:latest", 120],
## ["network-chatterbug", "ghcr.io/converged-computing/metric-chatterbug:latest", 120],
#["app-nekbone", "ghcr.io/converged-computing/metric-nekbone:latest", 120],
# ["app-ldms", "ghcr.io/converged-computing/metric-ovis-hpc:latest", 120],
["app-ldms", "ghcr.io/converged-computing/metric-ovis-hpc:latest", 120],
["app-amg", "ghcr.io/converged-computing/metric-amg:latest", 120],
["app-kripke", "ghcr.io/converged-computing/metric-kripke:latest", 120],
#["app-pennant", "ghcr.io/converged-computing/metric-pennant:latest", 120],
Expand Down
8 changes: 8 additions & 0 deletions docs/_static/data/metrics.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,13 @@
"type": "",
"image": "ghcr.io/converged-computing/metric-lammps:latest",
"url": "https://www.lammps.org/"
},
{
"name": "app-ldms",
"description": "provides LDMS, a low-overhead, low-latency framework for collecting, transferring, and storing metric data on a large distributed computer system.",
"family": "performance",
"type": "",
"image": "ghcr.io/converged-computing/metric-ovis-hpc:latest",
"url": "https://github.com/ovis-hpc/ovis"
}
]
Empty file modified docs/make.bat
100644 → 100755
Empty file.
148 changes: 148 additions & 0 deletions pkg/metrics/app/ldms.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/*
Copyright 2023 Lawrence Livermore National Security, LLC
(c.f. AUTHORS, NOTICE.LLNS, COPYING)
SPDX-License-Identifier: MIT
*/

package application

import (
"fmt"

api "github.com/converged-computing/metrics-operator/api/v1alpha1"
"k8s.io/apimachinery/pkg/util/intstr"

"github.com/converged-computing/metrics-operator/pkg/metadata"
metrics "github.com/converged-computing/metrics-operator/pkg/metrics"
"github.com/converged-computing/metrics-operator/pkg/specs"
)

// LDMS is the application metric for LDMS (ovis-hpc). It builds on
// metrics.SingleApplication and adds the options that drive the sampling
// loop written into the container entrypoint.
type LDMS struct {
	metrics.SingleApplication

	// command is the shell command executed on every loop iteration.
	command string

	// completions is the loop counter value at which the loop stops;
	// 0 means the loop is never stopped by the counter.
	completions int32

	// rate is the number of seconds to sleep between iterations.
	rate int32
}

// Family returns the metric family for LDMS, which is the performance family.
func (m LDMS) Family() string {
return metrics.PerformanceFamily
}

// Url returns the upstream project URL for LDMS (the ovis-hpc/ovis repository).
func (m LDMS) Url() string {
return "https://github.com/ovis-hpc/ovis"
}

// SetOptions configures the LDMS metric from a user-provided metric.
//
// The resource and attribute specs are taken from the metric, then defaults
// are applied (a rate of 10, the ldms_ls command, and /opt as the working
// directory). Any of the "command", "workdir", "completions" and "rate"
// options present on the metric override those values. Shared options are
// handled last via SetDefaultOptions.
func (m *LDMS) SetOptions(metric *api.Metric) {
	m.ResourceSpec = &metric.Resources
	m.AttributeSpec = &metric.Attributes

	// Defaults, used when the matching option is absent
	m.rate = 10
	m.command = "ldms_ls -h localhost -x sock -p 10444 -l -v"
	m.Workdir = "/opt"

	// String-valued overrides
	if value, found := metric.Options["command"]; found {
		m.command = value.StrVal
	}
	if value, found := metric.Options["workdir"]; found {
		m.Workdir = value.StrVal
	}

	// Integer-valued overrides
	if value, found := metric.Options["completions"]; found {
		m.completions = value.IntVal
	}
	if value, found := metric.Options["rate"]; found {
		m.rate = value.IntVal
	}

	// Primarily sole tenancy
	m.SetDefaultOptions(metric)
}

// Options returns the metric's current option values keyed by option name:
// "rate" and "completions" as integers, "command" and "workdir" as strings.
func (m LDMS) Options() map[string]intstr.IntOrString {
	exported := make(map[string]intstr.IntOrString, 4)
	exported["rate"] = intstr.FromInt(int(m.rate))
	exported["completions"] = intstr.FromInt(int(m.completions))
	exported["command"] = intstr.FromString(m.command)
	exported["workdir"] = intstr.FromString(m.Workdir)
	return exported
}
// ListOptions returns the list-valued options of the metric. LDMS defines
// none, so the returned map is always empty (and non-nil).
func (m LDMS) ListOptions() map[string][]intstr.IntOrString {
	return map[string][]intstr.IntOrString{}
}

// PrepareContainers builds the container spec(s) for the LDMS metric.
//
// The generated pre block prepares the munge directories, starts ldmsd on
// sock:10444 (munge authentication is skipped), prints the run metadata and
// the collection-start marker, and then loops: it prints the separator, runs
// m.command, and sleeps m.rate seconds. The script exits when the command
// returns a non-zero status, or when completions is non-zero and the loop
// counter (starting at 0 and incremented after each sleep) equals it.
//
// The post block prints the collection-end marker followed by the value
// returned by metadata.Interactive for the logging settings of the spec.
// Both blocks are handed to ApplicationContainerSpec with an empty command.
func (m LDMS) PrepareContainers(
	spec *api.MetricSet,
	metric *metrics.Metric,
) []*specs.ContainerSpec {

	// Metadata to add to beginning of run
	meta := metrics.Metadata(spec, metric)

	// The exit status of the command must be captured right after it runs;
	// without the assignment $retval is unset and the failure check below
	// can never trigger.
	preBlock := `
# Setup munge
mkdir -p /run/munge
chown -R 0 /var/log/munge /var/lib/munge /etc/munge /run/munge
# Skip munge for now, not on a cluster
# ldmsd -x sock:10444 -c /opt/sampler.conf -l /tmp/demo_ldmsd_log -v DEBUG -a munge -r $(pwd)/ldmsd.pid
ldmsd -x sock:10444 -c /opt/sampler.conf -l /tmp/demo_ldmsd_log -v DEBUG -r $(pwd)/ldmsd.pid
echo "%s"
i=0
completions=%d
echo "%s"
while true
do
echo "%s"
%s
retval=$?
if [[ $retval -ne 0 ]]; then
echo "%s"
exit 0
fi
if [[ $completions -ne 0 ]] && [[ $i -eq $completions ]]; then
echo "%s"
exit 0
fi
sleep %d
let i=i+1
done
`

	postBlock := `
echo "%s"
%s
`
	interactive := metadata.Interactive(spec.Spec.Logging.Interactive)

	// Arguments follow the order of the placeholders in the template above
	preBlock = fmt.Sprintf(
		preBlock,
		meta,
		m.completions,
		metadata.CollectionStart,
		metadata.Separator,
		m.command,
		metadata.CollectionEnd,
		metadata.CollectionEnd,
		m.rate,
	)
	postBlock = fmt.Sprintf(postBlock, metadata.CollectionEnd, interactive)
	return m.ApplicationContainerSpec(preBlock, "", postBlock)
}

func init() {
app := metrics.BaseMetric{
Identifier: "app-ldms",
Summary: "provides LDMS, a low-overhead, low-latency framework for collecting, transferring, and storing metric data on a large distributed computer system.",
Container: "ghcr.io/converged-computing/metric-ovis-hpc:latest",
}
single := metrics.SingleApplication{BaseMetric: app}
LDMS := LDMS{SingleApplication: single}
metrics.Register(&LDMS)
}
72 changes: 36 additions & 36 deletions pkg/metrics/application.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,66 +9,66 @@ package metrics

import (
api "github.com/converged-computing/metrics-operator/api/v1alpha1"
"k8s.io/apimachinery/pkg/util/intstr"
"github.com/converged-computing/metrics-operator/pkg/specs"
jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"
)

// These are common templates for application metrics
var (
DefaultEntrypointScript = "/metrics_operator/entrypoint-0.sh"
)

// SingleApplication is a Metric base for a simple application metric
// be accessible by other packages (and not conflict with function names)
type SingleApplication struct {
BaseMetric
}

// Name returns the metric name
func (m SingleApplication) Name() string {
return m.Identifier
}

func (m SingleApplication) HasSoleTenancy() bool {
return false
}

// Description returns the metric description
func (m SingleApplication) Description() string {
return m.Summary
}

// Default SingleApplication is generic performance family
func (m SingleApplication) Family() string {
return PerformanceFamily
}

// Return container resources for the metric container
func (m SingleApplication) Resources() *api.ContainerResources {
return m.ResourceSpec
}
func (m SingleApplication) Attributes() *api.ContainerSpec {
return m.AttributeSpec
}
func (m *SingleApplication) ApplicationContainerSpec(
preBlock string,
command string,
postBlock string,
) []*specs.ContainerSpec {

// Validation
func (m SingleApplication) Validate(spec *api.MetricSet) bool {
return true
}
entrypoint := specs.EntrypointScript{
Name: specs.DeriveScriptKey(DefaultEntrypointScript),
Path: DefaultEntrypointScript,
Pre: preBlock,
Command: command,
Post: postBlock,
}

// Container variables
func (m SingleApplication) Image() string {
return m.Container
}
func (m SingleApplication) WorkingDir() string {
return m.Workdir
}
return []*specs.ContainerSpec{{
JobName: ReplicatedJobName,
Image: m.Image(),
Name: "app",
WorkingDir: m.Workdir,
EntrypointScript: entrypoint,
Resources: m.ResourceSpec,
Attributes: m.AttributeSpec,
}}

func (m SingleApplication) ReplicatedJobs(spec *api.MetricSet) ([]jobset.ReplicatedJob, error) {
return []jobset.ReplicatedJob{}, nil
}

func (m SingleApplication) ListOptions() map[string][]intstr.IntOrString {
return map[string][]intstr.IntOrString{}
}
// Replicated Jobs are custom for a launcher worker
func (m *SingleApplication) ReplicatedJobs(spec *api.MetricSet) ([]*jobset.ReplicatedJob, error) {

js := []*jobset.ReplicatedJob{}

func (m SingleApplication) SuccessJobs() []string {
return []string{}
// Generate a replicated job for the application
rj, err := AssembleReplicatedJob(spec, true, spec.Spec.Pods, spec.Spec.Pods, "", m.SoleTenancy)
if err != nil {
return js, err
}
js = []*jobset.ReplicatedJob{rj}
return js, nil
}
18 changes: 18 additions & 0 deletions pkg/metrics/base.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ type BaseMetric struct {
ResourceSpec *api.ContainerResources
AttributeSpec *api.ContainerSpec

// If we ask for sole tenancy, we assign 1 pod / hostname
SoleTenancy bool

// A metric can have one or more addons
Addons map[string]*addons.Addon
}
Expand Down Expand Up @@ -81,6 +84,21 @@ func (m BaseMetric) ReplicatedJobs(set *api.MetricSet) ([]*jobset.ReplicatedJob,
return []*jobset.ReplicatedJob{}, nil
}

// HasSoleTenancy reports the value of the SoleTenancy field, i.e. whether
// sole tenancy (1 pod / hostname) was requested for this metric.
func (m BaseMetric) HasSoleTenancy() bool {
return m.SoleTenancy
}

// SetDefaultOptions applies options that are (possibly) shared across metrics.
//
// Currently only "soleTenancy" is handled: the string values "true" / "yes"
// enable it and "false" / "no" disable it. Any other value, or a missing
// option, leaves SoleTenancy unchanged.
//
// The receiver is a pointer so that the assignment is stored on the metric
// itself; with a value receiver the update would be applied to a copy and
// silently discarded.
func (m *BaseMetric) SetDefaultOptions(metric *api.Metric) {
	st, ok := metric.Options["soleTenancy"]
	if !ok {
		return
	}
	switch st.StrVal {
	case "false", "no":
		m.SoleTenancy = false
	case "true", "yes":
		m.SoleTenancy = true
	}
}

// Add registered addons to replicated jobs
func (m BaseMetric) AddAddons(
spec *api.MetricSet,
Expand Down
7 changes: 0 additions & 7 deletions pkg/metrics/launcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,6 @@ type LauncherWorker struct {
// A metric can have one or more addons
Addons []*api.MetricAddon

// If we ask for sole tenancy, we assign 1 pod / hostname
SoleTenancy bool

// Scripts
WorkerScript string
LauncherScript string
Expand All @@ -52,10 +49,6 @@ type LauncherWorker struct {
WorkerLetter string
}

// HasSoleTenancy reports the value of the SoleTenancy field of the
// launcher/worker metric.
func (m LauncherWorker) HasSoleTenancy() bool {
return m.SoleTenancy
}

// Name returns the metric name
func (m LauncherWorker) Name() string {
return m.Identifier
Expand Down
Empty file modified script/test.sh
100755 → 100644
Empty file.

0 comments on commit c00a40c

Please sign in to comment.