Skip to content

Commit

Permalink
fixing bug with internal crd state
Browse files Browse the repository at this point in the history
if we do not make a copy (refect) of the interface,
the state seems to change (and perist) between runs. While
I am still worried about this design, this at least seems
to fix that bug. I am also wondering about garbage collection
(e.g., if making the copies means they stay around and the
operator will use increasing memory) but that is TBA
explored.

Signed-off-by: vsoch <[email protected]>
  • Loading branch information
vsoch committed Sep 23, 2023
1 parent 7b7b7bb commit 57279d5
Show file tree
Hide file tree
Showing 27 changed files with 314 additions and 117 deletions.
2 changes: 1 addition & 1 deletion api/v1alpha2/metric_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ type Metric struct {

// Use a custom container image (advanced users only)
// +optional
Image string `json:"image"`
Image string `json:"image,omitempty"`

// A Metric addon can be storage (volume) or an application,
// It's an additional entity that can customize a replicated job,
Expand Down
2 changes: 1 addition & 1 deletion docs/_static/data/metrics.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
"name": "app-kripke",
"description": "parallel algebraic multigrid solver for linear systems arising from problems on unstructured grids",
"family": "solver",
"image": "ghcr.io/converged-computing/metric-kripke:latest",
"image": "parallel algebraic multigrid solver for linear systems arising from problems on unstructured grids",
"url": "https://github.com/LLNL/Kripke"
},
{
Expand Down
49 changes: 49 additions & 0 deletions examples/tests/perf-lammps-hpctoolkit/metrics-rocky.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
apiVersion: flux-framework.org/v1alpha2
kind: MetricSet
metadata:
labels:
app.kubernetes.io/name: metricset
app.kubernetes.io/instance: metricset-sample
name: metricset-sample
spec:
# Number of pods for lammps (one launcher, the rest workers)
pods: 4
logging:
interactive: true

metrics:

# Running more scaled lammps is our main goal
- name: app-lammps

# How to define a custom lammps container (advanced users)
# This is for if you use rocky, not the default
image: ghcr.io/converged-computing/metric-lammps-intel-mpi:rocky

options:
command: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
workdir: /opt/lammps/examples/reaxff/HNS

# Add on hpctoolkit, will mount a volume and wrap lammps
addons:
- name: perf-hpctoolkit
options:
mount: /opt/mnt
# Where is the event blocked / taking more time
events: "-e REALTIME@100"

# Use a custom container here too (we have for rocky and ubuntu)
image: ghcr.io/converged-computing/metric-hpctoolkit-view:rocky

# Don't run post analysis - script will still be generated
# postAnalysis: "false"

# hpcrun needs to have mpirun in front of hpcrun <command> e.g.,
# mpirun <MPI args> hpcrun <hpcrun args> <app> <app args>
prefix: /opt/intel/mpi/2021.8.0/bin/mpirun --hostfile ./hostlist.txt -np 4 --map-by socket

# Ensure the working directory is consistent
workdir: /opt/lammps/examples/reaxff/HNS

# Target container for entrypoint addition is the launcher, not workers
containerTarget: launcher
11 changes: 0 additions & 11 deletions examples/tests/perf-lammps-hpctoolkit/metrics.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,6 @@ spec:

# Running more scaled lammps is our main goal
- name: app-lammps

# How to define a custom lammps container (advanced users)
# This is for if you use rocky, not the default
# image: ghcr.io/converged-computing/metric-lammps-intel-mpi:rocky

options:
command: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
workdir: /opt/lammps/examples/reaxff/HNS
Expand All @@ -32,19 +27,13 @@ spec:
# Where is the event blocked / taking more time
events: "-e REALTIME@100"

# Use a custom container here too (we have for rocky and ubuntu)
# image: ghcr.io/converged-computing/metric-hpctoolkit-view:rocky

# Don't run post analysis - script will still be generated
# postAnalysis: "false"

# hpcrun needs to have mpirun in front of hpcrun <command> e.g.,
# mpirun <MPI args> hpcrun <hpcrun args> <app> <app args>
prefix: mpirun --hostfile ./hostlist.txt -np 4 --map-by socket

# This is for rocky
#prefix: /opt/intel/mpi/2021.8.0/bin/mpirun --hostfile ./hostlist.txt -np 4 --map-by socket

# Ensure the working directory is consistent
workdir: /opt/lammps/examples/reaxff/HNS

Expand Down
10 changes: 9 additions & 1 deletion pkg/addons/addons.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ package addons
import (
"fmt"
"log"
"reflect"

jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"

Expand Down Expand Up @@ -97,10 +98,17 @@ func (b AddonBase) MapOptions() map[string]map[string]intstr.IntOrString {

// GetAddon looks up and validates an addon
func GetAddon(a *api.MetricAddon) (Addon, error) {
addon, ok := Registry[a.Name]

// We don't want to change the addon interface/struct itself
template, ok := Registry[a.Name]
if !ok {
return nil, fmt.Errorf("%s is not a known addon", a.Name)
}
templateType := reflect.ValueOf(template)
if templateType.Kind() == reflect.Ptr {
templateType = reflect.Indirect(templateType)
}
addon := reflect.New(templateType.Type()).Interface().(Addon)

// Set options before validation
addon.SetOptions(a)
Expand Down
24 changes: 16 additions & 8 deletions pkg/metrics/app/amg.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ import (
metrics "github.com/converged-computing/metrics-operator/pkg/metrics"
)

const (
amgIdentifier = "app-amg"
amgSummary = "parallel algebraic multigrid solver for linear systems arising from problems on unstructured grids"
amgContainer = "ghcr.io/converged-computing/metric-amg:latest"
)

// AMG is a launcher + workers metric application
type AMG struct {
metrics.LauncherWorker
Expand All @@ -30,6 +36,12 @@ func (m AMG) Family() string {

// Set custom options / attributes for the metric
func (m *AMG) SetOptions(metric *api.Metric) {

// TODO change these to class varaibles? then set in two places...
m.Identifier = amgIdentifier
m.Summary = amgSummary
m.Container = amgContainer

// Set user defined values or fall back to defaults
m.Prefix = "mpirun --hostfile ./hostlist.txt"
m.Command = "amg"
Expand All @@ -53,15 +65,11 @@ func (m AMG) Options() map[string]intstr.IntOrString {

func init() {
base := metrics.BaseMetric{
Identifier: "app-amg",
Summary: "parallel algebraic multigrid solver for linear systems arising from problems on unstructured grids",
Container: "ghcr.io/converged-computing/metric-amg:latest",
}
launcher := metrics.LauncherWorker{
BaseMetric: base,
WorkerScript: "/metrics_operator/amg-worker.sh",
LauncherScript: "/metrics_operator/amg-launcher.sh",
Identifier: amgIdentifier,
Summary: amgSummary,
Container: amgContainer,
}
launcher := metrics.LauncherWorker{BaseMetric: base}
amg := AMG{LauncherWorker: launcher}
metrics.Register(&amg)
}
17 changes: 14 additions & 3 deletions pkg/metrics/app/bdas.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ import (
"github.com/converged-computing/metrics-operator/pkg/specs"
)

const (
bdasIdentifier = "app-bdas"
bdasSummary = "The big data analytic suite contains the K-Means observation label, PCA, and SVM benchmarks."
bdasContainer = "ghcr.io/converged-computing/metric-bdas:latest"
)

type BDAS struct {
metrics.LauncherWorker
}
Expand All @@ -34,6 +40,11 @@ func (m BDAS) Url() string {
// Set custom options / attributes for the metric
func (m *BDAS) SetOptions(metric *api.Metric) {

// Metadatqa
m.Identifier = bdasIdentifier
m.Summary = bdasSummary
m.Container = bdasContainer

// Set user defined values or fall back to defaults
m.Prefix = "/bin/bash"
m.Command = "mpirun --allow-run-as-root -np 4 --hostfile ./hostlist.txt Rscript /opt/bdas/benchmarks/r/princomp.r 250 50"
Expand Down Expand Up @@ -118,9 +129,9 @@ echo "%s"

func init() {
base := metrics.BaseMetric{
Identifier: "app-bdas",
Summary: "The big data analytic suite contains the K-Means observation label, PCA, and SVM benchmarks.",
Container: "ghcr.io/converged-computing/metric-bdas:latest",
Identifier: bdasIdentifier,
Summary: bdasSummary,
Container: bdasContainer,
}
launcher := metrics.LauncherWorker{BaseMetric: base}
BDAS := BDAS{LauncherWorker: launcher}
Expand Down
16 changes: 13 additions & 3 deletions pkg/metrics/app/hpl.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ import (
// https://www.netlib.org/benchmark/hpl/
// https://ulhpc-tutorials.readthedocs.io/en/production/parallel/mpi/HPL/

const (
hplIdentifier = "app-hpl"
hplSummary = "High-Performance Linpack (HPL)"
hplContainer = "ghcr.io/converged-computing/metric-hpl-spack:latest"
)

// Default input file hpl.dat
// The output of this is Ns, memory is in GiB
// -m 128 -NB 192 -r 0.3 -N 2: translates to --mem 128 -NB ${blocksize} -r 0.3 -N ${pods}
Expand Down Expand Up @@ -122,6 +128,10 @@ func (m *HPL) SetOptions(metric *api.Metric) {
m.ResourceSpec = &metric.Resources
m.AttributeSpec = &metric.Attributes

m.Identifier = hplIdentifier
m.Summary = hplSummary
m.Container = hplContainer

// Defaults for hpl.dat values.
// memory and pods (nodes) calculated on the fly, unless otherwise provided
m.ratio = "0.3"
Expand Down Expand Up @@ -394,9 +404,9 @@ echo "%s"

func init() {
base := metrics.BaseMetric{
Identifier: "app-hpl",
Summary: "High-Performance Linpack (HPL)",
Container: "ghcr.io/converged-computing/metric-hpl-spack:latest",
Identifier: hplIdentifier,
Summary: hplSummary,
Container: hplContainer,
}
launcher := metrics.LauncherWorker{BaseMetric: base}
HPL := HPL{LauncherWorker: launcher}
Expand Down
22 changes: 14 additions & 8 deletions pkg/metrics/app/kripke.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ import (
metrics "github.com/converged-computing/metrics-operator/pkg/metrics"
)

const (
kripkeIdentifier = "app-kripke"
kripkeSummary = "parallel algebraic multigrid solver for linear systems arising from problems on unstructured grids"
kripkeContainer = "ghcr.io/converged-computing/metric-kripke:latest"
)

type Kripke struct {
metrics.LauncherWorker
}
Expand All @@ -30,6 +36,10 @@ func (m Kripke) Family() string {
// Set custom options / attributes for the metric
func (m *Kripke) SetOptions(metric *api.Metric) {

m.Identifier = kripkeIdentifier
m.Summary = kripkeSummary
m.Container = kripkeContainer

// Set user defined values or fall back to defaults
m.Prefix = "mpirun --hostfile ./hostlist.txt"
m.Command = "kripke"
Expand All @@ -56,15 +66,11 @@ func (n Kripke) ListOptions() map[string][]intstr.IntOrString {

func init() {
base := metrics.BaseMetric{
Identifier: "app-kripke",
Summary: "parallel algebraic multigrid solver for linear systems arising from problems on unstructured grids",
Container: "ghcr.io/converged-computing/metric-kripke:latest",
}
launcher := metrics.LauncherWorker{
BaseMetric: base,
WorkerScript: "/metrics_operator/kripke-worker.sh",
LauncherScript: "/metrics_operator/kripke-launcher.sh",
Identifier: kripkeIdentifier,
Summary: kripkeSummary,
Container: kripkeSummary,
}
launcher := metrics.LauncherWorker{BaseMetric: base}
kripke := Kripke{LauncherWorker: launcher}
metrics.Register(&kripke)
}
17 changes: 14 additions & 3 deletions pkg/metrics/app/laghos.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ import (
metrics "github.com/converged-computing/metrics-operator/pkg/metrics"
)

const (
laghosIdentifier = "app-laghos"
laghosSummary = "LAGrangian High-Order Solver"
laghosContainer = "ghcr.io/converged-computing/metric-laghos:latest"
)

type Laghos struct {
metrics.LauncherWorker
}
Expand All @@ -29,6 +35,11 @@ func (m Laghos) Url() string {

// Set custom options / attributes for the metric
func (m *Laghos) SetOptions(metric *api.Metric) {

m.Identifier = laghosIdentifier
m.Summary = laghosSummary
m.Container = laghosSummary

// Set user defined values or fall back to defaults
m.Prefix = "/bin/bash"
m.Command = "mpirun -np 4 --hostfile ./hostlist.txt ./laghos"
Expand All @@ -47,9 +58,9 @@ func (m Laghos) Options() map[string]intstr.IntOrString {

func init() {
base := metrics.BaseMetric{
Identifier: "app-laghos",
Summary: "LAGrangian High-Order Solver",
Container: "ghcr.io/converged-computing/metric-laghos:latest",
Identifier: laghosIdentifier,
Summary: laghosSummary,
Container: laghosContainer,
}
launcher := metrics.LauncherWorker{BaseMetric: base}
Laghos := Laghos{LauncherWorker: launcher}
Expand Down
25 changes: 17 additions & 8 deletions pkg/metrics/app/lammps.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ import (
"github.com/converged-computing/metrics-operator/pkg/specs"
)

const (
lammpsIdentifier = "app-lammps"
lammpsSummary = "LAMMPS molecular dynamic simulation"
lammpsContainer = "ghcr.io/converged-computing/metric-lammps:latest"
)

type Lammps struct {
metrics.LauncherWorker
}
Expand All @@ -33,6 +39,12 @@ func (m Lammps) Family() string {

// Set custom options / attributes for the metric
func (m *Lammps) SetOptions(metric *api.Metric) {

// Default metric options, these are overridden when we reflect
m.Identifier = lammpsIdentifier
m.Summary = lammpsSummary
m.Container = lammpsContainer

// Set user defined values or fall back to defaults
// This is a more manual approach that puts the user in charge of determining the entire command
// This more closely matches what we might do on HPC :)
Expand Down Expand Up @@ -109,17 +121,14 @@ echo "%s"
return []*specs.ContainerSpec{&launcherContainer, &workerContainer}
}

// TODO can we have a new function instead?
func init() {
base := metrics.BaseMetric{
Identifier: "app-lammps",
Summary: "LAMMPS molecular dynamic simulation",
Container: "ghcr.io/converged-computing/metric-lammps:latest",
}
launcher := metrics.LauncherWorker{
BaseMetric: base,
WorkerScript: "/metrics_operator/lammps-worker.sh",
LauncherScript: "/metrics_operator/lammps-launcher.sh",
Identifier: lammpsIdentifier,
Summary: lammpsSummary,
Container: lammpsContainer,
}
launcher := metrics.LauncherWorker{BaseMetric: base}
lammps := Lammps{LauncherWorker: launcher}
metrics.Register(&lammps)
}
Loading

0 comments on commit 57279d5

Please sign in to comment.