diff --git a/README.md b/README.md index 9cc0420..2327c6f 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ To learn more: ## Dinosaur TODO - Document and automate docs for addons (options, etc.) -- Addons likely needs to be a list to support > 1 of one type! Then subsequent changes so it's not 1:1 +- Figure out issue with errors.IsNotFound not working... - We need a way for the entrypoint command to monitor (based on the container) to differ (potentially) - For larger metric collections, we should have a log streaming mode (and not wait for Completed/Successful) diff --git a/controllers/metric/configmap.go b/controllers/metric/configmap.go index 5297452..8555808 100644 --- a/controllers/metric/configmap.go +++ b/controllers/metric/configmap.go @@ -51,6 +51,7 @@ func (r *MetricSetReconciler) ensureConfigMaps( // Go through each container spec entrypoint for _, cs := range containerSpecs { + r.Log.Info("⬜️ ConfigMaps", "Name", cs.EntrypointScript.Name, "Writing", cs) data[cs.EntrypointScript.Name] = cs.EntrypointScript.WriteScript() } diff --git a/docs/_static/data/metrics.json b/docs/_static/data/metrics.json index 97a180b..b5250ba 100644 --- a/docs/_static/data/metrics.json +++ b/docs/_static/data/metrics.json @@ -1,9 +1,9 @@ [ { - "name": "", - "description": "", + "name": "app-amg", + "description": "parallel algebraic multigrid solver for linear systems arising from problems on unstructured grids", "family": "solver", - "image": "", + "image": "ghcr.io/converged-computing/metric-amg:latest", "url": "https://github.com/LLNL/AMG" }, { diff --git a/pkg/addons/addons.go b/pkg/addons/addons.go index 369957c..f28eee3 100644 --- a/pkg/addons/addons.go +++ b/pkg/addons/addons.go @@ -38,7 +38,7 @@ type Addon interface { MapOptions() map[string]map[string]intstr.IntOrString // What addons can control: - AssembleVolumes() specs.VolumeSpec + AssembleVolumes() []specs.VolumeSpec AssembleContainers() []specs.ContainerSpec CustomizeEntrypoints([]*specs.ContainerSpec, []*jobset.ReplicatedJob) @@ -70,8 +70,8 @@ func (b AddonBase) AssembleContainers() []specs.ContainerSpec { } // Assemble Volumes (for now) just generates one -func (b AddonBase) AssembleVolumes() specs.VolumeSpec { - return specs.VolumeSpec{} +func (b AddonBase) AssembleVolumes() []specs.VolumeSpec { + return []specs.VolumeSpec{} } func (b AddonBase) Description() string { diff --git a/pkg/addons/containers.go b/pkg/addons/containers.go index 907fc49..05746ff 100644 --- a/pkg/addons/containers.go +++ b/pkg/addons/containers.go @@ -100,7 +100,7 @@ func (a *ApplicationAddon) SetDefaultOptions(metric *api.MetricAddon) { if ok { a.pullSecret = pullSecret.StrVal } - workdir, ok := metric.Options["workingDir"] + workdir, ok := metric.Options["workdir"] if ok { a.workingDir = workdir.StrVal } @@ -143,7 +143,7 @@ func (a *ApplicationAddon) SetOptions(metric *api.MetricAddon) { func (a *ApplicationAddon) DefaultOptions() map[string]intstr.IntOrString { values := map[string]intstr.IntOrString{ "image": intstr.FromString(a.image), - "workingDir": intstr.FromString(a.workingDir), + "workdir": intstr.FromString(a.workingDir), "entrypoint": intstr.FromString(a.entrypoint), "command": intstr.FromString(a.command), } diff --git a/pkg/addons/hpctoolkit.go b/pkg/addons/hpctoolkit.go index 08bde58..9f7593b 100644 --- a/pkg/addons/hpctoolkit.go +++ b/pkg/addons/hpctoolkit.go @@ -25,24 +25,65 @@ type HPCToolkit struct { ApplicationAddon // Target is the name of the replicated job to customize entrypoint logic for - target string - events string - mount string - entrypointPath string + target string + + // ContainerTarget is the name of the container to add the entrypoint logic to + containerTarget string + events string + mount string + entrypointPath string + volumeName string } // AssembleVolumes to provide an empty volume for the application to share -func (m HPCToolkit) AssembleVolumes() specs.VolumeSpec { +// We also need to provide a config map volume for our container spec +func (m HPCToolkit) AssembleVolumes() []specs.VolumeSpec { volume := corev1.Volume{ - Name: "hpctoolkit", + Name: m.volumeName, VolumeSource: corev1.VolumeSource{ EmptyDir: &corev1.EmptyDirVolumeSource{}, }, } + + // Prepare items as key to path + items := []corev1.KeyToPath{ + { + Key: m.volumeName, + Path: filepath.Base(m.entrypointPath), + }, + } + + // This is a config map volume with items + // It needs to be created in the same metrics operator namespace + // We need a better way to define this, I'm not happy with it. + // There should just be some variables under the volumespec + newVolume := corev1.Volume{ + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: m.volumeName, + }, + Items: items, + }, + }, + } + // EmptyDir should be ReadOnly False, and we don't need a mount for it - return specs.VolumeSpec{ - Volume: volume, - Mount: false, + return []specs.VolumeSpec{ + { + Volume: volume, + Mount: true, + Path: m.mount, + }, + + // Mount is set to false here because we mount via metrics_operator + // This is a bit messy (I'm not happy) but I'll make it better + { + Volume: newVolume, + ReadOnly: true, + Mount: false, + Path: filepath.Dir(m.entrypointPath), + }, } } @@ -52,14 +93,6 @@ func (a *HPCToolkit) Validate() bool { logger.Error("The HPCtoolkit application addon requires one or more 'events' for hpcrun (e.g., -e IO).") return false } - if a.image == "" { - logger.Error("The application addon requires a container 'image'.") - return false - } - if a.command == "" { - logger.Error("The application addon requires a container 'command'.") - return false - } return true } @@ -70,12 +103,25 @@ func (a *HPCToolkit) SetOptions(metric *api.MetricAddon) { a.image = "ghcr.io/converged-computing/metric-hpctoolkit-view:latest" a.SetDefaultOptions(metric) a.mount = "/opt/share" + a.volumeName = "hpctoolkit" // UseColor set to anything means to use it mount, ok := metric.Options["mount"] if ok { a.mount = mount.StrVal } + workdir, ok := metric.Options["workdir"] + if ok { + a.workingDir = workdir.StrVal + } + target, ok := metric.Options["target"] + if ok { + a.target = target.StrVal + } + ctarget, ok := metric.Options["containerTarget"] + if ok { + a.containerTarget = ctarget.StrVal + } events, ok := metric.Options["events"] if ok { a.events = events.StrVal @@ -83,7 +129,7 @@ func (a *HPCToolkit) SetOptions(metric *api.MetricAddon) { } // Exported options and list options -func (a HPCToolkit) Options() map[string]intstr.IntOrString { +func (a *HPCToolkit) Options() map[string]intstr.IntOrString { options := a.DefaultOptions() options["events"] = intstr.FromString(a.events) options["mount"] = intstr.FromString(a.mount) @@ -127,7 +173,9 @@ mv ./wait-fs /usr/bin/goshare-wait-fs viewbase="%s" software="${viewbase}/software" viewbin="${viewbase}/view/bin" -export PATH=${viewbin}:$PATH + +# Important to add AFTER in case software in container duplicated +export PATH=$PATH:${viewbin} # Wait for software directory, and give it time goshare-wait-fs -p ${software} @@ -154,15 +202,19 @@ echo "%s" # hpcprof hpctoolkit-sleep-measurements # hpcstruct hpctoolkit-sleep-measurements # hpcviewer ./hpctoolkit-lmp-database +workdir="%s" +echo "Changing directory to ${workdir}" +cd ${workdir} ` preBlock = fmt.Sprintf( - meta, preBlock, + meta, a.mount, a.mount, a.events, metadata.CollectionStart, metadata.Separator, + a.workingDir, ) // TODO we may want to target specific entrypoint scripts here @@ -173,7 +225,12 @@ echo "%s" if containerSpec.JobName != rj.Name { continue } - containerSpec.EntrypointScript.Pre = "\n" + preBlock + + // Next check if we have a target set (for the container) + if a.containerTarget != "" && containerSpec.Name != "" && a.containerTarget != containerSpec.Name { + continue + } + containerSpec.EntrypointScript.Pre += "\n" + preBlock containerSpec.EntrypointScript.Command = fmt.Sprintf("hpcrun $events %s", containerSpec.EntrypointScript.Command) } } @@ -212,6 +269,7 @@ sleep infinity // Leave the name empty to generate in the namespace of the metric set (e.g., set.Name) entrypoint := specs.EntrypointScript{ + Name: a.volumeName, Path: a.entrypointPath, Script: filepath.Base(a.entrypointPath), Pre: script, @@ -232,6 +290,8 @@ sleep infinity Privileged: a.privileged, }, }, + // We need to write this config map! + NeedsWrite: true, }, } } diff --git a/pkg/addons/volumes.go b/pkg/addons/volumes.go index 77dbb8d..5d6b6a4 100644 --- a/pkg/addons/volumes.go +++ b/pkg/addons/volumes.go @@ -133,7 +133,7 @@ func (v *ConfigMapVolume) MapOptions() map[string]map[string]intstr.IntOrString } // AssembleVolumes for a config map -func (v *ConfigMapVolume) AssembleVolumes() specs.VolumeSpec { +func (v *ConfigMapVolume) AssembleVolumes() []specs.VolumeSpec { // Prepare items as key to path items := []corev1.KeyToPath{} @@ -159,12 +159,12 @@ func (v *ConfigMapVolume) AssembleVolumes() specs.VolumeSpec { } // ConfigMaps have to be read only! - return specs.VolumeSpec{ + return []specs.VolumeSpec{{ Volume: newVolume, Path: filepath.Dir(v.path), ReadOnly: true, Mount: true, - } + }} } // An existing peristent volume claim @@ -194,7 +194,7 @@ func (v *PersistentVolumeClaim) SetOptions(metric *api.MetricAddon) { } // AssembleVolumes for a pvc -func (v *PersistentVolumeClaim) AssembleVolumes() specs.VolumeSpec { +func (v *PersistentVolumeClaim) AssembleVolumes() []specs.VolumeSpec { volume := corev1.Volume{ Name: v.name, VolumeSource: corev1.VolumeSource{ @@ -205,12 +205,12 @@ func (v *PersistentVolumeClaim) AssembleVolumes() specs.VolumeSpec { } // ConfigMaps have to be read only! - return specs.VolumeSpec{ + return []specs.VolumeSpec{{ Volume: volume, Path: filepath.Dir(v.path), ReadOnly: v.readOnly, Mount: true, - } + }} } // An existing secret @@ -240,7 +240,7 @@ func (v *SecretVolume) SetOptions(metric *api.MetricAddon) { } // AssembleVolumes for a Secret -func (v *SecretVolume) AssembleVolumes() specs.VolumeSpec { +func (v *SecretVolume) AssembleVolumes() []specs.VolumeSpec { volume := corev1.Volume{ Name: v.name, VolumeSource: corev1.VolumeSource{ @@ -249,12 +249,12 @@ func (v *SecretVolume) AssembleVolumes() specs.VolumeSpec { }, }, } - return specs.VolumeSpec{ + return []specs.VolumeSpec{{ Volume: volume, ReadOnly: v.readOnly, Path: v.path, Mount: true, - } + }} } // A hostPath volume @@ -286,7 +286,7 @@ func (v *HostPathVolume) SetOptions(metric *api.MetricAddon) { } // AssembleVolumes for a host volume -func (v *HostPathVolume) AssembleVolumes() specs.VolumeSpec { +func (v *HostPathVolume) AssembleVolumes() []specs.VolumeSpec { volume := corev1.Volume{ Name: v.name, VolumeSource: corev1.VolumeSource{ @@ -295,12 +295,12 @@ func (v *HostPathVolume) AssembleVolumes() specs.VolumeSpec { }, }, } - return specs.VolumeSpec{ + return []specs.VolumeSpec{{ Volume: volume, Mount: true, Path: v.path, ReadOnly: v.readOnly, - } + }} } // An empty volume requires nothing! Nice! @@ -322,19 +322,19 @@ func (v *EmptyVolume) SetOptions(metric *api.MetricAddon) { } // AssembleVolumes for an empty volume -func (v *EmptyVolume) AssembleVolumes() specs.VolumeSpec { +func (v *EmptyVolume) AssembleVolumes() []specs.VolumeSpec { volume := corev1.Volume{ Name: v.name, VolumeSource: corev1.VolumeSource{ EmptyDir: &corev1.EmptyDirVolumeSource{}, }, } - return specs.VolumeSpec{ + return []specs.VolumeSpec{{ Volume: volume, Mount: true, Path: v.path, ReadOnly: v.readOnly, - } + }} } // TODO likely we need to carry around entrypoints to customize? diff --git a/pkg/metrics/app/amg.go b/pkg/metrics/app/amg.go index ed43134..5b549d8 100644 --- a/pkg/metrics/app/amg.go +++ b/pkg/metrics/app/amg.go @@ -60,10 +60,13 @@ func (m AMG) Options() map[string]intstr.IntOrString { } func init() { + base := metrics.BaseMetric{ + Identifier: "app-amg", + Summary: "parallel algebraic multigrid solver for linear systems arising from problems on unstructured grids", + Container: "ghcr.io/converged-computing/metric-amg:latest", + } launcher := metrics.LauncherWorker{ - Identifier: "app-amg", - Summary: "parallel algebraic multigrid solver for linear systems arising from problems on unstructured grids", - Container: "ghcr.io/converged-computing/metric-amg:latest", + BaseMetric: base, WorkerScript: "/metrics_operator/amg-worker.sh", LauncherScript: "/metrics_operator/amg-launcher.sh", } diff --git a/pkg/metrics/app/lammps.go b/pkg/metrics/app/lammps.go index 3c98c2e..e5d6044 100644 --- a/pkg/metrics/app/lammps.go +++ b/pkg/metrics/app/lammps.go @@ -8,10 +8,14 @@ SPDX-License-Identifier: MIT package application import ( + "fmt" + api "github.com/converged-computing/metrics-operator/api/v1alpha1" "k8s.io/apimachinery/pkg/util/intstr" + "github.com/converged-computing/metrics-operator/pkg/metadata" metrics "github.com/converged-computing/metrics-operator/pkg/metrics" + "github.com/converged-computing/metrics-operator/pkg/specs" ) type Lammps struct { @@ -64,6 +68,59 @@ func (m Lammps) Options() map[string]intstr.IntOrString { "workdir": intstr.FromString(m.Workdir), } } +func (n Lammps) ListOptions() map[string][]intstr.IntOrString { + return map[string][]intstr.IntOrString{} +} + +// Prepare containers with jobs and entrypoint scripts +func (m Lammps) PrepareContainers( + spec *api.MetricSet, + metric *metrics.Metric, +) []*specs.ContainerSpec { + + // Metadata to add to beginning of run + meta := metrics.Metadata(spec, metric) + hosts := m.GetHostlist(spec) + prefix := m.GetCommonPrefix(meta, m.command, hosts) + + // Template blocks for launcher script + preBlock := ` +echo "%s" +` + + postBlock := ` +echo "%s" +%s +` + interactive := metadata.Interactive(spec.Spec.Logging.Interactive) + preBlock = prefix + fmt.Sprintf(preBlock, metadata.Separator) + postBlock = fmt.Sprintf(postBlock, metadata.CollectionEnd, interactive) + + // Entrypoint for the launcher + launcherEntrypoint := specs.EntrypointScript{ + Name: specs.DeriveScriptKey(m.LauncherScript), + Path: m.LauncherScript, + Pre: preBlock, + Command: m.command, + Post: postBlock, + } + + // Entrypoint for the worker + // Just has a sleep infinity added to the prefix + workerEntrypoint := specs.EntrypointScript{ + Name: specs.DeriveScriptKey(m.WorkerScript), + Path: m.WorkerScript, + Pre: prefix, + Command: "sleep infinity", + } + + // These are associated with replicated jobs via JobName + launcherContainer := m.GetLauncherContainerSpec(launcherEntrypoint) + workerContainer := m.GetWorkerContainerSpec(workerEntrypoint) + + // Return the script templates for each of launcher and worker + return []*specs.ContainerSpec{&launcherContainer, &workerContainer} +} func init() { base := metrics.BaseMetric{ diff --git a/pkg/metrics/base.go b/pkg/metrics/base.go index e07b190..99e03d7 100644 --- a/pkg/metrics/base.go +++ b/pkg/metrics/base.go @@ -118,33 +118,43 @@ func (m BaseMetric) SetDefaultOptions(metric *api.Metric) { } // Add registered addons to replicated jobs +// Container specs returned are assumed to be config maps that need to be written func (m BaseMetric) AddAddons( spec *api.MetricSet, rjs []*jobset.ReplicatedJob, // These container specs include all replicated jobs - containerSpecs []*specs.ContainerSpec) error { + containerSpecs []*specs.ContainerSpec, +) ([]*specs.ContainerSpec, error) { // VolumeMounts can be generated from container specs // For each addon, do custom logic depending on the type // These are the main set of volumes, containers we are going to add // Organize volumes by unique name - volumes := map[string]specs.VolumeSpec{} + volumes := []specs.VolumeSpec{} // These are addon container specs addonContainers := []specs.ContainerSpec{} + // These are container specs that need to be written to configmaps + cms := []*specs.ContainerSpec{} + logger.Infof("🟧️ Addons to include %s\n", m.Addons) for _, addon := range m.Addons { a := (*addon) - assembledVolume := a.AssembleVolumes() - if assembledVolume.Volume.Name != "" { - volumes[a.Name()] = assembledVolume - } + volumes = append(volumes, a.AssembleVolumes()...) // Assemble containers that addons provide, also as specs - addonContainers = append(addonContainers, a.AssembleContainers()...) + assembleContainers := a.AssembleContainers() + for _, assembleContainer := range assembleContainers { + + // Any container specs that need to be created later as config maps are kept in cms + if assembleContainer.NeedsWrite { + cms = append(cms, &assembleContainer) + } + addonContainers = append(addonContainers, assembleContainer) + } // Allow the addons to customize the container entrypoints, specific to the job name // It's important that this set does not include other addon container specs @@ -153,10 +163,6 @@ func (m BaseMetric) AddAddons( // There is a bug here showing lots of nil but I don't know why logger.Infof("🟧️ Volumes that are going to be added %s\n", volumes) - listing := []specs.VolumeSpec{} - for _, volume := range volumes { - listing = append(listing, volume) - } // Add containers to the replicated job (filtered based on matching names) containers := addonContainers @@ -168,18 +174,18 @@ func (m BaseMetric) AddAddons( for _, rj := range rjs { // We also include the addon volumes, which generally need mount points - rjContainers, err := getReplicatedJobContainers(spec, rj, containers, listing) + rjContainers, err := getReplicatedJobContainers(spec, rj, containers, volumes) if err != nil { - return err + return cms, err } rj.Template.Spec.Template.Spec.Containers = rjContainers // And volumes! // containerSpecs are used to generate our metric entrypoint volumes // volumes indicate existing volumes - rj.Template.Spec.Template.Spec.Volumes = getReplicatedJobVolumes(spec, containerSpecs, listing) + rj.Template.Spec.Template.Spec.Volumes = getReplicatedJobVolumes(spec, containerSpecs, volumes) } - return nil + return cms, nil } // Addons returns a list of addons, removing them from the key value lookup diff --git a/pkg/metrics/jobset.go b/pkg/metrics/jobset.go index 79f60ee..c479fea 100644 --- a/pkg/metrics/jobset.go +++ b/pkg/metrics/jobset.go @@ -67,15 +67,16 @@ func GetJobSet( // Prepare container and volume specs (that are changeable) e.g., // 1. Create VolumeSpec across metrics and addons that can predefine volumes // 2. Create ContainerSpec across metrics that can predefine containers, entrypoints, volumes - err = m.AddAddons(spec, jobs, cs) + // 3. Container specs (cms) returned are expected to be config maps that need to be written + cms, err := m.AddAddons(spec, jobs, cs) if err != nil { return js, containerSpecs, err } // Add the finalized container specs for the entire set of replicated jobs // We need this at the end to hand back to generate config maps - // TODO if containers are specific to jobs, maybe need to have based on key... containerSpecs = append(containerSpecs, cs...) + containerSpecs = append(containerSpecs, cms...) // Add the final set of jobs (bad decision for the pointer here, oops) for _, job := range jobs { diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 134e24f..acb0123 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -43,7 +43,7 @@ type Metric interface { // Validation and append addons Validate(*api.MetricSet) bool RegisterAddon(*addons.Addon) - AddAddons(*api.MetricSet, []*jobset.ReplicatedJob, []*specs.ContainerSpec) error + AddAddons(*api.MetricSet, []*jobset.ReplicatedJob, []*specs.ContainerSpec) ([]*specs.ContainerSpec, error) GetAddons() []*addons.Addon // Attributes for JobSet, etc. diff --git a/pkg/metrics/volumes.go b/pkg/metrics/volumes.go index 7144cfa..a00770b 100644 --- a/pkg/metrics/volumes.go +++ b/pkg/metrics/volumes.go @@ -71,6 +71,26 @@ func generateOperatorItems(containerSpecs []*specs.ContainerSpec) []corev1.KeyTo return runnerScripts } +// Add extra config maps to the metrics_operator set from addons +// These are distinct because the operator needs to create them too +func getExtraConfigmaps(volumes []specs.VolumeSpec) []corev1.KeyToPath { + + // Each metric has an entrypoint script + runnerScripts := []corev1.KeyToPath{} + + for _, addedVolume := range volumes { + + // TODO need to type check here + // This will error if it's not a config map :) + if addedVolume.Volume.Name == "" { + for _, item := range addedVolume.Volume.ConfigMap.Items { + runnerScripts = append(runnerScripts, item) + } + } + } + return runnerScripts +} + // getVolumes adds expected entrypoints along with added volumes (storage or applications) // This function is intended for a set with a listing of metrics func getReplicatedJobVolumes( @@ -81,6 +101,11 @@ func getReplicatedJobVolumes( // These are for the main entrypoints in /metrics_operator runnerScripts := generateOperatorItems(cs) + + // Any volumes that don't have a Name in added we need to generate under the operator + extraCMs := getExtraConfigmaps(addedVolumes) + runnerScripts = append(runnerScripts, extraCMs...) + volumes := []corev1.Volume{ { Name: set.Name, @@ -105,6 +130,10 @@ func getReplicatedJobVolumes( func getAddonVolumes(vs []specs.VolumeSpec) []corev1.Volume { volumes := []corev1.Volume{} for _, volume := range vs { + // If the volume doesn't have a name, it was added to the metrics_operator namespace + if volume.Volume.Name == "" { + continue + } logger.Infof("Adding volume %s\n", &volume.Volume) volumes = append(volumes, volume.Volume) } diff --git a/pkg/specs/specs.go b/pkg/specs/specs.go index 026f0e2..6baafc2 100644 --- a/pkg/specs/specs.go +++ b/pkg/specs/specs.go @@ -31,6 +31,9 @@ type ContainerSpec struct { // If a command is provided, it's likely an addon (and EntrypointScript is ignored) Command []string + // Does the Container spec need to be written to our set of config maps? + NeedsWrite bool + Resources *api.ContainerResources Attributes *api.ContainerSpec }