Skip to content

Commit

Permalink
hpctoolkit design at least works
Browse files Browse the repository at this point in the history
but shared libraries are failing to load. HPCToolkit
you are a jerk. I am laughing. And crying. And mostly
crying.

Signed-off-by: vsoch <[email protected]>
  • Loading branch information
vsoch committed Sep 21, 2023
1 parent f144bb3 commit dfdc79b
Show file tree
Hide file tree
Showing 14 changed files with 226 additions and 66 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ To learn more:
## Dinosaur TODO

- Document and automate docs for addons (options, etc.)
- Addons likely need to be a list to support more than one of the same type! Then make the subsequent changes so the mapping is not 1:1
- Figure out issue with errors.IsNotFound not working...

- We need a way for the entrypoint command to monitor (based on the container) to differ (potentially)
- For larger metric collections, we should have a log streaming mode (and not wait for Completed/Successful)
Expand Down
1 change: 1 addition & 0 deletions controllers/metric/configmap.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ func (r *MetricSetReconciler) ensureConfigMaps(

// Go through each container spec entrypoint
for _, cs := range containerSpecs {
r.Log.Info("⬜️ ConfigMaps", "Name", cs.EntrypointScript.Name, "Writing", cs)
data[cs.EntrypointScript.Name] = cs.EntrypointScript.WriteScript()
}

Expand Down
6 changes: 3 additions & 3 deletions docs/_static/data/metrics.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
[
{
"name": "",
"description": "",
"name": "app-amg",
"description": "parallel algebraic multigrid solver for linear systems arising from problems on unstructured grids",
"family": "solver",
"image": "",
"image": "ghcr.io/converged-computing/metric-amg:latest",
"url": "https://github.com/LLNL/AMG"
},
{
Expand Down
6 changes: 3 additions & 3 deletions pkg/addons/addons.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ type Addon interface {
MapOptions() map[string]map[string]intstr.IntOrString

// What addons can control:
AssembleVolumes() specs.VolumeSpec
AssembleVolumes() []specs.VolumeSpec
AssembleContainers() []specs.ContainerSpec
CustomizeEntrypoints([]*specs.ContainerSpec, []*jobset.ReplicatedJob)

Expand Down Expand Up @@ -70,8 +70,8 @@ func (b AddonBase) AssembleContainers() []specs.ContainerSpec {
}

// AssembleVolumes returns no volumes by default (an empty list)
func (b AddonBase) AssembleVolumes() specs.VolumeSpec {
return specs.VolumeSpec{}
func (b AddonBase) AssembleVolumes() []specs.VolumeSpec {
return []specs.VolumeSpec{}
}

func (b AddonBase) Description() string {
Expand Down
4 changes: 2 additions & 2 deletions pkg/addons/containers.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ func (a *ApplicationAddon) SetDefaultOptions(metric *api.MetricAddon) {
if ok {
a.pullSecret = pullSecret.StrVal
}
workdir, ok := metric.Options["workingDir"]
workdir, ok := metric.Options["workdir"]
if ok {
a.workingDir = workdir.StrVal
}
Expand Down Expand Up @@ -143,7 +143,7 @@ func (a *ApplicationAddon) SetOptions(metric *api.MetricAddon) {
func (a *ApplicationAddon) DefaultOptions() map[string]intstr.IntOrString {
values := map[string]intstr.IntOrString{
"image": intstr.FromString(a.image),
"workingDir": intstr.FromString(a.workingDir),
"workdir": intstr.FromString(a.workingDir),
"entrypoint": intstr.FromString(a.entrypoint),
"command": intstr.FromString(a.command),
}
Expand Down
102 changes: 81 additions & 21 deletions pkg/addons/hpctoolkit.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,65 @@ type HPCToolkit struct {
ApplicationAddon

// Target is the name of the replicated job to customize entrypoint logic for
target string
events string
mount string
entrypointPath string
target string

// ContainerTarget is the name of the container to add the entrypoint logic to
containerTarget string
events string
mount string
entrypointPath string
volumeName string
}

// AssembleVolumes to provide an empty volume for the application to share
func (m HPCToolkit) AssembleVolumes() specs.VolumeSpec {
// We also need to provide a config map volume for our container spec
func (m HPCToolkit) AssembleVolumes() []specs.VolumeSpec {
volume := corev1.Volume{
Name: "hpctoolkit",
Name: m.volumeName,
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{},
},
}

// Prepare items as key to path
items := []corev1.KeyToPath{
{
Key: m.volumeName,
Path: filepath.Base(m.entrypointPath),
},
}

// This is a config map volume with items
// It needs to be created in the same metrics operator namespace
// We need a better way to define this, I'm not happy with it.
// There should just be some variables under the volumespec
newVolume := corev1.Volume{
VolumeSource: corev1.VolumeSource{
ConfigMap: &corev1.ConfigMapVolumeSource{
LocalObjectReference: corev1.LocalObjectReference{
Name: m.volumeName,
},
Items: items,
},
},
}

// EmptyDir should be ReadOnly False, and it is mounted at the addon mount path (m.mount)
return specs.VolumeSpec{
Volume: volume,
Mount: false,
return []specs.VolumeSpec{
{
Volume: volume,
Mount: true,
Path: m.mount,
},

// Mount is set to false here because we mount via metrics_operator
// This is a bit messy (I'm not happy) but I'll make it better
{
Volume: newVolume,
ReadOnly: true,
Mount: false,
Path: filepath.Dir(m.entrypointPath),
},
}
}

Expand All @@ -52,14 +93,6 @@ func (a *HPCToolkit) Validate() bool {
logger.Error("The HPCtoolkit application addon requires one or more 'events' for hpcrun (e.g., -e IO).")
return false
}
if a.image == "" {
logger.Error("The application addon requires a container 'image'.")
return false
}
if a.command == "" {
logger.Error("The application addon requires a container 'command'.")
return false
}
return true
}

Expand All @@ -70,20 +103,33 @@ func (a *HPCToolkit) SetOptions(metric *api.MetricAddon) {
a.image = "ghcr.io/converged-computing/metric-hpctoolkit-view:latest"
a.SetDefaultOptions(metric)
a.mount = "/opt/share"
a.volumeName = "hpctoolkit"

// Options provided by the user override the defaults set above
mount, ok := metric.Options["mount"]
if ok {
a.mount = mount.StrVal
}
workdir, ok := metric.Options["workdir"]
if ok {
a.workingDir = workdir.StrVal
}
target, ok := metric.Options["target"]
if ok {
a.target = target.StrVal
}
ctarget, ok := metric.Options["containerTarget"]
if ok {
a.containerTarget = ctarget.StrVal
}
events, ok := metric.Options["events"]
if ok {
a.events = events.StrVal
}
}

// Exported options and list options
func (a HPCToolkit) Options() map[string]intstr.IntOrString {
func (a *HPCToolkit) Options() map[string]intstr.IntOrString {
options := a.DefaultOptions()
options["events"] = intstr.FromString(a.events)
options["mount"] = intstr.FromString(a.mount)
Expand Down Expand Up @@ -127,7 +173,9 @@ mv ./wait-fs /usr/bin/goshare-wait-fs
viewbase="%s"
software="${viewbase}/software"
viewbin="${viewbase}/view/bin"
export PATH=${viewbin}:$PATH
# Important to add AFTER in case software in container duplicated
export PATH=$PATH:${viewbin}
# Wait for software directory, and give it time
goshare-wait-fs -p ${software}
Expand All @@ -154,15 +202,19 @@ echo "%s"
# hpcprof hpctoolkit-sleep-measurements
# hpcstruct hpctoolkit-sleep-measurements
# hpcviewer ./hpctoolkit-lmp-database
workdir="%s"
echo "Changing directory to ${workdir}"
cd ${workdir}
`
preBlock = fmt.Sprintf(
meta,
preBlock,
meta,
a.mount,
a.mount,
a.events,
metadata.CollectionStart,
metadata.Separator,
a.workingDir,
)

// TODO we may want to target specific entrypoint scripts here
Expand All @@ -173,7 +225,12 @@ echo "%s"
if containerSpec.JobName != rj.Name {
continue
}
containerSpec.EntrypointScript.Pre = "\n" + preBlock

// Next check if we have a target set (for the container)
if a.containerTarget != "" && containerSpec.Name != "" && a.containerTarget != containerSpec.Name {
continue
}
containerSpec.EntrypointScript.Pre += "\n" + preBlock
containerSpec.EntrypointScript.Command = fmt.Sprintf("hpcrun $events %s", containerSpec.EntrypointScript.Command)
}
}
Expand Down Expand Up @@ -212,6 +269,7 @@ sleep infinity

// Name the entrypoint after the volume so the config map entry matches the key in the volume items
entrypoint := specs.EntrypointScript{
Name: a.volumeName,
Path: a.entrypointPath,
Script: filepath.Base(a.entrypointPath),
Pre: script,
Expand All @@ -232,6 +290,8 @@ sleep infinity
Privileged: a.privileged,
},
},
// We need to write this config map!
NeedsWrite: true,
},
}
}
Expand Down
30 changes: 15 additions & 15 deletions pkg/addons/volumes.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ func (v *ConfigMapVolume) MapOptions() map[string]map[string]intstr.IntOrString
}

// AssembleVolumes for a config map
func (v *ConfigMapVolume) AssembleVolumes() specs.VolumeSpec {
func (v *ConfigMapVolume) AssembleVolumes() []specs.VolumeSpec {

// Prepare items as key to path
items := []corev1.KeyToPath{}
Expand All @@ -159,12 +159,12 @@ func (v *ConfigMapVolume) AssembleVolumes() specs.VolumeSpec {
}

// ConfigMaps have to be read only!
return specs.VolumeSpec{
return []specs.VolumeSpec{{
Volume: newVolume,
Path: filepath.Dir(v.path),
ReadOnly: true,
Mount: true,
}
}}
}

// An existing persistent volume claim
Expand Down Expand Up @@ -194,7 +194,7 @@ func (v *PersistentVolumeClaim) SetOptions(metric *api.MetricAddon) {
}

// AssembleVolumes for a pvc
func (v *PersistentVolumeClaim) AssembleVolumes() specs.VolumeSpec {
func (v *PersistentVolumeClaim) AssembleVolumes() []specs.VolumeSpec {
volume := corev1.Volume{
Name: v.name,
VolumeSource: corev1.VolumeSource{
Expand All @@ -205,12 +205,12 @@ func (v *PersistentVolumeClaim) AssembleVolumes() specs.VolumeSpec {
}

// A persistent volume claim uses the readOnly setting provided in the options
return specs.VolumeSpec{
return []specs.VolumeSpec{{
Volume: volume,
Path: filepath.Dir(v.path),
ReadOnly: v.readOnly,
Mount: true,
}
}}
}

// An existing secret
Expand Down Expand Up @@ -240,7 +240,7 @@ func (v *SecretVolume) SetOptions(metric *api.MetricAddon) {
}

// AssembleVolumes for a Secret
func (v *SecretVolume) AssembleVolumes() specs.VolumeSpec {
func (v *SecretVolume) AssembleVolumes() []specs.VolumeSpec {
volume := corev1.Volume{
Name: v.name,
VolumeSource: corev1.VolumeSource{
Expand All @@ -249,12 +249,12 @@ func (v *SecretVolume) AssembleVolumes() specs.VolumeSpec {
},
},
}
return specs.VolumeSpec{
return []specs.VolumeSpec{{
Volume: volume,
ReadOnly: v.readOnly,
Path: v.path,
Mount: true,
}
}}
}

// A hostPath volume
Expand Down Expand Up @@ -286,7 +286,7 @@ func (v *HostPathVolume) SetOptions(metric *api.MetricAddon) {
}

// AssembleVolumes for a host volume
func (v *HostPathVolume) AssembleVolumes() specs.VolumeSpec {
func (v *HostPathVolume) AssembleVolumes() []specs.VolumeSpec {
volume := corev1.Volume{
Name: v.name,
VolumeSource: corev1.VolumeSource{
Expand All @@ -295,12 +295,12 @@ func (v *HostPathVolume) AssembleVolumes() specs.VolumeSpec {
},
},
}
return specs.VolumeSpec{
return []specs.VolumeSpec{{
Volume: volume,
Mount: true,
Path: v.path,
ReadOnly: v.readOnly,
}
}}
}

// An empty volume requires nothing! Nice!
Expand All @@ -322,19 +322,19 @@ func (v *EmptyVolume) SetOptions(metric *api.MetricAddon) {
}

// AssembleVolumes for an empty volume
func (v *EmptyVolume) AssembleVolumes() specs.VolumeSpec {
func (v *EmptyVolume) AssembleVolumes() []specs.VolumeSpec {
volume := corev1.Volume{
Name: v.name,
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{},
},
}
return specs.VolumeSpec{
return []specs.VolumeSpec{{
Volume: volume,
Mount: true,
Path: v.path,
ReadOnly: v.readOnly,
}
}}
}

// TODO likely we need to carry around entrypoints to customize?
Expand Down
9 changes: 6 additions & 3 deletions pkg/metrics/app/amg.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,13 @@ func (m AMG) Options() map[string]intstr.IntOrString {
}

func init() {
base := metrics.BaseMetric{
Identifier: "app-amg",
Summary: "parallel algebraic multigrid solver for linear systems arising from problems on unstructured grids",
Container: "ghcr.io/converged-computing/metric-amg:latest",
}
launcher := metrics.LauncherWorker{
Identifier: "app-amg",
Summary: "parallel algebraic multigrid solver for linear systems arising from problems on unstructured grids",
Container: "ghcr.io/converged-computing/metric-amg:latest",
BaseMetric: base,
WorkerScript: "/metrics_operator/amg-worker.sh",
LauncherScript: "/metrics_operator/amg-launcher.sh",
}
Expand Down
Loading

0 comments on commit dfdc79b

Please sign in to comment.