# examples/ml-slurm.yaml

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---

blueprint_name: ml-slurm-v6

# Deployment-wide variables, referenced elsewhere as $(vars.<name>).
vars:
  # Set project id here
  project_id:
  deployment_name: ml-example-v6
  region: asia-southeast1
  zone: asia-southeast1-b
  zones: [asia-southeast1-a, asia-southeast1-b, asia-southeast1-c]
  # Custom image coordinates: the packer group writes to this family
  # (image_family) and the cluster modules reference it via instance_image.
  new_image:
    family: ml-slurm
    project: $(vars.project_id)
  disk_size_gb: 200
  # Workaround for https://github.com/GoogleCloudPlatform/cluster-toolkit/discussions/3243
  metadata:
    VmDnsSetting: GlobalOnly

# Recommended to use GCS backend for Terraform state
# See https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples#optional-setting-up-a-remote-terraform-state
#
# terraform_backend_defaults:
#  type: gcs
#  configuration:
#    bucket: <<BUCKET_NAME>>

deployment_groups:
- group: primary
  modules:
  # VPC network module; several modules in this blueprint attach to it by
  # listing `network` under `use`.
  - id: network
    source: modules/network/vpc

  # Connecting Filestore instances through Private Service Access (PSA) is a
  # best practice. Setting up PSA needs the compute.networkAdmin role, which
  # the Owner role includes but the Editor role does not:
  # https://cloud.google.com/vpc/docs/configure-private-services-access#permissions
  # PSA is optional: to drop it, delete the private_service_access module and
  # every reference to it from Filestore modules.
  - id: private_service_access
    source: community/modules/network/private-service-access
    use:
    - network

  # Filestore share with local_mount /home; the Slurm controller module lists
  # `homefs` under `use`.
  - id: homefs
    source: modules/file-system/filestore
    use:
    - network
    - private_service_access
    settings:
      filestore_tier: BASIC_SSD
      size_gb: 2560
      local_mount: /home

  # Startup script consumed by the packer group (custom-image lists `script`
  # under `use`). Its single shell runner:
  #   1. adds the google-fast-socket apt source and installs the package;
  #   2. exits early when /opt/conda already exists;
  #   3. downloads Miniforge 24.7.1-2 and installs it into /opt/conda;
  #   4. creates two conda environments, `tf` and `pytorch`, and pip-installs
  #      the ML libraries into them.
  # NOTE(review): `\$(` in the script body presumably escapes the blueprint's
  # own $(...) expression syntax so that the shell receives a plain command
  # substitution -- confirm against the toolkit's escaping rules.
  - id: script
    source: modules/scripts/startup-script
    settings:
      runners:
      # Shell runner; `content` below is the literal script text.
      - type: shell
        destination: install-ml-libraries.sh
        content: |
          #!/bin/bash
          # this script is designed to execute on Slurm images published by SchedMD that:
          # - are based on Debian distribution of Linux
          # - have NVIDIA drivers pre-installed

          set -e -o pipefail

          echo "deb https://packages.cloud.google.com/apt google-fast-socket main" > /etc/apt/sources.list.d/google-fast-socket.list
          apt-get update --allow-releaseinfo-change
          apt-get install --assume-yes google-fast-socket

          CONDA_BASE=/opt/conda

          if [ -d $CONDA_BASE ]; then
                  exit 0
          fi

          DL_DIR=\$(mktemp -d)
          cd $DL_DIR
          curl -L -O https://github.com/conda-forge/miniforge/releases/download/24.7.1-2/Miniforge3-24.7.1-2-Linux-x86_64.sh
          HOME=$DL_DIR bash Miniforge3-24.7.1-2-Linux-x86_64.sh -b -p $CONDA_BASE
          cd -
          rm -rf $DL_DIR
          unset DL_DIR

          source $CONDA_BASE/bin/activate base
          conda init --system
          conda config --system --set auto_activate_base False
          # following channel ordering is important! use strict_priority!
          conda config --system --set channel_priority strict
          conda update -n base conda --yes

          ### create a virtual environment for tensorflow
          conda create -n tf python=3.11 --yes
          conda activate tf
          pip install tensorflow[and-cuda]==2.18.*
          pip install tensorrt==10.6.*

          ### create a virtual environment for pytorch
          conda create -n pytorch python=3.11 --yes
          conda activate pytorch
          pip install torch torchvision torchaudio

- group: packer
  modules:
  # Packer image build: uses the `network` and `script` modules and writes the
  # result to the image family named by $(vars.new_image.family).
  - id: custom-image
    source: modules/packer/custom-image
    kind: packer
    use: [network, script]
    settings:
      # base image; see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family
      source_image_project_id:
      - schedmd-slurm-public
      source_image_family: slurm-gcp-6-8-debian-11
      image_family: $(vars.new_image.family)
      # a GPU-enabled VM is not required to build this image
      machine_type: c2-standard-4
      # keep a public IP on the VM so the startup script can reach the public
      # internet w/o new VPC
      omit_external_ip: false
      # The size of the source image can be looked up with:
      # gcloud compute images describe-from-family <source_image_family> --project schedmd-slurm-public
      disk_size: $(vars.disk_size_gb)
      disk_type: pd-ssd
      state_timeout: 15m

- group: cluster
  modules:
  # Startup script made of two `data` runners that place a small PyTorch
  # benchmark under /var/tmp. The controller module passes it on as
  # login_startup_script.
  - id: examples
    source: modules/scripts/startup-script
    settings:
      runners:
      # Wrapper script: sources conda, activates the `pytorch` environment and
      # runs torch_test.py.
      # NOTE(review): torch_test.py is referenced by a relative path, so the
      # wrapper presumably has to be launched from the directory that holds
      # it -- verify the intended usage.
      - type: data
        destination: /var/tmp/torch_test.sh
        content: |
          #!/bin/bash
          source /etc/profile.d/conda.sh
          conda activate pytorch
          python3 torch_test.py
      # Benchmark: times two batched-dot implementations (mul+sum vs. bmm)
      # with torch.utils.benchmark and prints the detected device and both
      # timings.
      - type: data
        destination: /var/tmp/torch_test.py
        content: |
          import torch
          import torch.utils.benchmark as benchmark

          def batched_dot_mul_sum(a, b):
              '''Computes batched dot by multiplying and summing'''
              return a.mul(b).sum(-1)

          def batched_dot_bmm(a, b):
              '''Computes batched dot by reducing to bmm'''
              a = a.reshape(-1, 1, a.shape[-1])
              b = b.reshape(-1, b.shape[-1], 1)
              return torch.bmm(a, b).flatten(-3)

          # use GPU if available, else CPU
          device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
          print('Using device:', device)
          if device.type == 'cuda':
              print(torch.cuda.get_device_name(0))

          # benchmarking
          x = torch.randn(10000, 64)
          t0 = benchmark.Timer(
              stmt='batched_dot_mul_sum(x, x)',
              setup='from __main__ import batched_dot_mul_sum',
              globals={'x': x})
          t1 = benchmark.Timer(
              stmt='batched_dot_bmm(x, x)',
              setup='from __main__ import batched_dot_bmm',
              globals={'x': x})
          print(t0.timeit(100))
          print(t1.timeit(100))

  # Nodeset of a2-highgpu-1g machines (dynamic maximum of 20 nodes) using the
  # custom image from $(vars.new_image).
  - id: a2_nodeset
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use:
    - network
    settings:
      machine_type: a2-highgpu-1g
      node_count_dynamic_max: 20
      bandwidth_tier: gvnic_enabled
      instance_image_custom: true
      instance_image: $(vars.new_image)

  # Partition `a2`, built on a2_nodeset and marked as the default partition.
  - id: a2_partition
    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
    use:
    - a2_nodeset
    settings:
      is_default: true
      partition_name: a2

  # Nodeset of g2-standard-4 machines (dynamic maximum of 20 nodes) using the
  # custom image from $(vars.new_image).
  - id: g2_nodeset
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network]
    settings:
      node_count_dynamic_max: 20
      # g2_partition sets `exclusive: false`. The v6 partition module requires
      # `exclusive: true` whenever one of its nodesets uses a placement policy,
      # and the nodeset module enables placement unless told otherwise, so
      # placement is switched off explicitly here.
      enable_placement: false
      bandwidth_tier: gvnic_enabled
      machine_type: g2-standard-4
      instance_image: $(vars.new_image)
      instance_image_custom: true

  # Partition `g2`, built on g2_nodeset, with `exclusive` turned off.
  - id: g2_partition
    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
    use:
    - g2_nodeset
    settings:
      exclusive: false
      partition_name: g2

  # Slurm login node(s) on the custom image, with public IPs enabled.
  - id: slurm_login
    source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
    use:
    - network
    settings:
      instance_image_custom: true
      instance_image: $(vars.new_image)
      enable_login_public_ips: true

  # Slurm controller on the custom image. It ties together the network, both
  # partitions, the /home Filestore and the login module, and hands the
  # `examples` startup script to login nodes.
  - id: slurm_controller
    source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
    use: [network, a2_partition, g2_partition, homefs, slurm_login]
    settings:
      instance_image_custom: true
      instance_image: $(vars.new_image)
      enable_controller_public_ips: true
      login_startup_script: $(examples.startup_script)