# 1.architectures/2.aws-parallelcluster/distributed-training-p4de_custom_ami.yaml

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

# Instance metadata service (IMDS) settings for the cluster.
# v2.0 selects IMDSv2 (session-token based) support, per the ParallelCluster
# Imds section documentation.
Imds:
  ImdsSupport: v2.0
# Base image used by the cluster nodes.
# Replace PLACEHOLDER_CUSTOM_AMI_ID with the ID of your own pre-built AMI.
# NOTE(review): Os is expected to match the operating system the custom AMI
# was built from - confirm against your AMI.
Image:
  Os: alinux2
  CustomAmi: PLACEHOLDER_CUSTOM_AMI_ID
# Head node: the login / scheduler node, placed in the public subnet.
HeadNode:
  InstanceType: m5.8xlarge
  Ssh:
    KeyName: PLACEHOLDER_SSH_KEY
  Networking:
    SubnetId: PLACEHOLDER_PUBLIC_SUBNET
  Iam:
    # Extra managed policies for the head node: SSM access plus read-only
    # access to S3 and ECR.
    AdditionalIamPolicies:
      - Policy: arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
      - Policy: arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess
      - Policy: arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
  LocalStorage:
    # This root volume also holds the users' /home directory.
    RootVolume:
      DeleteOnTermination: true
      Size: 500
# Slurm scheduler with a single GPU queue of p4de.24xlarge instances.
Scheduling:
  Scheduler: slurm
  SlurmSettings:
    # Idle time before dynamic nodes are scaled down (expressed in minutes,
    # per the ParallelCluster SlurmSettings documentation).
    ScaledownIdletime: 60
  SlurmQueues:
    - Name: compute-gpu
      CapacityType: ONDEMAND
      # Give each job whole nodes: GenAI training usually wants every GPU
      # on an instance.
      JobExclusiveAllocation: true
      # This section is recommended when the instances come from a targeted
      # On-Demand Capacity Reservation (ODCR). To group several reservations
      # together, use a capacity reservation resource group and set
      # CapacityReservationResourceGroupArn instead.
      CapacityReservationTarget:
        CapacityReservationId: PLACEHOLDER_CAPACITY_RESERVATION_ID
      Networking:
        # NOTE(review): a targeted ODCR that was not created inside the
        # placement group may fail to launch while this is enabled - confirm
        # against your reservation.
        PlacementGroup:
          Enabled: true
        SubnetIds:
          - PLACEHOLDER_PRIVATE_SUBNET
      ComputeSettings:
        LocalStorage:
          RootVolume:
            Size: 200
          # Each instance exposes its local NVMe scratch space at this path.
          EphemeralVolume:
            MountDir: /local_scratch
      ComputeResources:
        - Name: distributed-ml
          InstanceType: p4de.24xlarge
          Efa:
            Enabled: true
          # When MinCount equals MaxCount the capacity is maintained and
          # will not scale down.
          MinCount: 4
          MaxCount: 4
# Shared file systems mounted on the cluster nodes.
SharedStorage:
  # FSx for Lustre file system mounted at /fsx.
  - Name: fsx
    StorageType: FsxLustre
    MountDir: /fsx
    FsxLustreSettings:
      DeploymentType: PERSISTENT_2
      # Size the capacity to your storage and throughput needs.
      StorageCapacity: 4800
      # Can be increased to 500 or 1000.
      PerUnitStorageThroughput: 250
  # EBS volume for shared applications and scripts.
  - Name: SharedEBS
    StorageType: Ebs
    MountDir: /apps
    EbsSettings:
      VolumeType: gp3
      Size: 200
      Iops: 6000
      Throughput: 300
# Monitoring: detailed metrics and CloudWatch logs on, dashboards off.
Monitoring:
  DetailedMonitoring: true
  Dashboards:
    CloudWatch:
      # Set to true to have basic dashboards provided.
      Enabled: false
  Logs:
    CloudWatch:
      # Useful when debugging.
      Enabled: true