# distributed-training-p4de_custom_ami.yaml
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
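#
# To render and deploy this template, a minimal sketch (the exported values and
# the cluster name below are placeholders, not part of this file):
#   export AMI_ID=ami-0123456789 PUBLIC_SUBNET_ID=subnet-aaa \
#          PRIVATE_SUBNET_ID=subnet-bbb CAPACITY_RESERVATION_ID=cr-ccc
#   envsubst < distributed-training-p4de_custom_ami.yaml > cluster.yaml
#   pcluster create-cluster --cluster-name ml-cluster \
#       --cluster-configuration cluster.yaml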
Imds:
  ImdsSupport: v2.0
Image:
  Os: ubuntu2004
  CustomAmi: ${AMI_ID}
HeadNode:
  InstanceType: m5.8xlarge
  Networking:
    SubnetId: ${PUBLIC_SUBNET_ID}
  LocalStorage:
    RootVolume:
      Size: 500
      DeleteOnTermination: true # this is the root and /home volume for your users
  Iam:
    AdditionalIamPolicies: # grant SSM, S3 read, and ECR read access
      - Policy: arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
      - Policy: arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess
      - Policy: arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
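      # With the ECR read policy in place you can pull training images without
      # extra credentials, e.g. (a sketch, assuming Docker is installed on the
      # node; account and region are placeholders):
      #   aws ecr get-login-password --region us-east-1 \
      #     | docker login --username AWS --password-stdin 111122223333.dkr.ecr.us-east-1.amazonaws.com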
Scheduling:
  Scheduler: slurm
  SlurmSettings:
    ScaledownIdletime: 60
    CustomSlurmSettings:
      # Simple accounting to the text file /home/slurm/slurm-job-completions.txt.
      #
      # Disable these entries if you prefer to set up Slurm accounting against a
      # database (https://docs.aws.amazon.com/parallelcluster/latest/ug/slurm-accounting-v3.html).
      #
      # NOTE: the JobCompType entry will be duplicated, which causes a harmless
      # warning message in `systemctl status --no-pager slurmctld`.
      - JobCompType: jobcomp/filetxt
      - JobCompLoc: /home/slurm/slurm-job-completions.txt
      - JobAcctGatherType: jobacct_gather/linux
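      # The list entries above are rendered into slurm.conf as key=value pairs,
      # roughly (a sketch of the effect, not literal generated output):
      #   JobCompType=jobcomp/filetxt
      #   JobCompLoc=/home/slurm/slurm-job-completions.txt
      #   JobAcctGatherType=jobacct_gather/linux
      # Completed jobs can then be inspected on the head node with, e.g.:
      #   tail /home/slurm/slurm-job-completions.txt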
  SlurmQueues:
    - Name: compute-gpu
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
          - ${PRIVATE_SUBNET_ID}
        PlacementGroup:
          Enabled: true
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
            MountDir: /scratch # each instance has a local scratch volume on NVMe
          RootVolume:
            Size: 200
      # The capacity reservation section is recommended if you use instances
      # with a targeted ODCR. You can also use a capacity resource group and
      # CapacityReservationResourceGroupArn if you want to group multiple
      # reservations together.
      CapacityReservationTarget:
        CapacityReservationId: ${CAPACITY_RESERVATION_ID}
      JobExclusiveAllocation: true # GenAI training likes to gobble all GPUs in an instance
      ComputeResources:
        - Name: distributed-ml
          InstanceType: p4de.24xlarge
          MinCount: 4 # if min == max, capacity is maintained and will
          MaxCount: 4 # not scale down
          Efa:
            Enabled: true
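          # Example multi-node submission to this queue once the cluster is up
          # (a sketch; train.sbatch is a placeholder job script):
          #   sbatch --nodes=4 --exclusive --partition=compute-gpu train.sbatch
          # EFA can be verified on a compute node with:
          #   fi_info -p efa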
SharedStorage:
  - MountDir: /fsx
    Name: fsx
    StorageType: FsxLustre
    FsxLustreSettings:
      StorageCapacity: 4800 # size this to your storage and throughput needs
      PerUnitStorageThroughput: 250 # can be increased to 500 or 1000 MB/s/TiB
      DeploymentType: PERSISTENT_2
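      # Rough aggregate throughput at this size, assuming FSx for Lustre scales
      # throughput linearly with capacity:
      #   4800 GiB / 1024 * 250 MB/s/TiB ~= 1,170 MB/s baseline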
Monitoring:
  DetailedMonitoring: true
  Logs:
    CloudWatch:
      Enabled: true # good for debugging
  Dashboards:
    CloudWatch:
      Enabled: true # provides basic dashboards