# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
# For additional examples please refer to this [Github repository](https://github.com/aws-neuron/aws-neuron-parallelcluster-samples/blob/master/examples/jobs/neuronx-nemo-megatron-llamav2-job.md) from aws-neuron.
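#
# A minimal deployment sketch (assumptions: the AWS ParallelCluster v3 CLI is
# installed and the cluster name "trn1-cluster" is only an example; replace every
# PLACEHOLDER_* and ${...} value in this file before running):
#
#   pcluster create-cluster \
#     --cluster-name trn1-cluster \
#     --cluster-configuration distributed-training-trn1_custom_ami.yaml
#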
Imds:
  ImdsSupport: v2.0
Image:
  Os: ubuntu2004
  CustomAmi: PLACEHOLDER_CUSTOM_AMI_ID
HeadNode:
  InstanceType: m5.8xlarge
  Networking:
    SubnetId: ${PUBLIC_SUBNET_ID}
  LocalStorage:
    RootVolume:
      Size: 500
      DeleteOnTermination: true # this is the root and /home volume for users
  Iam:
    AdditionalIamPolicies: # grant SSM, S3 and ECR read access
      - Policy: arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
      - Policy: arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess
      - Policy: arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
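# With the AmazonSSMManagedInstanceCore policy attached above, the head node can
# typically be reached through SSM Session Manager instead of SSH (a sketch; it
# assumes the SSM agent is present in the custom AMI, and the instance id below
# is a placeholder):
#
#   aws ssm start-session --target i-0123456789abcdef0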
Scheduling:
  Scheduler: slurm
  SlurmSettings:
    ScaledownIdletime: 60
    CustomSlurmSettings:
      # Simple accounting to the text file /home/slurm/slurm-job-completions.txt.
      #
      # Must be disabled if you prefer to set up Slurm accounting to a database
      # (https://docs.aws.amazon.com/parallelcluster/latest/ug/slurm-accounting-v3.html).
      #
      # NOTE: the JobCompType entry will be duplicated, which causes a harmless
      # warning message in `systemctl status --no-pager slurmctld`.
      - JobCompType: jobcomp/filetxt
      - JobCompLoc: /home/slurm/slurm-job-completions.txt
      - JobAcctGatherType: jobacct_gather/linux
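      # A quick way to inspect the completion log on the head node (a sketch; the
      # file only exists once at least one job has completed, with one line of
      # key=value fields per job):
      #
      #   cat /home/slurm/slurm-job-completions.txt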
  SlurmQueues:
    - Name: compute-gpu
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
          - ${PRIVATE_SUBNET_ID}
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
            MountDir: /scratch # each instance has a local scratch volume on NVMe
          RootVolume:
            Size: 200
      # The capacity reservation section is recommended if you use instances
      # with a targeted ODCR. You can also use a capacity resource group and
      # CapacityReservationResourceGroupArn if you want to group
      # multiple reservations.
      ComputeResources:
        - Name: distributed-ml
          InstanceType: trn1.32xlarge
          MinCount: PLACEHOLDER_MIN_INSTANCES # if min = max then capacity is maintained and will
          MaxCount: PLACEHOLDER_MAX_INSTANCES # not scale down
          Efa:
            Enabled: true
          # This assumes you are using a capacity reservation.
          # If not, comment out or remove the 2 lines below.
          CapacityReservationTarget:
            CapacityReservationId: ${CAPACITY_RESERVATION_ID}
          Networking:
            PlacementGroup:
              Enabled: true
              Id: PLACEHOLDER_PLACEMENT_GROUP
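# A minimal multi-node submission sketch against the queue above (assumptions:
# a training script /fsx/train.sh exists and at least two trn1.32xlarge nodes
# can launch; in ParallelCluster the Slurm partition name matches the queue
# name "compute-gpu"):
#
#   sbatch --partition=compute-gpu --nodes=2 --exclusive /fsx/train.sh
#
# For end-to-end Neuron training examples, see the aws-neuron repository linked
# at the top of this file.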
SharedStorage:
  - MountDir: /fsx
    Name: fsx
    StorageType: FsxLustre
    FsxLustreSettings:
      StorageCapacity: 4800 # size it to your storage and throughput needs
      PerUnitStorageThroughput: 250 # this can be increased to 500 or 1000
      DeploymentType: PERSISTENT_2
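# Once the cluster is up, the FSx for Lustre file system is mounted on every node
# at /fsx; a quick sanity check from the head node (a sketch):
#
#   df -h /fsx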
Monitoring:
  DetailedMonitoring: true
  Logs:
    CloudWatch:
      Enabled: true # good for debugging
  Dashboards:
    CloudWatch:
      Enabled: true # provides a basic dashboard
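# To apply configuration changes later, update the cluster with the ParallelCluster
# CLI (a sketch; some changes require stopping the compute fleet first, and the
# cluster name "trn1-cluster" is only an example):
#
#   pcluster update-compute-fleet --cluster-name trn1-cluster --status STOP_REQUESTED
#   pcluster update-cluster --cluster-name trn1-cluster \
#     --cluster-configuration distributed-training-trn1_custom_ami.yaml
#   pcluster update-compute-fleet --cluster-name trn1-cluster --status START_REQUESTED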