# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
# For additional examples please refer to this [Github repository](https://github.com/aws-neuron/aws-neuron-parallelcluster-samples/blob/master/examples/jobs/neuronx-nemo-megatron-llamav2-job.md) from aws-neuron.
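#
# A minimal deployment sketch (assumptions: the AWS ParallelCluster v3 CLI is
# installed and the cluster name "trn1-cluster" is only an example; replace every
# PLACEHOLDER_* and ${...} value in this file before running):
#
#   pcluster create-cluster \
#     --cluster-name trn1-cluster \
#     --cluster-configuration distributed-training-trn1_custom_ami.yaml
#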
Imds:
  ImdsSupport: v2.0
Image:
  Os: ubuntu2004
  CustomAmi: PLACEHOLDER_CUSTOM_AMI_ID
HeadNode:
  InstanceType: m5.8xlarge
  Networking:
    SubnetId: ${PUBLIC_SUBNET_ID}
  LocalStorage:
    RootVolume:
      Size: 500
      DeleteOnTermination: true # this is the root and /home volume for users
  Iam:
    AdditionalIamPolicies: # grant SSM, S3 and ECR read access
      - Policy: arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
      - Policy: arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess
      - Policy: arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
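# With the AmazonSSMManagedInstanceCore policy attached above, the head node can
# typically be reached through SSM Session Manager instead of SSH (a sketch; it
# assumes the SSM agent is present in the custom AMI, and the instance id below
# is a placeholder):
#
#   aws ssm start-session --target i-0123456789abcdef0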
Scheduling:
  Scheduler: slurm
  SlurmSettings:
    ScaledownIdletime: 60
    CustomSlurmSettings:
      # Simple accounting to the text file /home/slurm/slurm-job-completions.txt.
      #
      # Must be disabled if you prefer to set up Slurm accounting to a database
      # (https://docs.aws.amazon.com/parallelcluster/latest/ug/slurm-accounting-v3.html).
      #
      # NOTE: the JobCompType entry will be duplicated, which causes a harmless
      # warning message in `systemctl status --no-pager slurmctld`.
      - JobCompType: jobcomp/filetxt
      - JobCompLoc: /home/slurm/slurm-job-completions.txt
      - JobAcctGatherType: jobacct_gather/linux
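      # A quick way to inspect the completion log on the head node (a sketch; the
      # file only exists once at least one job has completed, with one line of
      # key=value fields per job):
      #
      #   cat /home/slurm/slurm-job-completions.txt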
  SlurmQueues:
    - Name: compute-gpu
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
          - ${PRIVATE_SUBNET_ID}
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
            MountDir: /scratch # each instance has a local scratch volume on NVMe
          RootVolume:
            Size: 200
      # The capacity reservation section is recommended if you use instances
      # with a targeted ODCR. You can also use a capacity resource group and
      # CapacityReservationResourceGroupArn if you want to group
      # multiple reservations.
      ComputeResources:
        - Name: distributed-ml
          InstanceType: trn1.32xlarge
          MinCount: PLACEHOLDER_MIN_INSTANCES # if min = max then capacity is maintained and will
          MaxCount: PLACEHOLDER_MAX_INSTANCES # not scale down
          Efa:
            Enabled: true
          # This assumes you are using a capacity reservation.
          # If not, comment out or remove the 2 lines below.
          CapacityReservationTarget:
            CapacityReservationId: ${CAPACITY_RESERVATION_ID}
          Networking:
            PlacementGroup:
              Enabled: true
              Id: PLACEHOLDER_PLACEMENT_GROUP
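# A minimal multi-node submission sketch against the queue above (assumptions:
# a training script /fsx/train.sh exists and at least two trn1.32xlarge nodes
# can launch; in ParallelCluster the Slurm partition name matches the queue
# name "compute-gpu"):
#
#   sbatch --partition=compute-gpu --nodes=2 --exclusive /fsx/train.sh
#
# For end-to-end Neuron training examples, see the aws-neuron repository linked
# at the top of this file.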
SharedStorage:
  - MountDir: /fsx
    Name: fsx
    StorageType: FsxLustre
    FsxLustreSettings:
      StorageCapacity: 4800 # size it to your storage and throughput needs
      PerUnitStorageThroughput: 250 # this can be increased to 500 or 1000
      DeploymentType: PERSISTENT_2
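# Once the cluster is up, the FSx for Lustre file system is mounted on every node
# at /fsx; a quick sanity check from the head node (a sketch):
#
#   df -h /fsx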
Monitoring:
  DetailedMonitoring: true
  Logs:
    CloudWatch:
      Enabled: true # good for debugging
  Dashboards:
    CloudWatch:
      Enabled: true # provides a basic dashboard
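# To apply configuration changes later, update the cluster with the ParallelCluster
# CLI (a sketch; some changes require stopping the compute fleet first, and the
# cluster name "trn1-cluster" is only an example):
#
#   pcluster update-compute-fleet --cluster-name trn1-cluster --status STOP_REQUESTED
#   pcluster update-cluster --cluster-name trn1-cluster \
#     --cluster-configuration distributed-training-trn1_custom_ami.yaml
#   pcluster update-compute-fleet --cluster-name trn1-cluster --status START_REQUESTED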