-
Notifications
You must be signed in to change notification settings - Fork 92
/
distributed-training-trn1_custom_ami.yaml
84 lines (82 loc) · 2.84 KB
/
distributed-training-trn1_custom_ami.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# For additional examples, please refer to this [GitHub repository](https://github.com/aws-neuron/aws-neuron-parallelcluster-samples/blob/master/examples/jobs/neuronx-nemo-megatron-llamav2-job.md) from aws-neuron.
# Instance Metadata Service settings for the cluster.
Imds:
  ImdsSupport: v2.0  # require IMDSv2 for metadata access
# Base operating system and the custom AMI used for the cluster nodes.
Image:
  Os: alinux2
  # Replace with the ID of your custom AMI.
  CustomAmi: PLACEHOLDER_CUSTOM_AMI_ID
# Head node: instance type, network placement, SSH key, storage and IAM.
HeadNode:
  InstanceType: m5.8xlarge
  Networking:
    SubnetId: PLACEHOLDER_PUBLIC_SUBNET
  Ssh:
    KeyName: PLACEHOLDER_SSH_KEY
  LocalStorage:
    RootVolume:
      Size: 500
      # This is the root volume, which also holds /home for users.
      DeleteOnTermination: true
  Iam:
    # Grant SSM managed-instance access plus read-only S3 and ECR access.
    AdditionalIamPolicies:
      - Policy: arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
      - Policy: arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess
      - Policy: arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
# Slurm scheduler with a single on-demand queue of trn1.32xlarge instances.
Scheduling:
  Scheduler: slurm
  SlurmSettings:
    # Idle time, in minutes, before a dynamic node is scaled down.
    ScaledownIdletime: 60
  SlurmQueues:
    # NOTE(review): the queue name says "gpu" although the compute resource
    # below is trn1; the name is kept because job scripts may reference it.
    - Name: compute-gpu
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
          - PLACEHOLDER_PRIVATE_SUBNET
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
            # Each instance has a local scratch directory on NVMe.
            MountDir: /local_scratch
          RootVolume:
            Size: 200
      # The capacity reservation section is recommended if you use instances
      # with targeted On-Demand Capacity Reservations (ODCRs). You can also
      # use a capacity reservation resource group with
      # CapacityReservationResourceGroupArn if you want to group multiple
      # reservations.
      ComputeResources:
        - Name: distributed-ml
          InstanceType: trn1.32xlarge
          # If MinCount equals MaxCount, the capacity is maintained and the
          # queue will not scale down.
          MinCount: PLACEHOLDER_MIN_INSTANCES
          MaxCount: PLACEHOLDER_MAX_INSTANCES
          Efa:
            Enabled: true
          # Assumes you are using a capacity reservation.
          # If not, comment out or remove the two lines below.
          CapacityReservationTarget:
            CapacityReservationId: PLACEHOLDER_CAPACITY_RESERVATION_ID
          Networking:
            PlacementGroup:
              Enabled: true
              Id: PLACEHOLDER_PLACEMENT_GROUP
# Shared file systems: FSx for Lustre on /fsx and a gp3 EBS volume on /apps.
SharedStorage:
  - MountDir: /fsx
    Name: fsx
    StorageType: FsxLustre
    FsxLustreSettings:
      # Size it to your storage and throughput needs.
      StorageCapacity: 4800
      # This can be increased to 500 or 1000.
      PerUnitStorageThroughput: 250
      DeploymentType: PERSISTENT_2
  - Name: SharedEBS
    StorageType: Ebs
    # Store your shared apps and scripts here.
    MountDir: /apps
    EbsSettings:
      VolumeType: gp3
      Size: 200
      Throughput: 300
      Iops: 6000
# Monitoring: detailed instance monitoring and CloudWatch logs are on; the
# CloudWatch dashboard is off.
Monitoring:
  DetailedMonitoring: true
  Logs:
    CloudWatch:
      Enabled: true  # useful for debugging
  Dashboards:
    CloudWatch:
      # Set to true to get the basic CloudWatch dashboards.
      Enabled: false