-
Notifications
You must be signed in to change notification settings - Fork 92
/
distributed-training-p4de_custom_ami.yaml
79 lines (78 loc) · 2.48 KB
/
distributed-training-p4de_custom_ami.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# Instance metadata service (IMDS) settings for the cluster.
Imds:
  # v2.0 restricts instance metadata access to IMDSv2.
  ImdsSupport: v2.0
# Operating system and AMI used for the cluster nodes.
Image:
  Os: alinux2
  # Replace the placeholder with the ID of your custom AMI.
  CustomAmi: PLACEHOLDER_CUSTOM_AMI_ID
# Head node: instance type, networking, SSH access, local storage and IAM.
HeadNode:
  InstanceType: m5.8xlarge
  Networking:
    # Replace the placeholder with the subnet the head node is launched in.
    SubnetId: PLACEHOLDER_PUBLIC_SUBNET
  Ssh:
    # Replace the placeholder with the name of an existing EC2 key pair.
    KeyName: PLACEHOLDER_SSH_KEY
  LocalStorage:
    # That's your root and /home volume for users.
    RootVolume:
      Size: 500
      DeleteOnTermination: true
  Iam:
    # Grant ECR, SSM and S3 read access.
    AdditionalIamPolicies:
      - Policy: arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
      - Policy: arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess
      - Policy: arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
# Slurm scheduler with a single GPU queue of p4de.24xlarge instances.
Scheduling:
  Scheduler: slurm
  SlurmSettings:
    ScaledownIdletime: 60
  SlurmQueues:
    - Name: compute-gpu
      CapacityType: ONDEMAND
      Networking:
        SubnetIds:
          - PLACEHOLDER_PRIVATE_SUBNET
        PlacementGroup:
          Enabled: true
      ComputeSettings:
        LocalStorage:
          EphemeralVolume:
            # Each instance has a local scratch on NVMe.
            MountDir: /local_scratch
          RootVolume:
            Size: 200
      # The capacity reservation section is recommended if you use instances
      # with targeted ODCRs (On-Demand Capacity Reservations). You can also
      # use a capacity reservation resource group and
      # CapacityReservationResourceGroupArn if you want to group multiple
      # reservations.
      CapacityReservationTarget:
        CapacityReservationId: PLACEHOLDER_CAPACITY_RESERVATION_ID
      ComputeResources:
        - Name: distributed-ml
          InstanceType: p4de.24xlarge
          # If MinCount = MaxCount then capacity is maintained and will not
          # scale down.
          MinCount: 4
          MaxCount: 4
          Efa:
            Enabled: true
      # GenAI training likes to gobble all GPUs in an instance.
      JobExclusiveAllocation: true
# Shared file systems mounted on the cluster: FSx for Lustre on /fsx and an
# EBS volume on /apps.
SharedStorage:
  - MountDir: /fsx
    Name: fsx
    StorageType: FsxLustre
    FsxLustreSettings:
      # Size it to your storage and throughput needs.
      StorageCapacity: 4800
      # This can be increased to 500 or 1000.
      PerUnitStorageThroughput: 250
      DeploymentType: PERSISTENT_2
  - Name: SharedEBS
    StorageType: Ebs
    # Store your shared apps & scripts here.
    MountDir: /apps
    EbsSettings:
      VolumeType: gp3
      Size: 200
      Throughput: 300
      Iops: 6000
# Monitoring: detailed instance monitoring, CloudWatch logs and dashboards.
Monitoring:
  DetailedMonitoring: true
  Logs:
    CloudWatch:
      # Good for debug.
      Enabled: true
  Dashboards:
    CloudWatch:
      # Disabled here; set to true to get the basic dashboards.
      Enabled: false